#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
+ _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
- ((*s) == '{' && regcurly(s, FALSE)))
+ ((*s) == '{' && regcurly(s)))
/*
* Flags to be passed up and down.
regnode_ssc *start_class;
} scan_data_t;
-/* The below is perhaps overboard, but this allows us to save a test at the
- * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
- * and 'a' differ by a single bit; the same with the upper and lower case of
- * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
- * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
- * then inverts it to form a mask, with just a single 0, in the bit position
- * where the upper- and lowercase differ. XXX There are about 40 other
- * instances in the Perl core where this micro-optimization could be used.
- * Should decide if maintenance cost is worse, before changing those
- *
- * Returns a boolean as to whether or not 'v' is either a lowercase or
- * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
- * compile-time constant, the generated code is better than some optimizing
- * compilers figure out, amounting to a mask and test. The results are
- * meaningless if 'c' is not one of [A-Za-z] */
-#define isARG2_lower_or_UPPER_ARG1(c, v) \
- (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
-
/*
* Forward declarations for pregcomp()'s friends.
*/
}
STATIC int
-S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
+S_ssc_is_anything(const regnode_ssc *ssc)
{
/* Returns TRUE if the SSC 'ssc' can match the empty string and any code
* point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
}
/* If e.g., both \w and \W are set, matches everything */
- if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
+ if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
int i;
for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
* necessary. */
if (RExC_contains_locale) {
ANYOF_POSIXL_SETALL(ssc);
- ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
}
else {
ANYOF_POSIXL_ZERO(ssc);
}
STATIC int
-S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
- const regnode_ssc *ssc)
+S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
+ const regnode_ssc *ssc)
{
/* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
* to the list of code points matched, and locale posix classes; hence does
return FALSE;
}
- if (RExC_contains_locale
- && ! ((ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
- || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
- || ! ANYOF_POSIXL_TEST_ALL_SET(ssc)))
- {
+ if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
return FALSE;
}
STATIC SV*
S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
- const regnode_charclass_posixl_fold* const node)
+ const regnode_charclass* const node)
{
/* Returns a mortal inversion list defining which code points are matched
* by 'node', which is of type ANYOF. Handles complementing the result if
* possibility. */
SV* invlist = sv_2mortal(_new_invlist(0));
+ SV* only_utf8_locale_invlist = NULL;
unsigned int i;
const U32 n = ARG(node);
bool new_node_has_latin1 = FALSE;
* known until runtime -- we have to assume it could be anything */
return _add_range_to_invlist(invlist, 0, UV_MAX);
}
- else {
+ else if (ary[3] && ary[3] != &PL_sv_undef) {
/* Here no compile-time swash, and no run-time only data. Use the
* node's inversion list */
- invlist = sv_2mortal(invlist_clone(ary[2]));
+ invlist = sv_2mortal(invlist_clone(ary[3]));
+ }
+
+ /* Get the code points valid only under UTF-8 locales */
+ if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
+ && ary[2] && ary[2] != &PL_sv_undef)
+ {
+ only_utf8_locale_invlist = ary[2];
}
}
- /* An ANYOF node contains a bitmap for the first 256 code points, and an
- * inversion list for the others, but if there are code points that should
- * match only conditionally on the target string being UTF-8, those are
- * placed in the inversion list, and not the bitmap. Since there are
- * circumstances under which they could match, they are included in the
- * SSC. But if the ANYOF node is to be inverted, we have to exclude them
- * here, so that when we invert below, the end result actually does include
- * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
- * before we add the unconditionally matched code points */
+ /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
+ * code points, and an inversion list for the others, but if there are code
+ * points that should match only conditionally on the target string being
+ * UTF-8, those are placed in the inversion list, and not the bitmap.
+ * Since there are circumstances under which they could match, they are
+ * included in the SSC. But if the ANYOF node is to be inverted, we have
+ * to exclude them here, so that when we invert below, the end result
+ * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
+ * have to do this here before we add the unconditionally matched code
+ * points */
if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
_invlist_intersection_complement_2nd(invlist,
PL_UpperLatin1,
}
/* Add in the points from the bit map */
- for (i = 0; i < 256; i++) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
invlist = add_cp_to_invlist(invlist, i);
new_node_has_latin1 = TRUE;
_invlist_union(invlist, PL_Latin1, &invlist);
}
- /* Similarly add the UTF-8 locale possible matches */
- if (ANYOF_FLAGS(node) & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(node))
- {
+ /* Similarly add the UTF-8 locale possible matches. These have to be
+ * deferred until after the non-UTF-8 locale ones are taken care of just
+ * above, or it leads to wrong results under ANYOF_INVERT */
+ if (only_utf8_locale_invlist) {
_invlist_union_maybe_complement_2nd(invlist,
- ANYOF_UTF8_LOCALE_INVLIST(node),
+ only_utf8_locale_invlist,
ANYOF_FLAGS(node) & ANYOF_INVERT,
&invlist);
}
#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
-STATIC void
-S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
-{
- /* Take the flags 'and_with' and accumulate them anded into the flags for
- * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
- * The flags 'and_with' should not come from another SSC (otherwise the
- * EMPTY_STRING flag won't work) */
-
- const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS;
-
- PERL_ARGS_ASSERT_SSC_FLAGS_AND;
-
- /* Use just the SSC-related flags from 'and_with' */
- ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS);
- ANYOF_FLAGS(ssc) |= ssc_only_flags;
-}
-
/* 'AND' a given class with another one. Can create false positives. 'ssc'
* should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
* 'and_with' is a regnode_charclass instead of a regnode_ssc. */
}
}
else {
- anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl_fold*) and_with);
+ anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
}
if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
ANYOF_POSIXL_ZERO(ssc);
}
- else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
+ else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
/* Note that the Posix class component P from 'and_with' actually
* looks like:
* standard, in particular almost everything by Microsoft.
* The loop below just changes e.g., \w into \W and vice versa */
- regnode_charclass_posixl_fold temp;
+ regnode_charclass_posixl temp;
int add = 1; /* To calculate the index of the complement */
ANYOF_POSIXL_ZERO(&temp);
}
}
}
- else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
- || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
+ else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
+ || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
{
/* One or the other of P1, P2 is non-empty. */
- ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
+ if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
+ ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
+ }
ssc_union(ssc, anded_cp_list, FALSE);
}
else { /* P1 = P2 = empty */
ored_flags = ANYOF_FLAGS(or_with);
}
else {
- ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
- (regnode_charclass_posixl_fold*) or_with);
+ ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
}
&& ! is_ANYOF_SYNTHETIC(or_with))
{
/* We ignore P2, leaving P1 going forward */
- }
- else { /* Not inverted */
+ } /* else Not inverted */
+ else if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
- if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
+ if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
unsigned int i;
for (i = 0; i < ANYOF_MAX; i += 2) {
if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
ssc_match_all_cp(ssc);
ANYOF_POSIXL_CLEAR(ssc, i);
ANYOF_POSIXL_CLEAR(ssc, i+1);
- if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
- ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
- }
}
}
}
}
PERL_STATIC_INLINE void
-S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
+S_ssc_clear_locale(regnode_ssc *ssc)
{
/* Set the SSC 'ssc' to not match any locale things */
-
PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
assert(is_ANYOF_SYNTHETIC(ssc));
{
/* The inversion list in the SSC is marked mortal; now we need a more
* permanent copy, which is stored the same way that is done in a regular
- * ANYOF node, with the first 256 code points in a bit map */
+ * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
+ * map */
SV* invlist = invlist_clone(ssc->invlist);
populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
- set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
+ set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
+ NULL, NULL, NULL, FALSE);
+
+ /* Make sure is clone-safe */
+ ssc->invlist = NULL;
- /* The code points that could match under /li are already incorporated into
- * the inversion list and bit map */
- ANYOF_FLAGS(ssc) &= ~ANYOF_LOC_FOLD;
+ if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
+ ANYOF_FLAGS(ssc) |= ANYOF_POSIXL;
+ }
- assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
+ assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
}
#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
May be the same as tail.
tail : item following the branch sequence
count : words in the sequence
- flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
+ flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
depth : indent depth
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
regnode *first, regnode *last, regnode *tail,
U32 word_count, U32 flags, U32 depth)
{
- dVAR;
/* first pass, loop through and scan words */
reg_trie_data *trie;
HV *widecharmap = NULL;
});
re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
+ assert(re_trie_maxbuff);
if (!SvIOK(re_trie_maxbuff)) {
sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
}
regnode *noper = NEXTOPER( cur );
const U8 *uc = (U8*)STRING( noper );
const U8 *e = uc + STR_LEN( noper );
- STRLEN foldlen = 0;
+ int foldlen = 0;
U32 wordlen = 0; /* required init */
- STRLEN minbytes = 0;
- STRLEN maxbytes = 0;
+ STRLEN minchars = 0;
+ STRLEN maxchars = 0;
bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
bitmap?*/
TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
}
}
- for ( ; uc < e ; uc += len ) {
+ for ( ; uc < e ; uc += len ) { /* Look at each char in the current
+ branch */
TRIE_CHARCOUNT(trie)++;
TRIE_READ_CHAR;
- /* Acummulate to the current values, the range in the number of
- * bytes that this character could match. The max is presumed to
- * be the same as the folded input (which TRIE_READ_CHAR returns),
- * except that when this is not in UTF-8, it could be matched
- * against a string which is UTF-8, and the variant characters
- * could be 2 bytes instead of the 1 here. Likewise, for the
- * minimum number of bytes when not folded. When folding, the min
- * is assumed to be 1 byte could fold to match the single character
- * here, or in the case of a multi-char fold, 1 byte can fold to
- * the whole sequence. 'foldlen' is used to denote whether we are
- * in such a sequence, skipping the min setting if so. XXX TODO
- * Use the exact list of what folds to each character, from
- * PL_utf8_foldclosures */
- if (UTF) {
- maxbytes += UTF8SKIP(uc);
- if (! folder) {
- /* A non-UTF-8 string could be 1 byte to match our 2 */
- minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
- ? 1
- : UTF8SKIP(uc);
- }
- else {
- if (foldlen) {
- foldlen -= UTF8SKIP(uc);
- }
- else {
- foldlen = is_MULTI_CHAR_FOLD_utf8(uc);
- minbytes++;
- }
- }
+ /* TRIE_READ_CHAR returns the current character, or its fold if /i
+ * is in effect. Under /i, this character can match itself, or
+ * anything that folds to it. If not under /i, it can match just
+ * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
+ * all fold to k, and all are single characters. But some folds
+ * expand to more than one character, so for example LATIN SMALL
+ * LIGATURE FFI folds to the three character sequence 'ffi'. If
+ * the string beginning at 'uc' is 'ffi', it could be matched by
+ * three characters, or just by the one ligature character. (It
+ * could also be matched by two characters: LATIN SMALL LIGATURE FF
+ * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
+ * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
+ * match.) The trie needs to know the minimum and maximum number
+ * of characters that could match so that it can use size alone to
+ * quickly reject many match attempts. The max is simple: it is
+ * the number of folded characters in this branch (since a fold is
+ * never shorter than what folds to it. */
+
+ maxchars++;
+
+ /* And the min is equal to the max if not under /i (indicated by
+ * 'folder' being NULL), or there are no multi-character folds. If
+ * there is a multi-character fold, the min is incremented just
+ * once, for the character that folds to the sequence. Each
+ * character in the sequence needs to be added to the list below of
+ * characters in the trie, but we count only the first towards the
+ * min number of characters needed. This is done through the
+ * variable 'foldlen', which is returned by the macros that look
+ * for these sequences as the number of bytes the sequence
+ * occupies. Each time through the loop, we decrement 'foldlen' by
+ * how many bytes the current char occupies. Only when it reaches
+ * 0 do we increment 'minchars' or look for another multi-character
+ * sequence. */
+ if (folder == NULL) {
+ minchars++;
+ }
+ else if (foldlen > 0) {
+ foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
}
else {
- maxbytes += (UNI_IS_INVARIANT(*uc))
- ? 1
- : 2;
- if (! folder) {
- minbytes++;
- }
- else {
- if (foldlen) {
- foldlen--;
- }
- else {
- foldlen = is_MULTI_CHAR_FOLD_latin1(uc);
- minbytes++;
+ minchars++;
+
+ /* See if *uc is the beginning of a multi-character fold. If
+ * so, we decrement the length remaining to look at, to account
+ * for the current character this iteration. (We can use 'uc'
+ * instead of the fold returned by TRIE_READ_CHAR because for
+ * non-UTF, the latin1_safe macro is smart enough to account
+ * for all the unfolded characters, and because for UTF, the
+ * string will already have been folded earlier in the
+ * compilation process */
+ if (UTF) {
+ if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
+ foldlen -= UTF8SKIP(uc);
}
}
+ else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
+ foldlen--;
+ }
}
+
+ /* The current character (and any potential folds) should be added
+ * to the possible matching characters for this position in this
+ * branch */
if ( uvc < 256 ) {
if ( folder ) {
U8 folded= folder[ (U8) uvc ];
set_bit = 0; /* We've done our bit :-) */
}
} else {
+
+ /* XXX We could come up with the list of code points that fold
+ * to this using PL_utf8_foldclosures, except not for
+ * multi-char folds, as there may be multiple combinations
+ * there that could work, which needs to wait until runtime to
+ * resolve (The comment about LIGATURE FFI above is such an
+ * example */
+
SV** svpp;
if ( !widecharmap )
widecharmap = newHV();
TRIE_STORE_REVCHAR(uvc);
}
}
- }
+ } /* end loop through characters in this branch of the trie */
+
+ /* We take the min and max for this branch and combine to find the min
+ * and max for all branches processed so far */
if( cur == first ) {
- trie->minlen = minbytes;
- trie->maxlen = maxbytes;
- } else if (minbytes < trie->minlen) {
- trie->minlen = minbytes;
- } else if (maxbytes > trie->maxlen) {
- trie->maxlen = maxbytes;
+ trie->minlen = minchars;
+ trie->maxlen = maxchars;
+ } else if (minchars < trie->minlen) {
+ trie->minlen = minchars;
+ } else if (maxchars > trie->maxlen) {
+ trie->maxlen = maxchars;
}
} /* end first pass */
DEBUG_TRIE_COMPILE_r(
: MADE_TRIE;
}
-STATIC void
-S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
+STATIC regnode *
+S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
{
/* The Trie is constructed and compressed now so we can build a fail array if
* it's needed
U32 *fail;
reg_ac_data *aho;
const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
+ regnode *stclass;
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
+ PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
+ PERL_UNUSED_CONTEXT;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
+ if ( OP(source) == TRIE ) {
+ struct regnode_1 *op = (struct regnode_1 *)
+ PerlMemShared_calloc(1, sizeof(struct regnode_1));
+ StructCopy(source,op,struct regnode_1);
+ stclass = (regnode *)op;
+ } else {
+ struct regnode_charclass *op = (struct regnode_charclass *)
+ PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
+ StructCopy(source,op,struct regnode_charclass);
+ stclass = (regnode *)op;
+ }
+ OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
ARG_SET( stclass, data_slot );
aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
});
Safefree(q);
/*RExC_seen |= REG_TRIEDFA_SEEN;*/
+ return stclass;
}
DEBUG_OPTIMISE_r({if (scan){ \
SV * const mysv=sv_newmortal(); \
regnode *Next = regnext(scan); \
- regprop(RExC_rx, mysv, scan); \
+ regprop(RExC_rx, mysv, scan, NULL); \
PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
(int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
Next ? (REG_NODE_NUM(Next)) : 0 ); \
length sequence we are looking for is 2 */
{
int count = 0; /* How many characters in a multi-char fold */
- int len = is_MULTI_CHAR_FOLD_utf8(s);
+ int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
if (! len) { /* Not a multi-char fold: get next char */
s += UTF8SKIP(s);
continue;
else { /* Here is a generic multi-char fold. */
U8* multi_end = s + len;
- /* Count how many characters in it. In the case of /aa, no
- * folds which contain ASCII code points are allowed, so
- * check for those, and skip if found. */
+ /* Count how many characters are in it. In the case of
+ * /aa, no folds which contain ASCII code points are
+ * allowed, so check for those, and skip if found. */
if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
count = utf8_length(s, multi_end);
s = multi_end;
: s_end -1;
while (s < upper) {
- int len = is_MULTI_CHAR_FOLD_latin1(s);
+ int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
if (! len) { /* Not a multi-char fold. */
if (*s == LATIN_SMALL_LETTER_SHARP_S
&& (OP(scan) == EXACTF || OP(scan) == EXACTFL))
}
if (len == 2
- && isARG2_lower_or_UPPER_ARG1('s', *s)
- && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
+ && isALPHA_FOLD_EQ(*s, 's')
+ && isALPHA_FOLD_EQ(*(s+1), 's'))
{
/* EXACTF nodes need to know that the minimum length
/* recursed: which subroutines have we recursed into */
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
- dVAR;
/* There must be at least this number of characters to match */
SSize_t min = 0;
I32 pars = 0, code;
DEBUG_PEEP("Peep", scan, depth);
- /* Its not clear to khw or hv why this is done here, and not in the
- * clauses that deal with EXACT nodes. khw's guess is that it's
- * because of a previous design */
+ /* The reason we do this here we need to deal with things like /(?:f)(?:o)(?:o)/
+ * which cant be dealt with by the normal EXACT parsing code, as each (?:..) is handled
+ * by a different invocation of reg() -- Yves
+ */
JOIN_EXACT(scan,&min_subtract, &unfolded_multi_char, 0);
/* Follow the next-chain of the current node and optimize
}
}
- if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch )
- == BRANCH )
+ if (PERL_ENABLE_TRIE_OPTIMISATION &&
+ OP( startbranch ) == BRANCH )
{
/* demq.
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, tail );
+ regprop(RExC_rx, mysv, tail, NULL);
PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
(int)depth * 2 + 2, "",
"Looking for TRIE'able sequences. Tail node is: ",
#endif
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, cur);
+ regprop(RExC_rx, mysv, cur, NULL);
PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
(int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
- regprop(RExC_rx, mysv, noper);
+ regprop(RExC_rx, mysv, noper, NULL);
PerlIO_printf( Perl_debug_log, " -> %s",
SvPV_nolen_const(mysv));
if ( noper_next ) {
- regprop(RExC_rx, mysv, noper_next );
+ regprop(RExC_rx, mysv, noper_next, NULL);
PerlIO_printf( Perl_debug_log,"\t=> %s\t",
SvPV_nolen_const(mysv));
}
} /* end handle unmergable node */
} /* loop over branches */
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, cur);
+ regprop(RExC_rx, mysv, cur, NULL);
PerlIO_printf( Perl_debug_log,
"%*s- %s (%d) <SCAN FINISHED>\n",
(int)depth * 2 + 2,
* something like this: (?:|) So we can
* turn it into a plain NOTHING op. */
DEBUG_TRIE_COMPILE_r({
- regprop(RExC_rx, mysv, cur);
+ regprop(RExC_rx, mysv, cur, NULL);
PerlIO_printf( Perl_debug_log,
"%*s- %s (%d) <NOTHING BRANCH SEQUENCE>\n", (int)depth * 2 + 2,
"", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
}
flags &= ~SCF_DO_STCLASS;
}
- else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
+ else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
+ EXACTFish */
SSize_t l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
separate code points */
+ const U8 * s = (U8*)STRING(scan);
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
}
if (UTF) {
- const U8 * const s = (U8 *)STRING(scan);
uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
data->longest = &(data->longest_float);
}
}
- if (OP(scan) == EXACTFL) {
- if (flags & SCF_DO_STCLASS_AND) {
- ssc_flags_and(data->start_class, ANYOF_LOCALE);
- }
- else if (flags & SCF_DO_STCLASS_OR) {
- ANYOF_FLAGS(data->start_class) |= ANYOF_LOCALE;
- }
- /* We don't know what the folds are; it could be anything. XXX
- * Actually, we only support UTF-8 encoding for code points
- * above Latin1, so we could know what those folds are. */
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0,
- UV_MAX);
+ if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
+ ssc_clear_locale(data->start_class);
}
- else { /* Non-locale EXACTFish */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
- if (flags & SCF_DO_STCLASS_AND) {
- ssc_clear_locale(data->start_class);
+
+ if (! UTF) {
+
+ /* We punt and assume can match anything if the node begins
+ * with a multi-character fold. Things are complicated. For
+ * example, /ffi/i could match any of:
+ * "\N{LATIN SMALL LIGATURE FFI}"
+ * "\N{LATIN SMALL LIGATURE FF}I"
+ * "F\N{LATIN SMALL LIGATURE FI}"
+ * plus several other things; and making sure we have all the
+ * possibilities is hard. */
+ if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
}
- if (uc < 256) { /* We know what the Latin1 folds are ... */
- if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
- know if anything folds
- with this */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
+ else {
+
+ /* Any Latin1 range character can potentially match any
+ * other depending on the locale */
+ if (OP(scan) == EXACTFL) {
+ _invlist_union(EXACTF_invlist, PL_Latin1,
+ &EXACTF_invlist);
+ }
+ else {
+ /* But otherwise, it matches at least itself. We can
+ * quickly tell if it has a distinct fold, and if so,
+ * it matches that as well */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (IS_IN_SOME_FOLD_L1(uc)) {
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
PL_fold_latin1[uc]);
- if (OP(scan) != EXACTFA) { /* The folds below aren't
- legal under /iaa */
- if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist,
- LATIN_SMALL_LETTER_SHARP_S);
- }
- else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 's');
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 'S');
- }
}
+ }
- /* We also know if there are above-Latin1 code points
- * that fold to this (none legal for ASCII and /iaa) */
- if ((! isASCII(uc) || OP(scan) != EXACTFA)
- && HAS_NONLATIN1_FOLD_CLOSURE(uc))
- {
- /* XXX We could know exactly what does fold to this
- * if the reverse folds are loaded, as currently in
- * S_regclass() */
- _invlist_union(EXACTF_invlist,
- PL_AboveLatin1,
- &EXACTF_invlist);
+ /* Some characters match above-Latin1 ones under /i. This
+ * is true of EXACTFL ones when the locale is UTF-8 */
+ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
+ && (! isASCII(uc) || (OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)))
+ {
+ add_above_Latin1_folds(pRExC_state,
+ (U8) uc,
+ &EXACTF_invlist);
+ }
+ }
+ }
+ else { /* Pattern is UTF-8 */
+ U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
+ STRLEN foldlen = UTF8SKIP(s);
+ const U8* e = s + STR_LEN(scan);
+ SV** listp;
+
+ /* The only code points that aren't folded in a UTF EXACTFish
+ * node are are the problematic ones in EXACTFL nodes */
+ if (OP(scan) == EXACTFL
+ && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
+ {
+ /* We need to check for the possibility that this EXACTFL
+ * node begins with a multi-char fold. Therefore we fold
+ * the first few characters of it so that we can make that
+ * check */
+ U8 *d = folded;
+ int i;
+
+ for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
+ if (isASCII(*s)) {
+ *(d++) = (U8) toFOLD(*s);
+ s++;
+ }
+ else {
+ STRLEN len;
+ to_utf8_fold(s, d, &len);
+ d += len;
+ s += UTF8SKIP(s);
}
}
+
+ /* And set up so the code below that looks in this folded
+ * buffer instead of the node's string */
+ e = d;
+ foldlen = UTF8SKIP(folded);
+ s = folded;
}
- else { /* Non-locale, above Latin1. XXX We don't currently
- know what participates in folds with this, so have
- to assume anything could */
-
- /* XXX We could know exactly what does fold to this if the
- * reverse folds are loaded, as currently in S_regclass().
- * But we do know that under /iaa nothing in the ASCII
- * range can participate */
- if (OP(scan) == EXACTFA) {
- _invlist_union_complement_2nd(EXACTF_invlist,
- PL_XPosix_ptrs[_CC_ASCII],
- &EXACTF_invlist);
+
+ /* When we reach here 's' points to the fold of the first
+ * character(s) of the node; and 'e' points to far enough along
+ * the folded string to be just past any possible multi-char
+ * fold. 'foldlen' is the length in bytes of the first
+ * character in 's'
+ *
+ * Unlike the non-UTF-8 case, the macro for determining if a
+ * string is a multi-char fold requires all the characters to
+ * already be folded. This is because of all the complications
+ * if not. Note that they are folded anyway, except in EXACTFL
+ * nodes. Like the non-UTF case above, we punt if the node
+ * begins with a multi-char fold */
+
+ if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
+ }
+ else { /* Single char fold */
+
+ /* It matches all the things that fold to it, which are
+ * found in PL_utf8_foldclosures (including itself) */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (! PL_utf8_foldclosures) {
+ _load_PL_utf8_foldclosures();
}
- else {
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0, UV_MAX);
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) s, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_tindex(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ assert(c_p);
+
+ c = SvUV(*c_p);
+
+ /* /aa doesn't allow folds between ASCII and non- */
+ if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
+ && isASCII(c) != isASCII(uc))
+ {
+ continue;
+ }
+
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
+ }
}
}
}
if (flags & SCF_DO_SUBSTR)
data->pos_min++;
min++;
- /* Fall through. */
+ /* FALLTHROUGH */
case STAR:
if (flags & SCF_DO_STCLASS) {
mincount = 0;
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
- " SSize_t_MAX=%"UVdf" minnext=%"UVdf
- " maxcount=%"UVdf" mincount=%"UVdf"\n",
+PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
+ " SSize_t_MAX=%"UVuf" minnext=%"UVuf
+ " maxcount=%"UVuf" mincount=%"UVuf"\n",
(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
(UV)mincount);
if (deltanext != SSize_t_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
case NPOSIXL:
invert = 1;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case POSIXL:
namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
ssc_match_all_cp(data->start_class);
ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
ANYOF_POSIXL_CLEAR(data->start_class, complement);
- if (! ANYOF_POSIXL_TEST_ANY_SET(data->start_class))
- {
- ANYOF_FLAGS(data->start_class) &= ~ANYOF_POSIXL;
- }
}
else { /* The usual case; just add this class to the
existing set */
ANYOF_POSIXL_SET(data->start_class, namedclass);
- ANYOF_FLAGS(data->start_class)
- |= ANYOF_LOCALE|ANYOF_POSIXL;
}
}
break;
case NPOSIXA: /* For these, we always know the exact set of
what's matched */
invert = 1;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case POSIXA:
if (FLAGS(scan) == _CC_ASCII) {
my_invlist = PL_XPosix_ptrs[_CC_ASCII];
case NPOSIXD:
case NPOSIXU:
invert = 1;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case POSIXD:
case POSIXU:
my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)]);
else if ( PL_regkind[OP(scan)] == BRANCHJ
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
- && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
+ && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
+ {
if ( OP(scan) == UNLESSM &&
scan->flags == 0 &&
OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
DEBUG_STUDYDATA("OPFAIL",data,depth);
/*DEBUG_PARSE_MSG("opfail");*/
- regprop(RExC_rx, mysv_val, upto);
+ regprop(RExC_rx, mysv_val, upto, NULL);
PerlIO_printf(Perl_debug_log,
"~ replace with OPFAIL pointed at %s (%"IVdf") offset %"IVdf"\n",
SvPV_nolen_const(mysv_val),
*/
ssc_init(pRExC_state, data->start_class);
} else {
- /* AND before and after: combine and continue */
+ /* AND before and after: combine and continue. These
+ * assertions are zero-length, so can match an EMPTY
+ * string */
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
}
}
if (f & SCF_DO_STCLASS_AND) {
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
return count;
}
-/*XXX: todo make this not included in a non debugging perl */
+/*XXX: todo make this not included in a non debugging perl, but appears to be
+ * used anyway there, in 'use re' */
#ifndef PERL_IN_XSUB_RE
void
Perl_reginitcolors(pTHX)
{
- dVAR;
const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
if (s) {
char *t = savepv(s);
regexp_engine const *
Perl_current_re_engine(pTHX)
{
- dVAR;
-
if (IN_PERL_COMPILETIME) {
HV * const table = GvHV(PL_hintgv);
SV **ptr;
REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
{
- dVAR;
regexp_engine const *eng = current_re_engine();
GET_RE_DEBUG_FLAGS_DECL;
/* if we know we have at least two args, create an empty string,
* then concatenate args to that. For no args, return an empty string */
if (!pat && pat_count != 1) {
- pat = newSVpvn("", 0);
+ pat = newSVpvs("");
SAVEFREESV(pat);
alloced = TRUE;
}
if (oplist) {
assert(oplist->op_type == OP_PADAV
|| oplist->op_type == OP_RV2AV);
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);
}
if (SvRMAGICAL(av)) {
pRExC_state->code_blocks[n].src_regex = NULL;
n++;
code = 1;
- oplist = oplist->op_sibling; /* skip CONST */
+ oplist = OP_SIBLING(oplist); /* skip CONST */
assert(oplist);
}
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);;
}
/* apply magic and QR overloading to arg */
{
int n = 0;
STRLEN s;
+
+ PERL_UNUSED_CONTEXT;
for (s = 0; s < plen; s++) {
if (n < pRExC_state->num_code_blocks
OP *expr, const regexp_engine* eng, REGEXP *old_re,
bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
{
- dVAR;
REGEXP *rx;
struct regexp *r;
regexp_internal *ri;
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
ncode++; /* count of DO blocks */
if (ncode) {
if (expr->op_type == OP_CONST)
n = 1;
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
n++;
}
if (expr->op_type == OP_CONST)
new_patternp[n] = cSVOPx_sv(expr);
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
new_patternp[n++] = cSVOPo_sv;
}
assert( expr->op_type == OP_PUSHMARK
|| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
|| expr->op_type == OP_PADRANGE);
- expr = expr->op_sibling;
+ expr = OP_SIBLING(expr);
}
pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
if (rx_flags & PMf_FOLD) {
RExC_contains_i = 1;
}
- if (initial_charset == REGEX_LOCALE_CHARSET) {
- RExC_contains_locale = 1;
- }
- else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
+ if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
/* Set to use unicode semantics if the pattern is in utf8 and has the
* 'depends' charset specified, as it means unicode when utf8 */
else if (PL_regkind[OP(first)] == TRIE &&
((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
{
- regnode *trie_op;
- /* this can happen only on restudy */
- if ( OP(first) == TRIE ) {
- struct regnode_1 *trieop = (struct regnode_1 *)
- PerlMemShared_calloc(1, sizeof(struct regnode_1));
- StructCopy(first,trieop,struct regnode_1);
- trie_op=(regnode *)trieop;
- } else {
- struct regnode_charclass *trieop = (struct regnode_charclass *)
- PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
- StructCopy(first,trieop,struct regnode_charclass);
- trie_op=(regnode *)trieop;
- }
- OP(trie_op)+=2;
- make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
- ri->regstclass = trie_op;
+ /* this can happen only on restudy */
+ ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
}
#endif
else if (REGNODE_SIMPLE(OP(first)))
goto again;
}
else if ((!sawopen || !RExC_sawback) &&
+ !sawlookahead &&
(OP(first) == STAR &&
PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
!(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
ri->regstclass = (regnode*)RExC_rxi->data->data[n];
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
- regprop(r, sv, (regnode*)data.start_class);
+ regprop(r, sv, (regnode*)data.start_class, NULL);
PerlIO_printf(Perl_debug_log,
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
ri->regstclass = (regnode*)RExC_rxi->data->data[n];
r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
- regprop(r, sv, (regnode*)data.start_class);
+ regprop(r, sv, (regnode*)data.start_class, NULL);
PerlIO_printf(Perl_debug_log,
"synthetic stclass \"%s\".\n",
SvPVX_const(sv));});
if (r->intflags & PREGf_ANCH)
r->extflags |= RXf_IS_ANCHORED;
+
{
+ /* this is used to identify "special" patterns that might result
+ * in Perl NOT calling the regex engine and instead doing the match "itself",
+ * particularly special cases in split//. By having the regex compiler
+ * do this pattern matching at a regop level (instead of by inspecting the pattern)
+ * we avoid weird issues with equivalent patterns resulting in different behavior,
+ * AND we allow non Perl engines to get the same optimizations by the setting the
+ * flags appropriately - Yves */
regnode *first = ri->program + 1;
U8 fop = OP(first);
regnode *next = NEXTOPER(first);
r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
}
+
+ if (RExC_contains_locale) {
+ RXp_EXTFLAGS(r) |= RXf_TAINTED;
+ }
+
#ifdef DEBUGGING
if (RExC_paren_names) {
ri->name_list_idx = add_data( pRExC_state, STR_WITH_LEN("a"));
} else if (flags & RXapif_ONE) {
ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
av = MUTABLE_AV(SvRV(ret));
- length = av_len(av);
+ length = av_tindex(av);
SvREFCNT_dec_NN(ret);
return newSViv(length + 1);
} else {
/* The header definitions are in F<inline_invlist.c> */
PERL_STATIC_INLINE UV*
-S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
+S__invlist_array_init(SV* const invlist, const bool will_have_0)
{
/* Returns a pointer to the first element in the inversion list's array.
* This is called upon initialization of an inversion list. Where the
}
PERL_STATIC_INLINE UV*
-S_invlist_array(pTHX_ SV* const invlist)
+S_invlist_array(SV* const invlist)
{
/* Returns the pointer to the inversion list's array. Every time the
* length changes, this needs to be called in case malloc or realloc moved
{
/* Sets the current number of elements stored in the inversion list.
* Updates SvCUR correspondingly */
-
+ PERL_UNUSED_CONTEXT;
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
assert(SvTYPE(invlist) == SVt_INVLIST);
}
PERL_STATIC_INLINE IV*
-S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
+S_get_invlist_previous_index_addr(SV* invlist)
{
/* Return the address of the IV that is reserved to hold the cached index
* */
-
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
assert(SvTYPE(invlist) == SVt_INVLIST);
}
PERL_STATIC_INLINE IV
-S_invlist_previous_index(pTHX_ SV* const invlist)
+S_invlist_previous_index(SV* const invlist)
{
/* Returns cached index of previous search */
}
PERL_STATIC_INLINE void
-S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
+S_invlist_set_previous_index(SV* const invlist, const IV index)
{
/* Caches <index> for later retrieval */
}
PERL_STATIC_INLINE UV
-S_invlist_max(pTHX_ SV* const invlist)
+S_invlist_max(SV* const invlist)
{
/* Returns the maximum number of elements storable in the inversion list's
* array, without having to realloc() */
}
PERL_STATIC_INLINE void
-S_invlist_trim(pTHX_ SV* const invlist)
+S_invlist_trim(SV* const invlist)
{
PERL_ARGS_ASSERT_INVLIST_TRIM;
#ifndef PERL_IN_XSUB_RE
IV
-Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
+Perl__invlist_search(SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
}
void
-Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
- const UV start, const UV end, U8* swatch)
+Perl__invlist_populate_swatch(SV* const invlist,
+ const UV start, const UV end, U8* swatch)
{
/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
* but is used when the swash has an inversion list. This makes this much
}
PERL_STATIC_INLINE STRLEN*
-S_get_invlist_iter_addr(pTHX_ SV* invlist)
+S_get_invlist_iter_addr(SV* invlist)
{
/* Return the address of the UV that contains the current iteration
* position */
}
PERL_STATIC_INLINE void
-S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
+S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
{
PERL_ARGS_ASSERT_INVLIST_ITERINIT;
}
PERL_STATIC_INLINE void
-S_invlist_iterfinish(pTHX_ SV* invlist)
+S_invlist_iterfinish(SV* invlist)
{
/* Terminate iterator for invlist. This is to catch development errors.
* Any iteration that is interrupted before completed should call this
}
STATIC bool
-S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
+S_invlist_iternext(SV* invlist, UV* start, UV* end)
{
/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
* This call sets in <*start> and <*end>, the next range in <invlist>.
}
PERL_STATIC_INLINE bool
-S_invlist_is_iterating(pTHX_ SV* const invlist)
+S_invlist_is_iterating(SV* const invlist)
{
PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
}
PERL_STATIC_INLINE UV
-S_invlist_highest(pTHX_ SV* const invlist)
+S_invlist_highest(SV* const invlist)
{
/* Returns the highest code point that matches an inversion list. This API
* has an ambiguity, as it returns 0 under either the highest is actually
count += 2;
}
}
+
+void
+Perl__load_PL_utf8_foldclosures (pTHX)
+{
+ assert(! PL_utf8_foldclosures);
+
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
+
+ /* This string is just a short named one above \xff */
+ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+ assert(PL_utf8_tofold); /* Verify that worked */
+ }
+ PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+}
#endif
#ifdef PERL_ARGS_ASSERT__INVLISTEQ
}
cs = REGEX_LOCALE_CHARSET;
has_charset_modifier = LOCALE_PAT_MOD;
- RExC_contains_locale = 1;
break;
case UNICODE_PAT_MOD:
if (has_charset_modifier) {
* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
* this flag alerts us to the need to check for that */
{
- dVAR;
regnode *ret; /* Will be the head of the group. */
regnode *br;
regnode *lastbr;
bool is_open = 0;
I32 freeze_paren = 0;
I32 after_freeze = 0;
+ I32 num; /* numeric backreferences */
char * parse_start = RExC_parse; /* MJD */
char * const oregcomp_parse = RExC_parse;
int internal_argval = 0; /* internal_argval is only useful if
!argok */
- if (has_intervening_patws && SIZE_ONLY) {
- ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+ if (has_intervening_patws) {
+ RExC_parse++;
+ vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
}
while ( *RExC_parse && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
- if (has_intervening_patws && SIZE_ONLY) {
- ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+ const char * endptr;
+ if (has_intervening_patws) {
+ RExC_parse++;
+ vFAIL("In '(?...)', the '(' and '?' must be adjacent");
}
RExC_parse++;
RExC_seen |= REG_LOOKBEHIND_SEEN;
RExC_in_lookbehind++;
RExC_parse++;
+ /* FALLTHROUGH */
case '=': /* (?=...) */
RExC_seen_zerolen++;
break;
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#': /* (?#...) */
- /* XXX As soon as we disallow separating the '?' and '*' (by
- * spaces or (?#...) comment), it is believed that this case
- * will be unreachable and can be removed. See
- * [perl #117327] */
- while (*RExC_parse && *RExC_parse != ')')
- RExC_parse++;
- if (*RExC_parse != ')')
- FAIL("Sequence (?#... not terminated");
- nextchar(pRExC_state);
- *flagp = TRYAGAIN;
- return NULL;
case '0' : /* (?0) */
case 'R' : /* (?R) */
if (*RExC_parse != ')')
nextchar(pRExC_state);
return ret;
/*notreached*/
- { /* named and numeric backreferences */
- I32 num;
+ /* named and numeric backreferences */
case '&': /* (?&NAME) */
parse_start = RExC_parse - 1;
named_recursion:
RExC_parse--; /* rewind to let it be handled later */
goto parse_flags;
}
- /*FALLTHROUGH */
+ /* FALLTHROUGH */
case '1': case '2': case '3': case '4': /* (?1) */
case '5': case '6': case '7': case '8': case '9':
RExC_parse--;
parse_recursion:
- num = atoi(RExC_parse);
- parse_start = RExC_parse - 1; /* MJD */
- if (*RExC_parse == '-')
- RExC_parse++;
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ {
+ bool is_neg = FALSE;
+ parse_start = RExC_parse - 1; /* MJD */
+ if (*RExC_parse == '-') {
+ RExC_parse++;
+ is_neg = TRUE;
+ }
+ num = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
+ if (is_neg) {
+ /* Some limit for num? */
+ num = -num;
+ }
+ }
if (*RExC_parse!=')')
vFAIL("Expecting close bracket");
*flagp |= POSTPONED;
nextchar(pRExC_state);
return ret;
- } /* named and numeric backreferences */
+
assert(0); /* NOT REACHED */
case '?': /* (??...) */
}
*flagp |= POSTPONED;
paren = *RExC_parse++;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case '{': /* (?{...}) */
{
U32 n = 0;
REGTAIL(pRExC_state, ret, tail);
goto insert_if;
}
+ /* Fall through to ‘Unknown switch condition’ at the
+ end of the if/else chain. */
}
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
RExC_parse++;
parno = 0;
if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
- parno = atoi(RExC_parse++);
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
} else if (RExC_parse[0] == '&') {
SV *sv_dat;
RExC_parse++;
/* (?(1)...) */
char c;
char *tmp;
- parno = atoi(RExC_parse++);
-
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
but I can't figure out why. -- dmq*/
return ret;
}
- else {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- vFAIL("Unknown switch condition (?(...))");
- }
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
return handle_regex_sets(pRExC_state, NULL, flagp, depth,
case '=':
case '!':
*flagp &= ~HASWIDTH;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
case '>':
ender = reg_node(pRExC_state, SUCCEED);
break;
SV * const mysv_val1=sv_newmortal();
SV * const mysv_val2=sv_newmortal();
DEBUG_PARSE_MSG("lsbr");
- regprop(RExC_rx, mysv_val1, lastbr);
- regprop(RExC_rx, mysv_val2, ender);
+ regprop(RExC_rx, mysv_val1, lastbr, NULL);
+ regprop(RExC_rx, mysv_val2, ender, NULL);
PerlIO_printf(Perl_debug_log, "~ tying lastbr %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
SvPV_nolen_const(mysv_val1),
(IV)REG_NODE_NUM(lastbr),
SV * const mysv_val1=sv_newmortal();
SV * const mysv_val2=sv_newmortal();
DEBUG_PARSE_MSG("NADA");
- regprop(RExC_rx, mysv_val1, ret);
- regprop(RExC_rx, mysv_val2, ender);
+ regprop(RExC_rx, mysv_val1, ret, NULL);
+ regprop(RExC_rx, mysv_val2, ender, NULL);
PerlIO_printf(Perl_debug_log, "~ converting ret %s (%"IVdf") to ender %s (%"IVdf") offset %"IVdf"\n",
SvPV_nolen_const(mysv_val1),
(IV)REG_NODE_NUM(ret),
STATIC regnode *
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
{
- dVAR;
regnode *ret;
regnode *chain = NULL;
regnode *latest;
STATIC regnode *
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- dVAR;
regnode *ret;
char op;
char *next;
op = *RExC_parse;
- if (op == '{' && regcurly(RExC_parse, FALSE)) {
+ if (op == '{' && regcurly(RExC_parse)) {
maxpos = NULL;
#ifdef RE_TRACK_PATTERN_OFFSETS
parse_start = RExC_parse; /* MJD */
next++;
}
if (*next == '}') { /* got one */
+ const char* endptr;
if (!maxpos)
maxpos = next;
RExC_parse++;
- min = atoi(RExC_parse);
+ min = grok_atou(RExC_parse, &endptr);
if (*maxpos == ',')
maxpos++;
else
maxpos = RExC_parse;
- max = atoi(maxpos);
+ max = grok_atou(maxpos, &endptr);
if (!max && *maxpos != '0')
max = REG_INFTY; /* meaning "infinity" */
else if (max >= REG_INFTY)
* modifier. The other meaning does not, so use a temporary until we find
* out which we are being called with */
p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regwhite( pRExC_state, RExC_parse )
+ ? regpatws(pRExC_state, RExC_parse,
+ TRUE) /* means recognize comments */
: RExC_parse;
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The former is assumed when it can't be the latter. */
- if (*p != '{' || regcurly(p, FALSE)) {
+ if (*p != '{' || regcurly(p)) {
RExC_parse = p;
if (! node_p) {
/* no bare \N allowed in a charclass */
}
PERL_STATIC_INLINE U8
-S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
+S_compute_EXACTish(RExC_state_t *pRExC_state)
{
U8 op;
PERL_STATIC_INLINE void
S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state,
- regnode *node, I32* flagp, STRLEN len, UV code_point)
+ regnode *node, I32* flagp, STRLEN len, UV code_point,
+ bool downgradable)
{
/* This knows the details about sizing an EXACTish node, setting flags for
* it (by setting <*flagp>, and potentially populating it with a single
*
* It knows that under FOLD, the Latin Sharp S and UTF characters above
* 255, must be folded (the former only when the rules indicate it can
- * match 'ss') */
+ * match 'ss')
+ *
+ * When it does the populating, it looks at the flag 'downgradable'. If
+ * true with a node that folds, it checks if the single code point
+ * participates in a fold, and if not downgrades the node to an EXACT.
+ * This helps the optimizer */
bool len_passed_in = cBOOL(len != 0);
U8 character[UTF8_MAXBYTES_CASE+1];
PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
+ /* Don't bother to check for downgrading in PASS1, as it doesn't make any
+ * sizing difference, and is extra work that is thrown away */
+ if (downgradable && ! PASS2) {
+ downgradable = FALSE;
+ }
+
if (! len_passed_in) {
if (UTF) {
if (UNI_IS_INVARIANT(code_point)) {
EBCDIC, but it works there, as the extra invariants
fold to themselves) */
*character = toFOLD((U8) code_point);
+ if (downgradable
+ && *character == code_point
+ && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
+ {
+ OP(node) = EXACT;
+ }
}
len = 1;
}
else if (FOLD && (! LOC
|| ! is_PROBLEMATIC_LOCALE_FOLD_cp(code_point)))
{ /* Folding, and ok to do so now */
- _to_uni_fold_flags(code_point,
+ UV folded = _to_uni_fold_flags(
+ code_point,
character,
&len,
FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
? FOLD_FLAGS_NOMIX_ASCII
: 0));
+ if (downgradable
+ && folded == code_point
+ && ! _invlist_contains_cp(PL_utf8_foldable, code_point))
+ {
+ OP(node) = EXACT;
+ }
}
else if (code_point <= MAX_UTF8_TWO_BYTE) {
uvchr_to_utf8( character, code_point);
len = UTF8SKIP(character);
}
- } /* Else pattern isn't UTF8. We only fold the sharp s, when
- appropriate */
- else if (UNLIKELY(code_point == LATIN_SMALL_LETTER_SHARP_S)
- && FOLD
- && AT_LEAST_UNI_SEMANTICS
- && ! ASCII_FOLD_RESTRICTED)
- {
+ } /* Else pattern isn't UTF8. */
+ else if (! FOLD) {
+ *character = (U8) code_point;
+ len = 1;
+ } /* Else is folded non-UTF8 */
+ else if (LIKELY(code_point != LATIN_SMALL_LETTER_SHARP_S)) {
+
+ /* We don't fold any non-UTF8 except possibly the Sharp s (see
+ * comments at join_exact()); */
+ *character = (U8) code_point;
+ len = 1;
+
+ /* Can turn into an EXACT node if we know the fold at compile time,
+ * and it folds to itself and doesn't particpate in other folds */
+ if (downgradable
+ && ! LOC
+ && PL_fold_latin1[code_point] == code_point
+ && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
+ || (isASCII(code_point) && ASCII_FOLD_RESTRICTED)))
+ {
+ OP(node) = EXACT;
+ }
+ } /* else is Sharp s. May need to fold it */
+ else if (AT_LEAST_UNI_SEMANTICS && ! ASCII_FOLD_RESTRICTED) {
*character = 's';
*(character + 1) = 's';
len = 2;
}
else {
- *character = (U8) code_point;
+ *character = LATIN_SMALL_LETTER_SHARP_S;
len = 1;
}
}
{
*flagp |= SIMPLE;
}
+
+ /* The OP may not be well defined in PASS1 */
+ if (PASS2 && OP(node) == EXACTFL) {
+ RExC_contains_locale = 1;
+ }
}
-/* return atoi(p), unless it's too big to sensibly be a backref,
+/* Parse backref decimal value, unless it's too big to sensibly be a backref,
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
static I32
S_backref_value(char *p)
{
- char *q = p;
-
- for (;isDIGIT(*q); q++); /* calculate length of num */
- if (q - p == 0 || q - p > 9)
+ const char* endptr;
+ UV val = grok_atou(p, &endptr);
+ if (endptr == p || endptr == NULL || val > I32_MAX)
return I32_MAX;
- return atoi(p);
+ return (I32)val;
}
STATIC regnode *
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- dVAR;
regnode *ret = NULL;
I32 flags = 0;
char *parse_start = RExC_parse;
U8 op;
int invert = 0;
+ U8 arg;
GET_RE_DEBUG_FLAGS_DECL;
vFAIL("Internal urp");
/* Supposed to be caught earlier. */
break;
- case '{':
- if (!regcurly(RExC_parse, FALSE)) {
- RExC_parse++;
- goto defchar;
- }
- /* FALL THROUGH */
case '?':
case '+':
case '*':
literal text handling code.
*/
switch ((U8)*++RExC_parse) {
- U8 arg;
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, CANY);
RExC_seen |= REG_CANY_SEEN;
*flagp |= HASWIDTH|SIMPLE;
+ if (SIZE_ONLY) {
+ ckWARNdep(RExC_parse+1, "\\C is deprecated");
+ }
goto finish_meta_pat;
case 'X':
ret = reg_node(pRExC_state, CLUMP);
if (op > BOUNDA) { /* /aa is same as /a */
op = BOUNDA;
}
+ else if (op == BOUNDL) {
+ RExC_contains_locale = 1;
+ }
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if (op > NBOUNDA) { /* /aa is same as /a */
op = NBOUNDA;
}
+ else if (op == NBOUNDL) {
+ RExC_contains_locale = 1;
+ }
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
if (op > POSIXA) { /* /aa is same as /a */
op = POSIXA;
}
+ else if (op == POSIXL) {
+ RExC_contains_locale = 1;
+ }
join_posix_op_known:
}
*flagp |= HASWIDTH|SIMPLE;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
finish_meta_pat:
nextchar(pRExC_state);
}
else {
num = S_backref_value(RExC_parse);
- /* bare \NNN might be backref or octal */
+ /* bare \NNN might be backref or octal - if it is larger than or equal
+ * RExC_npar then it is assumed to be and octal escape.
+ * Note RExC_npar is +1 from the actual number of parens*/
if (num == I32_MAX || (num > 9 && num >= RExC_npar
&& *RExC_parse != '8' && *RExC_parse != '9'))
+ {
/* Probably a character specified in octal, e.g. \35 */
goto defchar;
+ }
}
/* at this point RExC_parse definitely points to a backref
case '\0':
if (RExC_parse >= RExC_end)
FAIL("Trailing \\");
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
/* Do not generate "unrecognized" warnings here, we fall
back into the quick-grab loop below */
case '#':
if (RExC_flags & RXf_PMf_EXTENDED) {
- if ( reg_skipcomment( pRExC_state ) )
+ RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+ if (RExC_parse < RExC_end)
goto tryagain;
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
switch ((U8)*p) {
case '^':
case '$':
p++;
break;
case 'e':
- ender = ASCII_TO_NATIVE('\033');
+ ender = ESC_NATIVE;
p++;
break;
case 'a':
- ender = '\a';
+ ender = '\a';
p++;
break;
case 'o':
* 118 OR as "\11" . "8" depending on whether there
* were 118 capture buffers defined already in the
* pattern. */
- if ( !isDIGIT(p[1]) || S_backref_value(p) <= RExC_npar)
+
+ /* NOTE, RExC_npar is 1 more than the actual number of
+ * parens we have seen so far, hence the < RExC_npar below. */
+
+ if ( !isDIGIT(p[1]) || S_backref_value(p) < RExC_npar)
{ /* Not to be treated as an octal constant, go
find backref */
--p;
goto loopdone;
}
+ /* FALLTHROUGH */
case '0':
{
I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
case '\0':
if (p >= RExC_end)
FAIL("Trailing \\");
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
if (!SIZE_ONLY&& isALPHANUMERIC(*p)) {
/* Include any { following the alpha to emphasize
goto normal_default;
} /* End of switch on '\' */
break;
+ case '{':
+ /* Currently we don't warn when the lbrace is at the start
+ * of a construct. This catches it in the middle of a
+ * literal string, or when its the first thing after
+ * something like "\b" */
+ if (! SIZE_ONLY
+ && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
+ {
+ ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+ }
+ /*FALLTHROUGH*/
default: /* A literal character */
-
- if (! SIZE_ONLY
- && RExC_flags & RXf_PMf_EXTENDED
- && ckWARN_d(WARN_DEPRECATED)
- && is_PATWS_non_low(p, UTF))
- {
- vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
- "Escape literal pattern white space under /x");
- }
-
normal_default:
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
*/
if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
&& (PL_fold[ender] != PL_fold_latin1[ender]
|| ender == LATIN_SMALL_LETTER_SHARP_S
|| (len > 0
- && isARG2_lower_or_UPPER_ARG1('s', ender)
- && isARG2_lower_or_UPPER_ARG1('s',
- *(s-1)))))
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))))
{
maybe_exactfu = FALSE;
}
* as if it turns into an EXACTFU, it could later get
* joined with another 's' that would then wrongly match
* the sharp s */
- if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+ if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
{
maybe_exactfu = FALSE;
}
OP(ret) = EXACTFU;
}
}
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender,
+ FALSE /* Don't look to see if could
+ be turned into an EXACT
+ node, as we have already
+ computed that */
+ );
}
RExC_parse = p - 1;
}
STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGWHITE;
-
- while (p < e) {
- if (isSPACE(*p))
- ++p;
- else if (*p == '#') {
- bool ended = 0;
- do {
- if (*p++ == '\n') {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- }
- else
- break;
- }
- return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
{
/* Returns the next non-pattern-white space, non-comment character (the
* latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. If there is no line break ending a comment,
- * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+ * ended by RExC_end. See also reg_skipcomment */
const char *e = RExC_end;
PERL_ARGS_ASSERT_REGPATWS;
p += len;
}
else if (recognize_comment && *p == '#') {
- bool ended = 0;
- do {
- p++;
- if (is_LNBREAK_safe(p, e, UTF)) {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ p = reg_skipcomment(pRExC_state, p);
}
else
break;
if (end == UV_MAX && start <= 256) {
ANYOF_FLAGS(node) |= ANYOF_ABOVE_LATIN1_ALL;
}
+ else if (end >= 256) {
+ ANYOF_FLAGS(node) |= ANYOF_UTF8;
+ }
/* Quit if are above what we should change */
- if (start > 255) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
break;
}
change_invlist = TRUE;
/* Set all the bits in the range, up to the max that we are doing */
- high = (end < 255) ? end : 255;
+ high = (end < NUM_ANYOF_CODE_POINTS - 1)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(node, i)) {
ANYOF_BITMAP_SET(node, i);
PERL_STATIC_INLINE I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
{
- dVAR;
I32 namedclass = OOB_NAMEDCLASS;
PERL_ARGS_ASSERT_REGPPOSIXCC;
}
STATIC bool
-S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
+S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
{
/* This applies some heuristics at the current parse position (which should
* be at a '[') to see if what follows might be intended to be a [:posix:]
while (RExC_parse < RExC_end) {
SV* current = NULL;
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE); /* means recognize comments */
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
break;
case '\\':
/* Skip white space */
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE /* means recognize comments */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
RExC_flags = save_flags;
goto handle_operand;
}
- /* FALL THROUGH */
+ /* FALLTHROUGH */
default:
RExC_parse += (UTF) ? UTF8SKIP(RExC_parse) : 1;
top_index -= 2;
SvREFCNT_dec_NN(lparen);
- /* FALL THROUGH */
+ /* FALLTHROUGH */
}
handle_operand:
}
#undef IS_OPERAND
+STATIC void
+S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+{
+ /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
+ * innocent-looking character class, like /[ks]/i won't have to go out to
+ * disk to find the possible matches.
+ *
+ * This should be called only for a Latin1-range code points, cp, which is
+ * known to be involved in a simple fold with other code points above
+ * Latin1. It would give false results if /aa has been specified.
+ * Multi-char folds are outside the scope of this, and must be handled
+ * specially.
+ *
+ * XXX It would be better to generate these via regen, in case a new
+ * version of the Unicode standard adds new mappings, though that is not
+ * really likely, and may be caught by the default: case of the switch
+ * below. */
+
+ PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
+
+ assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
+
+ switch (cp) {
+ case 'k':
+ case 'K':
+ *invlist =
+ add_cp_to_invlist(*invlist, KELVIN_SIGN);
+ break;
+ case 's':
+ case 'S':
+ *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
+ break;
+ case MICRO_SIGN:
+ *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
+ *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ *invlist = add_cp_to_invlist(*invlist,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
+ break;
+ default:
+ /* Use deprecated warning to increase the chances of this being
+ * output */
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+ break;
+ }
+}
+
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
* ignored in the recursion by means of a flag:
* <RExC_in_multi_char_class>.)
*
- * ANYOF nodes contain a bit map for the first 256 characters, with the
- * corresponding bit set if that character is in the list. For characters
- * above 255, a range list or swash is used. There are extra bits for \w,
- * etc. in locale ANYOFs, as what these match is not determinable at
- * compile time
+ * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
+ * characters, with the corresponding bit set if that character is in the
+ * list. For characters above this, a range list or swash is used. There
+ * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
+ * determinable at compile time
*
* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
* to be restarted. This can only happen if ret_invlist is non-NULL.
*/
- dVAR;
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
UV value = OOB_UNICODE, save_value = OOB_UNICODE;
* that fold to/from them under /i */
SV* cp_foldable_list = NULL;
+ /* Like cp_list, but code points on this list are valid only when the
+ * runtime locale is UTF-8 */
+ SV* only_utf8_locale_list = NULL;
+
#ifdef EBCDIC
/* In a range, counts how many 0-2 of the ends of it came from literals,
* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
ANYOF_FLAGS(ret) = 0;
RExC_emit += ANYOF_SKIP;
- if (LOC) {
- ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
- }
listsv = newSVpvs_flags("# comment\n", SVs_TEMP);
initial_listsv_len = SvCUR(listsv);
SvTEMP_off(listsv); /* Grr, TEMPs and mortals are conflated. */
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
}
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == ']') {
{
namedclass = regpposixcc(pRExC_state, value, strict);
}
- else if (value == '\\') {
- if (UTF) {
+ else if (value != '\\') {
+#ifdef EBCDIC
+ literal_endpoint++;
+#endif
+ }
+ else {
+ /* Is a backslash; get the code point of the char after it */
+ if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
e = strchr(RExC_parse++, '}');
if (!e)
vFAIL2("Missing right brace on \\%c{}", c);
- while (isSPACE(UCHARAT(RExC_parse)))
+ while (isSPACE(*RExC_parse))
RExC_parse++;
if (e == RExC_parse)
vFAIL2("Empty \\%c{}", c);
n = e - RExC_parse;
- while (isSPACE(UCHARAT(RExC_parse + n - 1)))
+ while (isSPACE(*(RExC_parse + n - 1)))
n--;
}
else {
}
if (!SIZE_ONLY) {
SV* invlist;
- char* formatted;
char* name;
if (UCHARAT(RExC_parse) == '^') {
* that bit) */
value ^= 'P' ^ 'p';
- while (isSPACE(UCHARAT(RExC_parse))) {
+ while (isSPACE(*RExC_parse)) {
RExC_parse++;
n--;
}
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- formatted = Perl_form(aTHX_
+ name = savepv(Perl_form(aTHX_
"%s%.*s%s\n",
(FOLD) ? "__" : "",
(int)n,
RExC_parse,
(FOLD) ? "_i" : ""
- );
- name = savepvn(formatted, strlen(formatted));
+ ));
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
&swash_init_flags
);
if (! swash || ! (invlist = _get_swash_invlist(swash))) {
+ HV* curpkg = (IN_PERL_COMPILETIME)
+ ? PL_curstash
+ : CopSTASH(PL_curcop);
if (swash) {
SvREFCNT_dec_NN(swash);
swash = NULL;
"Property '%"UTF8f"' is unknown",
UTF8fARG(UTF, n, name));
}
+
+ /* If the property name doesn't already have a package
+ * name, add the current one to it so that it can be
+ * referred to outside it. [perl #121777] */
+ if (curpkg && ! instr(name, "::")) {
+ char* pkgname = HvNAME(curpkg);
+ if (strNE(pkgname, "main")) {
+ char* full_name = Perl_form(aTHX_
+ "%s::%s",
+ pkgname,
+ name);
+ n = strlen(full_name);
+ Safefree(name);
+ name = savepvn(full_name, n);
+ }
+ }
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
(value == 'p' ? '+' : '!'),
UTF8fARG(UTF, n, name));
case 't': value = '\t'; break;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
- case 'e': value = ASCII_TO_NATIVE('\033');break;
+ case 'e': value = ESC_NATIVE; break;
case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
break;
} /* End of switch on char following backslash */
} /* end of handling backslash escape sequences */
-#ifdef EBCDIC
- else
- literal_endpoint++;
-#endif
/* Here, we have the current token in 'value' */
- /* What matches in a locale is not known until runtime. This includes
- * what the Posix classes (like \w, [:space:]) match. Room must be
- * reserved (one time per outer bracketed class) to store such classes,
- * either if Perl is compiled so that locale nodes always should have
- * this space, or if there is such posix class info to be stored. The
- * space will contain a bit for each named class that is to be matched
- * against. This isn't needed for \p{} and pseudo-classes, as they are
- * not affected by locale, and hence are dealt with separately */
- if (LOC) {
- if (FOLD && ! need_class) {
- need_class = 1;
- if (SIZE_ONLY) {
- RExC_size += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP;
- }
- else {
- RExC_emit += ANYOF_POSIXL_FOLD_SKIP - ANYOF_SKIP;
- }
-
- /* We need to initialize this here because this node type has
- * this field, and will skip getting initialized when we get to
- * a posix class since are doing it here */
- ANYOF_POSIXL_ZERO(ret);
- }
- if (ANYOF_LOCALE == ANYOF_POSIXL
- || (namedclass > OOB_NAMEDCLASS
- && namedclass < ANYOF_POSIXL_MAX))
- {
- if (! need_class) {
- need_class = 1;
- if (SIZE_ONLY) {
- RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
- }
- else {
- RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
- }
- ANYOF_POSIXL_ZERO(ret);
- }
- ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
- }
- }
-
if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
U8 classnum;
&& classnum != _CC_ASCII
#endif
) {
+ /* What the Posix classes (like \w, [:space:]) match in locale
+ * isn't knowable under locale until actual match time. Room
+ * must be reserved (one time per outer bracketed class) to
+ * store such classes. The space will contain a bit for each
+ * named class that is to be matched against. This isn't
+ * needed for \p{} and pseudo-classes, as they are not affected
+ * by locale, and hence are dealt with separately */
+ if (! need_class) {
+ need_class = 1;
+ if (SIZE_ONLY) {
+ RExC_size += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
+ }
+ else {
+ RExC_emit += ANYOF_POSIXL_SKIP - ANYOF_SKIP;
+ }
+ ANYOF_FLAGS(ret) |= ANYOF_POSIXL;
+ ANYOF_POSIXL_ZERO(ret);
+ }
+
+ /* Coverity thinks it is possible for this to be negative; both
+ * jhi and khw think it's not, but be safer */
+ assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
+ || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
/* See if it already matches the complement of this POSIX
* class */
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (range) {
AV* this_array;
STRLEN cp_count = utf8_length(foldbuf,
foldbuf + foldlen);
- SV* multi_fold = sv_2mortal(newSVpvn("", 0));
+ SV* multi_fold = sv_2mortal(newSVpvs(""));
Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
&& ((prevvalue >= 'a' && value <= 'z')
|| (prevvalue >= 'A' && value <= 'Z')))
{
- _invlist_intersection(this_range, PL_ASCII,
+ _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
&this_range);
/* Since this above only contains ascii, the intersection of it
#endif
/* Look at the longest folds first */
- for (cp_count = av_len(multi_char_matches); cp_count > 0; cp_count--) {
+ for (cp_count = av_tindex(multi_char_matches); cp_count > 0; cp_count--) {
if (av_exists(multi_char_matches, cp_count)) {
AV** this_array_ptr;
op = POSIXA;
}
}
+ else if (prevvalue == 'A') {
+ if (value == 'Z'
+#ifdef EBCDIC
+ && literal_endpoint == 2
+#endif
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+ op = POSIXA;
+ }
+ }
+ else if (prevvalue == 'a') {
+ if (value == 'z'
+#ifdef EBCDIC
+ && literal_endpoint == 2
+#endif
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+ op = POSIXA;
+ }
+ }
}
/* Here, we have changed <op> away from its initial value iff we found
else {
RExC_emit = (regnode *)orig_emit;
if (PL_regkind[op] == POSIXD) {
+ if (op == POSIXL) {
+ RExC_contains_locale = 1;
+ }
if (invert) {
op += NPOSIXD - POSIXD;
}
*flagp |= HASWIDTH|SIMPLE;
}
else if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
+ TRUE /* downgradable to EXACT */
+ );
}
RExC_parse = (char *) cur_parse;
* runtime only when the locale indicates Unicode rules. For
* non-locale, we just use to the general list */
if (LOC) {
- use_list = &ANYOF_UTF8_LOCALE_INVLIST(ret);
- *use_list = NULL;
+ use_list = &only_utf8_locale_list;
}
else {
use_list = &cp_list;
/* This is a hash that for a particular fold gives all
* characters that are involved in it */
if (! PL_utf8_foldclosures) {
-
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES_CASE+1];
-
- /* This string is just a short named one above \xff */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures
- = _swash_inversion_hash(PL_utf8_tofold);
+ _load_PL_utf8_foldclosures();
}
}
if (j < 256) {
- /* We have the latin1 folding rules hard-coded here so
- * that an innocent-looking character class, like
- * /[ks]/i won't have to go out to disk to find the
- * possible matches. XXX It would be better to
- * generate these via regen, in case a new version of
- * the Unicode standard adds new mappings, though that
- * is not really likely, and may be caught by the
- * default: case of the switch below. */
-
if (IS_IN_SOME_FOLD_L1(j)) {
/* ASCII is always matched; non-ASCII is matched
}
}
- if (HAS_NONLATIN1_FOLD_CLOSURE(j)
+ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
{
- /* Certain Latin1 characters have matches outside
- * Latin1. To get here, <j> is one of those
- * characters. None of these matches is valid for
- * ASCII characters under /aa, which is why the 'if'
- * just above excludes those. These matches only
- * happen when the target string is utf8. The code
- * below adds the single fold closures for <j> to the
- * inversion list. */
-
- switch (j) {
- case 'k':
- case 'K':
- *use_list =
- add_cp_to_invlist(*use_list, KELVIN_SIGN);
- break;
- case 's':
- case 'S':
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_SMALL_LETTER_LONG_S);
- break;
- case MICRO_SIGN:
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_CAPITAL_LETTER_MU);
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_SMALL_LETTER_MU);
- break;
- case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
- case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
- *use_list =
- add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
- break;
- case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_SHARP_S);
- break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character
- * folds from code points that require UTF8
- * to express, so they can't match unless
- * the target string is in UTF-8, so no
- * action here is necessary, as regexec.c
- * properly handles the general case for
- * UTF-8 matching and multi-char folds */
- break;
- default:
- /* Use deprecated warning to increase the
- * chances of this being output */
- ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
- break;
- }
+ add_above_Latin1_folds(pRExC_state,
+ (U8) j,
+ use_list);
}
continue;
}
{
AV* list = (AV*) *listp;
IV k;
- for (k = 0; k <= av_len(list); k++) {
+ for (k = 0; k <= av_tindex(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
+ assert(c_p);
+
c = SvUV(*c_p);
/* /aa doesn't allow folds between ASCII and non- */
* fetching). We know to set the flag if we have a non-NULL list for UTF-8
* locales, or the class matches at least one 0-255 range code point */
if (LOC && FOLD) {
- if (ANYOF_UTF8_LOCALE_INVLIST(ret)) {
+ if (only_utf8_locale_list) {
ANYOF_FLAGS(ret) |= ANYOF_LOC_FOLD;
}
else if (cp_list) { /* Look to see if there a 0-255 code point is in
* at compile time. Besides not inverting folded locale now, we can't
* invert if there are things such as \w, which aren't known until runtime
* */
- if (invert
- && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL))
+ if (cp_list
+ && invert
+ && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
&& ! depends_list
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
{
if (cp_list
&& ! invert
&& ! depends_list
- && ! (ANYOF_FLAGS(ret) & (ANYOF_LOC_FOLD|ANYOF_POSIXL))
+ && ! (ANYOF_FLAGS(ret) & (ANYOF_LOCALE_FLAGS))
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
/* We don't optimize if we are supposed to make sure all non-Unicode
* code points raise a warning, as only ANYOF nodes have this check.
* */
- && ! ((ANYOF_FLAGS(ret) | ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
+ && ! ((ANYOF_FLAGS(ret) & ANYOF_WARN_SUPER) && ALWAYS_WARN_SUPER))
{
UV start, end;
U8 op = END; /* The optimzation node-type */
RExC_parse = (char *)cur_parse;
if (PL_regkind[op] == EXACT) {
- alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value,
+ TRUE /* downgradable to EXACT */
+ );
}
SvREFCNT_dec_NN(cp_list);
else {
cp_list = depends_list;
}
+ ANYOF_FLAGS(ret) |= ANYOF_UTF8;
}
/* If there is a swash and more than one element, we can't use the swash in
swash = NULL;
}
+ /* Note that the optimization of using 'swash' if it is the only thing in
+ * the class doesn't have us change swash at all, so it can include things
+ * that are also in the bitmap; otherwise we have purposely deleted that
+ * duplicate information */
set_ANYOF_arg(pRExC_state, ret, cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
? listsv : NULL,
+ only_utf8_locale_list,
swash, has_user_defined_property);
*flagp |= HASWIDTH|SIMPLE;
+
+ if (ANYOF_FLAGS(ret) & ANYOF_LOCALE_FLAGS) {
+ RExC_contains_locale = 1;
+ }
+
return ret;
}
regnode* const node,
SV* const cp_list,
SV* const runtime_defns,
+ SV* const only_utf8_locale_list,
SV* const swash,
const bool has_user_defined_property)
{
* av[1] if &PL_sv_undef, is a placeholder to later contain the swash
* computed from av[0]. But if no further computation need be done,
* the swash is stored here now (and av[0] is &PL_sv_undef).
- * av[2] stores the cp_list inversion list for use in addition or instead
+ * av[2] stores the inversion list of code points that match only if the
+ * current locale is UTF-8
+ * av[3] stores the cp_list inversion list for use in addition or instead
* of av[0]; used only if cp_list exists and av[1] is &PL_sv_undef.
* (Otherwise everything needed is already in av[0] and av[1])
- * av[3] is set if any component of the class is from a user-defined
- * property; used only if av[2] exists */
+ * av[4] is set if any component of the class is from a user-defined
+ * property; used only if av[3] exists */
UV n;
PERL_ARGS_ASSERT_SET_ANYOF_ARG;
- if (! cp_list && ! runtime_defns) {
+ if (! cp_list && ! runtime_defns && ! only_utf8_locale_list) {
+ assert(! (ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8)));
ARG_SET(node, ANYOF_NONBITMAP_EMPTY);
}
else {
AV * const av = newAV();
SV *rv;
+ assert(ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
+
av_store(av, 0, (runtime_defns)
? SvREFCNT_inc(runtime_defns) : &PL_sv_undef);
if (swash) {
+ assert(cp_list);
av_store(av, 1, swash);
SvREFCNT_dec_NN(cp_list);
}
else {
av_store(av, 1, &PL_sv_undef);
if (cp_list) {
- av_store(av, 2, cp_list);
- av_store(av, 3, newSVuv(has_user_defined_property));
+ av_store(av, 3, cp_list);
+ av_store(av, 4, newSVuv(has_user_defined_property));
}
}
+ if (only_utf8_locale_list) {
+ av_store(av, 2, only_utf8_locale_list);
+ }
+ else {
+ av_store(av, 2, &PL_sv_undef);
+ }
+
rv = newRV_noinc(MUTABLE_SV(av));
n = add_data(pRExC_state, STR_WITH_LEN("s"));
RExC_rxi->data->data[n] = (void*)rv;
}
}
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+SV *
+Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
+ const regnode* node,
+ bool doinit,
+ SV** listsvp,
+ SV** only_utf8_locale_ptr,
+ SV* exclude_list)
+
+{
+ /* For internal core use only.
+ * Returns the swash for the input 'node' in the regex 'prog'.
+ * If <doinit> is 'true', will attempt to create the swash if not already
+ * done.
+ * If <listsvp> is non-null, will return the printable contents of the
+ * swash. This can be used to get debugging information even before the
+ * swash exists, by calling this function with 'doinit' set to false, in
+ * which case the components that will be used to eventually create the
+ * swash are returned (in a printable form).
+ * If <exclude_list> is not NULL, it is an inversion list of things to
+ * exclude from what's returned in <listsvp>.
+ * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
+ * that, in spite of this function's name, the swash it returns may include
+ * the bitmap data as well */
+
+ SV *sw = NULL;
+ SV *si = NULL; /* Input swash initialization string */
+ SV* invlist = NULL;
+
+ RXi_GET_DECL(prog,progi);
+ const struct reg_data * const data = prog ? progi->data : NULL;
+
+ PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
+
+ assert(ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
+
+ if (data && data->count) {
+ const U32 n = ARG(node);
+
+ if (data->what[n] == 's') {
+ SV * const rv = MUTABLE_SV(data->data[n]);
+ AV * const av = MUTABLE_AV(SvRV(rv));
+ SV **const ary = AvARRAY(av);
+ U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+
+ si = *ary; /* ary[0] = the string to initialize the swash with */
+
+ /* Elements 3 and 4 are either both present or both absent. [3] is
+ * any inversion list generated at compile time; [4] indicates if
+ * that inversion list has any user-defined properties in it. */
+ if (av_tindex(av) >= 2) {
+ if (only_utf8_locale_ptr
+ && ary[2]
+ && ary[2] != &PL_sv_undef)
+ {
+ *only_utf8_locale_ptr = ary[2];
+ }
+ else {
+ assert(only_utf8_locale_ptr);
+ *only_utf8_locale_ptr = NULL;
+ }
+
+ if (av_tindex(av) >= 3) {
+ invlist = ary[3];
+ if (SvUV(ary[4])) {
+ swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
+ }
+ }
+ else {
+ invlist = NULL;
+ }
+ }
+
+ /* Element [1] is reserved for the set-up swash. If already there,
+ * return it; if not, create it and store it there */
+ if (ary[1] && SvROK(ary[1])) {
+ sw = ary[1];
+ }
+ else if (doinit && ((si && si != &PL_sv_undef)
+ || (invlist && invlist != &PL_sv_undef))) {
+ assert(si);
+ sw = _core_swash_init("utf8", /* the utf8 package */
+ "", /* nameless */
+ si,
+ 1, /* binary */
+ 0, /* not from tr/// */
+ invlist,
+ &swash_init_flags);
+ (void)av_store(av, 1, sw);
+ }
+ }
+ }
+
+ /* If requested, return a printable version of what this swash matches */
+ if (listsvp) {
+ SV* matches_string = newSVpvs("");
+
+ /* The swash should be used, if possible, to get the data, as it
+ * contains the resolved data. But this function can be called at
+ * compile-time, before everything gets resolved, in which case we
+ * return the currently best available information, which is the string
+ * that will eventually be used to do that resolving, 'si' */
+ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
+ && (si && si != &PL_sv_undef))
+ {
+ sv_catsv(matches_string, si);
+ }
+
+ /* Add the inversion list to whatever we have. This may have come from
+ * the swash, or from an input parameter */
+ if (invlist) {
+ if (exclude_list) {
+ SV* clone = invlist_clone(invlist);
+ _invlist_subtract(clone, exclude_list, &clone);
+ sv_catsv(matches_string, _invlist_contents(clone));
+ SvREFCNT_dec_NN(clone);
+ }
+ else {
+ sv_catsv(matches_string, _invlist_contents(invlist));
+ }
+ }
+ *listsvp = matches_string;
+ }
+
+ return sw;
+}
+#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
/* reg_skipcomment()
- Absorbs an /x style # comments from the input stream.
- Returns true if there is more text remaining in the stream.
- Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
- terminates the pattern without including a newline.
+ Absorbs an /x style # comment from the input stream,
+ returning a pointer to the first character beyond the comment, or if the
+ comment terminates the pattern without anything following it, this returns
+ one past the final character of the pattern (in other words, RExC_end) and
+ sets the REG_RUN_ON_COMMENT_SEEN flag.
- Note its the callers responsibility to ensure that we are
+ Note it's the callers responsibility to ensure that we are
actually in /x mode
*/
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
{
- bool ended = 0;
-
PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
- while (RExC_parse < RExC_end)
- if (*RExC_parse++ == '\n') {
- ended = 1;
- break;
+ assert(*p == '#');
+
+ while (p < RExC_end) {
+ if (*(++p) == '\n') {
+ return p+1;
}
- if (!ended) {
- /* we ran off the end of the pattern without ending
- the comment, so we have to add an \n when wrapping */
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- return 0;
- } else
- return 1;
+ }
+
+ /* we ran off the end of the pattern without ending the comment, so we have
+ * to add an \n when wrapping */
+ RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ return p;
}
/* nextchar()
continue;
}
if (RExC_flags & RXf_PMf_EXTENDED) {
- if (isSPACE(*RExC_parse)) {
- RExC_parse++;
- continue;
- }
- else if (*RExC_parse == '#') {
- if ( reg_skipcomment( pRExC_state ) )
- continue;
- }
+ char * p = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ if (p != RExC_parse) {
+ RExC_parse = p;
+ continue;
+ }
}
- return retval;
+ return retval;
}
}
STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
- dVAR;
regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
}
if (RExC_emit >= RExC_emit_bound)
Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
- op, RExC_emit, RExC_emit_bound);
+ op, (void*)RExC_emit, (void*)RExC_emit_bound);
NODE_ALIGN_FILL(ret);
ptr = ret;
STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
- dVAR;
regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
}
if (RExC_emit >= RExC_emit_bound)
Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
- op, RExC_emit, RExC_emit_bound);
+ op, (void*)RExC_emit, (void*)RExC_emit_bound);
NODE_ALIGN_FILL(ret);
ptr = ret;
PERL_STATIC_INLINE STRLEN
S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
{
- dVAR;
-
PERL_ARGS_ASSERT_REGUNI;
return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
- dVAR;
regnode *src;
regnode *dst;
regnode *place;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGINSERT;
+ PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
const regnode *val,U32 depth)
{
- dVAR;
regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE_r({
SV * const mysv=sv_newmortal();
DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
- regprop(RExC_rx, mysv, scan);
+ regprop(RExC_rx, mysv, scan, NULL);
PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
(temp == NULL ? "->" : ""),
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
const regnode *val,U32 depth)
{
- dVAR;
regnode *scan;
U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
DEBUG_PARSE_r({
SV * const mysv=sv_newmortal();
DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
- regprop(RExC_rx, mysv, scan);
+ regprop(RExC_rx, mysv, scan, NULL);
PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
SvPV_nolen_const(mysv),
REG_NODE_NUM(scan),
DEBUG_PARSE_r({
SV * const mysv_val=sv_newmortal();
DEBUG_PARSE_MSG("");
- regprop(RExC_rx, mysv_val, val);
+ regprop(RExC_rx, mysv_val, val, NULL);
PerlIO_printf(Perl_debug_log,
"~ attach to %s (%"IVdf") offset to %"IVdf"\n",
SvPV_nolen_const(mysv_val),
int bit;
int set=0;
- for (bit=0; bit<32; bit++) {
+ ASSUME(REG_INTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_INTFLAGS_NAME_SIZE; bit++) {
if (flags & (1<<bit)) {
if (!set++ && lead)
PerlIO_printf(Perl_debug_log, "%s",lead);
int set=0;
regex_charset cs;
- for (bit=0; bit<32; bit++) {
+ ASSUME(REG_EXTFLAGS_NAME_SIZE <= sizeof(flags)*8);
+
+ for (bit=0; bit<REG_EXTFLAGS_NAME_SIZE; bit++) {
if (flags & (1<<bit)) {
if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
continue;
Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
- dVAR;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
PerlIO_printf(Perl_debug_log, ") ");
if (ri->regstclass) {
- regprop(r, sv, ri->regstclass);
+ regprop(r, sv, ri->regstclass, NULL);
PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
}
if (r->intflags & PREGf_ANCH) {
}
/*
-- regprop - printable representation of opcode
+- regprop - printable representation of opcode, with run time support
*/
void
-Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
+Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
{
#ifdef DEBUGGING
- dVAR;
int k;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
(UV)trie->maxlen,
(UV)TRIE_CHARCOUNT(trie),
(UV)trie->uniquecharcount
- )
+ );
);
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
- (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
- ? ANYOF_BITMAP(o)
- : TRIE_BITMAP(trie));
+ (void) put_charclass_bitmap_innards(sv,
+ (IS_ANYOF_TRIE(op))
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie),
+ NULL);
sv_catpvs(sv, "]");
}
}
}
}
+ if ( k == REF && reginfo) {
+ U32 n = ARG(o); /* which paren pair */
+ I32 ln = prog->offs[n].start;
+ if (prog->lastparen < n || ln == -1)
+ Perl_sv_catpvf(aTHX_ sv, ": FAIL");
+ else if (ln == prog->offs[n].end)
+ Perl_sv_catpvf(aTHX_ sv, ": ACCEPT - EMPTY STRING");
+ else {
+ const char *s = reginfo->strbeg + ln;
+ Perl_sv_catpvf(aTHX_ sv, ": ");
+ Perl_pv_pretty( aTHX_ sv, s, prog->offs[n].end - prog->offs[n].start, 32, 0, 0,
+ PERL_PV_ESCAPE_UNI_DETECT|PERL_PV_PRETTY_NOCLEAR|PERL_PV_PRETTY_ELLIPSES|PERL_PV_PRETTY_QUOTE );
+ }
+ }
} else if (k == GOSUB)
/* Paren and offset */
Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o));
else if (k == ANYOF) {
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
+ SV* bitmap_invlist; /* Will hold what the bit map contains */
- if (flags & ANYOF_LOCALE)
+ if (flags & ANYOF_LOCALE_FLAGS)
sv_catpvs(sv, "{loc}");
if (flags & ANYOF_LOC_FOLD)
sv_catpvs(sv, "{i}");
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
- /* output what the standard cp 0-255 bitmap matches */
- do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
+ /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
+ * */
+ do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
+ &bitmap_invlist);
/* output any special charclass tests (used entirely under use
* locale) * */
}
}
- if ((flags & ANYOF_ABOVE_LATIN1_ALL)
- || ANYOF_UTF8_LOCALE_INVLIST(o) || ANYOF_NONBITMAP(o))
+ if ((flags & (ANYOF_ABOVE_LATIN1_ALL
+ |ANYOF_UTF8
+ |ANYOF_NONBITMAP_NON_UTF8
+ |ANYOF_LOC_FOLD)))
{
if (do_sep) {
Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]);
/* output information about the unicode matching */
if (flags & ANYOF_ABOVE_LATIN1_ALL)
sv_catpvs(sv, "{unicode_all}");
- else if (ANYOF_NONBITMAP(o)) {
+ else if (ARG(o) != ANYOF_NONBITMAP_EMPTY) {
SV *lv; /* Set if there is something outside the bit map. */
bool byte_output = FALSE; /* If something in the bitmap has
been output */
-
- /* Get the stuff that wasn't in the bitmap */
- (void) regclass_swash(prog, o, FALSE, &lv, NULL);
+ SV *only_utf8_locale;
+
+ /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
+ * is used to guarantee that nothing in the bitmap gets
+ * returned */
+ (void) _get_regclass_nonbitmap_data(prog, o, FALSE,
+ &lv, &only_utf8_locale,
+ bitmap_invlist);
if (lv && lv != &PL_sv_undef) {
char *s = savesvpv(lv);
char * const origs = s;
Safefree(origs);
SvREFCNT_dec_NN(lv);
}
- }
- /* Output any UTF-8 locale code points */
- if (flags & ANYOF_LOC_FOLD && ANYOF_UTF8_LOCALE_INVLIST(o)) {
- UV start, end;
- int max_entries = 256;
-
- sv_catpvs(sv, "{utf8 locale}");
- invlist_iterinit(ANYOF_UTF8_LOCALE_INVLIST(o));
- while (invlist_iternext(ANYOF_UTF8_LOCALE_INVLIST(o),
- &start, &end)) {
- put_range(sv, start, end);
- max_entries --;
- if (max_entries < 0) {
- sv_catpvs(sv, "...");
- break;
+ if ((flags & ANYOF_LOC_FOLD)
+ && only_utf8_locale
+ && only_utf8_locale != &PL_sv_undef)
+ {
+ UV start, end;
+ int max_entries = 256;
+
+ sv_catpvs(sv, "{utf8 locale}");
+ invlist_iterinit(only_utf8_locale);
+ while (invlist_iternext(only_utf8_locale,
+ &start, &end)) {
+ put_range(sv, start, end, FALSE);
+ max_entries --;
+ if (max_entries < 0) {
+ sv_catpvs(sv, "...");
+ break;
+ }
}
+ invlist_iterfinish(only_utf8_locale);
}
- invlist_iterfinish(ANYOF_UTF8_LOCALE_INVLIST(o));
}
}
+ SvREFCNT_dec(bitmap_invlist);
+
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
else if (k == POSIXD || k == NPOSIXD) {
U8 index = FLAGS(o) * 2;
- if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
- Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
- }
- else {
+ if (index < C_ARRAY_LENGTH(anyofs)) {
if (*anyofs[index] != '[') {
sv_catpv(sv, "[");
}
sv_catpv(sv, "]");
}
}
+ else {
+ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+ }
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
PERL_UNUSED_ARG(sv);
PERL_UNUSED_ARG(o);
PERL_UNUSED_ARG(prog);
+ PERL_UNUSED_ARG(reginfo);
#endif /* DEBUGGING */
}
+
+
SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
- dVAR;
struct regexp *const prog = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
void
Perl_pregfree2(pTHX_ REGEXP *rx)
{
- dVAR;
struct regexp *const r = ReANY(rx);
GET_RE_DEBUG_FLAGS_DECL;
void
Perl_regfree_internal(pTHX_ REGEXP * const rx)
{
- dVAR;
struct regexp *const r = ReANY(rx);
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
Used in stclass optimization only */
U32 refcount;
reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
+#ifdef USE_ITHREADS
+ dVAR;
+#endif
OP_REFCNT_LOCK;
refcount = --aho->refcount;
OP_REFCNT_UNLOCK;
PerlMemShared_free(aho->fail);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
- PerlMemShared_free(ri->regstclass);
+ /* we should only ever get called once, so
+ * assert as much, and also guard the free
+ * which /might/ happen twice. At the least
+ * it will make code anlyzers happy and it
+ * doesn't cost much. - Yves */
+ assert(ri->regstclass);
+ if (ri->regstclass) {
+ PerlMemShared_free(ri->regstclass);
+ ri->regstclass = 0;
+ }
}
}
break;
/* trie structure. */
U32 refcount;
reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
+#ifdef USE_ITHREADS
+ dVAR;
+#endif
OP_REFCNT_LOCK;
refcount = --trie->refcount;
OP_REFCNT_UNLOCK;
* when the corresponding reg_ac_data struct is freed.
*/
reti->regstclass= ri->regstclass;
- /* Fall through */
+ /* FALLTHROUGH */
case 't':
OP_REFCNT_LOCK;
((reg_trie_data*)ri->data->data[i])->refcount++;
OP_REFCNT_UNLOCK;
- /* Fall through */
+ /* FALLTHROUGH */
case 'l':
case 'L':
d->data[i] = ri->data->data[i];
regnode *
Perl_regnext(pTHX_ regnode *p)
{
- dVAR;
I32 offset;
if (!p)
void
Perl_save_re_context(pTHX)
{
- dVAR;
-
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
if (PL_curpm) {
const REGEXP * const rx = PM_GETRE(PL_curpm);
#ifdef DEBUGGING
+/* Given that c is a control character, is it one for which we have a
+ * mnemonic? */
+#define isMNEMONIC_CNTRL(c) ((isSPACE_A(c) && (c) != '\v') \
+ || (c) == '\a' \
+ || (c) == '\b' \
+ || (c) == ESC_NATIVE)
+/* Certain characters are output as a sequence with the first being a
+ * backslash. */
+#define isBACKSLASHED_PUNCT(c) \
+ ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
if (!isPRINT(c)) {
switch (c) {
- case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+ case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+ case '\b': Perl_sv_catpvf(aTHX_ sv, "\\b"); break;
+ case ESC_NATIVE: Perl_sv_catpvf(aTHX_ sv, "\\e"); break;
+ case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+ case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
- case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
- case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
-
- default:
- Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
- break;
+ default: Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", c); break;
}
}
else {
const char string = c;
- if (c == '-' || c == ']' || c == '\\' || c == '^')
+ if (isBACKSLASHED_PUNCT(c))
sv_catpvs(sv, "\\");
sv_catpvn(sv, &string, 1);
}
}
+#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
STATIC void
-S_put_range(pTHX_ SV *sv, UV start, UV end)
+S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
{
-
/* Appends to 'sv' a displayable version of the range of code points from
- * 'start' to 'end' */
+ * 'start' to 'end'. It assumes that only ASCII printables are displayable
+ * as-is (though some of these will be escaped by put_byte()). */
+
+ const unsigned int min_range_count = 3;
assert(start <= end);
PERL_ARGS_ASSERT_PUT_RANGE;
- if (end - start < 3) { /* Individual chars in short ranges */
- for (; start <= end; start++)
- put_byte(sv, start);
- }
- else if ( end > 255
- || ! isALPHANUMERIC(start)
- || ! isALPHANUMERIC(end)
- || isDIGIT(start) != isDIGIT(end)
- || isUPPER(start) != isUPPER(end)
- || isLOWER(start) != isLOWER(end)
-
- /* This final test should get optimized out except on EBCDIC
- * platforms, where it causes ranges that cross discontinuities
- * like i/j to be shown as hex instead of the misleading,
- * e.g. H-K (since that range includes more than H, I, J, K).
+ while (start <= end) {
+ if (end - start < min_range_count) {
+
+ /* Individual chars in short ranges */
+ for (; start <= end; start++) {
+ put_byte(sv, start);
+ }
+ break;
+ }
+
+ /* If permitted by the input options, and there is a possibility that
+ * this range contains a printable literal, look to see if there is
+ * one. */
+ if (allow_literals && start <= MAX_PRINT_A) {
+
+ /* If the range begin isn't an ASCII printable, effectively split
+ * the range into two parts:
+ * 1) the portion before the first such printable,
+ * 2) the rest
+ * and output them separately. */
+ if (! isPRINT_A(start)) {
+ UV temp_end = start + 1;
+
+ /* There is no point looking beyond the final possible
+ * printable, in MAX_PRINT_A */
+ UV max = MIN(end, MAX_PRINT_A);
+
+ while (temp_end <= max && ! isPRINT_A(temp_end)) {
+ temp_end++;
+ }
+
+ /* Here, temp_end points to one beyond the first printable if
+ * found, or to one beyond 'max' if not. If none found, make
+ * sure that we use the entire range */
+ if (temp_end > MAX_PRINT_A) {
+ temp_end = end + 1;
+ }
+
+ /* Output the first part of the split range, the part that
+ * doesn't have printables, with no looking for literals
+ * (otherwise we would infinitely recurse) */
+ put_range(sv, start, temp_end - 1, FALSE);
+
+ /* The 2nd part of the range (if any) starts here. */
+ start = temp_end;
+
+ /* We continue instead of dropping down because even if the 2nd
+ * part is non-empty, it could be so short that we want to
+ * output it specially, as tested for at the top of this loop.
* */
- || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
- {
+ continue;
+ }
+
+ /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
+ * output a sub-range of just the digits or letters, then process
+ * the remaining portion as usual. */
+ if (isALPHANUMERIC_A(start)) {
+ UV mask = (isDIGIT_A(start))
+ ? _CC_DIGIT
+ : isUPPER_A(start)
+ ? _CC_UPPER
+ : _CC_LOWER;
+ UV temp_end = start + 1;
+
+ /* Find the end of the sub-range that includes just the
+ * characters in the same class as the first character in it */
+ while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
+ temp_end++;
+ }
+ temp_end--;
+
+ /* For short ranges, don't duplicate the code above to output
+ * them; just call recursively */
+ if (temp_end - start < min_range_count) {
+ put_range(sv, start, temp_end, FALSE);
+ }
+ else { /* Output as a range */
+ put_byte(sv, start);
+ sv_catpvs(sv, "-");
+ put_byte(sv, temp_end);
+ }
+ start = temp_end + 1;
+ continue;
+ }
+
+ /* We output any other printables as individual characters */
+ if (isPUNCT_A(start) || isSPACE_A(start)) {
+ while (start <= end && (isPUNCT_A(start)
+ || isSPACE_A(start)))
+ {
+ put_byte(sv, start);
+ start++;
+ }
+ continue;
+ }
+ } /* End of looking for literals */
+
+ /* Here is not to output as a literal. Some control characters have
+ * mnemonic names. Split off any of those at the beginning and end of
+ * the range to print mnemonically. It isn't possible for many of
+ * these to be in a row, so this won't overwhelm with output */
+ if (isMNEMONIC_CNTRL(start)) {
+ while (isMNEMONIC_CNTRL(start) && start <= end) {
+ put_byte(sv, start);
+ start++;
+ }
+ }
+ if (start < end && isMNEMONIC_CNTRL(end)) {
+
+ /* Here, the final character in the range has a mnemonic name.
+ * Work backwards from the end to find the final non-mnemonic */
+ UV temp_end = end - 1;
+ while (isMNEMONIC_CNTRL(temp_end)) {
+ temp_end--;
+ }
+
+ /* And separately output the range that doesn't have mnemonics */
+ put_range(sv, start, temp_end, FALSE);
+
+ /* Then output the mnemonic trailing controls */
+ start = temp_end + 1;
+ while (start <= end) {
+ put_byte(sv, start);
+ start++;
+ }
+ break;
+ }
+
+ /* As a final resort, output the range or subrange as hex. */
Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
start,
- (end < 256) ? end : 255);
- }
- else { /* Here, the ends of the range are both digits, or both uppercase,
- or both lowercase; and there's no discontinuity in the range
- (which could happen on EBCDIC platforms) */
- put_byte(sv, start);
- sv_catpvs(sv, "-");
- put_byte(sv, end);
+ (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1);
+ break;
}
}
STATIC bool
-S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
- * output anything */
+ * output anything, and bitmap_invlist, if not NULL, will point to an
+ * inversion list of what is in the bit map */
int i;
- bool has_output_anything = FALSE;
+ UV start, end;
+ unsigned int punct_count = 0;
+ SV* invlist = NULL;
+ SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
+ bool allow_literals = TRUE;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+ invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
+
+ /* Worst case is exactly every-other code point is in the list */
+ *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+
+ /* Convert the bit map to an inversion list, keeping track of how many
+ * ASCII puncts are set, including an extra amount for the backslashed
+ * ones. */
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST((U8 *) bitmap,i)) {
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
+ if (isPUNCT_A(i)) {
+ punct_count++;
+ if isBACKSLASHED_PUNCT(i) {
+ punct_count++;
+ }
+ }
+ }
+ }
+
+ /* Nothing to output */
+ if (_invlist_len(*invlist_ptr) == 0) {
+ SvREFCNT_dec(invlist);
+ return FALSE;
+ }
- PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+ /* Generally, it is more readable if printable characters are output as
+ * literals, but if a range (nearly) spans all of them, it's best to output
+ * it as a single range. This code will use a single range if all but 2
+ * printables are in it */
+ invlist_iterinit(*invlist_ptr);
+ while (invlist_iternext(*invlist_ptr, &start, &end)) {
- for (i = 0; i < 256; i++) {
- if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+ /* If range starts beyond final printable, it doesn't have any in it */
+ if (start > MAX_PRINT_A) {
+ break;
+ }
- /* The character at index i should be output. Find the next
- * character that should NOT be output */
- int j;
- for (j = i + 1; j <= 256; j++) {
- if (! BITMAP_TEST((U8 *) bitmap, j)) {
- break;
- }
+ /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
+ * all but two, the range must start and end no later than 2 from
+ * either end */
+ if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
+ if (end > MAX_PRINT_A) {
+ end = MAX_PRINT_A;
}
-
- /* Everything between them is a single range that should be output
- * */
- put_range(sv, i, j - 1);
- has_output_anything = TRUE;
- i = j;
+ if (start < ' ') {
+ start = ' ';
+ }
+ if (end - start >= MAX_PRINT_A - ' ' - 2) {
+ allow_literals = FALSE;
+ }
+ break;
}
}
+ invlist_iterfinish(*invlist_ptr);
+
+ /* The legibility of the output depends mostly on how many punctuation
+ * characters are output. There are 32 possible ASCII ones, and some have
+ * an additional backslash, bringing it to currently 36, so if any more
+ * than 18 are to be output, we can instead output it as its complement,
+ * yielding fewer puncts, and making it more legible. But give some weight
+ * to the fact that outputting it as a complement is less legible than a
+ * straight output, so don't complement unless we are somewhat over the 18
+ * mark */
+ if (allow_literals && punct_count > 22) {
+ sv_catpvs(sv, "^");
+
+ /* Add everything remaining to the list, so when we invert it just
+ * below, it will be excluded */
+ *invlist_ptr = _add_range_to_invlist(*invlist_ptr,
+ NUM_ANYOF_CODE_POINTS, UV_MAX);
+ _invlist_invert(*invlist_ptr);
+ }
+
+ /* Here we have figured things out. Output each range */
+ invlist_iterinit(*invlist_ptr);
+ while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ break;
+ }
+ put_range(sv, start, end, allow_literals);
+ }
+ invlist_iterfinish(*invlist_ptr);
- return has_output_anything;
+ return TRUE;
}
#define CLEAR_OPTSTART \
const regnode *last, const regnode *plast,
SV* sv, I32 indent, U32 depth)
{
- dVAR;
U8 op = PSEUDO; /* Arbitrary non-END op. */
const regnode *next;
const regnode *optstart= NULL;
last= plast;
while (PL_regkind[op] != END && (!last || node < last)) {
+ assert(node);
/* While that wasn't END last time... */
NODE_ALIGN(node);
op = OP(node);
} else
CLEAR_OPTSTART;
- regprop(r, sv, node);
+ regprop(r, sv, node, NULL);
PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
(int)(2*indent + 1), "", SvPVX_const(sv));
}
else if (PL_regkind[(U8)op] == ANYOF) {
/* arglen 1 + class block */
- node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
- ? ANYOF_POSIXL_FOLD_SKIP
- : (ANYOF_FLAGS(node) & ANYOF_POSIXL)
- ? ANYOF_POSIXL_SKIP
- : ANYOF_SKIP);
+ node += 1 + ((ANYOF_FLAGS(node) & ANYOF_POSIXL)
+ ? ANYOF_POSIXL_SKIP
+ : ANYOF_SKIP);
node = NEXTOPER(node);
}
else if (PL_regkind[(U8)op] == EXACT) {