#define REG_COMP_C
#ifdef PERL_IN_XSUB_RE
# include "re_comp.h"
+extern const struct regexp_engine my_reg_engine;
#else
# include "regcomp.h"
#endif
* one, and looks for problematic sequences of characters whose folds vs.
* non-folds have sufficiently different lengths, that the optimizer would be
* fooled into rejecting legitimate matches of them, and the trie construction
- * code can't cope with them. The joining is only done if:
+ * code needs to handle specially. The joining is only done if:
* 1) there is room in the current conglomerated node to entirely contain the
* next one.
* 2) they are the exact same node type
*
- * The adjacent nodes actually may be separated by NOTHING kind nodes, and
+ * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
* these get optimized out
*
* If there are problematic code sequences, *min_subtract is set to the delta
*
* This is as good a place as any to discuss the design of handling these
* problematic sequences. It's been wrong in Perl for a very long time. There
- * are three code points in Unicode whose folded lengths differ so much from
- * the un-folded lengths that it causes problems for the optimizer and trie
- * construction. Why only these are problematic, and not others where lengths
- * also differ is something I (khw) do not understand. New versions of Unicode
- * might add more such code points. Hopefully the logic in fold_grind.t that
- * figures out what to test (in part by verifying that each size-combination
- * gets tested) will catch any that do come along, so they can be added to the
- * special handling below. The chances of new ones are actually rather small,
- * as most, if not all, of the world's scripts that have casefolding have
- * already been encoded by Unicode. Also, a number of Unicode's decisions were
- * made to allow compatibility with pre-existing standards, and almost all of
- * those have already been dealt with. These would otherwise be the most
- * likely candidates for generating further tricky sequences. In other words,
- * Unicode by itself is unlikely to add new ones unless it is for compatibility
- * with pre-existing standards, and there aren't many of those left.
+ * are three code points currently in Unicode whose folded lengths differ so
+ * much from the un-folded lengths that it causes problems for the optimizer
+ * and trie construction. Why only these are problematic, and not others where
+ * lengths also differ is something I (khw) do not understand. New versions of
+ * Unicode might add more such code points. Hopefully the logic in
+ * fold_grind.t that figures out what to test (in part by verifying that each
+ * size-combination gets tested) will catch any that do come along, so they can
+ * be added to the special handling below. The chances of new ones are
+ * actually rather small, as most, if not all, of the world's scripts that have
+ * casefolding have already been encoded by Unicode. Also, a number of
+ * Unicode's decisions were made to allow compatibility with pre-existing
+ * standards, and almost all of those have already been dealt with. These
+ * would otherwise be the most likely candidates for generating further tricky
+ * sequences. In other words, Unicode by itself is unlikely to add new ones
+ * unless it is for compatibility with pre-existing standards, and there aren't
+ * many of those left.
*
* The previous designs for dealing with these involved assigning a special
* node for them. This approach doesn't work, as evidenced by this example:
* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
- * Both these fold to "sss", but if the pattern is parsed to create a node of
+ * Both these fold to "sss", but if the pattern is parsed to create a node
* that would match just the \xDF, it won't be able to handle the case where a
* successful match would have to cross the node's boundary. The new approach
* that hopefully generally solves the problem generates an EXACTFU_SS node
* problematic sequences. This delta is used by the caller to adjust the
* min length of the match, and the delta between min and max, so that the
* optimizer doesn't reject these possibilities based on size constraints.
- * 2) These sequences require special handling by the trie code, so it
- * changes the joined node type to ops for the trie's benefit, those new
- * ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
+ * 2) These sequences require special handling by the trie code, so this code
+ * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
+ * EXACTFU_SS.
* 3) This is sufficient for the two Greek sequences (described below), but
* the one involving the Sharp s (\xDF) needs more. The node type
* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
* itself with length changes, and so can be processed faster. regexec.c
* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
* is pre-folded by regcomp.c. This saves effort in regex matching.
- * However, probably mostly for historical reasons, the pre-folding isn't
- * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
- * nodes, as what they fold to isn't known until runtime.) The fold
- * possibilities for the non-UTF8 patterns are quite simple, except for
- * the sharp s. All the ones that don't involve a UTF-8 target string
- * are members of a fold-pair, and arrays are set up for all of them
- * that quickly find the other member of the pair. It might actually
- * be faster to pre-fold these, but it isn't currently done, except for
- * the sharp s. Code elsewhere in this file makes sure that it gets
- * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
- * issues described in the next item.
+ * However, the pre-folding isn't done for non-UTF8 patterns because the
+ * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL
+ * nodes fold to isn't known until runtime. The fold possibilities for
+ * the non-UTF8 patterns are quite simple, except for the sharp s. All
+ * the ones that don't involve a UTF-8 target string are members of a
+ * fold-pair, and arrays are set up for all of them so that the other
+ * member of the pair can be found quickly. Code elsewhere in this file
+ * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
+ * if the pattern isn't UTF-8. This avoids the issues described in the
+ * next item.
* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
* 'ss' or not is not knowable at compile time. It will match iff the
* target string is in UTF-8, unlike the EXACTFU nodes, where it always
const unsigned int oldl = STR_LEN(scan);
regnode * const nnext = regnext(n);
+ /* XXX I (khw) kind of doubt that this works on platforms where
+ * U8_MAX is above 255 because of lots of other assumptions */
if (oldl + STR_LEN(n) > U8_MAX)
break;
greek_sequence:
*min_subtract += 4;
- /* This can't currently be handled by trie's, so change
+ /* This requires special handling by trie's, so change
* the node type to indicate this. If EXACTFA and
* EXACTFL were ever to be handled by trie's, this
* would have to be changed. If this node has already
/* EXACTF nodes need to know that the minimum
* length changed so that a sharp s in the string
* can match this ss in the pattern, but they
- * remain EXACTF nodes, as they are not trie'able,
- * so don't have to invent a new node type to
- * exclude them from the trie code */
+ * remain EXACTF nodes, as they won't match this
+ * unless the target string is in UTF-8, which we
+ * don't know until runtime */
if (OP(scan) != EXACTF) {
OP(scan) = EXACTFU_SS;
}
}
#endif
-/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
- * pattern rather than a list of OPs */
+/* public(ish) entry point for the perl core's own regex compiling code.
+ * It's actually a wrapper for Perl_re_op_compile that only takes an SV
+ * pattern rather than a list of OPs, and uses the internal engine rather
+ * than the current one */
REGEXP *
Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
{
SV *pat = pattern; /* defeat constness!  This is a local copy whose address is passed below */
PERL_ARGS_ASSERT_RE_COMPILE;
- return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
- NULL, NULL, rx_flags, 0);
+ return Perl_re_op_compile(aTHX_ &pat, 1, NULL, /* hand over the single pattern SV via &pat */
+#ifdef PERL_IN_XSUB_RE
+ &my_reg_engine, /* PERL_IN_XSUB_RE builds: the engine declared extern near the top of this file */
+#else
+ &PL_core_reg_engine, /* otherwise: the core engine, no longer whatever current_re_engine() returns */
+#endif
+ NULL, NULL, rx_flags, 0); /* rx_flags is forwarded unchanged from the caller */
}
/* see if there are any run-time code blocks in the pattern.
runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
exp, plen);
if (!runtime_code) {
- ReREFCNT_inc(old_re);
if (used_setjump) {
JMPENV_POP;
}
#ifndef PERL_IN_XSUB_RE
-STATIC IV
-S_invlist_search(pTHX_ SV* const invlist, const UV cp)
+IV
+Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
IV high = invlist_len(invlist);
const UV * const array = invlist_array(invlist);
- PERL_ARGS_ASSERT_INVLIST_SEARCH;
+ PERL_ARGS_ASSERT__INVLIST_SEARCH;
/* If list is empty or the code point is before the first element, return
* failure. */
array = invlist_array(invlist);
/* Find which element it is */
- i = invlist_search(invlist, start);
+ i = _invlist_search(invlist, start);
/* We populate from <start> to <end> */
while (current < end) {
#endif
+STATIC bool
+S__invlist_contains_cp(pTHX_ SV* const invlist, const UV cp)
+{
+    /* Reports whether the set represented by <invlist> includes the code
+     * point <cp>.  The element that would hold <cp> is looked up with
+     * _invlist_search(); a negative result means there is no such element,
+     * so <cp> cannot be in the set.  Otherwise the answer is whatever
+     * ELEMENT_RANGE_MATCHES_INVLIST() says about the element found. */
+
+    const IV found_at = _invlist_search(invlist, cp);
+
+    PERL_ARGS_ASSERT__INVLIST_CONTAINS_CP;
+
+    if (found_at < 0) {
+        return FALSE;
+    }
+
+    return ELEMENT_RANGE_MATCHES_INVLIST(found_at) ? TRUE : FALSE;
+}
+
PERL_STATIC_INLINE SV*
S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp)
{
    /* Adds the single code point <cp> to <invlist> by treating it as the
     * range cp..cp, and hands back whatever _add_range_to_invlist()
     * returns. */

    const UV range_start = cp;
    const UV range_end = cp;

    return _add_range_to_invlist(invlist, range_start, range_end);
}
-/* reg_namedseq(pRExC_state,UVp, UV depth)
+/* grok_bslash_N(pRExC_state,UVp, UV depth)
This is expected to be called by a parser routine that has
recognized '\N' and needs to handle the rest. RExC_parse is
Parsing failures will generate a fatal error via vFAIL(...)
*/
STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
{
char * endbrace; /* '}' following the name */
regnode *ret = NULL;
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+ PERL_ARGS_ASSERT_GROK_BSLASH_N;
GET_RE_DEBUG_FLAGS;
return ret;
}
- /* Here, we have decided it should be a named sequence */
+ /* Here, we have decided it should be a named character or sequence */
/* The test above made sure that the next real character is a '{', but
* under the /x modifier, it could be separated by space (or a comment and
return (regnode *) &RExC_parse; /* Invalid regnode pointer */
}
- REQUIRE_UTF8; /* named sequences imply Unicode semantics */
+ RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
RExC_parse += 2; /* Skip past the 'U+' */
if (valuep) { /* In a bracketed char class */
Also this makes sure that things like /\N{BLAH}+/ and
\N{BLAH} being multi char Just Happen. dmq*/
++RExC_parse;
- ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
+ ret= grok_bslash_N(pRExC_state, NULL, flagp, depth);
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
register UV ender;
register char *p;
char *s;
+#define MAX_NODE_STRING_SIZE 127
+ char foldbuf[MAX_NODE_STRING_SIZE];
STRLEN foldlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
U8 node_type;
-
- /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
- * it is folded to 'ss' even if not utf8 */
- bool is_exactfu_sharp_s;
+ bool next_is_quantifier;
ender = 0;
node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
- s = STRING(ret);
+
+ /* In pass1, folded, we use a temporary buffer instead of the
+ * actual node, as the node doesn't exist yet */
+ s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
* non-final, but it is possible for there not to be any in the
* entire node. */
for (len = 0, p = RExC_parse - 1;
- len < 127 && p < RExC_end;
+ len < MAX_NODE_STRING_SIZE && p < RExC_end;
len++)
{
char * const oldp = p;
break;
} /* End of switch on the literal */
- is_exactfu_sharp_s = (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S);
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
- if ((UTF && FOLD) || is_exactfu_sharp_s) {
- /* Prime the casefolded buffer. Locale rules, which apply
- * only to code points < 256, aren't known until execution,
- * so for them, just output the original character using
- * utf8. If we start to fold non-UTF patterns, be sure to
- * update join_exact() */
- if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
- *tmpbuf = (U8) ender;
- foldlen = 1;
- } else {
- *tmpbuf = UTF8_TWO_BYTE_HI(ender);
- *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
- foldlen = 2;
- }
- }
- else if (isASCII(ender)) { /* Note: Here can't also be LOC
- */
- ender = toLOWER(ender);
- *tmpbuf = (U8) ender;
- foldlen = 1;
- }
- else if (! ASCII_FOLD_RESTRICTED && ! LOC) {
- /* Locale and /aa require more selectivity about the
- * fold, so are handled below. Otherwise, here, just
- * use the fold */
- ender = toFOLD_uni(ender, tmpbuf, &foldlen);
- }
- else {
- /* Under locale rules or /aa we are not to mix,
- * respectively, ords < 256 or ASCII with non-. So
- * reject folds that mix them, using only the
- * non-folded code point. So do the fold to a
- * temporary, and inspect each character in it. */
- U8 trialbuf[UTF8_MAXBYTES_CASE+1];
- U8* s = trialbuf;
- UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
- U8* e = s + foldlen;
- bool fold_ok = TRUE;
-
- while (s < e) {
- if (isASCII(*s)
- || (LOC && (UTF8_IS_INVARIANT(*s)
- || UTF8_IS_DOWNGRADEABLE_START(*s))))
- {
- fold_ok = FALSE;
- break;
- }
- s += UTF8SKIP(s);
- }
- if (fold_ok) {
- Copy(trialbuf, tmpbuf, foldlen, U8);
- ender = tmpender;
- }
- else {
- uvuni_to_utf8(tmpbuf, ender);
- foldlen = UNISKIP(ender);
- }
- }
- }
- if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
- if (len)
- p = oldp;
- else if (UTF || is_exactfu_sharp_s) {
- if (FOLD) {
- /* Emit all the Unicode characters. */
- STRLEN numlen;
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen) {
-
- /* tmpbuf has been constructed by us, so we
- * know it is valid utf8 */
- ender = valid_utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- }
- else {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- }
- else {
- len++;
- REGC((char)ender, s++);
- }
- break;
+ /* If the next thing is a quantifier, it applies to this
+ * character only, which means that this character has to be in
+ * its own node and can't just be appended to the string in an
+ * existing node, so if there are already other characters in
+ * the node, close the node with just them, and set up to do
+ * this character again next time through, when it will be the
+ * only thing in its new node */
+ if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+ {
+ p = oldp;
+ goto loopdone;
+ }
+
+ if (FOLD) {
+ if (UTF
+ /* See comments for join_exact() as to why we fold
+ * this non-UTF at compile time */
+ || (node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S))
+ {
+
+
+ /* Prime the casefolded buffer. Locale rules, which
+ * apply only to code points < 256, aren't known until
+ * execution, so for them, just output the original
+ * character using utf8. If we start to fold non-UTF
+ * patterns, be sure to update join_exact() */
+ if (LOC && ender < 256) {
+ if (UNI_IS_INVARIANT(ender)) {
+ *s = (U8) ender;
+ foldlen = 1;
+ } else {
+ *s = UTF8_TWO_BYTE_HI(ender);
+ *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else {
+ ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+ }
+ s += foldlen;
+
+ /* The loop increments <len> each time, as all but this
+ * path (and the one just below for UTF) through it add
+ * a single byte to the EXACTish node. But this one
+ * has changed len to be the correct final value, so
+ * subtract one to cancel out the increment that
+ * follows */
+ len += foldlen - 1;
+ }
+ else {
+ REGC((char)ender, s++);
+ }
}
- if (UTF || is_exactfu_sharp_s) {
- if (FOLD) {
- /* Emit all the Unicode characters. */
- STRLEN numlen;
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen) {
- ender = valid_utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- len += unilen;
- s += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break;
- }
- }
- else {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- len--;
+ else if (UTF) {
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+
+ /* See comment just above for - 1 */
+ len--;
}
else {
REGC((char)ender, s++);
+ }
+
+ if (next_is_quantifier) {
+
+ /* Here, the next input is a quantifier, and to get here,
+ * the current character is the only one in the node.
+ * Also, here <len> doesn't include the final byte for this
+ * character */
+ len++;
+ goto loopdone;
}
- }
+
+ } /* End of loop through literal characters */
+
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
RExC_parse = p - 1;
*flagp |= SIMPLE;
alloc_maybe_populate_EXACT(pRExC_state, ret, len, 0);
- }
+ } /* End of label 'defchar:' */
break;
- }
+ } /* End of giant switch on input character */
return(ret);
}
* not escapes. Thus we can tell if 'A' was input vs \x{C1} */
UV literal_endpoint = 0;
#endif
- UV stored = 0; /* how many chars stored in the bitmap */
bool invert = FALSE; /* Is this class to be complemented */
/* Is there any thing like \W or [:^digit:] that matches above the legal
from earlier versions, OTOH that behaviour was broken
as well. */
UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
+ if (grok_bslash_N(pRExC_state, &v, NULL, depth)) {
goto parseit;
}
value= v;
* fold the classes (folding of those is automatically handled by the swash
* fetching code) */
if (posixes) {
- if (AT_LEAST_UNI_SEMANTICS) {
+ if (! DEPENDS_SEMANTICS) {
if (cp_list) {
_invlist_union(cp_list, posixes, &cp_list);
SvREFCNT_dec(posixes);
}
}
else {
-
/* Under /d, we put into a separate list the Latin1 things that
* match only when the target string is utf8 */
SV* nonascii_but_latin1_properties = NULL;
invert = FALSE;
}
+ /* If we didn't do folding, it's because some information isn't available
+ * until runtime; set the run-time fold flag for these. (We don't have to
+ * worry about properties folding, as that is taken care of by the swash
+ * fetching) */
+ if (FOLD && (LOC || unicode_alternate))
+ {
+ ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+ }
+
+ /* Some character classes are equivalent to other nodes. Such nodes take
+ * up less room and generally fewer operations to execute than ANYOF nodes.
+ * Above, we checked for and optimized into some such equivalents for
+ * certain common classes that are easy to test. Getting to this point in
+ * the code means that the class didn't get optimized there. Since this
+ * code is only executed in Pass 2, it is too late to save space--it has
+ * been allocated in Pass 1, and currently isn't given back. But turning
+ * things into an EXACTish node can allow the optimizer to join it to any
+ * adjacent such nodes. And if the class is equivalent to things like /./,
+ * expensive run-time swashes can be avoided. Now that we have more
+ * complete information, we can find things necessarily missed by the
+ * earlier code. I (khw) am not sure how much to look for here. It would
+ * be easy, but perhaps too slow, to check any candidates against all the
+ * node types they could possibly match using _invlistEQ(). */
+
+ if (cp_list
+ && ! unicode_alternate
+ && ! invert
+ && ! depends_list
+ && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
+ && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+ {
+ UV start, end;
+ U8 op = END; /* The optimization node-type */
+ const char * cur_parse= RExC_parse;
+
+ invlist_iterinit(cp_list);
+ if (! invlist_iternext(cp_list, &start, &end)) {
+
+ /* Here, the list is empty. This happens, for example, when a
+ * Unicode property is the only thing in the character class, and
+ * it doesn't match anything. (perluniprops.pod notes such
+ * properties) */
+ op = OPFAIL;
+ }
+ else if (start == end) { /* The range is a single code point */
+ if (! invlist_iternext(cp_list, &start, &end)
+
+ /* Don't do this optimization if it would require changing
+ * the pattern to UTF-8 */
+ && (start < 256 || UTF))
+ {
+ /* Here, the list contains a single code point. Can optimize
+ * into an EXACT node */
+
+ value = start;
+
+ if (! FOLD) {
+ op = EXACT;
+ }
+ else if (LOC) {
+
+ /* A locale node under folding with one code point can be
+ * an EXACTFL, as its fold won't be calculated until
+ * runtime */
+ op = EXACTFL;
+ }
+ else {
+
+ /* Here, we are generally folding, but there is only one
+ * code point to match. If we have to, we use an EXACT
+ * node, but it would be better for joining with adjacent
+ * nodes in the optimization pass if we used the same
+ * EXACTFish node that any such are likely to be. We can
+ * do this iff the code point doesn't participate in any
+ * folds. For example, an EXACTF of a colon is the same as
+ * an EXACT one, since nothing folds to or from a colon.
+ * In the Latin1 range, being an alpha means that the
+ * character participates in a fold (except for the
+ * feminine and masculine ordinals, which I (khw) don't
+ * think are worth worrying about optimizing for). */
+ if (value < 256) {
+ if (isALPHA_L1(value)) {
+ op = EXACT;
+ }
+ }
+ else {
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8", "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable = _get_swash_invlist(swash);
+ SvREFCNT_dec(swash);
+ }
+ if (_invlist_contains_cp(PL_utf8_foldable, value)) {
+ op = EXACT;
+ }
+ }
+
+ /* If we haven't found the node type, above, it means we
+ * can use the prevailing one */
+ if (op == END) {
+ op = compute_EXACTish(pRExC_state);
+ }
+ }
+ }
+ }
+ else if (start == 0) {
+ if (end == UV_MAX) {
+ op = SANY;
+ }
+ else if (end == '\n' - 1
+ && invlist_iternext(cp_list, &start, &end)
+ && start == '\n' + 1 && end == UV_MAX)
+ {
+ op = REG_ANY;
+ }
+ }
+
+ if (op != END) {
+ RExC_parse = (char *)orig_parse;
+ RExC_emit = (regnode *)orig_emit;
+
+ ret = reg_node(pRExC_state, op);
+
+ RExC_parse = (char *)cur_parse;
+
+ if (PL_regkind[op] == EXACT) {
+ alloc_maybe_populate_EXACT(pRExC_state, ret, 0, value);
+ }
+
+ SvREFCNT_dec(listsv);
+ return ret;
+ }
+ }
+
/* Here, <cp_list> contains all the code points we can determine at
* compile time that match under all conditions. Go through it, and
* for things that belong in the bitmap, put them there, and delete from
- * <cp_list> */
+ * <cp_list>. While we are at it, see if everything above 255 is in the
+ * list, and if so, set a flag to speed up execution */
ANYOF_BITMAP_ZERO(ret);
if (cp_list) {
UV high;
int i;
+ if (end == UV_MAX && start <= 256) {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ }
+
/* Quit if we are above what we should change */
if (start > 255) {
break;
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(ret, i)) {
ANYOF_BITMAP_SET(ret, i);
- stored++;
prevvalue = value;
value = i;
}
ANYOF_FLAGS(ret) |= ANYOF_INVERT;
}
- /* Combine the two lists into one. */
+ /* Here, the bitmap has been populated with all the Latin1 code points that
+ * always match. Can now add to the overall list those that match only
+ * when the target string is UTF-8 (<depends_list>). */
if (depends_list) {
if (cp_list) {
_invlist_union(cp_list, depends_list, &cp_list);
}
}
- /* Folding in the bitmap is taken care of above, but not for locale (for
- * which we have to wait to see what folding is in effect at runtime), and
- * for some things not in the bitmap (only the upper latin folds in this
- * case, as all other single-char folding has been set above). Set
- * run-time fold flag for these */
- if (FOLD && (LOC
- || (DEPENDS_SEMANTICS
- && cp_list
- && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
- || unicode_alternate))
- {
- ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
- }
-
- /* A single character class can be "optimized" into an EXACTish node.
- * Note that since we don't currently count how many characters there are
- * outside the bitmap, we are XXX missing optimization possibilities for
- * them. This optimization can't happen unless this is a truly single
- * character class, which means that it can't be an inversion into a
- * many-character class, and there must be no possibility of there being
- * things outside the bitmap. 'stored' (only) for locales doesn't include
- * \w, etc, so have to make a special test that they aren't present
- *
- * Similarly A 2-character class of the very special form like [bB] can be
- * optimized into an EXACTFish node, but only for non-locales, and for
- * characters which only have the two folds; so things like 'fF' and 'Ii'
- * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
- * FI'. */
- if (! cp_list
- && ! unicode_alternate
- && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
- && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
- && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
- || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
- || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
- && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
- /* If the latest code point has a fold whose
- * bit is set, it must be the only other one */
- && ((prevvalue = PL_fold_latin1[value]) != value)
- && ANYOF_BITMAP_TEST(ret, prevvalue)))))
- {
- /* Note that the information needed to decide to do this optimization
- * is not currently available until the 2nd pass, and that the actually
- * used EXACTish node takes less space than the calculated ANYOF node,
- * and hence the amount of space calculated in the first pass is larger
- * than actually used, so this optimization doesn't gain us any space.
- * But an EXACT node is faster than an ANYOF node, and can be combined
- * with any adjacent EXACT nodes later by the optimizer for further
- * gains. The speed of executing an EXACTF is similar to an ANYOF
- * node, so the optimization advantage comes from the ability to join
- * it to adjacent EXACT nodes */
-
- const char * cur_parse= RExC_parse;
- U8 op;
- RExC_emit = (regnode *)orig_emit;
- RExC_parse = (char *)orig_parse;
-
- if (stored == 1) {
-
- /* A locale node with one point can be folded; all the other cases
- * with folding will have two points, since we calculate them above
- */
- if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
- op = EXACTFL;
- }
- else {
- op = EXACT;
- }
- }
- else { /* else 2 chars in the bit map: the folds of each other */
-
- /* Use the folded value, which for the cases where we get here,
- * is just the lower case of the current one (which may resolve to
- * itself, or to the other one */
- value = toLOWER_LATIN1(value);
-
- /* To join adjacent nodes, they must be the exact EXACTish type.
- * Try to use the most likely type, by using EXACTFA if possible,
- * then EXACTFU if the regex calls for it, or is required because
- * the character is non-ASCII. (If <value> is ASCII, its fold is
- * also ASCII for the cases where we get here.) */
- if (ASCII_FOLD_RESTRICTED && isASCII(value)) {
- op = EXACTFA;
- }
- else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
- op = EXACTFU;
- }
- else { /* Otherwise, more likely to be EXACTF type */
- op = EXACTF;
- }
- }
-
- ret = reg_node(pRExC_state, op);
- RExC_parse = (char *)cur_parse;
- if (UTF && ! NATIVE_IS_INVARIANT(value)) {
- *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
- *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
- STR_LEN(ret)= 2;
- RExC_emit += STR_SZ(2);
- }
- else {
- *STRING(ret)= (char)value;
- STR_LEN(ret)= 1;
- RExC_emit += STR_SZ(1);
- }
- SvREFCNT_dec(listsv);
- return ret;
- }
-
/* If there is a swash and more than one element, we can't use the swash in
* the optimization below. */
if (swash && element_count > 1) {
SvREFCNT_dec(swash);
swash = NULL;
}
+
if (! cp_list
&& ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
&& ! unicode_alternate)