#define REG_COMP_C
#ifdef PERL_IN_XSUB_RE
# include "re_comp.h"
+extern const struct regexp_engine my_reg_engine;
#else
# include "regcomp.h"
#endif
#include "dquote_static.c"
-#ifndef PERL_IN_XSUB_RE
-# include "charclass_invlists.h"
-#endif
+#include "charclass_invlists.h"
+#include "inline_invlist.c"
#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#ifdef op
#undef op
#define WORST 0 /* Worst case. */
#define HASWIDTH 0x01 /* Known to match non-null strings. */
-/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
+/* Simple enough to be STAR/PLUS operand; in an EXACT node must be a single
* character, and if utf8, must be invariant. Note that this is not the same
* thing as REGNODE_SIMPLE */
#define SIMPLE 0x02
#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
-#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
+#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
-#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
+/* There is no code point that is out-of-bounds, so this is problematic. But
+ * its only current use is to initialize a variable that is always set before
+ * looked at. */
+#define OOB_UNICODE 0xDEADBEEF
+
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
* one, and looks for problematic sequences of characters whose folds vs.
* non-folds have sufficiently different lengths, that the optimizer would be
* fooled into rejecting legitimate matches of them, and the trie construction
- * code can't cope with them. The joining is only done if:
+ * code needs to handle specially. The joining is only done if:
* 1) there is room in the current conglomerated node to entirely contain the
* next one.
* 2) they are the exact same node type
*
- * The adjacent nodes actually may be separated by NOTHING kind nodes, and
+ * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
* these get optimized out
*
* If there are problematic code sequences, *min_subtract is set to the delta
*
* This is as good a place as any to discuss the design of handling these
* problematic sequences. It's been wrong in Perl for a very long time. There
- * are three code points in Unicode whose folded lengths differ so much from
- * the un-folded lengths that it causes problems for the optimizer and trie
- * construction. Why only these are problematic, and not others where lengths
- * also differ is something I (khw) do not understand. New versions of Unicode
- * might add more such code points. Hopefully the logic in fold_grind.t that
- * figures out what to test (in part by verifying that each size-combination
- * gets tested) will catch any that do come along, so they can be added to the
- * special handling below. The chances of new ones are actually rather small,
- * as most, if not all, of the world's scripts that have casefolding have
- * already been encoded by Unicode. Also, a number of Unicode's decisions were
- * made to allow compatibility with pre-existing standards, and almost all of
- * those have already been dealt with. These would otherwise be the most
- * likely candidates for generating further tricky sequences. In other words,
- * Unicode by itself is unlikely to add new ones unless it is for compatibility
- * with pre-existing standards, and there aren't many of those left.
+ * are three code points currently in Unicode whose folded lengths differ so
+ * much from the un-folded lengths that it causes problems for the optimizer
+ * and trie construction. Why only these are problematic, and not others where
+ * lengths also differ is something I (khw) do not understand. New versions of
+ * Unicode might add more such code points. Hopefully the logic in
+ * fold_grind.t that figures out what to test (in part by verifying that each
+ * size-combination gets tested) will catch any that do come along, so they can
+ * be added to the special handling below. The chances of new ones are
+ * actually rather small, as most, if not all, of the world's scripts that have
+ * casefolding have already been encoded by Unicode. Also, a number of
+ * Unicode's decisions were made to allow compatibility with pre-existing
+ * standards, and almost all of those have already been dealt with. These
+ * would otherwise be the most likely candidates for generating further tricky
+ * sequences. In other words, Unicode by itself is unlikely to add new ones
+ * unless it is for compatibility with pre-existing standards, and there aren't
+ * many of those left.
*
* The previous designs for dealing with these involved assigning a special
* node for them. This approach doesn't work, as evidenced by this example:
* "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
- * Both these fold to "sss", but if the pattern is parsed to create a node of
+ * Both these fold to "sss", but if the pattern is parsed to create a node
* that would match just the \xDF, it won't be able to handle the case where a
* successful match would have to cross the node's boundary. The new approach
* that hopefully generally solves the problem generates an EXACTFU_SS node
* problematic sequences. This delta is used by the caller to adjust the
* min length of the match, and the delta between min and max, so that the
* optimizer doesn't reject these possibilities based on size constraints.
- * 2) These sequences are not currently correctly handled by the trie code
- * either, so it changes the joined node type to ops that are not handled
- * by trie's, those new ops being EXACTFU_SS and EXACTFU_TRICKYFOLD.
+ * 2) These sequences require special handling by the trie code, so this code
+ * changes the joined node type to special ops: EXACTFU_TRICKYFOLD and
+ * EXACTFU_SS.
* 3) This is sufficient for the two Greek sequences (described below), but
* the one involving the Sharp s (\xDF) needs more. The node type
* EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
* itself with length changes, and so can be processed faster. regexec.c
* takes advantage of this. Generally, an EXACTFish node that is in UTF-8
* is pre-folded by regcomp.c. This saves effort in regex matching.
- * However, probably mostly for historical reasons, the pre-folding isn't
- * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
- * nodes, as what they fold to isn't known until runtime.) The fold
- * possibilities for the non-UTF8 patterns are quite simple, except for
- * the sharp s. All the ones that don't involve a UTF-8 target string
- * are members of a fold-pair, and arrays are set up for all of them
- * that quickly find the other member of the pair. It might actually
- * be faster to pre-fold these, but it isn't currently done, except for
- * the sharp s. Code elsewhere in this file makes sure that it gets
- * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
- * issues described in the next item.
+ * However, the pre-folding isn't done for non-UTF8 patterns because the
+ * fold of the MICRO SIGN requires UTF-8. Also what EXACTF and EXACTFL
+ * nodes fold to isn't known until runtime. The fold possibilities for
+ * the non-UTF8 patterns are quite simple, except for the sharp s. All
+ * the ones that don't involve a UTF-8 target string are members of a
+ * fold-pair, and arrays are set up for all of them so that the other
+ * member of the pair can be found quickly. Code elsewhere in this file
+ * makes sure that in EXACTFU nodes, the sharp s gets folded to 'ss', even
+ * if the pattern isn't UTF-8. This avoids the issues described in the
+ * next item.
* 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
* 'ss' or not is not knowable at compile time. It will match iff the
* target string is in UTF-8, unlike the EXACTFU nodes, where it always
* matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
- * it can't be folded to "ss" at compile time, unlike EXACTFU does as
+ * it can't be folded to "ss" at compile time, unlike EXACTFU does (as
* described in item 3). An assumption that the optimizer part of
* regexec.c (probably unwittingly) makes is that a character in the
* pattern corresponds to at most a single character in the target string.
const unsigned int oldl = STR_LEN(scan);
regnode * const nnext = regnext(n);
+ /* XXX I (khw) kind of doubt that this works on platforms where
+ * U8_MAX is above 255 because of lots of other assumptions */
if (oldl + STR_LEN(n) > U8_MAX)
break;
greek_sequence:
*min_subtract += 4;
- /* This can't currently be handled by trie's, so change
+ /* This requires special handling by trie's, so change
* the node type to indicate this. If EXACTFA and
* EXACTFL were ever to be handled by trie's, this
* would have to be changed. If this node has already
/* EXACTF nodes need to know that the minimum
* length changed so that a sharp s in the string
* can match this ss in the pattern, but they
- * remain EXACTF nodes, as they are not trie'able,
- * so don't have to invent a new node type to
- * exclude them from the trie code */
+ * remain EXACTF nodes, as they won't match this
+ * unless the target string is is UTF-8, which we
+ * don't know until runtime */
if (OP(scan) != EXACTF) {
OP(scan) = EXACTFU_SS;
}
* the end pointer. */
if ( !first ) {
first = cur;
- trietype = noper_trietype;
if ( noper_trietype == NOTHING ) {
#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
regnode * const noper_next = regnext( noper );
U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
#endif
- if ( noper_next_trietype )
+ if ( noper_next_trietype ) {
trietype = noper_next_trietype;
+ } else if (noper_next_type) {
+ /* a NOTHING regop is 1 regop wide. We need at least two
+ * for a trie so we can't merge this in */
+ first = NULL;
+ }
+ } else {
+ trietype = noper_trietype;
}
} else {
if ( trietype == NOTHING )
uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
- else if (has_exactf_sharp_s) {
+ if (has_exactf_sharp_s) {
RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
}
min += l - min_subtract;
&& !(data->flags & SF_HAS_EVAL)
&& !deltanext /* atom is fixed width */
&& minnext != 0 /* CURLYM can't handle zero width */
+ && ! (RExC_seen & REG_SEEN_EXACTF_SHARP_S) /* Nor \xDF */
) {
/* XXXX How to optimize if data == 0? */
/* Optimize to a simpler form. */
data->flags |= (OP(scan) == MEOL
? SF_BEFORE_MEOL
: SF_BEFORE_SEOL);
+ SCAN_COMMIT(pRExC_state, data, minlenp);
+
}
else if ( PL_regkind[OP(scan)] == BRANCHJ
/* Lookbehind, or need to calculate parens/evals/stclass: */
}
#endif
-/* public(ish) wrapper for Perl_re_op_compile that only takes an SV
- * pattern rather than a list of OPs */
+/* public(ish) entry point for the perl core's own regex compiling code.
+ * It's actually a wrapper for Perl_re_op_compile that only takes an SV
+ * pattern rather than a list of OPs, and uses the internal engine rather
+ * than the current one */
REGEXP *
Perl_re_compile(pTHX_ SV * const pattern, U32 rx_flags)
{
SV *pat = pattern; /* defeat constness! */
PERL_ARGS_ASSERT_RE_COMPILE;
- return Perl_re_op_compile(aTHX_ &pat, 1, NULL, current_re_engine(),
- NULL, NULL, rx_flags, 0);
+ return Perl_re_op_compile(aTHX_ &pat, 1, NULL,
+#ifdef PERL_IN_XSUB_RE
+ &my_reg_engine,
+#else
+ &PL_core_reg_engine,
+#endif
+ NULL, NULL, rx_flags, 0);
}
/* see if there are any run-time code blocks in the pattern.
* the original pattern needs upgrading to utf8.
*/
-bool
+static bool
S_compile_runtime_code(pTHX_ RExC_state_t * const pRExC_state,
char *pat, STRLEN plen)
{
int n = 0;
STRLEN s;
char *p, *newpat;
- int newlen = plen + 5; /* allow for "qr''x" extra chars */
+ int newlen = plen + 6; /* allow for "qr''x\0" extra chars */
SV *sv, *qr_ref;
dSP;
}
+STATIC bool
+S_setup_longest(pTHX_ RExC_state_t *pRExC_state, SV* sv_longest, SV** rx_utf8, SV** rx_substr, I32* rx_end_shift, I32 lookbehind, I32 offset, I32 *minlen, STRLEN longest_length, bool eol, bool meol)
+{
+ /* This is the common code for setting up the floating and fixed length
+ * string data extracted from Perlre_op_compile() below. Returns a boolean
+ * as to whether succeeded or not */
+
+ I32 t,ml;
+
+ if (! (longest_length
+ || (eol /* Can't have SEOL and MULTI */
+ && (! meol || (RExC_flags & RXf_PMf_MULTILINE)))
+ )
+ /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
+ || (RExC_seen & REG_SEEN_EXACTF_SHARP_S))
+ {
+ return FALSE;
+ }
+
+ /* copy the information about the longest from the reg_scan_data
+ over to the program. */
+ if (SvUTF8(sv_longest)) {
+ *rx_utf8 = sv_longest;
+ *rx_substr = NULL;
+ } else {
+ *rx_substr = sv_longest;
+ *rx_utf8 = NULL;
+ }
+ /* end_shift is how many chars that must be matched that
+ follow this item. We calculate it ahead of time as once the
+ lookbehind offset is added in we lose the ability to correctly
+ calculate it.*/
+ ml = minlen ? *(minlen) : (I32)longest_length;
+ *rx_end_shift = ml - offset
+ - longest_length + (SvTAIL(sv_longest) != 0)
+ + lookbehind;
+
+ t = (eol/* Can't have SEOL and MULTI */
+ && (! meol || (RExC_flags & RXf_PMf_MULTILINE)));
+ fbm_compile(sv_longest, t ? FBMcf_TAIL : 0);
+
+ return TRUE;
+}
+
/*
* Perl_re_op_compile - the perl internal RE engine's function to compile a
* regular expression into internal code.
dVAR;
REGEXP *rx;
struct regexp *r;
- register regexp_internal *ri;
+ regexp_internal *ri;
STRLEN plen;
char * VOL exp;
char* xend;
PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
- PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
- PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
-
PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
OP *o = NULL;
int n = 0;
bool utf8 = 0;
+ STRLEN orig_patlen = 0;
if (pRExC_state->num_code_blocks) {
o = cLISTOPx(expr)->op_first;
o = o->op_sibling;;
}
+ if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
+ (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
+ {
+ sv_setsv(pat, sv);
+ /* overloading involved: all bets are off over literal
+ * code. Pretend we haven't seen it */
+ pRExC_state->num_code_blocks -= n;
+ n = 0;
+ rx = NULL;
+
+ }
+ else {
+ while (SvAMAGIC(msv)
+ && (sv = AMG_CALLunary(msv, string_amg))
+ && sv != msv)
+ {
+ msv = sv;
+ SvGETMAGIC(msv);
+ }
+ if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
+ msv = SvRV(msv);
+ orig_patlen = SvCUR(pat);
+ sv_catsv_nomg(pat, msv);
+ rx = msv;
+ if (code)
+ pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
+ }
+
/* extract any code blocks within any embedded qr//'s */
- rx = msv;
- if (SvROK(rx))
- rx = SvRV(rx);
- if (SvTYPE(rx) == SVt_REGEXP
+ if (rx && SvTYPE(rx) == SVt_REGEXP
&& RX_ENGINE((REGEXP*)rx)->op_comp)
{
pRExC_state->num_code_blocks += ri->num_code_blocks;
for (i=0; i < ri->num_code_blocks; i++) {
struct reg_code_block *src, *dst;
- STRLEN offset = SvCUR(pat)
+ STRLEN offset = orig_patlen
+ ((struct regexp *)SvANY(rx))->pre_prefix;
assert(n < pRExC_state->num_code_blocks);
src = &ri->code_blocks[i];
}
}
}
-
- if ((SvAMAGIC(pat) || SvAMAGIC(msv)) &&
- (sv = amagic_call(pat, msv, concat_amg, AMGf_assign)))
- {
- sv_setsv(pat, sv);
- /* overloading involved: all bets are off over literal
- * code. Pretend we haven't seen it */
- pRExC_state->num_code_blocks -= n;
- n = 0;
-
- }
- else {
- if (SvROK(msv) && SvTYPE(SvRV(msv)) == SVt_REGEXP)
- msv = SvRV(msv);
- sv_catsv_nomg(pat, msv);
- if (code)
- pRExC_state->code_blocks[n-1].end = SvCUR(pat)-1;
- }
}
SvSETMAGIC(pat);
}
- else
+ else {
+ SV *sv;
pat = *patternp;
+ while (SvAMAGIC(pat)
+ && (sv = AMG_CALLunary(pat, string_amg))
+ && sv != pat)
+ {
+ pat = sv;
+ SvGETMAGIC(pat);
+ }
+ }
/* handle bare regex: foo =~ $re */
{
runtime_code = S_has_runtime_code(aTHX_ pRExC_state, expr, pm_flags,
exp, plen);
if (!runtime_code) {
- ReREFCNT_inc(old_re);
if (used_setjump) {
JMPENV_POP;
}
scan_commit(pRExC_state, &data,&minlen,0);
SvREFCNT_dec(data.last_found);
- /* Note that code very similar to this but for anchored string
- follows immediately below, changes may need to be made to both.
- Be careful.
- */
longest_float_length = CHR_SVLEN(data.longest_float);
- if (longest_float_length
- || (data.flags & SF_FL_BEFORE_EOL
- && (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE))))
- {
- I32 t,ml;
- /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
- if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
- || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
- && data.offset_fixed == data.offset_float_min
- && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
- goto remove_float; /* As in (a)+. */
-
- /* copy the information about the longest float from the reg_scan_data
- over to the program. */
- if (SvUTF8(data.longest_float)) {
- r->float_utf8 = data.longest_float;
- r->float_substr = NULL;
- } else {
- r->float_substr = data.longest_float;
- r->float_utf8 = NULL;
- }
- /* float_end_shift is how many chars that must be matched that
- follow this item. We calculate it ahead of time as once the
- lookbehind offset is added in we lose the ability to correctly
- calculate it.*/
- ml = data.minlen_float ? *(data.minlen_float)
- : (I32)longest_float_length;
- r->float_end_shift = ml - data.offset_float_min
- - longest_float_length + (SvTAIL(data.longest_float) != 0)
- + data.lookbehind_float;
+ if (! ((SvCUR(data.longest_fixed) /* ok to leave SvCUR */
+ && data.offset_fixed == data.offset_float_min
+ && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
+ && S_setup_longest (aTHX_ pRExC_state,
+ data.longest_float,
+ &(r->float_utf8),
+ &(r->float_substr),
+ &(r->float_end_shift),
+ data.lookbehind_float,
+ data.offset_float_min,
+ data.minlen_float,
+ longest_float_length,
+ data.flags & SF_FL_BEFORE_EOL,
+ data.flags & SF_FL_BEFORE_MEOL))
+ {
r->float_min_offset = data.offset_float_min - data.lookbehind_float;
r->float_max_offset = data.offset_float_max;
if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
r->float_max_offset -= data.lookbehind_float;
-
- t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
- && (!(data.flags & SF_FL_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
}
else {
- remove_float:
r->float_substr = r->float_utf8 = NULL;
SvREFCNT_dec(data.longest_float);
longest_float_length = 0;
}
- /* Note that code very similar to this but for floating string
- is immediately above, changes may need to be made to both.
- Be careful.
- */
longest_fixed_length = CHR_SVLEN(data.longest_fixed);
- /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
- if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
- && (longest_fixed_length
- || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)))) )
+ if (S_setup_longest (aTHX_ pRExC_state,
+ data.longest_fixed,
+ &(r->anchored_utf8),
+ &(r->anchored_substr),
+ &(r->anchored_end_shift),
+ data.lookbehind_fixed,
+ data.offset_fixed,
+ data.minlen_fixed,
+ longest_fixed_length,
+ data.flags & SF_FIX_BEFORE_EOL,
+ data.flags & SF_FIX_BEFORE_MEOL))
{
- I32 t,ml;
-
- /* copy the information about the longest fixed
- from the reg_scan_data over to the program. */
- if (SvUTF8(data.longest_fixed)) {
- r->anchored_utf8 = data.longest_fixed;
- r->anchored_substr = NULL;
- } else {
- r->anchored_substr = data.longest_fixed;
- r->anchored_utf8 = NULL;
- }
- /* fixed_end_shift is how many chars that must be matched that
- follow this item. We calculate it ahead of time as once the
- lookbehind offset is added in we lose the ability to correctly
- calculate it.*/
- ml = data.minlen_fixed ? *(data.minlen_fixed)
- : (I32)longest_fixed_length;
- r->anchored_end_shift = ml - data.offset_fixed
- - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
- + data.lookbehind_fixed;
r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
-
- t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
- && (!(data.flags & SF_FIX_BEFORE_MEOL)
- || (RExC_flags & RXf_PMf_MULTILINE)));
- fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
}
else {
r->anchored_substr = r->anchored_utf8 = NULL;
SvREFCNT_dec(data.longest_fixed);
longest_fixed_length = 0;
}
+
if (ri->regstclass
&& (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
ri->regstclass = NULL;
r->intflags |= PREGf_VERBARG_SEEN;
if (RExC_seen & REG_SEEN_CUTGROUP)
r->intflags |= PREGf_CUTGROUP_SEEN;
+ if (pm_flags & PMf_USE_RE_EVAL)
+ r->intflags |= PREGf_USE_RE_EVAL;
if (RExC_paren_names)
RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
else
do {
RExC_parse++;
} while (isALNUM(*RExC_parse));
+ } else {
+ RExC_parse++; /* so the <- from the vFAIL is after the offending character */
+ vFAIL("Group name must start with a non-digit word character");
}
-
if ( flags ) {
SV* sv_name
= newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
(unsigned long) flags);
}
- /* NOT REACHED */
+ assert(0); /* NOT REACHED */
}
return NULL;
}
* Some of the methods should always be private to the implementation, and some
* should eventually be made public */
-#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
-#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
-
-/* This is a combination of a version and data structure type, so that one
- * being passed in can be validated to be an inversion list of the correct
- * vintage. When the structure of the header is changed, a new random number
- * in the range 2**31-1 should be generated and the new() method changed to
- * insert that at this location. Then, if an auxiliary program doesn't change
- * correspondingly, it will be discovered immediately */
-#define INVLIST_VERSION_ID_OFFSET 2
-#define INVLIST_VERSION_ID 1064334010
+/* The header definitions are in F<inline_invlist.c> */
-/* For safety, when adding new elements, remember to #undef them at the end of
- * the inversion list code section */
-
-#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
-/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
- * contains the code point U+00000, and begins here. If 1, the inversion list
- * doesn't contain U+0000, and it begins at the next UV in the array.
- * Inverting an inversion list consists of adding or removing the 0 at the
- * beginning of it. By reserving a space for that 0, inversion can be made
- * very fast */
-
-#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
-
-/* Internally things are UVs */
#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
/* Must be empty */
- assert(! *get_invlist_len_addr(invlist));
+ assert(! *_get_invlist_len_addr(invlist));
/* 1^1 = 0; 1^0 = 1 */
*zero = 1 ^ will_have_0;
/* Must not be empty. If these fail, you probably didn't check for <len>
* being non-zero before trying to get the array */
- assert(*get_invlist_len_addr(invlist));
+ assert(*_get_invlist_len_addr(invlist));
assert(*get_invlist_zero_addr(invlist) == 0
|| *get_invlist_zero_addr(invlist) == 1);
+ *get_invlist_zero_addr(invlist));
}
-PERL_STATIC_INLINE UV*
-S_get_invlist_len_addr(pTHX_ SV* invlist)
-{
- /* Return the address of the UV that contains the current number
- * of used elements in the inversion list */
-
- PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
-
- return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
-}
-
-PERL_STATIC_INLINE UV
-S_invlist_len(pTHX_ SV* const invlist)
-{
- /* Returns the current number of elements stored in the inversion list's
- * array */
-
- PERL_ARGS_ASSERT_INVLIST_LEN;
-
- return *get_invlist_len_addr(invlist);
-}
-
PERL_STATIC_INLINE void
S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
{
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
- *get_invlist_len_addr(invlist) = len;
+ *_get_invlist_len_addr(invlist) = len;
assert(len <= SvLEN(invlist));
* Note that when inverting, SvCUR shouldn't change */
}
+PERL_STATIC_INLINE IV*
+S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
+{
+ /* Return the address of the UV that is reserved to hold the cached index
+ * */
+
+ PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
+
+ return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+}
+
+PERL_STATIC_INLINE IV
+S_invlist_previous_index(pTHX_ SV* const invlist)
+{
+ /* Returns cached index of previous search */
+
+ PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
+
+ return *get_invlist_previous_index_addr(invlist);
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
+{
+ /* Caches <index> for later retrieval */
+
+ PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
+
+ assert(index == 0 || index < (int) _invlist_len(invlist));
+
+ *get_invlist_previous_index_addr(invlist) = index;
+}
+
PERL_STATIC_INLINE UV
S_invlist_max(pTHX_ SV* const invlist)
{
* properly */
*get_invlist_zero_addr(new_list) = UV_MAX;
+ *get_invlist_previous_index_addr(new_list) = 0;
*get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 4
+#if HEADER_LENGTH != 5
# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
#endif
SvPV_set(invlist, (char *) list);
SvLEN_set(invlist, 0); /* Means we own the contents, and the system
shouldn't touch it */
- SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
+ SvCUR_set(invlist, TO_INTERNAL_SIZE(_invlist_len(invlist)));
if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
SvPV_shrink_to_cur((SV *) invlist);
}
-/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
- * etc */
-#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
-#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
-
#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
STATIC void
UV* array;
UV max = invlist_max(invlist);
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
#ifndef PERL_IN_XSUB_RE
-STATIC IV
-S_invlist_search(pTHX_ SV* const invlist, const UV cp)
+IV
+Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
* contains <cp> */
IV low = 0;
- IV high = invlist_len(invlist);
- const UV * const array = invlist_array(invlist);
+ IV mid;
+ IV high = _invlist_len(invlist);
+ const IV highest_element = high - 1;
+ const UV* array;
- PERL_ARGS_ASSERT_INVLIST_SEARCH;
+ PERL_ARGS_ASSERT__INVLIST_SEARCH;
- /* If list is empty or the code point is before the first element, return
- * failure. */
- if (high == 0 || cp < array[0]) {
+ /* If list is empty, return failure. */
+ if (high == 0) {
return -1;
}
+ /* If the code point is before the first element, return failure. (We
+ * can't combine this with the test above, because we can't get the array
+ * unless we know the list is non-empty) */
+ array = invlist_array(invlist);
+
+ mid = invlist_previous_index(invlist);
+ assert(mid >=0 && mid <= highest_element);
+
+ /* <mid> contains the cache of the result of the previous call to this
+ * function (0 the first time). See if this call is for the same result,
+ * or if it is for mid-1. This is under the theory that calls to this
+ * function will often be for related code points that are near each other.
+ * And benchmarks show that caching gives better results. We also test
+ * here if the code point is within the bounds of the list. These tests
+ * replace others that would have had to be made anyway to make sure that
+ * the array bounds were not exceeded, and give us extra information at the
+ * same time */
+ if (cp >= array[mid]) {
+ if (cp >= array[highest_element]) {
+ return highest_element;
+ }
+
+ /* Here, array[mid] <= cp < array[highest_element]. This means that
+ * the final element is not the answer, so can exclude it; it also
+ * means that <mid> is not the final element, so can refer to 'mid + 1'
+ * safely */
+ if (cp < array[mid + 1]) {
+ return mid;
+ }
+ high--;
+ low = mid + 1;
+ }
+ else { /* cp < aray[mid] */
+ if (cp < array[0]) { /* Fail if outside the array */
+ return -1;
+ }
+ high = mid;
+ if (cp >= array[mid - 1]) {
+ goto found_entry;
+ }
+ }
+
/* Binary search. What we are looking for is <i> such that
* array[i] <= cp < array[i+1]
- * The loop below converges on the i+1. */
+ * The loop below converges on the i+1. Note that there may not be an
+ * (i+1)th element in the array, and things work nonetheless */
while (low < high) {
- IV mid = (low + high) / 2;
- if (array[mid] <= cp) {
+ mid = (low + high) / 2;
+ assert(mid <= highest_element);
+ if (array[mid] <= cp) { /* cp >= array[mid] */
low = mid + 1;
/* We could do this extra test to exit the loop early.
}
}
- return high - 1;
+ found_entry:
+ high--;
+ invlist_set_previous_index(invlist, high);
+ return high;
}
void
* that <swatch> is all 0's on input */
UV current = start;
- const IV len = invlist_len(invlist);
+ const IV len = _invlist_len(invlist);
IV i;
const UV * array;
array = invlist_array(invlist);
/* Find which element it is */
- i = invlist_search(invlist, start);
+ i = _invlist_search(invlist, start);
/* We populate from <start> to <end> */
while (current < end) {
current = array[i];
if (current >= end) { /* Finished if beyond the end of what we
are populating */
- return;
+ if (LIKELY(end < UV_MAX)) {
+ return;
+ }
+
+ /* We get here when the upper bound is the maximum
+ * representable on the machine, and we are looking for just
+ * that code point. Have to special case it */
+ i = len;
+ goto join_end_of_list;
}
}
assert(current >= start);
swatch[offset >> 3] |= 1 << (offset & 7);
}
+ join_end_of_list:
+
/* Quit if at the end of the list */
if (i >= len) {
return;
}
-
void
Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
{
assert(a != b);
/* If either one is empty, the union is the other one */
- if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
+ if (a == NULL || ((len_a = _invlist_len(a)) == 0)) {
if (*output == a) {
if (a != NULL) {
SvREFCNT_dec(a);
} /* else *output already = b; */
return;
}
- else if ((len_b = invlist_len(b)) == 0) {
+ else if ((len_b = _invlist_len(b)) == 0) {
if (*output == b) {
SvREFCNT_dec(b);
}
/* Set result to final length, which can change the pointer to array_u, so
* re-find it */
- if (len_u != invlist_len(u)) {
+ if (len_u != _invlist_len(u)) {
invlist_set_len(u, len_u);
invlist_trim(u);
array_u = invlist_array(u);
assert(a != b);
/* Special case if either one is empty */
- len_a = invlist_len(a);
- if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
+ len_a = _invlist_len(a);
+ if ((len_a == 0) || ((len_b = _invlist_len(b)) == 0)) {
if (len_a != 0 && complement_b) {
/* Set result to final length, which can change the pointer to array_r, so
* re-find it */
- if (len_r != invlist_len(r)) {
+ if (len_r != _invlist_len(r)) {
invlist_set_len(r, len_r);
invlist_trim(r);
array_r = invlist_array(r);
len = 0;
}
else {
- len = invlist_len(invlist);
+ len = _invlist_len(invlist);
}
/* If comes after the final entry, can just append it to the end */
if (len == 0
|| start >= invlist_array(invlist)
- [invlist_len(invlist) - 1])
+ [_invlist_len(invlist) - 1])
{
_append_range_to_invlist(invlist, start, end);
return invlist;
* have a zero; removes it otherwise. As described above, the data
* structure is set up so that this is very efficient */
- UV* len_pos = get_invlist_len_addr(invlist);
+ UV* len_pos = _get_invlist_len_addr(invlist);
PERL_ARGS_ASSERT__INVLIST_INVERT;
_invlist_invert(invlist);
- len = invlist_len(invlist);
+ len = _invlist_len(invlist);
if (len != 0) { /* If empty do nothing */
array = invlist_array(invlist);
/* Need to allocate extra space to accommodate Perl's addition of a
* trailing NUL to SvPV's, since it thinks they are always strings */
- SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
+ SV* new_invlist = _new_invlist(_invlist_len(invlist) + 1);
STRLEN length = SvCUR(invlist);
PERL_ARGS_ASSERT_INVLIST_CLONE;
* will start over at the beginning of the list */
UV* pos = get_invlist_iter_addr(invlist);
- UV len = invlist_len(invlist);
+ UV len = _invlist_len(invlist);
UV *array;
PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
return TRUE;
}
+PERL_STATIC_INLINE UV
+S_invlist_highest(pTHX_ SV* const invlist)
+{
+ /* Returns the highest code point that matches an inversion list. This API
+ * has an ambiguity, as it returns 0 under either the highest is actually
+ * 0, or if the list is empty. If this distinction matters to you, check
+ * for emptiness before calling this function */
+
+ UV len = _invlist_len(invlist);
+ UV *array;
+
+ PERL_ARGS_ASSERT_INVLIST_HIGHEST;
+
+ if (len == 0) {
+ return 0;
+ }
+
+ array = invlist_array(invlist);
+
+ /* The last element in the array in the inversion list always starts a
+ * range that goes to infinity. That range may be for code points that are
+ * matched in the inversion list, or it may be for ones that aren't
+ * matched. In the latter case, the highest code point in the set is one
+ * less than the beginning of this range; otherwise it is the final element
+ * of this range: infinity */
+ return (ELEMENT_RANGE_MATCHES_INVLIST(len - 1))
+ ? UV_MAX
+ : array[len - 1] - 1;
+}
+
#ifndef PERL_IN_XSUB_RE
SV *
Perl__invlist_contents(pTHX_ SV* const invlist)
}
#endif
+#if 0
+bool
+S__invlistEQ(pTHX_ SV* const a, SV* const b, bool complement_b)
+{
+ /* Return a boolean as to if the two passed in inversion lists are
+ * identical. The final argument, if TRUE, says to take the complement of
+ * the second inversion list before doing the comparison */
+
+ UV* array_a = invlist_array(a);
+ UV* array_b = invlist_array(b);
+ UV len_a = _invlist_len(a);
+ UV len_b = _invlist_len(b);
+
+ UV i = 0; /* current index into the arrays */
+ bool retval = TRUE; /* Assume are identical until proven otherwise */
+
+ PERL_ARGS_ASSERT__INVLISTEQ;
+
+ /* If are to compare 'a' with the complement of b, set it
+ * up so are looking at b's complement. */
+ if (complement_b) {
+
+ /* The complement of nothing is everything, so <a> would have to have
+ * just one element, starting at zero (ending at infinity) */
+ if (len_b == 0) {
+ return (len_a == 1 && array_a[0] == 0);
+ }
+ else if (array_b[0] == 0) {
+
+ /* Otherwise, to complement, we invert. Here, the first element is
+ * 0, just remove it. To do this, we just pretend the array starts
+ * one later, and clear the flag as we don't have to do anything
+ * else later */
+
+ array_b++;
+ len_b--;
+ complement_b = FALSE;
+ }
+ else {
+
+ /* But if the first element is not zero, we unshift a 0 before the
+ * array. The data structure reserves a space for that 0 (which
+ * should be a '1' right now), so physical shifting is unneeded,
+ * but temporarily change that element to 0. Before exiting the
+ * routine, we must restore the element to '1' */
+ array_b--;
+ len_b++;
+ array_b[0] = 0;
+ }
+ }
+
+ /* Make sure that the lengths are the same, as well as the final element
+ * before looping through the remainder. (Thus we test the length, final,
+ * and first elements right off the bat) */
+ if (len_a != len_b || array_a[len_a-1] != array_b[len_a-1]) {
+ retval = FALSE;
+ }
+ else for (i = 0; i < len_a - 1; i++) {
+ if (array_a[i] != array_b[i]) {
+ retval = FALSE;
+ break;
+ }
+ }
+
+ if (complement_b) {
+ array_b[0] = 1;
+ }
+ return retval;
+}
+#endif
+
#undef HEADER_LENGTH
#undef INVLIST_INITIAL_LENGTH
#undef TO_INTERNAL_SIZE
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
dVAR;
- register regnode *ret; /* Will be the head of the group. */
- register regnode *br;
- register regnode *lastbr;
- register regnode *ender = NULL;
- register I32 parno = 0;
+ regnode *ret; /* Will be the head of the group. */
+ regnode *br;
+ regnode *lastbr;
+ regnode *ender = NULL;
+ I32 parno = 0;
I32 flags;
U32 oregflags = RExC_flags;
bool have_branch = 0;
ret = reganode(pRExC_state,
((! FOLD)
? NREF
- : (MORE_ASCII_RESTRICTED)
+ : (ASCII_FOLD_RESTRICTED)
? NREFFA
: (AT_LEAST_UNI_SEMANTICS)
? NREFFU
num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
}
goto gen_recurse_regop;
- /* NOT REACHED */
+ assert(0); /* NOT REACHED */
case '+':
if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
RExC_parse++;
nextchar(pRExC_state);
return ret;
} /* named and numeric backreferences */
- /* NOT REACHED */
+ assert(0); /* NOT REACHED */
case '?': /* (??...) */
is_logical = 1;
nextchar(pRExC_state);
if (is_logical) {
+ regnode *eval;
ret = reg_node(pRExC_state, LOGICAL);
- if (!SIZE_ONLY)
+ eval = reganode(pRExC_state, EVAL, n);
+ if (!SIZE_ONLY) {
ret->flags = 2;
- REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
+ /* for later propagation into (??{}) return value */
+ eval->flags = (U8) (RExC_flags & RXf_PMf_COMPILETIME);
+ }
+ REGTAIL(pRExC_state, ret, eval);
/* deal with the length of this later - MJD */
return ret;
}
}
else
FAIL("Junk on end of regexp"); /* "Can't happen". */
- /* NOTREACHED */
+ assert(0); /* NOTREACHED */
}
if (RExC_in_lookbehind) {
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
{
dVAR;
- register regnode *ret;
- register regnode *chain = NULL;
- register regnode *latest;
+ regnode *ret;
+ regnode *chain = NULL;
+ regnode *latest;
I32 flags = 0, c = 0;
GET_RE_DEBUG_FLAGS_DECL;
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
dVAR;
- register regnode *ret;
- register char op;
- register char *next;
+ regnode *ret;
+ char op;
+ char *next;
I32 flags;
const char * const origparse = RExC_parse;
I32 min;
return(ret);
}
-
-/* reg_namedseq(pRExC_state,UVp, UV depth)
+STATIC bool
+S_grok_bslash_N(pTHX_ RExC_state_t *pRExC_state, regnode** node_p, UV *valuep, I32 *flagp, U32 depth, bool in_char_class)
+{
- This is expected to be called by a parser routine that has
- recognized '\N' and needs to handle the rest. RExC_parse is
- expected to point at the first char following the N at the time
- of the call.
+ /* This is expected to be called by a parser routine that has recognized '\N'
+ and needs to handle the rest. RExC_parse is expected to point at the first
+ char following the N at the time of the call. On successful return,
+ RExC_parse has been updated to point to just after the sequence identified
+ by this routine, and <*flagp> has been updated.
- The \N may be inside (indicated by valuep not being NULL) or outside a
+ The \N may be inside (indicated by the boolean <in_char_class>) or outside a
character class.
\N may begin either a named sequence, or if outside a character class, mean
to match a non-newline. For non single-quoted regexes, the tokenizer has
- attempted to decide which, and in the case of a named sequence converted it
+ attempted to decide which, and in the case of a named sequence, converted it
into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
where c1... are the characters in the sequence. For single-quoted regexes,
the tokenizer passes the \N sequence through unchanged; this code will not
- attempt to determine this nor expand those. The net effect is that if the
- beginning of the passed-in pattern isn't '{U+' or there is no '}', it
- signals that this \N occurrence means to match a non-newline.
-
+ attempt to determine this nor expand those, instead raising a syntax error.
+ The net effect is that if the beginning of the passed-in pattern isn't '{U+'
+ or there is no '}', it signals that this \N occurrence means to match a
+ non-newline.
+
Only the \N{U+...} form should occur in a character class, for the same
reason that '.' inside a character class means to just match a period: it
just doesn't make sense.
-
- If valuep is non-null then it is assumed that we are parsing inside
- of a charclass definition and the first codepoint in the resolved
- string is returned via *valuep and the routine will return NULL.
- In this mode if a multichar string is returned from the charnames
- handler, a warning will be issued, and only the first char in the
- sequence will be examined. If the string returned is zero length
- then the value of *valuep is undefined and NON-NULL will
- be returned to indicate failure. (This will NOT be a valid pointer
- to a regnode.)
-
- If valuep is null then it is assumed that we are parsing normal text and a
- new EXACT node is inserted into the program containing the resolved string,
- and a pointer to the new node is returned. But if the string is zero length
- a NOTHING node is emitted instead.
- On success RExC_parse is set to the char following the endbrace.
- Parsing failures will generate a fatal error via vFAIL(...)
+ The function raises an error (via vFAIL), and doesn't return for various
+ syntax errors. Otherwise it returns TRUE and sets <node_p> or <valuep> on
+ success; it returns FALSE otherwise.
+
+ If <valuep> is non-null, it means the caller can accept an input sequence
+ consisting of a just a single code point; <*valuep> is set to that value
+ if the input is such.
+
+ If <node_p> is non-null it signifies that the caller can accept any other
+ legal sequence (i.e., one that isn't just a single code point). <*node_p>
+ is set as follows:
+ 1) \N means not-a-NL: points to a newly created REG_ANY node;
+ 2) \N{}: points to a new NOTHING node;
+ 3) otherwise: points to a new EXACT node containing the resolved
+ string.
+ Note that FALSE is returned for single code point sequences if <valuep> is
+ null.
*/
-STATIC regnode *
-S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
-{
+
char * endbrace; /* '}' following the name */
- regnode *ret = NULL;
char* p;
+ char *endchar; /* Points to '.' or '}' ending cur char in the input
+ stream */
+ bool has_multiple_chars; /* true if the input stream contains a sequence of
+ more than one character */
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_REG_NAMEDSEQ;
+ PERL_ARGS_ASSERT_GROK_BSLASH_N;
GET_RE_DEBUG_FLAGS;
+ assert(cBOOL(node_p) ^ cBOOL(valuep)); /* Exactly one should be set */
+
/* The [^\n] meaning of \N ignores spaces and comments under the /x
* modifier. The other meaning does not */
p = (RExC_flags & RXf_PMf_EXTENDED)
? regwhite( pRExC_state, RExC_parse )
: RExC_parse;
-
+
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The former is assumed when it can't be the latter. */
if (*p != '{' || regcurly(p)) {
RExC_parse = p;
- if (valuep) {
+ if (! node_p) {
/* no bare \N in a charclass */
- vFAIL("\\N in a character class must be a named character: \\N{...}");
- }
+ if (in_char_class) {
+ vFAIL("\\N in a character class must be a named character: \\N{...}");
+ }
+ return FALSE;
+ }
nextchar(pRExC_state);
- ret = reg_node(pRExC_state, REG_ANY);
+ *node_p = reg_node(pRExC_state, REG_ANY);
*flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
RExC_parse--;
- Set_Node_Length(ret, 1); /* MJD */
- return ret;
+ Set_Node_Length(*node_p, 1); /* MJD */
+ return TRUE;
}
- /* Here, we have decided it should be a named sequence */
+ /* Here, we have decided it should be a named character or sequence */
/* The test above made sure that the next real character is a '{', but
* under the /x modifier, it could be separated by space (or a comment and
}
if (endbrace == RExC_parse) { /* empty: \N{} */
- if (! valuep) {
- RExC_parse = endbrace + 1;
- return reg_node(pRExC_state,NOTHING);
- }
-
- if (SIZE_ONLY) {
- ckWARNreg(RExC_parse,
- "Ignoring zero length \\N{} in character class"
- );
- RExC_parse = endbrace + 1;
+ bool ret = TRUE;
+ if (node_p) {
+ *node_p = reg_node(pRExC_state,NOTHING);
+ }
+ else if (in_char_class) {
+ if (SIZE_ONLY && in_char_class) {
+ ckWARNreg(RExC_parse,
+ "Ignoring zero length \\N{} in character class"
+ );
+ }
+ ret = FALSE;
}
- *valuep = 0;
- return (regnode *) &RExC_parse; /* Invalid regnode pointer */
+ else {
+ return FALSE;
+ }
+ nextchar(pRExC_state);
+ return ret;
}
- REQUIRE_UTF8; /* named sequences imply Unicode semantics */
+ RExC_uni_semantics = 1; /* Unicode named chars imply Unicode semantics */
RExC_parse += 2; /* Skip past the 'U+' */
- if (valuep) { /* In a bracketed char class */
- /* We only pay attention to the first char of
- multichar strings being returned. I kinda wonder
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
+
+ /* Code points are separated by dots. If none, there is only one code
+ * point, and is terminated by the brace */
+ has_multiple_chars = (endchar < endbrace);
+
+ if (valuep && (! has_multiple_chars || in_char_class)) {
+ /* We only pay attention to the first char of
+ multichar strings being returned in char classes. I kinda wonder
if this makes sense as it does change the behaviour
from earlier versions, OTOH that behaviour was broken
as well. XXX Solution is to recharacterize as
[rest-of-class]|multi1|multi2... */
- STRLEN length_of_hex;
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
+ STRLEN length_of_hex = (STRLEN)(endchar - RExC_parse);
+ I32 grok_hex_flags = PERL_SCAN_ALLOW_UNDERSCORES
| PERL_SCAN_DISALLOW_PREFIX
| (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
-
- char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
- if (endchar < endbrace) {
- ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
- }
- length_of_hex = (STRLEN)(endchar - RExC_parse);
- *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
+ *valuep = grok_hex(RExC_parse, &length_of_hex, &grok_hex_flags, NULL);
/* The tokenizer should have guaranteed validity, but it's possible to
* bypass it by using single quoting, so check */
? UTF8SKIP(RExC_parse)
: 1;
/* Guard against malformed utf8 */
- if (RExC_parse >= endchar) RExC_parse = endchar;
+ if (RExC_parse >= endchar) {
+ RExC_parse = endchar;
+ }
vFAIL("Invalid hexadecimal number in \\N{U+...}");
- }
+ }
- RExC_parse = endbrace + 1;
- if (endchar == endbrace) return NULL;
+ if (in_char_class && has_multiple_chars) {
+ ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
+ }
+ RExC_parse = endbrace + 1;
+ }
+ else if (! node_p || ! has_multiple_chars) {
- ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
+ /* Here, the input is legal, but not according to the caller's
+ * options. We fail without advancing the parse, so that the
+ * caller can try again */
+ RExC_parse = p;
+ return FALSE;
}
- else { /* Not a char class */
+ else {
/* What is done here is to convert this to a sub-pattern of the form
* (?:\x{char1}\x{char2}...)
SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
STRLEN len;
- char *endchar; /* Points to '.' or '}' ending cur char in the input
- stream */
char *orig_end = RExC_end;
+ I32 flags;
while (RExC_parse < endbrace) {
- /* Code points are separated by dots. If none, there is only one
- * code point, and is terminated by the brace */
- endchar = RExC_parse + strcspn(RExC_parse, ".}");
-
/* Convert to notation the rest of the code understands */
sv_catpv(substitute_parse, "\\x{");
sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
/* Point to the beginning of the next character in the sequence. */
RExC_parse = endchar + 1;
+ endchar = RExC_parse + strcspn(RExC_parse, ".}");
}
sv_catpv(substitute_parse, ")");
/* The values are Unicode, and therefore not subject to recoding */
RExC_override_recoding = 1;
- ret = reg(pRExC_state, 1, flagp, depth+1);
+ *node_p = reg(pRExC_state, 1, &flags, depth+1);
+ *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
RExC_parse = endbrace;
RExC_end = orig_end;
RExC_override_recoding = 0;
- nextchar(pRExC_state);
+ nextchar(pRExC_state);
}
- return ret;
+ return TRUE;
}
return uv;
}
+PERL_STATIC_INLINE U8
+S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
+{
+ U8 op;
-/*
- - regatom - the lowest level
-
- Try to identify anything special at the start of the pattern. If there
- is, then handle it as required. This may involve generating a single regop,
- such as for an assertion; or it may involve recursing, such as to
- handle a () structure.
+ PERL_ARGS_ASSERT_COMPUTE_EXACTISH;
- If the string doesn't start with something special then we gobble up
- as much literal text as we can.
+ if (! FOLD) {
+ return EXACT;
+ }
- Once we have been able to handle whatever type of thing started the
- sequence, we return.
+ op = get_regex_charset(RExC_flags);
+ if (op >= REGEX_ASCII_RESTRICTED_CHARSET) {
+ op--; /* /a is same as /u, and map /aa's offset to what /a's would have
+ been, so there is no hole */
+ }
- Note: we have to be careful with escapes, as they can be both literal
- and special, and in the case of \10 and friends can either, depending
- on context. Specifically there are two separate switches for handling
- escape sequences, with the one for handling literal escapes requiring
- a dummy entry for all of the special escapes that are actually handled
- by the other.
-*/
+ return op + EXACTF;
+}
-STATIC regnode *
-S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
+PERL_STATIC_INLINE void
+S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32* flagp, STRLEN len, UV code_point)
{
- dVAR;
- register regnode *ret = NULL;
- I32 flags;
- char *parse_start = RExC_parse;
- U8 op;
- GET_RE_DEBUG_FLAGS_DECL;
+ /* This knows the details about sizing an EXACTish node, setting flags for
+ * it (by setting <*flagp>, and potentially populating it with a single
+ * character.
+ *
+ * If <len> is non-zero, this function assumes that the node has already
+ * been populated, and just does the sizing. In this case <code_point>
+ * should be the final code point that has already been placed into the
+ * node. This value will be ignored except that under some circumstances
+ * <*flagp> is set based on it.
+ *
+ * If <len is zero, the function assumes that the node is to contain only
+ * the single character given by <code_point> and calculates what <len>
+ * should be. In pass 1, it sizes the node appropriately. In pass 2, it
+ * additionally will populate the node's STRING with <code_point>, if <len>
+ * is 0. In both cases <*flagp> is appropriately set
+ *
+ * It knows that under FOLD, UTF characters and the Latin Sharp S must be
+ * folded (the latter only when the rules indicate it can match 'ss') */
+
+ bool len_passed_in = cBOOL(len != 0);
+ U8 character[UTF8_MAXBYTES_CASE+1];
+
+ PERL_ARGS_ASSERT_ALLOC_MAYBE_POPULATE_EXACT;
+
+ if (! len_passed_in) {
+ if (UTF) {
+ if (FOLD) {
+ to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
+ }
+ else {
+ uvchr_to_utf8( character, code_point);
+ len = UTF8SKIP(character);
+ }
+ }
+ else if (! FOLD
+ || code_point != LATIN_SMALL_LETTER_SHARP_S
+ || ASCII_FOLD_RESTRICTED
+ || ! AT_LEAST_UNI_SEMANTICS)
+ {
+ *character = (U8) code_point;
+ len = 1;
+ }
+ else {
+ *character = 's';
+ *(character + 1) = 's';
+ len = 2;
+ }
+ }
+
+ if (SIZE_ONLY) {
+ RExC_size += STR_SZ(len);
+ }
+ else {
+ RExC_emit += STR_SZ(len);
+ STR_LEN(node) = len;
+ if (! len_passed_in) {
+ Copy((char *) character, STRING(node), len, char);
+ }
+ }
+
+ *flagp |= HASWIDTH;
+ if (len == 1 && UNI_IS_INVARIANT(code_point))
+ *flagp |= SIMPLE;
+}
+
+/*
+ - regatom - the lowest level
+
+ Try to identify anything special at the start of the pattern. If there
+ is, then handle it as required. This may involve generating a single regop,
+ such as for an assertion; or it may involve recursing, such as to
+ handle a () structure.
+
+ If the string doesn't start with something special then we gobble up
+ as much literal text as we can.
+
+ Once we have been able to handle whatever type of thing started the
+ sequence, we return.
+
+ Note: we have to be careful with escapes, as they can be both literal
+ and special, and in the case of \10 and friends, context determines which.
+
+ A summary of the code structure is:
+
+ switch (first_byte) {
+ cases for each special:
+ handle this special;
+ break;
+ case '\\':
+ switch (2nd byte) {
+ cases for each unambiguous special:
+ handle this special;
+ break;
+ cases for each ambigous special/literal:
+ disambiguate;
+ if (special) handle here
+ else goto defchar;
+ default: // unambiguously literal:
+ goto defchar;
+ }
+ default: // is a literal char
+ // FALL THROUGH
+ defchar:
+ create EXACTish node for literal;
+ while (more input and node isn't full) {
+ switch (input_byte) {
+ cases for each special;
+ make sure parse pointer is set so that the next call to
+ regatom will see this special first
+ goto loopdone; // EXACTish node terminated by prev. char
+ default:
+ append char to EXACTISH node;
+ }
+ get next input byte;
+ }
+ loopdone:
+ }
+ return the generated node;
+
+ Specifically there are two separate switches for handling
+ escape sequences, with the one for handling literal escapes requiring
+ a dummy entry for all of the special escapes that are actually handled
+ by the other.
+*/
+
+STATIC regnode *
+S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
+{
+ dVAR;
+ regnode *ret = NULL;
+ I32 flags;
+ char *parse_start = RExC_parse;
+ U8 op;
+ GET_RE_DEBUG_FLAGS_DECL;
DEBUG_PARSE("atom");
*flagp = WORST; /* Tentatively. */
case '[':
{
char * const oregcomp_parse = ++RExC_parse;
- ret = regclass(pRExC_state,depth+1);
+ ret = regclass(pRExC_state, flagp,depth+1);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
}
nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
break;
}
*flagp |= HASWIDTH;
goto finish_meta_pat;
case 'w':
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = ALNUML;
- break;
- case REGEX_UNICODE_CHARSET:
- op = ALNUMU;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = ALNUMA;
- break;
- case REGEX_DEPENDS_CHARSET:
- op = ALNUM;
- break;
- default:
- goto bad_charset;
+ op = ALNUM + get_regex_charset(RExC_flags);
+ if (op > ALNUMA) { /* /aa is same as /a */
+ op = ALNUMA;
}
ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'W':
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = NALNUML;
- break;
- case REGEX_UNICODE_CHARSET:
- op = NALNUMU;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = NALNUMA;
- break;
- case REGEX_DEPENDS_CHARSET:
- op = NALNUM;
- break;
- default:
- goto bad_charset;
+ op = NALNUM + get_regex_charset(RExC_flags);
+ if (op > NALNUMA) { /* /aa is same as /a */
+ op = NALNUMA;
}
ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = BOUNDL;
- break;
- case REGEX_UNICODE_CHARSET:
- op = BOUNDU;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = BOUNDA;
- break;
- case REGEX_DEPENDS_CHARSET:
- op = BOUND;
- break;
- default:
- goto bad_charset;
+ op = BOUND + get_regex_charset(RExC_flags);
+ if (op > BOUNDA) { /* /aa is same as /a */
+ op = BOUNDA;
}
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = NBOUNDL;
- break;
- case REGEX_UNICODE_CHARSET:
- op = NBOUNDU;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = NBOUNDA;
- break;
- case REGEX_DEPENDS_CHARSET:
- op = NBOUND;
- break;
- default:
- goto bad_charset;
+ op = NBOUND + get_regex_charset(RExC_flags);
+ if (op > NBOUNDA) { /* /aa is same as /a */
+ op = NBOUNDA;
}
ret = reg_node(pRExC_state, op);
FLAGS(ret) = get_regex_charset(RExC_flags);
*flagp |= SIMPLE;
goto finish_meta_pat;
case 's':
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = SPACEL;
- break;
- case REGEX_UNICODE_CHARSET:
- op = SPACEU;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = SPACEA;
- break;
- case REGEX_DEPENDS_CHARSET:
- op = SPACE;
- break;
- default:
- goto bad_charset;
+ op = SPACE + get_regex_charset(RExC_flags);
+ if (op > SPACEA) { /* /aa is same as /a */
+ op = SPACEA;
}
ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'S':
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = NSPACEL;
- break;
- case REGEX_UNICODE_CHARSET:
- op = NSPACEU;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = NSPACEA;
- break;
- case REGEX_DEPENDS_CHARSET:
- op = NSPACE;
- break;
- default:
- goto bad_charset;
- }
- ret = reg_node(pRExC_state, op);
- *flagp |= HASWIDTH|SIMPLE;
- goto finish_meta_pat;
- case 'd':
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = DIGITL;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = DIGITA;
- break;
- case REGEX_DEPENDS_CHARSET: /* No difference between these */
- case REGEX_UNICODE_CHARSET:
- op = DIGIT;
- break;
- default:
- goto bad_charset;
+ op = NSPACE + get_regex_charset(RExC_flags);
+ if (op > NSPACEA) { /* /aa is same as /a */
+ op = NSPACEA;
}
ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
goto finish_meta_pat;
case 'D':
- switch (get_regex_charset(RExC_flags)) {
- case REGEX_LOCALE_CHARSET:
- op = NDIGITL;
- break;
- case REGEX_ASCII_RESTRICTED_CHARSET:
- case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
- op = NDIGITA;
- break;
- case REGEX_DEPENDS_CHARSET: /* No difference between these */
- case REGEX_UNICODE_CHARSET:
- op = NDIGIT;
- break;
- default:
- goto bad_charset;
+ op = NDIGIT;
+ goto join_D_and_d;
+ case 'd':
+ op = DIGIT;
+ join_D_and_d:
+ {
+ U8 offset = get_regex_charset(RExC_flags);
+ if (offset == REGEX_UNICODE_CHARSET) {
+ offset = REGEX_DEPENDS_CHARSET;
+ }
+ else if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
+ offset = REGEX_ASCII_RESTRICTED_CHARSET;
+ }
+ op += offset;
}
ret = reg_node(pRExC_state, op);
*flagp |= HASWIDTH|SIMPLE;
}
RExC_parse--;
- ret = regclass(pRExC_state,depth+1);
+ ret = regclass(pRExC_state, flagp,depth+1);
RExC_end = oldregxend;
RExC_parse--;
Set_Node_Offset(ret, parse_start + 2);
Set_Node_Cur_Length(ret);
nextchar(pRExC_state);
- *flagp |= HASWIDTH|SIMPLE;
}
break;
case 'N':
- /* Handle \N and \N{NAME} here and not below because it can be
- multicharacter. join_exact() will join them up later on.
- Also this makes sure that things like /\N{BLAH}+/ and
- \N{BLAH} being multi char Just Happen. dmq*/
+ /* Handle \N and \N{NAME} with multiple code points here and not
+ * below because it can be multicharacter. join_exact() will join
+ * them up later on. Also this makes sure that things like
+ * /\N{BLAH}+/ and \N{BLAH} being multi char Just Happen. dmq.
+ * The options to the grok function call causes it to fail if the
+ * sequence is just a single code point. We then go treat it as
+ * just another character in the current EXACT node, and hence it
+ * gets uniform treatment with all the other characters. The
+ * special treatment for quantifiers is not needed for such single
+ * character sequences */
++RExC_parse;
- ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
+ if (! grok_bslash_N(pRExC_state, &ret, NULL, flagp, depth, FALSE)) {
+ RExC_parse--;
+ goto defchar;
+ }
break;
case 'k': /* Handle \k<NAME> and \k'NAME' */
parse_named_seq:
ret = reganode(pRExC_state,
((! FOLD)
? NREF
- : (MORE_ASCII_RESTRICTED)
+ : (ASCII_FOLD_RESTRICTED)
? NREFFA
: (AT_LEAST_UNI_SEMANTICS)
? NREFFU
vFAIL("Reference to nonexistent or unclosed group");
}
if (!isg && num > 9 && num >= RExC_npar)
+ /* Probably a character specified in octal, e.g. \35 */
goto defchar;
else {
char * const parse_start = RExC_parse - 1; /* MJD */
ret = reganode(pRExC_state,
((! FOLD)
? REF
- : (MORE_ASCII_RESTRICTED)
+ : (ASCII_FOLD_RESTRICTED)
? REFFA
: (AT_LEAST_UNI_SEMANTICS)
? REFFU
RExC_parse++;
defchar: {
- register STRLEN len;
- register UV ender;
- register char *p;
+ STRLEN len = 0;
+ UV ender;
+ char *p;
char *s;
+#define MAX_NODE_STRING_SIZE 127
+ char foldbuf[MAX_NODE_STRING_SIZE+UTF8_MAXBYTES_CASE];
+ char *s0;
+ U8 upper_parse = MAX_NODE_STRING_SIZE;
STRLEN foldlen;
- U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
U8 node_type;
-
- /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
- * it is folded to 'ss' even if not utf8 */
- bool is_exactfu_sharp_s;
+ bool next_is_quantifier;
+ char * oldp = NULL;
ender = 0;
- node_type = ((! FOLD) ? EXACT
- : (LOC)
- ? EXACTFL
- : (MORE_ASCII_RESTRICTED)
- ? EXACTFA
- : (AT_LEAST_UNI_SEMANTICS)
- ? EXACTFU
- : EXACTF);
+ node_type = compute_EXACTish(pRExC_state);
ret = reg_node(pRExC_state, node_type);
- s = STRING(ret);
+
+ /* In pass1, folded, we use a temporary buffer instead of the
+ * actual node, as the node doesn't exist yet */
+ s = (SIZE_ONLY && FOLD) ? foldbuf : STRING(ret);
+
+ s0 = s;
+
+ reparse:
/* XXX The node can hold up to 255 bytes, yet this only goes to
* 127. I (khw) do not know why. Keeping it somewhat less than
* could back off to end with only a code point that isn't such a
* non-final, but it is possible for there not to be any in the
* entire node. */
- for (len = 0, p = RExC_parse - 1;
- len < 127 && p < RExC_end;
+ for (p = RExC_parse - 1;
+ len < upper_parse && p < RExC_end;
len++)
{
- char * const oldp = p;
+ oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
case 'g': case 'G': /* generic-backref, pos assertion */
case 'h': case 'H': /* HORIZWS */
case 'k': case 'K': /* named backref, keep marker */
- case 'N': /* named char sequence */
case 'p': case 'P': /* Unicode property */
case 'R': /* LNBREAK */
case 's': case 'S': /* space class */
ender = '\n';
p++;
break;
+ case 'N': /* Handle a single-code point named character. */
+ /* The options cause it to fail if a multiple code
+ * point sequence. Handle those in the switch() above
+ * */
+ RExC_parse = p + 1;
+ if (! grok_bslash_N(pRExC_state, NULL, &ender,
+ flagp, depth, FALSE))
+ {
+ RExC_parse = p = oldp;
+ goto loopdone;
+ }
+ p = RExC_parse;
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
+ break;
case 'r':
ender = '\r';
p++;
break;
}
case 'x':
- if (*++p == '{') {
- char* const e = strchr(p, '}');
+ {
+ STRLEN brace_len = len;
+ UV result;
+ const char* error_msg;
- if (!e) {
- RExC_parse = p + 1;
- vFAIL("Missing right brace on \\x{}");
+ bool valid = grok_bslash_x(p,
+ &result,
+ &brace_len,
+ &error_msg,
+ 1);
+ p += brace_len;
+ if (! valid) {
+ RExC_parse = p; /* going to die anyway; point
+ to exact spot of failure */
+ vFAIL(error_msg);
}
else {
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX;
- STRLEN numlen = e - p - 1;
- ender = grok_hex(p + 1, &numlen, &flags, NULL);
- if (ender > 0xff)
- REQUIRE_UTF8;
- p = e + 1;
+ ender = result;
}
+ if (PL_encoding && ender < 0x100) {
+ goto recode_encoding;
+ }
+ if (ender > 0xff) {
+ REQUIRE_UTF8;
+ }
+ break;
}
- else {
- I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- STRLEN numlen = 2;
- ender = grok_hex(p, &numlen, &flags, NULL);
- p += numlen;
- }
- if (PL_encoding && ender < 0x100)
- goto recode_encoding;
- break;
case 'c':
p++;
ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
break;
case '0': case '1': case '2': case '3':case '4':
- case '5': case '6': case '7': case '8':case '9':
+ case '5': case '6': case '7':
if (*p == '0' ||
(isDIGIT(p[1]) && atoi(p) >= RExC_npar))
{
FAIL("Trailing \\");
/* FALL THROUGH */
default:
- if (!SIZE_ONLY&& isALPHA(*p)) {
+ if (!SIZE_ONLY&& isALNUMC(*p)) {
ckWARN2reg(p + 1, "Unrecognized escape \\%.1s passed through", p);
}
goto normal_default;
break;
} /* End of switch on the literal */
- is_exactfu_sharp_s = (node_type == EXACTFU
- && ender == LATIN_SMALL_LETTER_SHARP_S);
+ /* Here, have looked at the literal character and <ender>
+ * contains its ordinal, <p> points to the character after it
+ */
+
if ( RExC_flags & RXf_PMf_EXTENDED)
p = regwhite( pRExC_state, p );
- if ((UTF && FOLD) || is_exactfu_sharp_s) {
- /* Prime the casefolded buffer. Locale rules, which apply
- * only to code points < 256, aren't known until execution,
- * so for them, just output the original character using
- * utf8. If we start to fold non-UTF patterns, be sure to
- * update join_exact() */
- if (LOC && ender < 256) {
- if (UNI_IS_INVARIANT(ender)) {
- *tmpbuf = (U8) ender;
- foldlen = 1;
- } else {
- *tmpbuf = UTF8_TWO_BYTE_HI(ender);
- *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
- foldlen = 2;
- }
- }
- else if (isASCII(ender)) { /* Note: Here can't also be LOC
- */
- ender = toLOWER(ender);
- *tmpbuf = (U8) ender;
- foldlen = 1;
- }
- else if (! MORE_ASCII_RESTRICTED && ! LOC) {
- /* Locale and /aa require more selectivity about the
- * fold, so are handled below. Otherwise, here, just
- * use the fold */
- ender = toFOLD_uni(ender, tmpbuf, &foldlen);
- }
- else {
- /* Under locale rules or /aa we are not to mix,
- * respectively, ords < 256 or ASCII with non-. So
- * reject folds that mix them, using only the
- * non-folded code point. So do the fold to a
- * temporary, and inspect each character in it. */
- U8 trialbuf[UTF8_MAXBYTES_CASE+1];
- U8* s = trialbuf;
- UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
- U8* e = s + foldlen;
- bool fold_ok = TRUE;
-
- while (s < e) {
- if (isASCII(*s)
- || (LOC && (UTF8_IS_INVARIANT(*s)
- || UTF8_IS_DOWNGRADEABLE_START(*s))))
- {
- fold_ok = FALSE;
- break;
- }
- s += UTF8SKIP(s);
- }
- if (fold_ok) {
- Copy(trialbuf, tmpbuf, foldlen, U8);
- ender = tmpender;
- }
- else {
- uvuni_to_utf8(tmpbuf, ender);
- foldlen = UNISKIP(ender);
- }
- }
- }
- if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
- if (len)
- p = oldp;
- else if (UTF || is_exactfu_sharp_s) {
- if (FOLD) {
- /* Emit all the Unicode characters. */
- STRLEN numlen;
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen) {
-
- /* tmpbuf has been constructed by us, so we
- * know it is valid utf8 */
- ender = valid_utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- s += unilen;
- len += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break; /* "Can't happen." */
- }
- }
- else {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- }
- else {
- len++;
- REGC((char)ender, s++);
- }
- break;
+ /* If the next thing is a quantifier, it applies to this
+ * character only, which means that this character has to be in
+ * its own node and can't just be appended to the string in an
+ * existing node, so if there are already other characters in
+ * the node, close the node with just them, and set up to do
+ * this character again next time through, when it will be the
+ * only thing in its new node */
+ if ((next_is_quantifier = (p < RExC_end && ISMULT2(p))) && len)
+ {
+ p = oldp;
+ goto loopdone;
+ }
+
+ if (FOLD) {
+ if (UTF
+ /* See comments for join_exact() as to why we fold
+ * this non-UTF at compile time */
+ || (node_type == EXACTFU
+ && ender == LATIN_SMALL_LETTER_SHARP_S))
+ {
+
+
+ /* Prime the casefolded buffer. Locale rules, which
+ * apply only to code points < 256, aren't known until
+ * execution, so for them, just output the original
+ * character using utf8. If we start to fold non-UTF
+ * patterns, be sure to update join_exact() */
+ if (LOC && ender < 256) {
+ if (UNI_IS_INVARIANT(ender)) {
+ *s = (U8) ender;
+ foldlen = 1;
+ } else {
+ *s = UTF8_TWO_BYTE_HI(ender);
+ *(s + 1) = UTF8_TWO_BYTE_LO(ender);
+ foldlen = 2;
+ }
+ }
+ else {
+ ender = _to_uni_fold_flags(ender, (U8 *) s, &foldlen,
+ FOLD_FLAGS_FULL
+ | ((LOC) ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0)
+ );
+ }
+ s += foldlen;
+
+ /* The loop increments <len> each time, as all but this
+ * path (and the one just below for UTF) through it add
+ * a single byte to the EXACTish node. But this one
+ * has changed len to be the correct final value, so
+ * subtract one to cancel out the increment that
+ * follows */
+ len += foldlen - 1;
+ }
+ else {
+ *(s++) = ender;
+ }
}
- if (UTF || is_exactfu_sharp_s) {
- if (FOLD) {
- /* Emit all the Unicode characters. */
- STRLEN numlen;
- for (foldbuf = tmpbuf;
- foldlen;
- foldlen -= numlen) {
- ender = valid_utf8_to_uvchr(foldbuf, &numlen);
- if (numlen > 0) {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- len += unilen;
- s += unilen;
- /* In EBCDIC the numlen
- * and unilen can differ. */
- foldbuf += numlen;
- if (numlen >= foldlen)
- break;
- }
- else
- break;
- }
- }
- else {
- const STRLEN unilen = reguni(pRExC_state, ender, s);
- if (unilen > 0) {
- s += unilen;
- len += unilen;
- }
- }
- len--;
+ else if (UTF) {
+ const STRLEN unilen = reguni(pRExC_state, ender, s);
+ if (unilen > 0) {
+ s += unilen;
+ len += unilen;
+ }
+
+ /* See comment just above for - 1 */
+ len--;
}
else {
REGC((char)ender, s++);
+ }
+
+ if (next_is_quantifier) {
+
+ /* Here, the next input is a quantifier, and to get here,
+ * the current character is the only one in the node.
+ * Also, here <len> doesn't include the final byte for this
+ * character */
+ len++;
+ goto loopdone;
}
- }
+
+ } /* End of loop through literal characters */
+
+ /* Here we have either exhausted the input or ran out of room in
+ * the node. (If we encountered a character that can't be in the
+ * node, transfer is made directly to <loopdone>, and so we
+ * wouldn't have fallen off the end of the loop.) In the latter
+ * case, we artificially have to split the node into two, because
+ * we just don't have enough space to hold everything. This
+ * creates a problem if the final character participates in a
+ * multi-character fold in the non-final position, as a match that
+ * should have occurred won't, due to the way nodes are matched,
+ * and our artificial boundary. So back off until we find a non-
+ * problematic character -- one that isn't at the beginning or
+ * middle of such a fold. (Either it doesn't participate in any
+ * folds, or appears only in the final position of all the folds it
+ * does participate in.) A better solution with far fewer false
+ * positives, and that would fill the nodes more completely, would
+ * be to actually have available all the multi-character folds to
+ * test against, and to back-off only far enough to be sure that
+ * this node isn't ending with a partial one. <upper_parse> is set
+ * further below (if we need to reparse the node) to include just
+ * up through that final non-problematic character that this code
+ * identifies, so when it is set to less than the full node, we can
+ * skip the rest of this */
+ if (FOLD && p < RExC_end && upper_parse == MAX_NODE_STRING_SIZE) {
+
+ const STRLEN full_len = len;
+
+ assert(len >= MAX_NODE_STRING_SIZE);
+
+ /* Here, <s> points to the final byte of the final character.
+ * Look backwards through the string until find a non-
+ * problematic character */
+
+ if (! UTF) {
+
+ /* These two have no multi-char folds to non-UTF characters
+ */
+ if (ASCII_FOLD_RESTRICTED || LOC) {
+ goto loopdone;
+ }
+
+ while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) { }
+ len = s - s0 + 1;
+ }
+ else {
+ if (! PL_NonL1NonFinalFold) {
+ PL_NonL1NonFinalFold = _new_invlist_C_array(
+ NonL1_Perl_Non_Final_Folds_invlist);
+ }
+
+ /* Point to the first byte of the final character */
+ s = (char *) utf8_hop((U8 *) s, -1);
+
+ while (s >= s0) { /* Search backwards until find
+ non-problematic char */
+ if (UTF8_IS_INVARIANT(*s)) {
+
+ /* There are no ascii characters that participate
+ * in multi-char folds under /aa. In EBCDIC, the
+ * non-ascii invariants are all control characters,
+ * so don't ever participate in any folds. */
+ if (ASCII_FOLD_RESTRICTED
+ || ! IS_NON_FINAL_FOLD(*s))
+ {
+ break;
+ }
+ }
+ else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+
+ /* No Latin1 characters participate in multi-char
+ * folds under /l */
+ if (LOC
+ || ! IS_NON_FINAL_FOLD(TWO_BYTE_UTF8_TO_UNI(
+ *s, *(s+1))))
+ {
+ break;
+ }
+ }
+ else if (! _invlist_contains_cp(
+ PL_NonL1NonFinalFold,
+ valid_utf8_to_uvchr((U8 *) s, NULL)))
+ {
+ break;
+ }
+
+ /* Here, the current character is problematic in that
+ * it does occur in the non-final position of some
+ * fold, so try the character before it, but have to
+ * special case the very first byte in the string, so
+ * we don't read outside the string */
+ s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
+ } /* End of loop backwards through the string */
+
+ /* If there were only problematic characters in the string,
+ * <s> will point to before s0, in which case the length
+ * should be 0, otherwise include the length of the
+ * non-problematic character just found */
+ len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
+ }
+
+ /* Here, have found the final character, if any, that is
+ * non-problematic as far as ending the node without splitting
+ * it across a potential multi-char fold. <len> contains the
+ * number of bytes in the node up-to and including that
+ * character, or is 0 if there is no such character, meaning
+ * the whole node contains only problematic characters. In
+ * this case, give up and just take the node as-is. We can't
+ * do any better */
+ if (len == 0) {
+ len = full_len;
+ } else {
+
+ /* Here, the node does contain some characters that aren't
+ * problematic. If one such is the final character in the
+ * node, we are done */
+ if (len == full_len) {
+ goto loopdone;
+ }
+ else if (len + ((UTF) ? UTF8SKIP(s) : 1) == full_len) {
+
+ /* If the final character is problematic, but the
+ * penultimate is not, back-off that last character to
+ * later start a new node with it */
+ p = oldp;
+ goto loopdone;
+ }
+
+ /* Here, the final non-problematic character is earlier
+ * in the input than the penultimate character. What we do
+ * is reparse from the beginning, going up only as far as
+ * this final ok one, thus guaranteeing that the node ends
+ * in an acceptable character. The reason we reparse is
+ * that we know how far in the character is, but we don't
+ * know how to correlate its position with the input parse.
+ * An alternate implementation would be to build that
+ * correlation as we go along during the original parse,
+ * but that would entail extra work for every node, whereas
+ * this code gets executed only when the string is too
+ * large for the node, and the final two characters are
+ * problematic, an infrequent occurrence. Yet another
+ * possible strategy would be to save the tail of the
+ * string, and the next time regatom is called, initialize
+ * with that. The problem with this is that unless you
+ * back off one more character, you won't be guaranteed
+ * regatom will get called again, unless regbranch,
+ * regpiece ... are also changed. If you do back off that
+ * extra character, so that there is input guaranteed to
+ * force calling regatom, you can't handle the case where
+ * just the first character in the node is acceptable. I
+ * (khw) decided to try this method which doesn't have that
+ * pitfall; if performance issues are found, we can do a
+ * combination of the current approach plus that one */
+ upper_parse = len;
+ len = 0;
+ s = s0;
+ goto reparse;
+ }
+ } /* End of verifying node ends with an appropriate char */
+
loopdone: /* Jumped to when encounters something that shouldn't be in
the node */
+
+ /* I (khw) don't know if you can get here with zero length, but the
+ * old code handled this situation by creating a zero-length EXACT
+ * node. Might as well be NOTHING instead */
+ if (len == 0) {
+ OP(ret) = NOTHING;
+ }
+ else{
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, len, ender);
+ }
+
RExC_parse = p - 1;
Set_Node_Cur_Length(ret); /* MJD */
nextchar(pRExC_state);
if (iv < 0)
vFAIL("Internal disaster");
}
- if (len > 0)
- *flagp |= HASWIDTH;
- if (len == 1 && UNI_IS_INVARIANT(ender))
- *flagp |= SIMPLE;
- if (SIZE_ONLY)
- RExC_size += STR_SZ(len);
- else {
- STR_LEN(ret) = len;
- RExC_emit += STR_SZ(len);
- }
- }
+ } /* End of label 'defchar:' */
break;
- }
+ } /* End of giant switch on input character */
return(ret);
-
-/* Jumped to when an unrecognized character set is encountered */
-bad_charset:
- Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
- return(NULL);
}
STATIC char *
switch (skip) {
case 4:
if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
- namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
+ namedclass = ANYOF_ALNUM;
break;
case 5:
/* Names all of length 5. */
switch (posixcc[4]) {
case 'a':
if (memEQ(posixcc, "alph", 4)) /* alpha */
- namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
+ namedclass = ANYOF_ALPHA;
break;
case 'e':
if (memEQ(posixcc, "spac", 4)) /* space */
- namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
+ namedclass = ANYOF_PSXSPC;
break;
case 'h':
if (memEQ(posixcc, "grap", 4)) /* graph */
- namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
+ namedclass = ANYOF_GRAPH;
break;
case 'i':
if (memEQ(posixcc, "asci", 4)) /* ascii */
- namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
+ namedclass = ANYOF_ASCII;
break;
case 'k':
if (memEQ(posixcc, "blan", 4)) /* blank */
- namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
+ namedclass = ANYOF_BLANK;
break;
case 'l':
if (memEQ(posixcc, "cntr", 4)) /* cntrl */
- namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
+ namedclass = ANYOF_CNTRL;
break;
case 'm':
if (memEQ(posixcc, "alnu", 4)) /* alnum */
- namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
+ namedclass = ANYOF_ALNUMC;
break;
case 'r':
if (memEQ(posixcc, "lowe", 4)) /* lower */
- namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
+ namedclass = ANYOF_LOWER;
else if (memEQ(posixcc, "uppe", 4)) /* upper */
- namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
+ namedclass = ANYOF_UPPER;
break;
case 't':
if (memEQ(posixcc, "digi", 4)) /* digit */
- namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
+ namedclass = ANYOF_DIGIT;
else if (memEQ(posixcc, "prin", 4)) /* print */
- namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
+ namedclass = ANYOF_PRINT;
else if (memEQ(posixcc, "punc", 4)) /* punct */
- namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
+ namedclass = ANYOF_PUNCT;
break;
}
break;
case 6:
if (memEQ(posixcc, "xdigit", 6))
- namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
+ namedclass = ANYOF_XDIGIT;
break;
}
if (namedclass == OOB_NAMEDCLASS)
Simple_vFAIL3("POSIX class [:%.*s:] unknown",
t - s - 1, s + 1);
+
+ /* The #defines are structured so each complement is +1 to
+ * the normal one */
+ if (complement) {
+ namedclass++;
+ }
assert (posixcc[skip] == ':');
assert (posixcc[skip+1] == ']');
} else if (!SIZE_ONLY) {
* determined at run-time
* run_time_list is a SV* that contains text names of properties that are to
* be computed at run time. This concatenates <Xpropertyname>
- * to it, apppropriately
+ * to it, appropriately
* This is essentially DO_POSIX, but we know only the Latin1 values at compile
* time */
#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
}
/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
- * this and DO_N_POSIX */
+ * this and DO_N_POSIX. Sets <matches_above_unicode> only if it can; unchanged
+ * otherwise */
#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
- l1_sourcelist, Xpropertyname, run_time_list) \
+ l1_sourcelist, Xpropertyname, run_time_list, matches_above_unicode) \
if (AT_LEAST_ASCII_RESTRICTED) { \
_invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
} \
else { \
Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
+ matches_above_unicode = TRUE; \
if (LOC) { \
- ANYOF_CLASS_SET(node, namedclass); \
+ ANYOF_CLASS_SET(node, namedclass); \
} \
else { \
SV* scratch_list = NULL; \
} \
}
-STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
-{
-
- /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
- * Locale folding is done at run-time, so this function should not be
- * called for nodes that are for locales.
- *
- * This function sets the bit corresponding to the fold of the input
- * 'value', if not already set. The fold of 'f' is 'F', and the fold of
- * 'F' is 'f'.
- *
- * It also knows about the characters that are in the bitmap that have
- * folds that are matchable only outside it, and sets the appropriate lists
- * and flags.
- *
- * It returns the number of bits that actually changed from 0 to 1 */
-
- U8 stored = 0;
- U8 fold;
-
- PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
-
- fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
- : PL_fold[value];
-
- /* It assumes the bit for 'value' has already been set */
- if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
- ANYOF_BITMAP_SET(node, fold);
- stored++;
- }
- if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
- /* Certain Latin1 characters have matches outside the bitmap. To get
- * here, 'value' is one of those characters. None of these matches is
- * valid for ASCII characters under /aa, which have been excluded by
- * the 'if' above. The matches fall into three categories:
- * 1) They are singly folded-to or -from an above 255 character, as
- * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
- * WITH DIAERESIS;
- * 2) They are part of a multi-char fold with another character in the
- * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
- * 3) They are part of a multi-char fold with a character not in the
- * bitmap, such as various ligatures.
- * We aren't dealing fully with multi-char folds, except we do deal
- * with the pattern containing a character that has a multi-char fold
- * (not so much the inverse).
- * For types 1) and 3), the matches only happen when the target string
- * is utf8; that's not true for 2), and we set a flag for it.
- *
- * The code below adds to the passed in inversion list the single fold
- * closures for 'value'. The values are hard-coded here so that an
- * innocent-looking character class, like /[ks]/i won't have to go out
- * to disk to find the possible matches. XXX It would be better to
- * generate these via regen, in case a new version of the Unicode
- * standard adds new mappings, though that is not really likely. */
- switch (value) {
- case 'k':
- case 'K':
- /* KELVIN SIGN */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
- break;
- case 's':
- case 'S':
- /* LATIN SMALL LETTER LONG S */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
- break;
- case MICRO_SIGN:
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- GREEK_SMALL_LETTER_MU);
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- GREEK_CAPITAL_LETTER_MU);
- break;
- case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
- case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
- /* ANGSTROM SIGN */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
- if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- PL_fold_latin1[value]);
- }
- break;
- case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
- LATIN_CAPITAL_LETTER_SHARP_S);
-
- /* Under /a, /d, and /u, this can match the two chars "ss" */
- if (! MORE_ASCII_RESTRICTED) {
- add_alternate(alternate_ptr, (U8 *) "ss", 2);
-
- /* And under /u or /a, it can match even if the target is
- * not utf8 */
- if (AT_LEAST_UNI_SEMANTICS) {
- ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
- }
- }
- break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character folds from code
- * points that require UTF8 to express, so they can't match
- * unless the target string is in UTF-8, so no action here is
- * necessary, as regexec.c properly handles the general case
- * for UTF-8 matching */
- break;
- default:
- /* Use deprecated warning to increase the chances of this
- * being output */
- ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
- break;
- }
- }
- else if (DEPENDS_SEMANTICS
- && ! isASCII(value)
- && PL_fold_latin1[value] != value)
- {
- /* Under DEPENDS rules, non-ASCII Latin1 characters match their
- * folds only when the target string is in UTF-8. We add the fold
- * here to the list of things to match outside the bitmap, which
- * won't be looked at unless it is UTF8 (or else if something else
- * says to look even if not utf8, but those things better not happen
- * under DEPENDS semantics. */
- *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
- }
-
- return stored;
-}
-
-
-PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
-{
- /* This inline function sets a bit in the bitmap if not already set, and if
- * appropriate, its fold, returning the number of bits that actually
- * changed from 0 to 1 */
-
- U8 stored;
-
- PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
-
- if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
- return 0;
- }
-
- ANYOF_BITMAP_SET(node, value);
- stored = 1;
-
- if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
- stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
- }
-
- return stored;
-}
-
STATIC void
S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
{
return;
}
+/* The names of properties whose definitions are not known at compile time are
+ * stored in this SV, after a constant heading. So if the length has been
+ * changed since initialization, then there is a run-time definition. */
+#define HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION (SvCUR(listsv) != initial_listsv_len)
+
+/* This converts the named class defined in regcomp.h to its equivalent class
+ * number defined in handy.h. */
+#define namedclass_to_classnum(class) ((class) / 2)
+
/*
parse a class specification and produce either an ANYOF node that
matches the pattern or perhaps will be optimized into an EXACTish node
above 255, a range list is used */
STATIC regnode *
-S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
+S_regclass(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
dVAR;
- register UV nextvalue;
- register IV prevvalue = OOB_UNICODE;
- register IV range = 0;
- UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
- register regnode *ret;
+ UV nextvalue;
+ UV prevvalue = OOB_UNICODE;
+ IV range = 0;
+ UV value = 0;
+ regnode *ret;
STRLEN numlen;
- IV namedclass;
+ IV namedclass = OOB_NAMEDCLASS;
char *rangebegin = NULL;
bool need_class = 0;
bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
than just initialized. */
SV* properties = NULL; /* Code points that match \p{} \P{} */
+ SV* posixes = NULL; /* Code points that match classes like, [:word:],
+ extended beyond the Latin1 range */
UV element_count = 0; /* Number of distinct elements in the class.
Optimizations may be possible if this is tiny */
UV n;
/* Set if a component of this character class is user-defined; just passed
* on to the engine */
- UV has_user_defined_property = 0;
-
- /* code points this node matches that can't be stored in the bitmap */
- SV* nonbitmap = NULL;
-
- /* The items that are to match that aren't stored in the bitmap, but are a
- * result of things that are stored there. This is the fold closure of
- * such a character, either because it has DEPENDS semantics and shouldn't
- * be matched unless the target string is utf8, or is a code point that is
- * too large for the bit map, as for example, the fold of the MICRO SIGN is
- * above 255. This all is solely for performance reasons. By having this
- * code know the outside-the-bitmap folds that the bitmapped characters are
- * involved with, we don't have to go out to disk to find the list of
- * matches, unless the character class includes code points that aren't
- * storable in the bit map. That means that a character class with an 's'
- * in it, for example, doesn't need to go out to disk to find everything
- * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
- * empty unless there is something whose fold we don't know about, and will
- * have to go out to the disk to find. */
- SV* l1_fold_invlist = NULL;
+ bool has_user_defined_property = FALSE;
+
+ /* inversion list of code points this node matches only when the target
+ * string is in UTF-8. (Because is under /d) */
+ SV* depends_list = NULL;
+
+ /* inversion list of code points this node matches. For much of the
+ * function, it includes only those that match regardless of the utf8ness
+ * of the target string */
+ SV* cp_list = NULL;
/* List of multi-character folds that are matched by this node */
AV* unicode_alternate = NULL;
#ifdef EBCDIC
+ /* In a range, counts how many 0-2 of the ends of it came from literals,
+ * not escapes. Thus we can tell if 'A' was input vs \x{C1} */
UV literal_endpoint = 0;
#endif
- UV stored = 0; /* how many chars stored in the bitmap */
+ bool invert = FALSE; /* Is this class to be complemented */
+
+ /* Is there any thing like \W or [:^digit:] that matches above the legal
+ * Unicode range? */
+ bool runtime_posix_matches_above_Unicode = FALSE;
regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
case we need to change the emitted regop to an EXACT. */
const char * orig_parse = RExC_parse;
+ const I32 orig_size = RExC_size;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGCLASS;
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
RExC_parse++;
- if (!SIZE_ONLY)
- ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ invert = TRUE;
/* We have decided to not allow multi-char folds in inverted character
* classes, due to the confusion that can happen, especially with
if (LOC) {
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
}
- ANYOF_BITMAP_ZERO(ret);
listsv = newSVpvs("# comment\n");
initial_listsv_len = SvCUR(listsv);
}
if this makes sense as it does change the behaviour
from earlier versions, OTOH that behaviour was broken
as well. */
- UV v; /* value is register so we cant & it /grrr */
- if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
+ if (! grok_bslash_N(pRExC_state, NULL, &value, flagp, depth,
+ TRUE /* => charclass */))
+ {
goto parseit;
}
- value= v;
}
break;
case 'p':
n = 1;
}
if (!SIZE_ONLY) {
- SV** invlistsvp;
SV* invlist;
char* name;
+
if (UCHARAT(RExC_parse) == '^') {
RExC_parse++;
n--;
if ( ! swash
|| ! SvROK(swash)
|| ! SvTYPE(SvRV(swash)) == SVt_PVHV
- || ! (invlistsvp =
- hv_fetchs(MUTABLE_HV(SvRV(swash)),
- "INVLIST", FALSE))
- || ! (invlist = *invlistsvp))
+ || ! (invlist = _get_swash_invlist(swash)))
{
if (swash) {
SvREFCNT_dec(swash);
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
(value == 'p' ? '+' : '!'),
name);
- has_user_defined_property = 1;
+ has_user_defined_property = TRUE;
/* We don't know yet, so have to assume that the
* property could match something in the Latin1 range,
- * hence something that isn't utf8 */
+ * hence something that isn't utf8. Note that this
+ * would cause things in <depends_list> to match
+ * inappropriately, except that any \p{}, including
+ * this one forces Unicode semantics, which means there
+ * is <no depends_list> */
ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
}
else {
/* Here, did get the swash and its inversion list. If
* the swash is from a user-defined property, then this
* whole character class should be regarded as such */
- SV** user_defined_svp =
- hv_fetchs(MUTABLE_HV(SvRV(swash)),
- "USER_DEFINED", FALSE);
- if (user_defined_svp) {
- has_user_defined_property
- |= SvUV(*user_defined_svp);
- }
+ has_user_defined_property =
+ _is_swash_user_defined(swash);
/* Invert if asking for the complement */
if (value == 'P') {
- _invlist_union_complement_2nd(properties, invlist, &properties);
+ _invlist_union_complement_2nd(properties,
+ invlist,
+ &properties);
/* The swash can't be used as-is, because we've
* inverted things; delay removing it to here after
}
break;
case 'x':
- if (*RExC_parse == '{') {
- I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
- | PERL_SCAN_DISALLOW_PREFIX;
- char * const e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\x{}");
-
- numlen = e - RExC_parse;
- value = grok_hex(RExC_parse, &numlen, &flags, NULL);
- RExC_parse = e + 1;
- }
- else {
- I32 flags = PERL_SCAN_DISALLOW_PREFIX;
- numlen = 2;
- value = grok_hex(RExC_parse, &numlen, &flags, NULL);
+ RExC_parse--; /* function expects to be pointed at the 'x' */
+ {
+ const char* error_msg;
+ bool valid = grok_bslash_x(RExC_parse,
+ &value,
+ &numlen,
+ &error_msg,
+ 1);
RExC_parse += numlen;
+ if (! valid) {
+ vFAIL(error_msg);
+ }
}
if (PL_encoding && value < 0x100)
goto recode_encoding;
literal_endpoint++;
#endif
- if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
-
- /* What matches in a locale is not known until runtime, so need to
- * (one time per class) allocate extra space to pass to regexec.
- * The space will contain a bit for each named class that is to be
- * matched against. This isn't needed for \p{} and pseudo-classes,
- * as they are not affected by locale, and hence are dealt with
- * separately */
- if (LOC && namedclass < ANYOF_MAX && ! need_class) {
+ /* What matches in a locale is not known until runtime. This
+ * includes what the Posix classes (like \w, [:space:]) match.
+ * Room must be reserved (one time per class) to store such
+ * classes, either if Perl is compiled so that locale nodes always
+ * should have this space, or if there is such class info to be
+ * stored. The space will contain a bit for each named class that
+ * is to be matched against. This isn't needed for \p{} and
+ * pseudo-classes, as they are not affected by locale, and hence
+ * are dealt with separately */
+ if (LOC
+ && ! need_class
+ && (ANYOF_LOCALE == ANYOF_CLASS
+ || (namedclass > OOB_NAMEDCLASS && namedclass < ANYOF_MAX)))
+ {
need_class = 1;
if (SIZE_ONLY) {
RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
}
+ if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
/* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
* literal, as is the character that began the false range, i.e.
* the 'a' in the examples */
ckWARN4reg(RExC_parse,
"False [] range \"%*.*s\"",
w, w, rangebegin);
-
- stored +=
- set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
- if (prevvalue < 256) {
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
- }
- else {
- nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
- }
+ cp_list = add_cp_to_invlist(cp_list, '-');
+ cp_list = add_cp_to_invlist(cp_list, prevvalue);
}
range = 0; /* this was not a true range */
+ element_count += 2; /* So counts for three values */
}
- if (!SIZE_ONLY) {
-
- /* Possible truncation here but in some 64-bit environments
- * the compiler gets heartburn about switch on 64-bit values.
- * A similar issue a little earlier when switching on value.
- * --jhi */
+ if (! SIZE_ONLY) {
switch ((I32)namedclass) {
case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
break;
case ANYOF_NALNUMC:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+ PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv,
+ runtime_posix_matches_above_Unicode);
break;
case ANYOF_ALPHA:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
break;
case ANYOF_NALPHA:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+ PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv,
+ runtime_posix_matches_above_Unicode);
break;
case ANYOF_ASCII:
if (LOC) {
ANYOF_CLASS_SET(ret, namedclass);
}
else {
- _invlist_union(properties, PL_ASCII, &properties);
+ _invlist_union(posixes, PL_ASCII, &posixes);
}
break;
case ANYOF_NASCII:
ANYOF_CLASS_SET(ret, namedclass);
}
else {
- _invlist_union_complement_2nd(properties,
- PL_ASCII, &properties);
+ _invlist_union_complement_2nd(posixes,
+ PL_ASCII, &posixes);
if (DEPENDS_SEMANTICS) {
ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
}
}
break;
case ANYOF_BLANK:
- DO_POSIX(ret, namedclass, properties,
+ DO_POSIX(ret, namedclass, posixes,
PL_PosixBlank, PL_XPosixBlank);
break;
case ANYOF_NBLANK:
- DO_N_POSIX(ret, namedclass, properties,
+ DO_N_POSIX(ret, namedclass, posixes,
PL_PosixBlank, PL_XPosixBlank);
break;
case ANYOF_CNTRL:
- DO_POSIX(ret, namedclass, properties,
+ DO_POSIX(ret, namedclass, posixes,
PL_PosixCntrl, PL_XPosixCntrl);
break;
case ANYOF_NCNTRL:
- DO_N_POSIX(ret, namedclass, properties,
+ DO_N_POSIX(ret, namedclass, posixes,
PL_PosixCntrl, PL_XPosixCntrl);
break;
case ANYOF_DIGIT:
/* There are no digits in the Latin1 range outside of
* ASCII, so call the macro that doesn't have to resolve
* them */
- DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN_L1_RESOLVED(ret, namedclass, posixes,
PL_PosixDigit, "XPosixDigit", listsv);
break;
case ANYOF_NDIGIT:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+ PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv,
+ runtime_posix_matches_above_Unicode);
break;
case ANYOF_GRAPH:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
break;
case ANYOF_NGRAPH:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+ PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv,
+ runtime_posix_matches_above_Unicode);
break;
case ANYOF_HORIZWS:
- /* For these, we use the nonbitmap, as /d doesn't make a
+ /* For these, we use the cp_list, as /d doesn't make a
* difference in what these match. There would be problems
* if these characters had folds other than themselves, as
- * nonbitmap is subject to folding. It turns out that \h
+ * cp_list is subject to folding. It turns out that \h
* is just a synonym for XPosixBlank */
- _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
+ _invlist_union(cp_list, PL_XPosixBlank, &cp_list);
break;
case ANYOF_NHORIZWS:
- _invlist_union_complement_2nd(nonbitmap,
- PL_XPosixBlank, &nonbitmap);
+ _invlist_union_complement_2nd(cp_list,
+ PL_XPosixBlank, &cp_list);
break;
case ANYOF_LOWER:
case ANYOF_NLOWER:
Xname = "XPosixLower";
}
if (namedclass == ANYOF_LOWER) {
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
ascii_source, l1_source, Xname, listsv);
}
else {
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
- properties, ascii_source, l1_source, Xname, listsv);
+ posixes, ascii_source, l1_source, Xname, listsv,
+ runtime_posix_matches_above_Unicode);
}
break;
}
case ANYOF_PRINT:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
break;
case ANYOF_NPRINT:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+ PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv,
+ runtime_posix_matches_above_Unicode);
break;
case ANYOF_PUNCT:
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
break;
case ANYOF_NPUNCT:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+ PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv,
+ runtime_posix_matches_above_Unicode);
break;
case ANYOF_PSXSPC:
- DO_POSIX(ret, namedclass, properties,
+ DO_POSIX(ret, namedclass, posixes,
PL_PosixSpace, PL_XPosixSpace);
break;
case ANYOF_NPSXSPC:
- DO_N_POSIX(ret, namedclass, properties,
+ DO_N_POSIX(ret, namedclass, posixes,
PL_PosixSpace, PL_XPosixSpace);
break;
case ANYOF_SPACE:
- DO_POSIX(ret, namedclass, properties,
+ DO_POSIX(ret, namedclass, posixes,
PL_PerlSpace, PL_XPerlSpace);
break;
case ANYOF_NSPACE:
- DO_N_POSIX(ret, namedclass, properties,
+ DO_N_POSIX(ret, namedclass, posixes,
PL_PerlSpace, PL_XPerlSpace);
break;
case ANYOF_UPPER: /* Same as LOWER, above */
Xname = "XPosixUpper";
}
if (namedclass == ANYOF_UPPER) {
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
ascii_source, l1_source, Xname, listsv);
}
else {
DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
- properties, ascii_source, l1_source, Xname, listsv);
+ posixes, ascii_source, l1_source, Xname, listsv,
+ runtime_posix_matches_above_Unicode);
}
break;
}
case ANYOF_ALNUM: /* Really is 'Word' */
- DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
+ DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
break;
case ANYOF_NALNUM:
- DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
- PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
+ DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, posixes,
+ PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv,
+ runtime_posix_matches_above_Unicode);
break;
case ANYOF_VERTWS:
- /* For these, we use the nonbitmap, as /d doesn't make a
+ /* For these, we use the cp_list, as /d doesn't make a
* difference in what these match. There would be problems
* if these characters had folds other than themselves, as
- * nonbitmap is subject to folding */
- _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
+ * cp_list is subject to folding */
+ _invlist_union(cp_list, PL_VertSpace, &cp_list);
break;
case ANYOF_NVERTWS:
- _invlist_union_complement_2nd(nonbitmap,
- PL_VertSpace, &nonbitmap);
+ _invlist_union_complement_2nd(cp_list,
+ PL_VertSpace, &cp_list);
break;
case ANYOF_XDIGIT:
- DO_POSIX(ret, namedclass, properties,
+ DO_POSIX(ret, namedclass, posixes,
PL_PosixXDigit, PL_XPosixXDigit);
break;
case ANYOF_NXDIGIT:
- DO_N_POSIX(ret, namedclass, properties,
+ DO_N_POSIX(ret, namedclass, posixes,
PL_PosixXDigit, PL_XPosixXDigit);
break;
case ANYOF_MAX:
break;
}
- continue;
+ continue; /* Go get next character */
}
} /* end of namedclass \blah */
if (range) {
- if (prevvalue > (IV)value) /* b-a */ {
+ if (prevvalue > value) /* b-a */ {
const int w = RExC_parse - rangebegin;
Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
range = 0; /* not a valid range */
}
}
else {
- prevvalue = value; /* save the beginning of the range */
+ prevvalue = value; /* save the beginning of the potential range */
if (RExC_parse+1 < RExC_end
&& *RExC_parse == '-'
&& RExC_parse[1] != ']')
"False [] range \"%*.*s\"",
w, w, rangebegin);
}
- if (!SIZE_ONLY)
- stored +=
- set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
+ if (!SIZE_ONLY) {
+ cp_list = add_cp_to_invlist(cp_list, '-');
+ }
+ element_count++;
} else
range = 1; /* yeah, it's a range! */
continue; /* but do it the next time */
}
}
+ /* Here, <prevvalue> is the beginning of the range, if any; or <value>
+ * if not */
+
/* non-Latin1 code point implies unicode semantics. Must be set in
* pass1 so is there for the whole of pass 2 */
if (value > 255) {
RExC_uni_semantics = 1;
}
- /* now is the next time */
- if (!SIZE_ONLY) {
- if (prevvalue < 256) {
- const IV ceilvalue = value < 256 ? value : 255;
- IV i;
-#ifdef EBCDIC
- /* In EBCDIC [\x89-\x91] should include
- * the \x8e but [i-j] should not. */
- if (literal_endpoint == 2 &&
- ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
- (isUPPER(prevvalue) && isUPPER(ceilvalue))))
- {
- if (isLOWER(prevvalue)) {
- for (i = prevvalue; i <= ceilvalue; i++)
- if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
- }
- } else {
- for (i = prevvalue; i <= ceilvalue; i++)
- if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
- stored +=
- set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
- }
- }
- }
- else
-#endif
- for (i = prevvalue; i <= ceilvalue; i++) {
- stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
- }
- }
- if (value > 255) {
- const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
- const UV natvalue = NATIVE_TO_UNI(value);
- nonbitmap = _add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
- }
-#ifdef EBCDIC
- literal_endpoint = 0;
-#endif
+ /* Ready to process either the single value, or the completed range */
+ if (!SIZE_ONLY) {
+#ifndef EBCDIC
+ cp_list = _add_range_to_invlist(cp_list, prevvalue, value);
+#else
+ UV* this_range = _new_invlist(1);
+ _append_range_to_invlist(this_range, prevvalue, value);
+
+ /* In EBCDIC, the ranges 'A-Z' and 'a-z' are each not contiguous.
+ * If this range was specified using something like 'i-j', we want
+ * to include only the 'i' and the 'j', and not anything in
+ * between, so exclude non-ASCII, non-alphabetics from it.
+ * However, if the range was specified with something like
+ * [\x89-\x91] or [\x89-j], all code points within it should be
+ * included. literal_endpoint==2 means both ends of the range used
+ * a literal character, not \x{foo} */
+ if (literal_endpoint == 2
+ && (prevvalue >= 'a' && value <= 'z')
+ || (prevvalue >= 'A' && value <= 'Z'))
+ {
+ _invlist_intersection(this_range, PL_ASCII, &this_range, );
+ _invlist_intersection(this_range, PL_Alpha, &this_range, );
+ }
+ _invlist_union(cp_list, this_range, &cp_list);
+ literal_endpoint = 0;
+#endif
+ }
+
+ range = 0; /* this range (if it was one) is done now */
+ } /* End of loop through all the text within the brackets */
+
+ /* If the character class contains only a single element, it may be
+ * optimizable into another node type which is smaller and runs faster.
+ * Check if this is the case for this class */
+ if (element_count == 1) {
+ U8 op = END;
+ U8 arg = 0;
+
+ if (namedclass > OOB_NAMEDCLASS) { /* this is a named class, like \w or
+ [:digit:] or \p{foo} */
+
+ /* Certain named classes have equivalents that can appear outside a
+ * character class, e.g. \w, \H. We use these instead of a
+ * character class. */
+ switch ((I32)namedclass) {
+ U8 offset;
+
+ /* The first group is for node types that depend on the charset
+ * modifier to the regex. We first calculate the base node
+ * type, and if it should be inverted */
+
+ case ANYOF_NALNUM:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_ALNUM:
+ op = ALNUM;
+ goto join_charset_classes;
+
+ case ANYOF_NSPACE:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_SPACE:
+ op = SPACE;
+ goto join_charset_classes;
+
+ case ANYOF_NDIGIT:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_DIGIT:
+ op = DIGIT;
+
+ join_charset_classes:
+
+ /* Now that we have the base node type, we take advantage
+ * of the enum ordering of the charset modifiers to get the
+ * exact node type, For example the base SPACE also has
+ * SPACEL, SPACEU, and SPACEA */
+
+ offset = get_regex_charset(RExC_flags);
+
+ /* /aa is the same as /a for these */
+ if (offset == REGEX_ASCII_MORE_RESTRICTED_CHARSET) {
+ offset = REGEX_ASCII_RESTRICTED_CHARSET;
+ }
+ else if (op == DIGIT && offset == REGEX_UNICODE_CHARSET) {
+ offset = REGEX_DEPENDS_CHARSET; /* There is no DIGITU */
+ }
+
+ op += offset;
+
+ /* The number of varieties of each of these is the same,
+ * hence, so is the delta between the normal and
+ * complemented nodes */
+ if (invert) {
+ op += NALNUM - ALNUM;
+ }
+ *flagp |= HASWIDTH|SIMPLE;
+ break;
+
+ /* The second group doesn't depend of the charset modifiers.
+ * We just have normal and complemented */
+ case ANYOF_NHORIZWS:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_HORIZWS:
+ is_horizws:
+ op = (invert) ? NHORIZWS : HORIZWS;
+ *flagp |= HASWIDTH|SIMPLE;
+ break;
+
+ case ANYOF_NVERTWS:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_VERTWS:
+ op = (invert) ? NVERTWS : VERTWS;
+ *flagp |= HASWIDTH|SIMPLE;
+ break;
+
+ case ANYOF_MAX:
+ break;
+
+ case ANYOF_NBLANK:
+ invert = ! invert;
+ /* FALLTHROUGH */
+ case ANYOF_BLANK:
+ if (AT_LEAST_UNI_SEMANTICS && ! AT_LEAST_ASCII_RESTRICTED) {
+ goto is_horizws;
+ }
+ /* FALLTHROUGH */
+ default:
+ /* A generic posix class. All the /a ones can be handled
+ * by the POSIXA opcode. And all are closed under folding
+ * in the ASCII range, so FOLD doesn't matter */
+ if (AT_LEAST_ASCII_RESTRICTED
+ || (! LOC && namedclass == ANYOF_ASCII))
+ {
+ /* The odd numbered ones are the complements of the
+ * next-lower even number one */
+ if (namedclass % 2 == 1) {
+ invert = ! invert;
+ namedclass--;
+ }
+ arg = namedclass_to_classnum(namedclass);
+ op = (invert) ? NPOSIXA : POSIXA;
+ }
+ break;
+ }
+ }
+ else if (value == prevvalue) {
+
+ /* Here, the class consists of just a single code point */
+
+ if (invert) {
+ if (! LOC && value == '\n') {
+ op = REG_ANY; /* Optimize [^\n] */
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
+ }
+ }
+ else if (value < 256 || UTF) {
+
+ /* Optimize a single value into an EXACTish node, but not if it
+ * would require converting the pattern to UTF-8. */
+ op = compute_EXACTish(pRExC_state);
+ }
+ } /* Otherwise is a range */
+ else if (! LOC) { /* locale could vary these */
+ if (prevvalue == '0') {
+ if (value == '9') {
+ op = (invert) ? NDIGITA : DIGITA;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ }
}
- range = 0; /* this range (if it was one) is done now */
- }
+ /* Here, we have changed <op> away from its initial value iff we found
+ * an optimization */
+ if (op != END) {
+
+ /* Throw away this ANYOF regnode, and emit the calculated one,
+ * which should correspond to the beginning, not current, state of
+ * the parse */
+ const char * cur_parse = RExC_parse;
+ RExC_parse = (char *)orig_parse;
+ if ( SIZE_ONLY) {
+ if (! LOC) {
+
+ /* To get locale nodes to not use the full ANYOF size would
+ * require moving the code above that writes the portions
+ * of it that aren't in other nodes to after this point.
+ * e.g. ANYOF_CLASS_SET */
+ RExC_size = orig_size;
+ }
+ }
+ else {
+ RExC_emit = (regnode *)orig_emit;
+ }
+
+ ret = reg_node(pRExC_state, op);
+
+ if (PL_regkind[op] == POSIXD) {
+ if (! SIZE_ONLY) {
+ FLAGS(ret) = arg;
+ }
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (PL_regkind[op] == EXACT) {
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
+ }
+ RExC_parse = (char *) cur_parse;
+ SvREFCNT_dec(listsv);
+ return ret;
+ }
+ }
if (SIZE_ONLY)
return ret;
- /****** !SIZE_ONLY AFTER HERE *********/
+ /****** !SIZE_ONLY (Pass 2) AFTER HERE *********/
- /* If folding and there are code points above 255, we calculate all
- * characters that could fold to or from the ones already on the list */
- if (FOLD && nonbitmap) {
+ /* If folding, we calculate all characters that could fold to or from the
+ * ones already on the list */
+ if (FOLD && cp_list) {
UV start, end; /* End points of code point ranges */
SV* fold_intersection = NULL;
- /* This is a list of all the characters that participate in folds
- * (except marks, etc in multi-char folds */
- if (! PL_utf8_foldable) {
- SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
- PL_utf8_foldable = _swash_to_invlist(swash);
- SvREFCNT_dec(swash);
- }
+ /* In the Latin1 range, the characters that can be folded-to or -from
+ * are precisely the alphabetic characters. If the highest code point
+ * is within Latin1, we can use the compiled-in list, and not have to
+ * go out to disk. */
+ if (invlist_highest(cp_list) < 256) {
+ _invlist_intersection(PL_L1PosixAlpha, cp_list, &fold_intersection);
+ }
+ else {
- /* This is a hash that for a particular fold gives all characters
- * that are involved in it */
- if (! PL_utf8_foldclosures) {
+ /* Here, there are non-Latin1 code points, so we will have to go
+ * fetch the list of all the characters that participate in folds
+ */
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8", "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable = _get_swash_invlist(swash);
+ SvREFCNT_dec(swash);
+ }
- /* If we were unable to find any folds, then we likely won't be
- * able to find the closures. So just create an empty list.
- * Folding will effectively be restricted to the non-Unicode rules
- * hard-coded into Perl. (This case happens legitimately during
- * compilation of Perl itself before the Unicode tables are
- * generated) */
- if (invlist_len(PL_utf8_foldable) == 0) {
- PL_utf8_foldclosures = newHV();
- } else {
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES+1];
- STRLEN dummy_len;
-
- /* This particular string is above \xff in both UTF-8 and
- * UTFEBCDIC */
- to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
- }
- }
+ /* This is a hash that for a particular fold gives all characters
+ * that are involved in it */
+ if (! PL_utf8_foldclosures) {
+
+ /* If we were unable to find any folds, then we likely won't be
+ * able to find the closures. So just create an empty list.
+ * Folding will effectively be restricted to the non-Unicode
+ * rules hard-coded into Perl. (This case happens legitimately
+ * during compilation of Perl itself before the Unicode tables
+ * are generated) */
+ if (_invlist_len(PL_utf8_foldable) == 0) {
+ PL_utf8_foldclosures = newHV();
+ }
+ else {
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES+1];
+ STRLEN dummy_len;
+
+ /* This particular string is above \xff in both UTF-8
+ * and UTFEBCDIC */
+ to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+ assert(PL_utf8_tofold); /* Verify that worked */
+ }
+ PL_utf8_foldclosures =
+ _swash_inversion_hash(PL_utf8_tofold);
+ }
+ }
- /* Only the characters in this class that participate in folds need be
- * checked. Get the intersection of this class and all the possible
- * characters that are foldable. This can quickly narrow down a large
- * class */
- _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
+ /* Only the characters in this class that participate in folds need
+ * be checked. Get the intersection of this class and all the
+ * possible characters that are foldable. This can quickly narrow
+ * down a large class */
+ _invlist_intersection(PL_utf8_foldable, cp_list,
+ &fold_intersection);
+ }
/* Now look at the foldable characters in this class individually */
invlist_iterinit(fold_intersection);
while (invlist_iternext(fold_intersection, &start, &end)) {
UV j;
+ /* Locale folding for Latin1 characters is deferred until runtime */
+ if (LOC && start < 256) {
+ start = 256;
+ }
+
/* Look at every character in the range */
for (j = start; j <= end; j++) {
- /* Get its fold */
U8 foldbuf[UTF8_MAXBYTES_CASE+1];
STRLEN foldlen;
- const UV f =
- _to_uni_fold_flags(j, foldbuf, &foldlen,
- (allow_full_fold) ? FOLD_FLAGS_FULL : 0);
+ UV f;
+
+ if (j < 256) {
+
+ /* We have the latin1 folding rules hard-coded here so that
+ * an innocent-looking character class, like /[ks]/i won't
+ * have to go out to disk to find the possible matches.
+ * XXX It would be better to generate these via regen, in
+ * case a new version of the Unicode standard adds new
+ * mappings, though that is not really likely, and may be
+ * caught by the default: case of the switch below. */
+
+ if (PL_fold_latin1[j] != j) {
+
+ /* ASCII is always matched; non-ASCII is matched only
+ * under Unicode rules */
+ if (isASCII(j) || AT_LEAST_UNI_SEMANTICS) {
+ cp_list =
+ add_cp_to_invlist(cp_list, PL_fold_latin1[j]);
+ }
+ else {
+ depends_list =
+ add_cp_to_invlist(depends_list, PL_fold_latin1[j]);
+ }
+ }
+
+ if (HAS_NONLATIN1_FOLD_CLOSURE(j)
+ && (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
+ {
+ /* Certain Latin1 characters have matches outside
+ * Latin1, or are multi-character. To get here, 'j' is
+ * one of those characters. None of these matches is
+ * valid for ASCII characters under /aa, which is why
+ * the 'if' just above excludes those. The matches
+ * fall into three categories:
+ * 1) They are singly folded-to or -from an above 255
+ * character, e.g., LATIN SMALL LETTER Y WITH
+ * DIAERESIS and LATIN CAPITAL LETTER Y WITH
+ * DIAERESIS;
+ * 2) They are part of a multi-char fold with another
+ * latin1 character; only LATIN SMALL LETTER
+ * SHARP S => "ss" fits this;
+ * 3) They are part of a multi-char fold with a
+ * character outside of Latin1, such as various
+ * ligatures.
+ * We aren't dealing fully with multi-char folds, except
+ * we do deal with the pattern containing a character
+ * that has a multi-char fold (not so much the inverse).
+ * For types 1) and 3), the matches only happen when the
+ * target string is utf8; that's not true for 2), and we
+ * set a flag for it.
+ *
+ * The code below adds the single fold closures for 'j'
+ * to the inversion list. */
+ switch (j) {
+ case 'k':
+ case 'K':
+ cp_list =
+ add_cp_to_invlist(cp_list, KELVIN_SIGN);
+ break;
+ case 's':
+ case 'S':
+ cp_list = add_cp_to_invlist(cp_list,
+ LATIN_SMALL_LETTER_LONG_S);
+ break;
+ case MICRO_SIGN:
+ cp_list = add_cp_to_invlist(cp_list,
+ GREEK_CAPITAL_LETTER_MU);
+ cp_list = add_cp_to_invlist(cp_list,
+ GREEK_SMALL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ cp_list =
+ add_cp_to_invlist(cp_list, ANGSTROM_SIGN);
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ cp_list = add_cp_to_invlist(cp_list,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ cp_list = add_cp_to_invlist(cp_list,
+ LATIN_CAPITAL_LETTER_SHARP_S);
+
+ /* Under /a, /d, and /u, this can match the two
+ * chars "ss" */
+ if (! ASCII_FOLD_RESTRICTED) {
+ add_alternate(&unicode_alternate,
+ (U8 *) "ss", 2);
+
+ /* And under /u or /a, it can match even if
+ * the target is not utf8 */
+ if (AT_LEAST_UNI_SEMANTICS) {
+ ANYOF_FLAGS(ret) |=
+ ANYOF_NONBITMAP_NON_UTF8;
+ }
+ }
+ break;
+ case 'F': case 'f':
+ case 'I': case 'i':
+ case 'L': case 'l':
+ case 'T': case 't':
+ case 'A': case 'a':
+ case 'H': case 'h':
+ case 'J': case 'j':
+ case 'N': case 'n':
+ case 'W': case 'w':
+ case 'Y': case 'y':
+ /* These all are targets of multi-character
+ * folds from code points that require UTF8 to
+ * express, so they can't match unless the
+ * target string is in UTF-8, so no action here
+ * is necessary, as regexec.c properly handles
+ * the general case for UTF-8 matching */
+ break;
+ default:
+ /* Use deprecated warning to increase the
+ * chances of this being output */
+ ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
+ break;
+ }
+ }
+ continue;
+ }
+
+ /* Here is an above Latin1 character. We don't have the rules
+ * hard-coded for it. First, get its fold */
+ f = _to_uni_fold_flags(j, foldbuf, &foldlen,
+ ((allow_full_fold) ? FOLD_FLAGS_FULL : 0)
+ | ((LOC)
+ ? FOLD_FLAGS_LOCALE
+ : (ASCII_FOLD_RESTRICTED)
+ ? FOLD_FLAGS_NOMIX_ASCII
+ : 0));
if (foldlen > (STRLEN)UNISKIP(f)) {
/* If any of the folded characters of this are in the
* Latin1 range, tell the regex engine that this can
- * match a non-utf8 target string. The only multi-byte
- * fold whose source is in the Latin1 range (U+00DF)
- * applies only when the target string is utf8, or
- * under unicode rules */
- if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
- while (loc < e) {
-
- /* Can't mix ascii with non- under /aa */
- if (MORE_ASCII_RESTRICTED
- && (isASCII(*loc) != isASCII(j)))
- {
- goto end_multi_fold;
- }
- if (UTF8_IS_INVARIANT(*loc)
- || UTF8_IS_DOWNGRADEABLE_START(*loc))
- {
- /* Can't mix above and below 256 under LOC
- */
- if (LOC) {
- goto end_multi_fold;
- }
- ANYOF_FLAGS(ret)
- |= ANYOF_NONBITMAP_NON_UTF8;
- break;
- }
- loc += UTF8SKIP(loc);
- }
- }
+ * match a non-utf8 target string. */
+ while (loc < e) {
+ if (UTF8_IS_INVARIANT(*loc)
+ || UTF8_IS_DOWNGRADEABLE_START(*loc))
+ {
+ ANYOF_FLAGS(ret)
+ |= ANYOF_NONBITMAP_NON_UTF8;
+ break;
+ }
+ loc += UTF8SKIP(loc);
+ }
add_alternate(&unicode_alternate, foldbuf, foldlen);
- end_multi_fold: ;
- }
-
- /* This is special-cased, as it is the only letter which
- * has both a multi-fold and single-fold in Latin1. All
- * the other chars that have single and multi-folds are
- * always in utf8, and the utf8 folding algorithm catches
- * them */
- if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
- stored += set_regclass_bit(pRExC_state,
- ret,
- LATIN_SMALL_LETTER_SHARP_S,
- &l1_fold_invlist, &unicode_alternate);
}
}
- else {
- /* Single character fold. Add everything in its fold
- * closure to the list that this node should match */
+ else {
+ /* Single character fold of above Latin1. Add everything
+ * in its fold closure to the list that this node should
+ * match */
SV** listp;
/* The fold closures data structure is a hash with the keys
/* /aa doesn't allow folds between ASCII and non-;
* /l doesn't allow them between above and below
* 256 */
- if ((MORE_ASCII_RESTRICTED
- && (isASCII(c) != isASCII(j)))
- || (LOC && ((c < 256) != (j < 256))))
+ if ((ASCII_FOLD_RESTRICTED
+ && (isASCII(c) != isASCII(j)))
+ || (LOC && ((c < 256) != (j < 256))))
{
continue;
}
- if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
- stored += set_regclass_bit(pRExC_state,
- ret,
- (U8) c,
- &l1_fold_invlist, &unicode_alternate);
- }
- /* It may be that the code point is already in
- * this range or already in the bitmap, in
- * which case we need do nothing */
- else if ((c < start || c > end)
- && (c > 255
- || ! ANYOF_BITMAP_TEST(ret, c)))
- {
- nonbitmap = add_cp_to_invlist(nonbitmap, c);
+ /* Folds involving non-ascii Latin1 characters
+ * under /d are added to a separate list */
+ if (isASCII(c) || c > 255 || AT_LEAST_UNI_SEMANTICS)
+ {
+ cp_list = add_cp_to_invlist(cp_list, c);
+ }
+ else {
+ depends_list = add_cp_to_invlist(depends_list, c);
}
}
}
}
- }
+ }
}
SvREFCNT_dec(fold_intersection);
}
- /* Combine the two lists into one. */
- if (l1_fold_invlist) {
- if (nonbitmap) {
- _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
- SvREFCNT_dec(l1_fold_invlist);
- }
- else {
- nonbitmap = l1_fold_invlist;
- }
+ /* And combine the result (if any) with any inversion list from posix
+ * classes. The lists are kept separate up to now because we don't want to
+ * fold the classes (folding of those is automatically handled by the swash
+ * fetching code) */
+ if (posixes) {
+ if (! DEPENDS_SEMANTICS) {
+ if (cp_list) {
+ _invlist_union(cp_list, posixes, &cp_list);
+ SvREFCNT_dec(posixes);
+ }
+ else {
+ cp_list = posixes;
+ }
+ }
+ else {
+ /* Under /d, we put into a separate list the Latin1 things that
+ * match only when the target string is utf8 */
+ SV* nonascii_but_latin1_properties = NULL;
+ _invlist_intersection(posixes, PL_Latin1,
+ &nonascii_but_latin1_properties);
+ _invlist_subtract(nonascii_but_latin1_properties, PL_ASCII,
+ &nonascii_but_latin1_properties);
+ _invlist_subtract(posixes, nonascii_but_latin1_properties,
+ &posixes);
+ if (cp_list) {
+ _invlist_union(cp_list, posixes, &cp_list);
+ SvREFCNT_dec(posixes);
+ }
+ else {
+ cp_list = posixes;
+ }
+
+ if (depends_list) {
+ _invlist_union(depends_list, nonascii_but_latin1_properties,
+ &depends_list);
+ SvREFCNT_dec(nonascii_but_latin1_properties);
+ }
+ else {
+ depends_list = nonascii_but_latin1_properties;
+ }
+ }
}
/* And combine the result (if any) with any inversion list from properties.
- * The lists are kept separate up to now because we don't want to fold the
- * properties */
+ * The lists are kept separate up to now so that we can distinguish the two
+ * in regards to matching above-Unicode. A run-time warning is generated
+ * if a Unicode property is matched against a non-Unicode code point. But,
+ * we allow user-defined properties to match anything, without any warning,
+ * and we also suppress the warning if there is a portion of the character
+ * class that isn't a Unicode property, and which matches above Unicode, \W
+ * or [\x{110000}] for example.
+ * (Note that in this case, unlike the Posix one above, there is no
+ * <depends_list>, because having a Unicode property forces Unicode
+ * semantics */
if (properties) {
- if (nonbitmap) {
- _invlist_union(nonbitmap, properties, &nonbitmap);
- SvREFCNT_dec(properties);
- }
- else {
- nonbitmap = properties;
- }
+ bool warn_super = ! has_user_defined_property;
+ if (cp_list) {
+
+ /* If it matters to the final outcome, see if a non-property
+ * component of the class matches above Unicode. If so, the
+ * warning gets suppressed. This is true even if just a single
+ * such code point is specified, as though not strictly correct if
+ * another such code point is matched against, the fact that they
+ * are using above-Unicode code points indicates they should know
+ * the issues involved */
+ if (warn_super) {
+ bool non_prop_matches_above_Unicode =
+ runtime_posix_matches_above_Unicode
+ | (invlist_highest(cp_list) > PERL_UNICODE_MAX);
+ if (invert) {
+ non_prop_matches_above_Unicode =
+ ! non_prop_matches_above_Unicode;
+ }
+ warn_super = ! non_prop_matches_above_Unicode;
+ }
+
+ _invlist_union(properties, cp_list, &cp_list);
+ SvREFCNT_dec(properties);
+ }
+ else {
+ cp_list = properties;
+ }
+
+ if (warn_super) {
+ ANYOF_FLAGS(ret) |= ANYOF_WARN_SUPER;
+ }
}
- /* Here, <nonbitmap> contains all the code points we can determine at
- * compile time that we haven't put into the bitmap. Go through it, and
- * for things that belong in the bitmap, put them there, and delete from
- * <nonbitmap> */
- if (nonbitmap) {
+ /* Here, we have calculated what code points should be in the character
+ * class.
+ *
+ * Now we can see about various optimizations. Fold calculation (which we
+ * did above) needs to take place before inversion. Otherwise /[^k]/i
+ * would invert to include K, which under /i would match k, which it
+ * shouldn't. Therefore we can't invert folded locale now, as it won't be
+ * folded until runtime */
+
+ /* Optimize inverted simple patterns (e.g. [^a-z]) when everything is known
+ * at compile time. Besides not inverting folded locale now, we can't invert
+ * if there are things such as \w, which aren't known until runtime */
+ if (invert
+ && ! (LOC && (FOLD || (ANYOF_FLAGS(ret) & ANYOF_CLASS)))
+ && ! depends_list
+ && ! unicode_alternate
+ && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+ {
+ _invlist_invert(cp_list);
- /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
- * possibly only should match when the target string is UTF-8 */
- UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
+ /* Any swash can't be used as-is, because we've inverted things */
+ if (swash) {
+ SvREFCNT_dec(swash);
+ swash = NULL;
+ }
+
+ /* Clear the invert flag since have just done it here */
+ invert = FALSE;
+ }
+
+ /* If we didn't do folding, it's because some information isn't available
+ * until runtime; set the run-time fold flag for these. (We don't have to
+ * worry about properties folding, as that is taken care of by the swash
+ * fetching) */
+ if (FOLD && (LOC || unicode_alternate))
+ {
+ ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+ }
+
+ /* Some character classes are equivalent to other nodes. Such nodes take
+ * up less room and generally fewer operations to execute than ANYOF nodes.
+ * Above, we checked for and optimized into some such equivalents for
+ * certain common classes that are easy to test. Getting to this point in
+ * the code means that the class didn't get optimized there. Since this
+ * code is only executed in Pass 2, it is too late to save space--it has
+ * been allocated in Pass 1, and currently isn't given back. But turning
+ * things into an EXACTish node can allow the optimizer to join it to any
+ * adjacent such nodes. And if the class is equivalent to things like /./,
+ * expensive run-time swashes can be avoided. Now that we have more
+ * complete information, we can find things necessarily missed by the
+ * earlier code. I (khw) am not sure how much to look for here. It would
+ * be easy, but perhaps too slow, to check any candidates against all the
+ * node types they could possibly match using _invlistEQ(). */
+
+ if (cp_list
+ && ! unicode_alternate
+ && ! invert
+ && ! depends_list
+ && ! (ANYOF_FLAGS(ret) & ANYOF_CLASS)
+ && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+ {
+ UV start, end;
+ U8 op = END; /* The optimzation node-type */
+ const char * cur_parse= RExC_parse;
+
+ invlist_iterinit(cp_list);
+ if (! invlist_iternext(cp_list, &start, &end)) {
+
+ /* Here, the list is empty. This happens, for example, when a
+ * Unicode property is the only thing in the character class, and
+ * it doesn't match anything. (perluniprops.pod notes such
+ * properties) */
+ op = OPFAIL;
+ *flagp |= HASWIDTH|SIMPLE;
+ }
+ else if (start == end) { /* The range is a single code point */
+ if (! invlist_iternext(cp_list, &start, &end)
+
+ /* Don't do this optimization if it would require changing
+ * the pattern to UTF-8 */
+ && (start < 256 || UTF))
+ {
+ /* Here, the list contains a single code point. Can optimize
+ * into an EXACT node */
+
+ value = start;
+
+ if (! FOLD) {
+ op = EXACT;
+ }
+ else if (LOC) {
+
+ /* A locale node under folding with one code point can be
+ * an EXACTFL, as its fold won't be calculated until
+ * runtime */
+ op = EXACTFL;
+ }
+ else {
+
+ /* Here, we are generally folding, but there is only one
+ * code point to match. If we have to, we use an EXACT
+ * node, but it would be better for joining with adjacent
+ * nodes in the optimization pass if we used the same
+ * EXACTFish node that any such are likely to be. We can
+ * do this iff the code point doesn't participate in any
+ * folds. For example, an EXACTF of a colon is the same as
+ * an EXACT one, since nothing folds to or from a colon.
+ * In the Latin1 range, being an alpha means that the
+ * character participates in a fold (except for the
+ * feminine and masculine ordinals, which I (khw) don't
+ * think are worrying about optimizing for). */
+ if (value < 256) {
+ if (isALPHA_L1(value)) {
+ op = EXACT;
+ }
+ }
+ else {
+ if (! PL_utf8_foldable) {
+ SV* swash = swash_init("utf8", "_Perl_Any_Folds",
+ &PL_sv_undef, 1, 0);
+ PL_utf8_foldable = _get_swash_invlist(swash);
+ SvREFCNT_dec(swash);
+ }
+ if (_invlist_contains_cp(PL_utf8_foldable, value)) {
+ op = EXACT;
+ }
+ }
+
+ /* If we haven't found the node type, above, it means we
+ * can use the prevailing one */
+ if (op == END) {
+ op = compute_EXACTish(pRExC_state);
+ }
+ }
+ }
+ }
+ else if (start == 0) {
+ if (end == UV_MAX) {
+ op = SANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
+ }
+ else if (end == '\n' - 1
+ && invlist_iternext(cp_list, &start, &end)
+ && start == '\n' + 1 && end == UV_MAX)
+ {
+ op = REG_ANY;
+ *flagp |= HASWIDTH|SIMPLE;
+ RExC_naughty++;
+ }
+ }
+
+ if (op != END) {
+ RExC_parse = (char *)orig_parse;
+ RExC_emit = (regnode *)orig_emit;
+
+ ret = reg_node(pRExC_state, op);
+
+ RExC_parse = (char *)cur_parse;
+
+ if (PL_regkind[op] == EXACT) {
+ alloc_maybe_populate_EXACT(pRExC_state, ret, flagp, 0, value);
+ }
+
+ SvREFCNT_dec(listsv);
+ return ret;
+ }
+ }
+
+ /* Here, <cp_list> contains all the code points we can determine at
+ * compile time that match under all conditions. Go through it, and
+ * for things that belong in the bitmap, put them there, and delete from
+ * <cp_list>. While we are at it, see if everything above 255 is in the
+ * list, and if so, set a flag to speed up execution */
+ ANYOF_BITMAP_ZERO(ret);
+ if (cp_list) {
/* This gets set if we actually need to modify things */
bool change_invlist = FALSE;
UV start, end;
- /* Start looking through <nonbitmap> */
- invlist_iterinit(nonbitmap);
- while (invlist_iternext(nonbitmap, &start, &end)) {
+ /* Start looking through <cp_list> */
+ invlist_iterinit(cp_list);
+ while (invlist_iternext(cp_list, &start, &end)) {
UV high;
int i;
+ if (end == UV_MAX && start <= 256) {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
+ }
+
/* Quit if are above what we should change */
- if (start > max_cp_to_set) {
+ if (start > 255) {
break;
}
change_invlist = TRUE;
/* Set all the bits in the range, up to the max that we are doing */
- high = (end < max_cp_to_set) ? end : max_cp_to_set;
+ high = (end < 255) ? end : 255;
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(ret, i)) {
ANYOF_BITMAP_SET(ret, i);
- stored++;
prevvalue = value;
value = i;
}
}
/* Done with loop; remove any code points that are in the bitmap from
- * <nonbitmap> */
+ * <cp_list> */
if (change_invlist) {
- _invlist_subtract(nonbitmap,
- (DEPENDS_SEMANTICS)
- ? PL_ASCII
- : PL_Latin1,
- &nonbitmap);
+ _invlist_subtract(cp_list, PL_Latin1, &cp_list);
}
/* If have completely emptied it, remove it completely */
- if (invlist_len(nonbitmap) == 0) {
- SvREFCNT_dec(nonbitmap);
- nonbitmap = NULL;
+ if (_invlist_len(cp_list) == 0) {
+ SvREFCNT_dec(cp_list);
+ cp_list = NULL;
}
}
- /* Here, we have calculated what code points should be in the character
- * class. <nonbitmap> does not overlap the bitmap except possibly in the
- * case of DEPENDS rules.
- *
- * Now we can see about various optimizations. Fold calculation (which we
- * did above) needs to take place before inversion. Otherwise /[^k]/i
- * would invert to include K, which under /i would match k, which it
- * shouldn't. */
-
- /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
- * set the FOLD flag yet, so this does optimize those. It doesn't
- * optimize locale. Doing so perhaps could be done as long as there is
- * nothing like \w in it; some thought also would have to be given to the
- * interaction with above 0x100 chars */
- if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
- && ! LOC
- && ! unicode_alternate
- /* In case of /d, there are some things that should match only when in
- * not in the bitmap, i.e., they require UTF8 to match. These are
- * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
- * case, they don't require UTF8, so can invert here */
- && (! nonbitmap
- || ! DEPENDS_SEMANTICS
- || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
- && SvCUR(listsv) == initial_listsv_len)
- {
- int i;
- if (! nonbitmap) {
- for (i = 0; i < 256; ++i) {
- if (ANYOF_BITMAP_TEST(ret, i)) {
- ANYOF_BITMAP_CLEAR(ret, i);
- }
- else {
- ANYOF_BITMAP_SET(ret, i);
- prevvalue = value;
- value = i;
- }
- }
- /* The inversion means that everything above 255 is matched */
- ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
- }
- else {
- /* Here, also has things outside the bitmap that may overlap with
- * the bitmap. We have to sync them up, so that they get inverted
- * in both places. Earlier, we removed all overlaps except in the
- * case of /d rules, so no syncing is needed except for this case
- */
- SV *remove_list = NULL;
-
- if (DEPENDS_SEMANTICS) {
- UV start, end;
-
- /* Set the bits that correspond to the ones that aren't in the
- * bitmap. Otherwise, when we invert, we'll miss these.
- * Earlier, we removed from the nonbitmap all code points
- * < 128, so there is no extra work here */
- invlist_iterinit(nonbitmap);
- while (invlist_iternext(nonbitmap, &start, &end)) {
- if (start > 255) { /* The bit map goes to 255 */
- break;
- }
- if (end > 255) {
- end = 255;
- }
- for (i = start; i <= (int) end; ++i) {
- ANYOF_BITMAP_SET(ret, i);
- prevvalue = value;
- value = i;
- }
- }
- }
-
- /* Now invert both the bitmap and the nonbitmap. Anything in the
- * bitmap has to also be removed from the non-bitmap, but again,
- * there should not be overlap unless is /d rules. */
- _invlist_invert(nonbitmap);
-
- /* Any swash can't be used as-is, because we've inverted things */
- if (swash) {
- SvREFCNT_dec(swash);
- swash = NULL;
- }
-
- for (i = 0; i < 256; ++i) {
- if (ANYOF_BITMAP_TEST(ret, i)) {
- ANYOF_BITMAP_CLEAR(ret, i);
- if (DEPENDS_SEMANTICS) {
- if (! remove_list) {
- remove_list = _new_invlist(2);
- }
- remove_list = add_cp_to_invlist(remove_list, i);
- }
- }
- else {
- ANYOF_BITMAP_SET(ret, i);
- prevvalue = value;
- value = i;
- }
- }
-
- /* And do the removal */
- if (DEPENDS_SEMANTICS) {
- if (remove_list) {
- _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
- SvREFCNT_dec(remove_list);
- }
- }
- else {
- /* There is no overlap for non-/d, so just delete anything
- * below 256 */
- _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
- }
- }
-
- stored = 256 - stored;
-
- /* Clear the invert flag since have just done it here */
- ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
- }
-
- /* Folding in the bitmap is taken care of above, but not for locale (for
- * which we have to wait to see what folding is in effect at runtime), and
- * for some things not in the bitmap (only the upper latin folds in this
- * case, as all other single-char folding has been set above). Set
- * run-time fold flag for these */
- if (FOLD && (LOC
- || (DEPENDS_SEMANTICS
- && nonbitmap
- && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
- || unicode_alternate))
- {
- ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
+ if (invert) {
+ ANYOF_FLAGS(ret) |= ANYOF_INVERT;
}
- /* A single character class can be "optimized" into an EXACTish node.
- * Note that since we don't currently count how many characters there are
- * outside the bitmap, we are XXX missing optimization possibilities for
- * them. This optimization can't happen unless this is a truly single
- * character class, which means that it can't be an inversion into a
- * many-character class, and there must be no possibility of there being
- * things outside the bitmap. 'stored' (only) for locales doesn't include
- * \w, etc, so have to make a special test that they aren't present
- *
- * Similarly A 2-character class of the very special form like [bB] can be
- * optimized into an EXACTFish node, but only for non-locales, and for
- * characters which only have the two folds; so things like 'fF' and 'Ii'
- * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
- * FI'. */
- if (! nonbitmap
- && ! unicode_alternate
- && SvCUR(listsv) == initial_listsv_len
- && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
- && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
- || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
- || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
- && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
- /* If the latest code point has a fold whose
- * bit is set, it must be the only other one */
- && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
- && ANYOF_BITMAP_TEST(ret, prevvalue)))))
- {
- /* Note that the information needed to decide to do this optimization
- * is not currently available until the 2nd pass, and that the actually
- * used EXACTish node takes less space than the calculated ANYOF node,
- * and hence the amount of space calculated in the first pass is larger
- * than actually used, so this optimization doesn't gain us any space.
- * But an EXACT node is faster than an ANYOF node, and can be combined
- * with any adjacent EXACT nodes later by the optimizer for further
- * gains. The speed of executing an EXACTF is similar to an ANYOF
- * node, so the optimization advantage comes from the ability to join
- * it to adjacent EXACT nodes */
-
- const char * cur_parse= RExC_parse;
- U8 op;
- RExC_emit = (regnode *)orig_emit;
- RExC_parse = (char *)orig_parse;
-
- if (stored == 1) {
-
- /* A locale node with one point can be folded; all the other cases
- * with folding will have two points, since we calculate them above
- */
- if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
- op = EXACTFL;
- }
- else {
- op = EXACT;
- }
- }
- else { /* else 2 chars in the bit map: the folds of each other */
-
- /* Use the folded value, which for the cases where we get here,
- * is just the lower case of the current one (which may resolve to
- * itself, or to the other one */
- value = toLOWER_LATIN1(value);
-
- /* To join adjacent nodes, they must be the exact EXACTish type.
- * Try to use the most likely type, by using EXACTFA if possible,
- * then EXACTFU if the regex calls for it, or is required because
- * the character is non-ASCII. (If <value> is ASCII, its fold is
- * also ASCII for the cases where we get here.) */
- if (MORE_ASCII_RESTRICTED && isASCII(value)) {
- op = EXACTFA;
- }
- else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
- op = EXACTFU;
- }
- else { /* Otherwise, more likely to be EXACTF type */
- op = EXACTF;
- }
- }
-
- ret = reg_node(pRExC_state, op);
- RExC_parse = (char *)cur_parse;
- if (UTF && ! NATIVE_IS_INVARIANT(value)) {
- *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
- *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
- STR_LEN(ret)= 2;
- RExC_emit += STR_SZ(2);
+ /* Here, the bitmap has been populated with all the Latin1 code points that
+ * always match. Can now add to the overall list those that match only
+ * when the target string is UTF-8 (<depends_list>). */
+ if (depends_list) {
+ if (cp_list) {
+ _invlist_union(cp_list, depends_list, &cp_list);
+ SvREFCNT_dec(depends_list);
}
else {
- *STRING(ret)= (char)value;
- STR_LEN(ret)= 1;
- RExC_emit += STR_SZ(1);
+ cp_list = depends_list;
}
- SvREFCNT_dec(listsv);
- return ret;
}
/* If there is a swash and more than one element, we can't use the swash in
SvREFCNT_dec(swash);
swash = NULL;
}
- if (! nonbitmap
- && SvCUR(listsv) == initial_listsv_len
+
+ if (! cp_list
+ && ! HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
&& ! unicode_alternate)
{
ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
* swash is stored there now.
* av[2] stores the multicharacter foldings, used later in
* regexec.c:S_reginclass().
- * av[3] stores the nonbitmap inversion list for use in addition or
- * instead of av[0]; not used if av[1] isn't NULL
+ * av[3] stores the cp_list inversion list for use in addition or
+ * instead of av[0]; used only if av[1] is NULL
* av[4] is set if any component of the class is from a user-defined
- * property; not used if av[1] isn't NULL */
+ * property; used only if av[1] is NULL */
AV * const av = newAV();
SV *rv;
- av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
- ? &PL_sv_undef
- : listsv);
+ av_store(av, 0, (HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
+ ? listsv
+ : &PL_sv_undef);
if (swash) {
av_store(av, 1, swash);
- SvREFCNT_dec(nonbitmap);
+ SvREFCNT_dec(cp_list);
}
else {
av_store(av, 1, NULL);
- if (nonbitmap) {
- av_store(av, 3, nonbitmap);
+ if (cp_list) {
+ av_store(av, 3, cp_list);
av_store(av, 4, newSVuv(has_user_defined_property));
}
}
RExC_rxi->data->data[n] = (void*)rv;
ARG_SET(ret, n);
}
+
+ *flagp |= HASWIDTH|SIMPLE;
return ret;
}
+#undef HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION
/* reg_skipcomment()
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
dVAR;
- register regnode *ptr;
+ regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
dVAR;
- register regnode *ptr;
+ regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
dVAR;
- register regnode *src;
- register regnode *dst;
- register regnode *place;
+ regnode *src;
+ regnode *dst;
+ regnode *place;
const int offset = regarglen[(U8)op];
const int size = NODE_STEP_REGNODE + offset;
GET_RE_DEBUG_FLAGS_DECL;
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
dVAR;
- register regnode *scan;
+ regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGTAIL;
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
{
dVAR;
- register regnode *scan;
+ regnode *scan;
U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
I32 min = 0;
{
#ifdef DEBUGGING
dVAR;
- register int k;
+ int k;
+
+ /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
+ static const char * const anyofs[] = {
+ "\\w",
+ "\\W",
+ "\\s",
+ "\\S",
+ "\\d",
+ "\\D",
+ "[:alnum:]",
+ "[:^alnum:]",
+ "[:alpha:]",
+ "[:^alpha:]",
+ "[:ascii:]",
+ "[:^ascii:]",
+ "[:cntrl:]",
+ "[:^cntrl:]",
+ "[:graph:]",
+ "[:^graph:]",
+ "[:lower:]",
+ "[:^lower:]",
+ "[:print:]",
+ "[:^print:]",
+ "[:punct:]",
+ "[:^punct:]",
+ "[:upper:]",
+ "[:^upper:]",
+ "[:xdigit:]",
+ "[:^xdigit:]",
+ "[:space:]",
+ "[:^space:]",
+ "[:blank:]",
+ "[:^blank:]"
+ };
RXi_GET_DECL(prog,progi);
GET_RE_DEBUG_FLAGS_DECL;
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
- /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
- static const char * const anyofs[] = {
- "\\w",
- "\\W",
- "\\s",
- "\\S",
- "\\d",
- "\\D",
- "[:alnum:]",
- "[:^alnum:]",
- "[:alpha:]",
- "[:^alpha:]",
- "[:ascii:]",
- "[:^ascii:]",
- "[:cntrl:]",
- "[:^cntrl:]",
- "[:graph:]",
- "[:^graph:]",
- "[:lower:]",
- "[:^lower:]",
- "[:print:]",
- "[:^print:]",
- "[:punct:]",
- "[:^punct:]",
- "[:upper:]",
- "[:^upper:]",
- "[:xdigit:]",
- "[:^xdigit:]",
- "[:space:]",
- "[:^space:]",
- "[:blank:]",
- "[:^blank:]"
- };
if (flags & ANYOF_LOCALE)
sv_catpvs(sv, "{loc}");
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
+ else if (k == POSIXD) {
+ U8 index = FLAGS(o) * 2;
+ if (index > (sizeof(anyofs) / sizeof(anyofs[0]))) {
+ Perl_sv_catpvf(aTHX_ sv, "[illegal type=%d])", index);
+ }
+ else {
+ sv_catpv(sv, anyofs[index]);
+ }
+ }
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
#else
{
struct regexp *ret;
struct regexp *const r = (struct regexp *)SvANY(rx);
- register const I32 npar = r->nparens+1;
PERL_ARGS_ASSERT_REG_TEMP_COPY;
SvLEN_set(ret_x, 0);
SvSTASH_set(ret_x, NULL);
SvMAGIC_set(ret_x, NULL);
- Newx(ret->offs, npar, regexp_paren_pair);
- Copy(r->offs, ret->offs, npar, regexp_paren_pair);
+ if (r->offs) {
+ const I32 npar = r->nparens+1;
+ Newx(ret->offs, npar, regexp_paren_pair);
+ Copy(r->offs, ret->offs, npar, regexp_paren_pair);
+ }
if (r->substrs) {
Newx(ret->substrs, 1, struct reg_substr_data);
StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
Perl_regnext(pTHX_ register regnode *p)
{
dVAR;
- register I32 offset;
+ I32 offset;
if (!p)
return(NULL);
Copy(&PL_reg_state, state, 1, struct re_save_state);
- PL_reg_start_tmp = 0;
- PL_reg_start_tmpl = 0;
PL_reg_oldsaved = NULL;
PL_reg_oldsavedlen = 0;
PL_reg_maxiter = 0;
SV* sv, I32 indent, U32 depth)
{
dVAR;
- register U8 op = PSEUDO; /* Arbitrary non-END op. */
- register const regnode *next;
+ U8 op = PSEUDO; /* Arbitrary non-END op. */
+ const regnode *next;
const regnode *optstart= NULL;
RXi_GET_DECL(r,ri);
if (PL_regkind[(U8)op] == BRANCHJ) {
assert(next);
{
- register const regnode *nnode = (OP(next) == LONGJMP
- ? regnext((regnode *)next)
- : next);
+ const regnode *nnode = (OP(next) == LONGJMP
+ ? regnext((regnode *)next)
+ : next);
if (last && nnode > last)
nnode = last;
DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);