#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
+#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
+ _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
- ((*s) == '{' && regcurly(s, FALSE)))
+ ((*s) == '{' && regcurly(s)))
/*
* Flags to be passed up and down.
regnode_ssc *start_class;
} scan_data_t;
-/* The below is perhaps overboard, but this allows us to save a test at the
- * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
- * and 'a' differ by a single bit; the same with the upper and lower case of
- * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
- * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
- * then inverts it to form a mask, with just a single 0, in the bit position
- * where the upper- and lowercase differ. XXX There are about 40 other
- * instances in the Perl core where this micro-optimization could be used.
- * Should decide if maintenance cost is worse, before changing those
- *
- * Returns a boolean as to whether or not 'v' is either a lowercase or
- * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
- * compile-time constant, the generated code is better than some optimizing
- * compilers figure out, amounting to a mask and test. The results are
- * meaningless if 'c' is not one of [A-Za-z] */
-#define isARG2_lower_or_UPPER_ARG1(c, v) \
- (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
-
/*
* Forward declarations for pregcomp()'s friends.
*/
}
STATIC int
-S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
+S_ssc_is_anything(const regnode_ssc *ssc)
{
/* Returns TRUE if the SSC 'ssc' can match the empty string and any code
* point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
}
STATIC int
-S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
- const regnode_ssc *ssc)
+S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
+ const regnode_ssc *ssc)
{
/* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
* to the list of code points matched, and locale posix classes; hence does
}
}
- /* An ANYOF node contains a bitmap for the first 256 code points, and an
- * inversion list for the others, but if there are code points that should
- * match only conditionally on the target string being UTF-8, those are
- * placed in the inversion list, and not the bitmap. Since there are
- * circumstances under which they could match, they are included in the
- * SSC. But if the ANYOF node is to be inverted, we have to exclude them
- * here, so that when we invert below, the end result actually does include
- * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
- * before we add the unconditionally matched code points */
+ /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
+ * code points, and an inversion list for the others, but if there are code
+ * points that should match only conditionally on the target string being
+ * UTF-8, those are placed in the inversion list, and not the bitmap.
+ * Since there are circumstances under which they could match, they are
+ * included in the SSC. But if the ANYOF node is to be inverted, we have
+ * to exclude them here, so that when we invert below, the end result
+ * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
+ * have to do this here before we add the unconditionally matched code
+ * points */
if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
_invlist_intersection_complement_2nd(invlist,
PL_UpperLatin1,
}
/* Add in the points from the bit map */
- for (i = 0; i < 256; i++) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
invlist = add_cp_to_invlist(invlist, i);
new_node_has_latin1 = TRUE;
}
PERL_STATIC_INLINE void
-S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
+S_ssc_clear_locale(regnode_ssc *ssc)
{
/* Set the SSC 'ssc' to not match any locale things */
-
PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
assert(is_ANYOF_SYNTHETIC(ssc));
{
/* The inversion list in the SSC is marked mortal; now we need a more
* permanent copy, which is stored the same way that is done in a regular
- * ANYOF node, with the first 256 code points in a bit map */
+ * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
+ * map */
SV* invlist = invlist_clone(ssc->invlist);
May be the same as tail.
tail : item following the branch sequence
count : words in the sequence
- flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
+ flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
depth : indent depth
Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
regnode *first, regnode *last, regnode *tail,
U32 word_count, U32 flags, U32 depth)
{
- dVAR;
/* first pass, loop through and scan words */
reg_trie_data *trie;
HV *widecharmap = NULL;
: MADE_TRIE;
}
-STATIC void
-S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
+STATIC regnode *
+S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
{
/* The Trie is constructed and compressed now so we can build a fail array if
* it's needed
U32 *fail;
reg_ac_data *aho;
const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
+ regnode *stclass;
GET_RE_DEBUG_FLAGS_DECL;
- PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
+ PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
+ PERL_UNUSED_CONTEXT;
#ifndef DEBUGGING
PERL_UNUSED_ARG(depth);
#endif
+ if ( OP(source) == TRIE ) {
+ struct regnode_1 *op = (struct regnode_1 *)
+ PerlMemShared_calloc(1, sizeof(struct regnode_1));
+ StructCopy(source,op,struct regnode_1);
+ stclass = (regnode *)op;
+ } else {
+ struct regnode_charclass *op = (struct regnode_charclass *)
+ PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
+ StructCopy(source,op,struct regnode_charclass);
+ stclass = (regnode *)op;
+ }
+ OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
ARG_SET( stclass, data_slot );
aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
});
Safefree(q);
/*RExC_seen |= REG_TRIEDFA_SEEN;*/
+ return stclass;
}
else { /* Here is a generic multi-char fold. */
U8* multi_end = s + len;
- /* Count how many characters in it. In the case of /aa, no
- * folds which contain ASCII code points are allowed, so
- * check for those, and skip if found. */
+ /* Count how many characters are in it. In the case of
+ * /aa, no folds which contain ASCII code points are
+ * allowed, so check for those, and skip if found. */
if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
count = utf8_length(s, multi_end);
s = multi_end;
}
if (len == 2
- && isARG2_lower_or_UPPER_ARG1('s', *s)
- && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
+ && isALPHA_FOLD_EQ(*s, 's')
+ && isALPHA_FOLD_EQ(*(s+1), 's'))
{
/* EXACTF nodes need to know that the minimum length
/* recursed: which subroutines have we recursed into */
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
- dVAR;
/* There must be at least this number of characters to match */
SSize_t min = 0;
I32 pars = 0, code;
}
flags &= ~SCF_DO_STCLASS;
}
- else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
+ else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT!, so is
+ EXACTFish */
SSize_t l = STR_LEN(scan);
UV uc = *((U8*)STRING(scan));
SV* EXACTF_invlist = _new_invlist(4); /* Start out big enough for 2
separate code points */
+ const U8 * s = (U8*)STRING(scan);
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR) {
scan_commit(pRExC_state, data, minlenp, is_inf);
}
if (UTF) {
- const U8 * const s = (U8 *)STRING(scan);
uc = utf8_to_uvchr_buf(s, s + l, NULL);
l = utf8_length(s, s + l);
}
data->longest = &(data->longest_float);
}
}
- if (OP(scan) == EXACTFL) {
- /* We don't know what the folds are; it could be anything. XXX
- * Actually, we only support UTF-8 encoding for code points
- * above Latin1, so we could know what those folds are. */
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0,
- UV_MAX);
+ if (OP(scan) != EXACTFL && flags & SCF_DO_STCLASS_AND) {
+ ssc_clear_locale(data->start_class);
}
- else { /* Non-locale EXACTFish */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
- if (flags & SCF_DO_STCLASS_AND) {
- ssc_clear_locale(data->start_class);
+
+ if (! UTF) {
+
+ /* We punt and assume can match anything if the node begins
+ * with a multi-character fold. Things are complicated. For
+ * example, /ffi/i could match any of:
+ * "\N{LATIN SMALL LIGATURE FFI}"
+ * "\N{LATIN SMALL LIGATURE FF}I"
+ * "F\N{LATIN SMALL LIGATURE FI}"
+ * plus several other things; and making sure we have all the
+ * possibilities is hard. */
+ if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + STR_LEN(scan))) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
}
- if (uc < 256) { /* We know what the Latin1 folds are ... */
- if (IS_IN_SOME_FOLD_L1(uc)) { /* For instance, we
- know if anything folds
- with this */
- EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
+ else {
+
+ /* Any Latin1 range character can potentially match any
+ * other depending on the locale */
+ if (OP(scan) == EXACTFL) {
+ _invlist_union(EXACTF_invlist, PL_Latin1,
+ &EXACTF_invlist);
+ }
+ else {
+ /* But otherwise, it matches at least itself. We can
+ * quickly tell if it has a distinct fold, and if so,
+ * it matches that as well */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (IS_IN_SOME_FOLD_L1(uc)) {
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist,
PL_fold_latin1[uc]);
- if (OP(scan) != EXACTFA) { /* The folds below aren't
- legal under /iaa */
- if (isARG2_lower_or_UPPER_ARG1('s', uc)) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist,
- LATIN_SMALL_LETTER_SHARP_S);
- }
- else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 's');
- EXACTF_invlist
- = add_cp_to_invlist(EXACTF_invlist, 'S');
- }
}
+ }
- /* We also know if there are above-Latin1 code points
- * that fold to this (none legal for ASCII and /iaa) */
- if ((! isASCII(uc) || OP(scan) != EXACTFA)
- && HAS_NONLATIN1_FOLD_CLOSURE(uc))
- {
- /* XXX We could know exactly what does fold to this
- * if the reverse folds are loaded, as currently in
- * S_regclass() */
- _invlist_union(EXACTF_invlist,
- PL_AboveLatin1,
- &EXACTF_invlist);
+ /* Some characters match above-Latin1 ones under /i. This
+ * is true of EXACTFL ones when the locale is UTF-8 */
+ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
+ && (! isASCII(uc) || (OP(scan) != EXACTFA
+ && OP(scan) != EXACTFA_NO_TRIE)))
+ {
+ add_above_Latin1_folds(pRExC_state,
+ (U8) uc,
+ &EXACTF_invlist);
+ }
+ }
+ }
+ else { /* Pattern is UTF-8 */
+ U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
+ STRLEN foldlen = UTF8SKIP(s);
+ const U8* e = s + STR_LEN(scan);
+ SV** listp;
+
+ /* The only code points that aren't folded in a UTF EXACTFish
+ * node are are the problematic ones in EXACTFL nodes */
+ if (OP(scan) == EXACTFL
+ && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc))
+ {
+ /* We need to check for the possibility that this EXACTFL
+ * node begins with a multi-char fold. Therefore we fold
+ * the first few characters of it so that we can make that
+ * check */
+ U8 *d = folded;
+ int i;
+
+ for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
+ if (isASCII(*s)) {
+ *(d++) = (U8) toFOLD(*s);
+ s++;
+ }
+ else {
+ STRLEN len;
+ to_utf8_fold(s, d, &len);
+ d += len;
+ s += UTF8SKIP(s);
}
}
+
+ /* And set up so the code below that looks in this folded
+ * buffer instead of the node's string */
+ e = d;
+ foldlen = UTF8SKIP(folded);
+ s = folded;
}
- else { /* Non-locale, above Latin1. XXX We don't currently
- know what participates in folds with this, so have
- to assume anything could */
-
- /* XXX We could know exactly what does fold to this if the
- * reverse folds are loaded, as currently in S_regclass().
- * But we do know that under /iaa nothing in the ASCII
- * range can participate */
- if (OP(scan) == EXACTFA) {
- _invlist_union_complement_2nd(EXACTF_invlist,
- PL_XPosix_ptrs[_CC_ASCII],
- &EXACTF_invlist);
+
+ /* When we reach here 's' points to the fold of the first
+ * character(s) of the node; and 'e' points to far enough along
+ * the folded string to be just past any possible multi-char
+ * fold. 'foldlen' is the length in bytes of the first
+ * character in 's'
+ *
+ * Unlike the non-UTF-8 case, the macro for determining if a
+ * string is a multi-char fold requires all the characters to
+ * already be folded. This is because of all the complications
+ * if not. Note that they are folded anyway, except in EXACTFL
+ * nodes. Like the non-UTF case above, we punt if the node
+ * begins with a multi-char fold */
+
+ if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
+ EXACTF_invlist =
+ _add_range_to_invlist(EXACTF_invlist, 0, UV_MAX);
+ }
+ else { /* Single char fold */
+
+ /* It matches all the things that fold to it, which are
+ * found in PL_utf8_foldclosures (including itself) */
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, uc);
+ if (! PL_utf8_foldclosures) {
+ _load_PL_utf8_foldclosures();
}
- else {
- EXACTF_invlist = _add_range_to_invlist(EXACTF_invlist,
- 0, UV_MAX);
+ if ((listp = hv_fetch(PL_utf8_foldclosures,
+ (char *) s, foldlen, FALSE)))
+ {
+ AV* list = (AV*) *listp;
+ IV k;
+ for (k = 0; k <= av_tindex(list); k++) {
+ SV** c_p = av_fetch(list, k, FALSE);
+ UV c;
+ assert(c_p);
+
+ c = SvUV(*c_p);
+
+ /* /aa doesn't allow folds between ASCII and non- */
+ if ((OP(scan) == EXACTFA || OP(scan) == EXACTFA_NO_TRIE)
+ && isASCII(c) != isASCII(uc))
+ {
+ continue;
+ }
+
+ EXACTF_invlist = add_cp_to_invlist(EXACTF_invlist, c);
+ }
}
}
}
/* It is counted once already... */
data->pos_min += minnext * (mincount - counted);
#if 0
-PerlIO_printf(Perl_debug_log, "counted=%"UVdf" deltanext=%"UVdf
- " SSize_t_MAX=%"UVdf" minnext=%"UVdf
- " maxcount=%"UVdf" mincount=%"UVdf"\n",
+PerlIO_printf(Perl_debug_log, "counted=%"UVuf" deltanext=%"UVuf
+ " SSize_t_MAX=%"UVuf" minnext=%"UVuf
+ " maxcount=%"UVuf" mincount=%"UVuf"\n",
(UV)counted, (UV)deltanext, (UV)SSize_t_MAX, (UV)minnext, (UV)maxcount,
(UV)mincount);
if (deltanext != SSize_t_MAX)
-PerlIO_printf(Perl_debug_log, "LHS=%"UVdf" RHS=%"UVdf"\n",
+PerlIO_printf(Perl_debug_log, "LHS=%"UVuf" RHS=%"UVuf"\n",
(UV)(-counted * deltanext + (minnext + deltanext) * maxcount
- minnext * mincount), (UV)(SSize_t_MAX - data->pos_delta));
#endif
else if ( PL_regkind[OP(scan)] == BRANCHJ
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
- && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
+ && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
+ {
if ( OP(scan) == UNLESSM &&
scan->flags == 0 &&
OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
*/
ssc_init(pRExC_state, data->start_class);
} else {
- /* AND before and after: combine and continue */
+ /* AND before and after: combine and continue. These
+ * assertions are zero-length, so can match an EMPTY
+ * string */
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
}
}
if (f & SCF_DO_STCLASS_AND) {
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
return count;
}
-/*XXX: todo make this not included in a non debugging perl */
+/*XXX: todo make this not included in a non debugging perl, but appears to be
+ * used anyway there, in 'use re' */
#ifndef PERL_IN_XSUB_RE
void
Perl_reginitcolors(pTHX)
{
- dVAR;
const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
if (s) {
char *t = savepv(s);
regexp_engine const *
Perl_current_re_engine(pTHX)
{
- dVAR;
-
if (IN_PERL_COMPILETIME) {
HV * const table = GvHV(PL_hintgv);
SV **ptr;
REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
{
- dVAR;
regexp_engine const *eng = current_re_engine();
GET_RE_DEBUG_FLAGS_DECL;
/* if we know we have at least two args, create an empty string,
* then concatenate args to that. For no args, return an empty string */
if (!pat && pat_count != 1) {
- pat = newSVpvn("", 0);
+ pat = newSVpvs("");
SAVEFREESV(pat);
alloced = TRUE;
}
if (oplist) {
assert(oplist->op_type == OP_PADAV
|| oplist->op_type == OP_RV2AV);
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);
}
if (SvRMAGICAL(av)) {
pRExC_state->code_blocks[n].src_regex = NULL;
n++;
code = 1;
- oplist = oplist->op_sibling; /* skip CONST */
+ oplist = OP_SIBLING(oplist); /* skip CONST */
assert(oplist);
}
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);;
}
/* apply magic and QR overloading to arg */
{
int n = 0;
STRLEN s;
+
+ PERL_UNUSED_CONTEXT;
for (s = 0; s < plen; s++) {
if (n < pRExC_state->num_code_blocks
OP *expr, const regexp_engine* eng, REGEXP *old_re,
bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
{
- dVAR;
REGEXP *rx;
struct regexp *r;
regexp_internal *ri;
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
ncode++; /* count of DO blocks */
if (ncode) {
if (expr->op_type == OP_CONST)
n = 1;
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
n++;
}
if (expr->op_type == OP_CONST)
new_patternp[n] = cSVOPx_sv(expr);
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
new_patternp[n++] = cSVOPo_sv;
}
assert( expr->op_type == OP_PUSHMARK
|| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
|| expr->op_type == OP_PADRANGE);
- expr = expr->op_sibling;
+ expr = OP_SIBLING(expr);
}
pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
else if (PL_regkind[OP(first)] == TRIE &&
((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
{
- regnode *trie_op;
- /* this can happen only on restudy */
- if ( OP(first) == TRIE ) {
- struct regnode_1 *trieop = (struct regnode_1 *)
- PerlMemShared_calloc(1, sizeof(struct regnode_1));
- StructCopy(first,trieop,struct regnode_1);
- trie_op=(regnode *)trieop;
- } else {
- struct regnode_charclass *trieop = (struct regnode_charclass *)
- PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
- StructCopy(first,trieop,struct regnode_charclass);
- trie_op=(regnode *)trieop;
- }
- OP(trie_op)+=2;
- make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
- ri->regstclass = trie_op;
+ /* this can happen only on restudy */
+ ri->regstclass = construct_ahocorasick_from_trie(pRExC_state, (regnode *)first, 0);
}
#endif
else if (REGNODE_SIMPLE(OP(first)))
goto again;
}
else if ((!sawopen || !RExC_sawback) &&
+ !sawlookahead &&
(OP(first) == STAR &&
PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
!(r->intflags & PREGf_ANCH) && !pRExC_state->num_code_blocks)
/* The header definitions are in F<inline_invlist.c> */
PERL_STATIC_INLINE UV*
-S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
+S__invlist_array_init(SV* const invlist, const bool will_have_0)
{
/* Returns a pointer to the first element in the inversion list's array.
* This is called upon initialization of an inversion list. Where the
}
PERL_STATIC_INLINE UV*
-S_invlist_array(pTHX_ SV* const invlist)
+S_invlist_array(SV* const invlist)
{
/* Returns the pointer to the inversion list's array. Every time the
* length changes, this needs to be called in case malloc or realloc moved
{
/* Sets the current number of elements stored in the inversion list.
* Updates SvCUR correspondingly */
-
+ PERL_UNUSED_CONTEXT;
PERL_ARGS_ASSERT_INVLIST_SET_LEN;
assert(SvTYPE(invlist) == SVt_INVLIST);
}
PERL_STATIC_INLINE IV*
-S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
+S_get_invlist_previous_index_addr(SV* invlist)
{
/* Return the address of the IV that is reserved to hold the cached index
* */
-
PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
assert(SvTYPE(invlist) == SVt_INVLIST);
}
PERL_STATIC_INLINE IV
-S_invlist_previous_index(pTHX_ SV* const invlist)
+S_invlist_previous_index(SV* const invlist)
{
/* Returns cached index of previous search */
}
PERL_STATIC_INLINE void
-S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
+S_invlist_set_previous_index(SV* const invlist, const IV index)
{
/* Caches <index> for later retrieval */
}
PERL_STATIC_INLINE UV
-S_invlist_max(pTHX_ SV* const invlist)
+S_invlist_max(SV* const invlist)
{
/* Returns the maximum number of elements storable in the inversion list's
* array, without having to realloc() */
}
PERL_STATIC_INLINE void
-S_invlist_trim(pTHX_ SV* const invlist)
+S_invlist_trim(SV* const invlist)
{
PERL_ARGS_ASSERT_INVLIST_TRIM;
#ifndef PERL_IN_XSUB_RE
IV
-Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
+Perl__invlist_search(SV* const invlist, const UV cp)
{
/* Searches the inversion list for the entry that contains the input code
* point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
}
void
-Perl__invlist_populate_swatch(pTHX_ SV* const invlist,
- const UV start, const UV end, U8* swatch)
+Perl__invlist_populate_swatch(SV* const invlist,
+ const UV start, const UV end, U8* swatch)
{
/* populates a swatch of a swash the same way swatch_get() does in utf8.c,
* but is used when the swash has an inversion list. This makes this much
}
PERL_STATIC_INLINE STRLEN*
-S_get_invlist_iter_addr(pTHX_ SV* invlist)
+S_get_invlist_iter_addr(SV* invlist)
{
/* Return the address of the UV that contains the current iteration
* position */
}
PERL_STATIC_INLINE void
-S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
+S_invlist_iterinit(SV* invlist) /* Initialize iterator for invlist */
{
PERL_ARGS_ASSERT_INVLIST_ITERINIT;
}
PERL_STATIC_INLINE void
-S_invlist_iterfinish(pTHX_ SV* invlist)
+S_invlist_iterfinish(SV* invlist)
{
/* Terminate iterator for invlist. This is to catch development errors.
* Any iteration that is interrupted before completed should call this
}
STATIC bool
-S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
+S_invlist_iternext(SV* invlist, UV* start, UV* end)
{
/* An C<invlist_iterinit> call on <invlist> must be used to set this up.
* This call sets in <*start> and <*end>, the next range in <invlist>.
}
PERL_STATIC_INLINE bool
-S_invlist_is_iterating(pTHX_ SV* const invlist)
+S_invlist_is_iterating(SV* const invlist)
{
PERL_ARGS_ASSERT_INVLIST_IS_ITERATING;
}
PERL_STATIC_INLINE UV
-S_invlist_highest(pTHX_ SV* const invlist)
+S_invlist_highest(SV* const invlist)
{
/* Returns the highest code point that matches an inversion list. This API
* has an ambiguity, as it returns 0 under either the highest is actually
count += 2;
}
}
+
+void
+Perl__load_PL_utf8_foldclosures (pTHX)
+{
+ assert(! PL_utf8_foldclosures);
+
+ /* If the folds haven't been read in, call a fold function
+ * to force that */
+ if (! PL_utf8_tofold) {
+ U8 dummy[UTF8_MAXBYTES_CASE+1];
+
+ /* This string is just a short named one above \xff */
+ to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
+ assert(PL_utf8_tofold); /* Verify that worked */
+ }
+ PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+}
#endif
#ifdef PERL_ARGS_ASSERT__INVLISTEQ
* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
* this flag alerts us to the need to check for that */
{
- dVAR;
regnode *ret; /* Will be the head of the group. */
regnode *br;
regnode *lastbr;
bool is_open = 0;
I32 freeze_paren = 0;
I32 after_freeze = 0;
+ I32 num; /* numeric backreferences */
char * parse_start = RExC_parse; /* MJD */
char * const oregcomp_parse = RExC_parse;
int internal_argval = 0; /* internal_argval is only useful if
!argok */
- if (has_intervening_patws && SIZE_ONLY) {
- ckWARNregdep(RExC_parse + 1, "In '(*VERB...)', splitting the initial '(*' is deprecated");
+ if (has_intervening_patws) {
+ RExC_parse++;
+ vFAIL("In '(*VERB...)', the '(' and '*' must be adjacent");
}
while ( *RExC_parse && *RExC_parse != ')' ) {
if ( *RExC_parse == ':' ) {
else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
- if (has_intervening_patws && SIZE_ONLY) {
- ckWARNregdep(RExC_parse + 1, "In '(?...)', splitting the initial '(?' is deprecated");
+ const char * endptr;
+ if (has_intervening_patws) {
+ RExC_parse++;
+ vFAIL("In '(?...)', the '(' and '?' must be adjacent");
}
RExC_parse++;
case '@': /* (?@...) */
vFAIL2("Sequence (?%c...) not implemented", (int)paren);
break;
- case '#': /* (?#...) */
- /* XXX As soon as we disallow separating the '?' and '*' (by
- * spaces or (?#...) comment), it is believed that this case
- * will be unreachable and can be removed. See
- * [perl #117327] */
- while (*RExC_parse && *RExC_parse != ')')
- RExC_parse++;
- if (*RExC_parse != ')')
- FAIL("Sequence (?#... not terminated");
- nextchar(pRExC_state);
- *flagp = TRYAGAIN;
- return NULL;
case '0' : /* (?0) */
case 'R' : /* (?R) */
if (*RExC_parse != ')')
nextchar(pRExC_state);
return ret;
/*notreached*/
- { /* named and numeric backreferences */
- I32 num;
+ /* named and numeric backreferences */
case '&': /* (?&NAME) */
parse_start = RExC_parse - 1;
named_recursion:
case '5': case '6': case '7': case '8': case '9':
RExC_parse--;
parse_recursion:
- num = atoi(RExC_parse);
- parse_start = RExC_parse - 1; /* MJD */
- if (*RExC_parse == '-')
- RExC_parse++;
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ {
+ bool is_neg = FALSE;
+ parse_start = RExC_parse - 1; /* MJD */
+ if (*RExC_parse == '-') {
+ RExC_parse++;
+ is_neg = TRUE;
+ }
+ num = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
+ if (is_neg) {
+ /* Some limit for num? */
+ num = -num;
+ }
+ }
if (*RExC_parse!=')')
vFAIL("Expecting close bracket");
*flagp |= POSTPONED;
nextchar(pRExC_state);
return ret;
- } /* named and numeric backreferences */
+
assert(0); /* NOT REACHED */
case '?': /* (??...) */
REGTAIL(pRExC_state, ret, tail);
goto insert_if;
}
+ /* Fall through to ‘Unknown switch condition’ at the
+ end of the if/else chain. */
}
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
RExC_parse++;
parno = 0;
if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
- parno = atoi(RExC_parse++);
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
} else if (RExC_parse[0] == '&') {
SV *sv_dat;
RExC_parse++;
/* (?(1)...) */
char c;
char *tmp;
- parno = atoi(RExC_parse++);
-
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
but I can't figure out why. -- dmq*/
return ret;
}
- else {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- vFAIL("Unknown switch condition (?(...))");
- }
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
return handle_regex_sets(pRExC_state, NULL, flagp, depth,
STATIC regnode *
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
{
- dVAR;
regnode *ret;
regnode *chain = NULL;
regnode *latest;
STATIC regnode *
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- dVAR;
regnode *ret;
char op;
char *next;
op = *RExC_parse;
- if (op == '{' && regcurly(RExC_parse, FALSE)) {
+ if (op == '{' && regcurly(RExC_parse)) {
maxpos = NULL;
#ifdef RE_TRACK_PATTERN_OFFSETS
parse_start = RExC_parse; /* MJD */
next++;
}
if (*next == '}') { /* got one */
+ const char* endptr;
if (!maxpos)
maxpos = next;
RExC_parse++;
- min = atoi(RExC_parse);
+ min = grok_atou(RExC_parse, &endptr);
if (*maxpos == ',')
maxpos++;
else
maxpos = RExC_parse;
- max = atoi(maxpos);
+ max = grok_atou(maxpos, &endptr);
if (!max && *maxpos != '0')
max = REG_INFTY; /* meaning "infinity" */
else if (max >= REG_INFTY)
* modifier. The other meaning does not, so use a temporary until we find
* out which we are being called with */
p = (RExC_flags & RXf_PMf_EXTENDED)
- ? regwhite( pRExC_state, RExC_parse )
+ ? regpatws(pRExC_state, RExC_parse,
+ TRUE) /* means recognize comments */
: RExC_parse;
/* Disambiguate between \N meaning a named character versus \N meaning
* [^\n]. The former is assumed when it can't be the latter. */
- if (*p != '{' || regcurly(p, FALSE)) {
+ if (*p != '{' || regcurly(p)) {
RExC_parse = p;
if (! node_p) {
/* no bare \N allowed in a charclass */
}
PERL_STATIC_INLINE U8
-S_compute_EXACTish(pTHX_ RExC_state_t *pRExC_state)
+S_compute_EXACTish(RExC_state_t *pRExC_state)
{
U8 op;
}
-/* return atoi(p), unless it's too big to sensibly be a backref,
+/* Parse backref decimal value, unless it's too big to sensibly be a backref,
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
static I32
S_backref_value(char *p)
{
- char *q = p;
-
- for (;isDIGIT(*q); q++) {} /* calculate length of num */
- if (q - p == 0 || q - p > 9)
+ const char* endptr;
+ UV val = grok_atou(p, &endptr);
+ if (endptr == p || endptr == NULL || val > I32_MAX)
return I32_MAX;
- return atoi(p);
+ return (I32)val;
}
STATIC regnode *
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- dVAR;
regnode *ret = NULL;
I32 flags = 0;
char *parse_start = RExC_parse;
U8 op;
int invert = 0;
+ U8 arg;
GET_RE_DEBUG_FLAGS_DECL;
vFAIL("Internal urp");
/* Supposed to be caught earlier. */
break;
- case '{':
- if (!regcurly(RExC_parse, FALSE)) {
- RExC_parse++;
- goto defchar;
- }
- /* FALLTHROUGH */
case '?':
case '+':
case '*':
literal text handling code.
*/
switch ((U8)*++RExC_parse) {
- U8 arg;
/* Special Escapes */
case 'A':
RExC_seen_zerolen++;
ret = reg_node(pRExC_state, CANY);
RExC_seen |= REG_CANY_SEEN;
*flagp |= HASWIDTH|SIMPLE;
+ if (SIZE_ONLY) {
+ ckWARNdep(RExC_parse+1, "\\C is deprecated");
+ }
goto finish_meta_pat;
case 'X':
ret = reg_node(pRExC_state, CLUMP);
case '#':
if (RExC_flags & RXf_PMf_EXTENDED) {
- if ( reg_skipcomment( pRExC_state ) )
+ RExC_parse = reg_skipcomment( pRExC_state, RExC_parse );
+ if (RExC_parse < RExC_end)
goto tryagain;
}
/* FALLTHROUGH */
oldp = p;
if (RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
switch ((U8)*p) {
case '^':
case '$':
p++;
break;
case 'e':
- ender = ASCII_TO_NATIVE('\033');
+ ender = ESC_NATIVE;
p++;
break;
case 'a':
- ender = '\a';
+ ender = '\a';
p++;
break;
case 'o':
goto normal_default;
} /* End of switch on '\' */
break;
+ case '{':
+ /* Currently we don't warn when the lbrace is at the start
+ * of a construct. This catches it in the middle of a
+ * literal string, or when its the first thing after
+ * something like "\b" */
+ if (! SIZE_ONLY
+ && (len || (p > RExC_start && isALPHA_A(*(p -1)))))
+ {
+ ckWARNregdep(p + 1, "Unescaped left brace in regex is deprecated, passed through");
+ }
+ /*FALLTHROUGH*/
default: /* A literal character */
-
- if (! SIZE_ONLY
- && RExC_flags & RXf_PMf_EXTENDED
- && ckWARN_d(WARN_DEPRECATED)
- && is_PATWS_non_low_safe(p, RExC_end, UTF))
- {
- vWARN_dep(p + ((UTF) ? UTF8SKIP(p) : 1),
- "Escape literal pattern white space under /x");
- }
-
normal_default:
if (UTF8_IS_START(*p) && UTF) {
STRLEN numlen;
*/
if ( RExC_flags & RXf_PMf_EXTENDED)
- p = regwhite( pRExC_state, p );
+ p = regpatws(pRExC_state, p,
+ TRUE); /* means recognize comments */
/* If the next thing is a quantifier, it applies to this
* character only, which means that this character has to be in
&& (PL_fold[ender] != PL_fold_latin1[ender]
|| ender == LATIN_SMALL_LETTER_SHARP_S
|| (len > 0
- && isARG2_lower_or_UPPER_ARG1('s', ender)
- && isARG2_lower_or_UPPER_ARG1('s',
- *(s-1)))))
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))))
{
maybe_exactfu = FALSE;
}
* as if it turns into an EXACTFU, it could later get
* joined with another 's' that would then wrongly match
* the sharp s */
- if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+ if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
{
maybe_exactfu = FALSE;
}
}
STATIC char *
-S_regwhite( RExC_state_t *pRExC_state, char *p )
-{
- const char *e = RExC_end;
-
- PERL_ARGS_ASSERT_REGWHITE;
-
- while (p < e) {
- if (isSPACE(*p))
- ++p;
- else if (*p == '#') {
- bool ended = 0;
- do {
- if (*p++ == '\n') {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- }
- else
- break;
- }
- return p;
-}
-
-STATIC char *
-S_regpatws( RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
+S_regpatws(RExC_state_t *pRExC_state, char *p , const bool recognize_comment )
{
/* Returns the next non-pattern-white space, non-comment character (the
* latter only if 'recognize_comment is true) in the string p, which is
- * ended by RExC_end. If there is no line break ending a comment,
- * RExC_seen has added the REG_RUN_ON_COMMENT_SEEN flag; */
+ * ended by RExC_end. See also reg_skipcomment */
const char *e = RExC_end;
PERL_ARGS_ASSERT_REGPATWS;
p += len;
}
else if (recognize_comment && *p == '#') {
- bool ended = 0;
- do {
- p++;
- if (is_LNBREAK_safe(p, e, UTF)) {
- ended = 1;
- break;
- }
- } while (p < e);
- if (!ended)
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ p = reg_skipcomment(pRExC_state, p);
}
else
break;
}
/* Quit if are above what we should change */
- if (start > 255) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
break;
}
change_invlist = TRUE;
/* Set all the bits in the range, up to the max that we are doing */
- high = (end < 255) ? end : 255;
+ high = (end < NUM_ANYOF_CODE_POINTS - 1)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(node, i)) {
ANYOF_BITMAP_SET(node, i);
PERL_STATIC_INLINE I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
{
- dVAR;
I32 namedclass = OOB_NAMEDCLASS;
PERL_ARGS_ASSERT_REGPPOSIXCC;
}
STATIC bool
-S_could_it_be_a_POSIX_class(pTHX_ RExC_state_t *pRExC_state)
+S_could_it_be_a_POSIX_class(RExC_state_t *pRExC_state)
{
/* This applies some heuristics at the current parse position (which should
* be at a '[') to see if what follows might be intended to be a [:posix:]
while (RExC_parse < RExC_end) {
SV* current = NULL;
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE); /* means recognize comments */
switch (*RExC_parse) {
case '?':
if (RExC_parse[1] == '[') depth++, RExC_parse++;
/* Skip white space */
RExC_parse = regpatws(pRExC_state, RExC_parse,
- TRUE); /* means recognize comments */
+ TRUE /* means recognize comments */ );
if (RExC_parse >= RExC_end) {
Perl_croak(aTHX_ "panic: Read past end of '(?[ ])'");
}
}
#undef IS_OPERAND
+STATIC void
+S_add_above_Latin1_folds(pTHX_ RExC_state_t *pRExC_state, const U8 cp, SV** invlist)
+{
+ /* This hard-codes the Latin1/above-Latin1 folding rules, so that an
+ * innocent-looking character class, like /[ks]/i won't have to go out to
+ * disk to find the possible matches.
+ *
+ * This should be called only for a Latin1-range code points, cp, which is
+ * known to be involved in a simple fold with other code points above
+ * Latin1. It would give false results if /aa has been specified.
+ * Multi-char folds are outside the scope of this, and must be handled
+ * specially.
+ *
+ * XXX It would be better to generate these via regen, in case a new
+ * version of the Unicode standard adds new mappings, though that is not
+ * really likely, and may be caught by the default: case of the switch
+ * below. */
+
+ PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
+
+ assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
+
+ switch (cp) {
+ case 'k':
+ case 'K':
+ *invlist =
+ add_cp_to_invlist(*invlist, KELVIN_SIGN);
+ break;
+ case 's':
+ case 'S':
+ *invlist = add_cp_to_invlist(*invlist, LATIN_SMALL_LETTER_LONG_S);
+ break;
+ case MICRO_SIGN:
+ *invlist = add_cp_to_invlist(*invlist, GREEK_CAPITAL_LETTER_MU);
+ *invlist = add_cp_to_invlist(*invlist, GREEK_SMALL_LETTER_MU);
+ break;
+ case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
+ case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
+ *invlist = add_cp_to_invlist(*invlist, ANGSTROM_SIGN);
+ break;
+ case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
+ *invlist = add_cp_to_invlist(*invlist,
+ LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
+ break;
+ case LATIN_SMALL_LETTER_SHARP_S:
+ *invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
+ break;
+ default:
+ /* Use deprecated warning to increase the chances of this being
+ * output */
+ ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%02X; please use the perlbug utility to report;", cp);
+ break;
+ }
+}
+
/* The names of properties whose definitions are not known at compile time are
* stored in this SV, after a constant heading. So if the length has been
* changed since initialization, then there is a run-time definition. */
* ignored in the recursion by means of a flag:
* <RExC_in_multi_char_class>.)
*
- * ANYOF nodes contain a bit map for the first 256 characters, with the
- * corresponding bit set if that character is in the list. For characters
- * above 255, a range list or swash is used. There are extra bits for \w,
- * etc. in locale ANYOFs, as what these match is not determinable at
- * compile time
+ * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
+ * characters, with the corresponding bit set if that character is in the
+ * list. For characters above this, a range list or swash is used. There
+ * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
+ * determinable at compile time
*
* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
* to be restarted. This can only happen if ret_invlist is non-NULL.
*/
- dVAR;
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
UV value = OOB_UNICODE, save_value = OOB_UNICODE;
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
RExC_naughty++;
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
}
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (UCHARAT(RExC_parse) == ']') {
{
namedclass = regpposixcc(pRExC_state, value, strict);
}
- else if (value == '\\') {
- if (UTF) {
+ else if (value != '\\') {
+#ifdef EBCDIC
+ literal_endpoint++;
+#endif
+ }
+ else {
+ /* Is a backslash; get the code point of the char after it */
+ if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
}
if (!SIZE_ONLY) {
SV* invlist;
- char* formatted;
char* name;
if (UCHARAT(RExC_parse) == '^') {
* will have its name be <__NAME_i>. The design is
* discussed in commit
* 2f833f5208e26b208886e51e09e2c072b5eabb46 */
- formatted = Perl_form(aTHX_
+ name = savepv(Perl_form(aTHX_
"%s%.*s%s\n",
(FOLD) ? "__" : "",
(int)n,
RExC_parse,
(FOLD) ? "_i" : ""
- );
- name = savepvn(formatted, strlen(formatted));
+ ));
/* Look up the property name, and get its swash and
* inversion list, if the property is found */
&swash_init_flags
);
if (! swash || ! (invlist = _get_swash_invlist(swash))) {
+ HV* curpkg = (IN_PERL_COMPILETIME)
+ ? PL_curstash
+ : CopSTASH(PL_curcop);
if (swash) {
SvREFCNT_dec_NN(swash);
swash = NULL;
"Property '%"UTF8f"' is unknown",
UTF8fARG(UTF, n, name));
}
+
+ /* If the property name doesn't already have a package
+ * name, add the current one to it so that it can be
+ * referred to outside it. [perl #121777] */
+ if (curpkg && ! instr(name, "::")) {
+ char* pkgname = HvNAME(curpkg);
+ if (strNE(pkgname, "main")) {
+ char* full_name = Perl_form(aTHX_
+ "%s::%s",
+ pkgname,
+ name);
+ n = strlen(full_name);
+ Safefree(name);
+ name = savepvn(full_name, n);
+ }
+ }
Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%"UTF8f"\n",
(value == 'p' ? '+' : '!'),
UTF8fARG(UTF, n, name));
case 't': value = '\t'; break;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
- case 'e': value = ASCII_TO_NATIVE('\033');break;
+ case 'e': value = ESC_NATIVE; break;
case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
break;
} /* End of switch on char following backslash */
} /* end of handling backslash escape sequences */
-#ifdef EBCDIC
- else
- literal_endpoint++;
-#endif
/* Here, we have the current token in 'value' */
ANYOF_POSIXL_ZERO(ret);
}
+ /* Coverity thinks it is possible for this to be negative; both
+ * jhi and khw think it's not, but be safer */
+ assert(! (ANYOF_FLAGS(ret) & ANYOF_POSIXL)
+ || (namedclass + ((namedclass % 2) ? -1 : 1)) >= 0);
+
/* See if it already matches the complement of this POSIX
* class */
if ((ANYOF_FLAGS(ret) & ANYOF_POSIXL)
if (skip_white) {
RExC_parse = regpatws(pRExC_state, RExC_parse,
- FALSE /* means don't recognize comments */);
+ FALSE /* means don't recognize comments */ );
}
if (range) {
AV* this_array;
STRLEN cp_count = utf8_length(foldbuf,
foldbuf + foldlen);
- SV* multi_fold = sv_2mortal(newSVpvn("", 0));
+ SV* multi_fold = sv_2mortal(newSVpvs(""));
Perl_sv_catpvf(aTHX_ multi_fold, "\\x{%"UVXf"}", value);
&& ((prevvalue >= 'a' && value <= 'z')
|| (prevvalue >= 'A' && value <= 'Z')))
{
- _invlist_intersection(this_range, PL_ASCII,
+ _invlist_intersection(this_range, PL_XPosix_ptrs[_CC_ASCII],
&this_range);
/* Since this above only contains ascii, the intersection of it
op = POSIXA;
}
}
+ else if (prevvalue == 'A') {
+ if (value == 'Z'
+#ifdef EBCDIC
+ && literal_endpoint == 2
+#endif
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_UPPER;
+ op = POSIXA;
+ }
+ }
+ else if (prevvalue == 'a') {
+ if (value == 'z'
+#ifdef EBCDIC
+ && literal_endpoint == 2
+#endif
+ ) {
+ arg = (FOLD) ? _CC_ALPHA : _CC_LOWER;
+ op = POSIXA;
+ }
+ }
}
/* Here, we have changed <op> away from its initial value iff we found
/* This is a hash that for a particular fold gives all
* characters that are involved in it */
if (! PL_utf8_foldclosures) {
-
- /* If the folds haven't been read in, call a fold function
- * to force that */
- if (! PL_utf8_tofold) {
- U8 dummy[UTF8_MAXBYTES_CASE+1];
-
- /* This string is just a short named one above \xff */
- to_utf8_fold((U8*) HYPHEN_UTF8, dummy, NULL);
- assert(PL_utf8_tofold); /* Verify that worked */
- }
- PL_utf8_foldclosures
- = _swash_inversion_hash(PL_utf8_tofold);
+ _load_PL_utf8_foldclosures();
}
}
if (j < 256) {
- /* We have the latin1 folding rules hard-coded here so
- * that an innocent-looking character class, like
- * /[ks]/i won't have to go out to disk to find the
- * possible matches. XXX It would be better to
- * generate these via regen, in case a new version of
- * the Unicode standard adds new mappings, though that
- * is not really likely, and may be caught by the
- * default: case of the switch below. */
-
if (IS_IN_SOME_FOLD_L1(j)) {
/* ASCII is always matched; non-ASCII is matched
}
}
- if (HAS_NONLATIN1_FOLD_CLOSURE(j)
+ if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(j)
&& (! isASCII(j) || ! ASCII_FOLD_RESTRICTED))
{
- /* Certain Latin1 characters have matches outside
- * Latin1. To get here, <j> is one of those
- * characters. None of these matches is valid for
- * ASCII characters under /aa, which is why the 'if'
- * just above excludes those. These matches only
- * happen when the target string is utf8. The code
- * below adds the single fold closures for <j> to the
- * inversion list. */
-
- switch (j) {
- case 'k':
- case 'K':
- *use_list =
- add_cp_to_invlist(*use_list, KELVIN_SIGN);
- break;
- case 's':
- case 'S':
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_SMALL_LETTER_LONG_S);
- break;
- case MICRO_SIGN:
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_CAPITAL_LETTER_MU);
- *use_list = add_cp_to_invlist(*use_list,
- GREEK_SMALL_LETTER_MU);
- break;
- case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
- case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
- *use_list =
- add_cp_to_invlist(*use_list, ANGSTROM_SIGN);
- break;
- case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
- break;
- case LATIN_SMALL_LETTER_SHARP_S:
- *use_list = add_cp_to_invlist(*use_list,
- LATIN_CAPITAL_LETTER_SHARP_S);
- break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character
- * folds from code points that require UTF8
- * to express, so they can't match unless
- * the target string is in UTF-8, so no
- * action here is necessary, as regexec.c
- * properly handles the general case for
- * UTF-8 matching and multi-char folds */
- break;
- default:
- /* Use deprecated warning to increase the
- * chances of this being output */
- ckWARN2reg_d(RExC_parse, "Perl folding rules are not up-to-date for 0x%"UVXf"; please use the perlbug utility to report;", j);
- break;
- }
+ add_above_Latin1_folds(pRExC_state,
+ (U8) j,
+ use_list);
}
continue;
}
for (k = 0; k <= av_tindex(list); k++) {
SV** c_p = av_fetch(list, k, FALSE);
UV c;
- if (c_p == NULL) {
- Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
- }
+ assert(c_p);
+
c = SvUV(*c_p);
/* /aa doesn't allow folds between ASCII and non- */
swash = NULL;
}
+ /* Note that the optimization of using 'swash' if it is the only thing in
+ * the class doesn't have us change swash at all, so it can include things
+ * that are also in the bitmap; otherwise we have purposely deleted that
+ * duplicate information */
set_ANYOF_arg(pRExC_state, ret, cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
? listsv : NULL,
}
}
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+SV *
+Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
+ const regnode* node,
+ bool doinit,
+ SV** listsvp,
+ SV** only_utf8_locale_ptr,
+ SV* exclude_list)
+
+{
+ /* For internal core use only.
+ * Returns the swash for the input 'node' in the regex 'prog'.
+ * If <doinit> is 'true', will attempt to create the swash if not already
+ * done.
+ * If <listsvp> is non-null, will return the printable contents of the
+ * swash. This can be used to get debugging information even before the
+ * swash exists, by calling this function with 'doinit' set to false, in
+ * which case the components that will be used to eventually create the
+ * swash are returned (in a printable form).
+ * If <exclude_list> is not NULL, it is an inversion list of things to
+ * exclude from what's returned in <listsvp>.
+ * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
+ * that, in spite of this function's name, the swash it returns may include
+ * the bitmap data as well */
+
+ SV *sw = NULL;
+ SV *si = NULL; /* Input swash initialization string */
+ SV* invlist = NULL;
+
+ RXi_GET_DECL(prog,progi);
+ const struct reg_data * const data = prog ? progi->data : NULL;
+
+ PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
+
+ assert(ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
+
+ if (data && data->count) {
+ const U32 n = ARG(node);
+
+ if (data->what[n] == 's') {
+ SV * const rv = MUTABLE_SV(data->data[n]);
+ AV * const av = MUTABLE_AV(SvRV(rv));
+ SV **const ary = AvARRAY(av);
+ U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+
+ si = *ary; /* ary[0] = the string to initialize the swash with */
+
+ /* Elements 3 and 4 are either both present or both absent. [3] is
+ * any inversion list generated at compile time; [4] indicates if
+ * that inversion list has any user-defined properties in it. */
+ if (av_tindex(av) >= 2) {
+ if (only_utf8_locale_ptr
+ && ary[2]
+ && ary[2] != &PL_sv_undef)
+ {
+ *only_utf8_locale_ptr = ary[2];
+ }
+ else {
+ assert(only_utf8_locale_ptr);
+ *only_utf8_locale_ptr = NULL;
+ }
+
+ if (av_tindex(av) >= 3) {
+ invlist = ary[3];
+ if (SvUV(ary[4])) {
+ swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
+ }
+ }
+ else {
+ invlist = NULL;
+ }
+ }
+
+ /* Element [1] is reserved for the set-up swash. If already there,
+ * return it; if not, create it and store it there */
+ if (ary[1] && SvROK(ary[1])) {
+ sw = ary[1];
+ }
+ else if (doinit && ((si && si != &PL_sv_undef)
+ || (invlist && invlist != &PL_sv_undef))) {
+ assert(si);
+ sw = _core_swash_init("utf8", /* the utf8 package */
+ "", /* nameless */
+ si,
+ 1, /* binary */
+ 0, /* not from tr/// */
+ invlist,
+ &swash_init_flags);
+ (void)av_store(av, 1, sw);
+ }
+ }
+ }
+
+ /* If requested, return a printable version of what this swash matches */
+ if (listsvp) {
+ SV* matches_string = newSVpvs("");
+
+ /* The swash should be used, if possible, to get the data, as it
+ * contains the resolved data. But this function can be called at
+ * compile-time, before everything gets resolved, in which case we
+ * return the currently best available information, which is the string
+ * that will eventually be used to do that resolving, 'si' */
+ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
+ && (si && si != &PL_sv_undef))
+ {
+ sv_catsv(matches_string, si);
+ }
+
+ /* Add the inversion list to whatever we have. This may have come from
+ * the swash, or from an input parameter */
+ if (invlist) {
+ if (exclude_list) {
+ SV* clone = invlist_clone(invlist);
+ _invlist_subtract(clone, exclude_list, &clone);
+ sv_catsv(matches_string, _invlist_contents(clone));
+ SvREFCNT_dec_NN(clone);
+ }
+ else {
+ sv_catsv(matches_string, _invlist_contents(invlist));
+ }
+ }
+ *listsvp = matches_string;
+ }
+
+ return sw;
+}
+#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
/* reg_skipcomment()
- Absorbs an /x style # comments from the input stream.
- Returns true if there is more text remaining in the stream.
- Will set the REG_RUN_ON_COMMENT_SEEN flag if the comment
- terminates the pattern without including a newline.
+ Absorbs an /x style # comment from the input stream,
+ returning a pointer to the first character beyond the comment, or if the
+ comment terminates the pattern without anything following it, this returns
+ one past the final character of the pattern (in other words, RExC_end) and
+ sets the REG_RUN_ON_COMMENT_SEEN flag.
- Note its the callers responsibility to ensure that we are
+ Note it's the callers responsibility to ensure that we are
actually in /x mode
*/
-STATIC bool
-S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
+PERL_STATIC_INLINE char*
+S_reg_skipcomment(RExC_state_t *pRExC_state, char* p)
{
- bool ended = 0;
-
PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
- while (RExC_parse < RExC_end)
- if (*RExC_parse++ == '\n') {
- ended = 1;
- break;
+ assert(*p == '#');
+
+ while (p < RExC_end) {
+ if (*(++p) == '\n') {
+ return p+1;
}
- if (!ended) {
- /* we ran off the end of the pattern without ending
- the comment, so we have to add an \n when wrapping */
- RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
- return 0;
- } else
- return 1;
+ }
+
+ /* we ran off the end of the pattern without ending the comment, so we have
+ * to add an \n when wrapping */
+ RExC_seen |= REG_RUN_ON_COMMENT_SEEN;
+ return p;
}
/* nextchar()
continue;
}
if (RExC_flags & RXf_PMf_EXTENDED) {
- if (isSPACE(*RExC_parse)) {
- RExC_parse++;
- continue;
- }
- else if (*RExC_parse == '#') {
- if ( reg_skipcomment( pRExC_state ) )
- continue;
- }
+ char * p = regpatws(pRExC_state, RExC_parse,
+ TRUE); /* means recognize comments */
+ if (p != RExC_parse) {
+ RExC_parse = p;
+ continue;
+ }
}
- return retval;
+ return retval;
}
}
STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
- dVAR;
regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
}
if (RExC_emit >= RExC_emit_bound)
Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
- op, RExC_emit, RExC_emit_bound);
+ op, (void*)RExC_emit, (void*)RExC_emit_bound);
NODE_ALIGN_FILL(ret);
ptr = ret;
STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
- dVAR;
regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
}
if (RExC_emit >= RExC_emit_bound)
Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
- op, RExC_emit, RExC_emit_bound);
+ op, (void*)RExC_emit, (void*)RExC_emit_bound);
NODE_ALIGN_FILL(ret);
ptr = ret;
PERL_STATIC_INLINE STRLEN
S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
{
- dVAR;
-
PERL_ARGS_ASSERT_REGUNI;
return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
- dVAR;
regnode *src;
regnode *dst;
regnode *place;
GET_RE_DEBUG_FLAGS_DECL;
PERL_ARGS_ASSERT_REGINSERT;
+ PERL_UNUSED_CONTEXT;
PERL_UNUSED_ARG(depth);
/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
const regnode *val,U32 depth)
{
- dVAR;
regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
const regnode *val,U32 depth)
{
- dVAR;
regnode *scan;
U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
- dVAR;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
{
#ifdef DEBUGGING
- dVAR;
int k;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
);
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
- (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
- ? ANYOF_BITMAP(o)
- : TRIE_BITMAP(trie));
+ (void) put_charclass_bitmap_innards(sv,
+ (IS_ANYOF_TRIE(op))
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie),
+ NULL);
sv_catpvs(sv, "]");
}
else if (k == ANYOF) {
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
+ SV* bitmap_invlist; /* Will hold what the bit map contains */
if (flags & ANYOF_LOCALE_FLAGS)
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
- /* output what the standard cp 0-255 bitmap matches */
- do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
+ /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
+ * */
+ do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
+ &bitmap_invlist);
/* output any special charclass tests (used entirely under use
* locale) * */
been output */
SV *only_utf8_locale;
- /* Get the stuff that wasn't in the bitmap */
+ /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
+ * is used to guarantee that nothing in the bitmap gets
+ * returned */
(void) _get_regclass_nonbitmap_data(prog, o, FALSE,
- &lv, &only_utf8_locale);
+ &lv, &only_utf8_locale,
+ bitmap_invlist);
if (lv && lv != &PL_sv_undef) {
char *s = savesvpv(lv);
char * const origs = s;
invlist_iterinit(only_utf8_locale);
while (invlist_iternext(only_utf8_locale,
&start, &end)) {
- put_range(sv, start, end);
+ put_range(sv, start, end, FALSE);
max_entries --;
if (max_entries < 0) {
sv_catpvs(sv, "...");
}
}
}
+ SvREFCNT_dec(bitmap_invlist);
+
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
- dVAR;
struct regexp *const prog = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
void
Perl_pregfree2(pTHX_ REGEXP *rx)
{
- dVAR;
struct regexp *const r = ReANY(rx);
GET_RE_DEBUG_FLAGS_DECL;
void
Perl_regfree_internal(pTHX_ REGEXP * const rx)
{
- dVAR;
struct regexp *const r = ReANY(rx);
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
Used in stclass optimization only */
U32 refcount;
reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
+#ifdef USE_ITHREADS
+ dVAR;
+#endif
OP_REFCNT_LOCK;
refcount = --aho->refcount;
OP_REFCNT_UNLOCK;
PerlMemShared_free(aho->fail);
/* do this last!!!! */
PerlMemShared_free(ri->data->data[n]);
- PerlMemShared_free(ri->regstclass);
+ /* we should only ever get called once, so
+ * assert as much, and also guard the free
+ * which /might/ happen twice. At the least
+ * it will make code anlyzers happy and it
+ * doesn't cost much. - Yves */
+ assert(ri->regstclass);
+ if (ri->regstclass) {
+ PerlMemShared_free(ri->regstclass);
+ ri->regstclass = 0;
+ }
}
}
break;
/* trie structure. */
U32 refcount;
reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
+#ifdef USE_ITHREADS
+ dVAR;
+#endif
OP_REFCNT_LOCK;
refcount = --trie->refcount;
OP_REFCNT_UNLOCK;
regnode *
Perl_regnext(pTHX_ regnode *p)
{
- dVAR;
I32 offset;
if (!p)
void
Perl_save_re_context(pTHX)
{
- dVAR;
-
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
if (PL_curpm) {
const REGEXP * const rx = PM_GETRE(PL_curpm);
#ifdef DEBUGGING
+/* Given that c is a control character, is it one for which we have a
+ * mnemonic? */
+#define isMNEMONIC_CNTRL(c) ((isSPACE_A(c) && (c) != '\v') \
+ || (c) == '\a' \
+ || (c) == '\b' \
+ || (c) == ESC_NATIVE)
+/* Certain characters are output as a sequence with the first being a
+ * backslash. */
+#define isBACKSLASHED_PUNCT(c) \
+ ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
if (!isPRINT(c)) {
switch (c) {
- case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+ case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+ case '\b': Perl_sv_catpvf(aTHX_ sv, "\\b"); break;
+ case ESC_NATIVE: Perl_sv_catpvf(aTHX_ sv, "\\e"); break;
+ case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+ case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
- case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
- case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
-
- default:
- Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
- break;
+ default: Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", c); break;
}
}
else {
const char string = c;
- if (c == '-' || c == ']' || c == '\\' || c == '^')
+ if (isBACKSLASHED_PUNCT(c))
sv_catpvs(sv, "\\");
sv_catpvn(sv, &string, 1);
}
}
+#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
STATIC void
-S_put_range(pTHX_ SV *sv, UV start, UV end)
+S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
{
-
/* Appends to 'sv' a displayable version of the range of code points from
- * 'start' to 'end' */
+ * 'start' to 'end'. It assumes that only ASCII printables are displayable
+ * as-is (though some of these will be escaped by put_byte()). */
+
+ const unsigned int min_range_count = 3;
assert(start <= end);
PERL_ARGS_ASSERT_PUT_RANGE;
- if (end - start < 3) { /* Individual chars in short ranges */
- for (; start <= end; start++)
- put_byte(sv, start);
- }
- else if ( end > 255
- || ! isALPHANUMERIC(start)
- || ! isALPHANUMERIC(end)
- || isDIGIT(start) != isDIGIT(end)
- || isUPPER(start) != isUPPER(end)
- || isLOWER(start) != isLOWER(end)
-
- /* This final test should get optimized out except on EBCDIC
- * platforms, where it causes ranges that cross discontinuities
- * like i/j to be shown as hex instead of the misleading,
- * e.g. H-K (since that range includes more than H, I, J, K).
+ while (start <= end) {
+ if (end - start < min_range_count) {
+
+ /* Individual chars in short ranges */
+ for (; start <= end; start++) {
+ put_byte(sv, start);
+ }
+ break;
+ }
+
+ /* If permitted by the input options, and there is a possibility that
+ * this range contains a printable literal, look to see if there is
+ * one. */
+ if (allow_literals && start <= MAX_PRINT_A) {
+
+ /* If the range begin isn't an ASCII printable, effectively split
+ * the range into two parts:
+ * 1) the portion before the first such printable,
+ * 2) the rest
+ * and output them separately. */
+ if (! isPRINT_A(start)) {
+ UV temp_end = start + 1;
+
+ /* There is no point looking beyond the final possible
+ * printable, in MAX_PRINT_A */
+ UV max = MIN(end, MAX_PRINT_A);
+
+ while (temp_end <= max && ! isPRINT_A(temp_end)) {
+ temp_end++;
+ }
+
+ /* Here, temp_end points to one beyond the first printable if
+ * found, or to one beyond 'max' if not. If none found, make
+ * sure that we use the entire range */
+ if (temp_end > MAX_PRINT_A) {
+ temp_end = end + 1;
+ }
+
+ /* Output the first part of the split range, the part that
+ * doesn't have printables, with no looking for literals
+ * (otherwise we would infinitely recurse) */
+ put_range(sv, start, temp_end - 1, FALSE);
+
+ /* The 2nd part of the range (if any) starts here. */
+ start = temp_end;
+
+ /* We continue instead of dropping down because even if the 2nd
+ * part is non-empty, it could be so short that we want to
+ * output it specially, as tested for at the top of this loop.
* */
- || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
- {
+ continue;
+ }
+
+ /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
+ * output a sub-range of just the digits or letters, then process
+ * the remaining portion as usual. */
+ if (isALPHANUMERIC_A(start)) {
+ UV mask = (isDIGIT_A(start))
+ ? _CC_DIGIT
+ : isUPPER_A(start)
+ ? _CC_UPPER
+ : _CC_LOWER;
+ UV temp_end = start + 1;
+
+ /* Find the end of the sub-range that includes just the
+ * characters in the same class as the first character in it */
+ while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
+ temp_end++;
+ }
+ temp_end--;
+
+ /* For short ranges, don't duplicate the code above to output
+ * them; just call recursively */
+ if (temp_end - start < min_range_count) {
+ put_range(sv, start, temp_end, FALSE);
+ }
+ else { /* Output as a range */
+ put_byte(sv, start);
+ sv_catpvs(sv, "-");
+ put_byte(sv, temp_end);
+ }
+ start = temp_end + 1;
+ continue;
+ }
+
+ /* We output any other printables as individual characters */
+ if (isPUNCT_A(start) || isSPACE_A(start)) {
+ while (start <= end && (isPUNCT_A(start)
+ || isSPACE_A(start)))
+ {
+ put_byte(sv, start);
+ start++;
+ }
+ continue;
+ }
+ } /* End of looking for literals */
+
+ /* Here is not to output as a literal. Some control characters have
+ * mnemonic names. Split off any of those at the beginning and end of
+ * the range to print mnemonically. It isn't possible for many of
+ * these to be in a row, so this won't overwhelm with output */
+ if (isMNEMONIC_CNTRL(start)) {
+ while (isMNEMONIC_CNTRL(start) && start <= end) {
+ put_byte(sv, start);
+ start++;
+ }
+ }
+ if (start < end && isMNEMONIC_CNTRL(end)) {
+
+ /* Here, the final character in the range has a mnemonic name.
+ * Work backwards from the end to find the final non-mnemonic */
+ UV temp_end = end - 1;
+ while (isMNEMONIC_CNTRL(temp_end)) {
+ temp_end--;
+ }
+
+ /* And separately output the range that doesn't have mnemonics */
+ put_range(sv, start, temp_end, FALSE);
+
+ /* Then output the mnemonic trailing controls */
+ start = temp_end + 1;
+ while (start <= end) {
+ put_byte(sv, start);
+ start++;
+ }
+ break;
+ }
+
+ /* As a final resort, output the range or subrange as hex. */
Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
start,
- (end < 256) ? end : 255);
- }
- else { /* Here, the ends of the range are both digits, or both uppercase,
- or both lowercase; and there's no discontinuity in the range
- (which could happen on EBCDIC platforms) */
- put_byte(sv, start);
- sv_catpvs(sv, "-");
- put_byte(sv, end);
+ (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1);
+ break;
}
}
STATIC bool
-S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
- * output anything */
+ * output anything, and bitmap_invlist, if not NULL, will point to an
+ * inversion list of what is in the bit map */
int i;
- bool has_output_anything = FALSE;
+ UV start, end;
+ unsigned int punct_count = 0;
+ SV* invlist = NULL;
+ SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
+ bool allow_literals = TRUE;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+ invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
+
+ /* Worst case is exactly every-other code point is in the list */
+ *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+
+ /* Convert the bit map to an inversion list, keeping track of how many
+ * ASCII puncts are set, including an extra amount for the backslashed
+ * ones. */
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST((U8 *) bitmap,i)) {
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
+ if (isPUNCT_A(i)) {
+ punct_count++;
+ if isBACKSLASHED_PUNCT(i) {
+ punct_count++;
+ }
+ }
+ }
+ }
- PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+ /* Nothing to output */
+ if (_invlist_len(*invlist_ptr) == 0) {
+ SvREFCNT_dec(invlist);
+ return FALSE;
+ }
- for (i = 0; i < 256; i++) {
- if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+ /* Generally, it is more readable if printable characters are output as
+ * literals, but if a range (nearly) spans all of them, it's best to output
+ * it as a single range. This code will use a single range if all but 2
+ * printables are in it */
+ invlist_iterinit(*invlist_ptr);
+ while (invlist_iternext(*invlist_ptr, &start, &end)) {
- /* The character at index i should be output. Find the next
- * character that should NOT be output */
- int j;
- for (j = i + 1; j <= 256; j++) {
- if (! BITMAP_TEST((U8 *) bitmap, j)) {
- break;
- }
- }
+ /* If range starts beyond final printable, it doesn't have any in it */
+ if (start > MAX_PRINT_A) {
+ break;
+ }
- /* Everything between them is a single range that should be output
- * */
- put_range(sv, i, j - 1);
- has_output_anything = TRUE;
- i = j;
+ /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
+ * all but two, the range must start and end no later than 2 from
+ * either end */
+ if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
+ if (end > MAX_PRINT_A) {
+ end = MAX_PRINT_A;
+ }
+ if (start < ' ') {
+ start = ' ';
+ }
+ if (end - start >= MAX_PRINT_A - ' ' - 2) {
+ allow_literals = FALSE;
+ }
+ break;
}
}
+ invlist_iterfinish(*invlist_ptr);
+
+ /* The legibility of the output depends mostly on how many punctuation
+ * characters are output. There are 32 possible ASCII ones, and some have
+ * an additional backslash, bringing it to currently 36, so if any more
+ * than 18 are to be output, we can instead output it as its complement,
+ * yielding fewer puncts, and making it more legible. But give some weight
+ * to the fact that outputting it as a complement is less legible than a
+ * straight output, so don't complement unless we are somewhat over the 18
+ * mark */
+ if (allow_literals && punct_count > 22) {
+ sv_catpvs(sv, "^");
+
+ /* Add everything remaining to the list, so when we invert it just
+ * below, it will be excluded */
+ *invlist_ptr = _add_range_to_invlist(*invlist_ptr,
+ NUM_ANYOF_CODE_POINTS, UV_MAX);
+ _invlist_invert(*invlist_ptr);
+ }
+
+ /* Here we have figured things out. Output each range */
+ invlist_iterinit(*invlist_ptr);
+ while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ break;
+ }
+ put_range(sv, start, end, allow_literals);
+ }
+ invlist_iterfinish(*invlist_ptr);
- return has_output_anything;
+ return TRUE;
}
#define CLEAR_OPTSTART \
const regnode *last, const regnode *plast,
SV* sv, I32 indent, U32 depth)
{
- dVAR;
U8 op = PSEUDO; /* Arbitrary non-END op. */
const regnode *next;
const regnode *optstart= NULL;