regnode_ssc *start_class;
} scan_data_t;
-/* The below is perhaps overboard, but this allows us to save a test at the
- * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
- * and 'a' differ by a single bit; the same with the upper and lower case of
- * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
- * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
- * then inverts it to form a mask, with just a single 0, in the bit position
- * where the upper- and lowercase differ. XXX There are about 40 other
- * instances in the Perl core where this micro-optimization could be used.
- * Should decide if maintenance cost is worse, before changing those
- *
- * Returns a boolean as to whether or not 'v' is either a lowercase or
- * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
- * compile-time constant, the generated code is better than some optimizing
- * compilers figure out, amounting to a mask and test. The results are
- * meaningless if 'c' is not one of [A-Za-z] */
-#define isARG2_lower_or_UPPER_ARG1(c, v) \
- (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
-
/*
* Forward declarations for pregcomp()'s friends.
*/
}
}
- /* An ANYOF node contains a bitmap for the first 256 code points, and an
- * inversion list for the others, but if there are code points that should
- * match only conditionally on the target string being UTF-8, those are
- * placed in the inversion list, and not the bitmap. Since there are
- * circumstances under which they could match, they are included in the
- * SSC. But if the ANYOF node is to be inverted, we have to exclude them
- * here, so that when we invert below, the end result actually does include
- * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
- * before we add the unconditionally matched code points */
+ /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
+ * code points, and an inversion list for the others, but if there are code
+ * points that should match only conditionally on the target string being
+ * UTF-8, those are placed in the inversion list, and not the bitmap.
+ * Since there are circumstances under which they could match, they are
+ * included in the SSC. But if the ANYOF node is to be inverted, we have
+ * to exclude them here, so that when we invert below, the end result
+ * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
+ * have to do this here before we add the unconditionally matched code
+ * points */
if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
_invlist_intersection_complement_2nd(invlist,
PL_UpperLatin1,
}
/* Add in the points from the bit map */
- for (i = 0; i < 256; i++) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
invlist = add_cp_to_invlist(invlist, i);
new_node_has_latin1 = TRUE;
{
/* The inversion list in the SSC is marked mortal; now we need a more
* permanent copy, which is stored the same way that is done in a regular
- * ANYOF node, with the first 256 code points in a bit map */
+ * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
+ * map */
SV* invlist = invlist_clone(ssc->invlist);
regnode *first, regnode *last, regnode *tail,
U32 word_count, U32 flags, U32 depth)
{
- dVAR;
/* first pass, loop through and scan words */
reg_trie_data *trie;
HV *widecharmap = NULL;
}
if (len == 2
- && isARG2_lower_or_UPPER_ARG1('s', *s)
- && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
+ && isALPHA_FOLD_EQ(*s, 's')
+ && isALPHA_FOLD_EQ(*(s+1), 's'))
{
/* EXACTF nodes need to know that the minimum length
/* recursed: which subroutines have we recursed into */
/* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
{
- dVAR;
/* There must be at least this number of characters to match */
SSize_t min = 0;
I32 pars = 0, code;
else if ( PL_regkind[OP(scan)] == BRANCHJ
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
- && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
+ && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
+ {
if ( OP(scan) == UNLESSM &&
scan->flags == 0 &&
OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
*/
ssc_init(pRExC_state, data->start_class);
} else {
- /* AND before and after: combine and continue */
+ /* AND before and after: combine and continue. These
+ * assertions are zero-length, so can match an EMPTY
+ * string */
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
}
}
if (f & SCF_DO_STCLASS_AND) {
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
void
Perl_reginitcolors(pTHX)
{
- dVAR;
const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
if (s) {
char *t = savepv(s);
regexp_engine const *
Perl_current_re_engine(pTHX)
{
- dVAR;
-
if (IN_PERL_COMPILETIME) {
HV * const table = GvHV(PL_hintgv);
SV **ptr;
REGEXP *
Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
{
- dVAR;
regexp_engine const *eng = current_re_engine();
GET_RE_DEBUG_FLAGS_DECL;
if (oplist) {
assert(oplist->op_type == OP_PADAV
|| oplist->op_type == OP_RV2AV);
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);
}
if (SvRMAGICAL(av)) {
pRExC_state->code_blocks[n].src_regex = NULL;
n++;
code = 1;
- oplist = oplist->op_sibling; /* skip CONST */
+ oplist = OP_SIBLING(oplist); /* skip CONST */
assert(oplist);
}
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);;
}
/* apply magic and QR overloading to arg */
OP *expr, const regexp_engine* eng, REGEXP *old_re,
bool *is_bare_re, U32 orig_rx_flags, U32 pm_flags)
{
- dVAR;
REGEXP *rx;
struct regexp *r;
regexp_internal *ri;
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
ncode++; /* count of DO blocks */
if (ncode) {
if (expr->op_type == OP_CONST)
n = 1;
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
n++;
}
if (expr->op_type == OP_CONST)
new_patternp[n] = cSVOPx_sv(expr);
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
new_patternp[n++] = cSVOPo_sv;
}
assert( expr->op_type == OP_PUSHMARK
|| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
|| expr->op_type == OP_PADRANGE);
- expr = expr->op_sibling;
+ expr = OP_SIBLING(expr);
}
pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
* RExC_parse beyond the '('. Things like '(?' are indivisible tokens, and
* this flag alerts us to the need to check for that */
{
- dVAR;
regnode *ret; /* Will be the head of the group. */
regnode *br;
regnode *lastbr;
else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ const char * endptr;
if (has_intervening_patws) {
RExC_parse++;
vFAIL("In '(?...)', the '(' and '?' must be adjacent");
case '5': case '6': case '7': case '8': case '9':
RExC_parse--;
parse_recursion:
- num = atoi(RExC_parse);
- parse_start = RExC_parse - 1; /* MJD */
- if (*RExC_parse == '-')
- RExC_parse++;
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ {
+ bool is_neg = FALSE;
+ parse_start = RExC_parse - 1; /* MJD */
+ if (*RExC_parse == '-') {
+ RExC_parse++;
+ is_neg = TRUE;
+ }
+ num = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
+ if (is_neg) {
+ /* Some limit for num? */
+ num = -num;
+ }
+ }
if (*RExC_parse!=')')
vFAIL("Expecting close bracket");
REGTAIL(pRExC_state, ret, tail);
goto insert_if;
}
+ /* Fall through to ‘Unknown switch condition’ at the
+ end of the if/else chain. */
}
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
RExC_parse++;
parno = 0;
if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
- parno = atoi(RExC_parse++);
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
} else if (RExC_parse[0] == '&') {
SV *sv_dat;
RExC_parse++;
/* (?(1)...) */
char c;
char *tmp;
- parno = atoi(RExC_parse++);
-
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
but I can't figure out why. -- dmq*/
return ret;
}
- else {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- vFAIL("Unknown switch condition (?(...))");
- }
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
return handle_regex_sets(pRExC_state, NULL, flagp, depth,
STATIC regnode *
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
{
- dVAR;
regnode *ret;
regnode *chain = NULL;
regnode *latest;
STATIC regnode *
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- dVAR;
regnode *ret;
char op;
char *next;
next++;
}
if (*next == '}') { /* got one */
+ const char* endptr;
if (!maxpos)
maxpos = next;
RExC_parse++;
- min = atoi(RExC_parse);
+ min = grok_atou(RExC_parse, &endptr);
if (*maxpos == ',')
maxpos++;
else
maxpos = RExC_parse;
- max = atoi(maxpos);
+ max = grok_atou(maxpos, &endptr);
if (!max && *maxpos != '0')
max = REG_INFTY; /* meaning "infinity" */
else if (max >= REG_INFTY)
EBCDIC, but it works there, as the extra invariants
fold to themselves) */
*character = toFOLD((U8) code_point);
- if (downgradable
- && *character == code_point
- && ! HAS_NONLATIN1_FOLD_CLOSURE(code_point))
- {
+
+ /* We can downgrade to an EXACT node if this character
+ * isn't a folding one. Note that this assumes that
+ * nothing above Latin1 folds to some other invariant than
+ * one of these alphabetics; otherwise we would also have
+ * to check:
+ * && (! HAS_NONLATIN1_FOLD_CLOSURE(code_point)
+ * || ASCII_FOLD_RESTRICTED))
+ */
+ if (downgradable && PL_fold[code_point] == code_point) {
OP(node) = EXACT;
}
}
}
-/* return atoi(p), unless it's too big to sensibly be a backref,
+/* Parse backref decimal value, unless it's too big to sensibly be a backref,
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
static I32
S_backref_value(char *p)
{
- char *q = p;
-
- for (;isDIGIT(*q); q++) {} /* calculate length of num */
- if (q - p == 0 || q - p > 9)
+ const char* endptr;
+ UV val = grok_atou(p, &endptr);
+ if (endptr == p || endptr == NULL || val > I32_MAX)
return I32_MAX;
- return atoi(p);
+ return (I32)val;
}
STATIC regnode *
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
{
- dVAR;
regnode *ret = NULL;
I32 flags = 0;
char *parse_start = RExC_parse;
ret = reg_node(pRExC_state, CANY);
RExC_seen |= REG_CANY_SEEN;
*flagp |= HASWIDTH|SIMPLE;
+ if (SIZE_ONLY) {
+ ckWARNdep(RExC_parse+1, "\\C is deprecated");
+ }
goto finish_meta_pat;
case 'X':
ret = reg_node(pRExC_state, CLUMP);
p++;
break;
case 'e':
- ender = ASCII_TO_NATIVE('\033');
+ ender = ESC_NATIVE;
p++;
break;
case 'a':
- ender = '\a';
+ ender = '\a';
p++;
break;
case 'o':
&& (PL_fold[ender] != PL_fold_latin1[ender]
|| ender == LATIN_SMALL_LETTER_SHARP_S
|| (len > 0
- && isARG2_lower_or_UPPER_ARG1('s', ender)
- && isARG2_lower_or_UPPER_ARG1('s',
- *(s-1)))))
+ && isALPHA_FOLD_EQ(ender, 's')
+ && isALPHA_FOLD_EQ(*(s-1), 's'))))
{
maybe_exactfu = FALSE;
}
* as if it turns into an EXACTFU, it could later get
* joined with another 's' that would then wrongly match
* the sharp s */
- if (maybe_exactfu && isARG2_lower_or_UPPER_ARG1('s', ender))
+ if (maybe_exactfu && isALPHA_FOLD_EQ(ender, 's'))
{
maybe_exactfu = FALSE;
}
}
/* Quit if are above what we should change */
- if (start > 255) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
break;
}
change_invlist = TRUE;
/* Set all the bits in the range, up to the max that we are doing */
- high = (end < 255) ? end : 255;
+ high = (end < NUM_ANYOF_CODE_POINTS - 1)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(node, i)) {
ANYOF_BITMAP_SET(node, i);
PERL_STATIC_INLINE I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value, const bool strict)
{
- dVAR;
I32 namedclass = OOB_NAMEDCLASS;
PERL_ARGS_ASSERT_REGPPOSIXCC;
* disk to find the possible matches.
*
* This should be called only for a Latin1-range code points, cp, which is
- * known to be involved in a fold with other code points above Latin1. It
- * would give false results if /aa has been specified. Multi-char folds
- * are outside the scope of this, and must be handled specially.
+ * known to be involved in a simple fold with other code points above
+ * Latin1. It would give false results if /aa has been specified.
+ * Multi-char folds are outside the scope of this, and must be handled
+ * specially.
*
* XXX It would be better to generate these via regen, in case a new
* version of the Unicode standard adds new mappings, though that is not
PERL_ARGS_ASSERT_ADD_ABOVE_LATIN1_FOLDS;
+ assert(HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(cp));
+
switch (cp) {
case 'k':
case 'K':
case LATIN_SMALL_LETTER_SHARP_S:
*invlist = add_cp_to_invlist(*invlist, LATIN_CAPITAL_LETTER_SHARP_S);
break;
- case 'F': case 'f':
- case 'I': case 'i':
- case 'L': case 'l':
- case 'T': case 't':
- case 'A': case 'a':
- case 'H': case 'h':
- case 'J': case 'j':
- case 'N': case 'n':
- case 'W': case 'w':
- case 'Y': case 'y':
- /* These all are targets of multi-character folds from code points
- * that require UTF8 to express, so they can't match unless the
- * target string is in UTF-8, so no action here is necessary, as
- * regexec.c properly handles the general case for UTF-8 matching
- * and multi-char folds */
- break;
default:
/* Use deprecated warning to increase the chances of this being
* output */
* ignored in the recursion by means of a flag:
* <RExC_in_multi_char_class>.)
*
- * ANYOF nodes contain a bit map for the first 256 characters, with the
- * corresponding bit set if that character is in the list. For characters
- * above 255, a range list or swash is used. There are extra bits for \w,
- * etc. in locale ANYOFs, as what these match is not determinable at
- * compile time
+ * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
+ * characters, with the corresponding bit set if that character is in the
+ * list. For characters above this, a range list or swash is used. There
+ * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
+ * determinable at compile time
*
* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
* to be restarted. This can only happen if ret_invlist is non-NULL.
*/
- dVAR;
UV prevvalue = OOB_UNICODE, save_prevvalue = OOB_UNICODE;
IV range = 0;
UV value = OOB_UNICODE, save_value = OOB_UNICODE;
{
namedclass = regpposixcc(pRExC_state, value, strict);
}
- else if (value == '\\') {
- if (UTF) {
+ else if (value != '\\') {
+#ifdef EBCDIC
+ literal_endpoint++;
+#endif
+ }
+ else {
+ /* Is a backslash; get the code point of the char after it */
+ if (UTF && ! UTF8_IS_INVARIANT(RExC_parse)) {
value = utf8n_to_uvchr((U8*)RExC_parse,
RExC_end - RExC_parse,
&numlen, UTF8_ALLOW_DEFAULT);
case 't': value = '\t'; break;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
- case 'e': value = ASCII_TO_NATIVE('\033');break;
+ case 'e': value = ESC_NATIVE; break;
case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
break;
} /* End of switch on char following backslash */
} /* end of handling backslash escape sequences */
-#ifdef EBCDIC
- else
- literal_endpoint++;
-#endif
/* Here, we have the current token in 'value' */
swash = NULL;
}
+ /* Note that the optimization of using 'swash' if it is the only thing in
+ * the class doesn't have us change swash at all, so it can include things
+ * that are also in the bitmap; otherwise we have purposely deleted that
+ * duplicate information */
set_ANYOF_arg(pRExC_state, ret, cp_list,
(HAS_NONLOCALE_RUNTIME_PROPERTY_DEFINITION)
? listsv : NULL,
}
}
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+SV *
+Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
+ const regnode* node,
+ bool doinit,
+ SV** listsvp,
+ SV** only_utf8_locale_ptr,
+ SV* exclude_list)
+
+{
+ /* For internal core use only.
+ * Returns the swash for the input 'node' in the regex 'prog'.
+ * If <doinit> is 'true', will attempt to create the swash if not already
+ * done.
+ * If <listsvp> is non-null, will return the printable contents of the
+ * swash. This can be used to get debugging information even before the
+ * swash exists, by calling this function with 'doinit' set to false, in
+ * which case the components that will be used to eventually create the
+ * swash are returned (in a printable form).
+ * If <exclude_list> is not NULL, it is an inversion list of things to
+ * exclude from what's returned in <listsvp>.
+ * Tied intimately to how S_set_ANYOF_arg sets up the data structure. Note
+ * that, in spite of this function's name, the swash it returns may include
+ * the bitmap data as well */
+
+ SV *sw = NULL;
+ SV *si = NULL; /* Input swash initialization string */
+ SV* invlist = NULL;
+
+ RXi_GET_DECL(prog,progi);
+ const struct reg_data * const data = prog ? progi->data : NULL;
+
+ PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
+
+ assert(ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
+
+ if (data && data->count) {
+ const U32 n = ARG(node);
+
+ if (data->what[n] == 's') {
+ SV * const rv = MUTABLE_SV(data->data[n]);
+ AV * const av = MUTABLE_AV(SvRV(rv));
+ SV **const ary = AvARRAY(av);
+ U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+
+ si = *ary; /* ary[0] = the string to initialize the swash with */
+
+ /* Elements 3 and 4 are either both present or both absent. [3] is
+ * any inversion list generated at compile time; [4] indicates if
+ * that inversion list has any user-defined properties in it. */
+ if (av_tindex(av) >= 2) {
+ if (only_utf8_locale_ptr
+ && ary[2]
+ && ary[2] != &PL_sv_undef)
+ {
+ *only_utf8_locale_ptr = ary[2];
+ }
+ else {
+ assert(only_utf8_locale_ptr);
+ *only_utf8_locale_ptr = NULL;
+ }
+
+ if (av_tindex(av) >= 3) {
+ invlist = ary[3];
+ if (SvUV(ary[4])) {
+ swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
+ }
+ }
+ else {
+ invlist = NULL;
+ }
+ }
+
+ /* Element [1] is reserved for the set-up swash. If already there,
+ * return it; if not, create it and store it there */
+ if (ary[1] && SvROK(ary[1])) {
+ sw = ary[1];
+ }
+ else if (doinit && ((si && si != &PL_sv_undef)
+ || (invlist && invlist != &PL_sv_undef))) {
+ assert(si);
+ sw = _core_swash_init("utf8", /* the utf8 package */
+ "", /* nameless */
+ si,
+ 1, /* binary */
+ 0, /* not from tr/// */
+ invlist,
+ &swash_init_flags);
+ (void)av_store(av, 1, sw);
+ }
+ }
+ }
+
+ /* If requested, return a printable version of what this swash matches */
+ if (listsvp) {
+ SV* matches_string = newSVpvs("");
+
+ /* The swash should be used, if possible, to get the data, as it
+ * contains the resolved data. But this function can be called at
+ * compile-time, before everything gets resolved, in which case we
+ * return the currently best available information, which is the string
+ * that will eventually be used to do that resolving, 'si' */
+ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
+ && (si && si != &PL_sv_undef))
+ {
+ sv_catsv(matches_string, si);
+ }
+
+ /* Add the inversion list to whatever we have. This may have come from
+ * the swash, or from an input parameter */
+ if (invlist) {
+ if (exclude_list) {
+ SV* clone = invlist_clone(invlist);
+ _invlist_subtract(clone, exclude_list, &clone);
+ sv_catsv(matches_string, _invlist_contents(clone));
+ SvREFCNT_dec_NN(clone);
+ }
+ else {
+ sv_catsv(matches_string, _invlist_contents(invlist));
+ }
+ }
+ *listsvp = matches_string;
+ }
+
+ return sw;
+}
+#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
/* reg_skipcomment()
{
PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
- assert(*p = '#');
+ assert(*p == '#');
while (p < RExC_end) {
if (*(++p) == '\n') {
STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
- dVAR;
regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
- dVAR;
regnode *ptr;
regnode * const ret = RExC_emit;
GET_RE_DEBUG_FLAGS_DECL;
PERL_STATIC_INLINE STRLEN
S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
{
- dVAR;
-
PERL_ARGS_ASSERT_REGUNI;
return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
{
- dVAR;
regnode *src;
regnode *dst;
regnode *place;
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p,
const regnode *val,U32 depth)
{
- dVAR;
regnode *scan;
GET_RE_DEBUG_FLAGS_DECL;
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
const regnode *val,U32 depth)
{
- dVAR;
regnode *scan;
U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
- dVAR;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
{
#ifdef DEBUGGING
- dVAR;
int k;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
);
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
- (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
- ? ANYOF_BITMAP(o)
- : TRIE_BITMAP(trie));
+ (void) put_charclass_bitmap_innards(sv,
+ (IS_ANYOF_TRIE(op))
+ ? ANYOF_BITMAP(o)
+ : TRIE_BITMAP(trie),
+ NULL);
sv_catpvs(sv, "]");
}
else if (k == ANYOF) {
const U8 flags = ANYOF_FLAGS(o);
int do_sep = 0;
+ SV* bitmap_invlist; /* Will hold what the bit map contains */
if (flags & ANYOF_LOCALE_FLAGS)
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
- /* output what the standard cp 0-255 bitmap matches */
- do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
+ /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
+ * */
+ do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o),
+ &bitmap_invlist);
/* output any special charclass tests (used entirely under use
* locale) * */
been output */
SV *only_utf8_locale;
- /* Get the stuff that wasn't in the bitmap */
+ /* Get the stuff that wasn't in the bitmap. 'bitmap_invlist'
+ * is used to guarantee that nothing in the bitmap gets
+ * returned */
(void) _get_regclass_nonbitmap_data(prog, o, FALSE,
- &lv, &only_utf8_locale);
+ &lv, &only_utf8_locale,
+ bitmap_invlist);
if (lv && lv != &PL_sv_undef) {
char *s = savesvpv(lv);
char * const origs = s;
invlist_iterinit(only_utf8_locale);
while (invlist_iternext(only_utf8_locale,
&start, &end)) {
- put_range(sv, start, end);
+ put_range(sv, start, end, FALSE);
max_entries --;
if (max_entries < 0) {
sv_catpvs(sv, "...");
}
}
}
+ SvREFCNT_dec(bitmap_invlist);
+
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
SV *
Perl_re_intuit_string(pTHX_ REGEXP * const r)
{ /* Assume that RE_INTUIT is set */
- dVAR;
struct regexp *const prog = ReANY(r);
GET_RE_DEBUG_FLAGS_DECL;
void
Perl_pregfree2(pTHX_ REGEXP *rx)
{
- dVAR;
struct regexp *const r = ReANY(rx);
GET_RE_DEBUG_FLAGS_DECL;
void
Perl_regfree_internal(pTHX_ REGEXP * const rx)
{
- dVAR;
struct regexp *const r = ReANY(rx);
RXi_GET_DECL(r,ri);
GET_RE_DEBUG_FLAGS_DECL;
Used in stclass optimization only */
U32 refcount;
reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
+#ifdef USE_ITHREADS
+ dVAR;
+#endif
OP_REFCNT_LOCK;
refcount = --aho->refcount;
OP_REFCNT_UNLOCK;
/* trie structure. */
U32 refcount;
reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
+#ifdef USE_ITHREADS
+ dVAR;
+#endif
OP_REFCNT_LOCK;
refcount = --trie->refcount;
OP_REFCNT_UNLOCK;
regnode *
Perl_regnext(pTHX_ regnode *p)
{
- dVAR;
I32 offset;
if (!p)
void
Perl_save_re_context(pTHX)
{
- dVAR;
-
/* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
if (PL_curpm) {
const REGEXP * const rx = PM_GETRE(PL_curpm);
#ifdef DEBUGGING
+/* Given that c is a control character, is it one for which we have a
+ * mnemonic? */
+#define isMNEMONIC_CNTRL(c) ((isSPACE_A(c) && (c) != '\v') \
+ || (c) == '\a' \
+ || (c) == '\b' \
+ || (c) == ESC_NATIVE)
+/* Certain characters are output as a sequence with the first being a
+ * backslash. */
+#define isBACKSLASHED_PUNCT(c) \
+ ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
+
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
if (!isPRINT(c)) {
switch (c) {
- case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+ case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+ case '\b': Perl_sv_catpvf(aTHX_ sv, "\\b"); break;
+ case ESC_NATIVE: Perl_sv_catpvf(aTHX_ sv, "\\e"); break;
+ case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+ case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
- case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
- case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
-
- default:
- Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
- break;
+ default: Perl_sv_catpvf(aTHX_ sv, "\\x{%02X}", c); break;
}
}
else {
const char string = c;
- if (c == '-' || c == ']' || c == '\\' || c == '^')
+ if (isBACKSLASHED_PUNCT(c))
sv_catpvs(sv, "\\");
sv_catpvn(sv, &string, 1);
}
}
+#define MAX_PRINT_A MAX_PRINT_A_FOR_USE_ONLY_BY_REGCOMP_DOT_C
+
+#ifndef MIN
+#define MIN(a,b) ((a) < (b) ? (a) : (b))
+#endif
+
STATIC void
-S_put_range(pTHX_ SV *sv, UV start, UV end)
+S_put_range(pTHX_ SV *sv, UV start, const UV end, const bool allow_literals)
{
-
/* Appends to 'sv' a displayable version of the range of code points from
- * 'start' to 'end' */
+ * 'start' to 'end'. It assumes that only ASCII printables are displayable
+ * as-is (though some of these will be escaped by put_byte()). */
+
+ const unsigned int min_range_count = 3;
assert(start <= end);
PERL_ARGS_ASSERT_PUT_RANGE;
- if (end - start < 3) { /* Individual chars in short ranges */
- for (; start <= end; start++)
- put_byte(sv, start);
- }
- else if ( end > 255
- || ! isALPHANUMERIC(start)
- || ! isALPHANUMERIC(end)
- || isDIGIT(start) != isDIGIT(end)
- || isUPPER(start) != isUPPER(end)
- || isLOWER(start) != isLOWER(end)
-
- /* This final test should get optimized out except on EBCDIC
- * platforms, where it causes ranges that cross discontinuities
- * like i/j to be shown as hex instead of the misleading,
- * e.g. H-K (since that range includes more than H, I, J, K).
+ while (start <= end) {
+ if (end - start < min_range_count) {
+
+ /* Individual chars in short ranges */
+ for (; start <= end; start++) {
+ put_byte(sv, start);
+ }
+ break;
+ }
+
+ /* If permitted by the input options, and there is a possibility that
+ * this range contains a printable literal, look to see if there is
+ * one. */
+ if (allow_literals && start <= MAX_PRINT_A) {
+
+ /* If the range begin isn't an ASCII printable, effectively split
+ * the range into two parts:
+ * 1) the portion before the first such printable,
+ * 2) the rest
+ * and output them separately. */
+ if (! isPRINT_A(start)) {
+ UV temp_end = start + 1;
+
+ /* There is no point looking beyond the final possible
+ * printable, in MAX_PRINT_A */
+ UV max = MIN(end, MAX_PRINT_A);
+
+ while (temp_end <= max && ! isPRINT_A(temp_end)) {
+ temp_end++;
+ }
+
+ /* Here, temp_end points to one beyond the first printable if
+ * found, or to one beyond 'max' if not. If none found, make
+ * sure that we use the entire range */
+ if (temp_end > MAX_PRINT_A) {
+ temp_end = end + 1;
+ }
+
+ /* Output the first part of the split range, the part that
+ * doesn't have printables, with no looking for literals
+ * (otherwise we would infinitely recurse) */
+ put_range(sv, start, temp_end - 1, FALSE);
+
+ /* The 2nd part of the range (if any) starts here. */
+ start = temp_end;
+
+ /* We continue instead of dropping down because even if the 2nd
+ * part is non-empty, it could be so short that we want to
+ * output it specially, as tested for at the top of this loop.
* */
- || (end - start) != NATIVE_TO_ASCII(end) - NATIVE_TO_ASCII(start))
- {
+ continue;
+ }
+
+ /* Here, 'start' is a printable ASCII. If it is an alphanumeric,
+ * output a sub-range of just the digits or letters, then process
+ * the remaining portion as usual. */
+ if (isALPHANUMERIC_A(start)) {
+ UV mask = (isDIGIT_A(start))
+ ? _CC_DIGIT
+ : isUPPER_A(start)
+ ? _CC_UPPER
+ : _CC_LOWER;
+ UV temp_end = start + 1;
+
+ /* Find the end of the sub-range that includes just the
+ * characters in the same class as the first character in it */
+ while (temp_end <= end && _generic_isCC_A(temp_end, mask)) {
+ temp_end++;
+ }
+ temp_end--;
+
+ /* For short ranges, don't duplicate the code above to output
+ * them; just call recursively */
+ if (temp_end - start < min_range_count) {
+ put_range(sv, start, temp_end, FALSE);
+ }
+ else { /* Output as a range */
+ put_byte(sv, start);
+ sv_catpvs(sv, "-");
+ put_byte(sv, temp_end);
+ }
+ start = temp_end + 1;
+ continue;
+ }
+
+ /* We output any other printables as individual characters */
+ if (isPUNCT_A(start) || isSPACE_A(start)) {
+ while (start <= end && (isPUNCT_A(start)
+ || isSPACE_A(start)))
+ {
+ put_byte(sv, start);
+ start++;
+ }
+ continue;
+ }
+ } /* End of looking for literals */
+
+ /* Here is not to output as a literal. Some control characters have
+ * mnemonic names. Split off any of those at the beginning and end of
+ * the range to print mnemonically. It isn't possible for many of
+ * these to be in a row, so this won't overwhelm with output */
+ if (isMNEMONIC_CNTRL(start)) {
+ while (isMNEMONIC_CNTRL(start) && start <= end) {
+ put_byte(sv, start);
+ start++;
+ }
+ }
+ if (start < end && isMNEMONIC_CNTRL(end)) {
+
+ /* Here, the final character in the range has a mnemonic name.
+ * Work backwards from the end to find the final non-mnemonic */
+ UV temp_end = end - 1;
+ while (isMNEMONIC_CNTRL(temp_end)) {
+ temp_end--;
+ }
+
+ /* And separately output the range that doesn't have mnemonics */
+ put_range(sv, start, temp_end, FALSE);
+
+ /* Then output the mnemonic trailing controls */
+ start = temp_end + 1;
+ while (start <= end) {
+ put_byte(sv, start);
+ start++;
+ }
+ break;
+ }
+
+ /* As a final resort, output the range or subrange as hex. */
Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
start,
- (end < 256) ? end : 255);
- }
- else { /* Here, the ends of the range are both digits, or both uppercase,
- or both lowercase; and there's no discontinuity in the range
- (which could happen on EBCDIC platforms) */
- put_byte(sv, start);
- sv_catpvs(sv, "-");
- put_byte(sv, end);
+ (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1);
+ break;
}
}
STATIC bool
-S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap, SV** bitmap_invlist)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
- * output anything */
+ * output anything, and bitmap_invlist, if not NULL, will point to an
+ * inversion list of what is in the bit map */
int i;
- bool has_output_anything = FALSE;
+ UV start, end;
+ unsigned int punct_count = 0;
+ SV* invlist = NULL;
+ SV** invlist_ptr; /* Temporary, in case bitmap_invlist is NULL */
+ bool allow_literals = TRUE;
+
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
+
+ invlist_ptr = (bitmap_invlist) ? bitmap_invlist : &invlist;
+
+ /* Worst case is exactly every-other code point is in the list */
+ *invlist_ptr = _new_invlist(NUM_ANYOF_CODE_POINTS / 2);
+
+ /* Convert the bit map to an inversion list, keeping track of how many
+ * ASCII puncts are set, including an extra amount for the backslashed
+ * ones. */
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST((U8 *) bitmap,i)) {
+ *invlist_ptr = add_cp_to_invlist(*invlist_ptr, i);
+ if (isPUNCT_A(i)) {
+ punct_count++;
+ if isBACKSLASHED_PUNCT(i) {
+ punct_count++;
+ }
+ }
+ }
+ }
- PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+ /* Nothing to output */
+ if (_invlist_len(*invlist_ptr) == 0) {
+ SvREFCNT_dec(invlist);
+ return FALSE;
+ }
- for (i = 0; i < 256; i++) {
- if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+ /* Generally, it is more readable if printable characters are output as
+ * literals, but if a range (nearly) spans all of them, it's best to output
+ * it as a single range. This code will use a single range if all but 2
+ * printables are in it */
+ invlist_iterinit(*invlist_ptr);
+ while (invlist_iternext(*invlist_ptr, &start, &end)) {
- /* The character at index i should be output. Find the next
- * character that should NOT be output */
- int j;
- for (j = i + 1; j <= 256; j++) {
- if (! BITMAP_TEST((U8 *) bitmap, j)) {
- break;
- }
- }
+ /* If range starts beyond final printable, it doesn't have any in it */
+ if (start > MAX_PRINT_A) {
+ break;
+ }
- /* Everything between them is a single range that should be output
- * */
- put_range(sv, i, j - 1);
- has_output_anything = TRUE;
- i = j;
+ /* In both ASCII and EBCDIC, a SPACE is the lowest printable. To span
+ * all but two, the range must start and end no later than 2 from
+ * either end */
+ if (start < ' ' + 2 && end > MAX_PRINT_A - 2) {
+ if (end > MAX_PRINT_A) {
+ end = MAX_PRINT_A;
+ }
+ if (start < ' ') {
+ start = ' ';
+ }
+ if (end - start >= MAX_PRINT_A - ' ' - 2) {
+ allow_literals = FALSE;
+ }
+ break;
}
}
+ invlist_iterfinish(*invlist_ptr);
+
+ /* The legibility of the output depends mostly on how many punctuation
+ * characters are output. There are 32 possible ASCII ones, and some have
+ * an additional backslash, bringing it to currently 36, so if any more
+ * than 18 are to be output, we can instead output it as its complement,
+ * yielding fewer puncts, and making it more legible. But give some weight
+ * to the fact that outputting it as a complement is less legible than a
+ * straight output, so don't complement unless we are somewhat over the 18
+ * mark */
+ if (allow_literals && punct_count > 22) {
+ sv_catpvs(sv, "^");
+
+ /* Add everything remaining to the list, so when we invert it just
+ * below, it will be excluded */
+ *invlist_ptr = _add_range_to_invlist(*invlist_ptr,
+ NUM_ANYOF_CODE_POINTS, UV_MAX);
+ _invlist_invert(*invlist_ptr);
+ }
+
+ /* Here we have figured things out. Output each range */
+ invlist_iterinit(*invlist_ptr);
+ while (invlist_iternext(*invlist_ptr, &start, &end)) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
+ break;
+ }
+ put_range(sv, start, end, allow_literals);
+ }
+ invlist_iterfinish(*invlist_ptr);
- return has_output_anything;
+ return TRUE;
}
#define CLEAR_OPTSTART \
const regnode *last, const regnode *plast,
SV* sv, I32 indent, U32 depth)
{
- dVAR;
U8 op = PSEUDO; /* Arbitrary non-END op. */
const regnode *next;
const regnode *optstart= NULL;