}
}
- /* An ANYOF node contains a bitmap for the first 256 code points, and an
- * inversion list for the others, but if there are code points that should
- * match only conditionally on the target string being UTF-8, those are
- * placed in the inversion list, and not the bitmap. Since there are
- * circumstances under which they could match, they are included in the
- * SSC. But if the ANYOF node is to be inverted, we have to exclude them
- * here, so that when we invert below, the end result actually does include
- * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
- * before we add the unconditionally matched code points */
+ /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
+ * code points, and an inversion list for the others, but if there are code
+ * points that should match only conditionally on the target string being
+ * UTF-8, those are placed in the inversion list, and not the bitmap.
+ * Since there are circumstances under which they could match, they are
+ * included in the SSC. But if the ANYOF node is to be inverted, we have
+ * to exclude them here, so that when we invert below, the end result
+ * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
+ * have to do this here before we add the unconditionally matched code
+ * points */
if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
_invlist_intersection_complement_2nd(invlist,
PL_UpperLatin1,
}
/* Add in the points from the bit map */
- for (i = 0; i < 256; i++) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
if (ANYOF_BITMAP_TEST(node, i)) {
invlist = add_cp_to_invlist(invlist, i);
new_node_has_latin1 = TRUE;
{
/* The inversion list in the SSC is marked mortal; now we need a more
* permanent copy, which is stored the same way that is done in a regular
- * ANYOF node, with the first 256 code points in a bit map */
+ * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
+ * map */
SV* invlist = invlist_clone(ssc->invlist);
else if ( PL_regkind[OP(scan)] == BRANCHJ
/* Lookbehind, or need to calculate parens/evals/stclass: */
&& (scan->flags || data || (flags & SCF_DO_STCLASS))
- && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
+ && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
+ {
if ( OP(scan) == UNLESSM &&
scan->flags == 0 &&
OP(NEXTOPER(NEXTOPER(scan))) == NOTHING &&
*/
ssc_init(pRExC_state, data->start_class);
} else {
- /* AND before and after: combine and continue */
+ /* AND before and after: combine and continue. These
+ * assertions are zero-length, so can match an EMPTY
+ * string */
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
}
}
if (f & SCF_DO_STCLASS_AND) {
ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
+ ANYOF_FLAGS(data->start_class) |= ANYOF_EMPTY_STRING;
}
if (data) {
if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
if (oplist) {
assert(oplist->op_type == OP_PADAV
|| oplist->op_type == OP_RV2AV);
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);
}
if (SvRMAGICAL(av)) {
pRExC_state->code_blocks[n].src_regex = NULL;
n++;
code = 1;
- oplist = oplist->op_sibling; /* skip CONST */
+ oplist = OP_SIBLING(oplist); /* skip CONST */
assert(oplist);
}
- oplist = oplist->op_sibling;;
+ oplist = OP_SIBLING(oplist);;
}
/* apply magic and QR overloading to arg */
OP *o;
int ncode = 0;
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling)
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o))
if (o->op_type == OP_NULL && (o->op_flags & OPf_SPECIAL))
ncode++; /* count of DO blocks */
if (ncode) {
if (expr->op_type == OP_CONST)
n = 1;
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
n++;
}
if (expr->op_type == OP_CONST)
new_patternp[n] = cSVOPx_sv(expr);
else
- for (o = cLISTOPx(expr)->op_first; o; o = o->op_sibling) {
+ for (o = cLISTOPx(expr)->op_first; o; o = OP_SIBLING(o)) {
if (o->op_type == OP_CONST)
new_patternp[n++] = cSVOPo_sv;
}
assert( expr->op_type == OP_PUSHMARK
|| (expr->op_type == OP_NULL && expr->op_targ == OP_PUSHMARK)
|| expr->op_type == OP_PADRANGE);
- expr = expr->op_sibling;
+ expr = OP_SIBLING(expr);
}
pat = S_concat_pat(aTHX_ pRExC_state, NULL, new_patternp, pat_count,
else if (*RExC_parse == '?') { /* (?...) */
bool is_logical = 0;
const char * const seqstart = RExC_parse;
+ const char * endptr;
if (has_intervening_patws) {
RExC_parse++;
vFAIL("In '(?...)', the '(' and '?' must be adjacent");
case '5': case '6': case '7': case '8': case '9':
RExC_parse--;
parse_recursion:
- num = atoi(RExC_parse);
- parse_start = RExC_parse - 1; /* MJD */
- if (*RExC_parse == '-')
- RExC_parse++;
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ {
+ bool is_neg = FALSE;
+ parse_start = RExC_parse - 1; /* MJD */
+ if (*RExC_parse == '-') {
+ RExC_parse++;
+ is_neg = TRUE;
+ }
+ num = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
+ if (is_neg) {
+ /* Some limit for num? */
+ num = -num;
+ }
+ }
if (*RExC_parse!=')')
vFAIL("Expecting close bracket");
REGTAIL(pRExC_state, ret, tail);
goto insert_if;
}
+ /* Fall through to ‘Unknown switch condition’ at the
+ end of the if/else chain. */
}
else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
|| RExC_parse[0] == '\'' ) /* (?('NAME')...) */
RExC_parse++;
parno = 0;
if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
- parno = atoi(RExC_parse++);
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
} else if (RExC_parse[0] == '&') {
SV *sv_dat;
RExC_parse++;
/* (?(1)...) */
char c;
char *tmp;
- parno = atoi(RExC_parse++);
-
- while (isDIGIT(*RExC_parse))
- RExC_parse++;
+ parno = grok_atou(RExC_parse, &endptr);
+ if (endptr)
+ RExC_parse = (char*)endptr;
ret = reganode(pRExC_state, GROUPP, parno);
insert_if_check_paren:
but I can't figure out why. -- dmq*/
return ret;
}
- else {
- RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
- vFAIL("Unknown switch condition (?(...))");
- }
+ RExC_parse += UTF ? UTF8SKIP(RExC_parse) : 1;
+ vFAIL("Unknown switch condition (?(...))");
}
case '[': /* (?[ ... ]) */
return handle_regex_sets(pRExC_state, NULL, flagp, depth,
next++;
}
if (*next == '}') { /* got one */
+ const char* endptr;
if (!maxpos)
maxpos = next;
RExC_parse++;
- min = atoi(RExC_parse);
+ min = grok_atou(RExC_parse, &endptr);
if (*maxpos == ',')
maxpos++;
else
maxpos = RExC_parse;
- max = atoi(maxpos);
+ max = grok_atou(maxpos, &endptr);
if (!max && *maxpos != '0')
max = REG_INFTY; /* meaning "infinity" */
else if (max >= REG_INFTY)
}
-/* return atoi(p), unless it's too big to sensibly be a backref,
+/* Parse backref decimal value, unless it's too big to sensibly be a backref,
* in which case return I32_MAX (rather than possibly 32-bit wrapping) */
static I32
S_backref_value(char *p)
{
- char *q = p;
-
- for (;isDIGIT(*q); q++) {} /* calculate length of num */
- if (q - p == 0 || q - p > 9)
+ const char* endptr;
+ UV val = grok_atou(p, &endptr);
+ if (endptr == p || endptr == NULL || val > I32_MAX)
return I32_MAX;
- return atoi(p);
+ return (I32)val;
}
p++;
break;
case 'e':
- ender = ASCII_TO_NATIVE('\033');
+ ender = ESC_NATIVE;
p++;
break;
case 'a':
}
/* Quit if are above what we should change */
- if (start > 255) {
+ if (start >= NUM_ANYOF_CODE_POINTS) {
break;
}
change_invlist = TRUE;
/* Set all the bits in the range, up to the max that we are doing */
- high = (end < 255) ? end : 255;
+ high = (end < NUM_ANYOF_CODE_POINTS - 1)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1;
for (i = start; i <= (int) high; i++) {
if (! ANYOF_BITMAP_TEST(node, i)) {
ANYOF_BITMAP_SET(node, i);
* ignored in the recursion by means of a flag:
* <RExC_in_multi_char_class>.)
*
- * ANYOF nodes contain a bit map for the first 256 characters, with the
- * corresponding bit set if that character is in the list. For characters
- * above 255, a range list or swash is used. There are extra bits for \w,
- * etc. in locale ANYOFs, as what these match is not determinable at
- * compile time
+ * ANYOF nodes contain a bit map for the first NUM_ANYOF_CODE_POINTS
+ * characters, with the corresponding bit set if that character is in the
+ * list. For characters above this, a range list or swash is used. There
+ * are extra bits for \w, etc. in locale ANYOFs, as what these match is not
+ * determinable at compile time
*
* Returns NULL, setting *flagp to RESTART_UTF8 if the sizing scan needs
* to be restarted. This can only happen if ret_invlist is non-NULL.
case 't': value = '\t'; break;
case 'f': value = '\f'; break;
case 'b': value = '\b'; break;
- case 'e': value = ASCII_TO_NATIVE('\033');break;
+ case 'e': value = ESC_NATIVE; break;
case 'a': value = '\a'; break;
case 'o':
RExC_parse--; /* function expects to be pointed at the 'o' */
}
}
+#if !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION)
+
+SV *
+Perl__get_regclass_nonbitmap_data(pTHX_ const regexp *prog,
+ const regnode* node,
+ bool doinit,
+ SV** listsvp,
+ SV** only_utf8_locale_ptr)
+{
+ /* For internal core use only.
+ * Returns the swash for the input 'node' in the regex 'prog'.
+ * If <doinit> is 'true', will attempt to create the swash if not already
+ * done.
+ * If <listsvp> is non-null, will return the printable contents of the
+ * swash. This can be used to get debugging information even before the
+ * swash exists, by calling this function with 'doinit' set to false, in
+ * which case the components that will be used to eventually create the
+ * swash are returned (in a printable form).
+ * Tied intimately to how S_set_ANYOF_arg sets up the data structure */
+
+ SV *sw = NULL;
+ SV *si = NULL; /* Input swash initialization string */
+ SV* invlist = NULL;
+
+ RXi_GET_DECL(prog,progi);
+ const struct reg_data * const data = prog ? progi->data : NULL;
+
+ PERL_ARGS_ASSERT__GET_REGCLASS_NONBITMAP_DATA;
+
+ assert(ANYOF_FLAGS(node)
+ & (ANYOF_UTF8|ANYOF_NONBITMAP_NON_UTF8|ANYOF_LOC_FOLD));
+
+ if (data && data->count) {
+ const U32 n = ARG(node);
+
+ if (data->what[n] == 's') {
+ SV * const rv = MUTABLE_SV(data->data[n]);
+ AV * const av = MUTABLE_AV(SvRV(rv));
+ SV **const ary = AvARRAY(av);
+ U8 swash_init_flags = _CORE_SWASH_INIT_ACCEPT_INVLIST;
+
+ si = *ary; /* ary[0] = the string to initialize the swash with */
+
+ /* Elements 3 and 4 are either both present or both absent. [3] is
+ * any inversion list generated at compile time; [4] indicates if
+ * that inversion list has any user-defined properties in it. */
+ if (av_tindex(av) >= 2) {
+ if (only_utf8_locale_ptr
+ && ary[2]
+ && ary[2] != &PL_sv_undef)
+ {
+ *only_utf8_locale_ptr = ary[2];
+ }
+ else {
+ assert(only_utf8_locale_ptr);
+ *only_utf8_locale_ptr = NULL;
+ }
+
+ if (av_tindex(av) >= 3) {
+ invlist = ary[3];
+ if (SvUV(ary[4])) {
+ swash_init_flags |= _CORE_SWASH_INIT_USER_DEFINED_PROPERTY;
+ }
+ }
+ else {
+ invlist = NULL;
+ }
+ }
+
+ /* Element [1] is reserved for the set-up swash. If already there,
+ * return it; if not, create it and store it there */
+ if (ary[1] && SvROK(ary[1])) {
+ sw = ary[1];
+ }
+ else if (doinit && ((si && si != &PL_sv_undef)
+ || (invlist && invlist != &PL_sv_undef))) {
+ assert(si);
+ sw = _core_swash_init("utf8", /* the utf8 package */
+ "", /* nameless */
+ si,
+ 1, /* binary */
+ 0, /* not from tr/// */
+ invlist,
+ &swash_init_flags);
+ (void)av_store(av, 1, sw);
+ }
+ }
+ }
+
+ /* If requested, return a printable version of what this swash matches */
+ if (listsvp) {
+ SV* matches_string = newSVpvs("");
+
+ /* The swash should be used, if possible, to get the data, as it
+ * contains the resolved data. But this function can be called at
+ * compile-time, before everything gets resolved, in which case we
+ * return the currently best available information, which is the string
+ * that will eventually be used to do that resolving, 'si' */
+ if ((! sw || (invlist = _get_swash_invlist(sw)) == NULL)
+ && (si && si != &PL_sv_undef))
+ {
+ sv_catsv(matches_string, si);
+ }
+
+ /* Add the inversion list to whatever we have. This may have come from
+ * the swash, or from an input parameter */
+ if (invlist) {
+ sv_catsv(matches_string, _invlist_contents(invlist));
+ }
+ *listsvp = matches_string;
+ }
+
+ return sw;
+}
+
+#endif /* !defined(PERL_IN_XSUB_RE) || defined(PLUGGABLE_RE_EXTENSION) */
/* reg_skipcomment()
S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p,
const regnode *val,U32 depth)
{
- dVAR;
regnode *scan;
U8 exact = PSEUDO;
#ifdef EXPERIMENTAL_INPLACESCAN
Perl_regdump(pTHX_ const regexp *r)
{
#ifdef DEBUGGING
- dVAR;
SV * const sv = sv_newmortal();
SV *dsv= sv_newmortal();
RXi_GET_DECL(r,ri);
Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o, const regmatch_info *reginfo)
{
#ifdef DEBUGGING
- dVAR;
int k;
/* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
);
if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
sv_catpvs(sv, "[");
- (void) put_latin1_charclass_innards(sv, IS_ANYOF_TRIE(op)
+ (void) put_charclass_bitmap_innards(sv, IS_ANYOF_TRIE(op)
? ANYOF_BITMAP(o)
: TRIE_BITMAP(trie));
sv_catpvs(sv, "]");
if (flags & ANYOF_INVERT)
sv_catpvs(sv, "^");
- /* output what the standard cp 0-255 bitmap matches */
- do_sep = put_latin1_charclass_innards(sv, ANYOF_BITMAP(o));
+ /* output what the standard cp 0-NUM_ANYOF_CODE_POINTS-1 bitmap matches
+ * */
+ do_sep = put_charclass_bitmap_innards(sv, ANYOF_BITMAP(o));
/* output any special charclass tests (used entirely under use
* locale) * */
if (!isPRINT(c)) {
switch (c) {
- case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
+ case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
+ case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
case '\n': Perl_sv_catpvf(aTHX_ sv, "\\n"); break;
+ case '\r': Perl_sv_catpvf(aTHX_ sv, "\\r"); break;
case '\t': Perl_sv_catpvf(aTHX_ sv, "\\t"); break;
- case '\f': Perl_sv_catpvf(aTHX_ sv, "\\f"); break;
- case '\a': Perl_sv_catpvf(aTHX_ sv, "\\a"); break;
default:
Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
/* Appends to 'sv' a displayable version of the range of code points from
* 'start' to 'end'. It assumes that only ASCII printables are displayable
- * as-is (though some of these will be escaped by put_byte()). For the
- * time being, this subroutine only works for latin1 (< 256) code points */
+ * as-is (though some of these will be escaped by put_byte()). */
assert(start <= end);
* hex. */
Perl_sv_catpvf(aTHX_ sv, "\\x{%02" UVXf "}-\\x{%02" UVXf "}",
start,
- (end < 256) ? end : 255);
+ (end < NUM_ANYOF_CODE_POINTS)
+ ? end
+ : NUM_ANYOF_CODE_POINTS - 1);
break;
}
}
STATIC bool
-S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
+S_put_charclass_bitmap_innards(pTHX_ SV *sv, char *bitmap)
{
/* Appends to 'sv' a displayable version of the innards of the bracketed
* character class whose bitmap is 'bitmap'; Returns 'TRUE' if it actually
int i;
bool has_output_anything = FALSE;
- PERL_ARGS_ASSERT_PUT_LATIN1_CHARCLASS_INNARDS;
+ PERL_ARGS_ASSERT_PUT_CHARCLASS_BITMAP_INNARDS;
- for (i = 0; i < 256; i++) {
- if (i < 256 && BITMAP_TEST((U8 *) bitmap,i)) {
+ for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
+ if (BITMAP_TEST((U8 *) bitmap,i)) {
/* The character at index i should be output. Find the next
* character that should NOT be output */
int j;
- for (j = i + 1; j <= 256; j++) {
+ for (j = i + 1; j < NUM_ANYOF_CODE_POINTS; j++) {
if (! BITMAP_TEST((U8 *) bitmap, j)) {
break;
}
const regnode *last, const regnode *plast,
SV* sv, I32 indent, U32 depth)
{
- dVAR;
U8 op = PSEUDO; /* Arbitrary non-END op. */
const regnode *next;
const regnode *optstart= NULL;