*
**** Alterations to Henry's code are...
****
- **** Copyright (c) 1991-2000, Larry Wall
+ **** Copyright (c) 1991-2001, Larry Wall
****
**** You may distribute under the terms of either the GNU General Public
**** License or the Artistic License, as specified in the README file.
char *end; /* End of input for compile */
char *parse; /* Input-scan pointer. */
I32 whilem_seen; /* number of WHILEM in this expr */
- regnode *emit; /* Code-emit pointer; ®dummy = don't */
+ regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */
I32 naughty; /* How bad is this pattern? */
I32 sawback; /* Did we see \1, ...? */
U32 seen;
I32 extralen;
I32 seen_zerolen;
I32 seen_evals;
+ I32 utf8;
#if ADD_TO_REGEXEC
char *starttry; /* -Dr: where regtry was called. */
#define RExC_starttry (pRExC_state->starttry)
#define RExC_extralen (pRExC_state->extralen)
#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
#define RExC_seen_evals (pRExC_state->seen_evals)
+#define RExC_utf8 (pRExC_state->utf8)
#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
#define SCF_WHILEM_VISITED_POS 0x2000
-#define RF_utf8 8
-#define UTF (PL_reg_flags & RF_utf8)
+#define UTF RExC_utf8
#define LOC (RExC_flags16 & PMf_LOCALE)
#define FOLD (RExC_flags16 & PMf_FOLD)
-#define OOB_CHAR8 1234
-#define OOB_UTF8 123456
+#define OOB_UNICODE 12345678
#define OOB_NAMEDCLASS -1
#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
STATIC void
S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data)
{
- dTHR;
STRLEN l = CHR_SVLEN(data->last_found);
STRLEN old_l = CHR_SVLEN(*data->longest);
ANYOF_CLASS_ZERO(cl);
for (value = 0; value < 256; ++value)
ANYOF_BITMAP_SET(cl, value);
- cl->flags = ANYOF_EOS;
+ cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
if (LOC)
cl->flags |= ANYOF_LOCALE;
}
for (value = 0; value <= ANYOF_MAX; value += 2)
if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
return 1;
+ if (!(cl->flags & ANYOF_UNICODE_ALL))
+ return 0;
for (value = 0; value < 256; ++value)
if (!ANYOF_BITMAP_TEST(cl, value))
return 0;
} /* XXXX: logic is complicated otherwise, leave it along for a moment. */
if (!(and_with->flags & ANYOF_EOS))
cl->flags &= ~ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) {
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ cl->flags |= ANYOF_UNICODE;
+ ARG_SET(cl, ARG(and_with));
+ }
+ if (!(and_with->flags & ANYOF_UNICODE_ALL))
+ cl->flags &= ~ANYOF_UNICODE_ALL;
+ if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)))
+ cl->flags &= ~ANYOF_UNICODE;
}
/* 'OR' a given class with another one. Can create false positives */
}
if (or_with->flags & ANYOF_EOS)
cl->flags |= ANYOF_EOS;
+
+ if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
+ ARG(cl) != ARG(or_with)) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
+ if (or_with->flags & ANYOF_UNICODE_ALL) {
+ cl->flags |= ANYOF_UNICODE_ALL;
+ cl->flags &= ~ANYOF_UNICODE;
+ }
}
/* REx optimizer. Converts nodes into quickier variants "in place".
/* deltap: Write maxlen-minlen here. */
/* last: Stop before this one. */
{
- dTHR;
I32 min = 0, pars = 0, code;
regnode *scan = *scanp, *next;
I32 delta = 0;
}
else if (OP(scan) == EXACT) {
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8*)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uv_simple(s, NULL);
}
min += l;
if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
+ if (uc >= 0x100 ||
+ !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
&& (!(data->start_class->flags & ANYOF_FOLD)
- || !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)])))
+ || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat)
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
}
else if (flags & SCF_DO_STCLASS_OR) {
/* false positive possible if the class is case-folded */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
cl_and(data->start_class, &and_with);
}
}
else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */
I32 l = STR_LEN(scan);
+ UV uc = *((U8*)STRING(scan));
/* Search for fixed substrings supports EXACT only. */
if (flags & SCF_DO_SUBSTR)
scan_commit(pRExC_state, data);
if (UTF) {
- unsigned char *s = (unsigned char *)STRING(scan);
- unsigned char *e = s + l;
- I32 newl = 0;
- while (s < e) {
- newl++;
- s += UTF8SKIP(s);
- }
- l = newl;
+ U8 *s = (U8 *)STRING(scan);
+ l = utf8_length(s, s + l);
+ uc = utf8_to_uv_simple(s, NULL);
}
min += l;
if (data && (flags & SCF_DO_SUBSTR))
/* Check whether it is compatible with what we know already! */
int compat = 1;
- if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
- && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan))
- && !ANYOF_BITMAP_TEST(data->start_class,
- PL_fold[*(U8*)STRING(scan)]))
+ if (uc >= 0x100 ||
+ !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
+ && !ANYOF_BITMAP_TEST(data->start_class, uc)
+ && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))
compat = 0;
ANYOF_CLASS_ZERO(data->start_class);
ANYOF_BITMAP_ZERO(data->start_class);
if (compat) {
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
data->start_class->flags |= ANYOF_FOLD;
if (OP(scan) == EXACTFL)
if (data->start_class->flags & ANYOF_FOLD) {
/* false positive possible if the class is case-folded.
Assume that the locale settings are the same... */
- ANYOF_BITMAP_SET(data->start_class, *STRING(scan));
+ if (uc < 0x100)
+ ANYOF_BITMAP_SET(data->start_class, uc);
data->start_class->flags &= ~ANYOF_EOS;
}
cl_and(data->start_class, &and_with);
break;
}
}
- else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) {
+ else if (strchr((char*)PL_simple,OP(scan))) {
int value;
if (flags & SCF_DO_SUBSTR) {
/* Some of the logic below assumes that switching
locale on will only add false positives. */
switch (PL_regkind[(U8)OP(scan)]) {
- case ANYUTF8:
case SANY:
- case SANYUTF8:
- case ALNUMUTF8:
- case ANYOFUTF8:
- case ALNUMLUTF8:
- case NALNUMUTF8:
- case NALNUMLUTF8:
- case SPACEUTF8:
- case NSPACEUTF8:
- case SPACELUTF8:
- case NSPACELUTF8:
- case DIGITUTF8:
- case NDIGITUTF8:
default:
do_default:
/* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
STATIC I32
S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s)
{
- dTHR;
if (RExC_rx->data) {
Renewc(RExC_rx->data,
sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1),
void
Perl_reginitcolors(pTHX)
{
- dTHR;
int i = 0;
char *s = PerlEnv_getenv("PERL_RE_COLORS");
regexp *
Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
{
- dTHR;
register regexp *r;
regnode *scan;
regnode *first;
FAIL("NULL regexp argument");
/* XXXX This looks very suspicious... */
- if (pm->op_pmdynflags & PMdf_UTF8) {
- PL_reg_flags |= RF_utf8;
- }
+ if (pm->op_pmdynflags & PMdf_CMP_UTF8)
+ RExC_utf8 = 1;
else
- PL_reg_flags = 0;
+ RExC_utf8 = 0;
RExC_precomp = savepvn(exp, xend - exp);
DEBUG_r(if (!PL_colorset) reginitcolors());
/* Starting-point info. */
again:
if (PL_regkind[(U8)OP(first)] == EXACT) {
- if (OP(first) == EXACT); /* Empty, get anchored substr later. */
- else if ((OP(first) == EXACTF || OP(first) == EXACTFL)
- && !UTF)
+ if (OP(first) == EXACT)
+ ; /* Empty, get anchored substr later. */
+ else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
r->regstclass = first;
}
else if (strchr((char*)PL_simple,OP(first)))
/* turn .* into ^.* with an implied $*=1 */
int type = OP(NEXTOPER(first));
- if (type == REG_ANY || type == ANYUTF8)
+ if (type == REG_ANY)
type = ROPT_ANCH_MBOL;
else
type = ROPT_ANCH_SBOL;
longest_fixed_length = 0;
}
if (r->regstclass
- && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8
- || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY))
+ && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY))
r->regstclass = NULL;
if ((!r->anchored_substr || r->anchored_offset) && stclass_flag
&& !(data.start_class->flags & ANYOF_EOS)
struct regnode_charclass_class);
r->regstclass = (regnode*)RExC_rx->data->data[n];
r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */
+ PL_regdata = r->data; /* for regprop() */
DEBUG_r((sv = sv_newmortal(),
regprop(sv, (regnode*)data.start_class),
PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n",
r->reganch |= ROPT_EVAL_SEEN;
Newz(1002, r->startp, RExC_npar, I32);
Newz(1002, r->endp, RExC_npar, I32);
+ PL_regdata = r->data; /* for regprop() */
DEBUG_r(regdump(r));
return(r);
}
S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp)
/* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
{
- dTHR;
register regnode *ret; /* Will be the head of the group. */
register regnode *br;
register regnode *lastbr;
/* FALL THROUGH */
case '{':
{
- dTHR;
I32 count = 1, n = 0;
char c;
char *s = RExC_parse;
STATIC regnode *
S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first)
{
- dTHR;
register regnode *ret;
register regnode *chain = NULL;
register regnode *latest;
STATIC regnode *
S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
{
- dTHR;
register regnode *ret;
register char op;
register char *next;
STATIC regnode *
S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp)
{
- dTHR;
register regnode *ret = 0;
I32 flags;
break;
case '.':
nextchar(pRExC_state);
- if (UTF) {
- if (RExC_flags16 & PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SANYUTF8);
- else
- ret = reg_node(pRExC_state, ANYUTF8);
- *flagp |= HASWIDTH;
- }
- else {
- if (RExC_flags16 & PMf_SINGLELINE)
- ret = reg_node(pRExC_state, SANY);
- else
- ret = reg_node(pRExC_state, REG_ANY);
- *flagp |= HASWIDTH|SIMPLE;
- }
+ if (RExC_flags16 & PMf_SINGLELINE)
+ ret = reg_node(pRExC_state, SANY);
+ else
+ ret = reg_node(pRExC_state, REG_ANY);
+ *flagp |= HASWIDTH|SIMPLE;
RExC_naughty++;
break;
case '[':
{
char *oregcomp_parse = ++RExC_parse;
- ret = (UTF ? regclassutf8(pRExC_state) : regclass(pRExC_state));
+ ret = regclass(pRExC_state);
if (*RExC_parse != ']') {
RExC_parse = oregcomp_parse;
vFAIL("Unmatched [");
is_utf8_mark((U8*)"~"); /* preload table */
break;
case 'w':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? ALNUMLUTF8 : ALNUMUTF8)
- : (LOC ? ALNUML : ALNUM));
+ ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 'W':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NALNUMLUTF8 : NALNUMUTF8)
- : (LOC ? NALNUML : NALNUM));
+ ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
case 'b':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? BOUNDLUTF8 : BOUNDUTF8)
- : (LOC ? BOUNDL : BOUND));
+ ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
case 'B':
RExC_seen_zerolen++;
RExC_seen |= REG_SEEN_LOOKBEHIND;
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8)
- : (LOC ? NBOUNDL : NBOUND));
+ ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND);
*flagp |= SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_alnum)
is_utf8_alnum((U8*)"a"); /* preload table */
break;
case 's':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? SPACELUTF8 : SPACEUTF8)
- : (LOC ? SPACEL : SPACE));
+ ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_space)
is_utf8_space((U8*)" "); /* preload table */
break;
case 'S':
- ret = reg_node(pRExC_state,
- UTF
- ? (LOC ? NSPACELUTF8 : NSPACEUTF8)
- : (LOC ? NSPACEL : NSPACE));
+ ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_space)
is_utf8_space((U8*)" "); /* preload table */
break;
case 'd':
- ret = reg_node(pRExC_state, UTF ? DIGITUTF8 : DIGIT);
+ ret = reg_node(pRExC_state, DIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_digit)
is_utf8_digit((U8*)"1"); /* preload table */
break;
case 'D':
- ret = reg_node(pRExC_state, UTF ? NDIGITUTF8 : NDIGIT);
+ ret = reg_node(pRExC_state, NDIGIT);
*flagp |= HASWIDTH|SIMPLE;
nextchar(pRExC_state);
if (UTF && !PL_utf8_digit)
RExC_end = RExC_parse + 2;
RExC_parse--;
- ret = regclassutf8(pRExC_state);
+ ret = regclass(pRExC_state);
RExC_end = oldregxend;
RExC_parse--;
RExC_parse = p + 1;
vFAIL("Missing right brace on \\x{}");
}
- else if (UTF) {
+ else {
numlen = 1; /* allow underscores */
ender = (UV)scan_hex(p + 1, e - p - 1, &numlen);
/* numlen is generous */
}
p = e + 1;
}
- else
- {
- RExC_parse = e + 1;
- vFAIL("Can't use \\x{} without 'use utf8' declaration");
- }
-
}
else {
numlen = 0; /* disallow underscores */
STATIC I32
S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
{
- dTHR;
char *posixcc = 0;
I32 namedclass = OOB_NAMEDCLASS;
STATIC regnode *
S_regclass(pTHX_ RExC_state_t *pRExC_state)
{
- dTHR;
- register U32 value;
- register I32 lastvalue = OOB_CHAR8;
- register I32 range = 0;
+ register UV value;
+ register IV lastvalue = OOB_UNICODE;
+ register IV range = 0;
register regnode *ret;
STRLEN numlen;
- I32 namedclass;
+ IV namedclass;
char *rangebegin;
bool need_class = 0;
+ SV *listsv;
+ register char *e;
+ UV n;
+ bool dont_optimize_invert = FALSE;
+
+ ret = reganode(pRExC_state, ANYOF, 0);
+
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) = 0;
+
+ if (*RExC_parse == '^') { /* Complement of range. */
+ RExC_naughty++;
+ RExC_parse++;
+ if (!SIZE_ONLY)
+ ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ }
- ret = reg_node(pRExC_state, ANYOF);
if (SIZE_ONLY)
RExC_size += ANYOF_SKIP;
else {
- ret->flags = 0;
- ANYOF_BITMAP_ZERO(ret);
RExC_emit += ANYOF_SKIP;
if (FOLD)
ANYOF_FLAGS(ret) |= ANYOF_FOLD;
if (LOC)
ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
- }
- if (*RExC_parse == '^') { /* Complement of range. */
- RExC_naughty++;
- RExC_parse++;
- if (!SIZE_ONLY)
- ANYOF_FLAGS(ret) |= ANYOF_INVERT;
+ ANYOF_BITMAP_ZERO(ret);
+ listsv = newSVpvn("# comment\n", 10);
}
if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
checkposixcc(pRExC_state);
if (*RExC_parse == ']' || *RExC_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
+ goto charclassloop; /* allow 1st char to be ] or - */
+
while (RExC_parse < RExC_end && *RExC_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
+
+ charclassloop:
+
+ namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
+
if (!range)
rangebegin = RExC_parse;
- value = UCHARAT(RExC_parse++);
+ if (UTF) {
+ value = utf8_to_uv((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
if (value == '[')
namedclass = regpposixcc(pRExC_state, value);
else if (value == '\\') {
- value = UCHARAT(RExC_parse++);
+ if (UTF) {
+ value = utf8_to_uv((U8*)RExC_parse,
+ RExC_end - RExC_parse,
+ &numlen, 0);
+ RExC_parse += numlen;
+ }
+ else
+ value = UCHARAT(RExC_parse++);
/* Some compilers cannot handle switching on 64-bit integer
- * values, therefore the 'value' cannot be an UV. --jhi */
- switch (value) {
+ * values, therefore value cannot be an UV. Yes, this will
+ * be a problem later if we want switch on Unicode.
+ * A similar issue a little bit later when switching on
+ * namedclass. --jhi */
+ switch ((I32)value) {
case 'w': namedclass = ANYOF_ALNUM; break;
case 'W': namedclass = ANYOF_NALNUM; break;
case 's': namedclass = ANYOF_SPACE; break;
case 'S': namedclass = ANYOF_NSPACE; break;
case 'd': namedclass = ANYOF_DIGIT; break;
case 'D': namedclass = ANYOF_NDIGIT; break;
+ case 'p':
+ case 'P':
+ if (*RExC_parse == '{') {
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL("Missing right brace on \\p{}");
+ n = e - RExC_parse;
+ }
+ else {
+ e = RExC_parse;
+ n = 1;
+ }
+ if (!SIZE_ONLY) {
+ if (value == 'p')
+ Perl_sv_catpvf(aTHX_ listsv,
+ "+utf8::%.*s\n", (int)n, RExC_parse);
+ else
+ Perl_sv_catpvf(aTHX_ listsv,
+ "!utf8::%.*s\n", (int)n, RExC_parse);
+ }
+ RExC_parse = e + 1;
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ continue;
case 'n': value = '\n'; break;
case 'r': value = '\r'; break;
case 't': value = '\t'; break;
case 'a': value = '\057'; break;
#endif
case 'x':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
- RExC_parse += numlen;
+ if (*RExC_parse == '{') {
+ e = strchr(RExC_parse++, '}');
+ if (!e)
+ vFAIL("Missing right brace on \\x{}");
+ numlen = 1; /* allow underscores */
+ value = (UV)scan_hex(RExC_parse,
+ e - RExC_parse,
+ &numlen);
+ RExC_parse = e + 1;
+ }
+ else {
+ numlen = 0; /* disallow underscores */
+ value = (UV)scan_hex(RExC_parse, 2, &numlen);
+ RExC_parse += numlen;
+ }
break;
case 'c':
value = UCHARAT(RExC_parse++);
break;
default:
if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
-
- vWARN2(RExC_parse, "Unrecognized escape \\%c in character class passed through", (int)value);
+ vWARN2(RExC_parse,
+ "Unrecognized escape \\%c in character class passed through",
+ (int)value);
break;
}
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (!need_class && !SIZE_ONLY)
+ } /* end of \blah */
+
+ if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
+
+ if (!SIZE_ONLY && !need_class)
ANYOF_CLASS_ZERO(ret);
+
need_class = 1;
- if (range) { /* a-\d, a-[:digit:] */
+
+ /* a bad range like a-\d, a-[:digit:] ? */
+ if (range) {
if (!SIZE_ONLY) {
if (ckWARN(WARN_REGEXP))
vWARN4(RExC_parse,
RExC_parse - rangebegin,
RExC_parse - rangebegin,
rangebegin);
- ANYOF_BITMAP_SET(ret, lastvalue);
- ANYOF_BITMAP_SET(ret, '-');
+ if (lastvalue < 256) {
+ ANYOF_BITMAP_SET(ret, lastvalue);
+ ANYOF_BITMAP_SET(ret, '-');
+ }
+ else {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ Perl_sv_catpvf(aTHX_ listsv,
+ /* 0x002D is Unicode for '-' */
+ "%04"UVxf"\n002D\n", (UV)lastvalue);
+ }
}
- range = 0; /* this is not a true range */
+
+ range = 0; /* this was not a true range */
}
+
if (!SIZE_ONLY) {
- switch (namedclass) {
+ /* Possible truncation here but in some 64-bit environments
+ * the compiler gets heartburn about switch on 64-bit values.
+ * A similar issue a little earlier when switching on value.
+ * --jhi */
+ switch ((I32)namedclass) {
case ANYOF_ALNUM:
if (LOC)
ANYOF_CLASS_SET(ret, ANYOF_ALNUM);
if (isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n");
break;
case ANYOF_NALNUM:
if (LOC)
if (!isALNUM(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n");
break;
- case ANYOF_SPACE:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_SPACE);
- else {
- for (value = 0; value < 256; value++)
- if (isSPACE(value))
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NSPACE:
+ case ANYOF_ALNUMC:
if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
else {
for (value = 0; value < 256; value++)
- if (!isSPACE(value))
+ if (isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_DIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
- else {
- for (value = '0'; value <= '9'; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
- break;
- case ANYOF_NDIGIT:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
- else {
- for (value = 0; value < '0'; value++)
- ANYOF_BITMAP_SET(ret, value);
- for (value = '9' + 1; value < 256; value++)
- ANYOF_BITMAP_SET(ret, value);
- }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n");
break;
case ANYOF_NALNUMC:
if (LOC)
if (!isALNUMC(value))
ANYOF_BITMAP_SET(ret, value);
}
- break;
- case ANYOF_ALNUMC:
- if (LOC)
- ANYOF_CLASS_SET(ret, ANYOF_ALNUMC);
- else {
- for (value = 0; value < 256; value++)
- if (isALNUMC(value))
- ANYOF_BITMAP_SET(ret, value);
- }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n");
break;
case ANYOF_ALPHA:
if (LOC)
if (isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n");
break;
case ANYOF_NALPHA:
if (LOC)
if (!isALPHA(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n");
break;
case ANYOF_ASCII:
if (LOC)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n");
break;
case ANYOF_NASCII:
if (LOC)
ANYOF_BITMAP_SET(ret, value);
#endif /* EBCDIC */
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n");
break;
case ANYOF_BLANK:
if (LOC)
if (isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n");
break;
case ANYOF_NBLANK:
if (LOC)
if (!isBLANK(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n");
break;
case ANYOF_CNTRL:
if (LOC)
if (isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
- lastvalue = OOB_CHAR8;
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n");
break;
case ANYOF_NCNTRL:
if (LOC)
if (!isCNTRL(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n");
+ break;
+ case ANYOF_DIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = '0'; value <= '9'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n");
+ break;
+ case ANYOF_NDIGIT:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
+ else {
+ /* consecutive digits assumed */
+ for (value = 0; value < '0'; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ for (value = '9' + 1; value < 256; value++)
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n");
break;
case ANYOF_GRAPH:
if (LOC)
if (isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n");
break;
case ANYOF_NGRAPH:
if (LOC)
if (!isGRAPH(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n");
break;
case ANYOF_LOWER:
if (LOC)
if (isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n");
break;
case ANYOF_NLOWER:
if (LOC)
if (!isLOWER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n");
break;
case ANYOF_PRINT:
if (LOC)
if (isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n");
break;
case ANYOF_NPRINT:
if (LOC)
if (!isPRINT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n");
break;
case ANYOF_PSXSPC:
if (LOC)
if (isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n");
break;
case ANYOF_NPSXSPC:
if (LOC)
if (!isPSXSPC(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n");
break;
case ANYOF_PUNCT:
if (LOC)
if (isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n");
break;
case ANYOF_NPUNCT:
if (LOC)
if (!isPUNCT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n");
+ break;
+ case ANYOF_SPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_SPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n");
+ break;
+ case ANYOF_NSPACE:
+ if (LOC)
+ ANYOF_CLASS_SET(ret, ANYOF_NSPACE);
+ else {
+ for (value = 0; value < 256; value++)
+ if (!isSPACE(value))
+ ANYOF_BITMAP_SET(ret, value);
+ }
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n");
break;
case ANYOF_UPPER:
if (LOC)
if (isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n");
break;
case ANYOF_NUPPER:
if (LOC)
if (!isUPPER(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n");
break;
case ANYOF_XDIGIT:
if (LOC)
if (isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n");
break;
case ANYOF_NXDIGIT:
if (LOC)
if (!isXDIGIT(value))
ANYOF_BITMAP_SET(ret, value);
}
+ dont_optimize_invert = TRUE;
+ Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n");
break;
default:
vFAIL("Invalid [::] class");
ANYOF_FLAGS(ret) |= ANYOF_CLASS;
continue;
}
- }
+ } /* end of namedclass \blah */
+
if (range) {
if (lastvalue > value) /* b-a */ {
Simple_vFAIL4("Invalid [] range \"%*.*s\"",
RExC_parse - rangebegin,
rangebegin);
}
- range = 0;
+ range = 0; /* not a true range */
}
else {
- lastvalue = value;
+ lastvalue = value; /* save the beginning of the range */
if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
RExC_parse[1] != ']') {
RExC_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
+
+ /* a bad range like \w-, [:word:]- ? */
+ if (namedclass > OOB_NAMEDCLASS) {
if (ckWARN(WARN_REGEXP))
vWARN4(RExC_parse,
"False [] range \"%*.*s\"",
if (!SIZE_ONLY)
ANYOF_BITMAP_SET(ret, '-');
} else
- range = 1;
- continue; /* do it next time */
+ range = 1; /* yeah, it's a range! */
+ continue; /* but do it the next time */
}
}
+
/* now is the next time */
if (!SIZE_ONLY) {
+ if (lastvalue < 256 && value < 256) {
#ifndef ASCIIish /* EBCDIC, for example. */
- if ((isLOWER(lastvalue) && isLOWER(value)) ||
- (isUPPER(lastvalue) && isUPPER(value)))
- {
- I32 i;
- if (isLOWER(lastvalue)) {
- for (i = lastvalue; i <= value; i++)
- if (isLOWER(i))
- ANYOF_BITMAP_SET(ret, i);
- } else {
- for (i = lastvalue; i <= value; i++)
- if (isUPPER(i))
- ANYOF_BITMAP_SET(ret, i);
+ if ((isLOWER(lastvalue) && isLOWER(value)) ||
+ (isUPPER(lastvalue) && isUPPER(value)))
+ {
+ IV i;
+ if (isLOWER(lastvalue)) {
+ for (i = lastvalue; i <= value; i++)
+ if (isLOWER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ } else {
+ for (i = lastvalue; i <= value; i++)
+ if (isUPPER(i))
+ ANYOF_BITMAP_SET(ret, i);
+ }
}
- }
- else
+ else
#endif
- for ( ; lastvalue <= value; lastvalue++)
- ANYOF_BITMAP_SET(ret, lastvalue);
+ for ( ; lastvalue <= value; lastvalue++)
+ ANYOF_BITMAP_SET(ret, lastvalue);
+ } else {
+ ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+ if (lastvalue < value)
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+ (UV)lastvalue, (UV)value);
+ else
+ Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
+ (UV)value);
+ }
}
- range = 0;
+
+ range = 0; /* this range (if it was one) is done now */
}
+
if (need_class) {
if (SIZE_ONLY)
RExC_size += ANYOF_CLASS_ADD_SKIP;
else
RExC_emit += ANYOF_CLASS_ADD_SKIP;
}
+
/* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
if (!SIZE_ONLY &&
- (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) {
+ (ANYOF_FLAGS(ret) &
+ /* If the only flag is folding (plus possibly inversion). */
+ (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) {
for (value = 0; value < 256; ++value) {
if (ANYOF_BITMAP_TEST(ret, value)) {
- I32 cf = PL_fold[value];
- ANYOF_BITMAP_SET(ret, cf);
+ IV fold = PL_fold[value];
+
+ if (fold != value)
+ ANYOF_BITMAP_SET(ret, fold);
}
}
ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
}
+
/* optimize inverted simple patterns (e.g. [^a-z]) */
- if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
+ if (!SIZE_ONLY && !dont_optimize_invert &&
+ /* If the only flag is inversion. */
+ (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
- ANYOF_FLAGS(ret) = 0;
- }
- return ret;
-}
-
-STATIC regnode *
-S_regclassutf8(pTHX_ RExC_state_t *pRExC_state)
-{
- dTHR;
- register char *e;
- register U32 value;
- register U32 lastvalue = OOB_UTF8;
- register I32 range = 0;
- register regnode *ret;
- STRLEN numlen;
- I32 n;
- SV *listsv;
- U8 flags = 0;
- I32 namedclass;
- char *rangebegin;
-
- if (*RExC_parse == '^') { /* Complement of range. */
- RExC_naughty++;
- RExC_parse++;
- if (!SIZE_ONLY)
- flags |= ANYOF_INVERT;
- }
- if (!SIZE_ONLY) {
- if (FOLD)
- flags |= ANYOF_FOLD;
- if (LOC)
- flags |= ANYOF_LOCALE;
- listsv = newSVpvn("# comment\n",10);
+ ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
}
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP))
- checkposixcc(pRExC_state);
-
- if (*RExC_parse == ']' || *RExC_parse == '-')
- goto skipcond; /* allow 1st char to be ] or - */
+ if (!SIZE_ONLY) {
+ AV *av = newAV();
+ SV *rv;
- while (RExC_parse < RExC_end && *RExC_parse != ']') {
- skipcond:
- namedclass = OOB_NAMEDCLASS;
- if (!range)
- rangebegin = RExC_parse;
- value = utf8_to_uv((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
- RExC_parse += numlen;
- if (value == '[')
- namedclass = regpposixcc(pRExC_state, value);
- else if (value == '\\') {
- value = (U32)utf8_to_uv((U8*)RExC_parse,
- RExC_end - RExC_parse,
- &numlen, 0);
- RExC_parse += numlen;
- /* Some compilers cannot handle switching on 64-bit integer
- * values, therefore value cannot be an UV. Yes, this will
- * be a problem later if we want switch on Unicode. --jhi */
- switch (value) {
- case 'w': namedclass = ANYOF_ALNUM; break;
- case 'W': namedclass = ANYOF_NALNUM; break;
- case 's': namedclass = ANYOF_SPACE; break;
- case 'S': namedclass = ANYOF_NSPACE; break;
- case 'd': namedclass = ANYOF_DIGIT; break;
- case 'D': namedclass = ANYOF_NDIGIT; break;
- case 'p':
- case 'P':
- if (*RExC_parse == '{') {
- e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\p{}");
- n = e - RExC_parse;
- }
- else {
- e = RExC_parse;
- n = 1;
- }
- if (!SIZE_ONLY) {
- if (value == 'p')
- Perl_sv_catpvf(aTHX_ listsv,
- "+utf8::%.*s\n", (int)n, RExC_parse);
- else
- Perl_sv_catpvf(aTHX_ listsv,
- "!utf8::%.*s\n", (int)n, RExC_parse);
- }
- RExC_parse = e + 1;
- lastvalue = OOB_UTF8;
- continue;
- case 'n': value = '\n'; break;
- case 'r': value = '\r'; break;
- case 't': value = '\t'; break;
- case 'f': value = '\f'; break;
- case 'b': value = '\b'; break;
-#ifdef ASCIIish
- case 'e': value = '\033'; break;
- case 'a': value = '\007'; break;
-#else
- case 'e': value = '\047'; break;
- case 'a': value = '\057'; break;
-#endif
- case 'x':
- if (*RExC_parse == '{') {
- e = strchr(RExC_parse++, '}');
- if (!e)
- vFAIL("Missing right brace on \\x{}");
- numlen = 1; /* allow underscores */
- value = (UV)scan_hex(RExC_parse,
- e - RExC_parse,
- &numlen);
- RExC_parse = e + 1;
- }
- else {
- numlen = 0; /* disallow underscores */
- value = (UV)scan_hex(RExC_parse, 2, &numlen);
- RExC_parse += numlen;
- }
- break;
- case 'c':
- value = UCHARAT(RExC_parse++);
- value = toCTRL(value);
- break;
- case '0': case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- numlen = 0; /* disallow underscores */
- value = (UV)scan_oct(--RExC_parse, 3, &numlen);
- RExC_parse += numlen;
- break;
- default:
- if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value))
- vWARN2(RExC_parse,
- "Unrecognized escape \\%c in character class passed through",
- (int)value);
- break;
- }
- }
- if (namedclass > OOB_NAMEDCLASS) {
- if (range) { /* a-\d, a-[:digit:] */
- if (!SIZE_ONLY) {
- if (ckWARN(WARN_REGEXP))
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "%04"UVxf"\n002D\n", (UV)lastvalue);
- }
- range = 0;
- }
- if (!SIZE_ONLY) {
- switch (namedclass) {
- case ANYOF_ALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break;
- case ANYOF_NALNUM:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break;
- case ANYOF_ALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break;
- case ANYOF_NALNUMC:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break;
- case ANYOF_ALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break;
- case ANYOF_NALPHA:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break;
- case ANYOF_ASCII:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break;
- case ANYOF_NASCII:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break;
- case ANYOF_CNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break;
- case ANYOF_NCNTRL:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break;
- case ANYOF_GRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break;
- case ANYOF_NGRAPH:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break;
- case ANYOF_DIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break;
- case ANYOF_NDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break;
- case ANYOF_LOWER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break;
- case ANYOF_NLOWER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break;
- case ANYOF_PRINT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break;
- case ANYOF_NPRINT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break;
- case ANYOF_PUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break;
- case ANYOF_NPUNCT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break;
- case ANYOF_SPACE:
- case ANYOF_PSXSPC:
- case ANYOF_BLANK:
- /* Not very true for PSXSPC and BLANK
- * but not feeling like creating IsPOSIXSpace and
- * IsBlank right now. --jhi */
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break;
- case ANYOF_NSPACE:
- case ANYOF_NPSXSPC:
- case ANYOF_NBLANK:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break;
- case ANYOF_UPPER:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break;
- case ANYOF_NUPPER:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break;
- case ANYOF_XDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break;
- case ANYOF_NXDIGIT:
- Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break;
- }
- continue;
- }
- }
- if (range) {
- if (lastvalue > value) { /* b-a */
- Simple_vFAIL4("Invalid [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- }
- range = 0;
- }
- else {
- lastvalue = value;
- if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
- RExC_parse[1] != ']') {
- RExC_parse++;
- if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */
- if (ckWARN(WARN_REGEXP))
- vWARN4(RExC_parse,
- "False [] range \"%*.*s\"",
- RExC_parse - rangebegin,
- RExC_parse - rangebegin,
- rangebegin);
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv,
- /* 0x002D is Unicode for '-' */
- "002D\n");
- } else
- range = 1;
- continue; /* do it next time */
- }
- }
- /* now is the next time */
- if (!SIZE_ONLY)
- Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
- (UV)lastvalue, (UV)value);
- range = 0;
- }
-
- ret = reganode(pRExC_state, ANYOFUTF8, 0);
-
- if (!SIZE_ONLY) {
- SV *rv = swash_init("utf8", "", listsv, 1, 0);
- SvREFCNT_dec(listsv);
- n = add_data(pRExC_state, 1,"s");
+ av_store(av, 0, listsv);
+ av_store(av, 1, NULL);
+ rv = newRV_noinc((SV*)av);
+ n = add_data(pRExC_state, 1, "s");
RExC_rx->data->data[n] = (void*)rv;
- ARG1_SET(ret, flags);
- ARG2_SET(ret, n);
+ ARG_SET(ret, n);
}
return ret;
STATIC char*
S_nextchar(pTHX_ RExC_state_t *pRExC_state)
{
- dTHR;
char* retval = RExC_parse++;
for (;;) {
STATIC regnode * /* Location. */
S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
{
- dTHR;
register regnode *ret;
register regnode *ptr;
STATIC regnode * /* Location. */
S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
{
- dTHR;
register regnode *ret;
register regnode *ptr;
STATIC void
S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp)
{
- dTHR;
- if (SIZE_ONLY) {
- U8 tmpbuf[UTF8_MAXLEN];
- *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf;
- }
- else
- *lenp = uv_to_utf8((U8*)s, uv) - (U8*)s;
-
+ *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s);
}
/*
STATIC void
S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd)
{
- dTHR;
register regnode *src;
register regnode *dst;
register regnode *place;
STATIC void
S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
{
- dTHR;
register regnode *scan;
register regnode *temp;
STATIC void
S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val)
{
- dTHR;
/* "Operandless" and "op != BRANCH" are synonymous in practice. */
if (p == NULL || SIZE_ONLY)
return;
Perl_regdump(pTHX_ regexp *r)
{
#ifdef DEBUGGING
- dTHR;
SV *sv = sv_newmortal();
(void)dumpuntil(r->program, r->program + 1, NULL, sv, 0);
STATIC void
S_put_byte(pTHX_ SV *sv, int c)
{
- if (c <= ' ' || c == 127 || c == 255)
+ if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c))
Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
else if (c == '-' || c == ']' || c == '\\' || c == '^')
Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
Perl_regprop(pTHX_ SV *sv, regnode *o)
{
#ifdef DEBUGGING
- dTHR;
register int k;
sv_setpvn(sv, "", 0);
Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
else if (k == ANYOF) {
int i, rangestart = -1;
- const char * const out[] = { /* Should be syncronized with
- ANYOF_ #xdefines in regcomp.h */
+ U8 flags = ANYOF_FLAGS(o);
+ const char * const anyofs[] = { /* Should be syncronized with
+ * ANYOF_ #xdefines in regcomp.h */
"\\w",
"\\W",
"\\s",
"[:^blank:]"
};
- if (o->flags & ANYOF_LOCALE)
+ if (flags & ANYOF_LOCALE)
sv_catpv(sv, "{loc}");
- if (o->flags & ANYOF_FOLD)
+ if (flags & ANYOF_FOLD)
sv_catpv(sv, "{i}");
Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
- if (o->flags & ANYOF_INVERT)
+ if (flags & ANYOF_INVERT)
sv_catpv(sv, "^");
- if (OP(o) == ANYOF) {
- for (i = 0; i <= 256; i++) {
- if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
- if (rangestart == -1)
- rangestart = i;
- } else if (rangestart != -1) {
- if (i <= rangestart + 3)
- for (; rangestart < i; rangestart++)
- put_byte(sv, rangestart);
- else {
+ for (i = 0; i <= 256; i++) {
+ if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++)
put_byte(sv, rangestart);
- sv_catpv(sv, "-");
- put_byte(sv, i - 1);
- }
- rangestart = -1;
+ else {
+ put_byte(sv, rangestart);
+ sv_catpv(sv, "-");
+ put_byte(sv, i - 1);
}
+ rangestart = -1;
}
- if (o->flags & ANYOF_CLASS)
- for (i = 0; i < sizeof(out)/sizeof(char*); i++)
- if (ANYOF_CLASS_TEST(o,i))
- sv_catpv(sv, out[i]);
}
- else {
- sv_catpv(sv, "{ANYOFUTF8}"); /* TODO: full decode */
+
+ if (o->flags & ANYOF_CLASS)
+ for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++)
+ if (ANYOF_CLASS_TEST(o,i))
+ sv_catpv(sv, anyofs[i]);
+
+ if (flags & ANYOF_UNICODE)
+ sv_catpv(sv, "{unicode}");
+ else if (flags & ANYOF_UNICODE_ALL)
+ sv_catpv(sv, "{all-unicode}");
+
+ {
+ SV *lv;
+ SV *sw = regclass_swash(o, FALSE, &lv);
+
+ if (lv) {
+ if (sw) {
+ UV i;
+ U8 s[UTF8_MAXLEN+1];
+
+ for (i = 0; i <= 256; i++) { /* just the first 256 */
+ U8 *e = uv_to_utf8(s, i);
+
+ if (i < 256 && swash_fetch(sw, s)) {
+ if (rangestart == -1)
+ rangestart = i;
+ } else if (rangestart != -1) {
+ U8 *p;
+
+ if (i <= rangestart + 3)
+ for (; rangestart < i; rangestart++) {
+ for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ else {
+ for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++)
+ put_byte(sv, *p);
+ sv_catpv(sv, "-");
+ for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++)
+ put_byte(sv, *p);
+ }
+ rangestart = -1;
+ }
+ }
+
+ sv_catpv(sv, "..."); /* et cetera */
+ }
+
+ {
+ char *s = savepv(SvPVX(lv));
+ char *origs = s;
+
+ while(*s && *s != '\n') s++;
+
+ if (*s == '\n') {
+ char *t = ++s;
+
+ while (*s) {
+ if (*s == '\n')
+ *s = ' ';
+ s++;
+ }
+ if (s[-1] == ' ')
+ s[-1] = 0;
+
+ sv_catpv(sv, t);
+ }
+
+ Safefree(origs);
+ }
+ }
}
+
Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
}
else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
void
Perl_pregfree(pTHX_ struct regexp *r)
{
- dTHR;
DEBUG_r(if (!PL_colorset) reginitcolors());
if (!r || (--r->refcnt > 0))
regnode *
Perl_regnext(pTHX_ register regnode *p)
{
- dTHR;
register I32 offset;
if (p == &PL_regdummy)
void
Perl_save_re_context(pTHX)
{
- dTHR;
-
#if 0
SAVEPPTR(RExC_precomp); /* uncompiled string. */
SAVEI32(RExC_npar); /* () count. */
SAVEI32(PL_reg_oldpos); /* from regexec.c */
SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */
SAVEVPTR(PL_reg_curpm); /* from regexec.c */
+ SAVEI32(PL_regnpar); /* () count. */
#ifdef DEBUGGING
SAVEPPTR(PL_reg_starttry); /* from regexec.c */
#endif
{
ReREFCNT_dec((regexp *)r);
}
-