X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/0407a77bc74fb10c233a2d09d551311e3628eba5..b207e5e2c40219749f3bdcc1346a77b6985ff4c7:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 90500a4..c39405e 100644 --- a/regcomp.c +++ b/regcomp.c @@ -69,7 +69,7 @@ * **** Alterations to Henry's code are... **** - **** Copyright (c) 1991-1999, Larry Wall + **** Copyright (c) 1991-2001, Larry Wall **** **** You may distribute under the terms of either the GNU General Public **** License or the Artistic License, as specified in the README file. @@ -111,6 +111,46 @@ #define STATIC static #endif +typedef struct RExC_state_t { + U16 flags16; /* are we folding, multilining? */ + char *precomp; /* uncompiled string. */ + regexp *rx; + char *end; /* End of input for compile */ + char *parse; /* Input-scan pointer. */ + I32 whilem_seen; /* number of WHILEM in this expr */ + regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ + I32 naughty; /* How bad is this pattern? */ + I32 sawback; /* Did we see \1, ...? */ + U32 seen; + I32 size; /* Code size. */ + I32 npar; /* () count. */ + I32 extralen; + I32 seen_zerolen; + I32 seen_evals; + I32 utf8; +#if ADD_TO_REGEXEC + char *starttry; /* -Dr: where regtry was called. */ +#define RExC_starttry (pRExC_state->starttry) +#endif +} RExC_state_t; + +#define RExC_flags16 (pRExC_state->flags16) +#define RExC_precomp (pRExC_state->precomp) +#define RExC_rx (pRExC_state->rx) +#define RExC_end (pRExC_state->end) +#define RExC_parse (pRExC_state->parse) +#define RExC_whilem_seen (pRExC_state->whilem_seen) +#define RExC_emit (pRExC_state->emit) +#define RExC_naughty (pRExC_state->naughty) +#define RExC_sawback (pRExC_state->sawback) +#define RExC_seen (pRExC_state->seen) +#define RExC_size (pRExC_state->size) +#define RExC_npar (pRExC_state->npar) +#define RExC_extralen (pRExC_state->extralen) +#define RExC_seen_zerolen (pRExC_state->seen_zerolen) +#define RExC_seen_evals (pRExC_state->seen_evals) +#define RExC_utf8 (pRExC_state->utf8) + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) @@ -151,6 +191,7 @@ typedef struct scan_data_t { I32 offset_float_max; I32 flags; I32 whilem_c; + I32 *last_closep; struct regnode_charclass_class *start_class; } scan_data_t; @@ -159,7 +200,7 @@ typedef struct scan_data_t { */ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0 }; + 0, 0, 0, 0, 0, 0}; #define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL) #define SF_BEFORE_SEOL 0x1 @@ -188,19 +229,196 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, #define SCF_DO_STCLASS_AND 0x0800 #define SCF_DO_STCLASS_OR 0x1000 #define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR) +#define SCF_WHILEM_VISITED_POS 0x2000 -#define RF_utf8 8 -#define UTF (PL_reg_flags & RF_utf8) -#define LOC (PL_regflags & PMf_LOCALE) -#define FOLD (PL_regflags & PMf_FOLD) +#define UTF RExC_utf8 +#define LOC (RExC_flags16 & PMf_LOCALE) +#define FOLD (RExC_flags16 & PMf_FOLD) -#define OOB_CHAR8 1234 -#define OOB_UTF8 123456 +#define OOB_UNICODE 12345678 #define OOB_NAMEDCLASS -1 #define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv)) #define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b) + +/* length of regex to show in messages that don't mark a position within */ +#define RegexLengthToShowInErrorMessages 127 + +/* + * If MARKER[12] are adjusted, be sure to adjust the constants at the top + * of t/op/regmesg.t, the tests in t/op/re_tests, and those in + * op/pragma/warn/regcomp. + */ +#define MARKER1 "HERE" /* marker as it appears in the description */ +#define MARKER2 " << HERE " /* marker as it appears within the regex */ + +#define REPORT_LOCATION " before " MARKER1 " mark in regex m/%.*s" MARKER2 "%s/" + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * arg. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define FAIL(msg) \ + STMT_START { \ + char *ellipses = ""; \ + unsigned len = strlen(RExC_precomp); \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \ + msg, (int)len, RExC_precomp, ellipses); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given + * args. Show regex, up to a maximum length. If it's too long, chop and add + * "...". + */ +#define FAIL2(pat,msg) \ + STMT_START { \ + char *ellipses = ""; \ + unsigned len = strlen(RExC_precomp); \ + \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + \ + if (len > RegexLengthToShowInErrorMessages) { \ + /* chop 10 shorter than the max, to ensure meaning of "..." */ \ + len = RegexLengthToShowInErrorMessages - 10; \ + ellipses = "..."; \ + } \ + S_re_croak2(aTHX_ pat, " in regex m/%.*s%s/", \ + msg, (int)len, RExC_precomp, ellipses); \ + } STMT_END + + +/* + * Simple_vFAIL -- like FAIL, but marks the current location in the scan + */ +#define Simple_vFAIL(m) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + \ + Perl_croak(aTHX_ "%s" REPORT_LOCATION, \ + m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL() + */ +#define vFAIL(m) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + Simple_vFAIL(m); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts two arguments. + */ +#define Simple_vFAIL2(m,a1) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2(). + */ +#define vFAIL2(m,a1) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + Simple_vFAIL2(m, a1); \ + } STMT_END + + +/* + * Like Simple_vFAIL(), but accepts three arguments. + */ +#define Simple_vFAIL3(m, a1, a2) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + +/* + * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3(). + */ +#define vFAIL3(m,a1,a2) \ + STMT_START { \ + if (!SIZE_ONLY) \ + SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \ + Simple_vFAIL3(m, a1, a2); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts four arguments. + */ +#define Simple_vFAIL4(m, a1, a2, a3) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3,\ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + +/* + * Like Simple_vFAIL(), but accepts five arguments. + */ +#define Simple_vFAIL5(m, a1, a2, a3, a4) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-RExC_parse); \ + S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, a4,\ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + + +#define vWARN(loc,m) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, "%s" REPORT_LOCATION,\ + m, (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END \ + + +#define vWARN2(loc, m, a1) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + a1, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + +#define vWARN3(loc, m, a1, a2) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp) - (RExC_end - (loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION, \ + a1, a2, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + +#define vWARN4(loc, m, a1, a2, a3) \ + STMT_START { \ + unsigned offset = strlen(RExC_precomp)-(RExC_end-(loc)); \ + Perl_warner(aTHX_ WARN_REGEXP, m REPORT_LOCATION,\ + a1, a2, a3, \ + (int)offset, RExC_precomp, RExC_precomp + offset); \ + } STMT_END + + /* Allow for side effects in s */ #define REGC(c,s) STMT_START { if (!SIZE_ONLY) *(s) = (c); else (s);} STMT_END @@ -211,9 +429,8 @@ static void clear_re(pTHXo_ void *r); floating substrings if needed. */ STATIC void -S_scan_commit(pTHX_ scan_data_t *data) +S_scan_commit(pTHX_ RExC_state_t *pRExC_state, scan_data_t *data) { - dTHR; STRLEN l = CHR_SVLEN(data->last_found); STRLEN old_l = CHR_SVLEN(*data->longest); @@ -246,14 +463,14 @@ S_scan_commit(pTHX_ scan_data_t *data) /* Can match anything (initialization) */ STATIC void -S_cl_anything(pTHX_ struct regnode_charclass_class *cl) +S_cl_anything(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { int value; ANYOF_CLASS_ZERO(cl); for (value = 0; value < 256; ++value) ANYOF_BITMAP_SET(cl, value); - cl->flags = ANYOF_EOS; + cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL; if (LOC) cl->flags |= ANYOF_LOCALE; } @@ -264,9 +481,11 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) { int value; - for (value = 0; value < ANYOF_MAX; value += 2) + for (value = 0; value <= ANYOF_MAX; value += 2) if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1)) return 1; + if (!(cl->flags & ANYOF_UNICODE_ALL)) + return 0; for (value = 0; value < 256; ++value) if (!ANYOF_BITMAP_TEST(cl, value)) return 0; @@ -275,19 +494,19 @@ S_cl_is_anything(pTHX_ struct regnode_charclass_class *cl) /* Can match anything (initialization) */ STATIC void -S_cl_init(pTHX_ struct regnode_charclass_class *cl) +S_cl_init(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; - cl_anything(cl); + cl_anything(pRExC_state, cl); } STATIC void -S_cl_init_zero(pTHX_ struct regnode_charclass_class *cl) +S_cl_init_zero(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl) { + Zero(cl, 1, struct regnode_charclass_class); cl->type = ANYOF; - cl_anything(cl); - ANYOF_CLASS_ZERO(cl); - ANYOF_BITMAP_ZERO(cl); + cl_anything(pRExC_state, cl); if (LOC) cl->flags |= ANYOF_LOCALE; } @@ -298,8 +517,6 @@ STATIC void S_cl_and(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class *and_with) { - int value; - if (!(and_with->flags & ANYOF_CLASS) && !(cl->flags & ANYOF_CLASS) && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE) @@ -316,15 +533,23 @@ S_cl_and(pTHX_ struct regnode_charclass_class *cl, } /* XXXX: logic is complicated otherwise, leave it along for a moment. */ if (!(and_with->flags & ANYOF_EOS)) cl->flags &= ~ANYOF_EOS; + + if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE) { + cl->flags &= ~ANYOF_UNICODE_ALL; + cl->flags |= ANYOF_UNICODE; + ARG_SET(cl, ARG(and_with)); + } + if (!(and_with->flags & ANYOF_UNICODE_ALL)) + cl->flags &= ~ANYOF_UNICODE_ALL; + if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL))) + cl->flags &= ~ANYOF_UNICODE; } /* 'OR' a given class with another one. Can create false positives */ /* We assume that cl is not inverted */ STATIC void -S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with) +S_cl_or(pTHX_ RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, struct regnode_charclass_class *or_with) { - int value; - if (or_with->flags & ANYOF_INVERT) { /* We do not use * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2)) @@ -344,7 +569,7 @@ S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class cl->bitmap[i] |= ~or_with->bitmap[i]; } /* XXXX: logic is complicated otherwise */ else { - cl_anything(cl); + cl_anything(pRExC_state, cl); } } else { /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */ @@ -363,11 +588,21 @@ S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class } } else { /* XXXX: logic is complicated, leave it along for a moment. */ - cl_anything(cl); + cl_anything(pRExC_state, cl); } } if (or_with->flags & ANYOF_EOS) cl->flags |= ANYOF_EOS; + + if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE && + ARG(cl) != ARG(or_with)) { + cl->flags |= ANYOF_UNICODE_ALL; + cl->flags &= ~ANYOF_UNICODE; + } + if (or_with->flags & ANYOF_UNICODE_ALL) { + cl->flags |= ANYOF_UNICODE_ALL; + cl->flags &= ~ANYOF_UNICODE; + } } /* REx optimizer. Converts nodes into quickier variants "in place". @@ -377,12 +612,11 @@ S_cl_or(pTHX_ struct regnode_charclass_class *cl, struct regnode_charclass_class to the position after last scanned or to NULL. */ STATIC I32 -S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) +S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ /* last: Stop before this one. */ { - dTHR; I32 min = 0, pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; @@ -486,29 +720,35 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da struct regnode_charclass_class accum; if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */ - scan_commit(data); /* Cannot merge strings after this. */ + scan_commit(pRExC_state, data); /* Cannot merge strings after this. */ if (flags & SCF_DO_STCLASS) - cl_init_zero(&accum); + cl_init_zero(pRExC_state, &accum); while (OP(scan) == code) { - I32 deltanext, minnext, f = 0; + I32 deltanext, minnext, f = 0, fake; struct regnode_charclass_class this_class; num++; data_fake.flags = 0; - if (data) + if (data) { data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; next = regnext(scan); scan = NEXTOPER(scan); if (code != BRANCH) scan = NEXTOPER(scan); if (flags & SCF_DO_STCLASS) { - cl_init(&this_class); + cl_init(pRExC_state, &this_class); data_fake.start_class = &this_class; f = SCF_DO_STCLASS_AND; } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; /* we suppose the run is continuous, last=next...*/ - minnext = study_chunk(&scan, &deltanext, next, - &data_fake, f); + minnext = study_chunk(pRExC_state, &scan, &deltanext, + next, &data_fake, f); if (min1 > minnext) min1 = minnext; if (max1 < minnext + deltanext) @@ -523,7 +763,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (data) data->whilem_c = data_fake.whilem_c; if (flags & SCF_DO_STCLASS) - cl_or(&accum, &this_class); + cl_or(pRExC_state, &accum, &this_class); if (code == SUSPEND) break; } @@ -538,7 +778,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da min += min1; delta += max1 - min1; if (flags & SCF_DO_STCLASS_OR) { - cl_or(data->start_class, &accum); + cl_or(pRExC_state, data->start_class, &accum); if (min1) { cl_and(data->start_class, &and_with); flags &= ~SCF_DO_STCLASS; @@ -570,15 +810,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } else if (OP(scan) == EXACT) { I32 l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); if (UTF) { - unsigned char *s = (unsigned char *)STRING(scan); - unsigned char *e = s + l; - I32 newl = 0; - while (s < e) { - newl++; - s += UTF8SKIP(s); - } - l = newl; + U8 *s = (U8*)STRING(scan); + l = utf8_length(s, s + l); + uc = utf8_to_uv_simple(s, NULL); } min += l; if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */ @@ -598,21 +834,22 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da /* Check whether it is compatible with what we know already! */ int compat = 1; - if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) - && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) + if (uc >= 0x100 || + !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) && (!(data->start_class->flags & ANYOF_FOLD) - || !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*STRING(scan)]))) + || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc]))) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; } else if (flags & SCF_DO_STCLASS_OR) { /* false positive possible if the class is case-folded */ - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + if (uc < 0x100) + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; cl_and(data->start_class, &and_with); } @@ -620,19 +857,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } else if (PL_regkind[(U8)OP(scan)] == EXACT) { /* But OP != EXACT! */ I32 l = STR_LEN(scan); + UV uc = *((U8*)STRING(scan)); /* Search for fixed substrings supports EXACT only. */ if (flags & SCF_DO_SUBSTR) - scan_commit(data); + scan_commit(pRExC_state, data); if (UTF) { - unsigned char *s = (unsigned char *)STRING(scan); - unsigned char *e = s + l; - I32 newl = 0; - while (s < e) { - newl++; - s += UTF8SKIP(s); - } - l = newl; + U8 *s = (U8 *)STRING(scan); + l = utf8_length(s, s + l); + uc = utf8_to_uv_simple(s, NULL); } min += l; if (data && (flags & SCF_DO_SUBSTR)) @@ -641,15 +874,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da /* Check whether it is compatible with what we know already! */ int compat = 1; - if (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) - && !ANYOF_BITMAP_TEST(data->start_class, *STRING(scan)) - && !ANYOF_BITMAP_TEST(data->start_class, - PL_fold[*STRING(scan)])) + if (uc >= 0x100 || + !(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE)) + && !ANYOF_BITMAP_TEST(data->start_class, uc) + && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])) compat = 0; ANYOF_CLASS_ZERO(data->start_class); ANYOF_BITMAP_ZERO(data->start_class); if (compat) { - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; data->start_class->flags |= ANYOF_FOLD; if (OP(scan) == EXACTFL) @@ -660,7 +893,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (data->start_class->flags & ANYOF_FOLD) { /* false positive possible if the class is case-folded. Assume that the locale settings are the same... */ - ANYOF_BITMAP_SET(data->start_class, *STRING(scan)); + if (uc < 0x100) + ANYOF_BITMAP_SET(data->start_class, uc); data->start_class->flags &= ~ANYOF_EOS; } cl_and(data->start_class, &and_with); @@ -668,8 +902,8 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da flags &= ~SCF_DO_STCLASS; } else if (strchr((char*)PL_varies,OP(scan))) { - I32 mincount, maxcount, minnext, deltanext, pos_before, fl; - I32 f = flags; + I32 mincount, maxcount, minnext, deltanext, fl; + I32 f = flags, pos_before = 0; regnode *oscan = scan; struct regnode_charclass_class this_class; struct regnode_charclass_class *oclass = NULL; @@ -704,7 +938,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da is_inf = is_inf_internal = 1; scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { - scan_commit(data); /* Cannot extend fixed substrings */ + scan_commit(pRExC_state, data); /* Cannot extend fixed substrings */ data->longest = &(data->longest_float); } goto optimize_curly_tail; @@ -712,10 +946,15 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da mincount = ARG1(scan); maxcount = ARG2(scan); next = regnext(scan); + if (OP(scan) == CURLYX) { + I32 lp = (data ? *(data->last_closep) : 0); + + scan->flags = ((lp <= U8_MAX) ? lp : U8_MAX); + } scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS; do_curly: if (flags & SCF_DO_SUBSTR) { - if (mincount == 0) scan_commit(data); /* Cannot extend fixed substrings */ + if (mincount == 0) scan_commit(pRExC_state,data); /* Cannot extend fixed substrings */ pos_before = data->pos_min; } if (data) { @@ -725,15 +964,23 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->flags |= SF_IS_INF; } if (flags & SCF_DO_STCLASS) { - cl_init(&this_class); + cl_init(pRExC_state, &this_class); oclass = data->start_class; data->start_class = &this_class; f |= SCF_DO_STCLASS_AND; f &= ~SCF_DO_STCLASS_OR; } + /* These are the cases when once a subexpression + fails at a particular position, it cannot succeed + even after backtracking at the enclosing scope. + + XXXX what if minimal match and we are at the + initial run of {n,m}? */ + if ((mincount != maxcount - 1) && (maxcount != REG_INFTY)) + f &= ~SCF_WHILEM_VISITED_POS; /* This will finish on WHILEM, setting scan, or on NULL: */ - minnext = study_chunk(&scan, &deltanext, last, data, + minnext = study_chunk(pRExC_state, &scan, &deltanext, last, data, mincount == 0 ? (f & ~SCF_DO_SUBSTR) : f); @@ -741,7 +988,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->start_class = oclass; if (mincount == 0 || minnext == 0) { if (flags & SCF_DO_STCLASS_OR) { - cl_or(data->start_class, &this_class); + cl_or(pRExC_state, data->start_class, &this_class); } else if (flags & SCF_DO_STCLASS_AND) { /* Switch to OR mode: cache the old value of @@ -756,7 +1003,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } } else { /* Non-zero len */ if (flags & SCF_DO_STCLASS_OR) { - cl_or(data->start_class, &this_class); + cl_or(pRExC_state, data->start_class, &this_class); cl_and(data->start_class, &and_with); } else if (flags & SCF_DO_STCLASS_AND) @@ -765,15 +1012,18 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; - if (ckWARN(WARN_UNSAFE) && (minnext + deltanext == 0) + if (ckWARN(WARN_REGEXP) && (minnext + deltanext == 0) && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) && maxcount <= REG_INFTY/3) /* Complement check for big count */ - Perl_warner(aTHX_ WARN_UNSAFE, - "Strange *+?{} on zero-length expression"); + { + vWARN(RExC_parse, + "Quantifier unexpected on zero-length expression"); + } + min += minnext * mincount; - is_inf_internal |= (maxcount == REG_INFTY - && (minnext + deltanext) > 0 - || deltanext == I32_MAX); + is_inf_internal |= ((maxcount == REG_INFTY + && (minnext + deltanext) > 0) + || deltanext == I32_MAX); is_inf |= is_inf_internal; delta += (minnext + deltanext) * maxcount - minnext * mincount; @@ -832,7 +1082,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/ if (OP(nxt) != CLOSE) - FAIL("panic opt close"); + FAIL("Panic opt close"); oscan->flags = ARG(nxt); OP(nxt1) = OPTIMIZED; /* was OPEN. */ OP(nxt) = OPTIMIZED; /* was CLOSE. */ @@ -858,20 +1108,27 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } #endif /* Optimize again: */ - study_chunk(&nxt1, &deltanext, nxt, NULL, 0); + study_chunk(pRExC_state, &nxt1, &deltanext, nxt, + NULL, 0); } else oscan->flags = 0; } - else if (OP(oscan) == CURLYX && data && ++data->whilem_c < 16) { - /* This stays as CURLYX, and can put the count/of pair. */ + else if ((OP(oscan) == CURLYX) + && (flags & SCF_WHILEM_VISITED_POS) + /* See the comment on a similar expression above. + However, this time it not a subexpression + we care about, but the expression itself. */ + && (maxcount == REG_INFTY) + && data && ++data->whilem_c < 16) { + /* This stays as CURLYX, we can put the count/of pair. */ /* Find WHILEM (as in regexec.c) */ regnode *nxt = oscan + NEXT_OFF(oscan); if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */ nxt += ARG(nxt); PREVOPER(nxt)->flags = data->whilem_c - | (PL_reg_whilem_seen << 4); /* On WHILEM */ + | (RExC_whilem_seen << 4); /* On WHILEM */ } if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) pars++; @@ -905,6 +1162,11 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da sv_catsv(data->last_found, last_str); data->last_end += l * (mincount - 1); } + } else { + /* start offset must point into the last copy */ + data->last_start_min += minnext * (mincount - 1); + data->last_start_max += is_inf ? 0 : (maxcount - 1) + * (minnext + data->pos_delta); } } /* It is counted once already... */ @@ -914,7 +1176,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (mincount != maxcount) { /* Cannot extend fixed substrings found inside the group. */ - scan_commit(data); + scan_commit(pRExC_state,data); if (mincount && last_str) { sv_setsv(data->last_found, last_str); data->last_end = data->pos_min; @@ -940,21 +1202,21 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da continue; default: /* REF and CLUMP only? */ if (flags & SCF_DO_SUBSTR) { - scan_commit(data); /* Cannot expect anything... */ + scan_commit(pRExC_state,data); /* Cannot expect anything... */ data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); flags &= ~SCF_DO_STCLASS; break; } } - else if (strchr((char*)PL_simple,OP(scan)) || PL_regkind[(U8)OP(scan)] == ANYUTF8) { + else if (strchr((char*)PL_simple,OP(scan))) { int value; if (flags & SCF_DO_SUBSTR) { - scan_commit(data); + scan_commit(pRExC_state,data); data->pos_min++; } min++; @@ -964,25 +1226,12 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da /* Some of the logic below assumes that switching locale on will only add false positives. */ switch (PL_regkind[(U8)OP(scan)]) { - case ANYUTF8: case SANY: - case SANYUTF8: - case ALNUMUTF8: - case ANYOFUTF8: - case ALNUMLUTF8: - case NALNUMUTF8: - case NALNUMLUTF8: - case SPACEUTF8: - case NSPACEUTF8: - case SPACELUTF8: - case NSPACELUTF8: - case DIGITUTF8: - case NDIGITUTF8: default: do_default: /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */ if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); break; case REG_ANY: if (OP(scan) == SANY) @@ -990,7 +1239,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */ value = (ANYOF_BITMAP_TEST(data->start_class,'\n') || (data->start_class->flags & ANYOF_CLASS)); - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); } if (flags & SCF_DO_STCLASS_AND || !value) ANYOF_BITMAP_CLEAR(data->start_class,'\n'); @@ -1000,7 +1249,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da cl_and(data->start_class, (struct regnode_charclass_class*)scan); else - cl_or(data->start_class, + cl_or(pRExC_state, data->start_class, (struct regnode_charclass_class*)scan); break; case ALNUM: @@ -1173,29 +1422,35 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da && (scan->flags || data || (flags & SCF_DO_STCLASS)) && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) { /* Lookahead/lookbehind */ - I32 deltanext, minnext; + I32 deltanext, minnext, fake = 0; regnode *nscan; struct regnode_charclass_class intrnl; int f = 0; data_fake.flags = 0; - if (data) + if (data) { data_fake.whilem_c = data->whilem_c; + data_fake.last_closep = data->last_closep; + } + else + data_fake.last_closep = &fake; if ( flags & SCF_DO_STCLASS && !scan->flags && OP(scan) == IFMATCH ) { /* Lookahead */ - cl_init(&intrnl); + cl_init(pRExC_state, &intrnl); data_fake.start_class = &intrnl; - f = SCF_DO_STCLASS_AND; + f |= SCF_DO_STCLASS_AND; } + if (flags & SCF_WHILEM_VISITED_POS) + f |= SCF_WHILEM_VISITED_POS; next = regnext(scan); nscan = NEXTOPER(NEXTOPER(scan)); - minnext = study_chunk(&nscan, &deltanext, last, &data_fake, f); + minnext = study_chunk(pRExC_state, &nscan, &deltanext, last, &data_fake, f); if (scan->flags) { if (deltanext) { - FAIL("variable length lookbehind not implemented"); + vFAIL("Variable length lookbehind not implemented"); } else if (minnext > U8_MAX) { - FAIL2("lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); + vFAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX); } scan->flags = minnext; } @@ -1205,7 +1460,7 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da data->flags |= SF_HAS_EVAL; if (data) data->whilem_c = data_fake.whilem_c; - if (f) { + if (f & SCF_DO_STCLASS_AND) { int was = (data->start_class->flags & ANYOF_EOS); cl_and(data->start_class, &intrnl); @@ -1216,24 +1471,29 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da else if (OP(scan) == OPEN) { pars++; } - else if (OP(scan) == CLOSE && ARG(scan) == is_par) { - next = regnext(scan); + else if (OP(scan) == CLOSE) { + if (ARG(scan) == is_par) { + next = regnext(scan); - if ( next && (OP(next) != WHILEM) && next < last) - is_par = 0; /* Disable optimization */ + if ( next && (OP(next) != WHILEM) && next < last) + is_par = 0; /* Disable optimization */ + } + if (data) + *(data->last_closep) = ARG(scan); } else if (OP(scan) == EVAL) { if (data) data->flags |= SF_HAS_EVAL; } - else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded */ + else if (OP(scan) == LOGICAL && scan->flags == 2) { /* Embedded follows */ if (flags & SCF_DO_SUBSTR) { - scan_commit(data); + scan_commit(pRExC_state,data); data->longest = &(data->longest_float); } is_inf = is_inf_internal = 1; if (flags & SCF_DO_STCLASS_OR) /* Allow everything */ - cl_anything(data->start_class); + cl_anything(pRExC_state, data->start_class); + flags &= ~SCF_DO_STCLASS; } /* Else: zero-length, ignore. */ scan = regnext(scan); @@ -1260,30 +1520,28 @@ S_study_chunk(pTHX_ regnode **scanp, I32 *deltap, regnode *last, scan_data_t *da } STATIC I32 -S_add_data(pTHX_ I32 n, char *s) +S_add_data(pTHX_ RExC_state_t *pRExC_state, I32 n, char *s) { - dTHR; - if (PL_regcomp_rx->data) { - Renewc(PL_regcomp_rx->data, - sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (PL_regcomp_rx->data->count + n - 1), + if (RExC_rx->data) { + Renewc(RExC_rx->data, + sizeof(*RExC_rx->data) + sizeof(void*) * (RExC_rx->data->count + n - 1), char, struct reg_data); - Renew(PL_regcomp_rx->data->what, PL_regcomp_rx->data->count + n, U8); - PL_regcomp_rx->data->count += n; + Renew(RExC_rx->data->what, RExC_rx->data->count + n, U8); + RExC_rx->data->count += n; } else { - Newc(1207, PL_regcomp_rx->data, sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (n - 1), + Newc(1207, RExC_rx->data, sizeof(*RExC_rx->data) + sizeof(void*) * (n - 1), char, struct reg_data); - New(1208, PL_regcomp_rx->data->what, n, U8); - PL_regcomp_rx->data->count = n; + New(1208, RExC_rx->data->what, n, U8); + RExC_rx->data->count = n; } - Copy(s, PL_regcomp_rx->data->what + PL_regcomp_rx->data->count - n, n, U8); - return PL_regcomp_rx->data->count - n; + Copy(s, RExC_rx->data->what + RExC_rx->data->count - n, n, U8); + return RExC_rx->data->count - n; } void Perl_reginitcolors(pTHX) { - dTHR; int i = 0; char *s = PerlEnv_getenv("PERL_RE_COLORS"); @@ -1305,6 +1563,7 @@ Perl_reginitcolors(pTHX) PL_colorset = 1; } + /* - pregcomp - compile a regular expression into internal code * @@ -1323,103 +1582,110 @@ Perl_reginitcolors(pTHX) regexp * Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) { - dTHR; register regexp *r; regnode *scan; - SV **longest; - SV *longest_fixed; - SV *longest_float; regnode *first; I32 flags; I32 minlen = 0; I32 sawplus = 0; I32 sawopen = 0; scan_data_t data; + RExC_state_t RExC_state; + RExC_state_t *pRExC_state = &RExC_state; if (exp == NULL) FAIL("NULL regexp argument"); - if (PL_curcop == &PL_compiling ? (PL_hints & HINT_UTF8) : IN_UTF8) - PL_reg_flags |= RF_utf8; + /* XXXX This looks very suspicious... */ + if (pm->op_pmdynflags & PMdf_CMP_UTF8) + RExC_utf8 = 1; else - PL_reg_flags = 0; + RExC_utf8 = 0; - PL_regprecomp = savepvn(exp, xend - exp); + RExC_precomp = savepvn(exp, xend - exp); DEBUG_r(if (!PL_colorset) reginitcolors()); DEBUG_r(PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n", PL_colors[4],PL_colors[5],PL_colors[0], - (int)(xend - exp), PL_regprecomp, PL_colors[1])); - PL_regflags = pm->op_pmflags; - PL_regsawback = 0; + (int)(xend - exp), RExC_precomp, PL_colors[1])); + RExC_flags16 = pm->op_pmflags; + RExC_sawback = 0; - PL_regseen = 0; - PL_seen_zerolen = *exp == '^' ? -1 : 0; - PL_seen_evals = 0; - PL_extralen = 0; + RExC_seen = 0; + RExC_seen_zerolen = *exp == '^' ? -1 : 0; + RExC_seen_evals = 0; + RExC_extralen = 0; /* First pass: determine size, legality. */ - PL_regcomp_parse = exp; - PL_regxend = xend; - PL_regnaughty = 0; - PL_regnpar = 1; - PL_regsize = 0L; - PL_regcode = &PL_regdummy; - PL_reg_whilem_seen = 0; - REGC((U8)REG_MAGIC, (char*)PL_regcode); - if (reg(0, &flags) == NULL) { - Safefree(PL_regprecomp); - PL_regprecomp = Nullch; + RExC_parse = exp; + RExC_end = xend; + RExC_naughty = 0; + RExC_npar = 1; + RExC_size = 0L; + RExC_emit = &PL_regdummy; + RExC_whilem_seen = 0; +#if 0 /* REGC() is (currently) a NOP at the first pass. + * Clever compilers notice this and complain. --jhi */ + REGC((U8)REG_MAGIC, (char*)RExC_emit); +#endif + if (reg(pRExC_state, 0, &flags) == NULL) { + Safefree(RExC_precomp); + RExC_precomp = Nullch; return(NULL); } - DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)PL_regsize)); + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size)); /* Small enough for pointer-storage convention? If extralen==0, this means that we will not need long jumps. */ - if (PL_regsize >= 0x10000L && PL_extralen) - PL_regsize += PL_extralen; + if (RExC_size >= 0x10000L && RExC_extralen) + RExC_size += RExC_extralen; else - PL_extralen = 0; - if (PL_reg_whilem_seen > 15) - PL_reg_whilem_seen = 15; + RExC_extralen = 0; + if (RExC_whilem_seen > 15) + RExC_whilem_seen = 15; /* Allocate space and initialize. */ - Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), + Newc(1001, r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char, regexp); if (r == NULL) - FAIL("regexp out of space"); + FAIL("Regexp out of space"); + +#ifdef DEBUGGING + /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */ + Zero(r, sizeof(regexp) + (unsigned)RExC_size * sizeof(regnode), char); +#endif r->refcnt = 1; r->prelen = xend - exp; - r->precomp = PL_regprecomp; + r->precomp = RExC_precomp; r->subbeg = NULL; r->reganch = pm->op_pmflags & PMf_COMPILETIME; - r->nparens = PL_regnpar - 1; /* set early to validate backrefs */ + r->nparens = RExC_npar - 1; /* set early to validate backrefs */ r->substrs = 0; /* Useful during FAIL. */ r->startp = 0; /* Useful during FAIL. */ r->endp = 0; /* Useful during FAIL. */ - PL_regcomp_rx = r; + RExC_rx = r; /* Second pass: emit code. */ - PL_regcomp_parse = exp; - PL_regxend = xend; - PL_regnaughty = 0; - PL_regnpar = 1; - PL_regcode = r->program; + RExC_parse = exp; + RExC_end = xend; + RExC_naughty = 0; + RExC_npar = 1; + RExC_emit = r->program; /* Store the count of eval-groups for security checks: */ - PL_regcode->next_off = ((PL_seen_evals > U16_MAX) ? U16_MAX : PL_seen_evals); - REGC((U8)REG_MAGIC, (char*) PL_regcode++); + RExC_emit->next_off = ((RExC_seen_evals > U16_MAX) ? U16_MAX : RExC_seen_evals); + REGC((U8)REG_MAGIC, (char*) RExC_emit++); r->data = 0; - if (reg(0, &flags) == NULL) + if (reg(pRExC_state, 0, &flags) == NULL) return(NULL); /* Dig out information for optimizations. */ r->reganch = pm->op_pmflags & PMf_COMPILETIME; /* Again? */ - pm->op_pmflags = PL_regflags; + pm->op_pmflags = RExC_flags16; if (UTF) r->reganch |= ROPT_UTF8; r->regstclass = NULL; - if (PL_regnaughty >= 10) /* Probably an expensive pattern. */ + if (RExC_naughty >= 10) /* Probably an expensive pattern. */ r->reganch |= ROPT_NAUGHTY; scan = r->program + 1; /* First BRANCH. */ @@ -1434,6 +1700,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) STRLEN longest_float_length, longest_fixed_length; struct regnode_charclass_class ch_class; int stclass_flag; + I32 last_close = 0; first = scan; /* Skip introductions and multiplicators >= 1. */ @@ -1454,9 +1721,9 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* Starting-point info. */ again: if (PL_regkind[(U8)OP(first)] == EXACT) { - if (OP(first) == EXACT); /* Empty, get anchored substr later. */ - else if ((OP(first) == EXACTF || OP(first) == EXACTFL) - && !UTF) + if (OP(first) == EXACT) + ; /* Empty, get anchored substr later. */ + else if ((OP(first) == EXACTF || OP(first) == EXACTFL)) r->regstclass = first; } else if (strchr((char*)PL_simple,OP(first))) @@ -1485,7 +1752,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* turn .* into ^.* with an implied $*=1 */ int type = OP(NEXTOPER(first)); - if (type == REG_ANY || type == ANYUTF8) + if (type == REG_ANY) type = ROPT_ANCH_MBOL; else type = ROPT_ANCH_SBOL; @@ -1494,8 +1761,8 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) first = NEXTOPER(first); goto again; } - if (sawplus && (!sawopen || !PL_regsawback) - && !(PL_regseen & REG_SEEN_EVAL)) /* May examine pos and $& */ + if (sawplus && (!sawopen || !RExC_sawback) + && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */ /* x+ must match at the 1st pos of run of x's */ r->reganch |= ROPT_SKIP; @@ -1521,27 +1788,28 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) data.longest = &(data.longest_fixed); first = scan; if (!r->regstclass) { - cl_init(&ch_class); + cl_init(pRExC_state, &ch_class); data.start_class = &ch_class; stclass_flag = SCF_DO_STCLASS_AND; } else /* XXXX Check for BOUND? */ stclass_flag = 0; + data.last_closep = &last_close; - minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */ - &data, SCF_DO_SUBSTR | stclass_flag); - if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed) + minlen = study_chunk(pRExC_state, &first, &fake, scan + RExC_size, /* Up to end */ + &data, SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag); + if ( RExC_npar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 - && !PL_seen_zerolen - && (!(PL_regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) + && !RExC_seen_zerolen + && (!(RExC_seen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) r->reganch |= ROPT_CHECK_ALL; - scan_commit(&data); + scan_commit(pRExC_state, &data); SvREFCNT_dec(data.last_found); longest_float_length = CHR_SVLEN(data.longest_float); if (longest_float_length || (data.flags & SF_FL_BEFORE_EOL && (!(data.flags & SF_FL_BEFORE_MEOL) - || (PL_regflags & PMf_MULTILINE)))) { + || (RExC_flags16 & PMf_MULTILINE)))) { int t; if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */ @@ -1554,7 +1822,7 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) r->float_max_offset = data.offset_float_max; t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FL_BEFORE_MEOL) - || (PL_regflags & PMf_MULTILINE))); + || (RExC_flags16 & PMf_MULTILINE))); fbm_compile(r->float_substr, t ? FBMcf_TAIL : 0); } else { @@ -1568,14 +1836,14 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) if (longest_fixed_length || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (PL_regflags & PMf_MULTILINE)))) { + || (RExC_flags16 & PMf_MULTILINE)))) { int t; r->anchored_substr = data.longest_fixed; r->anchored_offset = data.offset_fixed; t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (PL_regflags & PMf_MULTILINE))); + || (RExC_flags16 & PMf_MULTILINE))); fbm_compile(r->anchored_substr, t ? FBMcf_TAIL : 0); } else { @@ -1584,25 +1852,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) longest_fixed_length = 0; } if (r->regstclass - && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == ANYUTF8 - || OP(r->regstclass) == SANYUTF8 || OP(r->regstclass) == SANY)) + && (OP(r->regstclass) == REG_ANY || OP(r->regstclass) == SANY)) r->regstclass = NULL; if ((!r->anchored_substr || r->anchored_offset) && stclass_flag && !(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { SV *sv; - I32 n = add_data(1, "f"); + I32 n = add_data(pRExC_state, 1, "f"); - New(1006, PL_regcomp_rx->data->data[n], 1, + New(1006, RExC_rx->data->data[n], 1, struct regnode_charclass_class); StructCopy(data.start_class, - (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n], + (struct regnode_charclass_class*)RExC_rx->data->data[n], struct regnode_charclass_class); - r->regstclass = (regnode*)PL_regcomp_rx->data->data[n]; + r->regstclass = (regnode*)RExC_rx->data->data[n]; r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ + PL_regdata = r->data; /* for regprop() */ DEBUG_r((sv = sv_newmortal(), regprop(sv, (regnode*)data.start_class), - PerlIO_printf(Perl_debug_log, "synthetic stclass.\n", + PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", SvPVX(sv)))); } @@ -1630,41 +1898,44 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) /* Several toplevels. Best we can is to set minlen. */ I32 fake; struct regnode_charclass_class ch_class; + I32 last_close = 0; DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); scan = r->program + 1; - cl_init(&ch_class); + cl_init(pRExC_state, &ch_class); data.start_class = &ch_class; - minlen = study_chunk(&scan, &fake, scan + PL_regsize, &data, SCF_DO_STCLASS_AND); + data.last_closep = &last_close; + minlen = study_chunk(pRExC_state, &scan, &fake, scan + RExC_size, &data, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS); r->check_substr = r->anchored_substr = r->float_substr = Nullsv; if (!(data.start_class->flags & ANYOF_EOS) && !cl_is_anything(data.start_class)) { SV *sv; - I32 n = add_data(1, "f"); + I32 n = add_data(pRExC_state, 1, "f"); - New(1006, PL_regcomp_rx->data->data[n], 1, + New(1006, RExC_rx->data->data[n], 1, struct regnode_charclass_class); StructCopy(data.start_class, - (struct regnode_charclass_class*)PL_regcomp_rx->data->data[n], + (struct regnode_charclass_class*)RExC_rx->data->data[n], struct regnode_charclass_class); - r->regstclass = (regnode*)PL_regcomp_rx->data->data[n]; + r->regstclass = (regnode*)RExC_rx->data->data[n]; r->reganch &= ~ROPT_SKIP; /* Used in find_byclass(). */ DEBUG_r((sv = sv_newmortal(), regprop(sv, (regnode*)data.start_class), - PerlIO_printf(Perl_debug_log, "synthetic stclass.\n", + PerlIO_printf(Perl_debug_log, "synthetic stclass `%s'.\n", SvPVX(sv)))); } } r->minlen = minlen; - if (PL_regseen & REG_SEEN_GPOS) + if (RExC_seen & REG_SEEN_GPOS) r->reganch |= ROPT_GPOS_SEEN; - if (PL_regseen & REG_SEEN_LOOKBEHIND) + if (RExC_seen & REG_SEEN_LOOKBEHIND) r->reganch |= ROPT_LOOKBEHIND_SEEN; - if (PL_regseen & REG_SEEN_EVAL) + if (RExC_seen & REG_SEEN_EVAL) r->reganch |= ROPT_EVAL_SEEN; - Newz(1002, r->startp, PL_regnpar, I32); - Newz(1002, r->endp, PL_regnpar, I32); + Newz(1002, r->startp, RExC_npar, I32); + Newz(1002, r->endp, RExC_npar, I32); + PL_regdata = r->data; /* for regprop() */ DEBUG_r(regdump(r)); return(r); } @@ -1679,100 +1950,110 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) * follows makes it hard to avoid. */ STATIC regnode * -S_reg(pTHX_ I32 paren, I32 *flagp) +S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { - dTHR; register regnode *ret; /* Will be the head of the group. */ register regnode *br; register regnode *lastbr; register regnode *ender = 0; register I32 parno = 0; - I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0; + I32 flags, oregflags = RExC_flags16, have_branch = 0, open = 0; + char *oregcomp_parse = RExC_parse; char c; *flagp = 0; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { - if (*PL_regcomp_parse == '?') { + if (*RExC_parse == '?') { U16 posflags = 0, negflags = 0; U16 *flagsp = &posflags; int logical = 0; + char *seqstart = RExC_parse; - PL_regcomp_parse++; - paren = *PL_regcomp_parse++; + RExC_parse++; + paren = *RExC_parse++; ret = NULL; /* For look-ahead/behind. */ switch (paren) { case '<': - PL_regseen |= REG_SEEN_LOOKBEHIND; - if (*PL_regcomp_parse == '!') + RExC_seen |= REG_SEEN_LOOKBEHIND; + if (*RExC_parse == '!') paren = ','; - if (*PL_regcomp_parse != '=' && *PL_regcomp_parse != '!') + if (*RExC_parse != '=' && *RExC_parse != '!') goto unknown; - PL_regcomp_parse++; + RExC_parse++; case '=': case '!': - PL_seen_zerolen++; + RExC_seen_zerolen++; case ':': case '>': break; case '$': case '@': - FAIL2("Sequence (?%c...) not implemented", (int)paren); + vFAIL2("Sequence (?%c...) not implemented", (int)paren); break; case '#': - while (*PL_regcomp_parse && *PL_regcomp_parse != ')') - PL_regcomp_parse++; - if (*PL_regcomp_parse != ')') + while (*RExC_parse && *RExC_parse != ')') + RExC_parse++; + if (*RExC_parse != ')') FAIL("Sequence (?#... not terminated"); - nextchar(); + nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; case 'p': + if (SIZE_ONLY) + vWARN(RExC_parse, "(?p{}) is deprecated - use (??{})"); + /* FALL THROUGH*/ + case '?': logical = 1; - paren = *PL_regcomp_parse++; + paren = *RExC_parse++; /* FALL THROUGH */ case '{': { - dTHR; I32 count = 1, n = 0; char c; - char *s = PL_regcomp_parse; + char *s = RExC_parse; SV *sv; OP_4tree *sop, *rop; - PL_seen_zerolen++; - PL_regseen |= REG_SEEN_EVAL; - while (count && (c = *PL_regcomp_parse)) { - if (c == '\\' && PL_regcomp_parse[1]) - PL_regcomp_parse++; + RExC_seen_zerolen++; + RExC_seen |= REG_SEEN_EVAL; + while (count && (c = *RExC_parse)) { + if (c == '\\' && RExC_parse[1]) + RExC_parse++; else if (c == '{') count++; else if (c == '}') count--; - PL_regcomp_parse++; + RExC_parse++; + } + if (*RExC_parse != ')') + { + RExC_parse = s; + vFAIL("Sequence (?{...}) not terminated or not {}-balanced"); } - if (*PL_regcomp_parse != ')') - FAIL("Sequence (?{...}) not terminated or not {}-balanced"); if (!SIZE_ONLY) { AV *av; - if (PL_regcomp_parse - 1 - s) - sv = newSVpvn(s, PL_regcomp_parse - 1 - s); + if (RExC_parse - 1 - s) + sv = newSVpvn(s, RExC_parse - 1 - s); else sv = newSVpvn("", 0); + ENTER; + Perl_save_re_context(aTHX); rop = sv_compile_2op(sv, &sop, "re", &av); + LEAVE; - n = add_data(3, "nop"); - PL_regcomp_rx->data->data[n] = (void*)rop; - PL_regcomp_rx->data->data[n+1] = (void*)sop; - PL_regcomp_rx->data->data[n+2] = (void*)av; + n = add_data(pRExC_state, 3, "nop"); + RExC_rx->data->data[n] = (void*)rop; + RExC_rx->data->data[n+1] = (void*)sop; + RExC_rx->data->data[n+2] = (void*)av; SvREFCNT_dec(sv); } else { /* First pass */ - if (PL_reginterp_cnt < ++PL_seen_evals + if (PL_reginterp_cnt < ++RExC_seen_evals && PL_curcop != &PL_compiling) /* No compiled RE interpolated, has runtime components ===> unsafe. */ @@ -1781,110 +2062,113 @@ S_reg(pTHX_ I32 paren, I32 *flagp) FAIL("Eval-group in insecure regular expression"); } - nextchar(); + nextchar(pRExC_state); if (logical) { - ret = reg_node(LOGICAL); + ret = reg_node(pRExC_state, LOGICAL); if (!SIZE_ONLY) ret->flags = 2; - regtail(ret, reganode(EVAL, n)); + regtail(pRExC_state, ret, reganode(pRExC_state, EVAL, n)); return ret; } - return reganode(EVAL, n); + return reganode(pRExC_state, EVAL, n); } case '(': { - if (PL_regcomp_parse[0] == '?') { - if (PL_regcomp_parse[1] == '=' || PL_regcomp_parse[1] == '!' - || PL_regcomp_parse[1] == '<' - || PL_regcomp_parse[1] == '{') { /* Lookahead or eval. */ + if (RExC_parse[0] == '?') { + if (RExC_parse[1] == '=' || RExC_parse[1] == '!' + || RExC_parse[1] == '<' + || RExC_parse[1] == '{') { /* Lookahead or eval. */ I32 flag; - ret = reg_node(LOGICAL); + ret = reg_node(pRExC_state, LOGICAL); if (!SIZE_ONLY) ret->flags = 1; - regtail(ret, reg(1, &flag)); + regtail(pRExC_state, ret, reg(pRExC_state, 1, &flag)); goto insert_if; } } - else if (PL_regcomp_parse[0] >= '1' && PL_regcomp_parse[0] <= '9' ) { - parno = atoi(PL_regcomp_parse++); - - while (isDIGIT(*PL_regcomp_parse)) - PL_regcomp_parse++; - ret = reganode(GROUPP, parno); - if ((c = *nextchar()) != ')') - FAIL2("Switch (?(number%c not recognized", c); + else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) { + parno = atoi(RExC_parse++); + + while (isDIGIT(*RExC_parse)) + RExC_parse++; + ret = reganode(pRExC_state, GROUPP, parno); + if ((c = *nextchar(pRExC_state)) != ')') + vFAIL("Switch condition not recognized"); insert_if: - regtail(ret, reganode(IFTHEN, 0)); - br = regbranch(&flags, 1); + regtail(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0)); + br = regbranch(pRExC_state, &flags, 1); if (br == NULL) - br = reganode(LONGJMP, 0); + br = reganode(pRExC_state, LONGJMP, 0); else - regtail(br, reganode(LONGJMP, 0)); - c = *nextchar(); + regtail(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0)); + c = *nextchar(pRExC_state); if (flags&HASWIDTH) *flagp |= HASWIDTH; if (c == '|') { - lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */ - regbranch(&flags, 1); - regtail(ret, lastbr); + lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */ + regbranch(pRExC_state, &flags, 1); + regtail(pRExC_state, ret, lastbr); if (flags&HASWIDTH) *flagp |= HASWIDTH; - c = *nextchar(); + c = *nextchar(pRExC_state); } else lastbr = NULL; if (c != ')') - FAIL("Switch (?(condition)... contains too many branches"); - ender = reg_node(TAIL); - regtail(br, ender); + vFAIL("Switch (?(condition)... contains too many branches"); + ender = reg_node(pRExC_state, TAIL); + regtail(pRExC_state, br, ender); if (lastbr) { - regtail(lastbr, ender); - regtail(NEXTOPER(NEXTOPER(lastbr)), ender); + regtail(pRExC_state, lastbr, ender); + regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); } else - regtail(ret, ender); + regtail(pRExC_state, ret, ender); return ret; } else { - FAIL2("Unknown condition for (?(%.2s", PL_regcomp_parse); + vFAIL2("Unknown switch condition (?(%.2s", RExC_parse); } } case 0: - FAIL("Sequence (? incomplete"); + RExC_parse--; /* for vFAIL to print correctly */ + vFAIL("Sequence (? incomplete"); break; default: - --PL_regcomp_parse; + --RExC_parse; parse_flags: - while (*PL_regcomp_parse && strchr("iogcmsx", *PL_regcomp_parse)) { - if (*PL_regcomp_parse != 'o') - pmflag(flagsp, *PL_regcomp_parse); - ++PL_regcomp_parse; + while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { + if (*RExC_parse != 'o') + pmflag(flagsp, *RExC_parse); + ++RExC_parse; } - if (*PL_regcomp_parse == '-') { + if (*RExC_parse == '-') { flagsp = &negflags; - ++PL_regcomp_parse; + ++RExC_parse; goto parse_flags; } - PL_regflags |= posflags; - PL_regflags &= ~negflags; - if (*PL_regcomp_parse == ':') { - PL_regcomp_parse++; + RExC_flags16 |= posflags; + RExC_flags16 &= ~negflags; + if (*RExC_parse == ':') { + RExC_parse++; paren = ':'; break; } unknown: - if (*PL_regcomp_parse != ')') - FAIL2("Sequence (?%c...) not recognized", *PL_regcomp_parse); - nextchar(); + if (*RExC_parse != ')') { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + } + nextchar(pRExC_state); *flagp = TRYAGAIN; return NULL; } } else { - parno = PL_regnpar; - PL_regnpar++; - ret = reganode(OPEN, parno); + parno = RExC_npar; + RExC_npar++; + ret = reganode(pRExC_state, OPEN, parno); open = 1; } } @@ -1892,24 +2176,24 @@ S_reg(pTHX_ I32 paren, I32 *flagp) ret = NULL; /* Pick up the branches, linking them together. */ - br = regbranch(&flags, 1); + br = regbranch(pRExC_state, &flags, 1); if (br == NULL) return(NULL); - if (*PL_regcomp_parse == '|') { - if (!SIZE_ONLY && PL_extralen) { - reginsert(BRANCHJ, br); + if (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, BRANCHJ, br); } else - reginsert(BRANCH, br); + reginsert(pRExC_state, BRANCH, br); have_branch = 1; if (SIZE_ONLY) - PL_extralen += 1; /* For BRANCHJ-BRANCH. */ + RExC_extralen += 1; /* For BRANCHJ-BRANCH. */ } else if (paren == ':') { *flagp |= flags&SIMPLE; } if (open) { /* Starts with OPEN. */ - regtail(ret, br); /* OPEN -> first. */ + regtail(pRExC_state, ret, br); /* OPEN -> first. */ } else if (paren != '?') /* Not Conditional */ ret = br; @@ -1917,18 +2201,18 @@ S_reg(pTHX_ I32 paren, I32 *flagp) *flagp |= HASWIDTH; *flagp |= flags&SPSTART; lastbr = br; - while (*PL_regcomp_parse == '|') { - if (!SIZE_ONLY && PL_extralen) { - ender = reganode(LONGJMP,0); - regtail(NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ + while (*RExC_parse == '|') { + if (!SIZE_ONLY && RExC_extralen) { + ender = reganode(pRExC_state, LONGJMP,0); + regtail(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ } if (SIZE_ONLY) - PL_extralen += 2; /* Account for LONGJMP. */ - nextchar(); - br = regbranch(&flags, 0); + RExC_extralen += 2; /* Account for LONGJMP. */ + nextchar(pRExC_state); + br = regbranch(pRExC_state, &flags, 0); if (br == NULL) return(NULL); - regtail(lastbr, br); /* BRANCH -> BRANCH. */ + regtail(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; if (flags&HASWIDTH) *flagp |= HASWIDTH; @@ -1939,10 +2223,10 @@ S_reg(pTHX_ I32 paren, I32 *flagp) /* Make a closing node, and hook it on the end. */ switch (paren) { case ':': - ender = reg_node(TAIL); + ender = reg_node(pRExC_state, TAIL); break; case 1: - ender = reganode(CLOSE, parno); + ender = reganode(pRExC_state, CLOSE, parno); break; case '<': case ',': @@ -1951,18 +2235,18 @@ S_reg(pTHX_ I32 paren, I32 *flagp) *flagp &= ~HASWIDTH; /* FALL THROUGH */ case '>': - ender = reg_node(SUCCEED); + ender = reg_node(pRExC_state, SUCCEED); break; case 0: - ender = reg_node(END); + ender = reg_node(pRExC_state, END); break; } - regtail(lastbr, ender); + regtail(pRExC_state, lastbr, ender); if (have_branch) { /* Hook the tails of the branches to the closing node. */ for (br = ret; br != NULL; br = regnext(br)) { - regoptail(br, ender); + regoptail(pRExC_state, br, ender); } } } @@ -1977,25 +2261,27 @@ S_reg(pTHX_ I32 paren, I32 *flagp) if (paren == '>') node = SUSPEND, flag = 0; - reginsert(node,ret); + reginsert(pRExC_state, node,ret); ret->flags = flag; - regtail(ret, reg_node(TAIL)); + regtail(pRExC_state, ret, reg_node(pRExC_state, TAIL)); } } /* Check for proper termination. */ if (paren) { - PL_regflags = oregflags; - if (PL_regcomp_parse >= PL_regxend || *nextchar() != ')') { - FAIL("unmatched () in regexp"); + RExC_flags16 = oregflags; + if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ("); } } - else if (!paren && PL_regcomp_parse < PL_regxend) { - if (*PL_regcomp_parse == ')') { - FAIL("unmatched () in regexp"); + else if (!paren && RExC_parse < RExC_end) { + if (*RExC_parse == ')') { + RExC_parse++; + vFAIL("Unmatched )"); } else - FAIL("junk on end of regexp"); /* "Can't happen". */ + FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } @@ -2008,9 +2294,8 @@ S_reg(pTHX_ I32 paren, I32 *flagp) * Implements the concatenation operator. */ STATIC regnode * -S_regbranch(pTHX_ I32 *flagp, I32 first) +S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first) { - dTHR; register regnode *ret; register regnode *chain = NULL; register regnode *latest; @@ -2019,22 +2304,22 @@ S_regbranch(pTHX_ I32 *flagp, I32 first) if (first) ret = NULL; else { - if (!SIZE_ONLY && PL_extralen) - ret = reganode(BRANCHJ,0); + if (!SIZE_ONLY && RExC_extralen) + ret = reganode(pRExC_state, BRANCHJ,0); else - ret = reg_node(BRANCH); + ret = reg_node(pRExC_state, BRANCH); } if (!first && SIZE_ONLY) - PL_extralen += 1; /* BRANCHJ */ + RExC_extralen += 1; /* BRANCHJ */ *flagp = WORST; /* Tentatively. */ - PL_regcomp_parse--; - nextchar(); - while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '|' && *PL_regcomp_parse != ')') { + RExC_parse--; + nextchar(pRExC_state); + while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') { flags &= ~TRYAGAIN; - latest = regpiece(&flags); + latest = regpiece(pRExC_state, &flags); if (latest == NULL) { if (flags & TRYAGAIN) continue; @@ -2046,14 +2331,14 @@ S_regbranch(pTHX_ I32 *flagp, I32 first) if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { - PL_regnaughty++; - regtail(chain, latest); + RExC_naughty++; + regtail(pRExC_state, chain, latest); } chain = latest; c++; } if (chain == NULL) { /* Loop ran zero times. */ - chain = reg_node(NOTHING); + chain = reg_node(pRExC_state, NOTHING); if (ret == NULL) ret = chain; } @@ -2074,29 +2359,28 @@ S_regbranch(pTHX_ I32 *flagp, I32 first) * endmarker role is not redundant. */ STATIC regnode * -S_regpiece(pTHX_ I32 *flagp) +S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) { - dTHR; register regnode *ret; register char op; register char *next; I32 flags; - char *origparse = PL_regcomp_parse; + char *origparse = RExC_parse; char *maxpos; I32 min; I32 max = REG_INFTY; - ret = regatom(&flags); + ret = regatom(pRExC_state, &flags); if (ret == NULL) { if (flags & TRYAGAIN) *flagp |= TRYAGAIN; return(NULL); } - op = *PL_regcomp_parse; + op = *RExC_parse; - if (op == '{' && regcurly(PL_regcomp_parse)) { - next = PL_regcomp_parse + 1; + if (op == '{' && regcurly(RExC_parse)) { + next = RExC_parse + 1; maxpos = Nullch; while (isDIGIT(*next) || *next == ',') { if (*next == ',') { @@ -2110,42 +2394,42 @@ S_regpiece(pTHX_ I32 *flagp) if (*next == '}') { /* got one */ if (!maxpos) maxpos = next; - PL_regcomp_parse++; - min = atoi(PL_regcomp_parse); + RExC_parse++; + min = atoi(RExC_parse); if (*maxpos == ',') maxpos++; else - maxpos = PL_regcomp_parse; + maxpos = RExC_parse; max = atoi(maxpos); if (!max && *maxpos != '0') max = REG_INFTY; /* meaning "infinity" */ else if (max >= REG_INFTY) - FAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); - PL_regcomp_parse = next; - nextchar(); + vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); + RExC_parse = next; + nextchar(pRExC_state); do_curly: if ((flags&SIMPLE)) { - PL_regnaughty += 2 + PL_regnaughty / 2; - reginsert(CURLY, ret); + RExC_naughty += 2 + RExC_naughty / 2; + reginsert(pRExC_state, CURLY, ret); } else { - regnode *w = reg_node(WHILEM); + regnode *w = reg_node(pRExC_state, WHILEM); w->flags = 0; - regtail(ret, w); - if (!SIZE_ONLY && PL_extralen) { - reginsert(LONGJMP,ret); - reginsert(NOTHING,ret); + regtail(pRExC_state, ret, w); + if (!SIZE_ONLY && RExC_extralen) { + reginsert(pRExC_state, LONGJMP,ret); + reginsert(pRExC_state, NOTHING,ret); NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ } - reginsert(CURLYX,ret); - if (!SIZE_ONLY && PL_extralen) + reginsert(pRExC_state, CURLYX,ret); + if (!SIZE_ONLY && RExC_extralen) NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ - regtail(ret, reg_node(NOTHING)); + regtail(pRExC_state, ret, reg_node(pRExC_state, NOTHING)); if (SIZE_ONLY) - PL_reg_whilem_seen++, PL_extralen += 3; - PL_regnaughty += 4 + PL_regnaughty; /* compound interest */ + RExC_whilem_seen++, RExC_extralen += 3; + RExC_naughty += 4 + RExC_naughty; /* compound interest */ } ret->flags = 0; @@ -2154,7 +2438,7 @@ S_regpiece(pTHX_ I32 *flagp) if (max > 0) *flagp |= HASWIDTH; if (max && max < min) - FAIL("Can't do {n,m} with n > m"); + vFAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { ARG1_SET(ret, min); ARG2_SET(ret, max); @@ -2170,27 +2454,38 @@ S_regpiece(pTHX_ I32 *flagp) } #if 0 /* Now runtime fix should be reliable. */ + + /* if this is reinstated, don't forget to put this back into perldiag: + + =item Regexp *+ operand could be empty at {#} in regex m/%s/ + + (F) The part of the regexp subject to either the * or + quantifier + could match an empty string. The {#} shows in the regular + expression about where the problem was discovered. + + */ + if (!(flags&HASWIDTH) && op != '?') - FAIL("regexp *+ operand could be empty"); + vFAIL("Regexp *+ operand could be empty"); #endif - nextchar(); + nextchar(pRExC_state); *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); if (op == '*' && (flags&SIMPLE)) { - reginsert(STAR, ret); + reginsert(pRExC_state, STAR, ret); ret->flags = 0; - PL_regnaughty += 4; + RExC_naughty += 4; } else if (op == '*') { min = 0; goto do_curly; } else if (op == '+' && (flags&SIMPLE)) { - reginsert(PLUS, ret); + reginsert(pRExC_state, PLUS, ret); ret->flags = 0; - PL_regnaughty += 3; + RExC_naughty += 3; } else if (op == '+') { min = 1; @@ -2201,18 +2496,22 @@ S_regpiece(pTHX_ I32 *flagp) goto do_curly; } nest_check: - if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { - Perl_warner(aTHX_ WARN_UNSAFE, "%.*s matches null string many times", - PL_regcomp_parse - origparse, origparse); + if (ckWARN(WARN_REGEXP) && !SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3) { + vWARN3(RExC_parse, + "%.*s matches null string many times", + RExC_parse - origparse, + origparse); } - if (*PL_regcomp_parse == '?') { - nextchar(); - reginsert(MINMOD, ret); - regtail(ret, ret + NODE_STEP_REGNODE); + if (*RExC_parse == '?') { + nextchar(pRExC_state); + reginsert(pRExC_state, MINMOD, ret); + regtail(pRExC_state, ret, ret + NODE_STEP_REGNODE); + } + if (ISMULT2(RExC_parse)) { + RExC_parse++; + vFAIL("Nested quantifiers"); } - if (ISMULT2(PL_regcomp_parse)) - FAIL("nested *?+ in regexp"); return(ret); } @@ -2225,72 +2524,71 @@ S_regpiece(pTHX_ I32 *flagp) * faster to run. Backslashed characters are exceptions, each becoming a * separate node; the code is simpler that way and it's not worth fixing. * - * [Yes, it is worth fixing, some scripts can run twice the speed.] - */ + * [Yes, it is worth fixing, some scripts can run twice the speed.] */ STATIC regnode * -S_regatom(pTHX_ I32 *flagp) +S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp) { - dTHR; register regnode *ret = 0; I32 flags; *flagp = WORST; /* Tentatively. */ tryagain: - switch (*PL_regcomp_parse) { + switch (*RExC_parse) { case '^': - PL_seen_zerolen++; - nextchar(); - if (PL_regflags & PMf_MULTILINE) - ret = reg_node(MBOL); - else if (PL_regflags & PMf_SINGLELINE) - ret = reg_node(SBOL); + RExC_seen_zerolen++; + nextchar(pRExC_state); + if (RExC_flags16 & PMf_MULTILINE) + ret = reg_node(pRExC_state, MBOL); + else if (RExC_flags16 & PMf_SINGLELINE) + ret = reg_node(pRExC_state, SBOL); else - ret = reg_node(BOL); + ret = reg_node(pRExC_state, BOL); break; case '$': - if (PL_regcomp_parse[1]) - PL_seen_zerolen++; - nextchar(); - if (PL_regflags & PMf_MULTILINE) - ret = reg_node(MEOL); - else if (PL_regflags & PMf_SINGLELINE) - ret = reg_node(SEOL); + nextchar(pRExC_state); + if (*RExC_parse) + RExC_seen_zerolen++; + if (RExC_flags16 & PMf_MULTILINE) + ret = reg_node(pRExC_state, MEOL); + else if (RExC_flags16 & PMf_SINGLELINE) + ret = reg_node(pRExC_state, SEOL); else - ret = reg_node(EOL); + ret = reg_node(pRExC_state, EOL); break; case '.': - nextchar(); - if (UTF) { - if (PL_regflags & PMf_SINGLELINE) - ret = reg_node(SANYUTF8); - else - ret = reg_node(ANYUTF8); - *flagp |= HASWIDTH; - } - else { - if (PL_regflags & PMf_SINGLELINE) - ret = reg_node(SANY); - else - ret = reg_node(REG_ANY); - *flagp |= HASWIDTH|SIMPLE; - } - PL_regnaughty++; + nextchar(pRExC_state); + if (RExC_flags16 & PMf_SINGLELINE) + ret = reg_node(pRExC_state, SANY); + else + ret = reg_node(pRExC_state, REG_ANY); + *flagp |= HASWIDTH|SIMPLE; + RExC_naughty++; break; case '[': - PL_regcomp_parse++; - ret = (UTF ? regclassutf8() : regclass()); - if (*PL_regcomp_parse != ']') - FAIL("unmatched [] in regexp"); - nextchar(); + { + char *oregcomp_parse = ++RExC_parse; + ret = regclass(pRExC_state); + if (*RExC_parse != ']') { + RExC_parse = oregcomp_parse; + vFAIL("Unmatched ["); + } + nextchar(pRExC_state); *flagp |= HASWIDTH|SIMPLE; break; + } case '(': - nextchar(); - ret = reg(1, &flags); + nextchar(pRExC_state); + ret = reg(pRExC_state, 1, &flags); if (ret == NULL) { - if (flags & TRYAGAIN) + if (flags & TRYAGAIN) { + if (RExC_parse == RExC_end) { + /* Make parent create an empty node if needed. */ + *flagp |= TRYAGAIN; + return(NULL); + } goto tryagain; + } return(NULL); } *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); @@ -2301,155 +2599,141 @@ tryagain: *flagp |= TRYAGAIN; return NULL; } - FAIL2("internal urp in regexp at /%s/", PL_regcomp_parse); + vFAIL("Internal urp"); /* Supposed to be caught earlier. */ break; case '{': - if (!regcurly(PL_regcomp_parse)) { - PL_regcomp_parse++; + if (!regcurly(RExC_parse)) { + RExC_parse++; goto defchar; } /* FALL THROUGH */ case '?': case '+': case '*': - FAIL("?+*{} follows nothing in regexp"); + RExC_parse++; + vFAIL("Quantifier follows nothing"); break; case '\\': - switch (*++PL_regcomp_parse) { + switch (*++RExC_parse) { case 'A': - PL_seen_zerolen++; - ret = reg_node(SBOL); + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, SBOL); *flagp |= SIMPLE; - nextchar(); + nextchar(pRExC_state); break; case 'G': - ret = reg_node(GPOS); - PL_regseen |= REG_SEEN_GPOS; + ret = reg_node(pRExC_state, GPOS); + RExC_seen |= REG_SEEN_GPOS; *flagp |= SIMPLE; - nextchar(); + nextchar(pRExC_state); break; case 'Z': - ret = reg_node(SEOL); + ret = reg_node(pRExC_state, SEOL); *flagp |= SIMPLE; - nextchar(); + nextchar(pRExC_state); break; case 'z': - ret = reg_node(EOS); + ret = reg_node(pRExC_state, EOS); *flagp |= SIMPLE; - PL_seen_zerolen++; /* Do not optimize RE away */ - nextchar(); + RExC_seen_zerolen++; /* Do not optimize RE away */ + nextchar(pRExC_state); break; case 'C': - ret = reg_node(SANY); + ret = reg_node(pRExC_state, SANY); *flagp |= HASWIDTH|SIMPLE; - nextchar(); + nextchar(pRExC_state); break; case 'X': - ret = reg_node(CLUMP); + ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_mark) is_utf8_mark((U8*)"~"); /* preload table */ break; case 'w': - ret = reg_node( - UTF - ? (LOC ? ALNUMLUTF8 : ALNUMUTF8) - : (LOC ? ALNUML : ALNUM)); + ret = reg_node(pRExC_state, LOC ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'W': - ret = reg_node( - UTF - ? (LOC ? NALNUMLUTF8 : NALNUMUTF8) - : (LOC ? NALNUML : NALNUM)); + ret = reg_node(pRExC_state, LOC ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'b': - PL_seen_zerolen++; - PL_regseen |= REG_SEEN_LOOKBEHIND; - ret = reg_node( - UTF - ? (LOC ? BOUNDLUTF8 : BOUNDUTF8) - : (LOC ? BOUNDL : BOUND)); + RExC_seen_zerolen++; + RExC_seen |= REG_SEEN_LOOKBEHIND; + ret = reg_node(pRExC_state, LOC ? BOUNDL : BOUND); *flagp |= SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 'B': - PL_seen_zerolen++; - PL_regseen |= REG_SEEN_LOOKBEHIND; - ret = reg_node( - UTF - ? (LOC ? NBOUNDLUTF8 : NBOUNDUTF8) - : (LOC ? NBOUNDL : NBOUND)); + RExC_seen_zerolen++; + RExC_seen |= REG_SEEN_LOOKBEHIND; + ret = reg_node(pRExC_state, LOC ? NBOUNDL : NBOUND); *flagp |= SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_alnum) is_utf8_alnum((U8*)"a"); /* preload table */ break; case 's': - ret = reg_node( - UTF - ? (LOC ? SPACELUTF8 : SPACEUTF8) - : (LOC ? SPACEL : SPACE)); + ret = reg_node(pRExC_state, LOC ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_space) is_utf8_space((U8*)" "); /* preload table */ break; case 'S': - ret = reg_node( - UTF - ? (LOC ? NSPACELUTF8 : NSPACEUTF8) - : (LOC ? NSPACEL : NSPACE)); + ret = reg_node(pRExC_state, LOC ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_space) is_utf8_space((U8*)" "); /* preload table */ break; case 'd': - ret = reg_node(UTF ? DIGITUTF8 : DIGIT); + ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_digit) is_utf8_digit((U8*)"1"); /* preload table */ break; case 'D': - ret = reg_node(UTF ? NDIGITUTF8 : NDIGIT); + ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; - nextchar(); + nextchar(pRExC_state); if (UTF && !PL_utf8_digit) is_utf8_digit((U8*)"1"); /* preload table */ break; case 'p': case 'P': { /* a lovely hack--pretend we saw [\pX] instead */ - char* oldregxend = PL_regxend; - - if (PL_regcomp_parse[1] == '{') { - PL_regxend = strchr(PL_regcomp_parse, '}'); - if (!PL_regxend) - FAIL("Missing right brace on \\p{}"); - PL_regxend++; + char* oldregxend = RExC_end; + + if (RExC_parse[1] == '{') { + RExC_end = strchr(RExC_parse, '}'); + if (!RExC_end) { + RExC_parse += 2; + RExC_end = oldregxend; + vFAIL("Missing right brace on \\p{}"); + } + RExC_end++; } else - PL_regxend = PL_regcomp_parse + 2; - PL_regcomp_parse--; + RExC_end = RExC_parse + 2; + RExC_parse--; - ret = regclassutf8(); + ret = regclass(pRExC_state); - PL_regxend = oldregxend; - PL_regcomp_parse--; - nextchar(); + RExC_end = oldregxend; + RExC_parse--; + nextchar(pRExC_state); *flagp |= HASWIDTH|SIMPLE; } break; @@ -2466,28 +2750,29 @@ tryagain: case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { - I32 num = atoi(PL_regcomp_parse); + I32 num = atoi(RExC_parse); - if (num > 9 && num >= PL_regnpar) + if (num > 9 && num >= RExC_npar) goto defchar; else { - if (!SIZE_ONLY && num > PL_regcomp_rx->nparens) - FAIL("reference to nonexistent group"); - PL_regsawback = 1; - ret = reganode(FOLD + while (isDIGIT(*RExC_parse)) + RExC_parse++; + + if (!SIZE_ONLY && num > RExC_rx->nparens) + vFAIL("Reference to nonexistent group"); + RExC_sawback = 1; + ret = reganode(pRExC_state, FOLD ? (LOC ? REFFL : REFF) : REF, num); *flagp |= HASWIDTH; - while (isDIGIT(*PL_regcomp_parse)) - PL_regcomp_parse++; - PL_regcomp_parse--; - nextchar(); + RExC_parse--; + nextchar(pRExC_state); } } break; case '\0': - if (PL_regcomp_parse >= PL_regxend) - FAIL("trailing \\ in regexp"); + if (RExC_parse >= RExC_end) + FAIL("Trailing \\"); /* FALL THROUGH */ default: /* Do not generate `unrecognized' warnings here, we fall @@ -2497,35 +2782,35 @@ tryagain: break; case '#': - if (PL_regflags & PMf_EXTENDED) { - while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '\n') PL_regcomp_parse++; - if (PL_regcomp_parse < PL_regxend) + if (RExC_flags16 & PMf_EXTENDED) { + while (RExC_parse < RExC_end && *RExC_parse != '\n') RExC_parse++; + if (RExC_parse < RExC_end) goto tryagain; } /* FALL THROUGH */ default: { - register I32 len; + register STRLEN len; register UV ender; register char *p; char *oldp, *s; - I32 numlen; + STRLEN numlen; - PL_regcomp_parse++; + RExC_parse++; defchar: - ret = reg_node(FOLD + ret = reg_node(pRExC_state, FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT); s = STRING(ret); - for (len = 0, p = PL_regcomp_parse - 1; - len < 127 && p < PL_regxend; + for (len = 0, p = RExC_parse - 1; + len < 127 && p < RExC_end; len++) { oldp = p; - if (PL_regflags & PMf_EXTENDED) - p = regwhite(p, PL_regxend); + if (RExC_flags16 & PMf_EXTENDED) + p = regwhite(p, RExC_end); switch (*p) { case '^': case '$': @@ -2589,20 +2874,23 @@ tryagain: if (*++p == '{') { char* e = strchr(p, '}'); - if (!e) - FAIL("Missing right brace on \\x{}"); - else if (UTF) { - ender = (UV)scan_hex(p + 1, e - p, &numlen); - if (numlen + len >= 127) { /* numlen is generous */ + if (!e) { + RExC_parse = p + 1; + vFAIL("Missing right brace on \\x{}"); + } + else { + numlen = 1; /* allow underscores */ + ender = (UV)scan_hex(p + 1, e - p - 1, &numlen); + /* numlen is generous */ + if (numlen + len >= 127) { p--; goto loopdone; } p = e + 1; } - else - FAIL("Can't use \\x{} without 'use utf8' declaration"); } else { + numlen = 0; /* disallow underscores */ ender = (UV)scan_hex(p, 2, &numlen); p += numlen; } @@ -2615,7 +2903,8 @@ tryagain: case '0': case '1': case '2': case '3':case '4': case '5': case '6': case '7': case '8':case '9': if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) { + (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) { + numlen = 0; /* disallow underscores */ ender = (UV)scan_oct(p, 3, &numlen); p += numlen; } @@ -2625,30 +2914,28 @@ tryagain: } break; case '\0': - if (p >= PL_regxend) - FAIL("trailing \\ in regexp"); + if (p >= RExC_end) + FAIL("Trailing \\"); /* FALL THROUGH */ default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(*p)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: Unrecognized escape \\%c passed through", - PL_regprecomp, - *p); + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(*p)) + vWARN2(p +1, "Unrecognized escape \\%c passed through", *p); goto normal_default; } break; default: normal_default: if ((*p & 0xc0) == 0xc0 && UTF) { - ender = utf8_to_uv((U8*)p, &numlen); + ender = utf8_to_uv((U8*)p, RExC_end - p, + &numlen, 0); p += numlen; } else ender = *p++; break; } - if (PL_regflags & PMf_EXTENDED) - p = regwhite(p, PL_regxend); + if (RExC_flags16 & PMf_EXTENDED) + p = regwhite(p, RExC_end); if (UTF && FOLD) { if (LOC) ender = toLOWER_LC_uni(ender); @@ -2659,7 +2946,7 @@ tryagain: if (len) p = oldp; else if (ender >= 0x80 && UTF) { - reguni(ender, s, &numlen); + reguni(pRExC_state, ender, s, &numlen); s += numlen; len += numlen; } @@ -2670,7 +2957,7 @@ tryagain: break; } if (ender >= 0x80 && UTF) { - reguni(ender, s, &numlen); + reguni(pRExC_state, ender, s, &numlen); s += numlen; len += numlen - 1; } @@ -2678,10 +2965,14 @@ tryagain: REGC(ender, s++); } loopdone: - PL_regcomp_parse = p - 1; - nextchar(); - if (len < 0) - FAIL("internal disaster in regexp"); + RExC_parse = p - 1; + nextchar(pRExC_state); + { + /* len is STRLEN which is unsigned, need to copy to signed */ + IV iv = len; + if (iv < 0) + vFAIL("Internal disaster"); + } if (len > 0) *flagp |= HASWIDTH; if (len == 1) @@ -2689,9 +2980,9 @@ tryagain: if (!SIZE_ONLY) STR_LEN(ret) = len; if (SIZE_ONLY) - PL_regsize += STR_SZ(len); + RExC_size += STR_SZ(len); else - PL_regcode += STR_SZ(len); + RExC_emit += STR_SZ(len); } break; } @@ -2722,30 +3013,29 @@ S_regwhite(pTHX_ char *p, char *e) Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed, but trigger warnings because they are currently unimplemented. */ STATIC I32 -S_regpposixcc(pTHX_ I32 value) +S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value) { - dTHR; char *posixcc = 0; I32 namedclass = OOB_NAMEDCLASS; - if (value == '[' && PL_regcomp_parse + 1 < PL_regxend && + if (value == '[' && RExC_parse + 1 < RExC_end && /* I smell either [: or [= or [. -- POSIX has been here, right? */ - (*PL_regcomp_parse == ':' || - *PL_regcomp_parse == '=' || - *PL_regcomp_parse == '.')) { - char c = *PL_regcomp_parse; - char* s = PL_regcomp_parse++; + (*RExC_parse == ':' || + *RExC_parse == '=' || + *RExC_parse == '.')) { + char c = *RExC_parse; + char* s = RExC_parse++; - while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != c) - PL_regcomp_parse++; - if (PL_regcomp_parse == PL_regxend) + while (RExC_parse < RExC_end && *RExC_parse != c) + RExC_parse++; + if (RExC_parse == RExC_end) /* Grandfather lone [:, [=, [. */ - PL_regcomp_parse = s; + RExC_parse = s; else { - char* t = PL_regcomp_parse++; /* skip over the c */ + char* t = RExC_parse++; /* skip over the c */ - if (*PL_regcomp_parse == ']') { - PL_regcomp_parse++; /* skip over the ending ] */ + if (*RExC_parse == ']') { + RExC_parse++; /* skip over the ending ] */ posixcc = s + 1; if (*s == ':') { I32 complement = *posixcc == '^' ? *posixcc++ : 0; @@ -2763,6 +3053,11 @@ S_regpposixcc(pTHX_ I32 value) namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII; break; + case 'b': + if (strnEQ(posixcc, "blank", 5)) + namedclass = + complement ? ANYOF_NBLANK : ANYOF_BLANK; + break; case 'c': if (strnEQ(posixcc, "cntrl", 5)) namedclass = @@ -2794,7 +3089,8 @@ S_regpposixcc(pTHX_ I32 value) case 's': if (strnEQ(posixcc, "space", 5)) namedclass = - complement ? ANYOF_NSPACE : ANYOF_SPACE; + complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC; + break; case 'u': if (strnEQ(posixcc, "upper", 5)) namedclass = @@ -2815,20 +3111,26 @@ S_regpposixcc(pTHX_ I32 value) } break; } - if ((namedclass == OOB_NAMEDCLASS || - !(posixcc + skip + 2 < PL_regxend && - (posixcc[skip] == ':' && - posixcc[skip + 1] == ']')))) - Perl_croak(aTHX_ "Character class [:%.*s:] unknown", - t - s - 1, s + 1); - } else if (ckWARN(WARN_UNSAFE) && !SIZE_ONLY) + if (namedclass == OOB_NAMEDCLASS || + posixcc[skip] != ':' || + posixcc[skip+1] != ']') + { + Simple_vFAIL3("POSIX class [:%.*s:] unknown", + t - s - 1, s + 1); + } + } else if (!SIZE_ONLY) { /* [[=foo=]] and [[.foo.]] are still future. */ - Perl_warner(aTHX_ WARN_UNSAFE, - "Character class syntax [%c %c] is reserved for future extensions", c, c); + + /* adjust RExC_parse so the warning shows after + the class closes */ + while (*RExC_parse && *RExC_parse != ']') + RExC_parse++; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } } else { /* Maternal grandfather: * "[:" ending in ":" but not in ":]" */ - PL_regcomp_parse = s; + RExC_parse = s; } } } @@ -2837,82 +3139,141 @@ S_regpposixcc(pTHX_ I32 value) } STATIC void -S_checkposixcc(pTHX) +S_checkposixcc(pTHX_ RExC_state_t *pRExC_state) { - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && - (*PL_regcomp_parse == ':' || - *PL_regcomp_parse == '=' || - *PL_regcomp_parse == '.')) { - char *s = PL_regcomp_parse; + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && + (*RExC_parse == ':' || + *RExC_parse == '=' || + *RExC_parse == '.')) { + char *s = RExC_parse; char c = *s++; while(*s && isALNUM(*s)) s++; if (*s && c == *s && s[1] == ']') { - Perl_warner(aTHX_ WARN_UNSAFE, - "Character class syntax [%c %c] belongs inside character classes", c, c); + vWARN3(s+2, "POSIX syntax [%c %c] belongs inside character classes", c, c); + + /* [[=foo=]] and [[.foo.]] are still future. */ if (c == '=' || c == '.') - Perl_warner(aTHX_ WARN_UNSAFE, - "Character class syntax [%c %c] is reserved for future extensions", c, c); + { + /* adjust RExC_parse so the error shows after + the class closes */ + while (*RExC_parse && *RExC_parse++ != ']') + ; + Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c); + } } } } STATIC regnode * -S_regclass(pTHX) +S_regclass(pTHX_ RExC_state_t *pRExC_state) { - dTHR; register UV value; - register I32 lastvalue = OOB_CHAR8; - register I32 range = 0; + register IV lastvalue = OOB_UNICODE; + register IV range = 0; register regnode *ret; - register I32 def; - I32 numlen; - I32 namedclass; + STRLEN numlen; + IV namedclass; char *rangebegin; bool need_class = 0; + SV *listsv; + register char *e; + UV n; + bool dont_optimize_invert = FALSE; + + ret = reganode(pRExC_state, ANYOF, 0); + + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) = 0; + + if (*RExC_parse == '^') { /* Complement of range. */ + RExC_naughty++; + RExC_parse++; + if (!SIZE_ONLY) + ANYOF_FLAGS(ret) |= ANYOF_INVERT; + } - ret = reg_node(ANYOF); if (SIZE_ONLY) - PL_regsize += ANYOF_SKIP; + RExC_size += ANYOF_SKIP; else { - ret->flags = 0; - ANYOF_BITMAP_ZERO(ret); - PL_regcode += ANYOF_SKIP; + RExC_emit += ANYOF_SKIP; if (FOLD) ANYOF_FLAGS(ret) |= ANYOF_FOLD; if (LOC) ANYOF_FLAGS(ret) |= ANYOF_LOCALE; + ANYOF_BITMAP_ZERO(ret); + listsv = newSVpvn("# comment\n", 10); } - if (*PL_regcomp_parse == '^') { /* Complement of range. */ - PL_regnaughty++; - PL_regcomp_parse++; - if (!SIZE_ONLY) - ANYOF_FLAGS(ret) |= ANYOF_INVERT; - } - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE)) - checkposixcc(); + if (!SIZE_ONLY && ckWARN(WARN_REGEXP)) + checkposixcc(pRExC_state); + + if (*RExC_parse == ']' || *RExC_parse == '-') + goto charclassloop; /* allow 1st char to be ] or - */ + + while (RExC_parse < RExC_end && *RExC_parse != ']') { + + charclassloop: + + namedclass = OOB_NAMEDCLASS; /* initialize as illegal */ - if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ - while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; if (!range) - rangebegin = PL_regcomp_parse; - value = UCHARAT(PL_regcomp_parse++); + rangebegin = RExC_parse; + if (UTF) { + value = utf8_to_uv((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); if (value == '[') - namedclass = regpposixcc(value); + namedclass = regpposixcc(pRExC_state, value); else if (value == '\\') { - value = UCHARAT(PL_regcomp_parse++); - switch (value) { + if (UTF) { + value = utf8_to_uv((U8*)RExC_parse, + RExC_end - RExC_parse, + &numlen, 0); + RExC_parse += numlen; + } + else + value = UCHARAT(RExC_parse++); + /* Some compilers cannot handle switching on 64-bit integer + * values, therefore value cannot be an UV. Yes, this will + * be a problem later if we want switch on Unicode. + * A similar issue a little bit later when switching on + * namedclass. --jhi */ + switch ((I32)value) { case 'w': namedclass = ANYOF_ALNUM; break; case 'W': namedclass = ANYOF_NALNUM; break; case 's': namedclass = ANYOF_SPACE; break; case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'p': + case 'P': + if (*RExC_parse == '{') { + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\p{}"); + n = e - RExC_parse; + } + else { + e = RExC_parse; + n = 1; + } + if (!SIZE_ONLY) { + if (value == 'p') + Perl_sv_catpvf(aTHX_ listsv, + "+utf8::%.*s\n", (int)n, RExC_parse); + else + Perl_sv_catpvf(aTHX_ listsv, + "!utf8::%.*s\n", (int)n, RExC_parse); + } + RExC_parse = e + 1; + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + continue; case 'n': value = '\n'; break; case 'r': value = '\r'; break; case 't': value = '\t'; break; @@ -2926,47 +3287,78 @@ S_regclass(pTHX) case 'a': value = '\057'; break; #endif case 'x': - value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); - PL_regcomp_parse += numlen; + if (*RExC_parse == '{') { + e = strchr(RExC_parse++, '}'); + if (!e) + vFAIL("Missing right brace on \\x{}"); + numlen = 1; /* allow underscores */ + value = (UV)scan_hex(RExC_parse, + e - RExC_parse, + &numlen); + RExC_parse = e + 1; + } + else { + numlen = 0; /* disallow underscores */ + value = (UV)scan_hex(RExC_parse, 2, &numlen); + RExC_parse += numlen; + } break; case 'c': - value = UCHARAT(PL_regcomp_parse++); + value = UCHARAT(RExC_parse++); value = toCTRL(value); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); - PL_regcomp_parse += numlen; + numlen = 0; /* disallow underscores */ + value = (UV)scan_oct(--RExC_parse, 3, &numlen); + RExC_parse += numlen; break; default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(value)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: Unrecognized escape \\%c in character class passed through", - PL_regprecomp, - (int)value); + if (!SIZE_ONLY && ckWARN(WARN_REGEXP) && isALPHA(value)) + vWARN2(RExC_parse, + "Unrecognized escape \\%c in character class passed through", + (int)value); break; } - } - if (namedclass > OOB_NAMEDCLASS) { - if (!need_class && !SIZE_ONLY) + } /* end of \blah */ + + if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */ + + if (!SIZE_ONLY && !need_class) ANYOF_CLASS_ZERO(ret); + need_class = 1; - if (range) { /* a-\d, a-[:digit:] */ + + /* a bad range like a-\d, a-[:digit:] ? */ + if (range) { if (!SIZE_ONLY) { - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); - ANYOF_BITMAP_SET(ret, lastvalue); - ANYOF_BITMAP_SET(ret, '-'); + if (ckWARN(WARN_REGEXP)) + vWARN4(RExC_parse, + "False [] range \"%*.*s\"", + RExC_parse - rangebegin, + RExC_parse - rangebegin, + rangebegin); + if (lastvalue < 256) { + ANYOF_BITMAP_SET(ret, lastvalue); + ANYOF_BITMAP_SET(ret, '-'); + } + else { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + Perl_sv_catpvf(aTHX_ listsv, + /* 0x002D is Unicode for '-' */ + "%04"UVxf"\n002D\n", (UV)lastvalue); + } } - range = 0; /* this is not a true range */ + + range = 0; /* this was not a true range */ } + if (!SIZE_ONLY) { - switch (namedclass) { + /* Possible truncation here but in some 64-bit environments + * the compiler gets heartburn about switch on 64-bit values. + * A similar issue a little earlier when switching on value. + * --jhi */ + switch ((I32)namedclass) { case ANYOF_ALNUM: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ALNUM); @@ -2975,6 +3367,8 @@ S_regclass(pTHX) if (isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; case ANYOF_NALNUM: if (LOC) @@ -2984,42 +3378,19 @@ S_regclass(pTHX) if (!isALNUM(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_SPACE: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_SPACE); - else { - for (value = 0; value < 256; value++) - if (isSPACE(value)) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NSPACE: + case ANYOF_ALNUMC: if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); else { for (value = 0; value < 256; value++) - if (!isSPACE(value)) + if (isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_DIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_DIGIT); - else { - for (value = '0'; value <= '9'; value++) - ANYOF_BITMAP_SET(ret, value); - } - break; - case ANYOF_NDIGIT: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); - else { - for (value = 0; value < '0'; value++) - ANYOF_BITMAP_SET(ret, value); - for (value = '9' + 1; value < 256; value++) - ANYOF_BITMAP_SET(ret, value); - } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; case ANYOF_NALNUMC: if (LOC) @@ -3029,15 +3400,8 @@ S_regclass(pTHX) if (!isALNUMC(value)) ANYOF_BITMAP_SET(ret, value); } - break; - case ANYOF_ALNUMC: - if (LOC) - ANYOF_CLASS_SET(ret, ANYOF_ALNUMC); - else { - for (value = 0; value < 256; value++) - if (isALNUMC(value)) - ANYOF_BITMAP_SET(ret, value); - } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; case ANYOF_ALPHA: if (LOC) @@ -3047,6 +3411,8 @@ S_regclass(pTHX) if (isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; case ANYOF_NALPHA: if (LOC) @@ -3056,6 +3422,8 @@ S_regclass(pTHX) if (!isALPHA(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; case ANYOF_ASCII: if (LOC) @@ -3070,6 +3438,8 @@ S_regclass(pTHX) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; case ANYOF_NASCII: if (LOC) @@ -3084,6 +3454,30 @@ S_regclass(pTHX) ANYOF_BITMAP_SET(ret, value); #endif /* EBCDIC */ } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); + break; + case ANYOF_BLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_BLANK); + else { + for (value = 0; value < 256; value++) + if (isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsBlank\n"); + break; + case ANYOF_NBLANK: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NBLANK); + else { + for (value = 0; value < 256; value++) + if (!isBLANK(value)) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsBlank\n"); break; case ANYOF_CNTRL: if (LOC) @@ -3093,7 +3487,8 @@ S_regclass(pTHX) if (isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } - lastvalue = OOB_CHAR8; + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; case ANYOF_NCNTRL: if (LOC) @@ -3103,6 +3498,32 @@ S_regclass(pTHX) if (!isCNTRL(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); + break; + case ANYOF_DIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_DIGIT); + else { + /* consecutive digits assumed */ + for (value = '0'; value <= '9'; value++) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); + break; + case ANYOF_NDIGIT: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NDIGIT); + else { + /* consecutive digits assumed */ + for (value = 0; value < '0'; value++) + ANYOF_BITMAP_SET(ret, value); + for (value = '9' + 1; value < 256; value++) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; case ANYOF_GRAPH: if (LOC) @@ -3112,6 +3533,8 @@ S_regclass(pTHX) if (isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; case ANYOF_NGRAPH: if (LOC) @@ -3121,6 +3544,8 @@ S_regclass(pTHX) if (!isGRAPH(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; case ANYOF_LOWER: if (LOC) @@ -3130,6 +3555,8 @@ S_regclass(pTHX) if (isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; case ANYOF_NLOWER: if (LOC) @@ -3139,6 +3566,8 @@ S_regclass(pTHX) if (!isLOWER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; case ANYOF_PRINT: if (LOC) @@ -3148,6 +3577,8 @@ S_regclass(pTHX) if (isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; case ANYOF_NPRINT: if (LOC) @@ -3157,6 +3588,30 @@ S_regclass(pTHX) if (!isPRINT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); + break; + case ANYOF_PSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_PSXSPC); + else { + for (value = 0; value < 256; value++) + if (isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); + break; + case ANYOF_NPSXSPC: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NPSXSPC); + else { + for (value = 0; value < 256; value++) + if (!isPSXSPC(value)) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; case ANYOF_PUNCT: if (LOC) @@ -3166,6 +3621,8 @@ S_regclass(pTHX) if (isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; case ANYOF_NPUNCT: if (LOC) @@ -3175,6 +3632,30 @@ S_regclass(pTHX) if (!isPUNCT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); + break; + case ANYOF_SPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_SPACE); + else { + for (value = 0; value < 256; value++) + if (isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpacePerl\n"); + break; + case ANYOF_NSPACE: + if (LOC) + ANYOF_CLASS_SET(ret, ANYOF_NSPACE); + else { + for (value = 0; value < 256; value++) + if (!isSPACE(value)) + ANYOF_BITMAP_SET(ret, value); + } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpacePerl\n"); break; case ANYOF_UPPER: if (LOC) @@ -3184,6 +3665,8 @@ S_regclass(pTHX) if (isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; case ANYOF_NUPPER: if (LOC) @@ -3193,6 +3676,8 @@ S_regclass(pTHX) if (!isUPPER(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; case ANYOF_XDIGIT: if (LOC) @@ -3202,6 +3687,8 @@ S_regclass(pTHX) if (isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; case ANYOF_NXDIGIT: if (LOC) @@ -3211,371 +3698,155 @@ S_regclass(pTHX) if (!isXDIGIT(value)) ANYOF_BITMAP_SET(ret, value); } + dont_optimize_invert = TRUE; + Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; default: - FAIL("invalid [::] class in regexp"); + vFAIL("Invalid [::] class"); break; } if (LOC) ANYOF_FLAGS(ret) |= ANYOF_CLASS; continue; } - } + } /* end of namedclass \blah */ + if (range) { if (lastvalue > value) /* b-a */ { - Perl_croak(aTHX_ - "/%.127s/: invalid [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + Simple_vFAIL4("Invalid [] range \"%*.*s\"", + RExC_parse - rangebegin, + RExC_parse - rangebegin, + rangebegin); } - range = 0; + range = 0; /* not a true range */ } else { - lastvalue = value; - if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && - PL_regcomp_parse[1] != ']') { - PL_regcomp_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); + lastvalue = value; /* save the beginning of the range */ + if (*RExC_parse == '-' && RExC_parse+1 < RExC_end && + RExC_parse[1] != ']') { + RExC_parse++; + + /* a bad range like \w-, [:word:]- ? */ + if (namedclass > OOB_NAMEDCLASS) { + if (ckWARN(WARN_REGEXP)) + vWARN4(RExC_parse, + "False [] range \"%*.*s\"", + RExC_parse - rangebegin, + RExC_parse - rangebegin, + rangebegin); if (!SIZE_ONLY) ANYOF_BITMAP_SET(ret, '-'); } else - range = 1; - continue; /* do it next time */ + range = 1; /* yeah, it's a range! */ + continue; /* but do it the next time */ } } + /* now is the next time */ if (!SIZE_ONLY) { + if (lastvalue < 256 && value < 256) { #ifndef ASCIIish /* EBCDIC, for example. */ - if ((isLOWER(lastvalue) && isLOWER(value)) || - (isUPPER(lastvalue) && isUPPER(value))) - { - I32 i; - if (isLOWER(lastvalue)) { - for (i = lastvalue; i <= value; i++) - if (isLOWER(i)) - ANYOF_BITMAP_SET(ret, i); - } else { - for (i = lastvalue; i <= value; i++) - if (isUPPER(i)) - ANYOF_BITMAP_SET(ret, i); + if ((isLOWER(lastvalue) && isLOWER(value)) || + (isUPPER(lastvalue) && isUPPER(value))) + { + IV i; + if (isLOWER(lastvalue)) { + for (i = lastvalue; i <= value; i++) + if (isLOWER(i)) + ANYOF_BITMAP_SET(ret, i); + } else { + for (i = lastvalue; i <= value; i++) + if (isUPPER(i)) + ANYOF_BITMAP_SET(ret, i); + } } - } - else + else #endif - for ( ; lastvalue <= value; lastvalue++) - ANYOF_BITMAP_SET(ret, lastvalue); + for ( ; lastvalue <= value; lastvalue++) + ANYOF_BITMAP_SET(ret, lastvalue); + } else { + ANYOF_FLAGS(ret) |= ANYOF_UNICODE; + if (lastvalue < value) + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", + (UV)lastvalue, (UV)value); + else + Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", + (UV)value); + } } - range = 0; + + range = 0; /* this range (if it was one) is done now */ } + if (need_class) { if (SIZE_ONLY) - PL_regsize += ANYOF_CLASS_ADD_SKIP; + RExC_size += ANYOF_CLASS_ADD_SKIP; else - PL_regcode += ANYOF_CLASS_ADD_SKIP; + RExC_emit += ANYOF_CLASS_ADD_SKIP; } + /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ if (!SIZE_ONLY && - (ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD) { + (ANYOF_FLAGS(ret) & + /* If the only flag is folding (plus possibly inversion). */ + (ANYOF_FLAGS_ALL ^ ANYOF_INVERT) == ANYOF_FOLD)) { for (value = 0; value < 256; ++value) { if (ANYOF_BITMAP_TEST(ret, value)) { - I32 cf = PL_fold[value]; - ANYOF_BITMAP_SET(ret, cf); + IV fold = PL_fold[value]; + + if (fold != value) + ANYOF_BITMAP_SET(ret, fold); } } ANYOF_FLAGS(ret) &= ~ANYOF_FOLD; } + /* optimize inverted simple patterns (e.g. [^a-z]) */ - if (!SIZE_ONLY && (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { + if (!SIZE_ONLY && !dont_optimize_invert && + /* If the only flag is inversion. */ + (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) { for (value = 0; value < ANYOF_BITMAP_SIZE; ++value) ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL; - ANYOF_FLAGS(ret) = 0; + ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL; } - return ret; -} -STATIC regnode * -S_regclassutf8(pTHX) -{ - dTHR; - register char *e; - register UV value; - register U32 lastvalue = OOB_UTF8; - register I32 range = 0; - register regnode *ret; - I32 numlen; - I32 n; - SV *listsv; - U8 flags = 0; - I32 namedclass; - char *rangebegin; + if (!SIZE_ONLY) { + AV *av = newAV(); + SV *rv; - if (*PL_regcomp_parse == '^') { /* Complement of range. */ - PL_regnaughty++; - PL_regcomp_parse++; - if (!SIZE_ONLY) - flags |= ANYOF_INVERT; - } - if (!SIZE_ONLY) { - if (FOLD) - flags |= ANYOF_FOLD; - if (LOC) - flags |= ANYOF_LOCALE; - listsv = newSVpvn("# comment\n",10); - } - - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE)) - checkposixcc(); - - if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') - goto skipcond; /* allow 1st char to be ] or - */ - - while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { - skipcond: - namedclass = OOB_NAMEDCLASS; - if (!range) - rangebegin = PL_regcomp_parse; - value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); - PL_regcomp_parse += numlen; - if (value == '[') - namedclass = regpposixcc(value); - else if (value == '\\') { - value = utf8_to_uv((U8*)PL_regcomp_parse, &numlen); - PL_regcomp_parse += numlen; - switch (value) { - case 'w': namedclass = ANYOF_ALNUM; break; - case 'W': namedclass = ANYOF_NALNUM; break; - case 's': namedclass = ANYOF_SPACE; break; - case 'S': namedclass = ANYOF_NSPACE; break; - case 'd': namedclass = ANYOF_DIGIT; break; - case 'D': namedclass = ANYOF_NDIGIT; break; - case 'p': - case 'P': - if (*PL_regcomp_parse == '{') { - e = strchr(PL_regcomp_parse++, '}'); - if (!e) - FAIL("Missing right brace on \\p{}"); - n = e - PL_regcomp_parse; - } - else { - e = PL_regcomp_parse; - n = 1; - } - if (!SIZE_ONLY) { - if (value == 'p') - Perl_sv_catpvf(aTHX_ listsv, - "+utf8::%.*s\n", n, PL_regcomp_parse); - else - Perl_sv_catpvf(aTHX_ listsv, - "!utf8::%.*s\n", n, PL_regcomp_parse); - } - PL_regcomp_parse = e + 1; - lastvalue = OOB_UTF8; - continue; - case 'n': value = '\n'; break; - case 'r': value = '\r'; break; - case 't': value = '\t'; break; - case 'f': value = '\f'; break; - case 'b': value = '\b'; break; -#ifdef ASCIIish - case 'e': value = '\033'; break; - case 'a': value = '\007'; break; -#else - case 'e': value = '\047'; break; - case 'a': value = '\057'; break; -#endif - case 'x': - if (*PL_regcomp_parse == '{') { - e = strchr(PL_regcomp_parse++, '}'); - if (!e) - FAIL("Missing right brace on \\x{}"); - value = (UV)scan_hex(PL_regcomp_parse, - e - PL_regcomp_parse, - &numlen); - PL_regcomp_parse = e + 1; - } - else { - value = (UV)scan_hex(PL_regcomp_parse, 2, &numlen); - PL_regcomp_parse += numlen; - } - break; - case 'c': - value = UCHARAT(PL_regcomp_parse++); - value = toCTRL(value); - break; - case '0': case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - value = (UV)scan_oct(--PL_regcomp_parse, 3, &numlen); - PL_regcomp_parse += numlen; - break; - default: - if (!SIZE_ONLY && ckWARN(WARN_UNSAFE) && isALPHA(value)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: Unrecognized escape \\%c in character class passed through", - PL_regprecomp, - (int)value); - break; - } - } - if (namedclass > OOB_NAMEDCLASS) { - if (range) { /* a-\d, a-[:digit:] */ - if (!SIZE_ONLY) { - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "%04"UVxf"\n002D\n", (UV)lastvalue); - } - range = 0; - } - if (!SIZE_ONLY) { - switch (namedclass) { - case ANYOF_ALNUM: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsWord\n"); break; - case ANYOF_NALNUM: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsWord\n"); break; - case ANYOF_ALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlnum\n"); break; - case ANYOF_NALNUMC: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlnum\n"); break; - case ANYOF_ALPHA: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsAlpha\n"); break; - case ANYOF_NALPHA: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsAlpha\n"); break; - case ANYOF_ASCII: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsASCII\n"); break; - case ANYOF_NASCII: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsASCII\n"); break; - case ANYOF_CNTRL: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsCntrl\n"); break; - case ANYOF_NCNTRL: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsCntrl\n"); break; - case ANYOF_GRAPH: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsGraph\n"); break; - case ANYOF_NGRAPH: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsGraph\n"); break; - case ANYOF_DIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsDigit\n"); break; - case ANYOF_NDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsDigit\n"); break; - case ANYOF_LOWER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsLower\n"); break; - case ANYOF_NLOWER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsLower\n"); break; - case ANYOF_PRINT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPrint\n"); break; - case ANYOF_NPRINT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPrint\n"); break; - case ANYOF_PUNCT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsPunct\n"); break; - case ANYOF_NPUNCT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsPunct\n"); break; - case ANYOF_SPACE: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsSpace\n"); break; - case ANYOF_NSPACE: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsSpace\n"); break; - case ANYOF_UPPER: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsUpper\n"); break; - case ANYOF_NUPPER: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsUpper\n"); break; - case ANYOF_XDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "+utf8::IsXDigit\n"); break; - case ANYOF_NXDIGIT: - Perl_sv_catpvf(aTHX_ listsv, "!utf8::IsXDigit\n"); break; - } - continue; - } - } - if (range) { - if (lastvalue > value) { /* b-a */ - Perl_croak(aTHX_ - "/%.127s/: invalid [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); - } - range = 0; - } - else { - lastvalue = value; - if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && - PL_regcomp_parse[1] != ']') { - PL_regcomp_parse++; - if (namedclass > OOB_NAMEDCLASS) { /* \w-, [:word:]- */ - if (ckWARN(WARN_UNSAFE)) - Perl_warner(aTHX_ WARN_UNSAFE, - "/%.127s/: false [] range \"%*.*s\" in regexp", - PL_regprecomp, - PL_regcomp_parse - rangebegin, - PL_regcomp_parse - rangebegin, - rangebegin); - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, - /* 0x002D is Unicode for '-' */ - "002D\n"); - } else - range = 1; - continue; /* do it next time */ - } - } - /* now is the next time */ - if (!SIZE_ONLY) - Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n", - (UV)lastvalue, (UV)value); - range = 0; - } - - ret = reganode(ANYOFUTF8, 0); - - if (!SIZE_ONLY) { - SV *rv = swash_init("utf8", "", listsv, 1, 0); - SvREFCNT_dec(listsv); - n = add_data(1,"s"); - PL_regcomp_rx->data->data[n] = (void*)rv; - ARG1_SET(ret, flags); - ARG2_SET(ret, n); + av_store(av, 0, listsv); + av_store(av, 1, NULL); + rv = newRV_noinc((SV*)av); + n = add_data(pRExC_state, 1, "s"); + RExC_rx->data->data[n] = (void*)rv; + ARG_SET(ret, n); } return ret; } STATIC char* -S_nextchar(pTHX) +S_nextchar(pTHX_ RExC_state_t *pRExC_state) { - dTHR; - char* retval = PL_regcomp_parse++; + char* retval = RExC_parse++; for (;;) { - if (*PL_regcomp_parse == '(' && PL_regcomp_parse[1] == '?' && - PL_regcomp_parse[2] == '#') { - while (*PL_regcomp_parse && *PL_regcomp_parse != ')') - PL_regcomp_parse++; - PL_regcomp_parse++; + if (*RExC_parse == '(' && RExC_parse[1] == '?' && + RExC_parse[2] == '#') { + while (*RExC_parse && *RExC_parse != ')') + RExC_parse++; + RExC_parse++; continue; } - if (PL_regflags & PMf_EXTENDED) { - if (isSPACE(*PL_regcomp_parse)) { - PL_regcomp_parse++; + if (RExC_flags16 & PMf_EXTENDED) { + if (isSPACE(*RExC_parse)) { + RExC_parse++; continue; } - else if (*PL_regcomp_parse == '#') { - while (*PL_regcomp_parse && *PL_regcomp_parse != '\n') - PL_regcomp_parse++; - PL_regcomp_parse++; + else if (*RExC_parse == '#') { + while (*RExC_parse && *RExC_parse != '\n') + RExC_parse++; + RExC_parse++; continue; } } @@ -3587,23 +3858,22 @@ S_nextchar(pTHX) - reg_node - emit a node */ STATIC regnode * /* Location. */ -S_reg_node(pTHX_ U8 op) +S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) { - dTHR; register regnode *ret; register regnode *ptr; - ret = PL_regcode; + ret = RExC_emit; if (SIZE_ONLY) { - SIZE_ALIGN(PL_regsize); - PL_regsize += 1; + SIZE_ALIGN(RExC_size); + RExC_size += 1; return(ret); } NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); - PL_regcode = ptr; + RExC_emit = ptr; return(ret); } @@ -3612,23 +3882,22 @@ S_reg_node(pTHX_ U8 op) - reganode - emit a node with an argument */ STATIC regnode * /* Location. */ -S_reganode(pTHX_ U8 op, U32 arg) +S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) { - dTHR; register regnode *ret; register regnode *ptr; - ret = PL_regcode; + ret = RExC_emit; if (SIZE_ONLY) { - SIZE_ALIGN(PL_regsize); - PL_regsize += 2; + SIZE_ALIGN(RExC_size); + RExC_size += 2; return(ret); } NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); - PL_regcode = ptr; + RExC_emit = ptr; return(ret); } @@ -3637,16 +3906,9 @@ S_reganode(pTHX_ U8 op, U32 arg) - reguni - emit (if appropriate) a Unicode character */ STATIC void -S_reguni(pTHX_ UV uv, char* s, I32* lenp) +S_reguni(pTHX_ RExC_state_t *pRExC_state, UV uv, char* s, STRLEN* lenp) { - dTHR; - if (SIZE_ONLY) { - U8 tmpbuf[10]; - *lenp = uv_to_utf8(tmpbuf, uv) - tmpbuf; - } - else - *lenp = uv_to_utf8((U8*)s, uv) - (U8*)s; - + *lenp = SIZE_ONLY ? UNISKIP(uv) : (uv_to_utf8((U8*)s, uv) - (U8*)s); } /* @@ -3655,9 +3917,8 @@ S_reguni(pTHX_ UV uv, char* s, I32* lenp) * Means relocating the operand. */ STATIC void -S_reginsert(pTHX_ U8 op, regnode *opnd) +S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd) { - dTHR; register regnode *src; register regnode *dst; register regnode *place; @@ -3666,13 +3927,13 @@ S_reginsert(pTHX_ U8 op, regnode *opnd) /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ if (SIZE_ONLY) { - PL_regsize += NODE_STEP_REGNODE + offset; + RExC_size += NODE_STEP_REGNODE + offset; return; } - src = PL_regcode; - PL_regcode += NODE_STEP_REGNODE + offset; - dst = PL_regcode; + src = RExC_emit; + RExC_emit += NODE_STEP_REGNODE + offset; + dst = RExC_emit; while (src > opnd) StructCopy(--src, --dst, regnode); @@ -3686,12 +3947,10 @@ S_reginsert(pTHX_ U8 op, regnode *opnd) - regtail - set the next-pointer at the end of a node chain of p to val. */ STATIC void -S_regtail(pTHX_ regnode *p, regnode *val) +S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) { - dTHR; register regnode *scan; register regnode *temp; - register I32 offset; if (SIZE_ONLY) return; @@ -3717,17 +3976,16 @@ S_regtail(pTHX_ regnode *p, regnode *val) - regoptail - regtail on operand of first argument; nop if operandless */ STATIC void -S_regoptail(pTHX_ regnode *p, regnode *val) +S_regoptail(pTHX_ RExC_state_t *pRExC_state, regnode *p, regnode *val) { - dTHR; /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p == NULL || SIZE_ONLY) return; if (PL_regkind[(U8)OP(p)] == BRANCH) { - regtail(NEXTOPER(p), val); + regtail(pRExC_state, NEXTOPER(p), val); } else if ( PL_regkind[(U8)OP(p)] == BRANCHJ) { - regtail(NEXTOPER(NEXTOPER(p)), val); + regtail(pRExC_state, NEXTOPER(NEXTOPER(p)), val); } else return; @@ -3760,7 +4018,7 @@ S_dumpuntil(pTHX_ regnode *start, regnode *node, regnode *last, SV* sv, I32 l) { #ifdef DEBUGGING register U8 op = EXACT; /* Arbitrary non-END op. */ - register regnode *next, *onode; + register regnode *next; while (op != END && (!last || node < last)) { /* While that wasn't END last time... */ @@ -3833,7 +4091,6 @@ void Perl_regdump(pTHX_ regexp *r) { #ifdef DEBUGGING - dTHR; SV *sv = sv_newmortal(); (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); @@ -3900,7 +4157,7 @@ Perl_regdump(pTHX_ regexp *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (c <= ' ' || c == 127 || c == 255) + if (isCNTRL(c) || c == 127 || c == 255 || !isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); else if (c == '-' || c == ']' || c == '\\' || c == '^') Perl_sv_catpvf(aTHX_ sv, "\\%c", c); @@ -3915,12 +4172,13 @@ void Perl_regprop(pTHX_ SV *sv, regnode *o) { #ifdef DEBUGGING - dTHR; register int k; sv_setpvn(sv, "", 0); if (OP(o) >= reg_num) /* regnode.type is unsigned */ - FAIL("corrupted regexp opcode"); + /* It would be nice to FAIL() here, but this may be called from + regexec.c, and it would be hard to supply pRExC_state. */ + Perl_croak(aTHX_ "Corrupted regexp opcode"); sv_catpv(sv, (char*)reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[(U8)OP(o)]; @@ -3929,20 +4187,21 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) Perl_sv_catpvf(aTHX_ sv, " <%s%.*s%s>", PL_colors[0], STR_LEN(o), STRING(o), PL_colors[1]); else if (k == CURLY) { - if (OP(o) == CURLYM || OP(o) == CURLYN) + if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */ Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o)); } else if (k == WHILEM && o->flags) /* Ordinal/of */ Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4); else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP ) - Perl_sv_catpvf(aTHX_ sv, "%d", ARG(o)); /* Parenth number */ + Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ else if (k == ANYOF) { int i, rangestart = -1; - const char * const out[] = { /* Should be syncronized with - a table in regcomp.h */ + U8 flags = ANYOF_FLAGS(o); + const char * const anyofs[] = { /* Should be syncronized with + * ANYOF_ #xdefines in regcomp.h */ "\\w", "\\W", "\\s", @@ -3966,17 +4225,21 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) "[:punct:]", "[:^punct:]", "[:upper:]", - "[:!upper:]", + "[:^upper:]", "[:xdigit:]", - "[:^xdigit:]" + "[:^xdigit:]", + "[:space:]", + "[:^space:]", + "[:blank:]", + "[:^blank:]" }; - if (o->flags & ANYOF_LOCALE) + if (flags & ANYOF_LOCALE) sv_catpv(sv, "{loc}"); - if (o->flags & ANYOF_FOLD) + if (flags & ANYOF_FOLD) sv_catpv(sv, "{i}"); Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]); - if (o->flags & ANYOF_INVERT) + if (flags & ANYOF_INVERT) sv_catpv(sv, "^"); for (i = 0; i <= 256; i++) { if (i < 256 && ANYOF_BITMAP_TEST(o,i)) { @@ -3994,10 +4257,79 @@ Perl_regprop(pTHX_ SV *sv, regnode *o) rangestart = -1; } } + if (o->flags & ANYOF_CLASS) - for (i = 0; i < sizeof(out)/sizeof(char*); i++) + for (i = 0; i < sizeof(anyofs)/sizeof(char*); i++) if (ANYOF_CLASS_TEST(o,i)) - sv_catpv(sv, out[i]); + sv_catpv(sv, anyofs[i]); + + if (flags & ANYOF_UNICODE) + sv_catpv(sv, "{unicode}"); + else if (flags & ANYOF_UNICODE_ALL) + sv_catpv(sv, "{all-unicode}"); + + { + SV *lv; + SV *sw = regclass_swash(o, FALSE, &lv); + + if (lv) { + if (sw) { + UV i; + U8 s[UTF8_MAXLEN+1]; + + for (i = 0; i <= 256; i++) { /* just the first 256 */ + U8 *e = uv_to_utf8(s, i); + + if (i < 256 && swash_fetch(sw, s)) { + if (rangestart == -1) + rangestart = i; + } else if (rangestart != -1) { + U8 *p; + + if (i <= rangestart + 3) + for (; rangestart < i; rangestart++) { + for(e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + } + else { + for (e = uv_to_utf8(s, rangestart), p = s; p < e; p++) + put_byte(sv, *p); + sv_catpv(sv, "-"); + for (e = uv_to_utf8(s, i - 1), p = s; p < e; p++) + put_byte(sv, *p); + } + rangestart = -1; + } + } + + sv_catpv(sv, "..."); /* et cetera */ + } + + { + char *s = savepv(SvPVX(lv)); + char *origs = s; + + while(*s && *s != '\n') s++; + + if (*s == '\n') { + char *t = ++s; + + while (*s) { + if (*s == '\n') + *s = ' '; + s++; + } + if (s[-1] == ' ') + s[-1] = 0; + + sv_catpv(sv, t); + } + + Safefree(origs); + } + } + } + Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]); } else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH)) @@ -4027,7 +4359,6 @@ Perl_re_intuit_string(pTHX_ regexp *prog) void Perl_pregfree(pTHX_ struct regexp *r) { - dTHR; DEBUG_r(if (!PL_colorset) reginitcolors()); if (!r || (--r->refcnt > 0)) @@ -4072,8 +4403,13 @@ Perl_pregfree(pTHX_ struct regexp *r) Perl_croak(aTHX_ "panic: pregfree comppad"); old_comppad = PL_comppad; old_curpad = PL_curpad; - PL_comppad = new_comppad; - PL_curpad = AvARRAY(new_comppad); + /* Watch out for global destruction's random ordering. */ + if (SvTYPE(new_comppad) == SVt_PVAV) { + PL_comppad = new_comppad; + PL_curpad = AvARRAY(new_comppad); + } + else + PL_curpad = NULL; op_free((OP_4tree*)r->data->data[n]); PL_comppad = old_comppad; PL_curpad = old_curpad; @@ -4083,7 +4419,7 @@ Perl_pregfree(pTHX_ struct regexp *r) case 'n': break; default: - FAIL2("panic: regfree data code '%c'", r->data->what[n]); + Perl_croak(aTHX_ "panic: regfree data code '%c'", r->data->what[n]); } } Safefree(r->data->what); @@ -4103,7 +4439,6 @@ Perl_pregfree(pTHX_ struct regexp *r) regnode * Perl_regnext(pTHX_ register regnode *p) { - dTHR; register I32 offset; if (p == &PL_regdummy) @@ -4155,12 +4490,22 @@ S_re_croak2(pTHX_ const char* pat1,const char* pat2,...) void Perl_save_re_context(pTHX) { - dTHR; +#if 0 + SAVEPPTR(RExC_precomp); /* uncompiled string. */ + SAVEI32(RExC_npar); /* () count. */ + SAVEI32(RExC_size); /* Code size. */ + SAVEI16(RExC_flags16); /* are we folding, multilining? */ + SAVEVPTR(RExC_rx); /* from regcomp.c */ + SAVEI32(RExC_seen); /* from regcomp.c */ + SAVEI32(RExC_sawback); /* Did we see \1, ...? */ + SAVEI32(RExC_naughty); /* How bad is this pattern? */ + SAVEVPTR(RExC_emit); /* Code-emit pointer; ®dummy = don't */ + SAVEPPTR(RExC_end); /* End of input for compile */ + SAVEPPTR(RExC_parse); /* Input-scan pointer. */ +#endif + + SAVEI32(PL_reg_flags); /* from regexec.c */ SAVEPPTR(PL_bostr); - SAVEPPTR(PL_regprecomp); /* uncompiled string. */ - SAVEI32(PL_regnpar); /* () count. */ - SAVEI32(PL_regsize); /* Code size. */ - SAVEI16(PL_regflags); /* are we folding, multilining? */ SAVEPPTR(PL_reginput); /* String-input pointer. */ SAVEPPTR(PL_regbol); /* Beginning of input, for ^ check. */ SAVEPPTR(PL_regeol); /* End of input, for $ check. */ @@ -4175,20 +4520,12 @@ Perl_save_re_context(pTHX) SAVEI32(PL_reg_start_tmpl); /* from regexec.c */ PL_reg_start_tmpl = 0; SAVEVPTR(PL_regdata); - SAVEI32(PL_reg_flags); /* from regexec.c */ SAVEI32(PL_reg_eval_set); /* from regexec.c */ SAVEI32(PL_regnarrate); /* from regexec.c */ SAVEVPTR(PL_regprogram); /* from regexec.c */ SAVEINT(PL_regindent); /* from regexec.c */ SAVEVPTR(PL_regcc); /* from regexec.c */ SAVEVPTR(PL_curcop); - SAVEVPTR(PL_regcomp_rx); /* from regcomp.c */ - SAVEI32(PL_regseen); /* from regcomp.c */ - SAVEI32(PL_regsawback); /* Did we see \1, ...? */ - SAVEI32(PL_regnaughty); /* How bad is this pattern? */ - SAVEVPTR(PL_regcode); /* Code-emit pointer; ®dummy = don't */ - SAVEPPTR(PL_regxend); /* End of input for compile */ - SAVEPPTR(PL_regcomp_parse); /* Input-scan pointer. */ SAVEVPTR(PL_reg_call_cc); /* from regexec.c */ SAVEVPTR(PL_reg_re); /* from regexec.c */ SAVEPPTR(PL_reg_ganch); /* from regexec.c */ @@ -4197,6 +4534,7 @@ Perl_save_re_context(pTHX) SAVEI32(PL_reg_oldpos); /* from regexec.c */ SAVEVPTR(PL_reg_oldcurpm); /* from regexec.c */ SAVEVPTR(PL_reg_curpm); /* from regexec.c */ + SAVEI32(PL_regnpar); /* () count. */ #ifdef DEBUGGING SAVEPPTR(PL_reg_starttry); /* from regexec.c */ #endif @@ -4213,4 +4551,3 @@ clear_re(pTHXo_ void *r) { ReREFCNT_dec((regexp *)r); } -