X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/161b471ac314d8d6343f9f351e5fb9ef816168a8..ff6514248547c171417c037cbe5ba4b3f2dc6097:/regcomp.c?ds=sidebyside diff --git a/regcomp.c b/regcomp.c index adda2aa..d26489c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -19,6 +19,27 @@ * with the POSIX routines of the same names. */ +#ifdef PERL_EXT_RE_BUILD +/* need to replace pregcomp et al, so enable that */ +# ifndef PERL_IN_XSUB_RE +# define PERL_IN_XSUB_RE +# endif +/* need access to debugger hooks */ +# ifndef DEBUGGING +# define DEBUGGING +# endif +#endif + +#ifdef PERL_IN_XSUB_RE +/* We *really* need to overwrite these symbols: */ +# define Perl_pregcomp my_regcomp +# define Perl_regdump my_regdump +# define Perl_regprop my_regprop +/* *These* symbols are masked to allow static link. */ +# define Perl_pregfree my_regfree +# define Perl_regnext my_regnext +#endif + /*SUPPRESS 112*/ /* * pregcomp and pregexec -- regsub and regerror are not used in perl @@ -55,28 +76,17 @@ */ #include "EXTERN.h" #include "perl.h" -#include "INTERN.h" + +#ifndef PERL_IN_XSUB_RE +# include "INTERN.h" +#endif #define REG_COMP_C #include "regcomp.h" -#ifdef USE_THREADS +#ifdef op #undef op -#endif /* USE_THREADS */ - -static regnode regdummy; -static char * regparse; /* Input-scan pointer. */ -static char * regxend; /* End of input for compile */ -static regnode * regcode; /* Code-emit pointer; ®dummy = don't. */ -static I32 regnaughty; /* How bad is this pattern? */ -static I32 regsawback; /* Did we see \1, ...? */ - -/* This guys appear both in regcomp.c and regexec.c, but there is no - other reason to have them global. */ -static char * regprecomp; /* uncompiled string. */ -static I32 regnpar; /* () count. */ -static I32 regsize; /* Code size. */ -static U16 regflags; /* are we folding, multilining? */ +#endif /* op */ #ifdef MSDOS # if defined(BUGGY_MSC6) @@ -107,7 +117,7 @@ static U16 regflags; /* are we folding, multilining? */ * Flags to be passed up and down. */ #define WORST 0 /* Worst case. */ -#define HASWIDTH 0x1 /* Known never to match null string. */ +#define HASWIDTH 0x1 /* Known to match non-null strings. */ #define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ #define SPSTART 0x4 /* Starts with * or +. */ #define TRYAGAIN 0x8 /* Weeded out a declaration. */ @@ -116,6 +126,7 @@ static U16 regflags; /* are we folding, multilining? */ * Forward declarations for pregcomp()'s friends. */ +#ifndef PERL_OBJECT static regnode *reg _((I32, I32 *)); static regnode *reganode _((U8, U32)); static regnode *regatom _((I32 *)); @@ -127,23 +138,15 @@ static regnode *reg_node _((U8)); static regnode *regpiece _((I32 *)); static void reginsert _((U8, regnode *)); static void regoptail _((regnode *, regnode *)); -static void regset _((char *, I32)); static void regtail _((regnode *, regnode *)); static char* regwhite _((char *, char *)); static char* nextchar _((void)); - -static U32 regseen; -static I32 seen_zerolen; -static regexp *rx; -static I32 extralen; - -#ifdef DEBUGGING -static int colorset; -char *colors[4]; -#endif +static void re_croak2 _((const char* pat1,const char* pat2,...)) __attribute__((noreturn)); +#endif /* Length of a variant. */ +#ifndef PERL_OBJECT typedef struct { I32 len_min; I32 len_delta; @@ -161,6 +164,7 @@ typedef struct { I32 offset_float_max; I32 flags; } scan_data_t; +#endif static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; @@ -170,8 +174,13 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; #define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL) #define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL) -#define SF_FIX_SHIFT_EOL (+2) -#define SF_FL_SHIFT_EOL (+4) +#ifdef NO_UNARY_PLUS +# define SF_FIX_SHIFT_EOL (0+2) +# define SF_FL_SHIFT_EOL (0+4) +#else +# define SF_FIX_SHIFT_EOL (+2) +# define SF_FL_SHIFT_EOL (+4) +#endif #define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL) #define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL) @@ -182,8 +191,9 @@ static scan_data_t zero_scan_data = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; #define SF_HAS_PAR 0x80 #define SF_IN_PAR 0x100 #define SF_HAS_EVAL 0x200 +#define SCF_DO_SUBSTR 0x400 -static void +STATIC void scan_commit(scan_data_t *data) { STRLEN l = SvCUR(data->last_found); @@ -215,21 +225,21 @@ scan_commit(scan_data_t *data) data->flags &= ~SF_BEFORE_EOL; } -#define SCF_DO_SUBSTR 1 - /* Stops at toplevel WHILEM as well as at `last'. At end *scanp is set to the position after last scanned or to NULL. */ -static I32 +STATIC I32 study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 flags) /* scanp: Start here (read-write). */ /* deltap: Write maxlen-minlen here. */ /* last: Stop before this one. */ { + dTHR; I32 min = 0, pars = 0, code; regnode *scan = *scanp, *next; I32 delta = 0; int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF); + int is_inf_internal = 0; /* The studied chunk is infinite */ I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0; scan_data_t data_fake; @@ -283,6 +293,10 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 /* Allow dumping */ n = scan + (*OPERAND(scan) + 2 - 1)/sizeof(regnode) + 2; while (n <= stop) { + /* Purify reports a benign UMR here sometimes, because we + * don't initialize the OP() slot of a node when that node + * is occupied by just the trailing null of the string in + * an EXACT node */ if (regkind[(U8)OP(n)] != NOTHING || OP(n) == NOTHING) { OP(n) = OPTIMIZED; NEXT_OFF(n) = 0; @@ -293,7 +307,10 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 } if (OP(scan) != CURLYX) { - int max = (reg_off_by_arg[OP(scan)] ? I32_MAX : U16_MAX); + int max = (reg_off_by_arg[OP(scan)] + ? I32_MAX + /* I32 may be smaller than U16 on CRAYs! */ + : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX)); int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan)); int noff; regnode *n = scan; @@ -336,11 +353,11 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 if (max1 < minnext + deltanext) max1 = minnext + deltanext; if (deltanext == I32_MAX) - is_inf = 1; + is_inf = is_inf_internal = 1; scan = next; if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; - if (data_fake.flags & SF_HAS_EVAL) + if (data && (data_fake.flags & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; if (code == SUSPEND) break; @@ -407,7 +424,7 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 min++; /* Fall through. */ case STAR: - is_inf = 1; + is_inf = is_inf_internal = 1; scan = regnext(scan); if (flags & SCF_DO_SUBSTR) { scan_commit(data); @@ -436,16 +453,18 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 ? (flags & ~SCF_DO_SUBSTR) : flags); if (!scan) /* It was not CURLYX, but CURLY. */ scan = next; - if (dowarn && (minnext + deltanext == 0) - && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))) + if (PL_dowarn && (minnext + deltanext == 0) + && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) + && maxcount <= 10000) /* Complement check for big count */ warn("Strange *+?{} on zero-length expression"); min += minnext * mincount; - is_inf |= (maxcount == REG_INFTY && (minnext + deltanext) > 0 - || deltanext == I32_MAX); + is_inf_internal |= (maxcount == REG_INFTY + && (minnext + deltanext) > 0 + || deltanext == I32_MAX); + is_inf |= is_inf_internal; delta += (minnext + deltanext) * maxcount - minnext * mincount; /* Try powerful optimization CURLYX => CURLYN. */ -#ifdef REGALIGN_STRUCT if ( OP(oscan) == CURLYX && data && data->flags & SF_IN_PAR && !(data->flags & SF_HAS_EVAL) @@ -477,16 +496,11 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */ #endif } -#endif nogo: /* Try optimization CURLYX => CURLYM. */ if ( OP(oscan) == CURLYX && data -#ifdef REGALIGN_STRUCT && !(data->flags & SF_HAS_PAR) -#else - && !(data->flags & (SF_HAS_PAR|SF_IN_PAR)) -#endif && !(data->flags & SF_HAS_EVAL) && !deltanext ) { /* XXXX How to optimize if data == 0? */ @@ -499,7 +513,6 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 && (OP(nxt2) != WHILEM)) nxt = nxt2; OP(nxt2) = SUCCEED; /* Whas WHILEM */ -#ifdef REGALIGN_STRUCT /* Need to optimize away parenths. */ if (data->flags & SF_IN_PAR) { /* Set the parenth number. */ @@ -535,7 +548,6 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 study_chunk(&nxt1, &deltanext, nxt, NULL, 0); } else oscan->flags = 0; -#endif } if (data && fl & (SF_HAS_PAR|SF_IN_PAR)) pars++; @@ -585,24 +597,23 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 } data->longest = &(data->longest_float); } + SvREFCNT_dec(last_str); } - if (fl & SF_HAS_EVAL) + if (data && (fl & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; optimize_curly_tail: -#ifdef REGALIGN if (OP(oscan) != CURLYX) { while (regkind[(U8)OP(next = regnext(oscan))] == NOTHING && NEXT_OFF(next)) NEXT_OFF(oscan) += NEXT_OFF(next); } -#endif continue; default: /* REF only? */ if (flags & SCF_DO_SUBSTR) { scan_commit(data); data->longest = &(data->longest_float); } - is_inf = 1; + is_inf = is_inf_internal = 1; break; } } else if (strchr(simple,OP(scan))) { @@ -635,16 +646,14 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 } if (data && data_fake.flags & (SF_HAS_PAR|SF_IN_PAR)) pars++; - if (data_fake.flags & SF_HAS_EVAL) + if (data && (data_fake.flags & SF_HAS_EVAL)) data->flags |= SF_HAS_EVAL; } else if (OP(scan) == OPEN) { pars++; } else if (OP(scan) == CLOSE && ARG(scan) == is_par) { -#ifdef REGALIGN_STRUCT next = regnext(scan); if ( next && (OP(next) != WHILEM) && next < last) -#endif is_par = 0; /* Disable optimization */ } else if (OP(scan) == EVAL) { if (data) @@ -656,7 +665,7 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 finish: *scanp = scan; - *deltap = is_inf ? I32_MAX : delta; + *deltap = is_inf_internal ? I32_MAX : delta; if (flags & SCF_DO_SUBSTR && is_inf) data->pos_delta = I32_MAX - data->pos_min; if (is_par > U8_MAX) @@ -671,23 +680,24 @@ study_chunk(regnode **scanp, I32 *deltap, regnode *last, scan_data_t *data, U32 return min; } -static I32 +STATIC I32 add_data(I32 n, char *s) { - if (rx->data) { - Renewc(rx->data, - sizeof(*rx->data) + sizeof(void*) * (rx->data->count + n - 1), + dTHR; + if (PL_regcomp_rx->data) { + Renewc(PL_regcomp_rx->data, + sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (PL_regcomp_rx->data->count + n - 1), char, struct reg_data); - Renew(rx->data->what, rx->data->count + n, U8); - rx->data->count += n; + Renew(PL_regcomp_rx->data->what, PL_regcomp_rx->data->count + n, U8); + PL_regcomp_rx->data->count += n; } else { - Newc(1207, rx->data, sizeof(*rx->data) + sizeof(void*) * (n - 1), + Newc(1207, PL_regcomp_rx->data, sizeof(*PL_regcomp_rx->data) + sizeof(void*) * (n - 1), char, struct reg_data); - New(1208, rx->data->what, n, U8); - rx->data->count = n; + New(1208, PL_regcomp_rx->data->what, n, U8); + PL_regcomp_rx->data->count = n; } - Copy(s, rx->data->what + rx->data->count - n, n, U8); - return rx->data->count - n; + Copy(s, PL_regcomp_rx->data->what + PL_regcomp_rx->data->count - n, n, U8); + return PL_regcomp_rx->data->count - n; } /* @@ -708,6 +718,7 @@ add_data(I32 n, char *s) regexp * pregcomp(char *exp, char *xend, PMOP *pm) { + dTHR; register regexp *r; regnode *scan; SV **longest; @@ -722,96 +733,100 @@ pregcomp(char *exp, char *xend, PMOP *pm) if (exp == NULL) FAIL("NULL regexp argument"); - regprecomp = savepvn(exp, xend - exp); + PL_regprecomp = savepvn(exp, xend - exp); DEBUG_r(PerlIO_printf(Perl_debug_log, "compiling RE `%*s'\n", - xend - exp, regprecomp)); - regflags = pm->op_pmflags; - regsawback = 0; + xend - exp, PL_regprecomp)); + PL_regflags = pm->op_pmflags; + PL_regsawback = 0; - regseen = 0; - seen_zerolen = *exp == '^' ? -1 : 0; - extralen = 0; + PL_regseen = 0; + PL_seen_zerolen = *exp == '^' ? -1 : 0; + PL_seen_evals = 0; + PL_extralen = 0; /* First pass: determine size, legality. */ - regparse = exp; - regxend = xend; - regnaughty = 0; - regnpar = 1; - regsize = 0L; - regcode = ®dummy; - regc((U8)MAGIC, (char*)regcode); + PL_regcomp_parse = exp; + PL_regxend = xend; + PL_regnaughty = 0; + PL_regnpar = 1; + PL_regsize = 0L; + PL_regcode = &PL_regdummy; + regc((U8)MAGIC, (char*)PL_regcode); if (reg(0, &flags) == NULL) { - Safefree(regprecomp); - regprecomp = Nullch; + Safefree(PL_regprecomp); + PL_regprecomp = Nullch; return(NULL); } - DEBUG_r(PerlIO_printf(Perl_debug_log, "size %d ", regsize)); + DEBUG_r(PerlIO_printf(Perl_debug_log, "size %d ", PL_regsize)); DEBUG_r( - if (!colorset) { + if (!PL_colorset) { int i = 0; - char *s = getenv("TERMCAP_COLORS"); + char *s = PerlEnv_getenv("TERMCAP_COLORS"); - colorset = 1; + PL_colorset = 1; if (s) { - colors[0] = s = savepv(s); + PL_colors[0] = s = savepv(s); while (++i < 4) { s = strchr(s, '\t'); if (!s) FAIL("Not enough TABs in TERMCAP_COLORS"); *s = '\0'; - colors[i] = ++s; + PL_colors[i] = ++s; } } else { while (i < 4) - colors[i++] = ""; + PL_colors[i++] = ""; } /* Reset colors: */ PerlIO_printf(Perl_debug_log, "%s%s%s%s", - colors[0],colors[1],colors[2],colors[3]); + PL_colors[0],PL_colors[1],PL_colors[2],PL_colors[3]); } ); /* Small enough for pointer-storage convention? If extralen==0, this means that we will not need long jumps. */ -#ifndef REGALIGN_STRUCT - if (regsize >= 0x10000L && extralen) - FAIL("regexp too big"); -#else - if (regsize >= 0x10000L && extralen) - regsize += extralen; + if (PL_regsize >= 0x10000L && PL_extralen) + PL_regsize += PL_extralen; else - extralen = 0; -#endif + PL_extralen = 0; /* Allocate space and initialize. */ - Newc(1001, r, sizeof(regexp) + (unsigned)regsize * sizeof(regnode), + Newc(1001, r, sizeof(regexp) + (unsigned)PL_regsize * sizeof(regnode), char, regexp); if (r == NULL) FAIL("regexp out of space"); r->refcnt = 1; r->prelen = xend - exp; - r->precomp = regprecomp; + r->precomp = PL_regprecomp; r->subbeg = r->subbase = NULL; - rx = r; + r->nparens = PL_regnpar - 1; /* set early to validate backrefs */ + PL_regcomp_rx = r; /* Second pass: emit code. */ - regparse = exp; - regxend = xend; - regnaughty = 0; - regnpar = 1; - regcode = r->program; - regc((U8)MAGIC, (char*) regcode++); + PL_regcomp_parse = exp; + PL_regxend = xend; + PL_regnaughty = 0; + PL_regnpar = 1; + PL_regcode = r->program; + /* Store the count of eval-groups for security checks: */ + PL_regcode->next_off = ((PL_seen_evals > U16_MAX) ? U16_MAX : PL_seen_evals); + regc((U8)MAGIC, (char*) PL_regcode++); r->data = 0; if (reg(0, &flags) == NULL) return(NULL); /* Dig out information for optimizations. */ - pm->op_pmflags = regflags; - r->reganch = 0; + r->reganch = pm->op_pmflags & PMf_COMPILETIME; + pm->op_pmflags = PL_regflags; r->regstclass = NULL; - r->naughty = regnaughty >= 10; /* Probably an expensive pattern. */ + r->naughty = PL_regnaughty >= 10; /* Probably an expensive pattern. */ scan = r->program + 1; /* First BRANCH. */ + + /* XXXX To minimize changes to RE engine we always allocate + 3-units-long substrs field. */ + Newz(1004, r->substrs, 1, struct reg_substr_data); + if (OP(scan) != BRANCH) { /* Only one top-level choice. */ scan_data_t data; I32 fake; @@ -859,7 +874,7 @@ pregcomp(char *exp, char *xend, PMOP *pm) first = NEXTOPER(first); goto again; } - if (sawplus && (!sawopen || !regsawback)) + if (sawplus && (!sawopen || !PL_regsawback)) r->reganch |= ROPT_SKIP; /* x+ must match 1st of run */ /* Scan is after the zeroth branch, first is atomic matcher. */ @@ -884,12 +899,12 @@ pregcomp(char *exp, char *xend, PMOP *pm) data.longest = &(data.longest_fixed); first = scan; - minlen = study_chunk(&first, &fake, scan + regsize, /* Up to end */ + minlen = study_chunk(&first, &fake, scan + PL_regsize, /* Up to end */ &data, SCF_DO_SUBSTR); - if ( regnpar == 1 && data.longest == &(data.longest_fixed) + if ( PL_regnpar == 1 && data.longest == &(data.longest_fixed) && data.last_start_min == 0 && data.last_end > 0 - && !seen_zerolen - && (!(regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) + && !PL_seen_zerolen + && (!(PL_regseen & REG_SEEN_GPOS) || (r->reganch & ROPT_ANCH_GPOS))) r->reganch |= ROPT_CHECK_ALL; scan_commit(&data); SvREFCNT_dec(data.last_found); @@ -898,22 +913,23 @@ pregcomp(char *exp, char *xend, PMOP *pm) if (longest_float_length || (data.flags & SF_FL_BEFORE_EOL && (!(data.flags & SF_FL_BEFORE_MEOL) - || (regflags & PMf_MULTILINE)))) { + || (PL_regflags & PMf_MULTILINE)))) { if (SvCUR(data.longest_fixed) - && data.offset_fixed == data.offset_float_min) - goto remove; /* Like in (a)+. */ + && data.offset_fixed == data.offset_float_min + && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)) + goto remove_float; /* Like in (a)+. */ r->float_substr = data.longest_float; r->float_min_offset = data.offset_float_min; r->float_max_offset = data.offset_float_max; - fbm_compile(r->float_substr); + fbm_compile(r->float_substr, 0); BmUSEFUL(r->float_substr) = 100; if (data.flags & SF_FL_BEFORE_EOL /* Cannot have SEOL and MULTI */ && (!(data.flags & SF_FL_BEFORE_MEOL) - || (regflags & PMf_MULTILINE))) + || (PL_regflags & PMf_MULTILINE))) SvTAIL_on(r->float_substr); } else { - remove: + remove_float: r->float_substr = Nullsv; SvREFCNT_dec(data.longest_float); longest_float_length = 0; @@ -923,14 +939,14 @@ pregcomp(char *exp, char *xend, PMOP *pm) if (longest_fixed_length || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (regflags & PMf_MULTILINE)))) { + || (PL_regflags & PMf_MULTILINE)))) { r->anchored_substr = data.longest_fixed; r->anchored_offset = data.offset_fixed; - fbm_compile(r->anchored_substr); + fbm_compile(r->anchored_substr, 0); BmUSEFUL(r->anchored_substr) = 100; if (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */ && (!(data.flags & SF_FIX_BEFORE_MEOL) - || (regflags & PMf_MULTILINE))) + || (PL_regflags & PMf_MULTILINE))) SvTAIL_on(r->anchored_substr); } else { r->anchored_substr = Nullsv; @@ -955,18 +971,19 @@ pregcomp(char *exp, char *xend, PMOP *pm) DEBUG_r(PerlIO_printf(Perl_debug_log, "\n")); scan = r->program + 1; - minlen = study_chunk(&scan, &fake, scan + regsize, NULL, 0); + minlen = study_chunk(&scan, &fake, scan + PL_regsize, NULL, 0); r->check_substr = r->anchored_substr = r->float_substr = Nullsv; } - r->nparens = regnpar - 1; r->minlen = minlen; - if (regseen & REG_SEEN_GPOS) + if (PL_regseen & REG_SEEN_GPOS) r->reganch |= ROPT_GPOS_SEEN; - if (regseen & REG_SEEN_LOOKBEHIND) + if (PL_regseen & REG_SEEN_LOOKBEHIND) r->reganch |= ROPT_LOOKBEHIND_SEEN; - Newz(1002, r->startp, regnpar, char*); - Newz(1002, r->endp, regnpar, char*); + if (PL_regseen & REG_SEEN_EVAL) + r->reganch |= ROPT_EVAL_SEEN; + Newz(1002, r->startp, PL_regnpar, char*); + Newz(1002, r->endp, PL_regnpar, char*); DEBUG_r(regdump(r)); return(r); } @@ -980,40 +997,41 @@ pregcomp(char *exp, char *xend, PMOP *pm) * is a trifle forced, but the need to tie the tails of the branches to what * follows makes it hard to avoid. */ -static regnode * +STATIC regnode * reg(I32 paren, I32 *flagp) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ { + dTHR; register regnode *ret; /* Will be the head of the group. */ register regnode *br; register regnode *lastbr; register regnode *ender = 0; register I32 parno = 0; - I32 flags, oregflags = regflags, have_branch = 0, open = 0; + I32 flags, oregflags = PL_regflags, have_branch = 0, open = 0; char c; - *flagp = HASWIDTH; /* Tentatively. */ + *flagp = 0; /* Tentatively. */ /* Make an OPEN node, if parenthesized. */ if (paren) { - if (*regparse == '?') { - regparse++; - paren = *regparse++; + if (*PL_regcomp_parse == '?') { + U16 posflags = 0, negflags = 0; + U16 *flagsp = &posflags; + + PL_regcomp_parse++; + paren = *PL_regcomp_parse++; ret = NULL; /* For look-ahead/behind. */ switch (paren) { case '<': -#ifndef REGALIGN_STRUCT - FAIL("lookbehind non-implemented without REGALIGN_STRUCT"); -#endif - regseen |= REG_SEEN_LOOKBEHIND; - if (*regparse == '!') + PL_regseen |= REG_SEEN_LOOKBEHIND; + if (*PL_regcomp_parse == '!') paren = ','; - if (*regparse != '=' && *regparse != '!') + if (*PL_regcomp_parse != '=' && *PL_regcomp_parse != '!') goto unknown; - regparse++; + PL_regcomp_parse++; case '=': case '!': - seen_zerolen++; + PL_seen_zerolen++; case ':': case '>': break; @@ -1022,9 +1040,9 @@ reg(I32 paren, I32 *flagp) FAIL2("Sequence (?%c...) not implemented", (int)paren); break; case '#': - while (*regparse && *regparse != ')') - regparse++; - if (*regparse != ')') + while (*PL_regcomp_parse && *PL_regcomp_parse != ')') + PL_regcomp_parse++; + if (*PL_regcomp_parse != ')') FAIL("Sequence (?#... not terminated"); nextchar(); *flagp = TRYAGAIN; @@ -1034,61 +1052,67 @@ reg(I32 paren, I32 *flagp) dTHR; I32 count = 1, n = 0; char c; - char *s = regparse; + char *s = PL_regcomp_parse; SV *sv; OP_4tree *sop, *rop; - seen_zerolen++; - while (count && (c = *regparse)) { - if (c == '\\' && regparse[1]) - regparse++; + PL_seen_zerolen++; + PL_regseen |= REG_SEEN_EVAL; + while (count && (c = *PL_regcomp_parse)) { + if (c == '\\' && PL_regcomp_parse[1]) + PL_regcomp_parse++; else if (c == '{') count++; else if (c == '}') count--; - regparse++; + PL_regcomp_parse++; } - if (*regparse != ')') + if (*PL_regcomp_parse != ')') FAIL("Sequence (?{...}) not terminated or not {}-balanced"); if (!SIZE_ONLY) { AV *av; - if (regparse - 1 - s) - sv = newSVpv(s, regparse - 1 - s); + if (PL_regcomp_parse - 1 - s) + sv = newSVpv(s, PL_regcomp_parse - 1 - s); else sv = newSVpv("", 0); rop = sv_compile_2op(sv, &sop, "re", &av); n = add_data(3, "nso"); - rx->data->data[n] = (void*)rop; - rx->data->data[n+1] = (void*)av; - rx->data->data[n+2] = (void*)sop; + PL_regcomp_rx->data->data[n] = (void*)rop; + PL_regcomp_rx->data->data[n+1] = (void*)av; + PL_regcomp_rx->data->data[n+2] = (void*)sop; SvREFCNT_dec(sv); + } else { /* First pass */ + if (PL_reginterp_cnt < ++PL_seen_evals && PL_curcop != &PL_compiling) + /* No compiled RE interpolated, has runtime + components ===> unsafe. */ + FAIL("Eval-group not allowed at runtime, use re 'eval'"); + if (PL_tainted) + FAIL("Eval-group in insecure regular expression"); } nextchar(); - if (tainted) - FAIL("Eval-group in insecure regular expression"); return reganode(EVAL, n); } case '(': { - if (regparse[0] == '?') { - if (regparse[1] == '=' || regparse[1] == '!' - || regparse[1] == '<' - || regparse[1] == '{') { /* Lookahead or eval. */ + if (PL_regcomp_parse[0] == '?') { + if (PL_regcomp_parse[1] == '=' || PL_regcomp_parse[1] == '!' + || PL_regcomp_parse[1] == '<' + || PL_regcomp_parse[1] == '{') { /* Lookahead or eval. */ I32 flag; ret = reg_node(LOGICAL); regtail(ret, reg(1, &flag)); goto insert_if; } - } else if (regparse[0] >= '1' && regparse[0] <= '9' ) { - parno = atoi(regparse++); + } else if (PL_regcomp_parse[0] >= '1' && PL_regcomp_parse[0] <= '9' ) { + parno = atoi(PL_regcomp_parse++); - while (isDIGIT(*regparse)) - regparse++; + while (isDIGIT(*PL_regcomp_parse)) + PL_regcomp_parse++; ret = reganode(GROUPP, parno); if ((c = *nextchar()) != ')') FAIL2("Switch (?(number%c not recognized", c); @@ -1100,10 +1124,14 @@ reg(I32 paren, I32 *flagp) else regtail(br, reganode(LONGJMP, 0)); c = *nextchar(); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; if (c == '|') { lastbr = reganode(IFTHEN, 0); /* Fake one for optimizer. */ regbranch(&flags, 1); regtail(ret, lastbr); + if (flags&HASWIDTH) + *flagp |= HASWIDTH; c = *nextchar(); } else lastbr = NULL; @@ -1118,27 +1146,43 @@ reg(I32 paren, I32 *flagp) regtail(ret, ender); return ret; } else { - FAIL2("Unknown condition for (?(%.2s", regparse); + FAIL2("Unknown condition for (?(%.2s", PL_regcomp_parse); } } case 0: FAIL("Sequence (? incomplete"); break; default: - --regparse; - while (*regparse && strchr("iogcmsx", *regparse)) - pmflag(®flags, *regparse++); + --PL_regcomp_parse; + parse_flags: + while (*PL_regcomp_parse && strchr("iogcmsx", *PL_regcomp_parse)) { + if (*PL_regcomp_parse != 'o') + pmflag(flagsp, *PL_regcomp_parse); + ++PL_regcomp_parse; + } + if (*PL_regcomp_parse == '-') { + flagsp = &negflags; + ++PL_regcomp_parse; + goto parse_flags; + } + PL_regflags |= posflags; + PL_regflags &= ~negflags; + if (*PL_regcomp_parse == ':') { + PL_regcomp_parse++; + paren = ':'; + break; + } unknown: - if (*regparse != ')') - FAIL2("Sequence (?%c...) not recognized", *regparse); + if (*PL_regcomp_parse != ')') + FAIL2("Sequence (?%c...) not recognized", *PL_regcomp_parse); nextchar(); *flagp = TRYAGAIN; return NULL; } } else { - parno = regnpar; - regnpar++; + parno = PL_regnpar; + PL_regnpar++; ret = reganode(OPEN, parno); open = 1; } @@ -1149,14 +1193,14 @@ reg(I32 paren, I32 *flagp) br = regbranch(&flags, 1); if (br == NULL) return(NULL); - if (*regparse == '|') { - if (!SIZE_ONLY && extralen) { + if (*PL_regcomp_parse == '|') { + if (!SIZE_ONLY && PL_extralen) { reginsert(BRANCHJ, br); } else reginsert(BRANCH, br); have_branch = 1; if (SIZE_ONLY) - extralen += 1; /* For BRANCHJ-BRANCH. */ + PL_extralen += 1; /* For BRANCHJ-BRANCH. */ } else if (paren == ':') { *flagp |= flags&SIMPLE; } @@ -1164,25 +1208,25 @@ reg(I32 paren, I32 *flagp) regtail(ret, br); /* OPEN -> first. */ } else if (paren != '?') /* Not Conditional */ ret = br; - if (!(flags&HASWIDTH)) - *flagp &= ~HASWIDTH; + if (flags&HASWIDTH) + *flagp |= HASWIDTH; *flagp |= flags&SPSTART; lastbr = br; - while (*regparse == '|') { - if (!SIZE_ONLY && extralen) { + while (*PL_regcomp_parse == '|') { + if (!SIZE_ONLY && PL_extralen) { ender = reganode(LONGJMP,0); regtail(NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */ } if (SIZE_ONLY) - extralen += 2; /* Account for LONGJMP. */ + PL_extralen += 2; /* Account for LONGJMP. */ nextchar(); br = regbranch(&flags, 0); if (br == NULL) return(NULL); regtail(lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; - if (!(flags&HASWIDTH)) - *flagp &= ~HASWIDTH; + if (flags&HASWIDTH) + *flagp |= HASWIDTH; *flagp |= flags&SPSTART; } @@ -1196,12 +1240,13 @@ reg(I32 paren, I32 *flagp) ender = reganode(CLOSE, parno); break; case '<': - case '>': case ',': case '=': case '!': - ender = reg_node(SUCCEED); *flagp &= ~HASWIDTH; + /* FALL THROUGH */ + case '>': + ender = reg_node(SUCCEED); break; case 0: ender = reg_node(END); @@ -1228,25 +1273,23 @@ reg(I32 paren, I32 *flagp) if (paren == '>') node = SUSPEND, flag = 0; reginsert(node,ret); -#ifdef REGALIGN_STRUCT ret->flags = flag; -#endif regtail(ret, reg_node(TAIL)); } } /* Check for proper termination. */ - if (paren && (regparse >= regxend || *nextchar() != ')')) { + if (paren && (PL_regcomp_parse >= PL_regxend || *nextchar() != ')')) { FAIL("unmatched () in regexp"); - } else if (!paren && regparse < regxend) { - if (*regparse == ')') { + } else if (!paren && PL_regcomp_parse < PL_regxend) { + if (*PL_regcomp_parse == ')') { FAIL("unmatched () in regexp"); } else FAIL("junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } if (paren != 0) { - regflags = oregflags; + PL_regflags = oregflags; } return(ret); @@ -1257,9 +1300,10 @@ reg(I32 paren, I32 *flagp) * * Implements the concatenation operator. */ -static regnode * +STATIC regnode * regbranch(I32 *flagp, I32 first) { + dTHR; register regnode *ret; register regnode *chain = NULL; register regnode *latest; @@ -1268,20 +1312,20 @@ regbranch(I32 *flagp, I32 first) if (first) ret = NULL; else { - if (!SIZE_ONLY && extralen) + if (!SIZE_ONLY && PL_extralen) ret = reganode(BRANCHJ,0); else ret = reg_node(BRANCH); } if (!first && SIZE_ONLY) - extralen += 1; /* BRANCHJ */ + PL_extralen += 1; /* BRANCHJ */ *flagp = WORST; /* Tentatively. */ - regparse--; + PL_regcomp_parse--; nextchar(); - while (regparse < regxend && *regparse != '|' && *regparse != ')') { + while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '|' && *PL_regcomp_parse != ')') { flags &= ~TRYAGAIN; latest = regpiece(&flags); if (latest == NULL) { @@ -1294,7 +1338,7 @@ regbranch(I32 *flagp, I32 first) if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { - regnaughty++; + PL_regnaughty++; regtail(chain, latest); } chain = latest; @@ -1321,14 +1365,15 @@ regbranch(I32 *flagp, I32 first) * It might seem that this node could be dispensed with entirely, but the * endmarker role is not redundant. */ -static regnode * +STATIC regnode * regpiece(I32 *flagp) { + dTHR; register regnode *ret; register char op; register char *next; I32 flags; - char *origparse = regparse; + char *origparse = PL_regcomp_parse; char *maxpos; I32 min; I32 max = REG_INFTY; @@ -1340,10 +1385,10 @@ regpiece(I32 *flagp) return(NULL); } - op = *regparse; + op = *PL_regcomp_parse; - if (op == '{' && regcurly(regparse)) { - next = regparse + 1; + if (op == '{' && regcurly(PL_regcomp_parse)) { + next = PL_regcomp_parse + 1; maxpos = Nullch; while (isDIGIT(*next) || *next == ',') { if (*next == ',') { @@ -1357,46 +1402,46 @@ regpiece(I32 *flagp) if (*next == '}') { /* got one */ if (!maxpos) maxpos = next; - regparse++; - min = atoi(regparse); + PL_regcomp_parse++; + min = atoi(PL_regcomp_parse); if (*maxpos == ',') maxpos++; else - maxpos = regparse; + maxpos = PL_regcomp_parse; max = atoi(maxpos); if (!max && *maxpos != '0') max = REG_INFTY; /* meaning "infinity" */ else if (max >= REG_INFTY) FAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1); - regparse = next; + PL_regcomp_parse = next; nextchar(); do_curly: if ((flags&SIMPLE)) { - regnaughty += 2 + regnaughty / 2; + PL_regnaughty += 2 + PL_regnaughty / 2; reginsert(CURLY, ret); } else { - regnaughty += 4 + regnaughty; /* compound interest */ + PL_regnaughty += 4 + PL_regnaughty; /* compound interest */ regtail(ret, reg_node(WHILEM)); - if (!SIZE_ONLY && extralen) { + if (!SIZE_ONLY && PL_extralen) { reginsert(LONGJMP,ret); reginsert(NOTHING,ret); NEXT_OFF(ret) = 3; /* Go over LONGJMP. */ } reginsert(CURLYX,ret); - if (!SIZE_ONLY && extralen) + if (!SIZE_ONLY && PL_extralen) NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */ regtail(ret, reg_node(NOTHING)); if (SIZE_ONLY) - extralen += 3; + PL_extralen += 3; } -#ifdef REGALIGN_STRUCT ret->flags = 0; -#endif if (min > 0) - *flagp = (WORST|HASWIDTH); + *flagp = WORST; + if (max > 0) + *flagp |= HASWIDTH; if (max && max < min) FAIL("Can't do {n,m} with n > m"); if (!SIZE_ONLY) { @@ -1420,24 +1465,20 @@ regpiece(I32 *flagp) nextchar(); - *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH); + *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH); if (op == '*' && (flags&SIMPLE)) { reginsert(STAR, ret); -#ifdef REGALIGN_STRUCT ret->flags = 0; -#endif - regnaughty += 4; + PL_regnaughty += 4; } else if (op == '*') { min = 0; goto do_curly; } else if (op == '+' && (flags&SIMPLE)) { reginsert(PLUS, ret); -#ifdef REGALIGN_STRUCT ret->flags = 0; -#endif - regnaughty += 3; + PL_regnaughty += 3; } else if (op == '+') { min = 1; @@ -1447,21 +1488,17 @@ regpiece(I32 *flagp) goto do_curly; } nest_check: - if (dowarn && !SIZE_ONLY && !(flags&HASWIDTH) && max > 10000) { + if (PL_dowarn && !SIZE_ONLY && !(flags&HASWIDTH) && max > 10000) { warn("%.*s matches null string many times", - regparse - origparse, origparse); + PL_regcomp_parse - origparse, origparse); } - if (*regparse == '?') { + if (*PL_regcomp_parse == '?') { nextchar(); reginsert(MINMOD, ret); -#ifdef REGALIGN regtail(ret, ret + NODE_STEP_REGNODE); -#else - regtail(ret, ret + 3); -#endif } - if (ISMULT2(regparse)) + if (ISMULT2(PL_regcomp_parse)) FAIL("nested *?+ in regexp"); return(ret); @@ -1477,48 +1514,49 @@ regpiece(I32 *flagp) * * [Yes, it is worth fixing, some scripts can run twice the speed.] */ -static regnode * +STATIC regnode * regatom(I32 *flagp) { + dTHR; register regnode *ret = 0; I32 flags; *flagp = WORST; /* Tentatively. */ tryagain: - switch (*regparse) { + switch (*PL_regcomp_parse) { case '^': - seen_zerolen++; + PL_seen_zerolen++; nextchar(); - if (regflags & PMf_MULTILINE) + if (PL_regflags & PMf_MULTILINE) ret = reg_node(MBOL); - else if (regflags & PMf_SINGLELINE) + else if (PL_regflags & PMf_SINGLELINE) ret = reg_node(SBOL); else ret = reg_node(BOL); break; case '$': - if (regparse[1]) - seen_zerolen++; + if (PL_regcomp_parse[1]) + PL_seen_zerolen++; nextchar(); - if (regflags & PMf_MULTILINE) + if (PL_regflags & PMf_MULTILINE) ret = reg_node(MEOL); - else if (regflags & PMf_SINGLELINE) + else if (PL_regflags & PMf_SINGLELINE) ret = reg_node(SEOL); else ret = reg_node(EOL); break; case '.': nextchar(); - if (regflags & PMf_SINGLELINE) + if (PL_regflags & PMf_SINGLELINE) ret = reg_node(SANY); else ret = reg_node(ANY); - regnaughty++; + PL_regnaughty++; *flagp |= HASWIDTH|SIMPLE; break; case '[': - regparse++; + PL_regcomp_parse++; ret = regclass(); *flagp |= HASWIDTH|SIMPLE; break; @@ -1538,12 +1576,12 @@ tryagain: *flagp |= TRYAGAIN; return NULL; } - FAIL2("internal urp in regexp at /%s/", regparse); + FAIL2("internal urp in regexp at /%s/", PL_regcomp_parse); /* Supposed to be caught earlier. */ break; case '{': - if (!regcurly(regparse)) { - regparse++; + if (!regcurly(PL_regcomp_parse)) { + PL_regcomp_parse++; goto defchar; } /* FALL THROUGH */ @@ -1553,16 +1591,16 @@ tryagain: FAIL("?+*{} follows nothing in regexp"); break; case '\\': - switch (*++regparse) { + switch (*++PL_regcomp_parse) { case 'A': - seen_zerolen++; + PL_seen_zerolen++; ret = reg_node(SBOL); *flagp |= SIMPLE; nextchar(); break; case 'G': ret = reg_node(GPOS); - regseen |= REG_SEEN_GPOS; + PL_regseen |= REG_SEEN_GPOS; *flagp |= SIMPLE; nextchar(); break; @@ -1571,35 +1609,41 @@ tryagain: *flagp |= SIMPLE; nextchar(); break; + case 'z': + ret = reg_node(EOS); + *flagp |= SIMPLE; + PL_seen_zerolen++; /* Do not optimize RE away */ + nextchar(); + break; case 'w': - ret = reg_node((regflags & PMf_LOCALE) ? ALNUML : ALNUM); + ret = reg_node((PL_regflags & PMf_LOCALE) ? ALNUML : ALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'W': - ret = reg_node((regflags & PMf_LOCALE) ? NALNUML : NALNUM); + ret = reg_node((PL_regflags & PMf_LOCALE) ? NALNUML : NALNUM); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'b': - seen_zerolen++; - ret = reg_node((regflags & PMf_LOCALE) ? BOUNDL : BOUND); + PL_seen_zerolen++; + ret = reg_node((PL_regflags & PMf_LOCALE) ? BOUNDL : BOUND); *flagp |= SIMPLE; nextchar(); break; case 'B': - seen_zerolen++; - ret = reg_node((regflags & PMf_LOCALE) ? NBOUNDL : NBOUND); + PL_seen_zerolen++; + ret = reg_node((PL_regflags & PMf_LOCALE) ? NBOUNDL : NBOUND); *flagp |= SIMPLE; nextchar(); break; case 's': - ret = reg_node((regflags & PMf_LOCALE) ? SPACEL : SPACE); + ret = reg_node((PL_regflags & PMf_LOCALE) ? SPACEL : SPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; case 'S': - ret = reg_node((regflags & PMf_LOCALE) ? NSPACEL : NSPACE); + ret = reg_node((PL_regflags & PMf_LOCALE) ? NSPACEL : NSPACE); *flagp |= HASWIDTH|SIMPLE; nextchar(); break; @@ -1626,25 +1670,27 @@ tryagain: case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': { - I32 num = atoi(regparse); + I32 num = atoi(PL_regcomp_parse); - if (num > 9 && num >= regnpar) + if (num > 9 && num >= PL_regnpar) goto defchar; else { - regsawback = 1; - ret = reganode((regflags & PMf_FOLD) - ? ((regflags & PMf_LOCALE) ? REFFL : REFF) + if (!SIZE_ONLY && num > PL_regcomp_rx->nparens) + FAIL("reference to nonexistent group"); + PL_regsawback = 1; + ret = reganode((PL_regflags & PMf_FOLD) + ? ((PL_regflags & PMf_LOCALE) ? REFFL : REFF) : REF, num); *flagp |= HASWIDTH; - while (isDIGIT(*regparse)) - regparse++; - regparse--; + while (isDIGIT(*PL_regcomp_parse)) + PL_regcomp_parse++; + PL_regcomp_parse--; nextchar(); } } break; case '\0': - if (regparse >= regxend) + if (PL_regcomp_parse >= PL_regxend) FAIL("trailing \\ in regexp"); /* FALL THROUGH */ default: @@ -1653,9 +1699,9 @@ tryagain: break; case '#': - if (regflags & PMf_EXTENDED) { - while (regparse < regxend && *regparse != '\n') regparse++; - if (regparse < regxend) + if (PL_regflags & PMf_EXTENDED) { + while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != '\n') PL_regcomp_parse++; + if (PL_regcomp_parse < PL_regxend) goto tryagain; } /* FALL THROUGH */ @@ -1667,22 +1713,22 @@ tryagain: char *oldp, *s; I32 numlen; - regparse++; + PL_regcomp_parse++; defchar: - ret = reg_node((regflags & PMf_FOLD) - ? ((regflags & PMf_LOCALE) ? EXACTFL : EXACTF) + ret = reg_node((PL_regflags & PMf_FOLD) + ? ((PL_regflags & PMf_LOCALE) ? EXACTFL : EXACTF) : EXACT); s = (char *) OPERAND(ret); regc(0, s++); /* save spot for len */ - for (len = 0, p = regparse - 1; - len < 127 && p < regxend; + for (len = 0, p = PL_regcomp_parse - 1; + len < 127 && p < PL_regxend; len++) { oldp = p; - if (regflags & PMf_EXTENDED) - p = regwhite(p, regxend); + if (PL_regflags & PMf_EXTENDED) + p = regwhite(p, PL_regxend); switch (*p) { case '^': case '$': @@ -1697,6 +1743,7 @@ tryagain: case 'A': case 'G': case 'Z': + case 'z': case 'w': case 'W': case 'b': @@ -1743,7 +1790,7 @@ tryagain: case '0': case '1': case '2': case '3':case '4': case '5': case '6': case '7': case '8':case '9': if (*p == '0' || - (isDIGIT(p[1]) && atoi(p) >= regnpar) ) { + (isDIGIT(p[1]) && atoi(p) >= PL_regnpar) ) { ender = scan_oct(p, 3, &numlen); p += numlen; } @@ -1753,7 +1800,7 @@ tryagain: } break; case '\0': - if (p >= regxend) + if (p >= PL_regxend) FAIL("trailing \\ in regexp"); /* FALL THROUGH */ default: @@ -1765,8 +1812,8 @@ tryagain: ender = *p++; break; } - if (regflags & PMf_EXTENDED) - p = regwhite(p, regxend); + if (PL_regflags & PMf_EXTENDED) + p = regwhite(p, PL_regxend); if (ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; @@ -1779,7 +1826,7 @@ tryagain: regc(ender, s++); } loopdone: - regparse = p - 1; + PL_regcomp_parse = p - 1; nextchar(); if (len < 0) FAIL("internal disaster in regexp"); @@ -1791,11 +1838,9 @@ tryagain: *OPERAND(ret) = len; regc('\0', s++); if (SIZE_ONLY) { -#ifdef REGALIGN_STRUCT - regsize += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode); -#endif + PL_regsize += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode); } else { - regcode += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode); + PL_regcode += (len + 2 + sizeof(regnode) - 1) / sizeof(regnode); } } break; @@ -1804,7 +1849,7 @@ tryagain: return(ret); } -static char * +STATIC char * regwhite(char *p, char *e) { while (p < e) { @@ -1821,18 +1866,10 @@ regwhite(char *p, char *e) return p; } -static void -regset(char *opnd, register I32 c) -{ - if (SIZE_ONLY) - return; - c &= 0xFF; - opnd[1 + (c >> 3)] |= (1 << (c & 7)); -} - -static regnode * +STATIC regnode * regclass(void) { + dTHR; register char *opnd, *s; register I32 Class; register I32 lastclass = 1234; @@ -1841,91 +1878,119 @@ regclass(void) register I32 def; I32 numlen; - s = opnd = (char *) OPERAND(regcode); + s = opnd = (char *) OPERAND(PL_regcode); ret = reg_node(ANYOF); for (Class = 0; Class < 33; Class++) regc(0, s++); - if (*regparse == '^') { /* Complement of range. */ - regnaughty++; - regparse++; + if (*PL_regcomp_parse == '^') { /* Complement of range. */ + PL_regnaughty++; + PL_regcomp_parse++; if (!SIZE_ONLY) *opnd |= ANYOF_INVERT; } if (!SIZE_ONLY) { - regcode += ANY_SKIP; - if (regflags & PMf_FOLD) + PL_regcode += ANY_SKIP; + if (PL_regflags & PMf_FOLD) *opnd |= ANYOF_FOLD; - if (regflags & PMf_LOCALE) + if (PL_regflags & PMf_LOCALE) *opnd |= ANYOF_LOCALE; } else { - regsize += ANY_SKIP; + PL_regsize += ANY_SKIP; } - if (*regparse == ']' || *regparse == '-') + if (*PL_regcomp_parse == ']' || *PL_regcomp_parse == '-') goto skipcond; /* allow 1st char to be ] or - */ - while (regparse < regxend && *regparse != ']') { + while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != ']') { skipcond: - Class = UCHARAT(regparse++); + Class = UCHARAT(PL_regcomp_parse++); + if (Class == '[' && PL_regcomp_parse + 1 < PL_regxend && + /* I smell either [: or [= or [. -- POSIX has been here, right? */ + (*PL_regcomp_parse == ':' || *PL_regcomp_parse == '=' || *PL_regcomp_parse == '.')) { + char posixccc = *PL_regcomp_parse; + char* posixccs = PL_regcomp_parse++; + + while (PL_regcomp_parse < PL_regxend && *PL_regcomp_parse != posixccc) + PL_regcomp_parse++; + if (PL_regcomp_parse == PL_regxend) + /* Grandfather lone [:, [=, [. */ + PL_regcomp_parse = posixccs; + else { + PL_regcomp_parse++; /* skip over the posixccc */ + if (*PL_regcomp_parse == ']') { + /* Not Implemented Yet. + * (POSIX Extended Character Classes, that is) + * The text between e.g. [: and :] would start + * at posixccs + 1 and stop at regcomp_parse - 2. */ + if (PL_dowarn && !SIZE_ONLY) + warn("Character class syntax [%c %c] is reserved for future extensions", posixccc, posixccc); + PL_regcomp_parse++; /* skip over the ending ] */ + } + } + } if (Class == '\\') { - Class = UCHARAT(regparse++); + Class = UCHARAT(PL_regcomp_parse++); switch (Class) { case 'w': - if (regflags & PMf_LOCALE) { - if (!SIZE_ONLY) + if (!SIZE_ONLY) { + if (PL_regflags & PMf_LOCALE) *opnd |= ANYOF_ALNUML; - } - else { - for (Class = 0; Class < 256; Class++) - if (isALNUM(Class)) - regset(opnd, Class); + else { + for (Class = 0; Class < 256; Class++) + if (isALNUM(Class)) + ANYOF_SET(opnd, Class); + } } lastclass = 1234; continue; case 'W': - if (regflags & PMf_LOCALE) { - if (!SIZE_ONLY) + if (!SIZE_ONLY) { + if (PL_regflags & PMf_LOCALE) *opnd |= ANYOF_NALNUML; - } - else { - for (Class = 0; Class < 256; Class++) - if (!isALNUM(Class)) - regset(opnd, Class); + else { + for (Class = 0; Class < 256; Class++) + if (!isALNUM(Class)) + ANYOF_SET(opnd, Class); + } } lastclass = 1234; continue; case 's': - if (regflags & PMf_LOCALE) { - if (!SIZE_ONLY) + if (!SIZE_ONLY) { + if (PL_regflags & PMf_LOCALE) *opnd |= ANYOF_SPACEL; - } - else { - for (Class = 0; Class < 256; Class++) - if (isSPACE(Class)) - regset(opnd, Class); + else { + for (Class = 0; Class < 256; Class++) + if (isSPACE(Class)) + ANYOF_SET(opnd, Class); + } } lastclass = 1234; continue; case 'S': - if (regflags & PMf_LOCALE) { - if (!SIZE_ONLY) + if (!SIZE_ONLY) { + if (PL_regflags & PMf_LOCALE) *opnd |= ANYOF_NSPACEL; - } - else { - for (Class = 0; Class < 256; Class++) - if (!isSPACE(Class)) - regset(opnd, Class); + else { + for (Class = 0; Class < 256; Class++) + if (!isSPACE(Class)) + ANYOF_SET(opnd, Class); + } } lastclass = 1234; continue; case 'd': - for (Class = '0'; Class <= '9'; Class++) - regset(opnd, Class); + if (!SIZE_ONLY) { + for (Class = '0'; Class <= '9'; Class++) + ANYOF_SET(opnd, Class); + } lastclass = 1234; continue; case 'D': - for (Class = 0; Class < '0'; Class++) - regset(opnd, Class); - for (Class = '9' + 1; Class < 256; Class++) - regset(opnd, Class); + if (!SIZE_ONLY) { + for (Class = 0; Class < '0'; Class++) + ANYOF_SET(opnd, Class); + for (Class = '9' + 1; Class < 256; Class++) + ANYOF_SET(opnd, Class); + } lastclass = 1234; continue; case 'n': @@ -1950,17 +2015,17 @@ regclass(void) Class = '\007'; break; case 'x': - Class = scan_hex(regparse, 2, &numlen); - regparse += numlen; + Class = scan_hex(PL_regcomp_parse, 2, &numlen); + PL_regcomp_parse += numlen; break; case 'c': - Class = UCHARAT(regparse++); + Class = UCHARAT(PL_regcomp_parse++); Class = toCTRL(Class); break; case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': - Class = scan_oct(--regparse, 3, &numlen); - regparse += numlen; + Class = scan_oct(--PL_regcomp_parse, 3, &numlen); + PL_regcomp_parse += numlen; break; } } @@ -1971,45 +2036,80 @@ regclass(void) } else { lastclass = Class; - if (*regparse == '-' && regparse+1 < regxend && - regparse[1] != ']') { - regparse++; + if (*PL_regcomp_parse == '-' && PL_regcomp_parse+1 < PL_regxend && + PL_regcomp_parse[1] != ']') { + PL_regcomp_parse++; range = 1; continue; /* do it next time */ } } - for ( ; lastclass <= Class; lastclass++) - regset(opnd, lastclass); + if (!SIZE_ONLY) { +#ifndef ASCIIish + register I32 i; + if ((isLOWER(lastclass) && isLOWER(Class)) || + (isUPPER(lastclass) && isUPPER(Class))) { + if (isLOWER(lastclass)) { + for (i = lastclass; i <= Class; i++) + if (isLOWER(i)) + ANYOF_SET(opnd, i); + } else { + for (i = lastclass; i <= Class; i++) + if (isUPPER(i)) + ANYOF_SET(opnd, i); + } + } + else +#endif + for ( ; lastclass <= Class; lastclass++) + ANYOF_SET(opnd, lastclass); + } lastclass = Class; } - if (*regparse != ']') + if (*PL_regcomp_parse != ']') FAIL("unmatched [] in regexp"); nextchar(); + /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */ + if (!SIZE_ONLY && (*opnd & (0xFF ^ ANYOF_INVERT)) == ANYOF_FOLD) { + for (Class = 0; Class < 256; ++Class) { + if (ANYOF_TEST(opnd, Class)) { + I32 cf = fold[Class]; + ANYOF_SET(opnd, cf); + } + } + *opnd &= ~ANYOF_FOLD; + } + /* optimize inverted simple patterns (e.g. [^a-z]) */ + if (!SIZE_ONLY && (*opnd & 0xFF) == ANYOF_INVERT) { + for (Class = 0; Class < 32; ++Class) + opnd[1 + Class] ^= 0xFF; + *opnd = 0; + } return ret; } -static char* +STATIC char* nextchar(void) { - char* retval = regparse++; + dTHR; + char* retval = PL_regcomp_parse++; for (;;) { - if (*regparse == '(' && regparse[1] == '?' && - regparse[2] == '#') { - while (*regparse && *regparse != ')') - regparse++; - regparse++; + if (*PL_regcomp_parse == '(' && PL_regcomp_parse[1] == '?' && + PL_regcomp_parse[2] == '#') { + while (*PL_regcomp_parse && *PL_regcomp_parse != ')') + PL_regcomp_parse++; + PL_regcomp_parse++; continue; } - if (regflags & PMf_EXTENDED) { - if (isSPACE(*regparse)) { - regparse++; + if (PL_regflags & PMf_EXTENDED) { + if (isSPACE(*PL_regcomp_parse)) { + PL_regcomp_parse++; continue; } - else if (*regparse == '#') { - while (*regparse && *regparse != '\n') - regparse++; - regparse++; + else if (*PL_regcomp_parse == '#') { + while (*PL_regcomp_parse && *PL_regcomp_parse != '\n') + PL_regcomp_parse++; + PL_regcomp_parse++; continue; } } @@ -2020,32 +2120,24 @@ nextchar(void) /* - reg_node - emit a node */ -static regnode * /* Location. */ -#ifdef CAN_PROTOTYPE +STATIC regnode * /* Location. */ reg_node(U8 op) -#else -reg_node(op) -U8 op; -#endif { + dTHR; register regnode *ret; register regnode *ptr; - ret = regcode; + ret = PL_regcode; if (SIZE_ONLY) { - SIZE_ALIGN(regsize); -#ifdef REGALIGN_STRUCT - regsize += 1; -#else - regsize += 3; -#endif + SIZE_ALIGN(PL_regsize); + PL_regsize += 1; return(ret); } NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); - regcode = ptr; + PL_regcode = ptr; return(ret); } @@ -2053,33 +2145,24 @@ U8 op; /* - reganode - emit a node with an argument */ -static regnode * /* Location. */ -#ifdef CAN_PROTOTYPE +STATIC regnode * /* Location. */ reganode(U8 op, U32 arg) -#else -reganode(op, arg) -U8 op; -U32 arg; -#endif { + dTHR; register regnode *ret; register regnode *ptr; - ret = regcode; + ret = PL_regcode; if (SIZE_ONLY) { - SIZE_ALIGN(regsize); -#ifdef REGALIGN - regsize += 2; -#else - regsize += 5; -#endif + SIZE_ALIGN(PL_regsize); + PL_regsize += 2; return(ret); } NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); - regcode = ptr; + PL_regcode = ptr; return(ret); } @@ -2087,16 +2170,10 @@ U32 arg; /* - regc - emit (if appropriate) a byte of code */ -#ifdef CAN_PROTOTYPE -static void +STATIC void regc(U8 b, char* s) -#else -static void -regc(b, s) -U8 b; -char *s; -#endif { + dTHR; if (!SIZE_ONLY) *s = b; } @@ -2106,16 +2183,10 @@ char *s; * * Means relocating the operand. */ -#ifdef CAN_PROTOTYPE -static void +STATIC void reginsert(U8 op, regnode *opnd) -#else -static void -reginsert(op, opnd) -U8 op; -regnode *opnd; -#endif { + dTHR; register regnode *src; register regnode *dst; register regnode *place; @@ -2124,13 +2195,13 @@ regnode *opnd; /* (regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ if (SIZE_ONLY) { - regsize += NODE_STEP_REGNODE + offset; + PL_regsize += NODE_STEP_REGNODE + offset; return; } - src = regcode; - regcode += NODE_STEP_REGNODE + offset; - dst = regcode; + src = PL_regcode; + PL_regcode += NODE_STEP_REGNODE + offset; + dst = PL_regcode; while (src > opnd) StructCopy(--src, --dst, regnode); @@ -2138,17 +2209,15 @@ regnode *opnd; src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); -#if defined(REGALIGN) && !defined(REGALIGN_STRUCT) - src[offset + 1] = '\177'; -#endif } /* - regtail - set the next-pointer at the end of a node chain of p to val. */ -static void +STATIC void regtail(regnode *p, regnode *val) { + dTHR; register regnode *scan; register regnode *temp; register I32 offset; @@ -2165,35 +2234,20 @@ regtail(regnode *p, regnode *val) scan = temp; } -#ifdef REGALIGN -# ifdef REGALIGN_STRUCT if (reg_off_by_arg[OP(scan)]) { ARG_SET(scan, val - scan); } else { NEXT_OFF(scan) = val - scan; } -# else - offset = val - scan; -# ifndef lint - *(short*)(scan+1) = offset; -# endif -#endif -#else - if (OP(scan) == BACK) - offset = scan - val; - else - offset = val - scan; - *(scan+1) = (offset>>8)&0377; - *(scan+2) = offset&0377; -#endif } /* - regoptail - regtail on operand of first argument; nop if operandless */ -static void +STATIC void regoptail(regnode *p, regnode *val) { + dTHR; /* "Operandless" and "op != BRANCH" are synonymous in practice. */ if (p == NULL || SIZE_ONLY) return; @@ -2226,11 +2280,11 @@ regcurly(register char *s) return TRUE; } -#ifdef DEBUGGING -static regnode * +STATIC regnode * dumpuntil(regnode *start, regnode *node, regnode *last, SV* sv, I32 l) { +#ifdef DEBUGGING register char op = EXACT; /* Arbitrary non-END op. */ register regnode *next, *onode; @@ -2246,7 +2300,7 @@ dumpuntil(regnode *start, regnode *node, regnode *last, SV* sv, I32 l) if (OP(node) == OPTIMIZED) goto after_print; regprop(sv, node); - PerlIO_printf(Perl_debug_log, "%4d%*s%s", node - start, + PerlIO_printf(Perl_debug_log, "%4d:%*s%s", node - start, 2*l + 1, "", SvPVX(sv)); if (next == NULL) /* Next ptr. */ PerlIO_printf(Perl_debug_log, "(0)"); @@ -2287,6 +2341,7 @@ dumpuntil(regnode *start, regnode *node, regnode *last, SV* sv, I32 l) else if (op == WHILEM) l--; } +#endif /* DEBUGGING */ return node; } @@ -2296,6 +2351,8 @@ dumpuntil(regnode *start, regnode *node, regnode *last, SV* sv, I32 l) void regdump(regexp *r) { +#ifdef DEBUGGING + dTHR; SV *sv = sv_newmortal(); (void)dumpuntil(r->program, r->program + 1, NULL, sv, 0); @@ -2303,16 +2360,16 @@ regdump(regexp *r) /* Header fields of interest. */ if (r->anchored_substr) PerlIO_printf(Perl_debug_log, "anchored `%s%s%s'%s at %d ", - colors[0], + PL_colors[0], SvPVX(r->anchored_substr), - colors[1], + PL_colors[1], SvTAIL(r->anchored_substr) ? "$" : "", r->anchored_offset); if (r->float_substr) PerlIO_printf(Perl_debug_log, "floating `%s%s%s'%s at %d..%u ", - colors[0], + PL_colors[0], SvPVX(r->float_substr), - colors[1], + PL_colors[1], SvTAIL(r->float_substr) ? "$" : "", r->float_min_offset, r->float_max_offset); if (r->check_substr) @@ -2347,7 +2404,10 @@ regdump(regexp *r) if (r->reganch & ROPT_IMPLICIT) PerlIO_printf(Perl_debug_log, "implicit "); PerlIO_printf(Perl_debug_log, "minlen %ld ", (long) r->minlen); + if (r->reganch & ROPT_EVAL_SEEN) + PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); +#endif /* DEBUGGING */ } /* @@ -2356,9 +2416,11 @@ regdump(regexp *r) void regprop(SV *sv, regnode *o) { +#ifdef DEBUGGING + dTHR; register char *p = 0; - sv_setpv(sv, ":"); + sv_setpvn(sv, "", 0); switch (OP(o)) { case BOL: p = "BOL"; @@ -2372,6 +2434,9 @@ regprop(SV *sv, regnode *o) case EOL: p = "EOL"; break; + case EOS: + p = "EOS"; + break; case MEOL: p = "MEOL"; break; @@ -2391,13 +2456,13 @@ regprop(SV *sv, regnode *o) p = "BRANCH"; break; case EXACT: - sv_catpvf(sv, "EXACT <%s%s%s>", colors[0], OPERAND(o) + 1, colors[1]); + sv_catpvf(sv, "EXACT <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]); break; case EXACTF: - sv_catpvf(sv, "EXACTF <%s%s%s>", colors[0], OPERAND(o) + 1, colors[1]); + sv_catpvf(sv, "EXACTF <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]); break; case EXACTFL: - sv_catpvf(sv, "EXACTFL <%s%s%s>", colors[0], OPERAND(o) + 1, colors[1]); + sv_catpvf(sv, "EXACTFL <%s%s%s>", PL_colors[0], OPERAND(o) + 1, PL_colors[1]); break; case NOTHING: p = "NOTHING"; @@ -2427,18 +2492,10 @@ regprop(SV *sv, regnode *o) sv_catpvf(sv, "CURLY {%d,%d}", ARG1(o), ARG2(o)); break; case CURLYM: -#ifdef REGALIGN_STRUCT sv_catpvf(sv, "CURLYM[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o)); -#else - sv_catpvf(sv, "CURLYM {%d,%d}", ARG1(o), ARG2(o)); -#endif break; case CURLYN: -#ifdef REGALIGN_STRUCT sv_catpvf(sv, "CURLYN[%d] {%d,%d}", o->flags, ARG1(o), ARG2(o)); -#else - sv_catpvf(sv, "CURLYN {%d,%d}", ARG1(o), ARG2(o)); -#endif break; case CURLYX: sv_catpvf(sv, "CURLYX {%d,%d}", ARG1(o), ARG2(o)); @@ -2472,18 +2529,10 @@ regprop(SV *sv, regnode *o) p = "GPOS"; break; case UNLESSM: -#ifdef REGALIGN_STRUCT sv_catpvf(sv, "UNLESSM[-%d]", o->flags); -#else - p = "UNLESSM"; -#endif break; case IFMATCH: -#ifdef REGALIGN_STRUCT sv_catpvf(sv, "IFMATCH[-%d]", o->flags); -#else - p = "IFMATCH"; -#endif break; case SUCCEED: p = "SUCCEED"; @@ -2553,22 +2602,26 @@ regprop(SV *sv, regnode *o) } if (p) sv_catpv(sv, p); +#endif /* DEBUGGING */ } -#endif /* DEBUGGING */ void pregfree(struct regexp *r) { + dTHR; if (!r || (--r->refcnt > 0)) return; if (r->precomp) Safefree(r->precomp); if (r->subbase) Safefree(r->subbase); - if (r->anchored_substr) - SvREFCNT_dec(r->anchored_substr); - if (r->float_substr) - SvREFCNT_dec(r->float_substr); + if (r->substrs) { + if (r->anchored_substr) + SvREFCNT_dec(r->anchored_substr); + if (r->float_substr) + SvREFCNT_dec(r->float_substr); + Safefree(r->substrs); + } if (r->data) { int n = r->data->count; while (--n >= 0) { @@ -2602,36 +2655,21 @@ pregfree(struct regexp *r) regnode * regnext(register regnode *p) { + dTHR; register I32 offset; - if (p == ®dummy) + if (p == &PL_regdummy) return(NULL); offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p)); if (offset == 0) return(NULL); -#ifdef REGALIGN return(p+offset); -#else - if (OP(p) == BACK) - return(p-offset); - else - return(p+offset); -#endif } -#ifdef I_STDARG -void +STATIC void re_croak2(const char* pat1,const char* pat2,...) -#else -/*VARARGS0*/ -void -re_croak2(const char* pat1,const char* pat2, va_alist) - const char* pat1; - const char* pat2; - va_dcl -#endif { va_list args; STRLEN l1 = strlen(pat1); @@ -2645,13 +2683,9 @@ re_croak2(const char* pat1,const char* pat2, va_alist) l2 = 510 - l1; Copy(pat1, buf, l1 , char); Copy(pat2, buf + l1, l2 , char); - buf[l1 + l2 + 1] = '\n'; - buf[l1 + l2 + 2] = '\0'; -#ifdef I_STDARG + buf[l1 + l2] = '\n'; + buf[l1 + l2 + 1] = '\0'; va_start(args, pat2); -#else - va_start(args); -#endif message = mess(buf, &args); va_end(args); l1 = strlen(message); @@ -2661,5 +2695,3 @@ re_croak2(const char* pat1,const char* pat2, va_alist) buf[l1] = '\0'; /* Overwrite \n */ croak("%s", buf); } - -