X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/cde0cee5716418bb58782f073048ee9685ed2368..b5dffda6f343ffd74e5c9a395a43ef0450d6727b:/regcomp.c diff --git a/regcomp.c b/regcomp.c index d07f177..429b493 100644 --- a/regcomp.c +++ b/regcomp.c @@ -109,6 +109,7 @@ typedef struct RExC_state_t { char *parse; /* Input-scan pointer. */ I32 whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the allocated space */ regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ @@ -151,9 +152,12 @@ typedef struct RExC_state_t { #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) -#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */ +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) #define RExC_naughty (pRExC_state->naughty) #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) @@ -184,10 +188,11 @@ typedef struct RExC_state_t { * Flags to be passed up and down. */ #define WORST 0 /* Worst case. */ -#define HASWIDTH 0x1 /* Known to match non-null strings. */ -#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 0x4 /* Starts with * or +. */ -#define TRYAGAIN 0x8 /* Weeded out a declaration. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ +#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 0x04 /* Starts with * or +. */ +#define TRYAGAIN 0x08 /* Weeded out a declaration. */ +#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -511,7 +516,21 @@ static const scan_data_t zero_scan_data = * Element 0 holds the number n. * Position is 1 indexed. */ - +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x #define Set_Node_Offset_To_R(node,byte) STMT_START { \ if (! SIZE_ONLY) { \ MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ @@ -554,11 +573,11 @@ static const scan_data_t zero_scan_data = Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ } STMT_END - +#endif #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS #define EXPERIMENTAL_INPLACESCAN -#endif +#endif /*RE_TRACK_PATTERN_OFFSETS*/ #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ @@ -646,7 +665,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("cl_anything: ",data,0); + DEBUG_STUDYDATA("commit: ",data,0); } /* Can match anything (initialization) */ @@ -1843,9 +1862,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #ifdef DEBUGGING regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + U32 mjd_offset = 0; U32 mjd_nodelen = 0; -#endif +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ /* This means we convert either the first branch or the first Exact, depending on whether the thing following (in 'last') is a branch @@ -1858,25 +1880,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( first != startbranch || OP( last ) == BRANCH ) { /* branch sub-chain */ NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_r({ mjd_offset= Node_Offset((convert)); mjd_nodelen= Node_Length((convert)); }); +#endif /* whole branch chain */ - } else { + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { DEBUG_r({ const regnode *nop = NEXTOPER( convert ); mjd_offset= Node_Offset((nop)); mjd_nodelen= Node_Length((nop)); }); } - DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); - +#endif /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; @@ -2028,6 +2053,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* needed for dumping*/ DEBUG_r(if (optimize) { regnode *opt = convert; + while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } @@ -4091,11 +4117,6 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) if (RExC_whilem_seen > 15) RExC_whilem_seen = 15; -#ifdef DEBUGGING - /* Make room for a sentinel value at the end of the program */ - RExC_size++; -#endif - /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ @@ -4117,8 +4138,50 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) r->engine= RE_ENGINE_PTR; r->refcnt = 1; r->prelen = xend - exp; - r->precomp = savepvn(RExC_precomp, r->prelen); r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + { + bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + r->wraplen = r->prelen + has_minus + has_k + has_runon + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(r->wrapped, r->wraplen, char ); + p = r->wrapped; + *p++='('; *p++='?'; + if (has_k) + *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + { + char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; + char *colon = r + 1; + char ch; + + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + else + *r-- = ch; + reganch >>= 1; + } + if(has_minus) { + *r = '-'; + p = colon; + } + } + + *p++=':'; + Copy(RExC_precomp, p, r->prelen, char); + r->precomp = p; + p += r->prelen; + if (has_runon) + *p++='\n'; + *p=')'; + } + r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -4130,15 +4193,14 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) } /* Useful during FAIL. */ - Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ - if (ri->offsets) { - ri->offsets[0] = RExC_size; - } +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, "%s %"UVuf" bytes for offset annotations.\n", - ri->offsets ? "Got" : "Couldn't get", + ri->u.offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); - +#endif + SetProgLen(ri,RExC_size); RExC_rx = r; RExC_rxi = ri; @@ -4150,11 +4212,8 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) RExC_npar = 1; RExC_emit_start = ri->program; RExC_emit = ri->program; -#ifdef DEBUGGING - /* put a sentinal on the end of the program so we can check for - overwrites */ - ri->program[RExC_size].type = 255; -#endif + RExC_emit_bound = ri->program + RExC_size + 1; + /* Store the count of eval-groups for security checks: */ RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); @@ -4610,72 +4669,70 @@ reStudy: PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); - DEBUG_OFFSETS_r(if (ri->offsets) { - const U32 len = ri->offsets[0]; +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const U32 len = ri->u.offsets[0]; U32 i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]); + PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { - if (ri->offsets[i*2-1] || ri->offsets[i*2]) + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", - (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]); + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); } PerlIO_printf(Perl_debug_log, "\n"); }); +#endif return(r); } -#undef CORE_ONLY_BLOCK #undef RE_ENGINE_PTR -#ifndef PERL_IN_XSUB_RE + SV* -Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags) +Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) { AV *retarray = NULL; SV *ret; if (flags & 1) retarray=newAV(); - - if (from_re || PL_curpm) { - const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); - if (he_str) { - IV i; - SV* sv_dat=HeVAL(he_str); - I32 *nums=(I32*)SvPVX(sv_dat); - for ( i=0; ilastparen) >= nums[i] && - rx->endp[nums[i]] != -1) - { - ret = reg_numbered_buff_get(nums[i],rx,NULL,0); - if (!retarray) - return ret; - } else { - ret = newSVsv(&PL_sv_undef); - } - if (retarray) { - SvREFCNT_inc(ret); - av_push(retarray, ret); - } + + if (rx && rx->paren_names) { + HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->startp[nums[i]] != -1 + && rx->endp[nums[i]] != -1) + { + ret = CALLREG_NUMBUF(rx,nums[i],NULL); + if (!retarray) + return ret; + } else { + ret = newSVsv(&PL_sv_undef); + } + if (retarray) { + SvREFCNT_inc(ret); + av_push(retarray, ret); } - if (retarray) - return (SV*)retarray; } + if (retarray) + return (SV*)retarray; } } return NULL; } SV* -Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags) +Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) { char *s = NULL; I32 i = 0; I32 s1, t1; SV *sv = usesv ? usesv : newSVpvs(""); - PERL_UNUSED_ARG(flags); if (!rx->subbeg) { sv_setsv(sv,&PL_sv_undef); @@ -4744,7 +4801,7 @@ Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, } return sv; } -#endif + /* Scans the name of a named buffer from the pattern. * If flags is REG_RSN_RETURN_NULL returns null. @@ -4822,7 +4879,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { PerlIO_printf(Perl_debug_log,"%16s",""); \ \ if (SIZE_ONLY) \ - num=RExC_size; \ + num = RExC_size + 1; \ else \ num=REG_NODE_NUM(RExC_emit); \ if (RExC_lastnum!=num) \ @@ -4881,6 +4938,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) const I32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; /* for (?g), (?gc), and (?o) warnings; warning about (?c) will warn about (?g) -- japhy */ @@ -5051,8 +5110,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } - goto unknown; - case '<': /* (?<...) */ + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; else if (*RExC_parse != '=') @@ -5067,8 +5128,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? /* reverse test from the others */ REG_RSN_RETURN_NAME : REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) - goto unknown; + if (RExC_parse == name_start) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } if (*RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); @@ -5094,11 +5158,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Perl_croak(aTHX_ "panic: paren_name hash element allocation failed"); } else if ( SvPOK(sv_dat) ) { - IV count=SvIV(sv_dat); - I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1); - SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32)); - pv[count]=RExC_npar; - SvIVX(sv_dat)++; + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIVX(sv_dat)++; + } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); @@ -5126,6 +5205,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; case ':': /* (?:...) */ case '>': /* (?>...) */ break; @@ -5146,6 +5232,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; /*notreached*/ @@ -5226,19 +5313,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ /* NOT REACHED */ - case 'p': /* (?p...) */ - if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) - vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); - /* FALL THROUGH*/ case '?': /* (??...) */ is_logical = 1; - if (*RExC_parse != '{') - goto unknown; + if (*RExC_parse != '{') { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -5421,6 +5509,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ return ret; } else { @@ -5484,8 +5575,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; case '-': - if (flagsp == &negflags) - goto unknown; + if (flagsp == &negflags) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } flagsp = &negflags; wastedflags = 0; /* reset so (?g-c) warns twice */ break; @@ -5505,7 +5599,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } /*NOTREACHED*/ default: - unknown: RExC_parse++; vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); /*NOTREACHED*/ @@ -5523,7 +5616,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE) { + if (RExC_seen & REG_SEEN_RECURSE + && !RExC_open_parens[parno-1]) + { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); @@ -5567,7 +5662,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if (paren != '?') /* Not Conditional */ ret = br; - *flagp |= flags & (SPSTART | HASWIDTH); + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -5577,15 +5672,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) return(NULL); REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); } if (have_branch || paren != ':') { @@ -5678,7 +5776,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } - + if (after_freeze) + RExC_npar = after_freeze; return(ret); } @@ -5725,7 +5824,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } else if (ret == NULL) ret = latest; - *flagp |= flags&HASWIDTH; + *flagp |= flags&(HASWIDTH|POSTPONED); if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { @@ -5907,7 +6006,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { vWARN3(RExC_parse, "%.*s matches null string many times", (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), @@ -6132,7 +6231,9 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) char *s; char *p, *pend; STRLEN charlen = 1; +#ifdef DEBUGGING char * parse_start = name-3; /* needed for the offsets */ +#endif GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */ ret = reg_node(pRExC_state, @@ -6340,7 +6441,7 @@ tryagain: } return(NULL); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; case '|': case ')': @@ -6465,7 +6566,9 @@ tryagain: case 'P': { char* const oldregxend = RExC_end; +#ifdef DEBUGGING char* parse_start = RExC_parse - 2; +#endif if (RExC_parse[1] == '{') { /* a lovely hack--pretend we saw [\pX] instead */ @@ -6616,9 +6719,7 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - while (RExC_parse < RExC_end && *RExC_parse != '\n') - RExC_parse++; - if (RExC_parse < RExC_end) + if ( reg_skipcomment( pRExC_state ) ) goto tryagain; } /* FALL THROUGH */ @@ -6647,7 +6748,7 @@ tryagain: char * const oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + p = regwhite( pRExC_state, p ); switch (*p) { case '^': case '$': @@ -6795,13 +6896,13 @@ tryagain: ender = *p++; break; } - if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); if (UTF && FOLD) { /* Prime the casefolded buffer. */ ender = toFOLD_uni(ender, tmpbuf, &foldlen); } - if (ISMULT2(p)) { /* Back off on ?+*. */ + if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (UTF) { @@ -6903,15 +7004,22 @@ tryagain: } STATIC char * -S_regwhite(char *p, const char *e) +S_regwhite( RExC_state_t *pRExC_state, char *p ) { + const char *e = RExC_end; while (p < e) { if (isSPACE(*p)) ++p; else if (*p == '#') { + bool ended = 0; do { - p++; - } while (p < e && *p != '\n'); + if (*p++ == '\n') { + ended = 1; + break; + } + } while (p < e); + if (!ended) + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; } else break; @@ -7632,7 +7740,7 @@ parseit: return ret; /****** !SIZE_ONLY AFTER HERE *********/ - if( stored == 1 && value < 256 + if( stored == 1 && (value < 128 || (value < 256 && !UTF)) && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) ) { /* optimize single char class to an EXACT node @@ -7693,6 +7801,49 @@ parseit: #undef _C_C_T_ +/* reg_skipcomment() + + Absorbs an /x style # comments from the input stream. + Returns true if there is more text remaining in the stream. + Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + terminates the pattern without including a newline. + + Note its the callers responsibility to ensure that we are + actually in /x mode + +*/ + +STATIC bool +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +{ + bool ended = 0; + while (RExC_parse < RExC_end) + if (*RExC_parse++ == '\n') { + ended = 1; + break; + } + if (!ended) { + /* we ran off the end of the pattern without ending + the comment, so we have to add an \n when wrapping */ + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + return 0; + } else + return 1; +} + +/* nextchar() + + Advance that parse position, and optionall absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + STATIC char* S_nextchar(pTHX_ RExC_state_t *pRExC_state) { @@ -7715,9 +7866,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } else if (*RExC_parse == '#') { - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') break; - continue; + if ( reg_skipcomment( pRExC_state ) ) + continue; } } return retval; @@ -7740,18 +7890,17 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) RExC_size += 1; return(ret); } -#ifdef DEBUGGING - if (OP(RExC_emit) == 255) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ", - reg_name[op], OP(RExC_emit)); -#endif + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", "reg_node", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), @@ -7759,7 +7908,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) (UV)RExC_offsets[0])); Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } - +#endif RExC_emit = ptr; return(ret); } @@ -7794,18 +7943,18 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) */ return(ret); } -#ifdef DEBUGGING - if (OP(RExC_emit) == 255) - Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space"); -#endif + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), @@ -7813,7 +7962,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) (UV)RExC_offsets[0])); Set_Cur_Node_Offset; } - +#endif RExC_emit = ptr; return(ret); } @@ -7843,8 +7992,9 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ - DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]); + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); if (SIZE_ONLY) { RExC_size += size; return; @@ -7855,30 +8005,31 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) dst = RExC_emit; if (RExC_open_parens) { int paren; - DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar); + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ for ( paren=0 ; paren < RExC_npar ; paren++ ) { if ( RExC_open_parens[paren] >= opnd ) { - DEBUG_PARSE_FMT("open"," - %d",size); + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ RExC_open_parens[paren] += size; } else { - DEBUG_PARSE_FMT("open"," - %s","ok"); + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ } if ( RExC_close_parens[paren] >= opnd ) { - DEBUG_PARSE_FMT("close"," - %d",size); + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ RExC_close_parens[paren] += size; } else { - DEBUG_PARSE_FMT("close"," - %s","ok"); + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ } } } while (src > opnd) { StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(dst - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(src - RExC_emit_start), @@ -7887,15 +8038,17 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } +#endif } place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(place - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(place - RExC_emit_start), @@ -7904,6 +8057,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } +#endif src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -7938,7 +8092,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), - (temp == NULL ? reg_name[OP(val)] : "") + (temp == NULL ? PL_reg_name[OP(val)] : "") ); }); if (temp == NULL) @@ -8019,7 +8173,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), - reg_name[exact]); + PL_reg_name[exact]); }); if (temp == NULL) break; @@ -8177,7 +8331,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); - sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */ + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; @@ -8206,7 +8360,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) const reg_trie_data * const trie = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]); + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( Perl_sv_catpvf(aTHX_ sv, "", @@ -8480,10 +8634,6 @@ Perl_pregfree(pTHX_ struct regexp *r) return; CALLREGFREE_PVT(r); /* free the private data */ - - /* gcov results gave these as non-null 100% of the time, so there's no - optimisation in checking them before calling Safefree */ - Safefree(r->precomp); RX_MATCH_COPY_FREE(r); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) @@ -8501,8 +8651,8 @@ Perl_pregfree(pTHX_ struct regexp *r) Safefree(r->substrs); } if (r->paren_names) - SvREFCNT_dec(r->paren_names); - + SvREFCNT_dec(r->paren_names); + Safefree(r->wrapped); Safefree(r->startp); Safefree(r->endp); Safefree(r); @@ -8539,8 +8689,10 @@ Perl_regfree_internal(pTHX_ struct regexp *r) PL_colors[4],PL_colors[5],s); } }); - - Safefree(ri->offsets); /* 20010421 MJD */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif if (ri->data) { int n = ri->data->count; PAD* new_comppad = NULL; @@ -8692,11 +8844,14 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) } else ret->substrs = NULL; - ret->precomp = SAVEPVN(r->precomp, r->prelen); + ret->wrapped = SAVEPVN(r->wrapped, r->wraplen); + ret->precomp = ret->wrapped + (r->precomp - r->wrapped); + ret->prelen = r->prelen; + ret->wraplen = r->wraplen; + ret->refcnt = r->refcnt; ret->minlen = r->minlen; ret->minlenret = r->minlenret; - ret->prelen = r->prelen; ret->nparens = r->nparens; ret->lastparen = r->lastparen; ret->lastcloseparen = r->lastcloseparen; @@ -8749,7 +8904,7 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) RXi_GET_DECL(r,ri); npar = r->nparens+1; - len = ri->offsets[0]; + len = ProgLen(ri); Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); @@ -8763,8 +8918,8 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) reti->swap = NULL; } - reti->regstclass = NULL; + if (ri->data) { struct reg_data *d; const int count = ri->data->count; @@ -8827,9 +8982,15 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) reti->name_list_idx = ri->name_list_idx; - Newx(reti->offsets, 2*len+1, U32); - Copy(ri->offsets, reti->offsets, 2*len+1, U32); - +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + return (void*)reti; } @@ -8863,83 +9024,18 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) */ #ifndef PERL_IN_XSUB_RE + char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { dVAR; const regexp * const re = (regexp *)mg->mg_obj; - - if (!mg->mg_ptr) { - const char *fptr = "msix"; - char reflags[7]; - char ch; - bool hask = ((re->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); - bool hasm = ((re->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); - U16 reganch = (U16)((re->extflags & RXf_PMf_STD_PMMOD) >> 12); - bool need_newline = 0; - int left = 0; - int right = 4 + hask; - if (hask) - reflags[left++]='k'; - while((ch = *fptr++)) { - if(reganch & 1) { - reflags[left++] = ch; - } - else { - reflags[right--] = ch; - } - reganch >>= 1; - } - if(hasm) { - reflags[left] = '-'; - left = 5 + hask; - } - /* printf("[%*.7s]\n",left,reflags); */ - mg->mg_len = re->prelen + 4 + left; - /* - * If /x was used, we have to worry about a regex ending with a - * comment later being embedded within another regex. If so, we don't - * want this regex's "commentization" to leak out to the right part of - * the enclosing regex, we must cap it with a newline. - * - * So, if /x was used, we scan backwards from the end of the regex. If - * we find a '#' before we find a newline, we need to add a newline - * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), - * we don't need to add anything. -jfriedl - */ - if (PMf_EXTENDED & re->extflags) { - const char *endptr = re->precomp + re->prelen; - while (endptr >= re->precomp) { - const char c = *(endptr--); - if (c == '\n') - break; /* don't need another */ - if (c == '#') { - /* we end while in a comment, so we need a newline */ - mg->mg_len++; /* save space for it */ - need_newline = 1; /* note to add it */ - break; - } - } - } - - Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); - mg->mg_ptr[0] = '('; - mg->mg_ptr[1] = '?'; - Copy(reflags, mg->mg_ptr+2, left, char); - *(mg->mg_ptr+left+2) = ':'; - Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); - if (need_newline) - mg->mg_ptr[mg->mg_len - 2] = '\n'; - mg->mg_ptr[mg->mg_len - 1] = ')'; - mg->mg_ptr[mg->mg_len] = 0; - } if (haseval) *haseval = re->seen_evals; if (flags) *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = mg->mg_len; - return mg->mg_ptr; + *lp = re->wraplen; + return re->wrapped; } /*