X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/a9d504c36a04b1db7840b9e7ee4324a09113c71b..39aa8307f3f2358b81eed6e9cadcba8c05567eec:/regcomp.c diff --git a/regcomp.c b/regcomp.c index e26621d..60b31ed 100644 --- a/regcomp.c +++ b/regcomp.c @@ -109,6 +109,7 @@ typedef struct RExC_state_t { char *parse; /* Input-scan pointer. */ I32 whilem_seen; /* number of WHILEM in this expr */ regnode *emit_start; /* Start of emitted-code area */ + regnode *emit_bound; /* First regnode outside of the allocated space */ regnode *emit; /* Code-emit pointer; ®dummy = don't = compiling */ I32 naughty; /* How bad is this pattern? */ I32 sawback; /* Did we see \1, ...? */ @@ -123,7 +124,10 @@ typedef struct RExC_state_t { regnode **open_parens; /* pointers to open parens */ regnode **close_parens; /* pointers to close parens */ regnode *opend; /* END node in program */ - I32 utf8; + I32 utf8; /* whether the pattern is utf8 or not */ + I32 orig_utf8; /* whether the pattern was originally in utf8 */ + /* XXX use this for future optimisation of case + * where pattern must be upgraded to utf8. */ HV *charnames; /* cache of named sequences */ HV *paren_names; /* Paren names */ @@ -151,9 +155,12 @@ typedef struct RExC_state_t { #define RExC_end (pRExC_state->end) #define RExC_parse (pRExC_state->parse) #define RExC_whilem_seen (pRExC_state->whilem_seen) -#define RExC_offsets (pRExC_state->rxi->offsets) /* I am not like the others */ +#ifdef RE_TRACK_PATTERN_OFFSETS +#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */ +#endif #define RExC_emit (pRExC_state->emit) #define RExC_emit_start (pRExC_state->emit_start) +#define RExC_emit_bound (pRExC_state->emit_bound) #define RExC_naughty (pRExC_state->naughty) #define RExC_sawback (pRExC_state->sawback) #define RExC_seen (pRExC_state->seen) @@ -164,6 +171,7 @@ typedef struct RExC_state_t { #define RExC_seen_zerolen (pRExC_state->seen_zerolen) #define RExC_seen_evals (pRExC_state->seen_evals) #define RExC_utf8 (pRExC_state->utf8) +#define RExC_orig_utf8 (pRExC_state->orig_utf8) #define RExC_charnames (pRExC_state->charnames) #define RExC_open_parens (pRExC_state->open_parens) #define RExC_close_parens (pRExC_state->close_parens) @@ -172,6 +180,7 @@ typedef struct RExC_state_t { #define RExC_recurse (pRExC_state->recurse) #define RExC_recurse_count (pRExC_state->recurse_count) + #define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?') #define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \ ((*s) == '{' && regcurly(s))) @@ -183,10 +192,11 @@ typedef struct RExC_state_t { * Flags to be passed up and down. */ #define WORST 0 /* Worst case. */ -#define HASWIDTH 0x1 /* Known to match non-null strings. */ -#define SIMPLE 0x2 /* Simple enough to be STAR/PLUS operand. */ -#define SPSTART 0x4 /* Starts with * or +. */ -#define TRYAGAIN 0x8 /* Weeded out a declaration. */ +#define HASWIDTH 0x01 /* Known to match non-null strings. */ +#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */ +#define SPSTART 0x04 /* Starts with * or +. */ +#define TRYAGAIN 0x08 /* Weeded out a declaration. */ +#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */ #define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1) @@ -510,7 +520,21 @@ static const scan_data_t zero_scan_data = * Element 0 holds the number n. * Position is 1 indexed. */ - +#ifndef RE_TRACK_PATTERN_OFFSETS +#define Set_Node_Offset_To_R(node,byte) +#define Set_Node_Offset(node,byte) +#define Set_Cur_Node_Offset +#define Set_Node_Length_To_R(node,len) +#define Set_Node_Length(node,len) +#define Set_Node_Cur_Length(node) +#define Node_Offset(n) +#define Node_Length(n) +#define Set_Node_Offset_Length(node,offset,len) +#define ProgLen(ri) ri->u.proglen +#define SetProgLen(ri,x) ri->u.proglen = x +#else +#define ProgLen(ri) ri->u.offsets[0] +#define SetProgLen(ri,x) ri->u.offsets[0] = x #define Set_Node_Offset_To_R(node,byte) STMT_START { \ if (! SIZE_ONLY) { \ MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \ @@ -553,11 +577,11 @@ static const scan_data_t zero_scan_data = Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \ Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \ } STMT_END - +#endif #if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS #define EXPERIMENTAL_INPLACESCAN -#endif +#endif /*RE_TRACK_PATTERN_OFFSETS*/ #define DEBUG_STUDYDATA(str,data,depth) \ DEBUG_OPTIMISE_MORE_r(if(data){ \ @@ -645,7 +669,7 @@ S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *min } data->last_end = -1; data->flags &= ~SF_BEFORE_EOL; - DEBUG_STUDYDATA("cl_anything: ",data,0); + DEBUG_STUDYDATA("commit: ",data,0); } /* Can match anything (initialization) */ @@ -1332,7 +1356,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store unicode characters. We use the + (trie->charmap) and we use a an HV* to store Unicode characters. We use the native representation of the character value as the key and IV's for the coded index. @@ -1355,16 +1379,17 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ]; const U8 *scan = (U8*)NULL; U32 wordlen = 0; /* required init */ - STRLEN chars=0; + STRLEN chars = 0; + bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/ if (OP(noper) == NOTHING) { trie->minlen= 0; continue; } - if (trie->bitmap) { - TRIE_BITMAP_SET(trie,*uc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]); - } + if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */ + TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte + regardless of encoding */ + for ( ; uc < e ; uc += len ) { TRIE_CHARCOUNT(trie)++; TRIE_READ_CHAR; @@ -1376,6 +1401,13 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ]; TRIE_STORE_REVCHAR; } + if ( set_bit ) { + /* store the codepoint in the bitmap, and if its ascii + also store its folded equivelent. */ + TRIE_BITMAP_SET(trie,uvc); + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + set_bit = 0; /* We've done our bit :-) */ + } } else { SV** svpp; if ( !widecharmap ) @@ -1842,9 +1874,12 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs #ifdef DEBUGGING regnode *optimize = NULL; +#ifdef RE_TRACK_PATTERN_OFFSETS + U32 mjd_offset = 0; U32 mjd_nodelen = 0; -#endif +#endif /* RE_TRACK_PATTERN_OFFSETS */ +#endif /* DEBUGGING */ /* This means we convert either the first branch or the first Exact, depending on whether the thing following (in 'last') is a branch @@ -1857,25 +1892,28 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs if ( first != startbranch || OP( last ) == BRANCH ) { /* branch sub-chain */ NEXT_OFF( first ) = (U16)(last - first); +#ifdef RE_TRACK_PATTERN_OFFSETS DEBUG_r({ mjd_offset= Node_Offset((convert)); mjd_nodelen= Node_Length((convert)); }); +#endif /* whole branch chain */ - } else { + } +#ifdef RE_TRACK_PATTERN_OFFSETS + else { DEBUG_r({ const regnode *nop = NEXTOPER( convert ); mjd_offset= Node_Offset((nop)); mjd_nodelen= Node_Length((nop)); }); } - DEBUG_OPTIMISE_r( PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n", (int)depth * 2 + 2, "", (UV)mjd_offset, (UV)mjd_nodelen) ); - +#endif /* But first we check to see if there is a common prefix we can split out as an EXACT and put in front of the TRIE node. */ trie->startstate= 1; @@ -1928,7 +1966,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( count == 1 ) { SV **tmp = av_fetch( revcharmap, idx, 0); - char *ch = SvPV_nolen( *tmp ); + STRLEN len; + char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); PerlIO_printf( Perl_debug_log, @@ -1947,11 +1986,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs str=STRING(convert); STR_LEN(convert)=0; } - while (*ch) { + STR_LEN(convert) += len; + while (len--) *str++ = *ch++; - STR_LEN(convert)++; - } - } else { #ifdef DEBUGGING if (state>1) @@ -2027,6 +2064,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* needed for dumping*/ DEBUG_r(if (optimize) { regnode *opt = convert; + while ( ++opt < optimize) { Set_Node_Offset_Length(opt,0,0); } @@ -2361,6 +2399,34 @@ typedef struct scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) +#define CASE_SYNST_FNC(nAmE) \ +case nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break; \ +case N ## nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break + + + STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, @@ -3291,6 +3357,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; } } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + int value = 0; + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + if (flags & SCF_DO_STCLASS_AND) { + for (value = 0; value < 256; value++) + if (!is_VERTWS_cp(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + for (value = 0; value < 256; value++) + if (is_VERTWS_cp(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + min += 1; + delta += 1; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + + } + else if (OP(scan) == FOLDCHAR) { + int d = ARG(scan)==0xDF ? 1 : 2; + flags &= ~SCF_DO_STCLASS; + min += 1; + delta += d; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += d; + data->longest = &(data->longest_float); + } + } else if (strchr((const char*)PL_simple,OP(scan))) { int value = 0; @@ -3485,6 +3591,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } break; + CASE_SYNST_FNC(VERTWS); + CASE_SYNST_FNC(HORIZWS); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -3855,6 +3964,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ + /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -3977,8 +4087,8 @@ extern const struct regexp_engine my_reg_engine; #endif #ifndef PERL_IN_XSUB_RE -regexp * -Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) +REGEXP * +Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) { dVAR; HV * const table = GvHV(PL_hintgv); @@ -3993,21 +4103,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", SvIV(*ptr)); }); - return CALLREGCOMP_ENG(eng, exp, xend, pm); + return CALLREGCOMP_ENG(eng, pattern, flags); } } - return Perl_re_compile(aTHX_ exp, xend, pm); + return Perl_re_compile(aTHX_ pattern, flags); } #endif -regexp * -Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) +REGEXP * +Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) { dVAR; - register regexp *r; + register REGEXP *r; register regexp_internal *ri; + STRLEN plen; + char* exp = SvPV((SV*)pattern, plen); + char* xend = exp + plen; regnode *scan; - regnode *first; I32 flags; I32 minlen = 0; I32 sawplus = 0; @@ -4021,21 +4133,20 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) #endif GET_RE_DEBUG_FLAGS_DECL; DEBUG_r(if (!PL_colorset) reginitcolors()); - - if (exp == NULL) - FAIL("NULL regexp argument"); - RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; - RExC_precomp = exp; DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, RExC_precomp, (xend - exp), 60); + dsv, exp, plen, 60); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); - RExC_flags = pm->op_pmflags; + +redo_first_pass: + RExC_precomp = exp; + RExC_flags = pm_flags; RExC_sawback = 0; RExC_seen = 0; @@ -4073,6 +4184,25 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) RExC_precomp = NULL; return(NULL); } + if (RExC_utf8 && !RExC_orig_utf8) { + /* It's possible to write a regexp in ascii that represents Unicode + codepoints outside of the byte range, such as via \x{100}. If we + detect such a sequence we have to convert the entire pattern to utf8 + and then recompile, as our sizing calculation will have been based + on 1 byte == 1 character, but we will need to use utf8 to encode + at least some part of the pattern, and therefore must convert the whole + thing. + XXX: somehow figure out how to make this less expensive... + -- dmq */ + STRLEN len = plen; + DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, + "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); + exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); + xend = exp + len; + RExC_orig_utf8 = RExC_utf8; + SAVEFREEPV(exp); + goto redo_first_pass; + } DEBUG_PARSE_r({ PerlIO_printf(Perl_debug_log, "Required size %"IVdf" nodes\n" @@ -4090,11 +4220,6 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) if (RExC_whilem_seen > 15) RExC_whilem_seen = 15; -#ifdef DEBUGGING - /* Make room for a sentinel value at the end of the program */ - RExC_size++; -#endif - /* Allocate space and zero-initialize. Note, the two step process of zeroing when in debug mode, thus anything assigned has to happen after that */ @@ -4115,9 +4240,52 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; r->refcnt = 1; - r->prelen = xend - exp; - r->precomp = savepvn(RExC_precomp, r->prelen); - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + r->prelen = plen; + r->extflags = pm_flags; + { + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); + bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); + U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); + const char *fptr = STD_PAT_MODS; /*"msix"*/ + char *p; + r->wraplen = r->prelen + has_minus + has_p + has_runon + + (sizeof(STD_PAT_MODS) - 1) + + (sizeof("(?:)") - 1); + + Newx(r->wrapped, r->wraplen + 1, char ); + p = r->wrapped; + *p++='('; *p++='?'; + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ + { + char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; + char *colon = r + 1; + char ch; + + while((ch = *fptr++)) { + if(reganch & 1) + *p++ = ch; + else + *r-- = ch; + reganch >>= 1; + } + if(has_minus) { + *r = '-'; + p = colon; + } + } + + *p++ = ':'; + Copy(RExC_precomp, p, r->prelen, char); + r->precomp = p; + p += r->prelen; + if (has_runon) + *p++ = '\n'; + *p++ = ')'; + *p = 0; + } + r->intflags = 0; r->nparens = RExC_npar - 1; /* set early to validate backrefs */ @@ -4129,37 +4297,34 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) } /* Useful during FAIL. */ - Newxz(ri->offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ - if (ri->offsets) { - ri->offsets[0] = RExC_size; - } +#ifdef RE_TRACK_PATTERN_OFFSETS + Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */ DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log, "%s %"UVuf" bytes for offset annotations.\n", - ri->offsets ? "Got" : "Couldn't get", + ri->u.offsets ? "Got" : "Couldn't get", (UV)((2*RExC_size+1) * sizeof(U32)))); - +#endif + SetProgLen(ri,RExC_size); RExC_rx = r; RExC_rxi = ri; /* Second pass: emit code. */ - RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */ + RExC_flags = pm_flags; /* don't let top level (?i) bleed */ RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; RExC_npar = 1; RExC_emit_start = ri->program; RExC_emit = ri->program; -#ifdef DEBUGGING - /* put a sentinal on the end of the program so we can check for - overwrites */ - ri->program[RExC_size].type = 255; -#endif + RExC_emit_bound = ri->program + RExC_size + 1; + /* Store the count of eval-groups for security checks: */ RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); - if (reg(pRExC_state, 0, &flags,1) == NULL) + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(r); return(NULL); - + } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -4197,8 +4362,9 @@ reStudy: #endif /* Dig out information for optimizations. */ - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */ - pm->op_pmflags = RExC_flags; + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + if (UTF) r->extflags |= RXf_UTF8; /* Unicode in it? */ ri->regstclass = NULL; @@ -4214,18 +4380,20 @@ reStudy: struct regnode_charclass_class ch_class; /* pointed to by data */ int stclass_flag; I32 last_close = 0; /* pointed to by data */ - - first = scan; + regnode *first= scan; + regnode *first_next= regnext(first); + /* Skip introductions and multiplicators >= 1. */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ - (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || + (OP(first) == BRANCH && OP(first_next) != BRANCH) || /* for now we can't handle lookbehind IFMATCH*/ (OP(first) == IFMATCH && !first->flags) || (OP(first) == PLUS) || (OP(first) == MINMOD) || /* An {n,m} with n>0 */ - (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { if (OP(first) == PLUS) @@ -4237,6 +4405,7 @@ reStudy: first += EXTRA_STEP_2ARGS; } else /* XXX possible optimisation for /(?=)/ */ first = NEXTOPER(first); + first_next= regnext(first); } /* Starting-point info. */ @@ -4582,13 +4751,37 @@ reStudy: r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); else r->paren_names = NULL; + +#ifdef STUPID_PATTERN_CHECKS + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3)) + r->extflags |= RXf_WHITE; + else if (r->prelen == 1 && r->precomp[0] == '^') + r->extflags |= RXf_START_ONLY; +#else + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + U8 nop = OP(NEXTOPER(first)); + + if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) + r->extflags |= RXf_WHITE; + } +#endif #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, 1, "p" ); ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list); } else - ri->name_list_idx = 0; #endif + ri->name_list_idx = 0; if (RExC_recurse_count) { for ( ; RExC_recurse_count ; RExC_recurse_count-- ) { @@ -4596,66 +4789,165 @@ reStudy: ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); } } - Newxz(r->startp, RExC_npar, I32); - Newxz(r->endp, RExC_npar, I32); + Newxz(r->offs, RExC_npar, regexp_paren_pair); /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ PerlIO_printf(Perl_debug_log,"Final program:\n"); regdump(r); }); - DEBUG_OFFSETS_r(if (ri->offsets) { - const U32 len = ri->offsets[0]; +#ifdef RE_TRACK_PATTERN_OFFSETS + DEBUG_OFFSETS_r(if (ri->u.offsets) { + const U32 len = ri->u.offsets[0]; U32 i; GET_RE_DEBUG_FLAGS_DECL; - PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->offsets[0]); + PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]); for (i = 1; i <= len; i++) { - if (ri->offsets[i*2-1] || ri->offsets[i*2]) + if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2]) PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ", - (UV)i, (UV)ri->offsets[i*2-1], (UV)ri->offsets[i*2]); + (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]); } PerlIO_printf(Perl_debug_log, "\n"); }); +#endif return(r); } -#undef CORE_ONLY_BLOCK #undef RE_ENGINE_PTR -#ifndef PERL_IN_XSUB_RE + SV* -Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flags) +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak(aTHX_ PL_no_modify); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) { AV *retarray = NULL; SV *ret; - if (flags & 1) + if (flags & RXapif_ALL) retarray=newAV(); - - if (from_re || PL_curpm) { - const REGEXP * const rx = from_re ? from_re : PM_GETRE(PL_curpm); - if (rx && rx->paren_names) { - HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); - if (he_str) { - IV i; - SV* sv_dat=HeVAL(he_str); - I32 *nums=(I32*)SvPVX(sv_dat); - for ( i=0; ilastparen) >= nums[i] && - rx->endp[nums[i]] != -1) - { - ret = reg_numbered_buff_get(nums[i],rx,NULL,0); - if (!retarray) - return ret; - } else { - ret = newSVsv(&PL_sv_undef); - } - if (retarray) { - SvREFCNT_inc(ret); - av_push(retarray, ret); - } + + if (rx && rx->paren_names) { + HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 ); + if (he_str) { + IV i; + SV* sv_dat=HeVAL(he_str); + I32 *nums=(I32*)SvPVX(sv_dat); + for ( i=0; inparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) + { + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(rx,nums[i],ret); + if (!retarray) + return ret; + } else { + ret = newSVsv(&PL_sv_undef); + } + if (retarray) { + SvREFCNT_inc_simple_void(ret); + av_push(retarray, ret); + } + } + if (retarray) + return newRV((SV*)retarray); + } + } + return NULL; +} + +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, + const U32 flags) +{ + if (rx && rx->paren_names) { + if (flags & RXapif_ALL) { + return hv_exists_ent(rx->paren_names, key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); + if (sv) { + SvREFCNT_dec(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) +{ + (void)hv_iterinit(rx->paren_names); + + return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) +{ + if (rx && rx->paren_names) { + HV *hv = rx->paren_names; + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; } - if (retarray) - return (SV*)retarray; + } + if (parno || flags & RXapif_ALL) { + STRLEN len; + char *pv = HePV(temphe, len); + return newSVpvn(pv,len); } } } @@ -4663,79 +4955,213 @@ Perl_reg_named_buff_get(pTHX_ SV* namesv, const REGEXP * const from_re, U32 flag } SV* -Perl_reg_numbered_buff_get(pTHX_ I32 paren, const REGEXP * const rx, SV* usesv, U32 flags) +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) +{ + SV *ret; + AV *av; + I32 length; + + if (rx && rx->paren_names) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(rx->paren_names)); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); + av = (AV*)SvRV(ret); + length = av_len(av); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) +{ + AV *av = newAV(); + + if (rx && rx->paren_names) { + HV *hv= rx->paren_names; + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + STRLEN len; + char *pv = HePV(temphe, len); + av_push(av, newSVpvn(pv,len)); + } + } + } + + return newRV((SV*)av); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) { char *s = NULL; I32 i = 0; I32 s1, t1; - SV *sv = usesv ? usesv : newSVpvs(""); - if (paren == -2 && (s = rx->subbeg) && rx->startp[0] != -1) { + if (!rx->subbeg) { + sv_setsv(sv,&PL_sv_undef); + return; + } + else + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { /* $` */ - i = rx->startp[0]; + i = rx->offs[0].start; + s = rx->subbeg; } else - if (paren == -1 && rx->subbeg && rx->endp[0] != -1) { + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { /* $' */ - s = rx->subbeg + rx->endp[0]; - i = rx->sublen - rx->endp[0]; + s = rx->subbeg + rx->offs[0].end; + i = rx->sublen - rx->offs[0].end; } else if ( 0 <= paren && paren <= (I32)rx->nparens && - (s1 = rx->startp[paren]) != -1 && - (t1 = rx->endp[paren]) != -1) + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) { /* $& $1 ... */ i = t1 - s1; s = rx->subbeg + s1; - } - - if (s) { - assert(rx->subbeg); - assert(rx->sublen >= (s - rx->subbeg) + i ); - - if (i >= 0) { - const int oldtainted = PL_tainted; - TAINT_NOT; - sv_setpvn(sv, s, i); - PL_tainted = oldtainted; - if ( (rx->extflags & RXf_CANY_SEEN) - ? (RX_MATCH_UTF8(rx) - && (!i || is_utf8_string((U8*)s, i))) - : (RX_MATCH_UTF8(rx)) ) - { - SvUTF8_on(sv); - } - else - SvUTF8_off(sv); - if (PL_tainting) { - if (RX_MATCH_TAINTED(rx)) { - if (SvTYPE(sv) >= SVt_PVMG) { - MAGIC* const mg = SvMAGIC(sv); - MAGIC* mgt; - PL_tainted = 1; - SvMAGIC_set(sv, mg->mg_moremagic); - SvTAINT(sv); - if ((mgt = SvMAGIC(sv))) { - mg->mg_moremagic = mgt; - SvMAGIC_set(sv, mg); - } - } else { - PL_tainted = 1; - SvTAINT(sv); + } else { + sv_setsv(sv,&PL_sv_undef); + return; + } + assert(rx->sublen >= (s - rx->subbeg) + i ); + if (i >= 0) { + const int oldtainted = PL_tainted; + TAINT_NOT; + sv_setpvn(sv, s, i); + PL_tainted = oldtainted; + if ( (rx->extflags & RXf_CANY_SEEN) + ? (RX_MATCH_UTF8(rx) + && (!i || is_utf8_string((U8*)s, i))) + : (RX_MATCH_UTF8(rx)) ) + { + SvUTF8_on(sv); + } + else + SvUTF8_off(sv); + if (PL_tainting) { + if (RX_MATCH_TAINTED(rx)) { + if (SvTYPE(sv) >= SVt_PVMG) { + MAGIC* const mg = SvMAGIC(sv); + MAGIC* mgt; + PL_tainted = 1; + SvMAGIC_set(sv, mg->mg_moremagic); + SvTAINT(sv); + if ((mgt = SvMAGIC(sv))) { + mg->mg_moremagic = mgt; + SvMAGIC_set(sv, mg); } - } else - SvTAINTED_off(sv); - } - } else { - sv_setsv(sv,&PL_sv_undef); + } else { + PL_tainted = 1; + SvTAINT(sv); + } + } else + SvTAINTED_off(sv); } } else { sv_setsv(sv,&PL_sv_undef); + return; } - return sv; } -#endif + +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak(aTHX_ PL_no_modify); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, + const I32 paren) +{ + I32 i; + I32 s1, t1; + + /* Some of this code was originally in C in F */ + switch (paren) { + /* $` / ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + /* $' / ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + /* $& / ${^MATCH}, $1, $2, ... */ + default: + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((SV*)sv); + return 0; + } + } + getlen: + if (i > 0 && RX_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_UNUSED_ARG(rx); + return newSVpvs("Regexp"); +} /* Scans the name of a named buffer from the pattern. * If flags is REG_RSN_RETURN_NULL returns null. @@ -4813,7 +5239,7 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { PerlIO_printf(Perl_debug_log,"%16s",""); \ \ if (SIZE_ONLY) \ - num=RExC_size; \ + num = RExC_size + 1; \ else \ num=REG_NODE_NUM(RExC_emit); \ if (RExC_lastnum!=num) \ @@ -4854,10 +5280,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) #endif -/* this idea is borrowed from STR_WITH_LEN in handy.h */ -#define CHECK_WORD(s,v,l) \ - (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1)))) - STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ @@ -4869,9 +5291,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) register regnode *ender = NULL; register I32 parno = 0; I32 flags; - const I32 oregflags = RExC_flags; + U32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; + I32 freeze_paren = 0; + I32 after_freeze = 0; /* for (?g), (?gc), and (?o) warnings; warning about (?c) will warn about (?g) -- japhy */ @@ -4888,7 +5312,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) GET_RE_DEBUG_FLAGS_DECL; DEBUG_PARSE("reg "); - *flagp = 0; /* Tentatively. */ @@ -4925,39 +5348,39 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ - if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) { + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { op = ACCEPT; internal_argval = RExC_nestroot; } break; case 'C': /* (*COMMIT) */ - if ( CHECK_WORD("COMMIT",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"COMMIT") ) op = COMMIT; break; case 'F': /* (*FAIL) */ - if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) { + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { op = OPFAIL; argok = 0; } break; case ':': /* (*:NAME) */ case 'M': /* (*MARK:NAME) */ - if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) { + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { op = MARKPOINT; argok = -1; } break; case 'P': /* (*PRUNE) */ - if ( CHECK_WORD("PRUNE",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"PRUNE") ) op = PRUNE; break; case 'S': /* (*SKIP) */ - if ( CHECK_WORD("SKIP",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"SKIP") ) op = SKIP; break; case 'T': /* (*THEN) */ /* [19:06] :: is then */ - if ( CHECK_WORD("THEN",start_verb,verb_len) ) { + if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; RExC_seen |= REG_SEEN_CUTGROUP; } @@ -5000,8 +5423,6 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) return ret; } else if (*RExC_parse == '?') { /* (?...) */ - U32 posflags = 0, negflags = 0; - U32 *flagsp = &posflags; bool is_logical = 0; const char * const seqstart = RExC_parse; @@ -5030,7 +5451,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; ret = reganode(pRExC_state, @@ -5044,8 +5465,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } - goto unknown; - case '<': /* (?<...) */ + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + case '<': /* (?<...) */ if (*RExC_parse == '!') paren = ','; else if (*RExC_parse != '=') @@ -5060,8 +5483,11 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) SIZE_ONLY ? /* reverse test from the others */ REG_RSN_RETURN_NAME : REG_RSN_RETURN_NULL); - if (RExC_parse == name_start) - goto unknown; + if (RExC_parse == name_start) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } if (*RExC_parse != paren) vFAIL2("Sequence (?%c... not terminated", paren=='>' ? '<' : paren); @@ -5087,11 +5513,26 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Perl_croak(aTHX_ "panic: paren_name hash element allocation failed"); } else if ( SvPOK(sv_dat) ) { - IV count=SvIV(sv_dat); - I32 *pv=(I32*)SvGROW(sv_dat,SvCUR(sv_dat)+sizeof(I32)+1); - SvCUR_set(sv_dat,SvCUR(sv_dat)+sizeof(I32)); - pv[count]=RExC_npar; - SvIVX(sv_dat)++; + /* (?|...) can mean we have dupes so scan to check + its already been stored. Maybe a flag indicating + we are inside such a construct would be useful, + but the arrays are likely to be quite small, so + for now we punt -- dmq */ + IV count = SvIV(sv_dat); + I32 *pv = (I32*)SvPVX(sv_dat); + IV i; + for ( i = 0 ; i < count ; i++ ) { + if ( pv[i] == RExC_npar ) { + count = 0; + break; + } + } + if ( count ) { + pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1); + SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32)); + pv[count] = RExC_npar; + SvIVX(sv_dat)++; + } } else { (void)SvUPGRADE(sv_dat,SVt_PVNV); sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32)); @@ -5119,6 +5560,13 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) nextchar(pRExC_state); return ret; } + break; + case '|': /* (?|...) */ + /* branch reset, behave like a (?:...) except that + buffers in alternations share the same numbers */ + paren = ':'; + after_freeze = freeze_paren = RExC_npar; + break; case ':': /* (?:...) */ case '>': /* (?>...) */ break; @@ -5139,6 +5587,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (*RExC_parse != ')') FAIL("Sequence (?R) not terminated"); ret = reg_node(pRExC_state, GOSTART); + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; /*notreached*/ @@ -5219,19 +5668,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */ Set_Node_Offset(ret, parse_start); /* MJD */ + *flagp |= POSTPONED; nextchar(pRExC_state); return ret; } /* named and numeric backreferences */ /* NOT REACHED */ - case 'p': /* (?p...) */ - if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP)) - vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})"); - /* FALL THROUGH*/ case '?': /* (??...) */ is_logical = 1; - if (*RExC_parse != '{') - goto unknown; + if (*RExC_parse != '{') { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + *flagp |= POSTPONED; paren = *RExC_parse++; /* FALL THROUGH */ case '{': /* (?{...}) */ @@ -5336,7 +5786,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } ret = reganode(pRExC_state,NGROUPP,num); goto insert_if_check_paren; @@ -5414,6 +5864,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else REGTAIL(pRExC_state, ret, ender); + RExC_size++; /* XXX WHY do we need this?!! + For large programs it seems to be required + but I can't figure out why. -- dmq*/ return ret; } else { @@ -5425,13 +5878,20 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) vFAIL("Sequence (? incomplete"); break; default: - --RExC_parse; - parse_flags: /* (?i) */ - while (*RExC_parse && strchr("iogcmsx", *RExC_parse)) { + --RExC_parse; + parse_flags: /* (?i) */ + { + U32 posflags = 0, negflags = 0; + U32 *flagsp = &posflags; + + while (*RExC_parse) { + /* && strchr("iogcmsx", *RExC_parse) */ /* (?g), (?gc) and (?o) are useless here and must be globally applied -- japhy */ - - if (*RExC_parse == 'o' || *RExC_parse == 'g') { + switch (*RExC_parse) { + CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5446,8 +5906,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else if (*RExC_parse == 'c') { + break; + + case CONTINUE_PAT_MOD: /* 'c' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5459,33 +5920,51 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) ); } } - } - else { pmflag(flagsp, *RExC_parse); } - - ++RExC_parse; - } - if (*RExC_parse == '-') { - flagsp = &negflags; - wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case KEEPCOPY_PAT_MOD: /* 'p' */ + if (flagsp == &negflags) { + if (SIZE_ONLY && ckWARN(WARN_REGEXP)) + vWARN(RExC_parse + 1,"Useless use of (?-p)"); + } else { + *flagsp |= RXf_PMf_KEEPCOPY; + } + break; + case '-': + if (flagsp == &negflags) { + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } + flagsp = &negflags; + wastedflags = 0; /* reset so (?g-c) warns twice */ + break; + case ':': + paren = ':'; + /*FALLTHROUGH*/ + case ')': + RExC_flags |= posflags; + RExC_flags &= ~negflags; + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + } + nextchar(pRExC_state); + if (paren != ':') { + *flagp = TRYAGAIN; + return NULL; + } else { + ret = NULL; + goto parse_rest; + } + /*NOTREACHED*/ + default: + RExC_parse++; + vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); + /*NOTREACHED*/ + } ++RExC_parse; - goto parse_flags; } - RExC_flags |= posflags; - RExC_flags &= ~negflags; - if (*RExC_parse == ':') { - RExC_parse++; - paren = ':'; - break; - } - unknown: - if (*RExC_parse != ')') { - RExC_parse++; - vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart); - } - nextchar(pRExC_state); - *flagp = TRYAGAIN; - return NULL; - } + }} /* one for the default block, one for the switch */ } else { /* (...) */ capturing_parens: @@ -5496,7 +5975,9 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY ){ if (!RExC_nestroot) RExC_nestroot = parno; - if (RExC_seen & REG_SEEN_RECURSE) { + if (RExC_seen & REG_SEEN_RECURSE + && !RExC_open_parens[parno-1]) + { DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log, "Setting open paren #%"IVdf" to %d\n", (IV)parno, REG_NODE_NUM(ret))); @@ -5510,7 +5991,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else /* ! paren */ ret = NULL; - + + parse_rest: /* Pick up the branches, linking them together. */ parse_start = RExC_parse; /* MJD */ br = regbranch(pRExC_state, &flags, 1,depth+1); @@ -5539,7 +6021,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } else if (paren != '?') /* Not Conditional */ ret = br; - *flagp |= flags & (SPSTART | HASWIDTH); + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); lastbr = br; while (*RExC_parse == '|') { if (!SIZE_ONLY && RExC_extralen) { @@ -5549,15 +6031,18 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (SIZE_ONLY) RExC_extralen += 2; /* Account for LONGJMP. */ nextchar(pRExC_state); + if (freeze_paren) { + if (RExC_npar > after_freeze) + after_freeze = RExC_npar; + RExC_npar = freeze_paren; + } br = regbranch(pRExC_state, &flags, 0, depth+1); if (br == NULL) return(NULL); REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */ lastbr = br; - if (flags&HASWIDTH) - *flagp |= HASWIDTH; - *flagp |= flags&SPSTART; + *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED); } if (have_branch || paren != ':') { @@ -5650,7 +6135,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) FAIL("Junk on end of regexp"); /* "Can't happen". */ /* NOTREACHED */ } - + if (after_freeze) + RExC_npar = after_freeze; return(ret); } @@ -5669,6 +6155,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) I32 flags = 0, c = 0; GET_RE_DEBUG_FLAGS_DECL; DEBUG_PARSE("brnc"); + if (first) ret = NULL; else { @@ -5697,7 +6184,7 @@ S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth) } else if (ret == NULL) ret = latest; - *flagp |= flags&HASWIDTH; + *flagp |= flags&(HASWIDTH|POSTPONED); if (chain == NULL) /* First piece. */ *flagp |= flags&SPSTART; else { @@ -5879,7 +6366,7 @@ S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) goto do_curly; } nest_check: - if (!SIZE_ONLY && !(flags&HASWIDTH) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { + if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) { vWARN3(RExC_parse, "%.*s matches null string many times", (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0), @@ -5975,7 +6462,7 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) /* RExC_parse points at the beginning brace, endbrace points at the last */ if ( name[0]=='U' && name[1]=='+' ) { - /* its a "unicode hex" notation {U+89AB} */ + /* its a "Unicode hex" notation {U+89AB} */ I32 fl = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); @@ -6104,7 +6591,9 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) char *s; char *p, *pend; STRLEN charlen = 1; +#ifdef DEBUGGING char * parse_start = name-3; /* needed for the offsets */ +#endif GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */ ret = reg_node(pRExC_state, @@ -6197,8 +6686,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) { STRLEN numlen = 1; SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); - const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp) - : SvPVX(sv); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; @@ -6209,8 +6697,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) if (!newlen || numlen != newlen) { uv = UNICODE_REPLACEMENT; - if (encp) - *encp = NULL; + *encp = NULL; } return uv; } @@ -6218,15 +6705,26 @@ S_reg_recode(pTHX_ const char value, SV **encp) /* - regatom - the lowest level - * - * Optimization: gobbles an entire sequence of ordinary characters so that - * it can turn them into a single node, which is smaller to store and - * faster to run. Backslashed characters are exceptions, each becoming a - * separate node; the code is simpler that way and it's not worth fixing. - * - * [Yes, it is worth fixing, some scripts can run twice the speed.] - * [It looks like its ok, as in S_study_chunk we merge adjacent EXACT nodes] - */ + + Try to identify anything special at the start of the pattern. If there + is, then handle it as required. This may involve generating a single regop, + such as for an assertion; or it may involve recursing, such as to + handle a () structure. + + If the string doesn't start with something special then we gobble up + as much literal text as we can. + + Once we have been able to handle whatever type of thing started the + sequence, we return. + + Note: we have to be careful with escapes, as they can be both literal + and special, and in the case of \10 and friends can either, depending + on context. Specifically there are two seperate switches for handling + escape sequences, with the one for handling literal escapes requiring + a dummy entry for all of the special escapes that are actually handled + by the other. +*/ + STATIC regnode * S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) { @@ -6238,8 +6736,9 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) DEBUG_PARSE("atom"); *flagp = WORST; /* Tentatively. */ + tryagain: - switch (*RExC_parse) { + switch ((U8)*RExC_parse) { case '^': RExC_seen_zerolen++; nextchar(pRExC_state); @@ -6300,7 +6799,7 @@ tryagain: } return(NULL); } - *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE); + *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED); break; case '|': case ')': @@ -6323,105 +6822,136 @@ tryagain: RExC_parse++; vFAIL("Quantifier follows nothing"); break; + case 0xDF: + case 0xC3: + case 0xCE: + if (!LOC && FOLD) { + U32 len,cp; + if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { + *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ + RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ + ret = reganode(pRExC_state, FOLDCHAR, cp); + Set_Node_Length(ret, 1); /* MJD */ + nextchar(pRExC_state); /* kill whitespace under /x */ + return ret; + } + } + goto outer_default; case '\\': + /* Special Escapes + + This switch handles escape sequences that resolve to some kind + of special regop and not to literal text. Escape sequnces that + resolve to literal text are handled below in the switch marked + "Literal Escapes". + + Every entry in this switch *must* have a corresponding entry + in the literal escape switch. However, the opposite is not + required, as the default for this switch is to jump to the + literal text handling code. + */ switch (*++RExC_parse) { + /* Special Escapes */ case 'A': RExC_seen_zerolen++; ret = reg_node(pRExC_state, SBOL); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'G': ret = reg_node(pRExC_state, GPOS); RExC_seen |= REG_SEEN_GPOS; *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; + case 'K': + RExC_seen_zerolen++; + ret = reg_node(pRExC_state, KEEPS); + *flagp |= SIMPLE; + goto finish_meta_pat; case 'Z': ret = reg_node(pRExC_state, SEOL); *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ - nextchar(pRExC_state); - break; + goto finish_meta_pat; case 'z': ret = reg_node(pRExC_state, EOS); *flagp |= SIMPLE; RExC_seen_zerolen++; /* Do not optimize RE away */ - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'C': ret = reg_node(pRExC_state, CANY); RExC_seen |= REG_SEEN_CANY; *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'X': ret = reg_node(pRExC_state, CLUMP); *flagp |= HASWIDTH; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'w': ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'W': ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'b': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND)); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'B': RExC_seen_zerolen++; RExC_seen |= REG_SEEN_LOOKBEHIND; ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND)); *flagp |= SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 's': ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'S': ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE)); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'd': ret = reg_node(pRExC_state, DIGIT); *flagp |= HASWIDTH|SIMPLE; - nextchar(pRExC_state); - Set_Node_Length(ret, 2); /* MJD */ - break; + goto finish_meta_pat; case 'D': ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'h': + ret = reg_node(pRExC_state, HORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'H': + ret = reg_node(pRExC_state, NHORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'v': + ret = reg_node(pRExC_state, VERTWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'V': + ret = reg_node(pRExC_state, NVERTWS); + *flagp |= HASWIDTH|SIMPLE; + finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ - break; + break; case 'p': case 'P': { char* const oldregxend = RExC_end; +#ifdef DEBUGGING char* parse_start = RExC_parse - 2; +#endif if (RExC_parse[1] == '{') { /* a lovely hack--pretend we saw [\pX] instead */ @@ -6481,7 +7011,7 @@ tryagain: if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; @@ -6498,16 +7028,6 @@ tryagain: } break; } - case 'n': - case 'r': - case 't': - case 'f': - case 'e': - case 'a': - case 'x': - case 'c': - case '0': - goto defchar; case 'g': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': @@ -6582,14 +7102,13 @@ tryagain: case '#': if (RExC_flags & RXf_PMf_EXTENDED) { - while (RExC_parse < RExC_end && *RExC_parse != '\n') - RExC_parse++; - if (RExC_parse < RExC_end) + if ( reg_skipcomment( pRExC_state ) ) goto tryagain; } /* FALL THROUGH */ - default: { + default: + outer_default:{ register STRLEN len; register UV ender; register char *p; @@ -6613,8 +7132,13 @@ tryagain: char * const oldp = p; if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); - switch (*p) { + p = regwhite( pRExC_state, p ); + switch ((U8)*p) { + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case '^': case '$': case '.': @@ -6624,29 +7148,42 @@ tryagain: case '|': goto loopdone; case '\\': + /* Literal Escapes Switch + + This switch is meant to handle escape sequences that + resolve to a literal character. + + Every escape sequence that represents something + else, like an assertion or a char class, is handled + in the switch marked 'Special Escapes' above in this + routine, but also has an entry here as anything that + isn't explicitly mentioned here will be treated as + an unescaped equivalent literal. + */ + switch (*++p) { - case 'A': - case 'C': - case 'X': - case 'G': - case 'g': - case 'Z': - case 'z': - case 'w': - case 'W': - case 'b': - case 'B': - case 's': - case 'S': - case 'd': - case 'D': - case 'p': - case 'P': - case 'N': - case 'R': - case 'k': + /* These are all the special escapes. */ + case 'A': /* Start assertion */ + case 'b': case 'B': /* Word-boundary assertion*/ + case 'C': /* Single char !DANGEROUS! */ + case 'd': case 'D': /* digit class */ + case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ + case 'k': case 'K': /* named backref, keep marker */ + case 'N': /* named char sequence */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ + case 's': case 'S': /* space class */ + case 'v': case 'V': /* VERTWS */ + case 'w': case 'W': /* word class */ + case 'X': /* eXtended Unicode "combining character sequence" */ + case 'z': case 'Z': /* End of line/string assertion */ --p; goto loopdone; + + /* Anything after here is an escape that resolves to a + literal. (Except digits, which may or may not) + */ case 'n': ender = '\n'; p++; @@ -6750,13 +7287,13 @@ tryagain: ender = *p++; break; } - if (RExC_flags & RXf_PMf_EXTENDED) - p = regwhite(p, RExC_end); + if ( RExC_flags & RXf_PMf_EXTENDED) + p = regwhite( pRExC_state, p ); if (UTF && FOLD) { /* Prime the casefolded buffer. */ ender = toFOLD_uni(ender, tmpbuf, &foldlen); } - if (ISMULT2(p)) { /* Back off on ?+*. */ + if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */ if (len) p = oldp; else if (UTF) { @@ -6858,15 +7395,22 @@ tryagain: } STATIC char * -S_regwhite(char *p, const char *e) +S_regwhite( RExC_state_t *pRExC_state, char *p ) { + const char *e = RExC_end; while (p < e) { if (isSPACE(*p)) ++p; else if (*p == '#') { + bool ended = 0; do { - p++; - } while (p < e && *p != '\n'); + if (*p++ == '\n') { + ended = 1; + break; + } + } while (p < e); + if (!ended) + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; } else break; @@ -7053,6 +7597,21 @@ case ANYOF_N##NAME: \ what = WORD; \ break +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +ANYOF_##NAME: \ + for (value = 0; value < 256; value++) \ + if (TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '+'; \ + what = WORD; \ + break; \ +case ANYOF_N##NAME: \ + for (value = 0; value < 256; value++) \ + if (!TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '!'; \ + what = WORD; \ + break /* parse a class specification and produce either an ANYOF node that @@ -7065,10 +7624,10 @@ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; - register UV value = 0; register UV nextvalue; register IV prevvalue = OOB_UNICODE; register IV range = 0; + UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; IV namedclass; @@ -7171,6 +7730,10 @@ parseit: case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { /* We only pay attention to the first char of @@ -7349,6 +7912,8 @@ parseit: case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); case _C_C_T_(UPPER, isUPPER(value), "Upper"); case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); + case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ASCII); @@ -7470,12 +8035,16 @@ parseit: { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i)) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } else { for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i)) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } } else @@ -7587,7 +8156,7 @@ parseit: return ret; /****** !SIZE_ONLY AFTER HERE *********/ - if( stored == 1 && value < 256 + if( stored == 1 && (value < 128 || (value < 256 && !UTF)) && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) ) ) { /* optimize single char class to an EXACT node @@ -7648,6 +8217,49 @@ parseit: #undef _C_C_T_ +/* reg_skipcomment() + + Absorbs an /x style # comments from the input stream. + Returns true if there is more text remaining in the stream. + Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment + terminates the pattern without including a newline. + + Note its the callers responsibility to ensure that we are + actually in /x mode + +*/ + +STATIC bool +S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state) +{ + bool ended = 0; + while (RExC_parse < RExC_end) + if (*RExC_parse++ == '\n') { + ended = 1; + break; + } + if (!ended) { + /* we ran off the end of the pattern without ending + the comment, so we have to add an \n when wrapping */ + RExC_seen |= REG_SEEN_RUN_ON_COMMENT; + return 0; + } else + return 1; +} + +/* nextchar() + + Advance that parse position, and optionall absorbs + "whitespace" from the inputstream. + + Without /x "whitespace" means (?#...) style comments only, + with /x this means (?#...) and # comments and whitespace proper. + + Returns the RExC_parse point from BEFORE the scan occurs. + + This is the /x friendly way of saying RExC_parse++. +*/ + STATIC char* S_nextchar(pTHX_ RExC_state_t *pRExC_state) { @@ -7670,9 +8282,8 @@ S_nextchar(pTHX_ RExC_state_t *pRExC_state) continue; } else if (*RExC_parse == '#') { - while (RExC_parse < RExC_end) - if (*RExC_parse++ == '\n') break; - continue; + if ( reg_skipcomment( pRExC_state ) ) + continue; } } return retval; @@ -7695,18 +8306,17 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) RExC_size += 1; return(ret); } -#ifdef DEBUGGING - if (OP(RExC_emit) == 255) - Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ", - reg_name[op], OP(RExC_emit)); -#endif + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE(ptr, op); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n", "reg_node", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), @@ -7714,7 +8324,7 @@ S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op) (UV)RExC_offsets[0])); Set_Node_Offset(RExC_emit, RExC_parse + (op == END)); } - +#endif RExC_emit = ptr; return(ret); } @@ -7749,18 +8359,18 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) */ return(ret); } -#ifdef DEBUGGING - if (OP(RExC_emit) == 255) - Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space"); -#endif + if (RExC_emit >= RExC_emit_bound) + Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d", op); + NODE_ALIGN_FILL(ret); ptr = ret; FILL_ADVANCE_NODE_ARG(ptr, op, arg); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reganode", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(RExC_emit - RExC_emit_start), @@ -7768,7 +8378,7 @@ S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg) (UV)RExC_offsets[0])); Set_Cur_Node_Offset; } - +#endif RExC_emit = ptr; return(ret); } @@ -7798,8 +8408,9 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) const int offset = regarglen[(U8)op]; const int size = NODE_STEP_REGNODE + offset; GET_RE_DEBUG_FLAGS_DECL; + PERL_UNUSED_ARG(depth); /* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */ - DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]); + DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]); if (SIZE_ONLY) { RExC_size += size; return; @@ -7810,30 +8421,31 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) dst = RExC_emit; if (RExC_open_parens) { int paren; - DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar); + /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/ for ( paren=0 ; paren < RExC_npar ; paren++ ) { if ( RExC_open_parens[paren] >= opnd ) { - DEBUG_PARSE_FMT("open"," - %d",size); + /*DEBUG_PARSE_FMT("open"," - %d",size);*/ RExC_open_parens[paren] += size; } else { - DEBUG_PARSE_FMT("open"," - %s","ok"); + /*DEBUG_PARSE_FMT("open"," - %s","ok");*/ } if ( RExC_close_parens[paren] >= opnd ) { - DEBUG_PARSE_FMT("close"," - %d",size); + /*DEBUG_PARSE_FMT("close"," - %d",size);*/ RExC_close_parens[paren] += size; } else { - DEBUG_PARSE_FMT("close"," - %s","ok"); + /*DEBUG_PARSE_FMT("close"," - %s","ok");*/ } } } while (src > opnd) { StructCopy(--src, --dst, regnode); +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD 20010112 */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n", "reg_insert", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(dst - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(src - RExC_emit_start), @@ -7842,15 +8454,17 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src)); Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src)); } +#endif } place = opnd; /* Op node, where operand used to be. */ +#ifdef RE_TRACK_PATTERN_OFFSETS if (RExC_offsets) { /* MJD */ MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n", "reginsert", __LINE__, - reg_name[op], + PL_reg_name[op], (UV)(place - RExC_emit_start) > RExC_offsets[0] ? "Overwriting end of array!\n" : "OK", (UV)(place - RExC_emit_start), @@ -7859,6 +8473,7 @@ S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth) Set_Node_Offset(place, RExC_parse); Set_Node_Length(place, 1); } +#endif src = NEXTOPER(place); FILL_ADVANCE_NODE(place, op); Zero(src, offset, regnode); @@ -7893,7 +8508,7 @@ S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 de PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), (temp == NULL ? "->" : ""), - (temp == NULL ? reg_name[OP(val)] : "") + (temp == NULL ? PL_reg_name[OP(val)] : "") ); }); if (temp == NULL) @@ -7974,7 +8589,7 @@ S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val, PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n", SvPV_nolen_const(mysv), REG_NODE_NUM(scan), - reg_name[exact]); + PL_reg_name[exact]); }); if (temp == NULL) break; @@ -8026,6 +8641,27 @@ S_regcurly(register const char *s) /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ +#ifdef DEBUGGING +void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { + int bit; + int set=0; + for (bit=0; bit<32; bit++) { + if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); @@ -8107,6 +8744,7 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); @@ -8132,7 +8770,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* It would be nice to FAIL() here, but this may be called from regexec.c, and it would be hard to supply pRExC_state. */ Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX); - sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */ + sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */ k = PL_regkind[OP(o)]; @@ -8154,14 +8792,14 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ const char op = OP(o); - const I32 n = ARG(o); + const U32 n = ARG(o); const reg_ac_data * const ac = IS_TRIE_AC(op) ? (reg_ac_data *)progi->data->data[n] : NULL; const reg_trie_data * const trie = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie]; - Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]); + Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]); DEBUG_TRIE_COMPILE_r( Perl_sv_catpvf(aTHX_ sv, "", @@ -8208,26 +8846,27 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) { Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */ if ( prog->paren_names ) { - AV *list= (AV *)progi->data->data[progi->name_list_idx]; - SV **name= av_fetch(list, ARG(o), 0 ); - if (name) - Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); - } - } else if (k == NREF) { - if ( prog->paren_names ) { - AV *list= (AV *)progi->data->data[ progi->name_list_idx ]; - SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ]; - I32 *nums=(I32*)SvPVX(sv_dat); - SV **name= av_fetch(list, nums[0], 0 ); - I32 n; - if (name) { - for ( n=0; ndata->data[progi->name_list_idx]; + SV **name= av_fetch(list, ARG(o), 0 ); + if (name) + Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name)); + } + else { + AV *list= (AV *)progi->data->data[ progi->name_list_idx ]; + SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ]; + I32 *nums=(I32*)SvPVX(sv_dat); + SV **name= av_fetch(list, nums[0], 0 ); + I32 n; + if (name) { + for ( n=0; ndata->data[ ARG( o ) ])); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + else if (k == FOLDCHAR) + Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]",ARG(o) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -8386,7 +9027,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ regexp *prog) +Perl_re_intuit_string(pTHX_ REGEXP * const prog) { /* Assume that RE_INTUIT is set */ dVAR; GET_RE_DEBUG_FLAGS_DECL; @@ -8432,34 +9073,82 @@ Perl_pregfree(pTHX_ struct regexp *r) if (!r || (--r->refcnt > 0)) return; - - CALLREGFREE_PVT(r); /* free the private data */ - - /* gcov results gave these as non-null 100% of the time, so there's no - optimisation in checking them before calling Safefree */ - Safefree(r->precomp); + if (r->mother_re) { + ReREFCNT_dec(r->mother_re); + } else { + CALLREGFREE_PVT(r); /* free the private data */ + if (r->paren_names) + SvREFCNT_dec(r->paren_names); + Safefree(r->wrapped); + } + if (r->substrs) { + if (r->anchored_substr) + SvREFCNT_dec(r->anchored_substr); + if (r->anchored_utf8) + SvREFCNT_dec(r->anchored_utf8); + if (r->float_substr) + SvREFCNT_dec(r->float_substr); + if (r->float_utf8) + SvREFCNT_dec(r->float_utf8); + Safefree(r->substrs); + } RX_MATCH_COPY_FREE(r); #ifdef PERL_OLD_COPY_ON_WRITE if (r->saved_copy) - SvREFCNT_dec(r->saved_copy); + SvREFCNT_dec(r->saved_copy); #endif + Safefree(r->swap); + Safefree(r->offs); + Safefree(r); +} + +/* reg_temp_copy() + + This is a hacky workaround to the structural issue of match results + being stored in the regexp structure which is in turn stored in + PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern + could be PL_curpm in multiple contexts, and could require multiple + result sets being associated with the pattern simultaneously, such + as when doing a recursive match with (??{$qr}) + + The solution is to make a lightweight copy of the regexp structure + when a qr// is returned from the code executed by (??{$qr}) this + lightweight copy doesnt actually own any of its data except for + the starp/end and the actual regexp structure itself. + +*/ + + +regexp * +Perl_reg_temp_copy (pTHX_ struct regexp *r) { + regexp *ret; + register const I32 npar = r->nparens+1; + (void)ReREFCNT_inc(r); + Newx(ret, 1, regexp); + StructCopy(r, ret, regexp); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + ret->refcnt = 1; if (r->substrs) { - if (r->anchored_substr) - SvREFCNT_dec(r->anchored_substr); - if (r->anchored_utf8) - SvREFCNT_dec(r->anchored_utf8); - if (r->float_substr) - SvREFCNT_dec(r->float_substr); - if (r->float_utf8) - SvREFCNT_dec(r->float_utf8); - Safefree(r->substrs); + Newx(ret->substrs, 1, struct reg_substr_data); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ } - if (r->paren_names) - SvREFCNT_dec(r->paren_names); + RX_MATCH_COPIED_off(ret); +#ifdef PERL_OLD_COPY_ON_WRITE + ret->saved_copy = NULL; +#endif + ret->mother_re = r; + ret->swap = NULL; - Safefree(r->startp); - Safefree(r->endp); - Safefree(r); + return ret; } #endif @@ -8476,7 +9165,7 @@ Perl_pregfree(pTHX_ struct regexp *r) */ void -Perl_regfree_internal(pTHX_ struct regexp *r) +Perl_regfree_internal(pTHX_ REGEXP * const r) { dVAR; RXi_GET_DECL(r,ri); @@ -8493,8 +9182,10 @@ Perl_regfree_internal(pTHX_ struct regexp *r) PL_colors[4],PL_colors[5],s); } }); - - Safefree(ri->offsets); /* 20010421 MJD */ +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) + Safefree(ri->u.offsets); /* 20010421 MJD */ +#endif if (ri->data) { int n = ri->data->count; PAD* new_comppad = NULL; @@ -8583,11 +9274,7 @@ Perl_regfree_internal(pTHX_ struct regexp *r) Safefree(ri->data->what); Safefree(ri->data); } - if (ri->swap) { - Safefree(ri->swap->startp); - Safefree(ri->swap->endp); - Safefree(ri->swap); - } + Safefree(ri); } @@ -8597,12 +9284,11 @@ Perl_regfree_internal(pTHX_ struct regexp *r) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* - regdupe - duplicate a regexp. - - This routine is called by sv.c's re_dup and is expected to clone a - given regexp structure. It is a no-op when not under USE_ITHREADS. - (Originally this *was* re_dup() for change history see sv.c) + re_dup - duplicate a regexp. + This routine is expected to clone a given regexp structure. It is not + compiler under USE_ITHREADS. + After all of the core data stored in struct regexp is duplicated the regexp_engine.dupe method is used to copy any private data stored in the *pprivate pointer. This allows extensions to handle @@ -8617,8 +9303,7 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) { dVAR; regexp *ret; - int i, npar; - struct reg_substr_datum *s; + I32 npar; if (!r) return (REGEXP *)NULL; @@ -8628,52 +9313,63 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) npar = r->nparens+1; - Newxz(ret, 1, regexp); - Newx(ret->startp, npar, I32); - Copy(r->startp, ret->startp, npar, I32); - Newx(ret->endp, npar, I32); - Copy(r->endp, ret->endp, npar, I32); + Newx(ret, 1, regexp); + StructCopy(r, ret, regexp); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if(ret->swap) { + /* no need to copy these */ + Newx(ret->swap, npar, regexp_paren_pair); + } - if (r->substrs) { + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr == r->anchored_substr; Newx(ret->substrs, 1, struct reg_substr_data); - for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { - s->min_offset = r->substrs->data[i].min_offset; - s->max_offset = r->substrs->data[i].max_offset; - s->end_shift = r->substrs->data[i].end_shift; - s->substr = sv_dup_inc(r->substrs->data[i].substr, param); - s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param); - } - } else - ret->substrs = NULL; - - ret->precomp = SAVEPVN(r->precomp, r->prelen); - ret->refcnt = r->refcnt; - ret->minlen = r->minlen; - ret->minlenret = r->minlenret; - ret->prelen = r->prelen; - ret->nparens = r->nparens; - ret->lastparen = r->lastparen; - ret->lastcloseparen = r->lastcloseparen; - ret->intflags = r->intflags; - ret->extflags = r->extflags; - - ret->sublen = r->sublen; - - ret->engine = r->engine; - - ret->paren_names = hv_dup_inc(r->paren_names, param); + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } + } + + ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1); + ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped); + ret->paren_names = hv_dup_inc(ret->paren_names, param); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); if (RX_MATCH_COPIED(ret)) - ret->subbeg = SAVEPVN(r->subbeg, r->sublen); + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; #ifdef PERL_OLD_COPY_ON_WRITE ret->saved_copy = NULL; #endif - - ret->pprivate = r->pprivate; - if (ret->pprivate) - RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); + + ret->mother_re = NULL; + ret->gofs = 0; + ret->seen_evals = 0; ptr_table_store(PL_ptr_table, r, ret); return ret; @@ -8695,7 +9391,7 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) { dVAR; regexp_internal *reti; @@ -8703,22 +9399,14 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) RXi_GET_DECL(r,ri); npar = r->nparens+1; - len = ri->offsets[0]; + len = ProgLen(ri); Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal); Copy(ri->program, reti->program, len+1, regnode); - if(ri->swap) { - Newx(reti->swap, 1, regexp_paren_ofs); - /* no need to copy these */ - Newx(reti->swap->startp, npar, I32); - Newx(reti->swap->endp, npar, I32); - } else { - reti->swap = NULL; - } - reti->regstclass = NULL; + if (ri->data) { struct reg_data *d; const int count = ri->data->count; @@ -8779,9 +9467,17 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) else reti->data = NULL; - Newx(reti->offsets, 2*len+1, U32); - Copy(ri->offsets, reti->offsets, 2*len+1, U32); - + reti->name_list_idx = ri->name_list_idx; + +#ifdef RE_TRACK_PATTERN_OFFSETS + if (ri->u.offsets) { + Newx(reti->u.offsets, 2*len+1, U32); + Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32); + } +#else + SetProgLen(reti,len); +#endif + return (void*)reti; } @@ -8815,80 +9511,18 @@ Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) */ #ifndef PERL_IN_XSUB_RE + char * Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) { dVAR; const regexp * const re = (regexp *)mg->mg_obj; - - if (!mg->mg_ptr) { - const char *fptr = "msix"; - char reflags[6]; - char ch; - int left = 0; - int right = 4; - bool need_newline = 0; - U16 reganch = (U16)((re->extflags & RXf_PMf_COMPILETIME) >> 12); - - while((ch = *fptr++)) { - if(reganch & 1) { - reflags[left++] = ch; - } - else { - reflags[right--] = ch; - } - reganch >>= 1; - } - if(left != 4) { - reflags[left] = '-'; - left = 5; - } - - mg->mg_len = re->prelen + 4 + left; - /* - * If /x was used, we have to worry about a regex ending with a - * comment later being embedded within another regex. If so, we don't - * want this regex's "commentization" to leak out to the right part of - * the enclosing regex, we must cap it with a newline. - * - * So, if /x was used, we scan backwards from the end of the regex. If - * we find a '#' before we find a newline, we need to add a newline - * ourself. If we find a '\n' first (or if we don't find '#' or '\n'), - * we don't need to add anything. -jfriedl - */ - if (PMf_EXTENDED & re->extflags) { - const char *endptr = re->precomp + re->prelen; - while (endptr >= re->precomp) { - const char c = *(endptr--); - if (c == '\n') - break; /* don't need another */ - if (c == '#') { - /* we end while in a comment, so we need a newline */ - mg->mg_len++; /* save space for it */ - need_newline = 1; /* note to add it */ - break; - } - } - } - - Newx(mg->mg_ptr, mg->mg_len + 1 + left, char); - mg->mg_ptr[0] = '('; - mg->mg_ptr[1] = '?'; - Copy(reflags, mg->mg_ptr+2, left, char); - *(mg->mg_ptr+left+2) = ':'; - Copy(re->precomp, mg->mg_ptr+3+left, re->prelen, char); - if (need_newline) - mg->mg_ptr[mg->mg_len - 2] = '\n'; - mg->mg_ptr[mg->mg_len - 1] = ')'; - mg->mg_ptr[mg->mg_len] = 0; - } if (haseval) *haseval = re->seen_evals; if (flags) *flags = ((re->extflags & RXf_UTF8) ? 1 : 0); - if (lp) - *lp = mg->mg_len; - return mg->mg_ptr; + *lp = re->wraplen; + return re->wrapped; } /* @@ -9098,7 +9732,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, else if ( PL_regkind[(U8)op] == TRIE ) { const regnode *this_trie = node; const char op = OP(node); - const I32 n = ARG(node); + const U32 n = ARG(node); const reg_ac_data * const ac = op>=AHOCORASICK ? (reg_ac_data *)ri->data->data[n] : NULL;