X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/28d8d7f41ab202dd5f7611033d27ecad44cadd60..f64b4994f9f999ca75917a20a715de24dfcb1237:/regcomp.c diff --git a/regcomp.c b/regcomp.c index 663d288..3ad5d8c 100644 --- a/regcomp.c +++ b/regcomp.c @@ -1163,11 +1163,19 @@ is the recommended Unicode-aware way of saying #define TRIE_STORE_REVCHAR \ STMT_START { \ - SV *tmp = newSVpvs(""); \ - if (UTF) SvUTF8_on(tmp); \ - Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \ - av_push( revcharmap, tmp ); \ - } STMT_END + if (UTF) { \ + SV *zlopp = newSV(2); \ + unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \ + unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \ + SvCUR_set(zlopp, kapow - flrbbbbb); \ + SvPOK_on(zlopp); \ + SvUTF8_on(zlopp); \ + av_push(revcharmap, zlopp); \ + } else { \ + char ooooff = (char)uvc; \ + av_push(revcharmap, newSVpvn(&ooooff, 1)); \ + } \ + } STMT_END #define TRIE_READ_CHAR STMT_START { \ wordlen++; \ @@ -1356,7 +1364,7 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs have unique chars. We use an array of integers to represent the character codes 0..255 - (trie->charmap) and we use a an HV* to store unicode characters. We use the + (trie->charmap) and we use a an HV* to store Unicode characters. We use the native representation of the character value as the key and IV's for the coded index. @@ -1405,7 +1413,20 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs /* store the codepoint in the bitmap, and if its ascii also store its folded equivelent. */ TRIE_BITMAP_SET(trie,uvc); - if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + /* store the folded codepoint */ + if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]); + + if ( !UTF ) { + /* store first byte of utf8 representation of + codepoints in the 127 < uvc < 256 range */ + if (127 < uvc && uvc < 192) { + TRIE_BITMAP_SET(trie,194); + } else if (191 < uvc ) { + TRIE_BITMAP_SET(trie,195); + /* && uvc < 256 -- we know uvc is < 256 already */ + } + } set_bit = 0; /* We've done our bit :-) */ } } else { @@ -1966,7 +1987,8 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs } if ( count == 1 ) { SV **tmp = av_fetch( revcharmap, idx, 0); - char *ch = SvPV_nolen( *tmp ); + STRLEN len; + char *ch = SvPV( *tmp, len ); DEBUG_OPTIMISE_r({ SV *sv=sv_newmortal(); PerlIO_printf( Perl_debug_log, @@ -1985,11 +2007,9 @@ S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *firs str=STRING(convert); STR_LEN(convert)=0; } - while (*ch) { + STR_LEN(convert) += len; + while (len--) *str++ = *ch++; - STR_LEN(convert)++; - } - } else { #ifdef DEBUGGING if (state>1) @@ -2400,6 +2420,34 @@ typedef struct scan_frame { #define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf) +#define CASE_SYNST_FNC(nAmE) \ +case nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break; \ +case N ## nAmE: \ + if (flags & SCF_DO_STCLASS_AND) { \ + for (value = 0; value < 256; value++) \ + if (is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_CLEAR(data->start_class, value); \ + } \ + else { \ + for (value = 0; value < 256; value++) \ + if (!is_ ## nAmE ## _cp(value)) \ + ANYOF_BITMAP_SET(data->start_class, value); \ + } \ + break + + + STATIC I32 S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, I32 *minlenp, I32 *deltap, @@ -3270,7 +3318,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, SvUTF8(sv) && SvMAGICAL(sv) ? mg_find(sv, PERL_MAGIC_utf8) : NULL; if (mg && mg->mg_len >= 0) - mg->mg_len += CHR_SVLEN(last_str); + mg->mg_len += CHR_SVLEN(last_str) - l; } data->last_end += l * (mincount - 1); } @@ -3330,6 +3378,46 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, break; } } + else if (OP(scan) == LNBREAK) { + if (flags & SCF_DO_STCLASS) { + int value = 0; + data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */ + if (flags & SCF_DO_STCLASS_AND) { + for (value = 0; value < 256; value++) + if (!is_VERTWS_cp(value)) + ANYOF_BITMAP_CLEAR(data->start_class, value); + } + else { + for (value = 0; value < 256; value++) + if (is_VERTWS_cp(value)) + ANYOF_BITMAP_SET(data->start_class, value); + } + if (flags & SCF_DO_STCLASS_OR) + cl_and(data->start_class, and_withp); + flags &= ~SCF_DO_STCLASS; + } + min += 1; + delta += 1; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += 1; + data->longest = &(data->longest_float); + } + + } + else if (OP(scan) == FOLDCHAR) { + int d = ARG(scan)==0xDF ? 1 : 2; + flags &= ~SCF_DO_STCLASS; + min += 1; + delta += d; + if (flags & SCF_DO_SUBSTR) { + SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */ + data->pos_min += 1; + data->pos_delta += d; + data->longest = &(data->longest_float); + } + } else if (strchr((const char*)PL_simple,OP(scan))) { int value = 0; @@ -3524,6 +3612,9 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } } break; + CASE_SYNST_FNC(VERTWS); + CASE_SYNST_FNC(HORIZWS); + } if (flags & SCF_DO_STCLASS_OR) cl_and(data->start_class, and_withp); @@ -3894,6 +3985,7 @@ S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp, } #endif /* old or new */ #endif /* TRIE_STUDY_OPT */ + /* Else: zero-length, ignore. */ scan = regnext(scan); } @@ -4016,8 +4108,8 @@ extern const struct regexp_engine my_reg_engine; #endif #ifndef PERL_IN_XSUB_RE -regexp * -Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) +REGEXP * +Perl_pregcomp(pTHX_ const SV * const pattern, const U32 flags) { dVAR; HV * const table = GvHV(PL_hintgv); @@ -4032,21 +4124,23 @@ Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm) PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n", SvIV(*ptr)); }); - return CALLREGCOMP_ENG(eng, exp, xend, pm); + return CALLREGCOMP_ENG(eng, pattern, flags); } } - return Perl_re_compile(aTHX_ exp, xend, pm); + return Perl_re_compile(aTHX_ pattern, flags); } #endif -regexp * -Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) +REGEXP * +Perl_re_compile(pTHX_ const SV * const pattern, const U32 pm_flags) { dVAR; - register regexp *r; + register REGEXP *r; register regexp_internal *ri; + STRLEN plen; + char* exp = SvPV((SV*)pattern, plen); + char* xend = exp + plen; regnode *scan; - regnode *first; I32 flags; I32 minlen = 0; I32 sawplus = 0; @@ -4060,23 +4154,20 @@ Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm) #endif GET_RE_DEBUG_FLAGS_DECL; DEBUG_r(if (!PL_colorset) reginitcolors()); - - if (exp == NULL) - FAIL("NULL regexp argument"); - RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8; + RExC_utf8 = RExC_orig_utf8 = pm_flags & RXf_UTF8; DEBUG_COMPILE_r({ SV *dsv= sv_newmortal(); RE_PV_QUOTED_DECL(s, RExC_utf8, - dsv, exp, (xend - exp), 60); + dsv, exp, plen, 60); PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n", PL_colors[4],PL_colors[5],s); }); redo_first_pass: RExC_precomp = exp; - RExC_flags = pm->op_pmflags; + RExC_flags = pm_flags; RExC_sawback = 0; RExC_seen = 0; @@ -4115,7 +4206,7 @@ redo_first_pass: return(NULL); } if (RExC_utf8 && !RExC_orig_utf8) { - /* It's possible to write a regexp in ascii that represents unicode + /* It's possible to write a regexp in ascii that represents Unicode codepoints outside of the byte range, such as via \x{100}. If we detect such a sequence we have to convert the entire pattern to utf8 and then recompile, as our sizing calculation will have been based @@ -4124,7 +4215,7 @@ redo_first_pass: thing. XXX: somehow figure out how to make this less expensive... -- dmq */ - STRLEN len = xend-exp; + STRLEN len = plen; DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "UTF8 mismatch! Converting to utf8 for resizing and compile\n")); exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len); @@ -4170,24 +4261,24 @@ redo_first_pass: RXi_SET( r, ri ); r->engine= RE_ENGINE_PTR; r->refcnt = 1; - r->prelen = xend - exp; - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; + r->prelen = plen; + r->extflags = pm_flags; { - bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); + bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY); bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD); bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT); U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12); const char *fptr = STD_PAT_MODS; /*"msix"*/ char *p; - r->wraplen = r->prelen + has_minus + has_k + has_runon + r->wraplen = r->prelen + has_minus + has_p + has_runon + (sizeof(STD_PAT_MODS) - 1) + (sizeof("(?:)") - 1); Newx(r->wrapped, r->wraplen + 1, char ); p = r->wrapped; *p++='('; *p++='?'; - if (has_k) - *p++ = KEEPCOPY_PAT_MOD; /*'k'*/ + if (has_p) + *p++ = KEEPCOPY_PAT_MOD; /*'p'*/ { char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1; char *colon = r + 1; @@ -4239,7 +4330,7 @@ redo_first_pass: RExC_rxi = ri; /* Second pass: emit code. */ - RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */ + RExC_flags = pm_flags; /* don't let top level (?i) bleed */ RExC_parse = exp; RExC_end = xend; RExC_naughty = 0; @@ -4251,9 +4342,10 @@ redo_first_pass: /* Store the count of eval-groups for security checks: */ RExC_rx->seen_evals = RExC_seen_evals; REGC((U8)REG_MAGIC, (char*) RExC_emit++); - if (reg(pRExC_state, 0, &flags,1) == NULL) + if (reg(pRExC_state, 0, &flags,1) == NULL) { + ReREFCNT_dec(r); return(NULL); - + } /* XXXX To minimize changes to RE engine we always allocate 3-units-long substrs field. */ Newx(r->substrs, 1, struct reg_substr_data); @@ -4291,8 +4383,9 @@ reStudy: #endif /* Dig out information for optimizations. */ - r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */ - pm->op_pmflags = RExC_flags; + r->extflags = RExC_flags; /* was pm_op */ + /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */ + if (UTF) r->extflags |= RXf_UTF8; /* Unicode in it? */ ri->regstclass = NULL; @@ -4308,18 +4401,20 @@ reStudy: struct regnode_charclass_class ch_class; /* pointed to by data */ int stclass_flag; I32 last_close = 0; /* pointed to by data */ - - first = scan; + regnode *first= scan; + regnode *first_next= regnext(first); + /* Skip introductions and multiplicators >= 1. */ while ((OP(first) == OPEN && (sawopen = 1)) || /* An OR of *one* alternative - should not happen now. */ - (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) || + (OP(first) == BRANCH && OP(first_next) != BRANCH) || /* for now we can't handle lookbehind IFMATCH*/ (OP(first) == IFMATCH && !first->flags) || (OP(first) == PLUS) || (OP(first) == MINMOD) || /* An {n,m} with n>0 */ - (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ) + (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) || + (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END )) { if (OP(first) == PLUS) @@ -4331,6 +4426,7 @@ reStudy: first += EXTRA_STEP_2ARGS; } else /* XXX possible optimisation for /(?=)/ */ first = NEXTOPER(first); + first_next= regnext(first); } /* Starting-point info. */ @@ -4676,11 +4772,34 @@ reStudy: r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names); else r->paren_names = NULL; - if (r->prelen == 3 && strEQ("\\s+", r->precomp)) - r->extflags |= RXf_WHITE; + +#ifdef STUPID_PATTERN_CHECKS + if (r->prelen == 0) + r->extflags |= RXf_NULL; + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else if (r->prelen == 3 && memEQ("\\s+", r->precomp, 3)) + r->extflags |= RXf_WHITE; else if (r->prelen == 1 && r->precomp[0] == '^') r->extflags |= RXf_START_ONLY; - +#else + if (r->extflags & RXf_SPLIT && r->prelen == 1 && r->precomp[0] == ' ') + /* XXX: this should happen BEFORE we compile */ + r->extflags |= (RXf_SKIPWHITE|RXf_WHITE); + else { + regnode *first = ri->program + 1; + U8 fop = OP(first); + U8 nop = OP(NEXTOPER(first)); + + if (PL_regkind[fop] == NOTHING && nop == END) + r->extflags |= RXf_NULL; + else if (PL_regkind[fop] == BOL && nop == END) + r->extflags |= RXf_START_ONLY; + else if (fop == PLUS && nop ==SPACE && OP(regnext(first))==END) + r->extflags |= RXf_WHITE; + } +#endif #ifdef DEBUGGING if (RExC_paren_names) { ri->name_list_idx = add_data( pRExC_state, 1, "p" ); @@ -4695,8 +4814,7 @@ reStudy: ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan ); } } - Newxz(r->startp, RExC_npar, I32); - Newxz(r->endp, RExC_npar, I32); + Newxz(r->offs, RExC_npar, regexp_paren_pair); /* assume we don't need to swap parens around before we match */ DEBUG_DUMP_r({ @@ -4724,11 +4842,52 @@ reStudy: SV* -Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) +Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value, + const U32 flags) +{ + PERL_UNUSED_ARG(value); + + if (flags & RXapif_FETCH) { + return reg_named_buff_fetch(rx, key, flags); + } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) { + Perl_croak(aTHX_ PL_no_modify); + return NULL; + } else if (flags & RXapif_EXISTS) { + return reg_named_buff_exists(rx, key, flags) + ? &PL_sv_yes + : &PL_sv_no; + } else if (flags & RXapif_REGNAMES) { + return reg_named_buff_all(rx, flags); + } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) { + return reg_named_buff_scalar(rx, flags); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey, + const U32 flags) +{ + PERL_UNUSED_ARG(lastkey); + + if (flags & RXapif_FIRSTKEY) + return reg_named_buff_firstkey(rx, flags); + else if (flags & RXapif_NEXTKEY) + return reg_named_buff_nextkey(rx, flags); + else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags); + return NULL; + } +} + +SV* +Perl_reg_named_buff_fetch(pTHX_ REGEXP * const rx, SV * const namesv, const U32 flags) { AV *retarray = NULL; SV *ret; - if (flags & 1) + if (flags & RXapif_ALL) retarray=newAV(); if (rx && rx->paren_names) { @@ -4738,63 +4897,183 @@ Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags) SV* sv_dat=HeVAL(he_str); I32 *nums=(I32*)SvPVX(sv_dat); for ( i=0; inparens) >= nums[i] - && rx->startp[nums[i]] != -1 - && rx->endp[nums[i]] != -1) + if ((I32)(rx->nparens) >= nums[i] + && rx->offs[nums[i]].start != -1 + && rx->offs[nums[i]].end != -1) { - ret = CALLREG_NUMBUF(rx,nums[i],NULL); + ret = newSVpvs(""); + CALLREG_NUMBUF_FETCH(rx,nums[i],ret); if (!retarray) return ret; } else { ret = newSVsv(&PL_sv_undef); } if (retarray) { - SvREFCNT_inc(ret); + SvREFCNT_inc_simple_void(ret); av_push(retarray, ret); } } if (retarray) - return (SV*)retarray; + return newRV((SV*)retarray); } } return NULL; } +bool +Perl_reg_named_buff_exists(pTHX_ REGEXP * const rx, SV * const key, + const U32 flags) +{ + if (rx && rx->paren_names) { + if (flags & RXapif_ALL) { + return hv_exists_ent(rx->paren_names, key, 0); + } else { + SV *sv = CALLREG_NAMED_BUFF_FETCH(rx, key, flags); + if (sv) { + SvREFCNT_dec(sv); + return TRUE; + } else { + return FALSE; + } + } + } else { + return FALSE; + } +} + SV* -Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) +Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const rx, const U32 flags) +{ + if ( rx && rx->paren_names ) { + (void)hv_iterinit(rx->paren_names); + + return CALLREG_NAMED_BUFF_NEXTKEY(rx, NULL, flags & ~RXapif_FIRSTKEY); + } else { + return FALSE; + } +} + +SV* +Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const rx, const U32 flags) +{ + if (rx && rx->paren_names) { + HV *hv = rx->paren_names; + HE *temphe; + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + STRLEN len; + char *pv = HePV(temphe, len); + return newSVpvn(pv,len); + } + } + } + return NULL; +} + +SV* +Perl_reg_named_buff_scalar(pTHX_ REGEXP * const rx, const U32 flags) +{ + SV *ret; + AV *av; + I32 length; + + if (rx && rx->paren_names) { + if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) { + return newSViv(HvTOTALKEYS(rx->paren_names)); + } else if (flags & RXapif_ONE) { + ret = CALLREG_NAMED_BUFF_ALL(rx, (flags | RXapif_REGNAMES)); + av = (AV*)SvRV(ret); + length = av_len(av); + return newSViv(length + 1); + } else { + Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags); + return NULL; + } + } + return &PL_sv_undef; +} + +SV* +Perl_reg_named_buff_all(pTHX_ REGEXP * const rx, const U32 flags) +{ + AV *av = newAV(); + + if (rx && rx->paren_names) { + HV *hv= rx->paren_names; + HE *temphe; + (void)hv_iterinit(hv); + while ( (temphe = hv_iternext_flags(hv,0)) ) { + IV i; + IV parno = 0; + SV* sv_dat = HeVAL(temphe); + I32 *nums = (I32*)SvPVX(sv_dat); + for ( i = 0; i < SvIVX(sv_dat); i++ ) { + if ((I32)(rx->lastcloseparen) >= nums[i] && + rx->offs[nums[i]].start != -1 && + rx->offs[nums[i]].end != -1) + { + parno = nums[i]; + break; + } + } + if (parno || flags & RXapif_ALL) { + STRLEN len; + char *pv = HePV(temphe, len); + av_push(av, newSVpvn(pv,len)); + } + } + } + + return newRV((SV*)av); +} + +void +Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const rx, const I32 paren, SV * const sv) { char *s = NULL; I32 i = 0; I32 s1, t1; - SV *sv = usesv ? usesv : newSVpvs(""); if (!rx->subbeg) { sv_setsv(sv,&PL_sv_undef); - return sv; + return; } else - if (paren == -2 && rx->startp[0] != -1) { + if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) { /* $` */ - i = rx->startp[0]; + i = rx->offs[0].start; s = rx->subbeg; } else - if (paren == -1 && rx->endp[0] != -1) { + if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) { /* $' */ - s = rx->subbeg + rx->endp[0]; - i = rx->sublen - rx->endp[0]; + s = rx->subbeg + rx->offs[0].end; + i = rx->sublen - rx->offs[0].end; } else if ( 0 <= paren && paren <= (I32)rx->nparens && - (s1 = rx->startp[paren]) != -1 && - (t1 = rx->endp[paren]) != -1) + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) { /* $& $1 ... */ i = t1 - s1; s = rx->subbeg + s1; } else { sv_setsv(sv,&PL_sv_undef); - return sv; + return; } assert(rx->sublen >= (s - rx->subbeg) + i ); if (i >= 0) { @@ -4832,10 +5111,86 @@ Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv) } } else { sv_setsv(sv,&PL_sv_undef); + return; } - return sv; } +void +Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren, + SV const * const value) +{ + PERL_UNUSED_ARG(rx); + PERL_UNUSED_ARG(paren); + PERL_UNUSED_ARG(value); + + if (!PL_localizing) + Perl_croak(aTHX_ PL_no_modify); +} + +I32 +Perl_reg_numbered_buff_length(pTHX_ REGEXP * const rx, const SV * const sv, + const I32 paren) +{ + I32 i; + I32 s1, t1; + + /* Some of this code was originally in C in F */ + switch (paren) { + /* $` / ${^PREMATCH} */ + case RX_BUFF_IDX_PREMATCH: + if (rx->offs[0].start != -1) { + i = rx->offs[0].start; + if (i > 0) { + s1 = 0; + t1 = i; + goto getlen; + } + } + return 0; + /* $' / ${^POSTMATCH} */ + case RX_BUFF_IDX_POSTMATCH: + if (rx->offs[0].end != -1) { + i = rx->sublen - rx->offs[0].end; + if (i > 0) { + s1 = rx->offs[0].end; + t1 = rx->sublen; + goto getlen; + } + } + return 0; + /* $& / ${^MATCH}, $1, $2, ... */ + default: + if (paren <= (I32)rx->nparens && + (s1 = rx->offs[paren].start) != -1 && + (t1 = rx->offs[paren].end) != -1) + { + i = t1 - s1; + goto getlen; + } else { + if (ckWARN(WARN_UNINITIALIZED)) + report_uninit((SV*)sv); + return 0; + } + } + getlen: + if (i > 0 && RX_MATCH_UTF8(rx)) { + const char * const s = rx->subbeg + s1; + const U8 *ep; + STRLEN el; + + i = t1 - s1; + if (is_utf8_string_loclen((U8*)s, i, &ep, &el)) + i = el; + } + return i; +} + +SV* +Perl_reg_qr_package(pTHX_ REGEXP * const rx) +{ + PERL_UNUSED_ARG(rx); + return newSVpvs("Regexp"); +} /* Scans the name of a named buffer from the pattern. * If flags is REG_RSN_RETURN_NULL returns null. @@ -4954,10 +5309,6 @@ S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) { #define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1) #endif -/* this idea is borrowed from STR_WITH_LEN in handy.h */ -#define CHECK_WORD(s,v,l) \ - (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1)))) - STATIC regnode * S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */ @@ -4969,7 +5320,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) register regnode *ender = NULL; register I32 parno = 0; I32 flags; - const I32 oregflags = RExC_flags; + U32 oregflags = RExC_flags; bool have_branch = 0; bool is_open = 0; I32 freeze_paren = 0; @@ -5026,39 +5377,39 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) switch ( *start_verb ) { case 'A': /* (*ACCEPT) */ - if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) { + if ( memEQs(start_verb,verb_len,"ACCEPT") ) { op = ACCEPT; internal_argval = RExC_nestroot; } break; case 'C': /* (*COMMIT) */ - if ( CHECK_WORD("COMMIT",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"COMMIT") ) op = COMMIT; break; case 'F': /* (*FAIL) */ - if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) { + if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) { op = OPFAIL; argok = 0; } break; case ':': /* (*:NAME) */ case 'M': /* (*MARK:NAME) */ - if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) { + if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) { op = MARKPOINT; argok = -1; } break; case 'P': /* (*PRUNE) */ - if ( CHECK_WORD("PRUNE",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"PRUNE") ) op = PRUNE; break; case 'S': /* (*SKIP) */ - if ( CHECK_WORD("SKIP",start_verb,verb_len) ) + if ( memEQs(start_verb,verb_len,"SKIP") ) op = SKIP; break; case 'T': /* (*THEN) */ /* [19:06] :: is then */ - if ( CHECK_WORD("THEN",start_verb,verb_len) ) { + if ( memEQs(start_verb,verb_len,"THEN") ) { op = CUTGROUP; RExC_seen |= REG_SEEN_CUTGROUP; } @@ -5129,7 +5480,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; ret = reganode(pRExC_state, @@ -5464,7 +5815,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } ret = reganode(pRExC_state,NGROUPP,num); goto insert_if_check_paren; @@ -5568,8 +5919,8 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) and must be globally applied -- japhy */ switch (*RExC_parse) { CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp); - case 'o': - case 'g': + case ONCE_PAT_MOD: /* 'o' */ + case GLOBAL_PAT_MOD: /* 'g' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G; if (! (wastedflags & wflagbit) ) { @@ -5586,7 +5937,7 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } break; - case 'c': + case CONTINUE_PAT_MOD: /* 'c' */ if (SIZE_ONLY && ckWARN(WARN_REGEXP)) { if (! (wastedflags & WASTED_C) ) { wastedflags |= WASTED_GC; @@ -5599,10 +5950,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) } } break; - case 'k': + case KEEPCOPY_PAT_MOD: /* 'p' */ if (flagsp == &negflags) { if (SIZE_ONLY && ckWARN(WARN_REGEXP)) - vWARN(RExC_parse + 1,"Useless use of (?-k)"); + vWARN(RExC_parse + 1,"Useless use of (?-p)"); } else { *flagsp |= RXf_PMf_KEEPCOPY; } @@ -5622,6 +5973,10 @@ S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth) case ')': RExC_flags |= posflags; RExC_flags &= ~negflags; + if (paren != ':') { + oregflags |= posflags; + oregflags &= ~negflags; + } nextchar(pRExC_state); if (paren != ':') { *flagp = TRYAGAIN; @@ -6136,11 +6491,12 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) /* RExC_parse points at the beginning brace, endbrace points at the last */ if ( name[0]=='U' && name[1]=='+' ) { - /* its a "unicode hex" notation {U+89AB} */ + /* its a "Unicode hex" notation {U+89AB} */ I32 fl = PERL_SCAN_ALLOW_UNDERSCORES | PERL_SCAN_DISALLOW_PREFIX | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0); UV cp; + char string; len = (STRLEN)(endbrace - name - 2); cp = grok_hex(name + 2, &len, &fl, NULL); if ( len != (STRLEN)(endbrace - name - 2) ) { @@ -6152,7 +6508,8 @@ S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep) *valuep = cp; return NULL; } - sv_str= Perl_newSVpvf_nocontext("%c",(int)cp); + string = (char)cp; + sv_str= newSVpvn(&string, 1); } else { /* fetch the charnames handler for this scope */ HV * const table = GvHV(PL_hintgv); @@ -6360,8 +6717,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) { STRLEN numlen = 1; SV * const sv = sv_2mortal(newSVpvn(&value, numlen)); - const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp) - : SvPVX(sv); + const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv); const STRLEN newlen = SvCUR(sv); UV uv = UNICODE_REPLACEMENT; @@ -6372,8 +6728,7 @@ S_reg_recode(pTHX_ const char value, SV **encp) if (!newlen || numlen != newlen) { uv = UNICODE_REPLACEMENT; - if (encp) - *encp = NULL; + *encp = NULL; } return uv; } @@ -6414,7 +6769,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth) tryagain: - switch (*RExC_parse) { + switch ((U8)*RExC_parse) { case '^': RExC_seen_zerolen++; nextchar(pRExC_state); @@ -6498,6 +6853,22 @@ tryagain: RExC_parse++; vFAIL("Quantifier follows nothing"); break; + case 0xDF: + case 0xC3: + case 0xCE: + if (!LOC && FOLD) { + U32 len,cp; + len=0; /* silence a spurious compiler warning */ + if ((cp = what_len_TRICKYFOLD_safe(RExC_parse,RExC_end,UTF,len))) { + *flagp |= HASWIDTH; /* could be SIMPLE too, but needs a handler in regexec.regrepeat */ + RExC_parse+=len-1; /* we get one from nextchar() as well. :-( */ + ret = reganode(pRExC_state, FOLDCHAR, cp); + Set_Node_Length(ret, 1); /* MJD */ + nextchar(pRExC_state); /* kill whitespace under /x */ + return ret; + } + } + goto outer_default; case '\\': /* Special Escapes @@ -6583,15 +6954,25 @@ tryagain: ret = reg_node(pRExC_state, NDIGIT); *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; + case 'R': + ret = reg_node(pRExC_state, LNBREAK); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'h': + ret = reg_node(pRExC_state, HORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; + case 'H': + ret = reg_node(pRExC_state, NHORIZWS); + *flagp |= HASWIDTH|SIMPLE; + goto finish_meta_pat; case 'v': - ret = reganode(pRExC_state, PRUNE, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, VERTWS); + *flagp |= HASWIDTH|SIMPLE; goto finish_meta_pat; case 'V': - ret = reganode(pRExC_state, SKIP, 0); - ret->flags = 1; - *flagp |= SIMPLE; + ret = reg_node(pRExC_state, NVERTWS); + *flagp |= HASWIDTH|SIMPLE; finish_meta_pat: nextchar(pRExC_state); Set_Node_Length(ret, 2); /* MJD */ @@ -6662,7 +7043,7 @@ tryagain: if (!SIZE_ONLY) { num = add_data( pRExC_state, 1, "S" ); RExC_rxi->data->data[num]=(void*)sv_dat; - SvREFCNT_inc(sv_dat); + SvREFCNT_inc_simple_void(sv_dat); } RExC_sawback = 1; @@ -6703,6 +7084,8 @@ tryagain: goto parse_named_seq; } } num = atoi(RExC_parse); + if (isg && num == 0) + vFAIL("Reference to invalid group 0"); if (isrel) { num = RExC_npar - num; if (num < 1) @@ -6758,7 +7141,8 @@ tryagain: } /* FALL THROUGH */ - default: { + default: + outer_default:{ register STRLEN len; register UV ender; register char *p; @@ -6783,7 +7167,12 @@ tryagain: if (RExC_flags & RXf_PMf_EXTENDED) p = regwhite( pRExC_state, p ); - switch (*p) { + switch ((U8)*p) { + case 0xDF: + case 0xC3: + case 0xCE: + if (LOC || !FOLD || !is_TRICKYFOLD_safe(p,RExC_end,UTF)) + goto normal_default; case '^': case '$': case '.': @@ -6813,11 +7202,13 @@ tryagain: case 'C': /* Single char !DANGEROUS! */ case 'd': case 'D': /* digit class */ case 'g': case 'G': /* generic-backref, pos assertion */ + case 'h': case 'H': /* HORIZWS */ case 'k': case 'K': /* named backref, keep marker */ case 'N': /* named char sequence */ - case 'p': case 'P': /* unicode property */ + case 'p': case 'P': /* Unicode property */ + case 'R': /* LNBREAK */ case 's': case 'S': /* space class */ - case 'v': case 'V': /* (*PRUNE) and (*SKIP) */ + case 'v': case 'V': /* VERTWS */ case 'w': case 'W': /* word class */ case 'X': /* eXtended Unicode "combining character sequence" */ case 'z': case 'Z': /* End of line/string assertion */ @@ -7240,6 +7631,21 @@ case ANYOF_N##NAME: \ what = WORD; \ break +#define _C_C_T_NOLOC_(NAME,TEST,WORD) \ +ANYOF_##NAME: \ + for (value = 0; value < 256; value++) \ + if (TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '+'; \ + what = WORD; \ + break; \ +case ANYOF_N##NAME: \ + for (value = 0; value < 256; value++) \ + if (!TEST) \ + ANYOF_BITMAP_SET(ret, value); \ + yesno = '!'; \ + what = WORD; \ + break /* parse a class specification and produce either an ANYOF node that @@ -7252,10 +7658,10 @@ STATIC regnode * S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth) { dVAR; - register UV value = 0; register UV nextvalue; register IV prevvalue = OOB_UNICODE; register IV range = 0; + UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */ register regnode *ret; STRLEN numlen; IV namedclass; @@ -7358,6 +7764,10 @@ parseit: case 'S': namedclass = ANYOF_NSPACE; break; case 'd': namedclass = ANYOF_DIGIT; break; case 'D': namedclass = ANYOF_NDIGIT; break; + case 'v': namedclass = ANYOF_VERTWS; break; + case 'V': namedclass = ANYOF_NVERTWS; break; + case 'h': namedclass = ANYOF_HORIZWS; break; + case 'H': namedclass = ANYOF_NHORIZWS; break; case 'N': /* Handle \N{NAME} in class */ { /* We only pay attention to the first char of @@ -7536,6 +7946,8 @@ parseit: case _C_C_T_(SPACE, isSPACE(value), "SpacePerl"); case _C_C_T_(UPPER, isUPPER(value), "Upper"); case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit"); + case _C_C_T_NOLOC_(VERTWS, is_VERTWS_latin1(&value), "VertSpace"); + case _C_C_T_NOLOC_(HORIZWS, is_HORIZWS_latin1(&value), "HorizSpace"); case ANYOF_ASCII: if (LOC) ANYOF_CLASS_SET(ret, ANYOF_ASCII); @@ -7657,12 +8069,16 @@ parseit: { if (isLOWER(prevvalue)) { for (i = prevvalue; i <= ceilvalue; i++) - if (isLOWER(i)) + if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } else { for (i = prevvalue; i <= ceilvalue; i++) - if (isUPPER(i)) + if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) { + stored++; ANYOF_BITMAP_SET(ret, i); + } } } else @@ -8259,6 +8675,27 @@ S_regcurly(register const char *s) /* - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form */ +#ifdef DEBUGGING +void +S_regdump_extflags(pTHX_ const char *lead, const U32 flags) { + int bit; + int set=0; + for (bit=0; bit<32; bit++) { + if (flags & (1<program, ri->program + 1, NULL, NULL, sv, 0, 0); @@ -8340,6 +8778,7 @@ Perl_regdump(pTHX_ const regexp *r) if (r->extflags & RXf_EVAL_SEEN) PerlIO_printf(Perl_debug_log, "with eval "); PerlIO_printf(Perl_debug_log, "\n"); + DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags)); #else PERL_UNUSED_CONTEXT; PERL_UNUSED_ARG(r); @@ -8370,19 +8809,17 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) k = PL_regkind[OP(o)]; if (k == EXACT) { - SV * const dsv = sv_2mortal(newSVpvs("")); + sv_catpvs(sv, " "); /* Using is_utf8_string() (via PERL_PV_UNI_DETECT) * is a crude hack but it may be the best for now since * we have no flag "this EXACTish node was UTF-8" * --jhi */ - const char * const s = - pv_pretty(dsv, STRING(o), STR_LEN(o), 60, - PL_colors[0], PL_colors[1], - PERL_PV_ESCAPE_UNI_DETECT | - PERL_PV_PRETTY_ELIPSES | - PERL_PV_PRETTY_LTGT - ); - Perl_sv_catpvf(aTHX_ sv, " %s", s ); + pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1], + PERL_PV_ESCAPE_UNI_DETECT | + PERL_PV_PRETTY_ELLIPSES | + PERL_PV_PRETTY_LTGT | + PERL_PV_PRETTY_NOCLEAR + ); } else if (k == TRIE) { /* print the details of the trie in dumpuntil instead, as * progi->data isn't available here */ @@ -8411,7 +8848,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) int i; int rangestart = -1; U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie); - Perl_sv_catpvf(aTHX_ sv, "["); + sv_catpvs(sv, "["); for (i = 0; i <= 256; i++) { if (i < 256 && BITMAP_TEST(bitmap,i)) { if (rangestart == -1) @@ -8428,7 +8865,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) rangestart = -1; } } - Perl_sv_catpvf(aTHX_ sv, "]"); + sv_catpvs(sv, "]"); } } else if (k == CURLY) { @@ -8470,6 +8907,8 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) SVfARG((SV*)progi->data->data[ ARG( o ) ])); } else if (k == LOGICAL) Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */ + else if (k == FOLDCHAR) + Perl_sv_catpvf(aTHX_ sv, "[0x%"UVXf"]", PTR2UV(ARG(o)) ); else if (k == ANYOF) { int i, rangestart = -1; const U8 flags = ANYOF_FLAGS(o); @@ -8620,7 +9059,7 @@ Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o) } SV * -Perl_re_intuit_string(pTHX_ regexp *prog) +Perl_re_intuit_string(pTHX_ REGEXP * const prog) { /* Assume that RE_INTUIT is set */ dVAR; GET_RE_DEBUG_FLAGS_DECL; @@ -8690,13 +9129,8 @@ Perl_pregfree(pTHX_ struct regexp *r) if (r->saved_copy) SvREFCNT_dec(r->saved_copy); #endif - if (r->swap) { - Safefree(r->swap->startp); - Safefree(r->swap->endp); - Safefree(r->swap); - } - Safefree(r->startp); - Safefree(r->endp); + Safefree(r->swap); + Safefree(r->offs); Safefree(r); } @@ -8724,29 +9158,24 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { (void)ReREFCNT_inc(r); Newx(ret, 1, regexp); StructCopy(r, ret, regexp); - Newx(ret->startp, npar, I32); - Copy(r->startp, ret->startp, npar, I32); - Newx(ret->endp, npar, I32); - Copy(r->endp, ret->endp, npar, I32); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); ret->refcnt = 1; if (r->substrs) { - struct reg_substr_datum *s; - I32 i; Newx(ret->substrs, 1, struct reg_substr_data); - for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { - s->min_offset = r->substrs->data[i].min_offset; - s->max_offset = r->substrs->data[i].max_offset; - s->end_shift = r->substrs->data[i].end_shift; - s->substr = SvREFCNT_inc(r->substrs->data[i].substr); - s->utf8_substr = SvREFCNT_inc(r->substrs->data[i].utf8_substr); - } - } + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); + + SvREFCNT_inc_void(ret->anchored_substr); + SvREFCNT_inc_void(ret->anchored_utf8); + SvREFCNT_inc_void(ret->float_substr); + SvREFCNT_inc_void(ret->float_utf8); + + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + } RX_MATCH_COPIED_off(ret); #ifdef PERL_OLD_COPY_ON_WRITE - /* this is broken. */ - assert(0); - if (ret->saved_copy) - ret->saved_copy=NULL; + ret->saved_copy = NULL; #endif ret->mother_re = r; ret->swap = NULL; @@ -8768,7 +9197,7 @@ Perl_reg_temp_copy (pTHX_ struct regexp *r) { */ void -Perl_regfree_internal(pTHX_ struct regexp *r) +Perl_regfree_internal(pTHX_ REGEXP * const r) { dVAR; RXi_GET_DECL(r,ri); @@ -8887,12 +9316,11 @@ Perl_regfree_internal(pTHX_ struct regexp *r) #define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL) /* - regdupe - duplicate a regexp. - - This routine is called by sv.c's re_dup and is expected to clone a - given regexp structure. It is a no-op when not under USE_ITHREADS. - (Originally this *was* re_dup() for change history see sv.c) + re_dup - duplicate a regexp. + This routine is expected to clone a given regexp structure. It is not + compiler under USE_ITHREADS. + After all of the core data stored in struct regexp is duplicated the regexp_engine.dupe method is used to copy any private data stored in the *pprivate pointer. This allows extensions to handle @@ -8907,8 +9335,7 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) { dVAR; regexp *ret; - I32 i, npar; - struct reg_substr_datum *s; + I32 npar; if (!r) return (REGEXP *)NULL; @@ -8918,64 +9345,63 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) npar = r->nparens+1; - Newxz(ret, 1, regexp); - Newx(ret->startp, npar, I32); - Copy(r->startp, ret->startp, npar, I32); - Newx(ret->endp, npar, I32); - Copy(r->endp, ret->endp, npar, I32); - if(r->swap) { - Newx(ret->swap, 1, regexp_paren_ofs); + Newx(ret, 1, regexp); + StructCopy(r, ret, regexp); + Newx(ret->offs, npar, regexp_paren_pair); + Copy(r->offs, ret->offs, npar, regexp_paren_pair); + if(ret->swap) { /* no need to copy these */ - Newx(ret->swap->startp, npar, I32); - Newx(ret->swap->endp, npar, I32); - } else { - ret->swap = NULL; + Newx(ret->swap, npar, regexp_paren_pair); } - if (r->substrs) { + if (ret->substrs) { + /* Do it this way to avoid reading from *r after the StructCopy(). + That way, if any of the sv_dup_inc()s dislodge *r from the L1 + cache, it doesn't matter. */ + const bool anchored = r->check_substr == r->anchored_substr; Newx(ret->substrs, 1, struct reg_substr_data); - for (s = ret->substrs->data, i = 0; i < 3; i++, s++) { - s->min_offset = r->substrs->data[i].min_offset; - s->max_offset = r->substrs->data[i].max_offset; - s->end_shift = r->substrs->data[i].end_shift; - s->substr = sv_dup_inc(r->substrs->data[i].substr, param); - s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param); - } - } else - ret->substrs = NULL; + StructCopy(r->substrs, ret->substrs, struct reg_substr_data); - ret->wrapped = SAVEPVN(r->wrapped, r->wraplen+1); - ret->precomp = ret->wrapped + (r->precomp - r->wrapped); - ret->prelen = r->prelen; - ret->wraplen = r->wraplen; + ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param); + ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param); + ret->float_substr = sv_dup_inc(ret->float_substr, param); + ret->float_utf8 = sv_dup_inc(ret->float_utf8, param); - ret->mother_re = NULL; - ret->refcnt = r->refcnt; - ret->minlen = r->minlen; - ret->minlenret = r->minlenret; - ret->nparens = r->nparens; - ret->lastparen = r->lastparen; - ret->lastcloseparen = r->lastcloseparen; - ret->intflags = r->intflags; - ret->extflags = r->extflags; - - ret->sublen = r->sublen; - - ret->engine = r->engine; - - ret->paren_names = hv_dup_inc(r->paren_names, param); + /* check_substr and check_utf8, if non-NULL, point to either their + anchored or float namesakes, and don't hold a second reference. */ + + if (ret->check_substr) { + if (anchored) { + assert(r->check_utf8 == r->anchored_utf8); + ret->check_substr = ret->anchored_substr; + ret->check_utf8 = ret->anchored_utf8; + } else { + assert(r->check_substr == r->float_substr); + assert(r->check_utf8 == r->float_utf8); + ret->check_substr = ret->float_substr; + ret->check_utf8 = ret->float_utf8; + } + } + } + + ret->wrapped = SAVEPVN(ret->wrapped, ret->wraplen+1); + ret->precomp = ret->wrapped + (ret->precomp - ret->wrapped); + ret->paren_names = hv_dup_inc(ret->paren_names, param); + + if (ret->pprivate) + RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); if (RX_MATCH_COPIED(ret)) - ret->subbeg = SAVEPVN(r->subbeg, r->sublen); + ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen); else ret->subbeg = NULL; #ifdef PERL_OLD_COPY_ON_WRITE ret->saved_copy = NULL; #endif - - ret->pprivate = r->pprivate; - if (ret->pprivate) - RXi_SET(ret,CALLREGDUPE_PVT(ret,param)); + + ret->mother_re = NULL; + ret->gofs = 0; + ret->seen_evals = 0; ptr_table_store(PL_ptr_table, r, ret); return ret; @@ -8997,7 +9423,7 @@ Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param) */ void * -Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param) +Perl_regdupe_internal(pTHX_ REGEXP * const r, CLONE_PARAMS *param) { dVAR; regexp_internal *reti; @@ -9250,12 +9676,24 @@ clear_re(pTHX_ void *r) STATIC void S_put_byte(pTHX_ SV *sv, int c) { - if (isCNTRL(c) || c == 255 || !isPRINT(c)) + /* Our definition of isPRINT() ignores locales, so only bytes that are + not part of UTF-8 are considered printable. I assume that the same + holds for UTF-EBCDIC. + Also, code point 255 is not printable in either (it's E0 in EBCDIC, + which Wikipedia says: + + EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all + ones (binary 1111 1111, hexadecimal FF). It is similar, but not + identical, to the ASCII delete (DEL) or rubout control character. + ) So the old condition can be simplified to !isPRINT(c) */ + if (!isPRINT(c)) Perl_sv_catpvf(aTHX_ sv, "\\%o", c); - else if (c == '-' || c == ']' || c == '\\' || c == '^') - Perl_sv_catpvf(aTHX_ sv, "\\%c", c); - else - Perl_sv_catpvf(aTHX_ sv, "%c", c); + else { + const char string = c; + if (c == '-' || c == ']' || c == '\\' || c == '^') + sv_catpvs(sv, "\\"); + sv_catpvn(sv, &string, 1); + } } @@ -9358,7 +9796,7 @@ S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node, elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60, PL_colors[0], PL_colors[1], (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) | - PERL_PV_PRETTY_ELIPSES | + PERL_PV_PRETTY_ELLIPSES | PERL_PV_PRETTY_LTGT ) : "???"