X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/a45c7426c93f17067d1734c68cb400246e1db490..02bb3106fe98e7142548ce7815ba8632f0e4d4f8:/pp.c diff --git a/pp.c b/pp.c index 8234a77..5876cfd 100644 --- a/pp.c +++ b/pp.c @@ -9,8 +9,11 @@ */ /* - * "It's a big house this, and very peculiar. Always a bit more to discover, - * and no knowing what you'll find around a corner. And Elves, sir!" --Samwise + * 'It's a big house this, and very peculiar. Always a bit more + * to discover, and no knowing what you'll find round a corner. + * And Elves, sir!' --Samwise Gamgee + * + * [p.225 of _The Lord of the Rings_, II/i: "Many Meetings"] */ /* This file contains general pp ("push/pop") functions that execute the @@ -60,6 +63,7 @@ PP(pp_padav) { dVAR; dSP; dTARGET; I32 gimme; + assert(SvTYPE(TARG) == SVt_PVAV); if (PL_op->op_private & OPpLVAL_INTRO) if (!(PL_op->op_private & OPpPAD_STATE)) SAVECLEARSV(PAD_SVl(PL_op->op_targ)); @@ -103,6 +107,7 @@ PP(pp_padhv) dVAR; dSP; dTARGET; I32 gimme; + assert(SvTYPE(TARG) == SVt_PVHV); XPUSHs(TARG); if (PL_op->op_private & OPpLVAL_INTRO) if (!(PL_op->op_private & OPpPAD_STATE)) @@ -127,6 +132,9 @@ PP(pp_padhv) /* Translations. */ +static const char S_no_symref_sv[] = + "Can't use string (\"%" SVf32 "\"%s) as %s ref while \"strict refs\" in use"; + PP(pp_rv2gv) { dVAR; dSP; dTOPss; @@ -137,11 +145,11 @@ PP(pp_rv2gv) sv = SvRV(sv); if (SvTYPE(sv) == SVt_PVIO) { - GV * const gv = (GV*) sv_newmortal(); + GV * const gv = MUTABLE_GV(sv_newmortal()); gv_init(gv, 0, "", 0, 0); GvIOp(gv) = MUTABLE_IO(sv); SvREFCNT_inc_void_NN(sv); - sv = (SV*) gv; + sv = MUTABLE_SV(gv); } else if (!isGV_with_GP(sv)) DIE(aTHX_ "Not a GLOB reference"); @@ -158,14 +166,14 @@ PP(pp_rv2gv) * NI-S 1999/05/07 */ if (SvREADONLY(sv)) - Perl_croak(aTHX_ PL_no_modify); + Perl_croak(aTHX_ "%s", PL_no_modify); if (PL_op->op_private & OPpDEREF) { GV *gv; if (cUNOP->op_targ) { STRLEN len; SV * const namesv = PAD_SV(cUNOP->op_targ); const char * const name = SvPV(namesv, len); - gv = (GV*)newSV(0); + gv = MUTABLE_GV(newSV(0)); gv_init(gv, CopSTASH(PL_curcop), name, len, 0); } else { @@ -173,7 +181,7 @@ PP(pp_rv2gv) gv = newGVgen(name); } prepare_SV_for_RV(sv); - SvRV_set(sv, (SV*)gv); + SvRV_set(sv, MUTABLE_SV(gv)); SvROK_on(sv); SvSETMAGIC(sv); goto wasref; @@ -188,17 +196,18 @@ PP(pp_rv2gv) if ((PL_op->op_flags & OPf_SPECIAL) && !(PL_op->op_flags & OPf_MOD)) { - SV * const temp = (SV*)gv_fetchsv(sv, 0, SVt_PVGV); + SV * const temp = MUTABLE_SV(gv_fetchsv(sv, 0, SVt_PVGV)); if (!temp && (!is_gv_magical_sv(sv,0) - || !(sv = (SV*)gv_fetchsv(sv, GV_ADD, SVt_PVGV)))) { + || !(sv = MUTABLE_SV(gv_fetchsv(sv, GV_ADD, + SVt_PVGV))))) { RETSETUNDEF; } sv = temp; } else { if (PL_op->op_private & HINT_STRICT_REFS) - DIE(aTHX_ PL_no_symref_sv, sv, "a symbol"); + DIE(aTHX_ S_no_symref_sv, sv, (SvCUR(sv)>32 ? "..." : ""), "a symbol"); if ((PL_op->op_private & (OPpLVAL_INTRO|OPpDONT_INIT_GV)) == OPpDONT_INIT_GV) { /* We are the target of a coderef assignment. Return @@ -206,12 +215,12 @@ PP(pp_rv2gv) things. */ RETURN; } - sv = (SV*)gv_fetchsv(sv, GV_ADD, SVt_PVGV); + sv = MUTABLE_SV(gv_fetchsv(sv, GV_ADD, SVt_PVGV)); } } } if (PL_op->op_private & OPpLVAL_INTRO) - save_gp((GV*)sv, !(PL_op->op_flags & OPf_SPECIAL)); + save_gp(MUTABLE_GV(sv), !(PL_op->op_flags & OPf_SPECIAL)); SETs(sv); RETURN; } @@ -228,7 +237,7 @@ Perl_softref2xv(pTHX_ SV *const sv, const char *const what, if (PL_op->op_private & HINT_STRICT_REFS) { if (SvOK(sv)) - Perl_die(aTHX_ PL_no_symref_sv, sv, what); + Perl_die(aTHX_ S_no_symref_sv, sv, (SvCUR(sv)>32 ? "..." : ""), what); else Perl_die(aTHX_ PL_no_usym, what); } @@ -283,7 +292,7 @@ PP(pp_rv2sv) } } else { - gv = (GV*)sv; + gv = MUTABLE_GV(sv); if (!isGV_with_GP(gv)) { if (SvGMAGICAL(sv)) { @@ -300,11 +309,11 @@ PP(pp_rv2sv) if (PL_op->op_flags & OPf_MOD) { if (PL_op->op_private & OPpLVAL_INTRO) { if (cUNOP->op_first->op_type == OP_NULL) - sv = save_scalar((GV*)TOPs); + sv = save_scalar(MUTABLE_GV(TOPs)); else if (gv) sv = save_scalar(gv); else - Perl_croak(aTHX_ PL_no_localize_ref); + Perl_croak(aTHX_ "%s", PL_no_localize_ref); } else if (PL_op->op_private & OPpDEREF) vivify_ref(sv, PL_op->op_private & OPpDEREF); @@ -317,12 +326,19 @@ PP(pp_av2arylen) { dVAR; dSP; AV * const av = MUTABLE_AV(TOPs); - SV ** const sv = Perl_av_arylen_p(aTHX_ MUTABLE_AV(av)); - if (!*sv) { - *sv = newSV_type(SVt_PVMG); - sv_magic(*sv, (SV*)av, PERL_MAGIC_arylen, NULL, 0); + const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET; + if (lvalue) { + SV ** const sv = Perl_av_arylen_p(aTHX_ MUTABLE_AV(av)); + if (!*sv) { + *sv = newSV_type(SVt_PVMG); + sv_magic(*sv, MUTABLE_SV(av), PERL_MAGIC_arylen, NULL, 0); + } + SETs(*sv); + } else { + SETs(sv_2mortal(newSViv( + AvFILL(MUTABLE_AV(av)) + CopARYBASE_get(PL_curcop) + ))); } - SETs(*sv); RETURN; } @@ -338,8 +354,7 @@ PP(pp_pos) LvTYPE(TARG) = '.'; if (LvTARG(TARG) != sv) { - if (LvTARG(TARG)) - SvREFCNT_dec(LvTARG(TARG)); + SvREFCNT_dec(LvTARG(TARG)); LvTARG(TARG) = SvREFCNT_inc_simple(sv); } PUSHs(TARG); /* no SvSETMAGIC */ @@ -376,7 +391,7 @@ PP(pp_rv2cv) CV *cv = sv_2cv(TOPs, &stash_unused, &gv, flags); if (cv) { if (CvCLONE(cv)) - cv = MUTABLE_CV(sv_2mortal((SV*)cv_clone(cv))); + cv = MUTABLE_CV(sv_2mortal(MUTABLE_SV(cv_clone(cv)))); if ((PL_op->op_private & OPpLVAL_INTRO)) { if (gv && GvCV(gv) == cv && (gv = gv_autoload4(GvSTASH(gv), GvNAME(gv), GvNAMELEN(gv), FALSE))) cv = GvCV(gv); @@ -389,7 +404,7 @@ PP(pp_rv2cv) } else cv = MUTABLE_CV(&PL_sv_undef); - SETs((SV*)cv); + SETs(MUTABLE_SV(cv)); RETURN; } @@ -418,6 +433,10 @@ PP(pp_prototype) ret = newSVpvs_flags("_;$", SVs_TEMP); goto set; } + if (code == -KEY_keys || code == -KEY_values || code == -KEY_each) { + ret = newSVpvs_flags("\\[@%]", SVs_TEMP); + goto set; + } if (code == -KEY_readpipe) { s = "CORE::backtick"; } @@ -474,9 +493,9 @@ PP(pp_anoncode) dVAR; dSP; CV *cv = MUTABLE_CV(PAD_SV(PL_op->op_targ)); if (CvCLONE(cv)) - cv = MUTABLE_CV(sv_2mortal((SV*)cv_clone(cv))); + cv = MUTABLE_CV(sv_2mortal(MUTABLE_SV(cv_clone(cv)))); EXTEND(SP,1); - PUSHs((SV*)cv); + PUSHs(MUTABLE_SV(cv)); RETURN; } @@ -572,9 +591,9 @@ PP(pp_bless) if (ssv && !SvGMAGICAL(ssv) && !SvAMAGIC(ssv) && SvROK(ssv)) Perl_croak(aTHX_ "Attempt to bless into a reference"); ptr = SvPV_const(ssv,len); - if (len == 0 && ckWARN(WARN_MISC)) - Perl_warner(aTHX_ packWARN(WARN_MISC), - "Explicit blessing to '' (assuming package main)"); + if (len == 0) + Perl_ck_warner(aTHX_ packWARN(WARN_MISC), + "Explicit blessing to '' (assuming package main)"); stash = gv_stashpvn(ptr, len, GV_ADD); } @@ -588,7 +607,7 @@ PP(pp_gelem) SV *sv = POPs; const char * const elem = SvPV_nolen_const(sv); - GV * const gv = (GV*)POPs; + GV * const gv = MUTABLE_GV(POPs); SV * tmpRef = NULL; sv = NULL; @@ -598,33 +617,33 @@ PP(pp_gelem) switch (*elem) { case 'A': if (strEQ(second_letter, "RRAY")) - tmpRef = (SV*)GvAV(gv); + tmpRef = MUTABLE_SV(GvAV(gv)); break; case 'C': if (strEQ(second_letter, "ODE")) - tmpRef = (SV*)GvCVu(gv); + tmpRef = MUTABLE_SV(GvCVu(gv)); break; case 'F': if (strEQ(second_letter, "ILEHANDLE")) { /* finally deprecated in 5.8.0 */ deprecate("*glob{FILEHANDLE}"); - tmpRef = (SV*)GvIOp(gv); + tmpRef = MUTABLE_SV(GvIOp(gv)); } else if (strEQ(second_letter, "ORMAT")) - tmpRef = (SV*)GvFORM(gv); + tmpRef = MUTABLE_SV(GvFORM(gv)); break; case 'G': if (strEQ(second_letter, "LOB")) - tmpRef = (SV*)gv; + tmpRef = MUTABLE_SV(gv); break; case 'H': if (strEQ(second_letter, "ASH")) - tmpRef = (SV*)GvHV(gv); + tmpRef = MUTABLE_SV(GvHV(gv)); break; case 'I': if (*second_letter == 'O' && !elem[2]) - tmpRef = (SV*)GvIOp(gv); + tmpRef = MUTABLE_SV(GvIOp(gv)); break; case 'N': if (strEQ(second_letter, "AME")) @@ -809,10 +828,10 @@ PP(pp_undef) hv_undef(MUTABLE_HV(sv)); break; case SVt_PVCV: - if (cv_const_sv((const CV *)sv) && ckWARN(WARN_MISC)) - Perl_warner(aTHX_ packWARN(WARN_MISC), "Constant subroutine %s undefined", - CvANON((const CV *)sv) ? "(anonymous)" - : GvENAME(CvGV((const CV *)sv))); + if (cv_const_sv((const CV *)sv)) + Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Constant subroutine %s undefined", + CvANON((const CV *)sv) ? "(anonymous)" + : GvENAME(CvGV((const CV *)sv))); /* FALLTHROUGH */ case SVt_PVFM: { @@ -832,18 +851,19 @@ PP(pp_undef) HV *stash; /* undef *Foo:: */ - if((stash = GvHV((GV*)sv)) && HvNAME_get(stash)) + if((stash = GvHV((const GV *)sv)) && HvNAME_get(stash)) mro_isa_changed_in(stash); /* undef *Pkg::meth_name ... */ - else if(GvCVu((GV*)sv) && (stash = GvSTASH((GV*)sv)) && HvNAME_get(stash)) + else if(GvCVu((const GV *)sv) && (stash = GvSTASH((const GV *)sv)) + && HvNAME_get(stash)) mro_method_changed_in(stash); - gp_free((GV*)sv); + gp_free(MUTABLE_GV(sv)); Newxz(gp, 1, GP); GvGP(sv) = gp_ref(gp); GvSV(sv) = newSV(0); GvLINE(sv) = CopLINE(PL_curcop); - GvEGV(sv) = (GV*)sv; + GvEGV(sv) = MUTABLE_GV(sv); GvMULTI_on(sv); break; } @@ -865,7 +885,7 @@ PP(pp_predec) { dVAR; dSP; if (SvTYPE(TOPs) >= SVt_PVAV || isGV_with_GP(TOPs)) - DIE(aTHX_ PL_no_modify); + DIE(aTHX_ "%s", PL_no_modify); if (!SvREADONLY(TOPs) && SvIOK_notUV(TOPs) && !SvNOK(TOPs) && !SvPOK(TOPs) && SvIVX(TOPs) != IV_MIN) { @@ -882,7 +902,7 @@ PP(pp_postinc) { dVAR; dSP; dTARGET; if (SvTYPE(TOPs) >= SVt_PVAV || isGV_with_GP(TOPs)) - DIE(aTHX_ PL_no_modify); + DIE(aTHX_ "%s", PL_no_modify); sv_setsv(TARG, TOPs); if (!SvREADONLY(TOPs) && SvIOK_notUV(TOPs) && !SvNOK(TOPs) && !SvPOK(TOPs) && SvIVX(TOPs) != IV_MAX) @@ -904,7 +924,7 @@ PP(pp_postdec) { dVAR; dSP; dTARGET; if (SvTYPE(TOPs) >= SVt_PVAV || isGV_with_GP(TOPs)) - DIE(aTHX_ PL_no_modify); + DIE(aTHX_ "%s", PL_no_modify); sv_setsv(TARG, TOPs); if (!SvREADONLY(TOPs) && SvIOK_notUV(TOPs) && !SvNOK(TOPs) && !SvPOK(TOPs) && SvIVX(TOPs) != IV_MIN) @@ -1549,7 +1569,7 @@ PP(pp_repeat) } MARK++; repeatcpy((char*)(MARK + items), (char*)MARK, - items * sizeof(SV*), count - 1); + items * sizeof(const SV *), count - 1); SP += max; } else if (count <= 0) @@ -2548,7 +2568,7 @@ PP(pp_complement) sv_usepvn_flags(TARG, (char*)result, nchar, SV_HAS_TRAILING_NUL); SvUTF8_off(TARG); } - SETs(TARG); + SETTARG; RETURN; } #ifdef LIBERAL @@ -2564,8 +2584,7 @@ PP(pp_complement) #endif for ( ; anum > 0; anum--, tmps++) *tmps = ~*tmps; - - SETs(TARG); + SETTARG; } RETURN; } @@ -3060,15 +3079,19 @@ PP(pp_substr) { dVAR; dSP; dTARGET; SV *sv; - I32 len = 0; STRLEN curlen; STRLEN utf8_curlen; - I32 pos; - I32 rem; - I32 fail; + SV * pos_sv; + IV pos1_iv; + int pos1_is_uv; + IV pos2_iv; + int pos2_is_uv; + SV * len_sv; + IV len_iv = 0; + int len_is_uv = 1; const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET; const char *tmps; - const I32 arybase = CopARYBASE_get(PL_curcop); + const IV arybase = CopARYBASE_get(PL_curcop); SV *repl_sv = NULL; const char *repl = NULL; STRLEN repl_len; @@ -3084,9 +3107,13 @@ PP(pp_substr) repl = SvPV_const(repl_sv, repl_len); repl_is_utf8 = DO_UTF8(repl_sv) && SvCUR(repl_sv); } - len = POPi; + len_sv = POPs; + len_iv = SvIV(len_sv); + len_is_uv = SvIOK_UV(len_sv); } - pos = POPi; + pos_sv = POPs; + pos1_iv = SvIV(pos_sv); + pos1_is_uv = SvIOK_UV(pos_sv); sv = POPs; PUTBACK; if (repl_sv) { @@ -3108,52 +3135,80 @@ PP(pp_substr) else utf8_curlen = 0; - if (pos >= arybase) { - pos -= arybase; - rem = curlen-pos; - fail = rem; - if (num_args > 2) { - if (len < 0) { - rem += len; - if (rem < 0) - rem = 0; - } - else if (rem > len) - rem = len; + if ( (pos1_is_uv && arybase < 0) || (pos1_iv >= arybase) ) { /* pos >= $[ */ + UV pos1_uv = pos1_iv-arybase; + /* Overflow can occur when $[ < 0 */ + if (arybase < 0 && pos1_uv < (UV)pos1_iv) + goto bound_fail; + pos1_iv = pos1_uv; + pos1_is_uv = 1; + } + else if (pos1_is_uv ? (UV)pos1_iv > 0 : pos1_iv > 0) { + goto bound_fail; /* $[=3; substr($_,2,...) */ + } + else { /* pos < $[ */ + if (pos1_iv == 0) { /* $[=1; substr($_,0,...) */ + pos1_iv = curlen; + pos1_is_uv = 1; + } else { + if (curlen) { + pos1_is_uv = curlen-1 > ~(UV)pos1_iv; + pos1_iv += curlen; + } } } - else { - pos += curlen; - if (num_args < 3) - rem = curlen; - else if (len >= 0) { - rem = pos+len; - if (rem > (I32)curlen) - rem = curlen; + if (pos1_is_uv || pos1_iv > 0) { + if ((UV)pos1_iv > curlen) + goto bound_fail; + } + + if (num_args > 2) { + if (!len_is_uv && len_iv < 0) { + pos2_iv = curlen + len_iv; + if (curlen) + pos2_is_uv = curlen-1 > ~(UV)len_iv; + else + pos2_is_uv = 0; + } else { /* len_iv >= 0 */ + if (!pos1_is_uv && pos1_iv < 0) { + pos2_iv = pos1_iv + len_iv; + pos2_is_uv = (UV)len_iv > (UV)IV_MAX; + } else { + if ((UV)len_iv > curlen-(UV)pos1_iv) + pos2_iv = curlen; + else + pos2_iv = pos1_iv+len_iv; + pos2_is_uv = 1; + } } - else { - rem = curlen+len; - if (rem < pos) - rem = pos; - } - if (pos < 0) - pos = 0; - fail = rem; - rem -= pos; - } - if (fail < 0) { - if (lvalue || repl) - Perl_croak(aTHX_ "substr outside of string"); - if (ckWARN(WARN_SUBSTR)) - Perl_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string"); - RETPUSHUNDEF; } else { - const I32 upos = pos; - const I32 urem = rem; - if (utf8_curlen) - sv_pos_u2b(sv, &pos, &rem); - tmps += pos; + pos2_iv = curlen; + pos2_is_uv = 1; + } + + if (!pos2_is_uv && pos2_iv < 0) { + if (!pos1_is_uv && pos1_iv < 0) + goto bound_fail; + pos2_iv = 0; + } + else if (!pos1_is_uv && pos1_iv < 0) + pos1_iv = 0; + + if ((UV)pos2_iv < (UV)pos1_iv) + pos2_iv = pos1_iv; + if ((UV)pos2_iv > curlen) + pos2_iv = curlen; + + { + /* pos1_iv and pos2_iv both in 0..curlen, so the cast is safe */ + const STRLEN pos = (STRLEN)( (UV)pos1_iv ); + const STRLEN len = (STRLEN)( (UV)pos2_iv - (UV)pos1_iv ); + STRLEN byte_len = len; + STRLEN byte_pos = utf8_curlen + ? sv_pos_u2b_flags(sv, pos, &byte_len, SV_CONST_RETURN) : pos; + + tmps += byte_pos; /* we either return a PV or an LV. If the TARG hasn't been used * before, or is of that type, reuse it; otherwise use a mortal * instead. Note that LVs can have an extended lifetime, so also @@ -3167,7 +3222,7 @@ PP(pp_substr) } } - sv_setpvn(TARG, tmps, rem); + sv_setpvn(TARG, tmps, byte_len); #ifdef USE_LOCALE_COLLATE sv_unmagic(TARG, PERL_MAGIC_collxfrm); #endif @@ -3184,19 +3239,17 @@ PP(pp_substr) } if (!SvOK(sv)) sv_setpvs(sv, ""); - sv_insert_flags(sv, pos, rem, repl, repl_len, 0); + sv_insert_flags(sv, byte_pos, byte_len, repl, repl_len, 0); if (repl_is_utf8) SvUTF8_on(sv); - if (repl_sv_copy) - SvREFCNT_dec(repl_sv_copy); + SvREFCNT_dec(repl_sv_copy); } else if (lvalue) { /* it's an lvalue! */ if (!SvGMAGICAL(sv)) { if (SvROK(sv)) { SvPV_force_nolen(sv); - if (ckWARN(WARN_SUBSTR)) - Perl_warner(aTHX_ packWARN(WARN_SUBSTR), - "Attempt to use reference as lvalue in substr"); + Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), + "Attempt to use reference as lvalue in substr"); } if (isGV_with_GP(sv)) SvPV_force_nolen(sv); @@ -3213,17 +3266,22 @@ PP(pp_substr) LvTYPE(TARG) = 'x'; if (LvTARG(TARG) != sv) { - if (LvTARG(TARG)) - SvREFCNT_dec(LvTARG(TARG)); + SvREFCNT_dec(LvTARG(TARG)); LvTARG(TARG) = SvREFCNT_inc_simple(sv); } - LvTARGOFF(TARG) = upos; - LvTARGLEN(TARG) = urem; + LvTARGOFF(TARG) = pos; + LvTARGLEN(TARG) = len; } } SPAGAIN; PUSHs(TARG); /* avoid SvSETMAGIC here */ RETURN; + +bound_fail: + if (lvalue || repl) + Perl_croak(aTHX_ "substr outside of string"); + Perl_ck_warner(aTHX_ packWARN(WARN_SUBSTR), "substr outside of string"); + RETPUSHUNDEF; } PP(pp_vec) @@ -3244,8 +3302,7 @@ PP(pp_vec) } LvTYPE(TARG) = 'v'; if (LvTARG(TARG) != src) { - if (LvTARG(TARG)) - SvREFCNT_dec(LvTARG(TARG)); + SvREFCNT_dec(LvTARG(TARG)); LvTARG(TARG) = SvREFCNT_inc_simple(src); } LvTARGOFF(TARG) = offset; @@ -3371,8 +3428,7 @@ PP(pp_index) if (retval > 0 && big_utf8) sv_pos_b2u(big, &retval); } - if (temp) - SvREFCNT_dec(temp); + SvREFCNT_dec(temp); fail: PUSHi(retval + arybase); RETURN; @@ -3509,30 +3565,106 @@ PP(pp_crypt) # else sv_setpv(TARG, PerlProc_crypt(tmps, SvPV_nolen_const(right))); # endif - SETs(TARG); + SETTARG; RETURN; #else DIE(aTHX_ "The crypt() function is unimplemented due to excessive paranoia."); + return NORMAL; #endif } +/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So + * most comments below say UTF-8, when in fact they mean UTF-EBCDIC as well */ + +/* Both the characters below can be stored in two UTF-8 bytes. In UTF-8 the max + * character that 2 bytes can hold is U+07FF, and in UTF-EBCDIC it is U+03FF. + * See http://www.unicode.org/unicode/reports/tr16 */ +#define LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS 0x0178 /* Also is title case */ +#define GREEK_CAPITAL_LETTER_MU 0x039C /* Upper and title case of MICRON */ + +/* Below are several macros that generate code */ +/* Generates code to store a unicode codepoint c that is known to occupy + * exactly two UTF-8 and UTF-EBCDIC bytes; it is stored into p and p+1. */ +#define STORE_UNI_TO_UTF8_TWO_BYTE(p, c) \ + STMT_START { \ + *(p) = UTF8_TWO_BYTE_HI(c); \ + *((p)+1) = UTF8_TWO_BYTE_LO(c); \ + } STMT_END + +/* Like STORE_UNI_TO_UTF8_TWO_BYTE, but advances p to point to the next + * available byte after the two bytes */ +#define CAT_UNI_TO_UTF8_TWO_BYTE(p, c) \ + STMT_START { \ + *(p)++ = UTF8_TWO_BYTE_HI(c); \ + *((p)++) = UTF8_TWO_BYTE_LO(c); \ + } STMT_END + +/* Generates code to store the upper case of latin1 character l which is known + * to have its upper case be non-latin1 into the two bytes p and p+1. There + * are only two characters that fit this description, and this macro knows + * about them, and that the upper case values fit into two UTF-8 or UTF-EBCDIC + * bytes */ +#define STORE_NON_LATIN1_UC(p, l) \ +STMT_START { \ + if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \ + STORE_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); \ + } else { /* Must be the following letter */ \ + STORE_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU); \ + } \ +} STMT_END + +/* Like STORE_NON_LATIN1_UC, but advances p to point to the next available byte + * after the character stored */ +#define CAT_NON_LATIN1_UC(p, l) \ +STMT_START { \ + if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \ + CAT_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); \ + } else { \ + CAT_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU); \ + } \ +} STMT_END + +/* Generates code to add the two UTF-8 bytes (probably u) that are the upper + * case of l into p and p+1. u must be the result of toUPPER_LATIN1_MOD(l), + * and must require two bytes to store it. Advances p to point to the next + * available position */ +#define CAT_TWO_BYTE_UNI_UPPER_MOD(p, l, u) \ +STMT_START { \ + if ((u) != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \ + CAT_UNI_TO_UTF8_TWO_BYTE((p), (u)); /* not special, just save it */ \ + } else if (l == LATIN_SMALL_LETTER_SHARP_S) { \ + *(p)++ = 'S'; *(p)++ = 'S'; /* upper case is 'SS' */ \ + } else {/* else is one of the other two special cases */ \ + CAT_NON_LATIN1_UC((p), (l)); \ + } \ +} STMT_END + PP(pp_ucfirst) { + /* Actually is both lcfirst() and ucfirst(). Only the first character + * changes. This means that possibly we can change in-place, ie., just + * take the source and change that one character and store it back, but not + * if read-only etc, or if the length changes */ + dVAR; dSP; SV *source = TOPs; - STRLEN slen; + STRLEN slen; /* slen is the byte length of the whole SV. */ STRLEN need; SV *dest; - bool inplace = TRUE; - bool doing_utf8; + bool inplace; /* ? Convert first char only, in-place */ + bool doing_utf8 = FALSE; /* ? using utf8 */ + bool convert_source_to_utf8 = FALSE; /* ? need to convert */ const int op_type = PL_op->op_type; const U8 *s; U8 *d; U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; - STRLEN ulen; - STRLEN tculen; + STRLEN ulen; /* ulen is the byte length of the original Unicode character + * stored as UTF-8 at s. */ + STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or + * lowercased) character stored in tmpbuf. May be either + * UTF-8 or not, but in either case is the number of bytes */ SvGETMAGIC(source); if (SvOK(source)) { @@ -3544,25 +3676,187 @@ PP(pp_ucfirst) slen = 0; } - if (slen && DO_UTF8(source) && UTF8_IS_START(*s)) { + /* We may be able to get away with changing only the first character, in + * place, but not if read-only, etc. Later we may discover more reasons to + * not convert in-place. */ + inplace = SvPADTMP(source) && !SvREADONLY(source) && SvTEMP(source); + + /* First calculate what the changed first character should be. This affects + * whether we can just swap it out, leaving the rest of the string unchanged, + * or even if have to convert the dest to UTF-8 when the source isn't */ + + if (! slen) { /* If empty */ + need = 1; /* still need a trailing NUL */ + } + else if (DO_UTF8(source)) { /* Is the source utf8? */ doing_utf8 = TRUE; - utf8_to_uvchr(s, &ulen); - if (op_type == OP_UCFIRST) { - toTITLE_utf8(s, tmpbuf, &tculen); - } else { - toLOWER_utf8(s, tmpbuf, &tculen); + +/* TODO: This is #ifdefd out because it has hard-coded the standard mappings, + * and doesn't allow for the user to specify their own. When code is added to + * detect if there is a user-defined mapping in force here, and if so to use + * that, then the code below can be compiled. The detection would be a good + * thing anyway, as currently the user-defined mappings only work on utf8 + * strings, and thus depend on the chosen internal storage method, which is a + * bad thing */ +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + if (UTF8_IS_INVARIANT(*s)) { + + /* An invariant source character is either ASCII or, in EBCDIC, an + * ASCII equivalent or a caseless C1 control. In both these cases, + * the lower and upper cases of any character are also invariants + * (and title case is the same as upper case). So it is safe to + * use the simple case change macros which avoid the overhead of + * the general functions. Note that if perl were to be extended to + * do locale handling in UTF-8 strings, this wouldn't be true in, + * for example, Lithuanian or Turkic. */ + *tmpbuf = (op_type == OP_LCFIRST) ? toLOWER(*s) : toUPPER(*s); + tculen = ulen = 1; + need = slen + 1; } - /* If the two differ, we definately cannot do inplace. */ - inplace = (ulen == tculen); - need = slen + 1 - ulen + tculen; - } else { - doing_utf8 = FALSE; - need = slen + 1; + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + U8 chr; + + /* Similarly, if the source character isn't invariant but is in the + * latin1 range (or EBCDIC equivalent thereof), we have the case + * changes compiled into perl, and can avoid the overhead of the + * general functions. In this range, the characters are stored as + * two UTF-8 bytes, and it so happens that any changed-case version + * is also two bytes (in both ASCIIish and EBCDIC machines). */ + tculen = ulen = 2; + need = slen + 1; + + /* Convert the two source bytes to a single Unicode code point + * value, change case and save for below */ + chr = UTF8_ACCUMULATE(*s, *(s+1)); + if (op_type == OP_LCFIRST) { /* lower casing is easy */ + U8 lower = toLOWER_LATIN1(chr); + STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, lower); + } + else { /* ucfirst */ + U8 upper = toUPPER_LATIN1_MOD(chr); + + /* Most of the latin1 range characters are well-behaved. Their + * title and upper cases are the same, and are also in the + * latin1 range. The macro above returns their upper (hence + * title) case, and all that need be done is to save the result + * for below. However, several characters are problematic, and + * have to be handled specially. The MOD in the macro name + * above means that these tricky characters all get mapped to + * the single character LATIN_SMALL_LETTER_Y_WITH_DIAERESIS. + * This mapping saves some tests for the majority of the + * characters */ + + if (upper != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { + + /* Not tricky. Just save it. */ + STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, upper); + } + else if (chr == LATIN_SMALL_LETTER_SHARP_S) { + + /* This one is tricky because it is two characters long, + * though the UTF-8 is still two bytes, so the stored + * length doesn't change */ + *tmpbuf = 'S'; /* The UTF-8 is 'Ss' */ + *(tmpbuf + 1) = 's'; + } + else { + + /* The other two have their title and upper cases the same, + * but are tricky because the changed-case characters + * aren't in the latin1 range. They, however, do fit into + * two UTF-8 bytes */ + STORE_NON_LATIN1_UC(tmpbuf, chr); + } + } + } + else { +#endif /* end of dont want to break user-defined casing */ + + /* Here, can't short-cut the general case */ + + utf8_to_uvchr(s, &ulen); + if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen); + else toLOWER_utf8(s, tmpbuf, &tculen); + + /* we can't do in-place if the length changes. */ + if (ulen != tculen) inplace = FALSE; + need = slen + 1 - ulen + tculen; +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + } +#endif } + else { /* Non-zero length, non-UTF-8, Need to consider locale and if + * latin1 is treated as caseless. Note that a locale takes + * precedence */ + tculen = 1; /* Most characters will require one byte, but this will + * need to be overridden for the tricky ones */ + need = slen + 1; + + if (op_type == OP_LCFIRST) { + + /* lower case the first letter: no trickiness for any character */ + *tmpbuf = (IN_LOCALE_RUNTIME) ? toLOWER_LC(*s) : + ((IN_UNI_8_BIT) ? toLOWER_LATIN1(*s) : toLOWER(*s)); + } + /* is ucfirst() */ + else if (IN_LOCALE_RUNTIME) { + *tmpbuf = toUPPER_LC(*s); /* This would be a bug if any locales + * have upper and title case different + */ + } + else if (! IN_UNI_8_BIT) { + *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or + * on EBCDIC machines whatever the + * native function does */ + } + else { /* is ucfirst non-UTF-8, not in locale, and cased latin1 */ + *tmpbuf = toUPPER_LATIN1_MOD(*s); + + /* tmpbuf now has the correct title case for all latin1 characters + * except for the several ones that have tricky handling. All + * of these are mapped by the MOD to the letter below. */ + if (*tmpbuf == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { + + /* The length is going to change, with all three of these, so + * can't replace just the first character */ + inplace = FALSE; + + /* We use the original to distinguish between these tricky + * cases */ + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + /* Two character title case 'Ss', but can remain non-UTF-8 */ + need = slen + 2; + *tmpbuf = 'S'; + *(tmpbuf + 1) = 's'; /* Assert: length(tmpbuf) >= 2 */ + tculen = 2; + } + else { + + /* The other two tricky ones have their title case outside + * latin1. It is the same as their upper case. */ + doing_utf8 = TRUE; + STORE_NON_LATIN1_UC(tmpbuf, *s); + + /* The UTF-8 and UTF-EBCDIC lengths of both these characters + * and their upper cases is 2. */ + tculen = ulen = 2; + + /* The entire result will have to be in UTF-8. Assume worst + * case sizing in conversion. (all latin1 characters occupy + * at most two bytes in utf8) */ + convert_source_to_utf8 = TRUE; + need = slen * 2 + 1; + } + } /* End of is one of the three special chars */ + } /* End of use Unicode (Latin1) semantics */ + } /* End of changing the case of the first character */ - if (SvPADTMP(source) && !SvREADONLY(source) && inplace && SvTEMP(source)) { - /* We can convert in place. */ + /* Here, have the first character's changed case stored in tmpbuf. Ready to + * generate the result */ + if (inplace) { + /* We can convert in place. This means we change just the first + * character without disturbing the rest; no need to grow */ dest = source; s = d = (U8*)SvPV_force_nomg(source, slen); } else { @@ -3570,53 +3864,83 @@ PP(pp_ucfirst) dest = TARG; + /* Here, we can't convert in place; we earlier calculated how much + * space we will need, so grow to accommodate that */ SvUPGRADE(dest, SVt_PV); d = (U8*)SvGROW(dest, need); (void)SvPOK_only(dest); SETs(dest); - - inplace = FALSE; } if (doing_utf8) { - if(!inplace) { - /* slen is the byte length of the whole SV. - * ulen is the byte length of the original Unicode character - * stored as UTF-8 at s. - * tculen is the byte length of the freshly titlecased (or - * lowercased) Unicode character stored as UTF-8 at tmpbuf. - * We first set the result to be the titlecased (/lowercased) - * character, and then append the rest of the SV data. */ - sv_setpvn(dest, (char*)tmpbuf, tculen); - if (slen > ulen) - sv_catpvn(dest, (char*)(s + ulen), slen - ulen); + if (! inplace) { + if (! convert_source_to_utf8) { + + /* Here both source and dest are in UTF-8, but have to create + * the entire output. We initialize the result to be the + * title/lower cased first character, and then append the rest + * of the string. */ + sv_setpvn(dest, (char*)tmpbuf, tculen); + if (slen > ulen) { + sv_catpvn(dest, (char*)(s + ulen), slen - ulen); + } + } + else { + const U8 *const send = s + slen; + + /* Here the dest needs to be in UTF-8, but the source isn't, + * except we earlier UTF-8'd the first character of the source + * into tmpbuf. First put that into dest, and then append the + * rest of the source, converting it to UTF-8 as we go. */ + + /* Assert tculen is 2 here because the only two characters that + * get to this part of the code have 2-byte UTF-8 equivalents */ + *d++ = *tmpbuf; + *d++ = *(tmpbuf + 1); + s++; /* We have just processed the 1st char */ + + for (; s < send; s++) { + d = uvchr_to_utf8(d, *s); + } + *d = '\0'; + SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); + } SvUTF8_on(dest); } - else { + else { /* in-place UTF-8. Just overwrite the first character */ Copy(tmpbuf, d, tculen, U8); SvCUR_set(dest, need - 1); } } - else { - if (*s) { + else { /* Neither source nor dest are in or need to be UTF-8 */ + if (slen) { if (IN_LOCALE_RUNTIME) { TAINT; SvTAINTED_on(dest); - *d = (op_type == OP_UCFIRST) - ? toUPPER_LC(*s) : toLOWER_LC(*s); } - else - *d = (op_type == OP_UCFIRST) ? toUPPER(*s) : toLOWER(*s); - } else { - /* See bug #39028 */ + if (inplace) { /* in-place, only need to change the 1st char */ + *d = *tmpbuf; + } + else { /* Not in-place */ + + /* Copy the case-changed character(s) from tmpbuf */ + Copy(tmpbuf, d, tculen, U8); + d += tculen - 1; /* Code below expects d to point to final + * character stored */ + } + } + else { /* empty source */ + /* See bug #39028: Don't taint if empty */ *d = *s; } + /* In a "use bytes" we don't treat the source as UTF-8, but, still want + * the destination to retain that flag */ if (SvUTF8(source)) SvUTF8_on(dest); - if (!inplace) { + if (!inplace) { /* Finish the rest of the string, unchanged */ /* This will copy the trailing NUL */ Copy(s + 1, d + 1, slen, U8); SvCUR_set(dest, need - 1); @@ -3628,7 +3952,7 @@ PP(pp_ucfirst) /* There's so much setup/teardown code common between uc and lc, I wonder if it would be worth merging the two, and just having a switch outside each - of the three tight loops. */ + of the three tight loops. There is less and less commonality though */ PP(pp_uc) { dVAR; @@ -3643,9 +3967,16 @@ PP(pp_uc) SvGETMAGIC(source); if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) - && SvTEMP(source) && !DO_UTF8(source)) { - /* We can convert in place. */ - + && SvTEMP(source) && !DO_UTF8(source) + && (IN_LOCALE_RUNTIME || ! IN_UNI_8_BIT)) { + + /* We can convert in place. The reason we can't if in UNI_8_BIT is to + * make the loop tight, so we overwrite the source with the dest before + * looking at it, and we need to look at the original source + * afterwards. There would also need to be code added to handle + * switching to not in-place in midstream if we run into characters + * that change the length. + */ dest = source; s = d = (U8*)SvPV_force_nomg(source, len); min = len + 1; @@ -3685,48 +4016,209 @@ PP(pp_uc) const U8 *const send = s + len; U8 tmpbuf[UTF8_MAXBYTES+1]; +/* This is ifdefd out because it needs more work and thought. It isn't clear + * that we should do it. These are hard-coded rules from the Unicode standard, + * and may change. 5.2 gives new guidance on the iota subscript, for example, + * which has not been checked against this; and secondly it may be that we are + * passed a subset of the context, via a \U...\E, for example, and its not + * clear what the best approach is to that */ +#ifdef CONTEXT_DEPENDENT_CASING + bool in_iota_subscript = FALSE; +#endif + while (s < send) { - const STRLEN u = UTF8SKIP(s); - STRLEN ulen; - - toUPPER_utf8(s, tmpbuf, &ulen); - if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { - /* If the eventually required minimum size outgrows - * the available space, we need to grow. */ - const UV o = d - (U8*)SvPVX_const(dest); - - /* If someone uppercases one million U+03B0s we SvGROW() one - * million times. Or we could try guessing how much to - allocate without allocating too much. Such is life. */ - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; +#ifdef CONTEXT_DEPENDENT_CASING + if (in_iota_subscript && ! is_utf8_mark(s)) { + /* A non-mark. Time to output the iota subscript */ +#define GREEK_CAPITAL_LETTER_IOTA 0x0399 +#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345 + + CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA); + in_iota_subscript = FALSE; + } +#endif + + +/* See comments at the first instance in this file of this ifdef */ +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + + /* If the UTF-8 character is invariant, then it is in the range + * known by the standard macro; result is only one byte long */ + if (UTF8_IS_INVARIANT(*s)) { + *d++ = toUPPER(*s); + s++; + } + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + + /* Likewise, if it fits in a byte, its case change is in our + * table */ + U8 orig = UTF8_ACCUMULATE(*s, *(s+1)); + U8 upper = toUPPER_LATIN1_MOD(orig); + CAT_TWO_BYTE_UNI_UPPER_MOD(d, orig, upper); + s += 2; + } + else { +#else + { +#endif + + /* Otherwise, need the general UTF-8 case. Get the changed + * case value and copy it to the output buffer */ + + const STRLEN u = UTF8SKIP(s); + STRLEN ulen; + +#ifndef CONTEXT_DEPENDENT_CASING + toUPPER_utf8(s, tmpbuf, &ulen); +#else + const UV uv = toUPPER_utf8(s, tmpbuf, &ulen); + if (uv == GREEK_CAPITAL_LETTER_IOTA && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI) { + in_iota_subscript = TRUE; + } + else { +#endif + if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { + /* If the eventually required minimum size outgrows + * the available space, we need to grow. */ + const UV o = d - (U8*)SvPVX_const(dest); + + /* If someone uppercases one million U+03B0s we + * SvGROW() one million times. Or we could try + * guessing how much to allocate without allocating too + * much. Such is life. See corresponding comment in lc code + * for another option */ + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + Copy(tmpbuf, d, ulen, U8); + d += ulen; +#ifdef CONTEXT_DEPENDENT_CASING + } +#endif + s += u; } - Copy(tmpbuf, d, ulen, U8); - d += ulen; - s += u; } +#ifdef CONTEXT_DEPENDENT_CASING + if (in_iota_subscript) CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA); +#endif SvUTF8_on(dest); *d = '\0'; SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); - } else { + } else { /* Not UTF-8 */ if (len) { const U8 *const send = s + len; + + /* Use locale casing if in locale; regular style if not treating + * latin1 as having case; otherwise the latin1 casing. Do the + * whole thing in a tight loop, for speed, */ if (IN_LOCALE_RUNTIME) { TAINT; SvTAINTED_on(dest); for (; s < send; d++, s++) *d = toUPPER_LC(*s); } - else { - for (; s < send; d++, s++) + else if (! IN_UNI_8_BIT) { + for (; s < send; d++, s++) { *d = toUPPER(*s); + } } - } + else { + for (; s < send; d++, s++) { + *d = toUPPER_LATIN1_MOD(*s); + if (*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) continue; + + /* The mainstream case is the tight loop above. To avoid + * extra tests in that, all three characters that require + * special handling are mapped by the MOD to the one tested + * just above. + * Use the source to distinguish between the three cases */ + + if (*s == LATIN_SMALL_LETTER_SHARP_S) { + + /* uc() of this requires 2 characters, but they are + * ASCII. If not enough room, grow the string */ + if (SvLEN(dest) < ++min) { + const UV o = d - (U8*)SvPVX_const(dest); + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } + *d++ = 'S'; *d = 'S'; /* upper case is 'SS' */ + continue; /* Back to the tight loop; still in ASCII */ + } + + /* The other two special handling characters have their + * upper cases outside the latin1 range, hence need to be + * in UTF-8, so the whole result needs to be in UTF-8. So, + * here we are somewhere in the middle of processing a + * non-UTF-8 string, and realize that we will have to convert + * the whole thing to UTF-8. What to do? There are + * several possibilities. The simplest to code is to + * convert what we have so far, set a flag, and continue on + * in the loop. The flag would be tested each time through + * the loop, and if set, the next character would be + * converted to UTF-8 and stored. But, I (khw) didn't want + * to slow down the mainstream case at all for this fairly + * rare case, so I didn't want to add a test that didn't + * absolutely have to be there in the loop, besides the + * possibility that it would get too complicated for + * optimizers to deal with. Another possibility is to just + * give up, convert the source to UTF-8, and restart the + * function that way. Another possibility is to convert + * both what has already been processed and what is yet to + * come separately to UTF-8, then jump into the loop that + * handles UTF-8. But the most efficient time-wise of the + * ones I could think of is what follows, and turned out to + * not require much extra code. */ + + /* Convert what we have so far into UTF-8, telling the + * function that we know it should be converted, and to + * allow extra space for what we haven't processed yet. + * Assume the worst case space requirements for converting + * what we haven't processed so far: that it will require + * two bytes for each remaining source character, plus the + * NUL at the end. This may cause the string pointer to + * move, so re-find it. */ + + len = d - (U8*)SvPVX_const(dest); + SvCUR_set(dest, len); + len = sv_utf8_upgrade_flags_grow(dest, + SV_GMAGIC|SV_FORCE_UTF8_UPGRADE, + (send -s) * 2 + 1); + d = (U8*)SvPVX(dest) + len; + + /* And append the current character's upper case in UTF-8 */ + CAT_NON_LATIN1_UC(d, *s); + + /* Now process the remainder of the source, converting to + * upper and UTF-8. If a resulting byte is invariant in + * UTF-8, output it as-is, otherwise convert to UTF-8 and + * append it to the output. */ + + s++; + for (; s < send; s++) { + U8 upper = toUPPER_LATIN1_MOD(*s); + if UTF8_IS_INVARIANT(upper) { + *d++ = upper; + } + else { + CAT_TWO_BYTE_UNI_UPPER_MOD(d, *s, upper); + } + } + + /* Here have processed the whole source; no need to continue + * with the outer loop. Each character has been converted + * to upper case and converted to UTF-8 */ + + break; + } /* End of processing all latin1-style chars */ + } /* End of processing all chars */ + } /* End of source is not empty */ + if (source != dest) { - *d = '\0'; + *d = '\0'; /* Here d points to 1 after last char, add NUL */ SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); } - } + } /* End of isn't utf8 */ SvSETMAGIC(dest); RETURN; } @@ -3746,8 +4238,9 @@ PP(pp_lc) if (SvPADTMP(source) && !SvREADONLY(source) && !SvAMAGIC(source) && SvTEMP(source) && !DO_UTF8(source)) { - /* We can convert in place. */ + /* We can convert in place, as lowercasing anything in the latin1 range + * (or else DO_UTF8 would have been on) doesn't lengthen it */ dest = source; s = d = (U8*)SvPV_force_nomg(source, len); min = len + 1; @@ -3788,56 +4281,148 @@ PP(pp_lc) U8 tmpbuf[UTF8_MAXBYTES_CASE+1]; while (s < send) { - const STRLEN u = UTF8SKIP(s); - STRLEN ulen; - const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); +/* See comments at the first instance in this file of this ifdef */ +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + if (UTF8_IS_INVARIANT(*s)) { -#define GREEK_CAPITAL_LETTER_SIGMA 0x03A3 /* Unicode U+03A3 */ - if (uv == GREEK_CAPITAL_LETTER_SIGMA) { - NOOP; - /* - * Now if the sigma is NOT followed by - * /$ignorable_sequence$cased_letter/; - * and it IS preceded by /$cased_letter$ignorable_sequence/; - * where $ignorable_sequence is [\x{2010}\x{AD}\p{Mn}]* - * and $cased_letter is [\p{Ll}\p{Lo}\p{Lt}] - * then it should be mapped to 0x03C2, - * (GREEK SMALL LETTER FINAL SIGMA), - * instead of staying 0x03A3. - * "should be": in other words, this is not implemented yet. - * See lib/unicore/SpecialCasing.txt. + /* Invariant characters use the standard mappings compiled in. */ + *d++ = toLOWER(*s); + s++; } - if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { - /* If the eventually required minimum size outgrows - * the available space, we need to grow. */ - const UV o = d - (U8*)SvPVX_const(dest); - - /* If someone lowercases one million U+0130s we SvGROW() one - * million times. Or we could try guessing how much to - allocate without allocating too much. Such is life. */ - SvGROW(dest, min); - d = (U8*)SvPVX(dest) + o; + else if (UTF8_IS_DOWNGRADEABLE_START(*s)) { + + /* As do the ones in the Latin1 range */ + U8 lower = toLOWER_LATIN1(UTF8_ACCUMULATE(*s, *(s+1))); + CAT_UNI_TO_UTF8_TWO_BYTE(d, lower); + s += 2; } - Copy(tmpbuf, d, ulen, U8); - d += ulen; - s += u; - } + else { +#endif + /* Here, is utf8 not in Latin-1 range, have to go out and get + * the mappings from the tables. */ + + const STRLEN u = UTF8SKIP(s); + STRLEN ulen; + +/* See comments at the first instance in this file of this ifdef */ +#ifndef CONTEXT_DEPENDENT_CASING + toLOWER_utf8(s, tmpbuf, &ulen); +#else + /* Here is context dependent casing, not compiled in currently; + * needs more thought and work */ + + const UV uv = toLOWER_utf8(s, tmpbuf, &ulen); + + /* If the lower case is a small sigma, it may be that we need + * to change it to a final sigma. This happens at the end of + * a word that contains more than just this character, and only + * when we started with a capital sigma. */ + if (uv == UNICODE_GREEK_SMALL_LETTER_SIGMA && + s > send - len && /* Makes sure not the first letter */ + utf8_to_uvchr(s, 0) == UNICODE_GREEK_CAPITAL_LETTER_SIGMA + ) { + + /* We use the algorithm in: + * http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf (C + * is a CAPITAL SIGMA): If C is preceded by a sequence + * consisting of a cased letter and a case-ignorable + * sequence, and C is not followed by a sequence consisting + * of a case ignorable sequence and then a cased letter, + * then when lowercasing C, C becomes a final sigma */ + + /* To determine if this is the end of a word, need to peek + * ahead. Look at the next character */ + const U8 *peek = s + u; + + /* Skip any case ignorable characters */ + while (peek < send && is_utf8_case_ignorable(peek)) { + peek += UTF8SKIP(peek); + } + + /* If we reached the end of the string without finding any + * non-case ignorable characters, or if the next such one + * is not-cased, then we have met the conditions for it + * being a final sigma with regards to peek ahead, and so + * must do peek behind for the remaining conditions. (We + * know there is stuff behind to look at since we tested + * above that this isn't the first letter) */ + if (peek >= send || ! is_utf8_cased(peek)) { + peek = utf8_hop(s, -1); + + /* Here are at the beginning of the first character + * before the original upper case sigma. Keep backing + * up, skipping any case ignorable characters */ + while (is_utf8_case_ignorable(peek)) { + peek = utf8_hop(peek, -1); + } + + /* Here peek points to the first byte of the closest + * non-case-ignorable character before the capital + * sigma. If it is cased, then by the Unicode + * algorithm, we should use a small final sigma instead + * of what we have */ + if (is_utf8_cased(peek)) { + STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, + UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA); + } + } + } + else { /* Not a context sensitive mapping */ +#endif /* End of commented out context sensitive */ + if (ulen > u && (SvLEN(dest) < (min += ulen - u))) { + + /* If the eventually required minimum size outgrows + * the available space, we need to grow. */ + const UV o = d - (U8*)SvPVX_const(dest); + + /* If someone lowercases one million U+0130s we + * SvGROW() one million times. Or we could try + * guessing how much to allocate without allocating too + * much. Such is life. Another option would be to + * grow an extra byte or two more each time we need to + * grow, which would cut down the million to 500K, with + * little waste */ + SvGROW(dest, min); + d = (U8*)SvPVX(dest) + o; + } +#ifdef CONTEXT_DEPENDENT_CASING + } +#endif + /* Copy the newly lowercased letter to the output buffer we're + * building */ + Copy(tmpbuf, d, ulen, U8); + d += ulen; + s += u; +#ifdef GO_AHEAD_AND_BREAK_USER_DEFINED_CASE_MAPPINGS + } +#endif + } /* End of looping through the source string */ SvUTF8_on(dest); *d = '\0'; SvCUR_set(dest, d - (U8*)SvPVX_const(dest)); - } else { + } else { /* Not utf8 */ if (len) { const U8 *const send = s + len; + + /* Use locale casing if in locale; regular style if not treating + * latin1 as having case; otherwise the latin1 casing. Do the + * whole thing in a tight loop, for speed, */ if (IN_LOCALE_RUNTIME) { TAINT; SvTAINTED_on(dest); for (; s < send; d++, s++) *d = toLOWER_LC(*s); } - else { - for (; s < send; d++, s++) + else if (! IN_UNI_8_BIT) { + for (; s < send; d++, s++) { *d = toLOWER(*s); + } + } + else { + for (; s < send; d++, s++) { + *d = toLOWER_LATIN1(*s); + } } } if (source != dest) { @@ -3894,9 +4479,7 @@ PP(pp_quotemeta) } else sv_setpvn(TARG, s, len); - SETs(TARG); - if (SvSMAGICAL(TARG)) - mg_set(TARG); + SETTARG; RETURN; } @@ -3910,7 +4493,17 @@ PP(pp_aslice) if (SvTYPE(av) == SVt_PVAV) { const I32 arybase = CopARYBASE_get(PL_curcop); - if (lval && PL_op->op_private & OPpLVAL_INTRO) { + const bool localizing = PL_op->op_private & OPpLVAL_INTRO; + bool can_preserve = FALSE; + + if (localizing) { + MAGIC *mg; + HV *stash; + + can_preserve = SvCANEXISTDELETE(av); + } + + if (lval && localizing) { register SV **svp; I32 max = -1; for (svp = MARK + 1; svp <= SP; svp++) { @@ -3921,18 +4514,32 @@ PP(pp_aslice) if (max > AvMAX(av)) av_extend(av, max); } + while (++MARK <= SP) { register SV **svp; I32 elem = SvIV(*MARK); + bool preeminent = TRUE; if (elem > 0) elem -= arybase; + if (localizing && can_preserve) { + /* If we can determine whether the element exist, + * Try to preserve the existenceness of a tied array + * element by using EXISTS and DELETE if possible. + * Fallback to FETCH and STORE otherwise. */ + preeminent = av_exists(av, elem); + } + svp = av_fetch(av, elem, lval); if (lval) { if (!svp || *svp == &PL_sv_undef) DIE(aTHX_ PL_no_aelem, elem); - if (PL_op->op_private & OPpLVAL_INTRO) - save_aelem(av, elem, svp); + if (localizing) { + if (preeminent) + save_aelem(av, elem, svp); + else + SAVEADELETE(av, elem); + } } *MARK = svp ? *svp : &PL_sv_undef; } @@ -4040,12 +4647,195 @@ PP(pp_each) RETURN; } -PP(pp_delete) +STATIC OP * +S_do_delete_local(pTHX) { dVAR; dSP; const I32 gimme = GIMME_V; - const I32 discard = (gimme == G_VOID) ? G_DISCARD : 0; + const MAGIC *mg; + HV *stash; + + if (PL_op->op_private & OPpSLICE) { + dMARK; dORIGMARK; + SV * const osv = POPs; + const bool tied = SvRMAGICAL(osv) + && mg_find((const SV *)osv, PERL_MAGIC_tied); + const bool can_preserve = SvCANEXISTDELETE(osv) + || mg_find((const SV *)osv, PERL_MAGIC_env); + const U32 type = SvTYPE(osv); + if (type == SVt_PVHV) { /* hash element */ + HV * const hv = MUTABLE_HV(osv); + while (++MARK <= SP) { + SV * const keysv = *MARK; + SV *sv = NULL; + bool preeminent = TRUE; + if (can_preserve) + preeminent = hv_exists_ent(hv, keysv, 0); + if (tied) { + HE *he = hv_fetch_ent(hv, keysv, 1, 0); + if (he) + sv = HeVAL(he); + else + preeminent = FALSE; + } + else { + sv = hv_delete_ent(hv, keysv, 0, 0); + SvREFCNT_inc_simple_void(sv); /* De-mortalize */ + } + if (preeminent) { + save_helem_flags(hv, keysv, &sv, SAVEf_KEEPOLDELEM); + if (tied) { + *MARK = sv_mortalcopy(sv); + mg_clear(sv); + } else + *MARK = sv; + } + else { + SAVEHDELETE(hv, keysv); + *MARK = &PL_sv_undef; + } + } + } + else if (type == SVt_PVAV) { /* array element */ + if (PL_op->op_flags & OPf_SPECIAL) { + AV * const av = MUTABLE_AV(osv); + while (++MARK <= SP) { + I32 idx = SvIV(*MARK); + SV *sv = NULL; + bool preeminent = TRUE; + if (can_preserve) + preeminent = av_exists(av, idx); + if (tied) { + SV **svp = av_fetch(av, idx, 1); + if (svp) + sv = *svp; + else + preeminent = FALSE; + } + else { + sv = av_delete(av, idx, 0); + SvREFCNT_inc_simple_void(sv); /* De-mortalize */ + } + if (preeminent) { + save_aelem_flags(av, idx, &sv, SAVEf_KEEPOLDELEM); + if (tied) { + *MARK = sv_mortalcopy(sv); + mg_clear(sv); + } else + *MARK = sv; + } + else { + SAVEADELETE(av, idx); + *MARK = &PL_sv_undef; + } + } + } + } + else + DIE(aTHX_ "Not a HASH reference"); + if (gimme == G_VOID) + SP = ORIGMARK; + else if (gimme == G_SCALAR) { + MARK = ORIGMARK; + if (SP > MARK) + *++MARK = *SP; + else + *++MARK = &PL_sv_undef; + SP = MARK; + } + } + else { + SV * const keysv = POPs; + SV * const osv = POPs; + const bool tied = SvRMAGICAL(osv) + && mg_find((const SV *)osv, PERL_MAGIC_tied); + const bool can_preserve = SvCANEXISTDELETE(osv) + || mg_find((const SV *)osv, PERL_MAGIC_env); + const U32 type = SvTYPE(osv); + SV *sv = NULL; + if (type == SVt_PVHV) { + HV * const hv = MUTABLE_HV(osv); + bool preeminent = TRUE; + if (can_preserve) + preeminent = hv_exists_ent(hv, keysv, 0); + if (tied) { + HE *he = hv_fetch_ent(hv, keysv, 1, 0); + if (he) + sv = HeVAL(he); + else + preeminent = FALSE; + } + else { + sv = hv_delete_ent(hv, keysv, 0, 0); + SvREFCNT_inc_simple_void(sv); /* De-mortalize */ + } + if (preeminent) { + save_helem_flags(hv, keysv, &sv, SAVEf_KEEPOLDELEM); + if (tied) { + SV *nsv = sv_mortalcopy(sv); + mg_clear(sv); + sv = nsv; + } + } + else + SAVEHDELETE(hv, keysv); + } + else if (type == SVt_PVAV) { + if (PL_op->op_flags & OPf_SPECIAL) { + AV * const av = MUTABLE_AV(osv); + I32 idx = SvIV(keysv); + bool preeminent = TRUE; + if (can_preserve) + preeminent = av_exists(av, idx); + if (tied) { + SV **svp = av_fetch(av, idx, 1); + if (svp) + sv = *svp; + else + preeminent = FALSE; + } + else { + sv = av_delete(av, idx, 0); + SvREFCNT_inc_simple_void(sv); /* De-mortalize */ + } + if (preeminent) { + save_aelem_flags(av, idx, &sv, SAVEf_KEEPOLDELEM); + if (tied) { + SV *nsv = sv_mortalcopy(sv); + mg_clear(sv); + sv = nsv; + } + } + else + SAVEADELETE(av, idx); + } + else + DIE(aTHX_ "panic: avhv_delete no longer supported"); + } + else + DIE(aTHX_ "Not a HASH reference"); + if (!sv) + sv = &PL_sv_undef; + if (gimme != G_VOID) + PUSHs(sv); + } + + RETURN; +} + +PP(pp_delete) +{ + dVAR; + dSP; + I32 gimme; + I32 discard; + + if (PL_op->op_private & OPpLVAL_INTRO) + return do_delete_local(); + + gimme = GIMME_V; + discard = (gimme == G_VOID) ? G_DISCARD : 0; if (PL_op->op_private & OPpSLICE) { dMARK; dORIGMARK; @@ -4081,7 +4871,7 @@ PP(pp_delete) else { SV *keysv = POPs; HV * const hv = MUTABLE_HV(POPs); - SV *sv; + SV *sv = NULL; if (SvTYPE(hv) == SVt_PVHV) sv = hv_delete_ent(hv, keysv, discard, 0); else if (SvTYPE(hv) == SVt_PVAV) { @@ -4141,31 +4931,28 @@ PP(pp_hslice) register HV * const hv = MUTABLE_HV(POPs); register const I32 lval = (PL_op->op_flags & OPf_MOD || LVRET); const bool localizing = PL_op->op_private & OPpLVAL_INTRO; - bool other_magic = FALSE; + bool can_preserve = FALSE; if (localizing) { MAGIC *mg; HV *stash; - other_magic = mg_find((SV*)hv, PERL_MAGIC_env) || - ((mg = mg_find((SV*)hv, PERL_MAGIC_tied)) - /* Try to preserve the existenceness of a tied hash - * element by using EXISTS and DELETE if possible. - * Fallback to FETCH and STORE otherwise */ - && (stash = SvSTASH(SvRV(SvTIED_obj((SV*)hv, mg)))) - && gv_fetchmethod_autoload(stash, "EXISTS", TRUE) - && gv_fetchmethod_autoload(stash, "DELETE", TRUE)); + if (SvCANEXISTDELETE(hv) || mg_find((const SV *)hv, PERL_MAGIC_env)) + can_preserve = TRUE; } while (++MARK <= SP) { SV * const keysv = *MARK; SV **svp; HE *he; - bool preeminent = FALSE; - - if (localizing) { - preeminent = SvRMAGICAL(hv) && !other_magic ? 1 : - hv_exists_ent(hv, keysv, 0); + bool preeminent = TRUE; + + if (localizing && can_preserve) { + /* If we can determine whether the element exist, + * try to preserve the existenceness of a tied hash + * element by using EXISTS and DELETE if possible. + * Fallback to FETCH and STORE otherwise. */ + preeminent = hv_exists_ent(hv, keysv, 0); } he = hv_fetch_ent(hv, keysv, lval, 0); @@ -4177,17 +4964,12 @@ PP(pp_hslice) } if (localizing) { if (HvNAME_get(hv) && isGV(*svp)) - save_gp((GV*)*svp, !(PL_op->op_flags & OPf_SPECIAL)); - else { - if (preeminent) - save_helem(hv, keysv, svp); - else { - STRLEN keylen; - const char * const key = SvPV_const(keysv, keylen); - SAVEDELETE(hv, savepvn(key,keylen), - SvUTF8(keysv) ? -(I32)keylen : (I32)keylen); - } - } + save_gp(MUTABLE_GV(*svp), !(PL_op->op_flags & OPf_SPECIAL)); + else if (preeminent) + save_helem_flags(hv, keysv, svp, + (PL_op->op_flags & OPf_SPECIAL) ? 0 : SAVEf_SETMAGIC); + else + SAVEHDELETE(hv, keysv); } } *MARK = svp ? *svp : &PL_sv_undef; @@ -4273,7 +5055,7 @@ PP(pp_anonlist) { dVAR; dSP; dMARK; dORIGMARK; const I32 items = SP - MARK; - SV * const av = (SV *) av_make(items, MARK+1); + SV * const av = MUTABLE_SV(av_make(items, MARK+1)); SP = ORIGMARK; /* av_make() might realloc stack_sp */ mXPUSHs((PL_op->op_flags & OPf_SPECIAL) ? newRV_noinc(av) : av); @@ -4290,13 +5072,13 @@ PP(pp_anonhash) SV * const val = newSV(0); if (MARK < SP) sv_setsv(val, *++MARK); - else if (ckWARN(WARN_MISC)) - Perl_warner(aTHX_ packWARN(WARN_MISC), "Odd number of elements in anonymous hash"); + else + Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Odd number of elements in anonymous hash"); (void)hv_store_ent(hv,key,val,0); } SP = ORIGMARK; mXPUSHs((PL_op->op_flags & OPf_SPECIAL) - ? newRV_noinc((SV*) hv) : (SV*) hv); + ? newRV_noinc(MUTABLE_SV(hv)) : MUTABLE_SV(hv)); RETURN; } @@ -4312,15 +5094,15 @@ PP(pp_splice) I32 newlen; I32 after; I32 diff; - const MAGIC * const mg = SvTIED_mg((SV*)ary, PERL_MAGIC_tied); + const MAGIC * const mg = SvTIED_mg((const SV *)ary, PERL_MAGIC_tied); if (mg) { - *MARK-- = SvTIED_obj((SV*)ary, mg); + *MARK-- = SvTIED_obj(MUTABLE_SV(ary), mg); PUSHMARK(MARK); PUTBACK; - ENTER; + ENTER_with_name("call_SPLICE"); call_method("SPLICE",GIMME_V); - LEAVE; + LEAVE_with_name("call_SPLICE"); SPAGAIN; RETURN; } @@ -4351,8 +5133,7 @@ PP(pp_splice) length = AvMAX(ary) + 1; } if (offset > AvFILLp(ary) + 1) { - if (ckWARN(WARN_MISC)) - Perl_warner(aTHX_ packWARN(WARN_MISC), "splice() offset past end of array" ); + Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "splice() offset past end of array" ); offset = AvFILLp(ary) + 1; } after = AvFILLp(ary) + 1 - (offset + length); @@ -4509,18 +5290,16 @@ PP(pp_push) { dVAR; dSP; dMARK; dORIGMARK; dTARGET; register AV * const ary = MUTABLE_AV(*++MARK); - const MAGIC * const mg = SvTIED_mg((SV*)ary, PERL_MAGIC_tied); + const MAGIC * const mg = SvTIED_mg((const SV *)ary, PERL_MAGIC_tied); if (mg) { - *MARK-- = SvTIED_obj((SV*)ary, mg); + *MARK-- = SvTIED_obj(MUTABLE_SV(ary), mg); PUSHMARK(MARK); PUTBACK; - ENTER; + ENTER_with_name("call_PUSH"); call_method("PUSH",G_SCALAR|G_DISCARD); - LEAVE; + LEAVE_with_name("call_PUSH"); SPAGAIN; - SP = ORIGMARK; - PUSHi( AvFILL(ary) + 1 ); } else { PL_delaymagic = DM_DELAY; @@ -4531,11 +5310,13 @@ PP(pp_push) av_store(ary, AvFILLp(ary)+1, sv); } if (PL_delaymagic & DM_ARRAY) - mg_set((SV*)ary); + mg_set(MUTABLE_SV(ary)); PL_delaymagic = 0; - SP = ORIGMARK; - PUSHi( AvFILLp(ary) + 1 ); + } + SP = ORIGMARK; + if (OP_GIMME(PL_op, 0) != G_VOID) { + PUSHi( AvFILL(ary) + 1 ); } RETURN; } @@ -4558,15 +5339,15 @@ PP(pp_unshift) { dVAR; dSP; dMARK; dORIGMARK; dTARGET; register AV *ary = MUTABLE_AV(*++MARK); - const MAGIC * const mg = SvTIED_mg((SV*)ary, PERL_MAGIC_tied); + const MAGIC * const mg = SvTIED_mg((const SV *)ary, PERL_MAGIC_tied); if (mg) { - *MARK-- = SvTIED_obj((SV*)ary, mg); + *MARK-- = SvTIED_obj(MUTABLE_SV(ary), mg); PUSHMARK(MARK); PUTBACK; - ENTER; + ENTER_with_name("call_UNSHIFT"); call_method("UNSHIFT",G_SCALAR|G_DISCARD); - LEAVE; + LEAVE_with_name("call_UNSHIFT"); SPAGAIN; } else { @@ -4578,24 +5359,85 @@ PP(pp_unshift) } } SP = ORIGMARK; - PUSHi( AvFILL(ary) + 1 ); + if (OP_GIMME(PL_op, 0) != G_VOID) { + PUSHi( AvFILL(ary) + 1 ); + } RETURN; } PP(pp_reverse) { dVAR; dSP; dMARK; - SV ** const oldsp = SP; if (GIMME == G_ARRAY) { - MARK++; - while (MARK < SP) { - register SV * const tmp = *MARK; - *MARK++ = *SP; - *SP-- = tmp; + if (PL_op->op_private & OPpREVERSE_INPLACE) { + AV *av; + + /* See pp_sort() */ + assert( MARK+1 == SP && *SP && SvTYPE(*SP) == SVt_PVAV); + (void)POPMARK; /* remove mark associated with ex-OP_AASSIGN */ + av = MUTABLE_AV((*SP)); + /* In-place reversing only happens in void context for the array + * assignment. We don't need to push anything on the stack. */ + SP = MARK; + + if (SvMAGICAL(av)) { + I32 i, j; + register SV *tmp = sv_newmortal(); + /* For SvCANEXISTDELETE */ + HV *stash; + const MAGIC *mg; + bool can_preserve = SvCANEXISTDELETE(av); + + for (i = 0, j = av_len(av); i < j; ++i, --j) { + register SV *begin, *end; + + if (can_preserve) { + if (!av_exists(av, i)) { + if (av_exists(av, j)) { + register SV *sv = av_delete(av, j, 0); + begin = *av_fetch(av, i, TRUE); + sv_setsv_mg(begin, sv); + } + continue; + } + else if (!av_exists(av, j)) { + register SV *sv = av_delete(av, i, 0); + end = *av_fetch(av, j, TRUE); + sv_setsv_mg(end, sv); + continue; + } + } + + begin = *av_fetch(av, i, TRUE); + end = *av_fetch(av, j, TRUE); + sv_setsv(tmp, begin); + sv_setsv_mg(begin, end); + sv_setsv_mg(end, tmp); + } + } + else { + SV **begin = AvARRAY(av); + SV **end = begin + AvFILLp(av); + + while (begin < end) { + register SV * const tmp = *begin; + *begin++ = *end; + *end-- = tmp; + } + } + } + else { + SV **oldsp = SP; + MARK++; + while (MARK < SP) { + register SV * const tmp = *MARK; + *MARK++ = *SP; + *SP-- = tmp; + } + /* safe as long as stack cannot get extended in the above */ + SP = oldsp; } - /* safe as long as stack cannot get extended in the above */ - SP = oldsp; } else { register char *up; @@ -4677,11 +5519,13 @@ PP(pp_split) I32 iters = 0; const STRLEN slen = do_utf8 ? utf8_length((U8*)s, (U8*)strend) : (STRLEN)(strend - s); I32 maxiters = slen + 10; + I32 trailing_empty = 0; const char *orig; const I32 origlimit = limit; I32 realarray = 0; I32 base; const I32 gimme = GIMME_V; + bool gimme_scalar; const I32 oldsave = PL_savestack_ix; U32 make_mortal = SVs_TEMP; bool multiline = 0; @@ -4703,15 +5547,13 @@ PP(pp_split) #ifdef USE_ITHREADS if (pm->op_pmreplrootu.op_pmtargetoff) { - ary = GvAVn((GV*)PAD_SVl(pm->op_pmreplrootu.op_pmtargetoff)); + ary = GvAVn(MUTABLE_GV(PAD_SVl(pm->op_pmreplrootu.op_pmtargetoff))); } #else if (pm->op_pmreplrootu.op_pmtargetgv) { ary = GvAVn(pm->op_pmreplrootu.op_pmtargetgv); } #endif - else if (gimme != G_ARRAY) - ary = GvAVn(PL_defgv); else ary = NULL; if (ary && (gimme != G_ARRAY || (pm->op_pmflags & PMf_ONCE))) { @@ -4720,9 +5562,9 @@ PP(pp_split) av_extend(ary,0); av_clear(ary); SPAGAIN; - if ((mg = SvTIED_mg((SV*)ary, PERL_MAGIC_tied))) { + if ((mg = SvTIED_mg((const SV *)ary, PERL_MAGIC_tied))) { PUSHMARK(SP); - XPUSHs(SvTIED_obj((SV*)ary, mg)); + XPUSHs(SvTIED_obj(MUTABLE_SV(ary), mg)); } else { if (!AvREAL(ary)) { @@ -4757,6 +5599,8 @@ PP(pp_split) multiline = 1; } + gimme_scalar = gimme == G_SCALAR && !ary; + if (!limit) limit = maxiters + 2; if (RX_EXTFLAGS(rx) & RXf_WHITE) { @@ -4782,9 +5626,17 @@ PP(pp_split) if (m >= strend) break; - dstr = newSVpvn_flags(s, m-s, - (do_utf8 ? SVf_UTF8 : 0) | make_mortal); - XPUSHs(dstr); + if (gimme_scalar) { + iters++; + if (m-s == 0) + trailing_empty++; + else + trailing_empty = 0; + } else { + dstr = newSVpvn_flags(s, m-s, + (do_utf8 ? SVf_UTF8 : 0) | make_mortal); + XPUSHs(dstr); + } /* skip the whitespace found last */ if (do_utf8) @@ -4812,9 +5664,18 @@ PP(pp_split) m++; if (m >= strend) break; - dstr = newSVpvn_flags(s, m-s, - (do_utf8 ? SVf_UTF8 : 0) | make_mortal); - XPUSHs(dstr); + + if (gimme_scalar) { + iters++; + if (m-s == 0) + trailing_empty++; + else + trailing_empty = 0; + } else { + dstr = newSVpvn_flags(s, m-s, + (do_utf8 ? SVf_UTF8 : 0) | make_mortal); + XPUSHs(dstr); + } s = m; } } @@ -4827,34 +5688,49 @@ PP(pp_split) or split //, $str, $i; */ - const U32 items = limit - 1; - if (items < slen) - EXTEND(SP, items); - else - EXTEND(SP, slen); + if (!gimme_scalar) { + const U32 items = limit - 1; + if (items < slen) + EXTEND(SP, items); + else + EXTEND(SP, slen); + } if (do_utf8) { while (--limit) { /* keep track of how many bytes we skip over */ m = s; s += UTF8SKIP(s); - dstr = newSVpvn_flags(m, s-m, SVf_UTF8 | make_mortal); + if (gimme_scalar) { + iters++; + if (s-m == 0) + trailing_empty++; + else + trailing_empty = 0; + } else { + dstr = newSVpvn_flags(m, s-m, SVf_UTF8 | make_mortal); - PUSHs(dstr); + PUSHs(dstr); + } if (s >= strend) break; } } else { while (--limit) { - dstr = newSVpvn(s, 1); + if (gimme_scalar) { + iters++; + } else { + dstr = newSVpvn(s, 1); - s++; - if (make_mortal) - sv_2mortal(dstr); + if (make_mortal) + sv_2mortal(dstr); - PUSHs(dstr); + PUSHs(dstr); + } + + s++; if (s >= strend) break; @@ -4876,9 +5752,17 @@ PP(pp_split) ; if (m >= strend) break; - dstr = newSVpvn_flags(s, m-s, - (do_utf8 ? SVf_UTF8 : 0) | make_mortal); - XPUSHs(dstr); + if (gimme_scalar) { + iters++; + if (m-s == 0) + trailing_empty++; + else + trailing_empty = 0; + } else { + dstr = newSVpvn_flags(s, m-s, + (do_utf8 ? SVf_UTF8 : 0) | make_mortal); + XPUSHs(dstr); + } /* The rx->minlen is in characters but we want to step * s ahead by bytes. */ if (do_utf8) @@ -4892,9 +5776,17 @@ PP(pp_split) (m = fbm_instr((unsigned char*)s, (unsigned char*)strend, csv, multiline ? FBMrf_MULTILINE : 0)) ) { - dstr = newSVpvn_flags(s, m-s, - (do_utf8 ? SVf_UTF8 : 0) | make_mortal); - XPUSHs(dstr); + if (gimme_scalar) { + iters++; + if (m-s == 0) + trailing_empty++; + else + trailing_empty = 0; + } else { + dstr = newSVpvn_flags(s, m-s, + (do_utf8 ? SVf_UTF8 : 0) | make_mortal); + XPUSHs(dstr); + } /* The rx->minlen is in characters but we want to step * s ahead by bytes. */ if (do_utf8) @@ -4924,9 +5816,18 @@ PP(pp_split) strend = s + (strend - m); } m = RX_OFFS(rx)[0].start + orig; - dstr = newSVpvn_flags(s, m-s, - (do_utf8 ? SVf_UTF8 : 0) | make_mortal); - XPUSHs(dstr); + + if (gimme_scalar) { + iters++; + if (m-s == 0) + trailing_empty++; + else + trailing_empty = 0; + } else { + dstr = newSVpvn_flags(s, m-s, + (do_utf8 ? SVf_UTF8 : 0) | make_mortal); + XPUSHs(dstr); + } if (RX_NPARENS(rx)) { I32 i; for (i = 1; i <= (I32)RX_NPARENS(rx); i++) { @@ -4936,37 +5837,54 @@ PP(pp_split) /* japhy (07/27/01) -- the (m && s) test doesn't catch parens that didn't match -- they should be set to undef, not the empty string */ - if (m >= orig && s >= orig) { - dstr = newSVpvn_flags(s, m-s, - (do_utf8 ? SVf_UTF8 : 0) - | make_mortal); + if (gimme_scalar) { + iters++; + if (m-s == 0) + trailing_empty++; + else + trailing_empty = 0; + } else { + if (m >= orig && s >= orig) { + dstr = newSVpvn_flags(s, m-s, + (do_utf8 ? SVf_UTF8 : 0) + | make_mortal); + } + else + dstr = &PL_sv_undef; /* undef, not "" */ + XPUSHs(dstr); } - else - dstr = &PL_sv_undef; /* undef, not "" */ - XPUSHs(dstr); + } } s = RX_OFFS(rx)[0].end + orig; } } - iters = (SP - PL_stack_base) - base; + if (!gimme_scalar) { + iters = (SP - PL_stack_base) - base; + } if (iters > maxiters) DIE(aTHX_ "Split loop"); /* keep field after final delim? */ if (s < strend || (iters && origlimit)) { - const STRLEN l = strend - s; - dstr = newSVpvn_flags(s, l, (do_utf8 ? SVf_UTF8 : 0) | make_mortal); - XPUSHs(dstr); + if (!gimme_scalar) { + const STRLEN l = strend - s; + dstr = newSVpvn_flags(s, l, (do_utf8 ? SVf_UTF8 : 0) | make_mortal); + XPUSHs(dstr); + } iters++; } else if (!origlimit) { - while (iters > 0 && (!TOPs || !SvANY(TOPs) || SvCUR(TOPs) == 0)) { - if (TOPs && !make_mortal) - sv_2mortal(TOPs); - iters--; - *SP-- = &PL_sv_undef; + if (gimme_scalar) { + iters -= trailing_empty; + } else { + while (iters > 0 && (!TOPs || !SvANY(TOPs) || SvCUR(TOPs) == 0)) { + if (TOPs && !make_mortal) + sv_2mortal(TOPs); + *SP-- = &PL_sv_undef; + iters--; + } } } @@ -4977,7 +5895,7 @@ PP(pp_split) if (!mg) { if (SvSMAGICAL(ary)) { PUTBACK; - mg_set((SV*)ary); + mg_set(MUTABLE_SV(ary)); SPAGAIN; } if (gimme == G_ARRAY) { @@ -4989,9 +5907,9 @@ PP(pp_split) } else { PUTBACK; - ENTER; + ENTER_with_name("call_PUSH"); call_method("PUSH",G_SCALAR|G_DISCARD); - LEAVE; + LEAVE_with_name("call_PUSH"); SPAGAIN; if (gimme == G_ARRAY) { I32 i; @@ -5034,9 +5952,9 @@ PP(pp_lock) dSP; dTOPss; SV *retsv = sv; + assert(SvTYPE(retsv) != SVt_PVCV); SvLOCK(sv); - if (SvTYPE(retsv) == SVt_PVAV || SvTYPE(retsv) == SVt_PVHV - || SvTYPE(retsv) == SVt_PVCV) { + if (SvTYPE(retsv) == SVt_PVAV || SvTYPE(retsv) == SVt_PVHV) { retsv = refto(retsv); } SETs(retsv); @@ -5049,6 +5967,25 @@ PP(unimplemented_op) dVAR; DIE(aTHX_ "panic: unimplemented op %s (#%d) called", OP_NAME(PL_op), PL_op->op_type); + return NORMAL; +} + +PP(pp_boolkeys) +{ + dVAR; + dSP; + HV * const hv = (HV*)POPs; + + if (SvRMAGICAL(hv)) { + MAGIC * const mg = mg_find((SV*)hv, PERL_MAGIC_tied); + if (mg) { + XPUSHs(magic_scalarpack(hv, mg)); + RETURN; + } + } + + XPUSHs(boolSV(HvKEYS(hv) != 0)); + RETURN; } /*