#include "perl.h"
#include "keywords.h"
+#include "invlist_inline.h"
#include "reentr.h"
#include "regcharclass.h"
-/* XXX I can't imagine anyone who doesn't have this actually _needs_
- it, since pid_t is an integral type.
- --AD 2/20/1998
-*/
-#ifdef NEED_GETPID_PROTO
-extern Pid_t getpid (void);
-#endif
-
-/*
- * Some BSDs and Cygwin default to POSIX math instead of IEEE.
- * This switches them over to IEEE.
- */
-#if defined(LIBM_LIB_VERSION)
- _LIB_VERSION_TYPE _LIB_VERSION = _IEEE_;
-#endif
-
-static const STRLEN small_mu_len = sizeof(GREEK_SMALL_LETTER_MU_UTF8) - 1;
-static const STRLEN capital_iota_len = sizeof(GREEK_CAPITAL_LETTER_IOTA_UTF8) - 1;
-
/* variations on pp_null */
PP(pp_stub)
/* Pushy stuff. */
-/* This is also called directly by pp_lvavref. */
-PP(pp_padav)
-{
- dSP; dTARGET;
- U8 gimme;
- assert(SvTYPE(TARG) == SVt_PVAV);
- if (UNLIKELY( PL_op->op_private & OPpLVAL_INTRO ))
- if (LIKELY( !(PL_op->op_private & OPpPAD_STATE) ))
- SAVECLEARSV(PAD_SVl(PL_op->op_targ));
- EXTEND(SP, 1);
-
- if (PL_op->op_flags & OPf_REF) {
- PUSHs(TARG);
- RETURN;
- }
- else if (PL_op->op_private & OPpMAYBE_LVSUB) {
- const I32 flags = is_lvalue_sub();
- if (flags && !(flags & OPpENTERSUB_INARGS)) {
- if (GIMME_V == G_SCALAR)
- /* diag_listed_as: Can't return %s to lvalue scalar context */
- Perl_croak(aTHX_ "Can't return array to lvalue scalar context");
- PUSHs(TARG);
- RETURN;
- }
- }
-
- gimme = GIMME_V;
- if (gimme == G_ARRAY) {
- /* XXX see also S_pushav in pp_hot.c */
- const SSize_t maxarg = AvFILL(MUTABLE_AV(TARG)) + 1;
- EXTEND(SP, maxarg);
- if (SvMAGICAL(TARG)) {
- SSize_t i;
- for (i=0; i < maxarg; i++) {
- SV * const * const svp = av_fetch(MUTABLE_AV(TARG), i, FALSE);
- SP[i+1] = (svp) ? *svp : &PL_sv_undef;
- }
- }
- else {
- SSize_t i;
- for (i=0; i < maxarg; i++) {
- SV * const sv = AvARRAY((const AV *)TARG)[i];
- SP[i+1] = sv ? sv : &PL_sv_undef;
- }
- }
- SP += maxarg;
- }
- else if (gimme == G_SCALAR) {
- SV* const sv = sv_newmortal();
- const SSize_t maxarg = AvFILL(MUTABLE_AV(TARG)) + 1;
- sv_setiv(sv, maxarg);
- PUSHs(sv);
- }
- RETURN;
-}
-
-PP(pp_padhv)
-{
- dSP; dTARGET;
- U8 gimme;
- bool tied;
-
- assert(SvTYPE(TARG) == SVt_PVHV);
- XPUSHs(TARG);
- if (UNLIKELY( PL_op->op_private & OPpLVAL_INTRO ))
- if (LIKELY( !(PL_op->op_private & OPpPAD_STATE) ))
- SAVECLEARSV(PAD_SVl(PL_op->op_targ));
- if (PL_op->op_flags & OPf_REF)
- RETURN;
- else if (PL_op->op_private & OPpMAYBE_LVSUB) {
- const I32 flags = is_lvalue_sub();
- if (flags && !(flags & OPpENTERSUB_INARGS)) {
- if (GIMME_V == G_SCALAR)
- /* diag_listed_as: Can't return %s to lvalue scalar context */
- Perl_croak(aTHX_ "Can't return hash to lvalue scalar context");
- RETURN;
- }
- }
-
- gimme = GIMME_V;
- if (gimme == G_ARRAY) {
- RETURNOP(Perl_do_kv(aTHX));
- }
-
- if (PL_op->op_private & OPpPADHV_ISKEYS)
- /* 'keys %h' masquerading as '%h': reset iterator */
- (void)hv_iterinit(MUTABLE_HV(TARG));
-
- tied = SvRMAGICAL(TARG) && mg_find(TARG, PERL_MAGIC_tied);
-
- if ( ( PL_op->op_private & OPpTRUEBOOL
- || ( PL_op->op_private & OPpMAYBE_TRUEBOOL
- && block_gimme() == G_VOID )
- )
- && !tied
- )
- SETs(HvUSEDKEYS(TARG) ? &PL_sv_yes : &PL_sv_zero);
- else if (gimme == G_SCALAR) {
- if (PL_op->op_private & OPpPADHV_ISKEYS) {
- IV i;
- if (tied) {
- i = 0;
- while (hv_iternext(MUTABLE_HV(TARG)))
- i++;
- }
- else
- i = HvUSEDKEYS(MUTABLE_HV(TARG));
- (void)POPs;
- mPUSHi(i);
- }
- else {
- SV* const sv = Perl_hv_scalar(aTHX_ MUTABLE_HV(TARG));
- SETs(sv);
- }
- }
- RETURN;
-}
PP(pp_padcv)
{
*/
if (vivify_sv && sv != &PL_sv_undef) {
GV *gv;
+ HV *stash;
if (SvREADONLY(sv))
Perl_croak_no_modify();
+ gv = MUTABLE_GV(newSV(0));
+ stash = CopSTASH(PL_curcop);
+ if (SvTYPE(stash) != SVt_PVHV) stash = NULL;
if (cUNOP->op_targ) {
SV * const namesv = PAD_SV(cUNOP->op_targ);
- HV *stash = CopSTASH(PL_curcop);
- if (SvTYPE(stash) != SVt_PVHV) stash = NULL;
- gv = MUTABLE_GV(newSV(0));
gv_init_sv(gv, stash, namesv, 0);
}
else {
- const char * const name = CopSTASHPV(PL_curcop);
- gv = newGVgen_flags(name,
- HvNAMEUTF8(CopSTASH(PL_curcop)) ? SVf_UTF8 : 0 );
- SvREFCNT_inc_simple_void_NN(gv);
+ gv_init_pv(gv, stash, "__ANONIO__", 0);
}
prepare_SV_for_RV(sv);
SvRV_set(sv, MUTABLE_SV(gv));
else {
const MAGIC * const mg = mg_find_mglob(sv);
if (mg && mg->mg_len != -1) {
- dTARGET;
STRLEN i = mg->mg_len;
- if (mg->mg_flags & MGf_BYTES && DO_UTF8(sv))
- i = sv_pos_b2u_flags(sv, i, SV_GMAGIC|SV_CONST_RETURN);
- SETu(i);
+ if (PL_op->op_private & OPpTRUEBOOL)
+ SETs(i ? &PL_sv_yes : &PL_sv_zero);
+ else {
+ dTARGET;
+ if (mg->mg_flags & MGf_BYTES && DO_UTF8(sv))
+ i = sv_pos_b2u_flags(sv, i, SV_GMAGIC|SV_CONST_RETURN);
+ SETu(i);
+ }
return NORMAL;
}
SETs(&PL_sv_undef);
cv = SvTYPE(SvRV(gv)) == SVt_PVCV
? MUTABLE_CV(SvRV(gv))
: MUTABLE_CV(gv);
- }
+ }
else
cv = MUTABLE_CV(&PL_sv_undef);
SETs(MUTABLE_SV(cv));
if (SvGMAGICAL(TOPs)) SETs(sv_mortalcopy(TOPs));
if (SvPOK(TOPs) && SvCUR(TOPs) >= 7) {
const char * s = SvPVX_const(TOPs);
- if (strnEQ(s, "CORE::", 6)) {
+ if (memBEGINs(s, SvCUR(TOPs), "CORE::")) {
const int code = keyword(s + 6, SvCUR(TOPs) - 6, 1);
if (!code)
DIE(aTHX_ "Can't find an opnumber for \"%" UTF8f "\"",
else if (SvPADTMP(sv)) {
sv = newSVsv(sv);
}
+ else if (UNLIKELY(SvSMAGICAL(sv) && mg_find(sv, PERL_MAGIC_nonelem)))
+ sv_unmagic(SvREFCNT_inc_simple_NN(sv), PERL_MAGIC_nonelem);
else {
SvTEMP_off(sv);
SvREFCNT_inc_void_NN(sv);
PP(pp_trans)
{
- dSP;
+ dSP;
SV *sv;
if (PL_op->op_flags & OPf_STACKED)
PUSHs(newsv);
}
else {
- I32 i = do_trans(sv);
- mPUSHi(i);
+ Size_t i = do_trans(sv);
+ mPUSHi((UV)i);
}
RETURN;
}
else if (result <= (UV)IV_MAX)
/* answer negative, fits in IV */
SETi( -(IV)result );
- else if (result == (UV)IV_MIN)
+ else if (result == (UV)IV_MIN)
/* 2's complement assumption: special case IV_MIN */
SETi( IV_MIN );
else
/* answer negative, doesn't fit */
SETn( -(NV)result );
RETURN;
- }
+ }
}
}
float_it:
-#endif
+#endif
{
NV right = SvNV_nomg(svr);
NV left = SvNV_nomg(svl);
NV nr = SvNVX(svr);
NV result;
- if (
-#if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
- !Perl_isnan(nl) && nl == (NV)(il = (IV)nl)
- && !Perl_isnan(nr) && nr == (NV)(ir = (IV)nr)
-#else
- nl == (NV)(il = (IV)nl) && nr == (NV)(ir = (IV)nr)
-#endif
- )
+ if (lossless_NV_to_IV(nl, &il) && lossless_NV_to_IV(nr, &ir)) {
/* nothing was lost by converting to IVs */
goto do_iv;
+ }
SP--;
result = nl * nr;
# if defined(__sgi) && defined(USE_LONG_DOUBLE) && LONG_DOUBLEKIND == LONG_DOUBLE_IS_DOUBLEDOUBLE_128_BIT_BE_BE && NVSIZE == 16
alow = aiv;
auvok = TRUE; /* effectively it's a UV now */
} else {
- /* abs, auvok == false records sign */
- alow = (aiv == IV_MIN) ? (UV)aiv : (UV)(-aiv);
+ /* abs, auvok == false records sign; Using 0- here and
+ * later to silence bogus warning from MS VC */
+ alow = (UV) (0 - (UV) aiv);
}
}
if (buvok) {
buvok = TRUE; /* effectively it's a UV now */
} else {
/* abs, buvok == false records sign */
- blow = (biv == IV_MIN) ? (UV)biv : (UV)(-biv);
+ blow = (UV) (0 - (UV) biv);
}
}
can be too large to preserve, so don't need to compile the code to
test the size of UVs. */
-#ifdef SLOPPYDIVIDE
+#if defined(SLOPPYDIVIDE) || (defined(PERL_PRESERVE_IVUV) && !defined(NV_PRESERVES_UV))
# define PERL_TRY_UV_DIVIDE
/* ensure that 20./5. == 4. */
-#else
-# ifdef PERL_PRESERVE_IVUV
-# ifndef NV_PRESERVES_UV
-# define PERL_TRY_UV_DIVIDE
-# endif
-# endif
#endif
#ifdef PERL_TRY_UV_DIVIDE
right_non_neg = TRUE; /* effectively it's a UV now */
}
else {
- right = (biv == IV_MIN) ? (UV)biv : (UV)(-biv);
+ right = -(UV)biv;
}
}
/* historically undef()/0 gives a "Use of uninitialized value"
left_non_neg = TRUE; /* effectively it's a UV now */
}
else {
- left = (aiv == IV_MIN) ? (UV)aiv : (UV)(-aiv);
+ left = -(UV)aiv;
}
}
#endif
) {
/* Integer division can't overflow, but it can be imprecise. */
+
+ /* Modern compilers optimize division followed by
+ * modulo into a single div instruction */
const UV result = left / right;
- if (result * right == left) {
+ if (left % right == 0) {
SP--; /* result is valid */
if (left_non_neg == right_non_neg) {
/* signs identical, result is positive. */
right = biv;
right_neg = FALSE; /* effectively it's a UV now */
} else {
- right = (biv == IV_MIN) ? (UV)biv : (UV)(-biv);
+ right = (UV) (0 - (UV) biv);
}
}
}
left = aiv;
left_neg = FALSE; /* effectively it's a UV now */
} else {
- left = (aiv == IV_MIN) ? (UV)aiv : (UV)(-aiv);
+ left = (UV) (0 - (UV) aiv);
}
}
}
IV count;
SV *sv;
bool infnan = FALSE;
+ const U8 gimme = GIMME_V;
- if (GIMME_V == G_ARRAY && PL_op->op_private & OPpREPEAT_DOLIST) {
+ if (gimme == G_ARRAY && PL_op->op_private & OPpREPEAT_DOLIST) {
/* TODO: think of some way of doing list-repeat overloading ??? */
sv = POPs;
SvGETMAGIC(sv);
else {
dTOPss;
ASSUME(MARK + 1 == SP);
- XPUSHs(sv);
+ MEXTEND(SP, 1);
+ PUSHs(sv);
MARK[1] = &PL_sv_undef;
}
SP = MARK + 2;
"Negative repeat count does nothing");
}
- if (GIMME_V == G_ARRAY && PL_op->op_private & OPpREPEAT_DOLIST) {
+ if (gimme == G_ARRAY && PL_op->op_private & OPpREPEAT_DOLIST) {
dMARK;
const SSize_t items = SP - MARK;
const U8 mod = PL_op->op_flags & OPf_MOD;
NV nl = SvNVX(svl);
NV nr = SvNVX(svr);
- if (
-#if defined(NAN_COMPARE_BROKEN) && defined(Perl_isnan)
- !Perl_isnan(nl) && nl == (NV)(il = (IV)nl)
- && !Perl_isnan(nr) && nr == (NV)(ir = (IV)nr)
-#else
- nl == (NV)(il = (IV)nl) && nr == (NV)(ir = (IV)nr)
-#endif
- )
+ if (lossless_NV_to_IV(nl, &il) && lossless_NV_to_IV(nr, &ir)) {
/* nothing was lost by converting to IVs */
goto do_iv;
+ }
SP--;
TARGn(nl - nr, 0); /* args not GMG, so can't be tainted */
SETs(TARG);
if (aiv >= 0) {
auv = aiv;
auvok = 1; /* Now acting as a sign flag. */
- } else { /* 2s complement assumption for IV_MIN */
- auv = (aiv == IV_MIN) ? (UV)aiv : (UV)-aiv;
+ } else {
+ auv = (UV) (0 - (UV) aiv);
}
}
a_valid = 1;
UV result;
UV buv;
bool buvok = SvUOK(svr);
-
+
if (buvok)
buv = SvUVX(svr);
else {
buv = biv;
buvok = 1;
} else
- buv = (biv == IV_MIN) ? (UV)biv : (UV)-biv;
+ buv = (UV) (0 - (UV) biv);
}
/* ?uvok if value is >= 0. basically, flagged as UV if it's +ve,
else "IV" now, independent of how it came in.
shift = -shift;
left = !left;
}
- if (shift >= IV_BITS) {
+ if (UNLIKELY(shift >= IV_BITS)) {
return 0;
}
return left ? uv << shift : uv >> shift;
static IV S_iv_shift(IV iv, int shift, bool left)
{
- if (shift < 0) {
- shift = -shift;
- left = !left;
- }
- if (shift >= IV_BITS) {
- return iv < 0 && !left ? -1 : 0;
- }
- return left ? iv << shift : iv >> shift;
+ if (shift < 0) {
+ shift = -shift;
+ left = !left;
+ }
+
+ if (UNLIKELY(shift >= IV_BITS)) {
+ return iv < 0 && !left ? -1 : 0;
+ }
+
+ /* For left shifts, perl 5 has chosen to treat the value as unsigned for
+ * the * purposes of shifting, then cast back to signed. This is very
+ * different from perl 6:
+ *
+ * $ perl6 -e 'say -2 +< 5'
+ * -64
+ *
+ * $ ./perl -le 'print -2 << 5'
+ * 18446744073709551552
+ * */
+ if (left) {
+ if (iv == IV_MIN) { /* Casting this to a UV is undefined behavior */
+ return 0;
+ }
+ return (IV) (((UV) iv) << shift);
+ }
+
+ /* Here is right shift */
+ return iv >> shift;
}
#define UV_LEFT_SHIFT(uv, shift) S_uv_shift(uv, shift, TRUE)
dSP;
SV *left, *right;
- tryAMAGICbin_MG(lt_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(lt_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(gt_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(gt_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(le_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(le_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(ge_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(ge_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
dSP;
SV *left, *right;
- tryAMAGICbin_MG(ne_amg, AMGf_set|AMGf_numeric);
+ tryAMAGICbin_MG(ne_amg, AMGf_numeric);
right = POPs;
left = TOPs;
SETs(boolSV(
break;
}
- tryAMAGICbin_MG(amg_type, AMGf_set);
+ tryAMAGICbin_MG(amg_type, 0);
{
dPOPTOPssrl;
const int cmp =
PP(pp_seq)
{
dSP;
- tryAMAGICbin_MG(seq_amg, AMGf_set);
+ tryAMAGICbin_MG(seq_amg, 0);
{
dPOPTOPssrl;
SETs(boolSV(sv_eq_flags(left, right, 0)));
PP(pp_sne)
{
dSP;
- tryAMAGICbin_MG(sne_amg, AMGf_set);
+ tryAMAGICbin_MG(sne_amg, 0);
{
dPOPTOPssrl;
SETs(boolSV(!sv_eq_flags(left, right, 0)));
dSP;
SV *sv;
- tryAMAGICun_MG(not_amg, AMGf_set);
+ tryAMAGICun_MG(not_amg, 0);
sv = *PL_stack_sp;
*PL_stack_sp = boolSV(!SvTRUE_nomg_NN(sv));
return NORMAL;
if (SvUTF8(TARG)) {
if (len && ! utf8_to_bytes(tmps, &len)) {
- Perl_croak(aTHX_ fatal_above_ff_msg, PL_op_desc[PL_op->op_type]);
+ Perl_croak(aTHX_ FATAL_ABOVE_FF_MSG, PL_op_desc[PL_op->op_type]);
}
- SvCUR(TARG) = len;
+ SvCUR_set(TARG, len);
SvUTF8_off(TARG);
}
anum = len;
-#ifdef LIBERAL
{
long *tmpl;
- for ( ; anum && (unsigned long)tmps % sizeof(long); anum--, tmps++)
+ for ( ; anum && PTR2nat(tmps) % sizeof(long); anum--, tmps++)
*tmps = ~*tmps;
tmpl = (long*)tmps;
for ( ; anum >= (I32)sizeof(long); anum -= (I32)sizeof(long), tmpl++)
*tmpl = ~*tmpl;
tmps = (U8*)tmpl;
}
-#endif
+
for ( ; anum > 0; anum--, tmps++)
*tmps = ~*tmps;
}
PP(pp_i_lt)
{
dSP;
- tryAMAGICbin_MG(lt_amg, AMGf_set);
+ tryAMAGICbin_MG(lt_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left < right));
PP(pp_i_gt)
{
dSP;
- tryAMAGICbin_MG(gt_amg, AMGf_set);
+ tryAMAGICbin_MG(gt_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left > right));
PP(pp_i_le)
{
dSP;
- tryAMAGICbin_MG(le_amg, AMGf_set);
+ tryAMAGICbin_MG(le_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left <= right));
PP(pp_i_ge)
{
dSP;
- tryAMAGICbin_MG(ge_amg, AMGf_set);
+ tryAMAGICbin_MG(ge_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left >= right));
PP(pp_i_eq)
{
dSP;
- tryAMAGICbin_MG(eq_amg, AMGf_set);
+ tryAMAGICbin_MG(eq_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left == right));
PP(pp_i_ne)
{
dSP;
- tryAMAGICbin_MG(ne_amg, AMGf_set);
+ tryAMAGICbin_MG(ne_amg, 0);
{
dPOPTOPiirl_nomg;
SETs(boolSV(left != right));
{
dSP;
NV value;
-
+
if (MAXARG < 1)
{
EXTEND(SP, 1);
/* If Unicode, try to downgrade
* If not possible, croak. */
SV* const tsv = sv_2mortal(newSVsv(sv));
-
+
SvUTF8_on(tsv);
sv_utf8_downgrade(tsv, FALSE);
tmps = SvPV_const(tsv, len);
/* String stuff. */
+
PP(pp_length)
{
dSP; dTARGET;
SV * const sv = TOPs;
U32 in_bytes = IN_BYTES;
- /* simplest case shortcut */
- /* turn off SVf_UTF8 in tmp flags if HINT_BYTES on*/
+ /* Simplest case shortcut:
+ * set svflags to just the SVf_POK|SVs_GMG|SVf_UTF8 from the SV,
+ * with the SVf_UTF8 flag inverted if under 'use bytes' (HINT_BYTES
+ * set)
+ */
U32 svflags = (SvFLAGS(sv) ^ (in_bytes << 26)) & (SVf_POK|SVs_GMG|SVf_UTF8);
- STATIC_ASSERT_STMT(HINT_BYTES == 0x00000008 && SVf_UTF8 == 0x20000000 && (SVf_UTF8 == HINT_BYTES << 26));
+
+ STATIC_ASSERT_STMT(SVf_UTF8 == (HINT_BYTES << 26));
SETs(TARG);
- if(LIKELY(svflags == SVf_POK))
+ if (LIKELY(svflags == SVf_POK))
goto simple_pv;
- if(svflags & SVs_GMG)
+
+ if (svflags & SVs_GMG)
mg_get(sv);
+
if (SvOK(sv)) {
- if (!IN_BYTES) /* reread to avoid using an C auto/register */
- sv_setiv(TARG, (IV)sv_len_utf8_nomg(sv));
- else
- {
- STRLEN len;
+ STRLEN len;
+ if (!IN_BYTES) { /* reread to avoid using an C auto/register */
+ if ((SvFLAGS(sv) & (SVf_POK|SVf_UTF8)) == SVf_POK)
+ goto simple_pv;
+ if ( SvPOK(sv) && (PL_op->op_private & OPpTRUEBOOL)) {
+ /* no need to convert from bytes to chars */
+ len = SvCUR(sv);
+ goto return_bool;
+ }
+ len = sv_len_utf8_nomg(sv);
+ }
+ else {
/* unrolled SvPV_nomg_const(sv,len) */
- if(SvPOK_nog(sv)){
- simple_pv:
+ if (SvPOK_nog(sv)) {
+ simple_pv:
len = SvCUR(sv);
- } else {
+ if (PL_op->op_private & OPpTRUEBOOL) {
+ return_bool:
+ SETs(len ? &PL_sv_yes : &PL_sv_zero);
+ return NORMAL;
+ }
+ }
+ else {
(void)sv_2pv_flags(sv, &len, 0|SV_CONST_RETURN);
}
- sv_setiv(TARG, (IV)(len));
}
- } else {
+ TARGi((IV)(len), 1);
+ }
+ else {
if (!SvPADTMP(TARG)) {
+ /* OPpTARGET_MY: targ is var in '$lex = length()' */
sv_set_undef(TARG);
- } else { /* TARG is on stack at this point and is overwriten by SETs.
- This branch is the odd one out, so put TARG by default on
- stack earlier to let local SP go out of liveness sooner */
+ SvSETMAGIC(TARG);
+ }
+ else
+ /* TARG is on stack at this point and is overwriten by SETs.
+ * This branch is the odd one out, so put TARG by default on
+ * stack earlier to let local SP go out of liveness sooner */
SETs(&PL_sv_undef);
- goto no_set_magic;
- }
}
- SvSETMAGIC(TARG);
- no_set_magic:
return NORMAL; /* no putback, SP didn't move in this opcode */
}
+
/* Returns false if substring is completely outside original string.
No length is indicated by len_iv = 0 and len_is_uv = 0. len_is_uv must
always be true for an explicit 0.
convert the small string to ISO-8859-1, then there is no
way that it could be found anywhere by index. */
retval = -1;
- goto fail;
+ goto push_result;
}
/* At this point, pv is a malloc()ed string. So donate it to temp
retval = sv_pos_b2u_flags(big, retval, SV_CONST_RETURN);
}
SvREFCNT_dec(temp);
- fail:
- PUSHi(retval);
+
+ push_result:
+ /* OPpTRUEBOOL indicates an '== -1' has been optimised away */
+ if (PL_op->op_private & OPpTRUEBOOL) {
+ PUSHs( ((retval != -1) ^ cBOOL(PL_op->op_private & OPpINDEX_BOOLNEG))
+ ? &PL_sv_yes : &PL_sv_no);
+ if (PL_op->op_private & OPpTARGET_MY)
+ /* $lex = (index() == -1) */
+ sv_setsv(TARG, TOPs);
+ }
+ else
+ PUSHi(retval);
RETURN;
}
#if defined(__GLIBC__) || defined(__EMX__)
if (PL_reentrant_buffer->_crypt_struct_buffer) {
PL_reentrant_buffer->_crypt_struct_buffer->initialized = 0;
- /* work around glibc-2.2.5 bug */
+#if (defined(__GLIBC__) && __GLIBC__ == 2) && \
+ (defined(__GLIBC_MINOR__) && __GLIBC_MINOR__ >= 2 && __GLIBC_MINOR__ < 4)
+ /* work around glibc-2.2.5 bug, has been fixed at some
+ * time in glibc-2.3.X */
PL_reentrant_buffer->_crypt_struct_buffer->current_saltbits = 0;
+#endif
}
#endif
}
#endif
}
-/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So
+/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So
* most comments below say UTF-8, when in fact they mean UTF-EBCDIC as well */
STRLEN tculen; /* tculen is the byte length of the freshly titlecased (or
* lowercased) character stored in tmpbuf. May be either
* UTF-8 or not, but in either case is the number of bytes */
+ bool remove_dot_above = FALSE;
s = (const U8*)SvPV_const(source, slen);
* not convert in-place. */
inplace = !SvREADONLY(source) && SvPADTMP(source);
+#ifdef USE_LOCALE_CTYPE
+
+ if (IN_LC_RUNTIME(LC_CTYPE)) {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
+
+#endif
+
/* First calculate what the changed first character should be. This affects
* whether we can just swap it out, leaving the rest of the string unchanged,
* or even if have to convert the dest to UTF-8 when the source isn't */
if (! slen) { /* If empty */
need = 1; /* still need a trailing NUL */
ulen = 0;
+ *tmpbuf = '\0';
}
else if (DO_UTF8(source)) { /* Is the source utf8? */
doing_utf8 = TRUE;
ulen = UTF8SKIP(s);
+
if (op_type == OP_UCFIRST) {
#ifdef USE_LOCALE_CTYPE
_toTITLE_utf8_flags(s, s +slen, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE));
#endif
}
else {
+
#ifdef USE_LOCALE_CTYPE
+
_toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, IN_LC_RUNTIME(LC_CTYPE));
+
+ /* In turkic locales, lower casing an 'I' normally yields U+0131,
+ * LATIN SMALL LETTER DOTLESS I, but not if the grapheme also
+ * contains a COMBINING DOT ABOVE. Instead it is treated like
+ * LATIN CAPITAL LETTER I WITH DOT ABOVE lowercased to 'i'. The
+ * call to lowercase above has handled this. But SpecialCasing.txt
+ * says we are supposed to remove the COMBINING DOT ABOVE. We can
+ * tell if we have this situation if I ==> i in a turkic locale. */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && IN_LC_RUNTIME(LC_CTYPE)
+ && (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')))
+ {
+ /* Here, we know there was a COMBINING DOT ABOVE. We won't be
+ * able to handle this in-place. */
+ inplace = FALSE;
+
+ /* It seems likely that the DOT will immediately follow the
+ * 'I'. If so, we can remove it simply by indicating to the
+ * code below to start copying the source just beyond the DOT.
+ * We know its length is 2 */
+ if (LIKELY(memBEGINs(s + 1, s + slen, COMBINING_DOT_ABOVE_UTF8))) {
+ ulen += 2;
+ }
+ else { /* But if it doesn't follow immediately, set a flag for
+ the code below */
+ remove_dot_above = TRUE;
+ }
+ }
#else
+ PERL_UNUSED_VAR(remove_dot_above);
+
_toLOWER_utf8_flags(s, s + slen, tmpbuf, &tculen, 0);
#endif
- }
+
+ }
/* we can't do in-place if the length changes. */
if (ulen != tculen) inplace = FALSE;
}
else { /* Non-zero length, non-UTF-8, Need to consider locale and if
* latin1 is treated as caseless. Note that a locale takes
- * precedence */
+ * precedence */
ulen = 1; /* Original character is 1 byte */
tculen = 1; /* Most characters will require one byte, but this will
* need to be overridden for the tricky ones */
need = slen + 1;
- if (op_type == OP_LCFIRST) {
- /* lower case the first letter: no trickiness for any character */
#ifdef USE_LOCALE_CTYPE
- if (IN_LC_RUNTIME(LC_CTYPE)) {
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- *tmpbuf = toLOWER_LC(*s);
- }
- else
-#endif
+
+ if (IN_LC_RUNTIME(LC_CTYPE)) {
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && ( (op_type == OP_LCFIRST && UNLIKELY(*s == 'I'))
+ || (op_type == OP_UCFIRST && UNLIKELY(*s == 'i'))))
{
- *tmpbuf = (IN_UNI_8_BIT)
- ? toLOWER_LATIN1(*s)
- : toLOWER(*s);
+ if (*s == 'I') { /* lcfirst('I') */
+ tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+ tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+ }
+ else { /* ucfirst('i') */
+ tmpbuf[0] = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ tmpbuf[1] = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ tculen = 2;
+ inplace = FALSE;
+ doing_utf8 = TRUE;
+ convert_source_to_utf8 = TRUE;
+ need += variant_under_utf8_count(s, s + slen);
}
- }
-#ifdef USE_LOCALE_CTYPE
- /* is ucfirst() */
- else if (IN_LC_RUNTIME(LC_CTYPE)) {
- if (IN_UTF8_CTYPE_LOCALE) {
- goto do_uni_rules;
+ else if (op_type == OP_LCFIRST) {
+
+ /* For lc, there are no gotchas for UTF-8 locales (other than
+ * the turkish ones already handled above) */
+ *tmpbuf = toLOWER_LC(*s);
}
+ else { /* ucfirst */
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- *tmpbuf = (U8) toUPPER_LC(*s); /* This would be a bug if any
- locales have upper and title case
- different */
- }
+ /* But for uc, some characters require special handling */
+ if (IN_UTF8_CTYPE_LOCALE) {
+ goto do_uni_rules;
+ }
+
+ /* This would be a bug if any locales have upper and title case
+ * different */
+ *tmpbuf = (U8) toUPPER_LC(*s);
+ }
+ }
+ else
#endif
- else if (! IN_UNI_8_BIT) {
- *tmpbuf = toUPPER(*s); /* Returns caseless for non-ascii, or
- * on EBCDIC machines whatever the
- * native function does */
- }
+ /* Here, not in locale. If not using Unicode rules, is a simple
+ * lower/upper, depending */
+ if (! IN_UNI_8_BIT) {
+ *tmpbuf = (op_type == OP_LCFIRST)
+ ? toLOWER(*s)
+ : toUPPER(*s);
+ }
+ else if (op_type == OP_LCFIRST) {
+ /* lower case the first letter: no trickiness for any character */
+ *tmpbuf = toLOWER_LATIN1(*s);
+ }
else {
/* Here, is ucfirst non-UTF-8, not in locale (unless that locale is
- * UTF-8, which we treat as not in locale), and cased latin1 */
+ * non-turkic UTF-8, which we treat as not in locale), and cased
+ * latin1 */
UV title_ord;
#ifdef USE_LOCALE_CTYPE
do_uni_rules:
inplace = FALSE;
/* If the result won't fit in a byte, the entire result
- * will have to be in UTF-8. Assume worst case sizing in
- * conversion. (all latin1 characters occupy at most two
- * bytes in utf8) */
+ * will have to be in UTF-8. Allocate enough space for the
+ * expanded first byte, and if UTF-8, the rest of the input
+ * string, some or all of which may also expand to two
+ * bytes, plus the terminating NUL. */
if (title_ord > 255) {
doing_utf8 = TRUE;
convert_source_to_utf8 = TRUE;
- need = slen * 2 + 1;
+ need = slen
+ + variant_under_utf8_count(s, s + slen)
+ + 1;
/* The (converted) UTF-8 and UTF-EBCDIC lengths of all
- * (both) characters whose title case is above 255 is
+ * characters whose title case is above 255 is
* 2. */
ulen = 2;
}
* of the string. */
sv_setpvn(dest, (char*)tmpbuf, tculen);
if (slen > ulen) {
+
+ /* But this boolean being set means we are in a turkic
+ * locale, and there is a DOT character that needs to be
+ * removed, and it isn't immediately after the current
+ * character. Keep concatenating characters to the output
+ * one at a time, until we find the DOT, which we simply
+ * skip */
+ if (UNLIKELY(remove_dot_above)) {
+ do {
+ Size_t this_len = UTF8SKIP(s + ulen);
+
+ sv_catpvn(dest, (char*)(s + ulen), this_len);
+
+ ulen += this_len;
+ if (memBEGINs(s + ulen, s + slen, COMBINING_DOT_ABOVE_UTF8)) {
+ ulen += 2;
+ break;
+ }
+ } while (s + ulen < s + slen);
+ }
+
+ /* The rest of the string can be concatenated unchanged,
+ * all at once */
sv_catpvn(dest, (char*)(s + ulen), slen - ulen);
}
}
* into tmpbuf. First put that into dest, and then append the
* rest of the source, converting it to UTF-8 as we go. */
- /* Assert tculen is 2 here because the only two characters that
+ /* Assert tculen is 2 here because the only characters that
* get to this part of the code have 2-byte UTF-8 equivalents */
+ assert(tculen == 2);
*d++ = *tmpbuf;
*d++ = *(tmpbuf + 1);
s++; /* We have just processed the 1st char */
- for (; s < send; s++) {
- d = uvchr_to_utf8(d, *s);
- }
+ while (s < send) {
+ append_utf8_from_native_byte(*s, &d);
+ s++;
+ }
+
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
}
}
}
- else { /* Neither source nor dest are in or need to be UTF-8 */
+ else { /* Neither source nor dest are, nor need to be UTF-8 */
if (slen) {
if (inplace) { /* in-place, only need to change the 1st char */
*d = *tmpbuf;
/* In a "use bytes" we don't treat the source as UTF-8, but, still want
* the destination to retain that flag */
- if (SvUTF8(source) && ! IN_BYTES)
+ if (DO_UTF8(source))
SvUTF8_on(dest);
if (!inplace) { /* Finish the rest of the string, unchanged */
return NORMAL;
}
-/* There's so much setup/teardown code common between uc and lc, I wonder if
- it would be worth merging the two, and just having a switch outside each
- of the three tight loops. There is less and less commonality though */
PP(pp_uc)
{
+ dVAR;
dSP;
SV *source = TOPs;
STRLEN len;
SETs(dest);
}
+#ifdef USE_LOCALE_CTYPE
+
+ if (IN_LC_RUNTIME(LC_CTYPE)) {
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
+
+#endif
+
/* Overloaded values may have toggled the UTF-8 flag on source, so we need
to check DO_UTF8 again here. */
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+#define GREEK_CAPITAL_LETTER_IOTA 0x0399
+#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
/* All occurrences of these are to be moved to follow any other marks.
* This is context-dependent. We may not be passed enough context to
* move the iota subscript beyond all of them, but we do the best we can
STRLEN u;
STRLEN ulen;
UV uv;
- if (in_iota_subscript && ! _is_utf8_mark(s)) {
+ if (UNLIKELY(in_iota_subscript)) {
+ UV cp = utf8_to_uvchr_buf(s, send, NULL);
- /* A non-mark. Time to output the iota subscript */
- Copy(GREEK_CAPITAL_LETTER_IOTA_UTF8, d, capital_iota_len, U8);
- d += capital_iota_len;
- in_iota_subscript = FALSE;
+ if (! _invlist_contains_cp(PL_utf8_mark, cp)) {
+
+ /* A non-mark. Time to output the iota subscript */
+ *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);
+ *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);
+ in_iota_subscript = FALSE;
+ }
}
/* Then handle the current character. Get the changed case value
#else
uv = _toUPPER_utf8_flags(s, send, tmpbuf, &ulen, 0);
#endif
-#define GREEK_CAPITAL_LETTER_IOTA 0x0399
-#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
if (uv == GREEK_CAPITAL_LETTER_IOTA
&& utf8_to_uvchr_buf(s, send, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
{
/* If someone uppercases one million U+03B0s we SvGROW()
* one million times. Or we could try guessing how much to
- * allocate without allocating too much. Such is life.
- * See corresponding comment in lc code for another option
- * */
+ * allocate without allocating too much. But we can't
+ * really guess without examining the rest of the string.
+ * Such is life. See corresponding comment in lc code for
+ * another option */
d = o + (U8*) SvGROW(dest, min);
}
Copy(tmpbuf, d, ulen, U8);
s += u;
}
if (in_iota_subscript) {
- Copy(GREEK_CAPITAL_LETTER_IOTA_UTF8, d, capital_iota_len, U8);
- d += capital_iota_len;
+ *d++ = UTF8_TWO_BYTE_HI(GREEK_CAPITAL_LETTER_IOTA);
+ *d++ = UTF8_TWO_BYTE_LO(GREEK_CAPITAL_LETTER_IOTA);
}
SvUTF8_on(dest);
*d = '\0';
if (IN_UTF8_CTYPE_LOCALE) {
goto do_uni_rules;
}
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
for (; s < send; d++, s++)
*d = (U8) toUPPER_LC(*s);
}
do_uni_rules:
#endif
for (; s < send; d++, s++) {
+ Size_t extra;
+
*d = toUPPER_LATIN1_MOD(*s);
- if (LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) {
+ if ( LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)
+
+#ifdef USE_LOCALE_CTYPE
+
+ && (LIKELY( ! PL_in_utf8_turkic_locale
+ || ! IN_LC_RUNTIME(LC_CTYPE))
+ || *s != 'i')
+#endif
+
+ ) {
continue;
}
/* The mainstream case is the tight loop above. To avoid
- * extra tests in that, all three characters that require
- * special handling are mapped by the MOD to the one tested
- * just above.
- * Use the source to distinguish between the three cases */
+ * extra tests in that, all three characters that always
+ * require special handling are mapped by the MOD to the
+ * one tested just above. Use the source to distinguish
+ * between those cases */
#if UNICODE_MAJOR_VERSION > 2 \
|| (UNICODE_MAJOR_VERSION == 2 && UNICODE_DOT_VERSION >= 1 \
/* uc() of this requires 2 characters, but they are
* ASCII. If not enough room, grow the string */
- if (SvLEN(dest) < ++min) {
+ if (SvLEN(dest) < ++min) {
const UV o = d - (U8*)SvPVX_const(dest);
d = o + (U8*) SvGROW(dest, min);
}
}
#endif
- /* The other two special handling characters have their
+ /* The other special handling characters have their
* upper cases outside the latin1 range, hence need to be
- * in UTF-8, so the whole result needs to be in UTF-8. So,
- * here we are somewhere in the middle of processing a
- * non-UTF-8 string, and realize that we will have to convert
- * the whole thing to UTF-8. What to do? There are
- * several possibilities. The simplest to code is to
- * convert what we have so far, set a flag, and continue on
- * in the loop. The flag would be tested each time through
- * the loop, and if set, the next character would be
- * converted to UTF-8 and stored. But, I (khw) didn't want
- * to slow down the mainstream case at all for this fairly
- * rare case, so I didn't want to add a test that didn't
- * absolutely have to be there in the loop, besides the
- * possibility that it would get too complicated for
- * optimizers to deal with. Another possibility is to just
- * give up, convert the source to UTF-8, and restart the
- * function that way. Another possibility is to convert
- * both what has already been processed and what is yet to
- * come separately to UTF-8, then jump into the loop that
- * handles UTF-8. But the most efficient time-wise of the
- * ones I could think of is what follows, and turned out to
- * not require much extra code. */
-
- /* Convert what we have so far into UTF-8, telling the
+ * in UTF-8, so the whole result needs to be in UTF-8.
+ *
+ * So, here we are somewhere in the middle of processing a
+ * non-UTF-8 string, and realize that we will have to
+ * convert the whole thing to UTF-8. What to do? There
+ * are several possibilities. The simplest to code is to
+ * convert what we have so far, set a flag, and continue on
+ * in the loop. The flag would be tested each time through
+ * the loop, and if set, the next character would be
+ * converted to UTF-8 and stored. But, I (khw) didn't want
+ * to slow down the mainstream case at all for this fairly
+ * rare case, so I didn't want to add a test that didn't
+ * absolutely have to be there in the loop, besides the
+ * possibility that it would get too complicated for
+ * optimizers to deal with. Another possibility is to just
+ * give up, convert the source to UTF-8, and restart the
+ * function that way. Another possibility is to convert
+ * both what has already been processed and what is yet to
+ * come separately to UTF-8, then jump into the loop that
+ * handles UTF-8. But the most efficient time-wise of the
+ * ones I could think of is what follows, and turned out to
+ * not require much extra code.
+ *
+ * First, calculate the extra space needed for the
+ * remainder of the source needing to be in UTF-8. Except
+ * for the 'i' in Turkic locales, in UTF-8 strings, the
+ * uppercase of a character below 256 occupies the same
+ * number of bytes as the original. Therefore, the space
+ * needed is the that number plus the number of characters
+ * that become two bytes when converted to UTF-8, plus, in
+ * turkish locales, the number of 'i's. */
+
+ extra = send - s + variant_under_utf8_count(s, send);
+
+#ifdef USE_LOCALE_CTYPE
+
+ if (UNLIKELY(*s == 'i')) { /* We wouldn't get an 'i' here
+ unless are in a Turkic
+ locale */
+ const U8 * s_peek = s;
+
+ do {
+ extra++;
+
+ s_peek = (U8 *) memchr(s_peek + 1, 'i',
+ send - (s_peek + 1));
+ } while (s_peek != NULL);
+ }
+#endif
+
+ /* Convert what we have so far into UTF-8, telling the
* function that we know it should be converted, and to
* allow extra space for what we haven't processed yet.
- * Assume the worst case space requirements for converting
- * what we haven't processed so far: that it will require
- * two bytes for each remaining source character, plus the
- * NUL at the end. This may cause the string pointer to
- * move, so re-find it. */
+ *
+ * This may cause the string pointer to move, so need to
+ * save and re-find it. */
len = d - (U8*)SvPVX_const(dest);
SvCUR_set(dest, len);
len = sv_utf8_upgrade_flags_grow(dest,
SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- (send -s) * 2 + 1);
+ extra
+ + 1 /* trailing NUL */ );
d = (U8*)SvPVX(dest) + len;
- /* Now process the remainder of the source, converting to
- * upper and UTF-8. If a resulting byte is invariant in
- * UTF-8, output it as-is, otherwise convert to UTF-8 and
- * append it to the output. */
- for (; s < send; s++) {
- (void) _to_upper_title_latin1(*s, d, &len, 'S');
- d += len;
- }
+ /* Now process the remainder of the source, simultaneously
+ * converting to upper and UTF-8.
+ *
+ * To avoid extra tests in the loop body, and since the
+ * loop is so simple, split out the rare Turkic case into
+ * its own loop */
- /* Here have processed the whole source; no need to continue
- * with the outer loop. Each character has been converted
- * to upper case and converted to UTF-8 */
+#ifdef USE_LOCALE_CTYPE
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE)))
+ {
+ for (; s < send; s++) {
+ if (*s == 'i') {
+ *d++ = UTF8_TWO_BYTE_HI(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ *d++ = UTF8_TWO_BYTE_LO(LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
+ }
+ else {
+ (void) _to_upper_title_latin1(*s, d, &len, 'S');
+ d += len;
+ }
+ }
+ }
+ else
+#endif
+ for (; s < send; s++) {
+ (void) _to_upper_title_latin1(*s, d, &len, 'S');
+ d += len;
+ }
+ /* Here have processed the whole source; no need to
+ * continue with the outer loop. Each character has been
+ * converted to upper case and converted to UTF-8. */
break;
} /* End of processing all latin1-style chars */
} /* End of processing all chars */
SV *dest;
const U8 *s;
U8 *d;
+ bool has_turkic_I = FALSE;
SvGETMAGIC(source);
if ( SvPADTMP(source)
&& !SvREADONLY(source) && SvPOK(source)
- && !DO_UTF8(source)) {
+ && !DO_UTF8(source)
- /* We can convert in place, as lowercasing anything in the latin1 range
- * (or else DO_UTF8 would have been on) doesn't lengthen it */
+#ifdef USE_LOCALE_CTYPE
+
+ && ( LIKELY(! IN_LC_RUNTIME(LC_CTYPE))
+ || LIKELY(! PL_in_utf8_turkic_locale))
+
+#endif
+
+ ) {
+
+ /* We can convert in place, as, outside of Turkic UTF-8 locales,
+ * lowercasing anything in the latin1 range (or else DO_UTF8 would have
+ * been on) doesn't lengthen it. */
dest = source;
s = d = (U8*)SvPV_force_nomg(source, len);
min = len + 1;
SETs(dest);
}
+#ifdef USE_LOCALE_CTYPE
+
+ if (IN_LC_RUNTIME(LC_CTYPE)) {
+ const U8 * next_I;
+
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+
+ /* Lowercasing in a Turkic locale can cause non-UTF-8 to need to become
+ * UTF-8 for the single case of the character 'I' */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && ! DO_UTF8(source)
+ && (next_I = (U8 *) memchr(s, 'I', len)))
+ {
+ Size_t I_count = 0;
+ const U8 *const send = s + len;
+
+ do {
+ I_count++;
+
+ next_I = (U8 *) memchr(next_I + 1, 'I',
+ send - (next_I + 1));
+ } while (next_I != NULL);
+
+ /* Except for the 'I', in UTF-8 strings, the lower case of a
+ * character below 256 occupies the same number of bytes as the
+ * original. Therefore, the space needed is the original length
+ * plus I_count plus the number of characters that become two bytes
+ * when converted to UTF-8 */
+ sv_utf8_upgrade_flags_grow(dest, 0, len
+ + I_count
+ + variant_under_utf8_count(s, send)
+ + 1 /* Trailing NUL */ );
+ d = (U8*)SvPVX(dest);
+ has_turkic_I = TRUE;
+ }
+ }
+
+#endif
+
/* Overloaded values may have toggled the UTF-8 flag on source, so we need
to check DO_UTF8 again here. */
if (DO_UTF8(source)) {
const U8 *const send = s + len;
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
+ bool remove_dot_above = FALSE;
while (s < send) {
const STRLEN u = UTF8SKIP(s);
STRLEN ulen;
#ifdef USE_LOCALE_CTYPE
+
_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, IN_LC_RUNTIME(LC_CTYPE));
+
+ /* If we are in a Turkic locale, we have to do more work. As noted
+ * in the comments for lcfirst, there is a special case if a 'I'
+ * is in a grapheme with COMBINING DOT ABOVE UTF8. It turns into a
+ * 'i', and the DOT must be removed. We check for that situation,
+ * and set a flag if the DOT is there. Then each time through the
+ * loop, we have to see if we need to remove the next DOT above,
+ * and if so, do it. We know that there is a DOT because
+ * _toLOWER_utf8_flags() wouldn't have returned 'i' unless there
+ * was one in a proper position. */
+ if ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && IN_LC_RUNTIME(LC_CTYPE))
+ {
+ if ( UNLIKELY(remove_dot_above)
+ && memBEGINs(tmpbuf, sizeof(tmpbuf), COMBINING_DOT_ABOVE_UTF8))
+ {
+ s += u;
+ remove_dot_above = FALSE;
+ continue;
+ }
+ else if (UNLIKELY(*s == 'I' && tmpbuf[0] == 'i')) {
+ remove_dot_above = TRUE;
+ }
+ }
#else
+ PERL_UNUSED_VAR(remove_dot_above);
+
_toLOWER_utf8_flags(s, send, tmpbuf, &ulen, 0);
#endif
- /* Here is where we would do context-sensitive actions. See the
- * commit message for 86510fb15 for why there isn't any */
+ /* Here is where we would do context-sensitive actions for the
+ * Greek final sigma. See the commit message for 86510fb15 for why
+ * there isn't any */
if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
SvUTF8_on(dest);
*d = '\0';
SvCUR_set(dest, d - (U8*)SvPVX_const(dest));
- } else { /* Not utf8 */
+ } else { /* 'source' not utf8 */
if (len) {
const U8 *const send = s + len;
* whole thing in a tight loop, for speed, */
#ifdef USE_LOCALE_CTYPE
if (IN_LC_RUNTIME(LC_CTYPE)) {
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
- for (; s < send; d++, s++)
- *d = toLOWER_LC(*s);
+ if (LIKELY( ! has_turkic_I)) {
+ for (; s < send; d++, s++)
+ *d = toLOWER_LC(*s);
+ }
+ else { /* This is the only case where lc() converts 'dest'
+ into UTF-8 from a non-UTF-8 'source' */
+ for (; s < send; s++) {
+ if (*s == 'I') {
+ *d++ = UTF8_TWO_BYTE_HI(LATIN_SMALL_LETTER_DOTLESS_I);
+ *d++ = UTF8_TWO_BYTE_LO(LATIN_SMALL_LETTER_DOTLESS_I);
+ }
+ else {
+ append_utf8_from_native_byte(toLOWER_LATIN1(*s), &d);
+ }
+ }
+ }
}
else
#endif
#ifdef USE_LOCALE_CTYPE
/* In locale, we quote all non-ASCII Latin1 chars.
* Otherwise use the quoting rules */
-
+
IN_LC_RUNTIME(LC_CTYPE)
||
#endif
SETs(dest);
send = s + len;
+
+#ifdef USE_LOCALE_CTYPE
+
+ if ( IN_LC_RUNTIME(LC_CTYPE) ) { /* Under locale */
+ _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
+ }
+
+#endif
+
if (DO_UTF8(source)) { /* UTF-8 flagged string. */
while (s < send) {
const STRLEN u = UTF8SKIP(s);
if (IN_UTF8_CTYPE_LOCALE) {
goto do_uni_folding;
}
- _CHECK_AND_WARN_PROBLEMATIC_LOCALE;
for (; s < send; d++, s++)
*d = (U8) toFOLD_LC(*s);
}
#ifdef USE_LOCALE_CTYPE
do_uni_folding:
#endif
- /* For ASCII and the Latin-1 range, there's only two troublesome
- * folds, \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full
- * casefolding becomes 'ss'; and \x{B5} (\N{MICRO SIGN}), which
- * under any fold becomes \x{3BC} (\N{GREEK SMALL LETTER MU}) --
+ /* For ASCII and the Latin-1 range, there's potentially three
+ * troublesome folds:
+ * \x{DF} (\N{LATIN SMALL LETTER SHARP S}), which under full
+ * casefolding becomes 'ss';
+ * \x{B5} (\N{MICRO SIGN}), which under any fold becomes
+ * \x{3BC} (\N{GREEK SMALL LETTER MU})
+ * I only in Turkic locales, this folds to \x{131}
+ * \N{LATIN SMALL LETTER DOTLESS I}
* For the rest, the casefold is their lowercase. */
for (; s < send; d++, s++) {
- if (*s == MICRO_SIGN) {
+ if ( UNLIKELY(*s == MICRO_SIGN)
+#ifdef USE_LOCALE_CTYPE
+ || ( UNLIKELY(PL_in_utf8_turkic_locale)
+ && UNLIKELY(IN_LC_RUNTIME(LC_CTYPE))
+ && UNLIKELY(*s == 'I'))
+#endif
+ ) {
+ Size_t extra = send - s
+ + variant_under_utf8_count(s, send);
+
/* \N{MICRO SIGN}'s casefold is \N{GREEK SMALL LETTER MU},
- * which is outside of the latin-1 range. There's a couple
- * of ways to deal with this -- khw discusses them in
- * pp_lc/uc, so go there :) What we do here is upgrade what
- * we had already casefolded, then enter an inner loop that
- * appends the rest of the characters as UTF-8. */
+ * and 'I' in Turkic locales is \N{LATIN SMALL LETTER
+ * DOTLESS I} both of which are outside of the latin-1
+ * range. There's a couple of ways to deal with this -- khw
+ * discusses them in pp_lc/uc, so go there :) What we do
+ * here is upgrade what we had already casefolded, then
+ * enter an inner loop that appends the rest of the
+ * characters as UTF-8.
+ *
+ * First we calculate the needed size of the upgraded dest
+ * beyond what's been processed already (the upgrade
+ * function figures that out). Except for the 'I' in
+ * Turkic locales, in UTF-8 strings, the fold case of a
+ * character below 256 occupies the same number of bytes as
+ * the original (even the Sharp S). Therefore, the space
+ * needed is the number of bytes remaining plus the number
+ * of characters that become two bytes when converted to
+ * UTF-8 plus, in turkish locales, the number of 'I's */
+
+ if (UNLIKELY(*s == 'I')) {
+ const U8 * s_peek = s;
+
+ do {
+ extra++;
+
+ s_peek = (U8 *) memchr(s_peek + 1, 'i',
+ send - (s_peek + 1));
+ } while (s_peek != NULL);
+ }
+
+ /* Growing may move things, so have to save and recalculate
+ * 'd' */
len = d - (U8*)SvPVX_const(dest);
SvCUR_set(dest, len);
len = sv_utf8_upgrade_flags_grow(dest,
SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
- /* The max expansion for latin1
- * chars is 1 byte becomes 2 */
- (send -s) * 2 + 1);
+ extra
+ + 1 /* Trailing NUL */ );
d = (U8*)SvPVX(dest) + len;
- Copy(GREEK_SMALL_LETTER_MU_UTF8, d, small_mu_len, U8);
- d += small_mu_len;
+ *d++ = UTF8_TWO_BYTE_HI(GREEK_SMALL_LETTER_MU);
+ *d++ = UTF8_TWO_BYTE_LO(GREEK_SMALL_LETTER_MU);
s++;
+
for (; s < send; s++) {
STRLEN ulen;
- UV fc = _to_uni_fold_flags(*s, tmpbuf, &ulen, flags);
- if UVCHR_IS_INVARIANT(fc) {
- if (full_folding
- && *s == LATIN_SMALL_LETTER_SHARP_S)
- {
- *d++ = 's';
- *d++ = 's';
- }
- else
- *d++ = (U8)fc;
- }
- else {
- Copy(tmpbuf, d, ulen, U8);
- d += ulen;
- }
+ _to_uni_fold_flags(*s, d, &ulen, flags);
+ d += ulen;
}
break;
}
- else if (full_folding && *s == LATIN_SMALL_LETTER_SHARP_S) {
+ else if ( UNLIKELY(*s == LATIN_SMALL_LETTER_SHARP_S)
+ && full_folding)
+ {
/* Under full casefolding, LATIN SMALL LETTER SHARP S
* becomes "ss", which may require growing the SV. */
if (SvLEN(dest) < ++min) {
*(d)++ = 's';
*d = 's';
}
- else { /* If it's not one of those two, the fold is their lower
- case */
+ else { /* Else, the fold is the lower case */
*d = toLOWER_LATIN1(*s);
}
}
DIE(aTHX_ PL_no_helem_sv, SVfARG(keysv));
}
if (localizing) {
- if (HvNAME_get(hv) && isGV(*svp))
+ if (HvNAME_get(hv) && isGV_or_RVCV(*svp))
save_gp(MUTABLE_GV(*svp), !(PL_op->op_flags & OPf_SPECIAL));
else if (preeminent)
save_helem_flags(hv, keysv, svp,
{
I32 markidx = POPMARK;
if (GIMME_V != G_ARRAY) {
- SV **mark = PL_stack_base + markidx;
+ /* don't initialize mark here, EXTEND() may move the stack */
+ SV **mark;
dSP;
EXTEND(SP, 1); /* in case no arguments, as in @empty */
+ mark = PL_stack_base + markidx;
if (++MARK <= SP)
*MARK = *SP; /* unwanted list, return last item */
else
sp - mark);
}
+ if (SvREADONLY(ary))
+ Perl_croak_no_modify();
+
SP++;
if (++MARK < SP) {
i = -diff;
while (i)
dst[--i] = NULL;
-
+
if (newlen) {
Copy( tmparyval, AvARRAY(ary) + offset, newlen, SV* );
Safefree(tmparyval);
STRLEN len;
SvUTF8_off(TARG); /* decontaminate */
- if (SP - MARK > 1)
+ if (SP - MARK > 1) {
do_join(TARG, &PL_sv_no, MARK, SP);
- else if (SP > MARK)
+ SP = MARK + 1;
+ SETs(TARG);
+ } else if (SP > MARK) {
sv_setsv(TARG, *SP);
- else {
+ SETs(TARG);
+ } else {
sv_setsv(TARG, DEFSV);
- EXTEND(SP, 1);
+ XPUSHs(TARG);
}
up = SvPV_force(TARG, len);
}
(void)SvPOK_only_UTF8(TARG);
}
- SP = MARK + 1;
- SETTARG;
}
RETURN;
}
} else {
while (m < strend && !isSPACE(*m))
++m;
- }
+ }
if (m >= strend)
break;
} else {
while (s < strend && isSPACE(*s))
++s;
- }
+ }
}
}
else if (RX_EXTFLAGS(rx) & RXf_START_ONLY) {
/* The rx->minlen is in characters but we want to step
* s ahead by bytes. */
if (do_utf8)
- s = (char*)utf8_hop((U8*)m, len);
+ s = (char*)utf8_hop_forward((U8*) m, len, (U8*) strend);
else
s = m + len; /* Fake \n at the end */
}
/* The rx->minlen is in characters but we want to step
* s ahead by bytes. */
if (do_utf8)
- s = (char*)utf8_hop((U8*)m, len);
+ s = (char*)utf8_hop_forward((U8*)m, len, (U8 *) strend);
else
s = m + len; /* Fake \n at the end */
}
}
}
else if (arg) {
- S_localise_gv_slot(aTHX_ (GV *)arg,
+ S_localise_gv_slot(aTHX_ (GV *)arg,
PL_op->op_private & OPpLVREF_TYPE);
}
else if (!(PL_op->op_private & OPpPAD_STATE))
while (++MARK <= SP) {
SV * const elemsv = *MARK;
- if (SvTYPE(av) == SVt_PVAV)
- S_localise_aelem_lval(aTHX_ av, elemsv, can_preserve);
- else
- S_localise_helem_lval(aTHX_ (HV *)av, elemsv, can_preserve);
+ if (UNLIKELY(localizing)) {
+ if (SvTYPE(av) == SVt_PVAV)
+ S_localise_aelem_lval(aTHX_ av, elemsv, can_preserve);
+ else
+ S_localise_helem_lval(aTHX_ (HV *)av, elemsv, can_preserve);
+ }
*MARK = sv_2mortal(newSV_type(SVt_PVMG));
sv_magic(*MARK,(SV *)av,PERL_MAGIC_lvref,(char *)elemsv,HEf_SVKEY);
}
* for $: (OPf_STACKED ? *sp : $_[N])
* for @/%: @_[N..$#_]
*
- * It's equivalent to
+ * It's equivalent to
* my $foo = $_[N];
* or
* my $foo = (value-on-stack)
return sv;
}
-/* Check a a subs arguments - i.e. that it has the correct number of args
+/* Check a sub's arguments - i.e. that it has the correct number of args
* (and anything else we might think of in future). Typically used with
* signatured subs.
*/
PP(pp_argcheck)
{
OP * const o = PL_op;
- UNOP_AUX_item *aux = cUNOP_AUXo->op_aux;
- IV params = aux[0].iv;
- IV opt_params = aux[1].iv;
- char slurpy = (char)(aux[2].iv);
+ struct op_argcheck_aux *aux = (struct op_argcheck_aux *)cUNOP_AUXo->op_aux;
+ UV params = aux->params;
+ UV opt_params = aux->opt_params;
+ char slurpy = aux->slurpy;
AV *defav = GvAV(PL_defgv); /* @_ */
- IV argc;
+ UV argc;
bool too_few;
assert(!SvMAGICAL(defav));
- argc = (AvFILLp(defav) + 1);
+ argc = (UV)(AvFILLp(defav) + 1);
too_few = (argc < (params - opt_params));
if (UNLIKELY(too_few || (!slurpy && argc > params)))