sv = SvRV(sv);
if (SvTYPE(sv) == SVt_PVIO) {
GV * const gv = MUTABLE_GV(sv_newmortal());
- gv_init(gv, 0, "", 0, 0);
+ gv_init(gv, 0, "$__ANONIO__", 11, 0);
GvIOp(gv) = MUTABLE_IO(sv);
SvREFCNT_inc_void_NN(sv);
sv = MUTABLE_SV(gv);
}
cv = sv_2cv(TOPs, &stash, &gv, 0);
if (cv && SvPOK(cv))
- ret = newSVpvn_flags(SvPVX_const(cv), SvCUR(cv), SVs_TEMP);
+ ret = newSVpvn_flags(
+ CvPROTO(cv), CvPROTOLEN(cv), SVs_TEMP | SvUTF8(cv)
+ );
set:
SETs(ret);
RETURN;
dVAR; dSP;
SV *sv = POPs;
- const char * const elem = SvPV_nolen_const(sv);
+ STRLEN len;
+ const char * const elem = SvPV_const(sv, len);
GV * const gv = MUTABLE_GV(POPs);
SV * tmpRef = NULL;
const char * const second_letter = elem + 1;
switch (*elem) {
case 'A':
- if (strEQ(second_letter, "RRAY"))
+ if (len == 5 && strEQ(second_letter, "RRAY"))
tmpRef = MUTABLE_SV(GvAV(gv));
break;
case 'C':
- if (strEQ(second_letter, "ODE"))
+ if (len == 4 && strEQ(second_letter, "ODE"))
tmpRef = MUTABLE_SV(GvCVu(gv));
break;
case 'F':
- if (strEQ(second_letter, "ILEHANDLE")) {
+ if (len == 10 && strEQ(second_letter, "ILEHANDLE")) {
/* finally deprecated in 5.8.0 */
deprecate("*glob{FILEHANDLE}");
tmpRef = MUTABLE_SV(GvIOp(gv));
}
else
- if (strEQ(second_letter, "ORMAT"))
+ if (len == 6 && strEQ(second_letter, "ORMAT"))
tmpRef = MUTABLE_SV(GvFORM(gv));
break;
case 'G':
- if (strEQ(second_letter, "LOB"))
+ if (len == 4 && strEQ(second_letter, "LOB"))
tmpRef = MUTABLE_SV(gv);
break;
case 'H':
- if (strEQ(second_letter, "ASH"))
+ if (len == 4 && strEQ(second_letter, "ASH"))
tmpRef = MUTABLE_SV(GvHV(gv));
break;
case 'I':
- if (*second_letter == 'O' && !elem[2])
+ if (*second_letter == 'O' && !elem[2] && len == 2)
tmpRef = MUTABLE_SV(GvIOp(gv));
break;
case 'N':
- if (strEQ(second_letter, "AME"))
+ if (len == 4 && strEQ(second_letter, "AME"))
sv = newSVhek(GvNAME_HEK(gv));
break;
case 'P':
- if (strEQ(second_letter, "ACKAGE")) {
+ if (len == 7 && strEQ(second_letter, "ACKAGE")) {
const HV * const stash = GvSTASH(gv);
const HEK * const hek = stash ? HvNAME_HEK(stash) : NULL;
sv = hek ? newSVhek(hek) : newSVpvs("__ANON__");
}
break;
case 'S':
- if (strEQ(second_letter, "CALAR"))
+ if (len == 6 && strEQ(second_letter, "CALAR"))
tmpRef = GvSVn(gv);
break;
}
break;
case SVt_PVCV:
if (cv_const_sv((const CV *)sv))
- Perl_ck_warner(aTHX_ packWARN(WARN_MISC), "Constant subroutine %s undefined",
- CvANON((const CV *)sv) ? "(anonymous)"
- : GvENAME(CvGV((const CV *)sv)));
+ Perl_ck_warner(aTHX_ packWARN(WARN_MISC),
+ "Constant subroutine %"SVf" undefined",
+ SVfARG(CvANON((const CV *)sv)
+ ? newSVpvs_flags("(anonymous)", SVs_TEMP)
+ : sv_2mortal(newSVhek(GvENAME_HEK(CvGV((const CV *)sv))))));
/* FALLTHROUGH */
case SVt_PVFM:
{
IV len_iv = 0;
int len_is_uv = 1;
const I32 lvalue = PL_op->op_flags & OPf_MOD || LVRET;
+ const bool rvalue = (GIMME_V != G_VOID);
const char *tmps;
SV *repl_sv = NULL;
const char *repl = NULL;
RETURN;
}
- SvTAINTED_off(TARG); /* decontaminate */
- SvUTF8_off(TARG); /* decontaminate */
-
tmps += byte_pos;
- sv_setpvn(TARG, tmps, byte_len);
+
+ if (rvalue) {
+ SvTAINTED_off(TARG); /* decontaminate */
+ SvUTF8_off(TARG); /* decontaminate */
+ sv_setpvn(TARG, tmps, byte_len);
#ifdef USE_LOCALE_COLLATE
- sv_unmagic(TARG, PERL_MAGIC_collxfrm);
+ sv_unmagic(TARG, PERL_MAGIC_collxfrm);
#endif
- if (utf8_curlen)
- SvUTF8_on(TARG);
+ if (utf8_curlen)
+ SvUTF8_on(TARG);
+ }
if (repl) {
SV* repl_sv_copy = NULL;
}
}
SPAGAIN;
- SvSETMAGIC(TARG);
- PUSHs(TARG);
+ if (rvalue) {
+ SvSETMAGIC(TARG);
+ PUSHs(TARG);
+ }
RETURN;
bound_fail:
/* Generally UTF-8 and UTF-EBCDIC are indistinguishable at this level. So
* most comments below say UTF-8, when in fact they mean UTF-EBCDIC as well */
-/* Below are several macros that generate code */
/* Generates code to store a unicode codepoint c that is known to occupy
- * exactly two UTF-8 and UTF-EBCDIC bytes; it is stored into p and p+1. */
-#define STORE_UNI_TO_UTF8_TWO_BYTE(p, c) \
- STMT_START { \
- *(p) = UTF8_TWO_BYTE_HI(c); \
- *((p)+1) = UTF8_TWO_BYTE_LO(c); \
- } STMT_END
-
-/* Like STORE_UNI_TO_UTF8_TWO_BYTE, but advances p to point to the next
- * available byte after the two bytes */
+ * exactly two UTF-8 and UTF-EBCDIC bytes; it is stored into p and p+1,
+ * and p is advanced to point to the next available byte after the two bytes */
#define CAT_UNI_TO_UTF8_TWO_BYTE(p, c) \
STMT_START { \
*(p)++ = UTF8_TWO_BYTE_HI(c); \
} \
} STMT_END
-/* Like STORE_NON_LATIN1_UC, but advances p to point to the next available byte
- * after the character stored */
-#define CAT_NON_LATIN1_UC(p, l) \
-STMT_START { \
- if ((l) == LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \
- CAT_UNI_TO_UTF8_TWO_BYTE((p), LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS); \
- } else { \
- CAT_UNI_TO_UTF8_TWO_BYTE((p), GREEK_CAPITAL_LETTER_MU); \
- } \
-} STMT_END
-
-/* Generates code to add the two UTF-8 bytes (probably u) that are the upper
- * case of l into p and p+1. u must be the result of toUPPER_LATIN1_MOD(l),
- * and must require two bytes to store it. Advances p to point to the next
- * available position */
-#define CAT_TWO_BYTE_UNI_UPPER_MOD(p, l, u) \
-STMT_START { \
- if ((u) != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) { \
- CAT_UNI_TO_UTF8_TWO_BYTE((p), (u)); /* not special, just save it */ \
- } else if (l == LATIN_SMALL_LETTER_SHARP_S) { \
- *(p)++ = 'S'; *(p)++ = 'S'; /* upper case is 'SS' */ \
- } else {/* else is one of the other two special cases */ \
- CAT_NON_LATIN1_UC((p), (l)); \
- } \
-} STMT_END
-
PP(pp_ucfirst)
{
/* Actually is both lcfirst() and ucfirst(). Only the first character
}
else if (DO_UTF8(source)) { /* Is the source utf8? */
doing_utf8 = TRUE;
+ ulen = UTF8SKIP(s);
+ if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
+ else toLOWER_utf8(s, tmpbuf, &tculen);
- if (UTF8_IS_INVARIANT(*s)) {
-
- /* An invariant source character is either ASCII or, in EBCDIC, an
- * ASCII equivalent or a caseless C1 control. In both these cases,
- * the lower and upper cases of any character are also invariants
- * (and title case is the same as upper case). So it is safe to
- * use the simple case change macros which avoid the overhead of
- * the general functions. Note that if perl were to be extended to
- * do locale handling in UTF-8 strings, this wouldn't be true in,
- * for example, Lithuanian or Turkic. */
- *tmpbuf = (op_type == OP_LCFIRST) ? toLOWER(*s) : toUPPER(*s);
- tculen = ulen = 1;
- need = slen + 1;
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
- U8 chr;
-
- /* Similarly, if the source character isn't invariant but is in the
- * latin1 range (or EBCDIC equivalent thereof), we have the case
- * changes compiled into perl, and can avoid the overhead of the
- * general functions. In this range, the characters are stored as
- * two UTF-8 bytes, and it so happens that any changed-case version
- * is also two bytes (in both ASCIIish and EBCDIC machines). */
- tculen = ulen = 2;
- need = slen + 1;
-
- /* Convert the two source bytes to a single Unicode code point
- * value, change case and save for below */
- chr = TWO_BYTE_UTF8_TO_UNI(*s, *(s+1));
- if (op_type == OP_LCFIRST) { /* lower casing is easy */
- U8 lower = toLOWER_LATIN1(chr);
- STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, lower);
- }
- else { /* ucfirst */
- U8 upper = toUPPER_LATIN1_MOD(chr);
-
- /* Most of the latin1 range characters are well-behaved. Their
- * title and upper cases are the same, and are also in the
- * latin1 range. The macro above returns their upper (hence
- * title) case, and all that need be done is to save the result
- * for below. However, several characters are problematic, and
- * have to be handled specially. The MOD in the macro name
- * above means that these tricky characters all get mapped to
- * the single character LATIN_SMALL_LETTER_Y_WITH_DIAERESIS.
- * This mapping saves some tests for the majority of the
- * characters */
-
- if (upper != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) {
-
- /* Not tricky. Just save it. */
- STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf, upper);
- }
- else if (chr == LATIN_SMALL_LETTER_SHARP_S) {
-
- /* This one is tricky because it is two characters long,
- * though the UTF-8 is still two bytes, so the stored
- * length doesn't change */
- *tmpbuf = 'S'; /* The UTF-8 is 'Ss' */
- *(tmpbuf + 1) = 's';
- }
- else {
-
- /* The other two have their title and upper cases the same,
- * but are tricky because the changed-case characters
- * aren't in the latin1 range. They, however, do fit into
- * two UTF-8 bytes */
- STORE_NON_LATIN1_UC(tmpbuf, chr);
- }
- }
- }
- else {
-
- /* Here, can't short-cut the general case */
-
- utf8_to_uvchr(s, &ulen);
- if (op_type == OP_UCFIRST) toTITLE_utf8(s, tmpbuf, &tculen);
- else toLOWER_utf8(s, tmpbuf, &tculen);
-
- /* we can't do in-place if the length changes. */
- if (ulen != tculen) inplace = FALSE;
- need = slen + 1 - ulen + tculen;
- }
+ /* we can't do in-place if the length changes. */
+ if (ulen != tculen) inplace = FALSE;
+ need = slen + 1 - ulen + tculen;
}
else { /* Non-zero length, non-UTF-8, Need to consider locale and if
* latin1 is treated as caseless. Note that a locale takes
bool in_iota_subscript = FALSE;
while (s < send) {
+ STRLEN u;
+ STRLEN ulen;
+ UV uv;
if (in_iota_subscript && ! is_utf8_mark(s)) {
+
/* A non-mark. Time to output the iota subscript */
#define GREEK_CAPITAL_LETTER_IOTA 0x0399
#define COMBINING_GREEK_YPOGEGRAMMENI 0x0345
CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA);
in_iota_subscript = FALSE;
- }
-
- /* If the UTF-8 character is invariant, then it is in the range
- * known by the standard macro; result is only one byte long */
- if (UTF8_IS_INVARIANT(*s)) {
- *d++ = toUPPER(*s);
- s++;
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-
- /* Likewise, if it fits in a byte, its case change is in our
- * table */
- U8 orig = TWO_BYTE_UTF8_TO_UNI(*s, *(s+1));
- U8 upper = toUPPER_LATIN1_MOD(orig);
- CAT_TWO_BYTE_UNI_UPPER_MOD(d, orig, upper);
- s += 2;
- }
- else {
-
- /* Otherwise, need the general UTF-8 case. Get the changed
- * case value and copy it to the output buffer */
+ }
- const STRLEN u = UTF8SKIP(s);
- STRLEN ulen;
+ /* Then handle the current character. Get the changed case value
+ * and copy it to the output buffer */
- const UV uv = toUPPER_utf8(s, tmpbuf, &ulen);
- if (uv == GREEK_CAPITAL_LETTER_IOTA
- && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
- {
- in_iota_subscript = TRUE;
- }
- else {
- if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
- /* If the eventually required minimum size outgrows
- * the available space, we need to grow. */
- const UV o = d - (U8*)SvPVX_const(dest);
-
- /* If someone uppercases one million U+03B0s we
- * SvGROW() one million times. Or we could try
- * guessing how much to allocate without allocating too
- * much. Such is life. See corresponding comment in
- * lc code for another option */
- SvGROW(dest, min);
- d = (U8*)SvPVX(dest) + o;
- }
- Copy(tmpbuf, d, ulen, U8);
- d += ulen;
- }
- s += u;
- }
+ u = UTF8SKIP(s);
+ uv = toUPPER_utf8(s, tmpbuf, &ulen);
+ if (uv == GREEK_CAPITAL_LETTER_IOTA
+ && utf8_to_uvchr(s, 0) == COMBINING_GREEK_YPOGEGRAMMENI)
+ {
+ in_iota_subscript = TRUE;
+ }
+ else {
+ if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
+ /* If the eventually required minimum size outgrows the
+ * available space, we need to grow. */
+ const UV o = d - (U8*)SvPVX_const(dest);
+
+ /* If someone uppercases one million U+03B0s we SvGROW()
+ * one million times. Or we could try guessing how much to
+ * allocate without allocating too much. Such is life.
+ * See corresponding comment in lc code for another option
+ * */
+ SvGROW(dest, min);
+ d = (U8*)SvPVX(dest) + o;
+ }
+ Copy(tmpbuf, d, ulen, U8);
+ d += ulen;
+ }
+ s += u;
}
if (in_iota_subscript) {
CAT_UNI_TO_UTF8_TWO_BYTE(d, GREEK_CAPITAL_LETTER_IOTA);
else {
for (; s < send; d++, s++) {
*d = toUPPER_LATIN1_MOD(*s);
- if (*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS) continue;
+ if (LIKELY(*d != LATIN_SMALL_LETTER_Y_WITH_DIAERESIS)) continue;
/* The mainstream case is the tight loop above. To avoid
* extra tests in that, all three characters that require
(send -s) * 2 + 1);
d = (U8*)SvPVX(dest) + len;
- /* And append the current character's upper case in UTF-8 */
- CAT_NON_LATIN1_UC(d, *s);
-
/* Now process the remainder of the source, converting to
* upper and UTF-8. If a resulting byte is invariant in
* UTF-8, output it as-is, otherwise convert to UTF-8 and
* append it to the output. */
-
- s++;
for (; s < send; s++) {
- U8 upper = toUPPER_LATIN1_MOD(*s);
- if UTF8_IS_INVARIANT(upper) {
- *d++ = upper;
- }
- else {
- CAT_TWO_BYTE_UNI_UPPER_MOD(d, *s, upper);
- }
+ (void) _to_upper_title_latin1(*s, d, &len, 'S');
+ d += len;
}
/* Here have processed the whole source; no need to continue
U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
while (s < send) {
- if (UTF8_IS_INVARIANT(*s)) {
+ const STRLEN u = UTF8SKIP(s);
+ STRLEN ulen;
- /* Invariant characters use the standard mappings compiled in.
- */
- *d++ = toLOWER(*s);
- s++;
- }
- else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
+ toLOWER_utf8(s, tmpbuf, &ulen);
- /* As do the ones in the Latin1 range */
- U8 lower = toLOWER_LATIN1(TWO_BYTE_UTF8_TO_UNI(*s, *(s+1)));
- CAT_UNI_TO_UTF8_TWO_BYTE(d, lower);
- s += 2;
- }
- else {
- /* Here, is utf8 not in Latin-1 range, have to go out and get
- * the mappings from the tables. */
+ /* Here is where we would do context-sensitive actions. See the
+ * commit message for this comment for why there isn't any */
- const STRLEN u = UTF8SKIP(s);
- STRLEN ulen;
-
-#ifndef CONTEXT_DEPENDENT_CASING
- toLOWER_utf8(s, tmpbuf, &ulen);
-#else
-/* This is ifdefd out because it probably is the wrong thing to do. The right
- * thing is probably to have an I/O layer that converts final sigma to regular
- * on input and vice versa (under the correct circumstances) on output. In
- * effect, the final sigma is just a glyph variation when the regular one
- * occurs at the end of a word. And we don't really know what's going to be
- * the end of the word until it is finally output, as splitting and joining can
- * occur at any time and change what once was the word end to be in the middle,
- * and vice versa. */
-
- const UV uv = toLOWER_utf8(s, tmpbuf, &ulen);
-
- /* If the lower case is a small sigma, it may be that we need
- * to change it to a final sigma. This happens at the end of
- * a word that contains more than just this character, and only
- * when we started with a capital sigma. */
- if (uv == UNICODE_GREEK_SMALL_LETTER_SIGMA &&
- s > send - len && /* Makes sure not the first letter */
- utf8_to_uvchr(s, 0) == UNICODE_GREEK_CAPITAL_LETTER_SIGMA
- ) {
-
- /* We use the algorithm in:
- * http://www.unicode.org/versions/Unicode5.0.0/ch03.pdf (C
- * is a CAPITAL SIGMA): If C is preceded by a sequence
- * consisting of a cased letter and a case-ignorable
- * sequence, and C is not followed by a sequence consisting
- * of a case ignorable sequence and then a cased letter,
- * then when lowercasing C, C becomes a final sigma */
-
- /* To determine if this is the end of a word, need to peek
- * ahead. Look at the next character */
- const U8 *peek = s + u;
-
- /* Skip any case ignorable characters */
- while (peek < send && is_utf8_case_ignorable(peek)) {
- peek += UTF8SKIP(peek);
- }
+ if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
- /* If we reached the end of the string without finding any
- * non-case ignorable characters, or if the next such one
- * is not-cased, then we have met the conditions for it
- * being a final sigma with regards to peek ahead, and so
- * must do peek behind for the remaining conditions. (We
- * know there is stuff behind to look at since we tested
- * above that this isn't the first letter) */
- if (peek >= send || ! is_utf8_cased(peek)) {
- peek = utf8_hop(s, -1);
-
- /* Here are at the beginning of the first character
- * before the original upper case sigma. Keep backing
- * up, skipping any case ignorable characters */
- while (is_utf8_case_ignorable(peek)) {
- peek = utf8_hop(peek, -1);
- }
+ /* If the eventually required minimum size outgrows the
+ * available space, we need to grow. */
+ const UV o = d - (U8*)SvPVX_const(dest);
- /* Here peek points to the first byte of the closest
- * non-case-ignorable character before the capital
- * sigma. If it is cased, then by the Unicode
- * algorithm, we should use a small final sigma instead
- * of what we have */
- if (is_utf8_cased(peek)) {
- STORE_UNI_TO_UTF8_TWO_BYTE(tmpbuf,
- UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA);
- }
- }
- }
- else { /* Not a context sensitive mapping */
-#endif /* End of commented out context sensitive */
- if (ulen > u && (SvLEN(dest) < (min += ulen - u))) {
-
- /* If the eventually required minimum size outgrows
- * the available space, we need to grow. */
- const UV o = d - (U8*)SvPVX_const(dest);
-
- /* If someone lowercases one million U+0130s we
- * SvGROW() one million times. Or we could try
- * guessing how much to allocate without allocating too
- * much. Such is life. Another option would be to
- * grow an extra byte or two more each time we need to
- * grow, which would cut down the million to 500K, with
- * little waste */
- SvGROW(dest, min);
- d = (U8*)SvPVX(dest) + o;
- }
-#ifdef CONTEXT_DEPENDENT_CASING
- }
-#endif
- /* Copy the newly lowercased letter to the output buffer we're
- * building */
- Copy(tmpbuf, d, ulen, U8);
- d += ulen;
- s += u;
+ /* If someone lowercases one million U+0130s we SvGROW() one
+ * million times. Or we could try guessing how much to
+ * allocate without allocating too much. Such is life.
+ * Another option would be to grow an extra byte or two more
+ * each time we need to grow, which would cut down the million
+ * to 500K, with little waste */
+ SvGROW(dest, min);
+ d = (U8*)SvPVX(dest) + o;
}
+
+ /* Copy the newly lowercased letter to the output buffer we're
+ * building */
+ Copy(tmpbuf, d, ulen, U8);
+ d += ulen;
+ s += u;
} /* End of looping through the source string */
SvUTF8_on(dest);
*d = '\0';