This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Perl_uvoffuni_to_utf8_flags() Combine ASCII, EBCDIC branches
This uses the underlying structure of UTF-8 and UTF-EBCDIC to unify most
of the code. Previously, the ASCII platform version unrolled a loop,
and the EBCDIC didn't. Now the loop is used for code points that
require 5 or more bytes to represent in UTF-8 and UTF-EBCDIC. On ASCII
platforms, this means that all leggal Unicode code points use the
unrolled version. I used cachegrind to find that the unrolled savings
were not large, and in the trade-off between performance and
maintainability on code points that Unicode doesn't think are legal,
maintainability wins.
I also moved the tests so that there are no unnecessary tests on ASCII
platforms. For example, if we know that we are in a range of code
points that doesn't have surrogates, no tests for surrogates are done.
Perhaps an optimizing compiler could figure this out. There is a
smidgeon of extra tests on EBCDIC platforms, to keep the code unified
between the two platform types.
Originally, I did try to keep the loop unrolled, which is how I found
that the performance savings wasn't great. Here that code is (with a
space inserted before column 1 '#' chars, so git doesn't think they are
comments:
U8 *
Perl_uvoffuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
{
PERL_ARGS_ASSERT_UVOFFUNI_TO_UTF8_FLAGS;
/* Test for and handle 1-byte result. */
if (OFFUNI_IS_INVARIANT(uv)) {
*d++ = LATIN1_TO_NATIVE(uv);
return d;
}
/* Use shorter names internally in this file */
#define SHIFT UTF_ACCUMULATION_SHIFT
#undef MARK
#define MARK UTF_CONTINUATION_MARK
#define MASK UTF_CONTINUATION_MASK
/* Below is an unrolled version of
*
STRLEN len = OFFUNISKIP(uv);
U8 *p = d+len-1;
while (p > d) {
*p-- = I8_TO_NATIVE_UTF8((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
uv >>= UTF_ACCUMULATION_SHIFT;
}
*p = I8_TO_NATIVE_UTF8((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
return d+len;
*
* Unrolled, it looks like:
*
if (uv < max_2_byte_uv) return the 2 bytes;
if (uv < max_3_byte_uv) return the 3 bytes;
...
*
* Note that on EBCDIC we have to turn things into NATIVE_UTF8, which is a
* no-op on ASCII platforms */
/* Not 1-byte; test for and handle 2-byte result. In the test immediately
* below, the 32 is for start bytes C0-CF, D0-DF, each of which has a
* continuation byte which contributes SHIFT bits. This yields 0x400 on
* EBCDIC platforms, 0x800 on ASCII */
if (uv < (32 * (1U << SHIFT))) {
*d++ = I8_TO_NATIVE_UTF8(( uv >> SHIFT) | UTF_START_MARK(2));
*d++ = I8_TO_NATIVE_UTF8(( uv & MASK) | MARK);
return d;
}
/* Not 2-byte; test for and handle 3-byte result. In the test immediately
* below, the 16 is for start bytes E0-EF (which are the ones that indicate
* 3 bytes), the 2 is for 2 continuation bytes which each contribute SHIFT
* bits. This yields 0x4000 on EBCDIC platforms, 0x1_0000 on ASCII, so 3
* bytes covers the range 0x400-0x3FFF on EBCDIC; 0x800-0xFFFF on ASCII */
if (uv < (16 * (1U << (2 * SHIFT)))) {
*d++ = I8_TO_NATIVE_UTF8(( uv >> ((3 - 1) * SHIFT)) | UTF_START_MARK(3));
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
#ifndef EBCDIC /* These problematic code points are 4 bytes on EBCDIC */
/* The most likely code points in this range are below the surrogates.
* Do an extra test to quickly exclude those. */
if (UNLIKELY(uv >= UNICODE_SURROGATE_FIRST)) {
if (UNLIKELY( UNICODE_IS_32_NONCHARS(uv)
|| UNICODE_IS_xFFF_E_F(uv)))
{
goto handle_nonchar;
}
if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
goto handle_surrogate;
}
}
#endif
return d;
}
/* Not 3-byte; test for and handle 4-byte result. In the test immediately
* below, the 8 is for start bytes F0-F7, the 3 is for 3 continuation bytes
* which each contribute SHIFT bits. This yields 0x4_0000 on EBCDIC
* platforms, 0x20_0000 on ASCII, so 4 bytes covers the range
* 0x4000-0x3_FFFF on EBCDIC; 0x1_0000-0x1F_FFFF on ASCII */
if (uv < (8 * (1U << (3 * SHIFT)))) {
*d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
#ifdef EBCDIC /* These problematic code points are 3 bytes on ASCII */
if (UNLIKELY( UNICODE_IS_32_NONCHARS(uv)
|| UNICODE_IS_xFFF_E_F(uv)))
{
goto handle_nonchar;
}
if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
goto handle_surrogate;
}
#else
if (UNLIKELY( UNICODE_IS_xFFF_E_F(uv))
|| UNICODE_IS_10_FFF_E_F(uv))
{
goto handle_nonchar;
}
if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
goto handle_super;
}
#endif
return d;
}
/* Not 4-byte; test for and handle 5-byte result. In the test immediately
* below, the first 4 is for start bytes F8-FB, the second 4 is for 4
* continuation bytes which each contribute SHIFT bits. This yields
* 0x40_0000 on EBCDIC platforms, 0x400_0000 on ASCII, so 5 bytes covers
* the range 0x4_0000-0x3F_FFFF on EBCDIC; 0x20_0000-0x3FF_FFFF on ASCII */
if (uv < (4 * (1U << (4 * SHIFT)))) {
*d++ = I8_TO_NATIVE_UTF8(( uv >> ((5 - 1) * SHIFT)) | UTF_START_MARK(5));
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((4 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
#ifdef EBCDIC
if (UNLIKELY( UNICODE_IS_xFFF_E_F(uv))
|| UNICODE_IS_10_FFF_E_F(uv))
{
goto handle_nonchar;
}
if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
goto handle_super;
}
return d;
#else
goto handle_super;
#endif
}
/* Not 5-byte; test for and handle 6-byte result. In the test immediately
* below, the 2 is for start bytes FC-FD, the 5 is for 5 continuation bytes
* which each contribute SHIFT bits. This yields 0x400_0000 on EBCDIC
* platforms, 0x8000_0000 on ASCII, so 6 bytes covers the range
* 0x40_0000-0x3FF_FFFF on EBCDIC; 0x400_0000-0x7FFF_FFFF on ASCII. */
if (uv < (2 * (1U << (5 * SHIFT)))) {
*d++ = I8_TO_NATIVE_UTF8(( uv >> ((6 - 1) * SHIFT)) | UTF_START_MARK(6));
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((5 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((4 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
goto handle_super;
}
/* This could be moved down for EBCDIC, but not worth the complexity */
if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED)) {
Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
}
/* Not 6-byte; handle 7-byte result. There is no need for a test on
* platforms where 7 bytes is the maximum possible, . The FE start byte
* can have 6 continuation bytes which each contribute SHIFT bits. This
* yields 0x4000_0000 on EBCDIC platforms, 0x10_0000_0000 on ASCII, so 7
* bytes covers the range 0x400_0000-0x3FFF_FFFF on EBCDIC;
* 0x400_0000-0xF_FFFF_FFFF on ASCII */
#if defined(UV_IS_QUAD) || defined(EBCDIC)
if (uv < ((UV) 1U << (6 * SHIFT)))
#endif
{
*d++ = I8_TO_NATIVE_UTF8(0xfe); /* Can't match U+FEFF! */
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((6 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((5 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((4 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
#ifdef EBCDIC
goto handle_super;
#else
goto handle_above_31_bit;
#endif
}
/* Below is for a 0xFF start byte. You need a 64-bit word size to be able
* to express this on an ASCII machine, but a 32-bit word expresses the
* lower range on EBCDIC platforms */
#if defined(UV_IS_QUAD) || defined(EBCDIC)
{
/* UTF8_MAX_BYTES result. The 0xff start byte is followed by 13
* continuation bytes on EBCDIC; 12 on ASCII. These numbers of bytes
* are essentially arbitrary, but were chosen to be enough to represent
* 2**64 - 1 (plus an extra byte on ASCII). */
*d++ = I8_TO_NATIVE_UTF8(0xff); /* Can't match U+FFFE! */
# ifdef UV_IS_QUAD
# ifndef EBCDIC
*d++ = /* ASCII platform (12 - 1) 6 Reserved bits */ MARK;
# else
*d++ = I8_TO_NATIVE_UTF8(((uv >>((13 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >>((12 - 1) * SHIFT)) & MASK) | MARK);
# endif
*d++ = I8_TO_NATIVE_UTF8(((uv >>((11 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >>((10 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((9 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((8 - 1) * SHIFT)) & MASK) | MARK);
# else /* Here, must be EBCDIC without quad */
*d++ = I8_TO_NATIVE_UTF8( /* (13 - 1) 5 Reserved bits */ MARK);
*d++ = I8_TO_NATIVE_UTF8( /* (12 - 1) 5 Reserved bits */ MARK);
*d++ = I8_TO_NATIVE_UTF8( /* (11 - 1) 5 Reserved bits */ MARK);
*d++ = I8_TO_NATIVE_UTF8( /* (10 - 1) 5 Reserved bits */ MARK);
*d++ = I8_TO_NATIVE_UTF8( /* ( 9 - 1) 5 Reserved bits */ MARK);
*d++ = I8_TO_NATIVE_UTF8( /* ( 8 - 1) 5 Reserved bits */ MARK);
# endif
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((7 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((6 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((5 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((4 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
*d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
}
#endif
#ifdef EBCDIC
if (uv <= 0x7FFFFFFF) {
goto handle_super;
}
#endif
/* FALLTHROUGH */ \
handle_above_31_bit:
if (flags & (UNICODE_WARN_ABOVE_31_BIT|UNICODE_WARN_SUPER)) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
"Code point 0x%"UVXf" is not Unicode, and not portable", uv);
/* So won't warn twice; we have to fall through into handle_super in
* case supers are disallowed */
flags &= ~UNICODE_WARN_SUPER;
}
if (flags & UNICODE_DISALLOW_ABOVE_31_BIT) {
return NULL;
}
handle_super:
if (flags & UNICODE_WARN_SUPER) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
"Code point 0x%04"UVXf" is not Unicode, may not be portable", uv);
}
if (flags & UNICODE_DISALLOW_SUPER) {
return NULL;
}
return d;
handle_surrogate:
if (flags & UNICODE_WARN_SURROGATE) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_SURROGATE),
"UTF-16 surrogate U+%04"UVXf, uv);
}
if (flags & UNICODE_DISALLOW_SURROGATE) {
return NULL;
}
return d;
handle_nonchar:
if (flags & UNICODE_WARN_NONCHAR) {
Perl_ck_warner_d(aTHX_ packWARN(WARN_NONCHAR),
"Unicode non-character U+%04"UVXf" is not recommended for open interchange",
uv);
}
if (flags & UNICODE_DISALLOW_NONCHAR) {
return NULL;
}
return d;
}