* You may distribute under the terms of either the GNU General Public
* License or the Artistic License, as specified in the README file.
*
+ * This file contains tables and code adapted from
+ * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which requires this
+ * copyright notice:
+
+Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
+
+ *
* This file is a home for static inline functions that cannot go in other
- * headers files, because they depend on proto.h (included after most other
+ * header files, because they depend on proto.h (included after most other
* headers) or struct definitions.
*
* Each section names the header file that the functions "belong" to.
send = s + len;
-#ifndef EBCDIC
-
/* This looks like 0x010101... */
-#define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
+# define PERL_COUNT_MULTIPLIER (~ (UINTMAX_C(0)) / 0xFF)
/* This looks like 0x808080... */
-#define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
-#define PERL_WORDSIZE sizeof(PERL_COUNT_MULTIPLIER)
-#define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
+# define PERL_VARIANTS_WORD_MASK (PERL_COUNT_MULTIPLIER * 0x80)
+# define PERL_WORDSIZE sizeof(PERL_UINTMAX_T)
+# define PERL_WORD_BOUNDARY_MASK (PERL_WORDSIZE - 1)
/* Evaluates to 0 if 'x' is at a word boundary; otherwise evaluates to 1, by
* or'ing together the lowest bits of 'x'. Hopefully the final term gets
* optimized out completely on a 32-bit system, and its mask gets optimized out
* on a 64-bit system */
-#define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
+# define PERL_IS_SUBWORD_ADDR(x) (1 & ( PTR2nat(x) \
| ( PTR2nat(x) >> 1) \
| ( ( (PTR2nat(x) \
& PERL_WORD_BOUNDARY_MASK) >> 2))))
+#ifndef EBCDIC
+
/* Do the word-at-a-time iff there is at least one usable full word. That
* means that after advancing to a word boundary, there still is at least a
* full word left. The number of bytes needed to advance is 'wordsize -
return FALSE;
}
- /* Otherwise fall into final loop to find which byte it is */
+# if BYTEORDER == 0x1234 || BYTEORDER == 0x12345678 \
+ || BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ *ep = x + _variant_byte_number(* (PERL_UINTMAX_T *) x);
+ assert(*ep >= s && *ep < send);
+
+ return FALSE;
+
+# else /* If weird byte order, drop into next loop to do byte-at-a-time
+ checks. */
+
break;
+# endif
}
+
x += PERL_WORDSIZE;
+
} while (x + PERL_WORDSIZE <= send);
}
-#endif
+#endif /* End of ! EBCDIC */
/* Process per-byte */
while (x < send) {
return TRUE;
}
+#ifndef EBCDIC
+
+PERL_STATIC_INLINE unsigned int
+S__variant_byte_number(PERL_UINTMAX_T word)
+{
+
+ /* This returns the position in a word (0..7) of the first variant byte in
+ * it. This is a helper function. Note that there are no branches */
+
+ assert(word);
+
+ /* Get just the msb bits of each byte */
+ word &= PERL_VARIANTS_WORD_MASK;
+
+# ifdef USING_MSVC6 /* VC6 has some issues with the normal code, and the
+ easiest thing is to hide that from the callers */
+ {
+ unsigned int i;
+ const U8 * s = (U8 *) &word;
+ dTHX;
+
+ for (i = 0; i < sizeof(word); i++ ) {
+ if (s[i]) {
+ return i;
+ }
+ }
+
+ Perl_croak(aTHX_ "panic: %s: %d: unexpected zero word\n",
+ __FILE__, __LINE__);
+ }
+
+# elif BYTEORDER == 0x1234 || BYTEORDER == 0x12345678
+
+ /* Bytes are stored like
+ * Byte8 ... Byte2 Byte1
+ * 63..56...15...8 7...0
+ *
+ * Isolate the lsb;
+ * https://stackoverflow.com/questions/757059/position-of-least-significant-bit-that-is-set
+ *
+ * The word will look this this, with a rightmost set bit in position 's':
+ * ('x's are don't cares)
+ * s
+ * x..x100..0
+ * x..xx10..0 Right shift (rightmost 0 is shifted off)
+ * x..xx01..1 Subtract 1, turns all the trailing zeros into 1's and
+ * the 1 just to their left into a 0; the remainder is
+ * untouched
+ * 0..0011..1 The xor with x..xx10..0 clears that remainder, sets
+ * bottom to all 1
+ * 0..0100..0 Add 1 to clear the word except for the bit in 's'
+ *
+ * Another method is to do 'word &= -word'; but it generates a compiler
+ * message on some platforms about taking the negative of an unsigned */
+
+ word >>= 1;
+ word = 1 + (word ^ (word - 1));
+
+# elif BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ /* Bytes are stored like
+ * Byte1 Byte2 ... Byte8
+ * 63..56 55..47 ... 7...0
+ *
+ * Isolate the msb; http://codeforces.com/blog/entry/10330
+ *
+ * Only the most significant set bit matters. Or'ing word with its right
+ * shift of 1 makes that bit and the next one to its right both 1. Then
+ * right shifting by 2 makes for 4 1-bits in a row. ... We end with the
+ * msb and all to the right being 1. */
+ word |= word >> 1;
+ word |= word >> 2;
+ word |= word >> 4;
+ word |= word >> 8;
+ word |= word >> 16;
+ word |= word >> 32; /* This should get optimized out on 32-bit systems. */
+
+ /* Then subtracting the right shift by 1 clears all but the left-most of
+ * the 1 bits, which is our desired result */
+ word -= (word >> 1);
+
+# else
+# error Unexpected byte order
+# endif
+
+ /* Here 'word' has a single bit set: the msb of the first byte in which it
+ * is set. Calculate that position in the word. We can use this
+ * specialized solution: https://stackoverflow.com/a/32339674/1626653,
+ * assumes an 8-bit byte. (On a 32-bit machine, the larger numbers should
+ * just get shifted off at compile time) */
+ word = (word >> 7) * ((UINTMAX_C( 7) << 56) | (UINTMAX_C(15) << 48)
+ | (UINTMAX_C(23) << 40) | (UINTMAX_C(31) << 32)
+ | (39 << 24) | (47 << 16)
+ | (55 << 8) | (63 << 0));
+ word >>= PERL_WORDSIZE * 7; /* >> by either 56 or 24 */
+
+ /* Here, word contains the position 7..63 of that bit. Convert to 0..7 */
+ word = ((word + 1) >> 3) - 1;
+
+# if BYTEORDER == 0x4321 || BYTEORDER == 0x87654321
+
+ /* And invert the result */
+ word = CHARBITS - word - 1;
+
+# endif
+
+ return (unsigned int) word;
+}
+
+#endif
#if defined(PERL_CORE) || defined(PERL_EXT)
/*
# ifndef EBCDIC
+ /* Test if the string is long enough to use word-at-a-time. (Logic is the
+ * same as for is_utf8_invariant_string()) */
if ((STRLEN) (e - x) >= PERL_WORDSIZE
+ PERL_WORDSIZE * PERL_IS_SUBWORD_ADDR(x)
- (PTR2nat(x) & PERL_WORD_BOUNDARY_MASK))
/* Process per-word as long as we have at least a full word left */
do { /* Commit 03c1e4ab1d6ee9062fb3f94b0ba31db6698724b1 contains an
explanation of how this works */
- count += ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
+ PERL_UINTMAX_T increment
+ = ((((* (PERL_UINTMAX_T *) x) & PERL_VARIANTS_WORD_MASK) >> 7)
* PERL_COUNT_MULTIPLIER)
>> ((PERL_WORDSIZE - 1) * CHARBITS);
+ count += (Size_t) increment;
x += PERL_WORDSIZE;
} while (x + PERL_WORDSIZE <= e);
}
=cut
This is commonly used to determine if a SV's UTF-8 flag should be turned on.
-It needn't be if its string is entirely UTF-8 invariant, and it shouldn't be if
-it otherwise contains invalid UTF-8.
+It generally needn't be if its string is entirely UTF-8 invariant, and it
+shouldn't be if it otherwise contains invalid UTF-8.
It is an internal function because khw thinks that XS code shouldn't be working
at this low a level. A valid use case could change that.
/*
+=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
+that represents some code point; otherwise it evaluates to 0. If non-zero, the
+value gives how many bytes starting at C<s> comprise the code point's
+representation. Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The code point can be any that will fit in an IV on this machine, using Perl's
+extension to official UTF-8 to represent those higher than the Unicode maximum
+of 0x10FFFF. That means that this macro is used to efficiently decide if the
+next few bytes in C<s> is legal UTF-8 for a single character.
+
+Use C<L</isSTRICT_UTF8_CHAR>> to restrict the acceptable code points to those
+defined by Unicode to be fully interchangeable across applications;
+C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
+C<L</is_utf8_string_loclen>> to check entire strings.
+
+Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
+
+=cut
+
+This uses an adaptation of the table and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version. A copyright notice for the original
+version is given at the beginning of this file. The Perl adapation is
+documented at the definition of PL_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+ const U8 * s = s0;
+ UV state = 0;
+
+ PERL_ARGS_ASSERT_ISUTF8_CHAR;
+
+ /* This dfa is fast. If it accepts the input, it was for a well-formed,
+ * code point, which can be returned immediately. Otherwise, it is either
+ * malformed, or for the start byte FF which the dfa doesn't handle (except
+ * on 32-bit ASCII platforms where it trivially is an error). Call a
+ * helper function for the other platforms. */
+
+ while (s < e && LIKELY(state != 1)) {
+ state = PL_extended_utf8_dfa_tab[256
+ + state
+ + PL_extended_utf8_dfa_tab[*s]];
+ if (state != 0) {
+ s++;
+ continue;
+ }
+
+ return s - s0 + 1;
+ }
+
+#if defined(UV_IS_QUAD) || defined(EBCDIC)
+
+ if (NATIVE_UTF8_TO_I8(*s0) == 0xFF && e - s0 >= UTF8_MAXBYTES) {
+ return _is_utf8_char_helper(s0, e, 0);
+ }
+
+#endif
+
+ return 0;
+}
+
+/*
+
+=for apidoc isSTRICT_UTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode code point completely acceptable for open interchange between all
+applications; otherwise it evaluates to 0. If non-zero, the value gives how
+many bytes starting at C<s> comprise the code point's representation. Any
+bytes remaining before C<e>, but beyond the ones needed to form the first code
+point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
+be a surrogate nor a non-character code point. Thus this excludes any code
+point from Perl's extended UTF-8.
+
+This is used to efficiently decide if the next few bytes in C<s> is
+legal Unicode-acceptable UTF-8 for a single character.
+
+Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
+and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
+C<L</is_strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version. A copyright notice for the original
+version is given at the beginning of this file. The Perl adapation is
+documented at the definition of strict_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+ const U8 * s = s0;
+ UV state = 0;
+
+ PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
+
+ while (s < e && LIKELY(state != 1)) {
+ state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
+
+ if (state != 0) {
+ s++;
+ continue;
+ }
+
+ return s - s0 + 1;
+ }
+
+#ifndef EBCDIC
+
+ /* The dfa above drops out for certain Hanguls; handle them specially */
+ if (is_HANGUL_ED_utf8_safe(s0, e)) {
+ return 3;
+ }
+
+#endif
+
+ return 0;
+}
+
+/*
+
+=for apidoc Am|STRLEN|isC9_STRICT_UTF8_CHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode non-surrogate code point; otherwise it evaluates to 0. If non-zero,
+the value gives how many bytes starting at C<s> comprise the code point's
+representation. Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF. This
+differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
+code points. This corresponds to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+which said that non-character code points are merely discouraged rather than
+completely forbidden in open interchange. See
+L<perlunicode/Noncharacter code points>.
+
+Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
+C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
+C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version. A copyright notice for the original
+version is given at the beginning of this file. The Perl adapation is
+documented at the definition of PL_c9_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+ const U8 * s = s0;
+ UV state = 0;
+
+ PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
+
+ while (s < e && LIKELY(state != 1)) {
+ state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
+
+ if (state != 0) {
+ s++;
+ continue;
+ }
+
+ return s - s0 + 1;
+ }
+
+ return 0;
+}
+
+/*
+
=for apidoc is_strict_utf8_string_loc
Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
s--;
}
}
- GCC_DIAG_IGNORE_STMT(-Wcast-qual);
+ GCC_DIAG_IGNORE(-Wcast-qual)
return (U8 *)s;
- GCC_DIAG_RESTORE_STMT;
+ GCC_DIAG_RESTORE
}
/*
while (off--) {
STRLEN skip = UTF8SKIP(s);
if ((STRLEN)(end - s) <= skip) {
- GCC_DIAG_IGNORE_STMT(-Wcast-qual);
+ GCC_DIAG_IGNORE(-Wcast-qual)
return (U8 *)end;
- GCC_DIAG_RESTORE_STMT;
+ GCC_DIAG_RESTORE
}
s += skip;
}
- GCC_DIAG_IGNORE_STMT(-Wcast-qual);
+ GCC_DIAG_IGNORE(-Wcast-qual)
return (U8 *)s;
- GCC_DIAG_RESTORE_STMT;
+ GCC_DIAG_RESTORE
}
/*
assert(off <= 0);
while (off++ && s > start) {
- s--;
- while (UTF8_IS_CONTINUATION(*s) && s > start)
+ do {
s--;
+ } while (UTF8_IS_CONTINUATION(*s) && s > start);
}
- GCC_DIAG_IGNORE_STMT(-Wcast-qual);
+ GCC_DIAG_IGNORE(-Wcast-qual)
return (U8 *)s;
- GCC_DIAG_RESTORE_STMT;
+ GCC_DIAG_RESTORE
}
/*
|| is_utf8_valid_partial_char_flags(*ep, s + len, flags);
}
+PERL_STATIC_INLINE UV
+S_utf8n_to_uvchr_msgs(const U8 *s,
+ STRLEN curlen,
+ STRLEN *retlen,
+ const U32 flags,
+ U32 * errors,
+ AV ** msgs)
+{
+ /* This is the inlined portion of utf8n_to_uvchr_msgs. It handles the
+ * simple cases, and, if necessary calls a helper function to deal with the
+ * more complex ones. Almost all well-formed non-problematic code points
+ * are considered simple, so that it's unlikely that the helper function
+ * will need to be called.
+ *
+ * This is an adaptation of the tables and algorithm given in
+ * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
+ * comprehensive documentation of the original version. A copyright notice
+ * for the original version is given at the beginning of this file. The
+ * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
+ */
+
+ const U8 * const s0 = s;
+ const U8 * send = s0 + curlen;
+ UV uv = 0; /* The 0 silences some stupid compilers */
+ UV state = 0;
+
+ PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+
+ /* This dfa is fast. If it accepts the input, it was for a well-formed,
+ * non-problematic code point, which can be returned immediately.
+ * Otherwise we call a helper function to figure out the more complicated
+ * cases. */
+
+ while (s < send && LIKELY(state != 1)) {
+ UV type = PL_strict_utf8_dfa_tab[*s];
+
+ uv = (state == 0)
+ ? ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
+ : UTF8_ACCUMULATE(uv, *s);
+ state = PL_strict_utf8_dfa_tab[256 + state + type];
+
+ if (state != 0) {
+ s++;
+ continue;
+ }
+
+ if (retlen) {
+ *retlen = s - s0 + 1;
+ }
+ if (errors) {
+ *errors = 0;
+ }
+ if (msgs) {
+ *msgs = NULL;
+ }
+
+ return uv;
+ }
+
+ /* Here is potentially problematic. Use the full mechanism */
+ return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
+}
+
/* ------------------------------- perl.h ----------------------------- */
/*