beginning with FF yields a code point that is too large for 32-bit ASCII
platforms. FF signals to use 13 bytes for the encoded character. This breaks
the paradigm that the number of leading bits gives how many total bytes there
-are in the character.
-
-=cut
-*/
-
-/* Is the representation of the Unicode code point 'cp' the same regardless of
- * being encoded in UTF-8 or not? */
-#define OFFUNI_IS_INVARIANT(cp) isASCII(cp)
-
-/*
-=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp
-
-Evaluates to 1 if the representation of code point C<cp> is the same whether or
-not it is encoded in UTF-8; otherwise evaluates to 0. UTF-8 invariant
-characters can be copied as-is when converting to/from UTF-8, saving time.
-C<cp> is Unicode if above 255; otherwise is platform-native.
-
-=cut
- */
-
-#define UVCHR_IS_INVARIANT(cp) OFFUNI_IS_INVARIANT(cp)
-
-/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
- * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. The |0 makes sure this
- * isn't mistakenly called with a ptr argument */
-#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
- ((U8)((c) | 0)) & UTF_CONTINUATION_MARK)
-
-/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
- * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
- * be well-formed. Masking with 0xfe allows the low bit to be 0 or 1; thus
- * this matches 0xc[23]. The |0 makes sure this isn't mistakenly called with a
- * ptr argument */
-#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
- (((U8)((c) | 0)) & 0xfe) == 0xc2)
+are in the character. */
/* This is the number of low-order bits a continuation byte in a UTF-8 encoded
* sequence contributes to the specification of the code point. In the bit
(((NATIVE_UTF8_TO_I8(c) & UTF_IS_CONTINUATION_MASK) \
== UTF_CONTINUATION_MARK)))
+/* Is the representation of the Unicode code point 'cp' the same regardless of
+ * being encoded in UTF-8 or not? This is a fundamental property of
+ * UTF-8,EBCDIC */
+#define OFFUNI_IS_INVARIANT(c) (((WIDEST_UTYPE)(c)) < UTF_CONTINUATION_MARK)
+
+/*
+=for apidoc Am|bool|UVCHR_IS_INVARIANT|UV cp
+
+Evaluates to 1 if the representation of code point C<cp> is the same whether or
+not it is encoded in UTF-8; otherwise evaluates to 0. UTF-8 invariant
+characters can be copied as-is when converting to/from UTF-8, saving time.
+C<cp> is Unicode if above 255; otherwise is platform-native.
+
+=cut
+ */
+#define UVCHR_IS_INVARIANT(cp) (OFFUNI_IS_INVARIANT(NATIVE_TO_UNI(cp)))
+
/* Internal macro to be used only in this file to aid in constructing other
* publicly accessible macros.
* The number of bytes required to express this uv in UTF-8, for just those
#define UTF8_IS_ABOVE_LATIN1(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
(NATIVE_UTF8_TO_I8(c) >= UTF_MIN_ABOVE_LATIN1_BYTE))
+/* Is the UTF8-encoded byte 'c' the first byte of a two byte sequence? Use
+ * UTF8_IS_NEXT_CHAR_DOWNGRADEABLE() instead if the input isn't known to
+ * be well-formed. */
+#define UTF8_IS_DOWNGRADEABLE_START(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ inRANGE(NATIVE_UTF8_TO_I8(c), \
+ UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1))
+
/* The largest code point representable by two UTF-8 bytes on this platform.
* As explained in the comments for __COMMON_UNI_SKIP, 32 start bytes with
* UTF_ACCUMULATION_SHIFT bits of information each */
/*
=for apidoc Am|STRLEN|UTF8SKIP|char* s
-returns the number of bytes in the UTF-8 encoded character whose first (perhaps
-only) byte is pointed to by C<s>.
+returns the number of bytes a non-malformed UTF-8 encoded character whose first
+(perhaps only) byte is pointed to by C<s>.
+
+If there is a possibility of malformed input, use instead:
+
+=over
+
+=item L</C<UTF8_SAFE_SKIP>> if you know the maximum ending pointer in the
+buffer pointed to by C<s>; or
+
+=item L</C<UTF8_CHK_SKIP>> if you don't know it.
+
+=back
+
+It is better to restructure your code so the end pointer is passed down so that
+you know what it actually is at the point of this call, but if that isn't
+possible, L</C<UTF8_CHK_SKIP>> can minimize the chance of accessing beyond the end
+of the input buffer.
=cut
*/
#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
+
+/*
+=for apidoc Am|STRLEN|UTF8_SKIP|char* s
+This is a synonym for L</C<UTF8SKIP>>
+
+=cut
+*/
+
#define UTF8_SKIP(s) UTF8SKIP(s)
/*
+=for apidoc Am|STRLEN|UTF8_CHK_SKIP|char* s
+
+This is a safer version of L</C<UTF8SKIP>>, but still not as safe as
+L</C<UTF8_SAFE_SKIP>>. This version doesn't blindly assume that the input
+string pointed to by C<s> is well-formed, but verifies that there isn't a NUL
+terminating character before the expected end of the next character in C<s>.
+The length C<UTF8_CHK_SKIP> returns stops just before any such NUL.
+
+Perl tends to add NULs, as an insurance policy, after the end of strings in
+SV's, so it is likely that using this macro will prevent inadvertent reading
+beyond the end of the input buffer, even if it is malformed UTF-8.
+
+This macro is intended to be used by XS modules where the inputs could be
+malformed, and it isn't feasible to restructure to use the safer
+L</C<UTF8_SAFE_SKIP>>, for example when interfacing with a C library.
+
+=cut
+*/
+
+#define UTF8_CHK_SKIP(s) \
+ (s[0] == '\0' ? 1 : MIN(my_strnlen((char *) (s), UTF8SKIP(s))))
+/*
=for apidoc Am|STRLEN|UTF8_SAFE_SKIP|char* s|char* e
returns 0 if S<C<s E<gt>= e>>; otherwise returns the number of bytes in the
* above show, doesn't matter as to its implementation */
#define NATIVE_BYTE_IS_INVARIANT(c) UVCHR_IS_INVARIANT(c)
+/* Misleadingly named: is the UTF8-encoded byte 'c' part of a variant sequence
+ * in UTF-8? This is the inverse of UTF8_IS_INVARIANT. */
+#define UTF8_IS_CONTINUED(c) (__ASSERT_(FITS_IN_8_BITS(c)) \
+ (! UTF8_IS_INVARIANT(c)))
+
/* The macros in the next 4 sets are used to generate the two utf8 or utfebcdic
* bytes from an ordinal that is known to fit into exactly two (not one) bytes;
* it must be less than 0x3FF to work across both encodings. */