#define FOLD_FLAGS_FULL 0x2
#define FOLD_FLAGS_NOMIX_ASCII 0x4
-/* For _core_swash_init(), internal core use only */
-#define _CORE_SWASH_INIT_USER_DEFINED_PROPERTY 0x1
-#define _CORE_SWASH_INIT_RETURN_IF_UNDEF 0x2
-#define _CORE_SWASH_INIT_ACCEPT_INVLIST 0x4
-
/*
=head1 Unicode Support
L<perlguts/Unicode Support> has an introduction to this API.
#define uvchr_to_utf8_flags_msgs(d,uv,flags,msgs) \
uvoffuni_to_utf8_flags_msgs(d,NATIVE_TO_UNI(uv),flags, msgs)
+/* Forwards to utf8_to_uvchr_buf_helper() with 's' and 'e' cast to
+ * 'const U8 *'.  Each argument is parenthesized before being cast so that an
+ * expression argument (e.g. 'p + n' or 'c ? a : b') is cast as a whole,
+ * rather than having the cast bind only to its first operand. */
#define utf8_to_uvchr_buf(s, e, lenp) \
- (__ASSERT_((U8*) (e) > (U8*) (s)) \
- utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp, \
- ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY))
+ utf8_to_uvchr_buf_helper((const U8 *) (s), (const U8 *) (e), lenp)
#define utf8n_to_uvchr(s, len, lenp, flags) \
utf8n_to_uvchr_error(s, len, lenp, flags, 0)
#define utf8n_to_uvchr_error(s, len, lenp, flags, errors) \
#else /* ! EBCDIC */
START_EXTERN_C
-/* How wide can a single UTF-8 encoded character become in bytes. */
-/* NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
- * is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
- * expressed with 4 bytes. However, Perl thinks of UTF-8 as a way to encode
- * non-negative integers in a binary format, even those above Unicode */
+/*
+
+=for apidoc AmnU|STRLEN|UTF8_MAXBYTES
+
+The maximum width of a single UTF-8 encoded character, in bytes.
+
+NOTE: Strictly speaking Perl's UTF-8 should not be called UTF-8 since UTF-8
+is an encoding of Unicode, and Unicode's upper limit, 0x10FFFF, can be
+expressed with 4 bytes. However, Perl thinks of UTF-8 as a way to encode
+non-negative integers in a binary format, even those above Unicode.
+
+=cut
+ */
#define UTF8_MAXBYTES 13
#ifdef DOINIT
#define PERL_SMALL_MACRO_BUFFER
#endif
-/* Native character to/from iso-8859-1. Are the identity functions on ASCII
- * platforms */
+/*
+
+=for apidoc Am|U8|NATIVE_TO_LATIN1|U8 ch
+
+Returns the Latin-1 (including ASCII and control characters) equivalent of the
+input native code point given by C<ch>. Thus, C<NATIVE_TO_LATIN1(193)> on
+EBCDIC platforms returns 65. These each represent the character C<"A"> on
+their respective platforms. On ASCII platforms no conversion is needed, so
+this macro expands to just its input, adding neither time nor space
+requirements to the implementation.
+
+For conversion of code points potentially larger than will fit in a character,
+use L</NATIVE_TO_UNI>.
+
+=for apidoc Am|U8|LATIN1_TO_NATIVE|U8 ch
+
+Returns the native equivalent of the input Latin-1 code point (including ASCII
+and control characters) given by C<ch>. Thus, C<LATIN1_TO_NATIVE(66)> on
+EBCDIC platforms returns 194. These each represent the character C<"B"> on
+their respective platforms. On ASCII platforms no conversion is needed, so
+this macro expands to just its input, adding neither time nor space
+requirements to the implementation.
+
+For conversion of code points potentially larger than will fit in a character,
+use L</UNI_TO_NATIVE>.
+
+=for apidoc Am|UV|NATIVE_TO_UNI|UV ch
+
+Returns the Unicode equivalent of the input native code point given by C<ch>.
+Thus, C<NATIVE_TO_UNI(195)> on EBCDIC platforms returns 67. These each
+represent the character C<"C"> on their respective platforms. On ASCII
+platforms no conversion is needed, so this macro expands to just its input,
+adding neither time nor space requirements to the implementation.
+
+=for apidoc Am|UV|UNI_TO_NATIVE|UV ch
+
+Returns the native equivalent of the input Unicode code point given by C<ch>.
+Thus, C<UNI_TO_NATIVE(68)> on EBCDIC platforms returns 196. These each
+represent the character C<"D"> on their respective platforms. On ASCII
+platforms no conversion is needed, so this macro expands to just its input,
+adding neither time nor space requirements to the implementation.
+
+=cut
+*/
+
#ifdef PERL_SMALL_MACRO_BUFFER
#define NATIVE_TO_LATIN1(ch) ((U8)(ch))
#define LATIN1_TO_NATIVE(ch) ((U8)(ch))
#define I8_TO_NATIVE_UTF8(ch) (__ASSERT_(FITS_IN_8_BITS(ch)) ((U8) (ch)))
#endif
-/* Transforms in wide UV chars */
#define UNI_TO_NATIVE(ch) ((UV) (ch))
#define NATIVE_TO_UNI(ch) ((UV) (ch))
the paradigm that the number of leading bits gives how many total bytes there
are in the character.
+=cut
*/
/* Is the representation of the Unicode code point 'cp' the same regardless of
#define UTF8SKIP(s) PL_utf8skip[*(const U8*)(s)]
#define UTF8_SKIP(s) UTF8SKIP(s)
+/*
+
+=for apidoc Am|STRLEN|UTF8_SAFE_SKIP|char* s|char* e
+
+Returns 0 if S<C<s E<gt>= e>>.  Otherwise returns the number of bytes in the
+UTF-8 encoded character whose first byte is pointed to by C<s>, capped at the
+number of bytes remaining before C<e>, so the result never reaches beyond
+C<e>.  On DEBUGGING builds, it asserts that S<C<s E<lt>= e>>.
+
+=cut
+ */
+#define UTF8_SAFE_SKIP(s, e) (__ASSERT_((e) >= (s)) \
+ ((e) - (s)) > 0 \
+ ? MIN(((e) - (s)), UTF8_SKIP(s)) \
+ : 0)
+
/* Most code that says 'UNI_' really means the native value for code points up
* through 255 */
#define UNI_IS_INVARIANT(cp) UVCHR_IS_INVARIANT(cp)
&& ( NATIVE_UTF8_TO_I8(*(s)) > 0xF9 \
|| (NATIVE_UTF8_TO_I8(*((s) + 1)) >= 0xA2)) \
&& LIKELY((s) + UTF8SKIP(s) <= (e))) \
- ? _is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
+ ? is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
#else
+/* Evaluates to 0 unless more than 3 bytes are available before 'e', the
+ * first byte is at least 0xF4 (and, when it is exactly 0xF4, the second byte
+ * is at least 0x90), and the whole character as measured by UTF8SKIP() ends
+ * at or before 'e'.  When all of that holds, the result is whatever
+ * is_utf8_char_helper() returns for that span.  's' is parenthesized wherever
+ * it takes part in pointer arithmetic so that an expression argument groups
+ * as a unit. */
# define UTF8_IS_SUPER(s, e) \
(( LIKELY((e) > (s) + 3) \
&& (*(U8*) (s)) >= 0xF4 \
&& ((*(U8*) (s)) > 0xF4 || (*((U8*) (s) + 1) >= 0x90))\
&& LIKELY((s) + UTF8SKIP(s) <= (e))) \
- ? _is_utf8_char_helper(s, s + UTF8SKIP(s), 0) : 0)
+ ? is_utf8_char_helper(s, (s) + UTF8SKIP(s), 0) : 0)
#endif
/* These are now machine generated, and the 'given' clause is no longer
non-zero, the value gives how many bytes starting at C<s> comprise the code
point's representation.
+=for apidoc AmnU|UV|UNICODE_REPLACEMENT
+
+Evaluates to 0xFFFD, the code point of the Unicode REPLACEMENT CHARACTER.
+
=cut
*/
#define UTF8_IS_NONCHAR(s, e) \
? 1 \
: UNLIKELY(((e) - (s)) < UTF8SKIP(s)) \
? 0 \
- : _is_utf8_char_helper(s, e, flags))
+ : is_utf8_char_helper(s, e, flags))
/* Do not use; should be deprecated. Use isUTF8_CHAR() instead; this is
* retained solely for backwards compatibility */