+/* First and last code points of the Unicode surrogate range; both ends are
+ * themselves surrogates */
+#define UNICODE_SURROGATE_FIRST 0xD800
+#define UNICODE_SURROGATE_LAST 0xDFFF
+
+/*
+=for apidoc Am|bool|UNICODE_IS_SURROGATE|const UV uv
+
+Returns a boolean as to whether or not C<uv> is one of the Unicode surrogate
+code points.
+
+=for apidoc Am|bool|UTF8_IS_SURROGATE|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents one
+of the Unicode surrogate code points; otherwise it evaluates to 0. If
+non-zero, the value gives how many bytes starting at C<s> comprise the code
+point's representation.
+
+=cut
+ */
+
+/* The code-point form is a range test against the two bounds defined above;
+ * the UTF-8 form hands the whole job to is_SURROGATE_utf8_safe() */
+#define UNICODE_IS_SURROGATE(uv) \
+ UNLIKELY(inRANGE(uv, UNICODE_SURROGATE_FIRST, UNICODE_SURROGATE_LAST))
+#define UTF8_IS_SURROGATE(s, e) is_SURROGATE_utf8_safe(s, e)
+
+/*
+
+=for apidoc AmnU|UV|UNICODE_REPLACEMENT
+
+Evaluates to 0xFFFD, the code point of the Unicode REPLACEMENT CHARACTER.
+
+=for apidoc Am|bool|UNICODE_IS_REPLACEMENT|const UV uv
+
+Returns a boolean as to whether or not C<uv> is the Unicode REPLACEMENT
+CHARACTER.
+
+=for apidoc Am|bool|UTF8_IS_REPLACEMENT|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents the
+Unicode REPLACEMENT CHARACTER; otherwise it evaluates to 0. Note that the
+non-zero value is just a truth value (the result of a C<&&>); it does not give
+the number of bytes starting at C<s> that comprise the code point's
+representation.
+
+=cut
+ */
+/* The code point itself, a test of a code point against it, and a test of
+ * whether the bytes starting at 's' (and ending before 'e') begin with
+ * REPLACEMENT_CHARACTER_UTF8. In the last one the length comparison comes
+ * first, so memEQ() is reached only when enough bytes are available. */
+#define UNICODE_REPLACEMENT 0xFFFD
+#define UNICODE_IS_REPLACEMENT(uv) UNLIKELY((UV) (uv) == UNICODE_REPLACEMENT)
+#define UTF8_IS_REPLACEMENT(s, e) \
+ UNLIKELY( \
+ ((e) - (s)) >= ((SSize_t)(sizeof(REPLACEMENT_CHARACTER_UTF8) - 1)) \
+ && memEQ((s), REPLACEMENT_CHARACTER_UTF8, \
+ sizeof(REPLACEMENT_CHARACTER_UTF8) - 1))
+
+/* Max legal code point according to Unicode. UNICODE_IS_SUPER() below tests
+ * for values above this. */
+#define PERL_UNICODE_MAX 0x10FFFF
+
+/*
+
+=for apidoc Am|bool|UNICODE_IS_SUPER|const UV uv
+
+Returns a boolean as to whether or not C<uv> is above the maximum legal Unicode
+code point of U+10FFFF.
+
+=cut
+*/
+
+/* The comparison is made after casting 'uv' to UV. NOTE(review): presumably
+ * UV is unsigned, so that a negative signed argument compares as a very large
+ * value and is reported as above-Unicode -- confirm against UV's definition. */
+#define UNICODE_IS_SUPER(uv) UNLIKELY((UV) (uv) > PERL_UNICODE_MAX)
+
+/*
+=for apidoc Am|bool|UTF8_IS_SUPER|const U8 *s|const U8 *e
+
+Recall that Perl recognizes an extension to UTF-8 that can encode code
+points larger than the ones defined by Unicode, which are 0..0x10FFFF.
+
+This macro evaluates to non-zero if the first few bytes of the string starting
+at C<s> and looking no further than S<C<e - 1>> are from this UTF-8 extension;
+otherwise it evaluates to 0. If non-zero, the return is how many bytes
+starting at C<s> comprise the code point's representation.
+
+0 is returned if the bytes are not well-formed extended UTF-8, or if they
+represent a code point that cannot fit in a UV on the current platform. Hence
+this macro can give different results when run on a 64-bit word machine than on
+one with a 32-bit word size.
+
+Note that it is illegal in Perl to have code points that are larger than what can
+fit in an IV on the current machine; and illegal in Unicode to have any that
+this macro matches.
+
+=cut
+
+ * ASCII EBCDIC I8
+ * U+10FFFF: \xF4\x8F\xBF\xBF \xF9\xA1\xBF\xBF\xBF max legal Unicode
+ * U+110000: \xF4\x90\x80\x80 \xF9\xA2\xA0\xA0\xA0
+ * U+110001: \xF4\x90\x80\x81 \xF9\xA2\xA0\xA0\xA1
+ */
+/* Start byte and first continuation byte of the representation of U+110000
+ * (PERL_UNICODE_MAX + 1), the lowest above-Unicode code point; the table
+ * above gives the resulting byte values. NOTE(review): the second argument,
+ * 21, is presumably the number of bits U+110000 occupies -- confirm against
+ * UTF_START_BYTE / UTF_FIRST_CONT_BYTE. */
+#define UTF_START_BYTE_110000_ UTF_START_BYTE(PERL_UNICODE_MAX + 1, 21)
+#define UTF_FIRST_CONT_BYTE_110000_ \
+ UTF_FIRST_CONT_BYTE(PERL_UNICODE_MAX + 1, 21)
+/* A cheap screen is applied first: there must be at least UNISKIP_BY_MSB_(20)
+ * bytes available, and the first two bytes (converted with
+ * NATIVE_UTF8_TO_I8) must be at least those of U+110000. Only sequences that
+ * pass the screen are handed to isUTF8_CHAR(), whose value becomes the
+ * result; everything else yields 0.
+ *
+ * The whole expansion is enclosed in parentheses so that the conditional
+ * cannot re-associate with operators at the point of use (e.g. a leading '!'
+ * or a trailing '&&'), and 's' is parenthesized where it is subscripted so
+ * that an argument such as 'p + 1' is indexed as a unit. 's' and 'e' are each
+ * evaluated more than once, so they must not have side effects. */
+#define UTF8_IS_SUPER(s, e) \
+ (( ((e) - (s)) >= UNISKIP_BY_MSB_(20) \
+ && ( NATIVE_UTF8_TO_I8((s)[0]) >= UTF_START_BYTE_110000_ \
+ && ( NATIVE_UTF8_TO_I8((s)[0]) > UTF_START_BYTE_110000_ \
+ || NATIVE_UTF8_TO_I8((s)[1]) >= UTF_FIRST_CONT_BYTE_110000_))) \
+ ? isUTF8_CHAR(s, e) \
+ : 0)
+
+/*
+=for apidoc Am|bool|UNICODE_IS_NONCHAR|const UV uv
+
+Returns a boolean as to whether or not C<uv> is one of the Unicode
+non-character code points.
+
+=cut
+*/
+
+/* True for the 32 noncharacters that form a single contiguous block,
+ * 0xFDD0 through 0xFDEF */
+#define UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \
+ UNLIKELY(inRANGE(uv, 0xFDD0, 0xFDEF))
+
+/* True for the 34 noncharacters that end each plane: 0xFFFE, 0xFFFF, 0x1FFFE,
+ * 0x1FFFF, ... 0x10FFFE, 0x10FFFF. Only the low 16 bits are examined, so the
+ * answer is meaningful only when the caller already knows that 'uv' is not
+ * above the Unicode legal max. */
+#define UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
+ UNLIKELY(((UV) (uv) & 0xFFFE) == 0xFFFE)
+
+/* Either kind of noncharacter. The plane-ending test is qualified here by the
+ * not-above-Unicode check that it leaves to its caller. Both helpers already
+ * mark their result UNLIKELY, so no further hint is wrapped around them. */
+#define UNICODE_IS_NONCHAR(uv) \
+ ( UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv) \
+ || ( UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv) \
+ && LIKELY(! UNICODE_IS_SUPER(uv))))
+
+/*
+=for apidoc Am|bool|UTF8_IS_NONCHAR|const U8 *s|const U8 *e
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents one
+of the Unicode non-character code points; otherwise it evaluates to 0. If
+non-zero, the value gives how many bytes starting at C<s> comprise the code
+point's representation.
+
+=cut
+*/
+/* Thin wrapper: the whole test is done by is_NONCHAR_utf8_safe() */
+#define UTF8_IS_NONCHAR(s, e) is_NONCHAR_utf8_safe(s,e)
+
+/* Plain alias for UTF8_IS_NONCHAR(). The test is now machine generated, and
+ * the 'given' clause in this macro's name is no longer applicable. */
+#define UTF8_IS_NONCHAR_GIVEN_THAT_NON_SUPER_AND_GE_PROBLEMATIC(s, e) \
+ UTF8_IS_NONCHAR(s, e)
+
+/* Surrogates, non-character code points and above-Unicode code points are
+ * problematic in some contexts. These macros allow code that needs to check
+ * for those to quickly exclude the vast majority of code points it will
+ * encounter.
+ *
+ * The lowest such code point is the smallest surrogate, U+D800. We calculate
+ * the start byte of that. 0xD800 occupies 16 bits. */
+/* Pre-filters only: a true result means "at or above the first surrogate", not
+ * that the input actually is problematic. The first form compares a code
+ * point; the second compares a single byte 'c' (after NATIVE_UTF8_TO_I8)
+ * with the start byte of U+D800. NOTE(review): 'c' is presumably the first
+ * byte of a UTF-8 encoded character -- confirm against callers. */
+#define isUNICODE_POSSIBLY_PROBLEMATIC(uv) ((uv) >= UNICODE_SURROGATE_FIRST)
+#define isUTF8_POSSIBLY_PROBLEMATIC(c) \
+ (NATIVE_UTF8_TO_I8(c) >= UTF_START_BYTE(UNICODE_SURROGATE_FIRST, 16))
+
+/* Perl extends Unicode so that it is possible to encode (as extended UTF-8 or
+ * UTF-EBCDIC) any 64-bit value. No standard known to khw ever encoded higher
+ * than a 31 bit value. On ASCII platforms this just meant arbitrarily saying
+ * nothing could be higher than this. On these the start byte FD gets you to
+ * 31 bits, and FE and FF are forbidden as start bytes. On EBCDIC platforms,
+ * FD gets you only to 26 bits; adding FE to mean 7 total bytes gets you to 30
+ * bits. To get to 31 bits, they treated an initial FF byte idiosyncratically.
+ * It was considered to be the start byte FE meaning it had 7 total bytes, and
+ * the final 1 was treated as an information bit, getting you to 31 bits.
+ *
+ * Perl used to accept this idiosyncratic interpretation of FF, but now rejects
+ * it in order to get to being able to encode 64 bits. The bottom line is that
+ * it is a Perl extension to use the start bytes FE and FF on ASCII platforms,
+ * and the start byte FF on EBCDIC ones. That translates into that it is a
+ * Perl extension to represent anything occupying more than 31 bits on ASCII
+ * platforms; 30 bits on EBCDIC. */
+/* Code-point form: true when 'uv', cast to UV, is above
+ * nBIT_UMAX(31) on ASCII platforms, nBIT_UMAX(30) on EBCDIC ones, matching
+ * the bit counts given in the comment above. */
+#define UNICODE_IS_PERL_EXTENDED(uv) \
+ UNLIKELY((UV) (uv) > nBIT_UMAX(31 - ONE_IF_EBCDIC_ZERO_IF_NOT))
+/* UTF-8 form: true when UTF8SKIP(s) is more than 6 on ASCII platforms, more
+ * than 7 on EBCDIC ones. NOTE(review): UTF8SKIP(s) is presumably the total
+ * byte count implied by the start byte at 's' -- confirm. */
+#define UTF8_IS_PERL_EXTENDED(s) \
+ (UTF8SKIP(s) > 6 + ONE_IF_EBCDIC_ZERO_IF_NOT)
+
+/* Largest code point we accept from external sources: IV_MAX, cast to UV */
+#define MAX_LEGAL_CP ((UV)IV_MAX)