+ /* Not 3-byte; that means the code point is at least 0x1_0000 on ASCII
+ * platforms, and 0x4000 on EBCDIC. There are problematic cases that can
+ * happen starting with 4-byte characters on ASCII platforms. We unify the
+ * code for these with EBCDIC, even though some of them require 5 bytes on
+ * those, because khw believes the code saving is worth the very slight
+ * performance hit on these high EBCDIC code points. */
+
+ if (UNLIKELY(UNICODE_IS_SUPER(uv))) {
+ if ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+ && ckWARN_d(WARN_DEPRECATED))
+ {
+ Perl_warner(aTHX_ packWARN(WARN_DEPRECATED),
+ cp_above_legal_max, uv, MAX_NON_DEPRECATED_CP);
+ }
+ if ( (flags & UNICODE_WARN_SUPER)
+ || ( UNICODE_IS_ABOVE_31_BIT(uv)
+ && (flags & UNICODE_WARN_ABOVE_31_BIT)))
+ {
+ Perl_ck_warner_d(aTHX_ packWARN(WARN_NON_UNICODE),
+
+ /* Choose the more dire applicable warning */
+ (UNICODE_IS_ABOVE_31_BIT(uv))
+ ? "Code point 0x%"UVXf" is not Unicode, and not portable"
+ : "Code point 0x%"UVXf" is not Unicode, may not be portable",
+ uv);
+ }
+ if (flags & UNICODE_DISALLOW_SUPER
+ || ( UNICODE_IS_ABOVE_31_BIT(uv)
+ && (flags & UNICODE_DISALLOW_ABOVE_31_BIT)))
+ {
+ return NULL;
+ }
+ }
+ else if (UNLIKELY(UNICODE_IS_END_PLANE_NONCHAR_GIVEN_NOT_SUPER(uv))) {
+ HANDLE_UNICODE_NONCHAR(uv, flags);
+ }
+
+ /* Test for and handle 4-byte result. In the test immediately below, the
+ * 8 is for start bytes F0-F7 (which are all the possible ones for 4-byte
+ * characters). The 3 is for 3 continuation bytes; these each contribute
+ * SHIFT bits. This yields 0x4_0000 on EBCDIC platforms, 0x20_0000 on
+ * ASCII, so 4 bytes covers the range 0x4000-0x3_FFFF on EBCDIC;
+ * 0x1_0000-0x1F_FFFF on ASCII */
+ if (uv < (8 * (1U << (3 * SHIFT)))) {
+ *d++ = I8_TO_NATIVE_UTF8(( uv >> ((4 - 1) * SHIFT)) | UTF_START_MARK(4));
+ *d++ = I8_TO_NATIVE_UTF8(((uv >> ((3 - 1) * SHIFT)) & MASK) | MARK);
+ *d++ = I8_TO_NATIVE_UTF8(((uv >> ((2 - 1) * SHIFT)) & MASK) | MARK);
+ *d++ = I8_TO_NATIVE_UTF8(( uv /* (1 - 1) */ & MASK) | MARK);
+
+#ifdef EBCDIC /* These were handled on ASCII platforms in the code for 3-byte
+ characters. The end-plane non-characters for EBCDIC were
+ handled just above */
+ if (UNLIKELY(UNICODE_IS_32_CONTIGUOUS_NONCHARS(uv))) {
+ HANDLE_UNICODE_NONCHAR(uv, flags);
+ }
+ else if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+ HANDLE_UNICODE_SURROGATE(uv, flags);
+ }
+#endif
+
+ return d;
+ }
+
+ /* Not 4-byte; that means the code point is at least 0x20_0000 on ASCII
+ * platforms, and 0x4_0000 on EBCDIC. At this point we switch to a loop
+ * format. The unrolled version above turns out to not save all that much
+ * time, and at these high code points (well above the legal Unicode range
+ * on ASCII platforms, and well above anything in common use in EBCDIC),
+ * khw believes that less code outweighs slight performance gains. */
+