+ /* If there were no malformations, or the only malformation is an
+ * overlong, 'uv' is valid */
+ if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+ possible_problems |= UTF8_GOT_SURROGATE;
+ }
+ else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+ possible_problems |= UTF8_GOT_NONCHAR;
+ }
+ }
+ else { /* Otherwise, need to look at the source UTF-8, possibly
+ adjusted to be non-overlong */
+
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+ >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+ {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (curlen > 1) {
+ if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ {
+ possible_problems |= UTF8_GOT_SURROGATE;
+ }
+ }
+
+ /* We need a complete well-formed UTF-8 character to discern
+ * non-characters, so can't look for them here */
+ }
+ }
+
+ ready_to_handle_errors:
+
+ /* At this point:
+ * curlen contains the number of bytes in the sequence that
+ * this call should advance the input by.
+ * avail_len gives the available number of bytes passed in, but
+ * only if this is less than the expected number of
+ * bytes, based on the code point's start byte.
+ * possible_problems' is 0 if there weren't any problems; otherwise a bit
+ * is set in it for each potential problem found.
+ * uv contains the code point the input sequence
+ * represents; or if there is a problem that prevents
+ * a well-defined value from being computed, it is
+ * some subsitute value, typically the REPLACEMENT
+ * CHARACTER.
+ * s0 points to the first byte of the character
+ * send points to just after where that (potentially
+ * partial) character ends
+ * adjusted_s0 normally is the same as s0, but in case of an
+ * overlong for which the UTF-8 matters below, it is
+ * the first byte of the shortest form representation
+ * of the input.
+ * adjusted_send normally is the same as 'send', but if adjusted_s0
+ * is set to something other than s0, this points one
+ * beyond its end
+ */
+
+ if (UNLIKELY(possible_problems)) {
+ bool disallowed = FALSE;
+ const U32 orig_problems = possible_problems;
+
+ while (possible_problems) { /* Handle each possible problem */
+ UV pack_warn = 0;
+ char * message = NULL;
+
+ /* Each 'if' clause handles one problem. They are ordered so that
+ * the first ones' messages will be displayed before the later
+ * ones; this is kinda in decreasing severity order */
+ if (possible_problems & UTF8_GOT_OVERFLOW) {
+
+ /* Overflow means also got a super and above 31 bits, but we
+ * handle all three cases here */
+ possible_problems
+ &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+ *errors |= UTF8_GOT_OVERFLOW;
+
+ /* But the API says we flag all errors found */
+ if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
+ *errors |= UTF8_GOT_SUPER;
+ }
+ if (flags
+ & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT))
+ {
+ *errors |= UTF8_GOT_ABOVE_31_BIT;
+ }
+
+ /* Disallow if any of the three categories say to */
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & ( UTF8_DISALLOW_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT)))
+ {
+ disallowed = TRUE;
+ }
+
+
+ /* Likewise, warn if any say to, plus if deprecation warnings
+ * are on, because this code point is above IV_MAX */
+ if ( ckWARN_d(WARN_DEPRECATED)
+ || ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+ {
+
+ /* The warnings code explicitly says it doesn't handle the
+ * case of packWARN2 and two categories which have
+ * parent-child relationship. Even if it works now to
+ * raise the warning if either is enabled, it wouldn't
+ * necessarily do so in the future. We output (only) the
+ * most dire warning*/
+ if (! (flags & UTF8_CHECK_ONLY)) {
+ if (ckWARN_d(WARN_UTF8)) {
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ else if (ckWARN_d(WARN_NON_UNICODE)) {
+ pack_warn = packWARN(WARN_NON_UNICODE);
+ }
+ if (pack_warn) {
+ message = Perl_form(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0));
+ }
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_EMPTY) {
+ possible_problems &= ~UTF8_GOT_EMPTY;
+ *errors |= UTF8_GOT_EMPTY;
+
+ if (! (flags & UTF8_ALLOW_EMPTY)) {
+
+ /* This so-called malformation is now treated as a bug in
+ * the caller. If you have nothing to decode, skip calling
+ * this function */
+ assert(0);
+
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_ "%s (empty string)",
+ malformed_text);
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_CONTINUATION) {
+ possible_problems &= ~UTF8_GOT_CONTINUATION;
+ *errors |= UTF8_GOT_CONTINUATION;
+
+ if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1), *s0);
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SHORT) {
+ possible_problems &= ~UTF8_GOT_SHORT;
+ *errors |= UTF8_GOT_SHORT;
+
+ if (! (flags & UTF8_ALLOW_SHORT)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (too short; %d byte%s available, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ (int)avail_len,
+ avail_len == 1 ? "" : "s",
+ (int)expectlen);
+ }
+ }