+ /* We need a complete well-formed UTF-8 character to discern
+ * non-characters, so can't look for them here */
+ }
+ }
+
+ ready_to_handle_errors:
+
+ /* At this point:
+ * curlen contains the number of bytes in the sequence that
+ * this call should advance the input by.
+ * avail_len gives the available number of bytes passed in, but
+ * only if this is less than the expected number of
+ * bytes, based on the code point's start byte.
+ * possible_problems' is 0 if there weren't any problems; otherwise a bit
+ * is set in it for each potential problem found.
+ * uv contains the code point the input sequence
+ * represents; or if there is a problem that prevents
+ * a well-defined value from being computed, it is
+ * some subsitute value, typically the REPLACEMENT
+ * CHARACTER.
+ * s0 points to the first byte of the character
+ * send points to just after where that (potentially
+ * partial) character ends
+ * adjusted_s0 normally is the same as s0, but in case of an
+ * overlong for which the UTF-8 matters below, it is
+ * the first byte of the shortest form representation
+ * of the input.
+ * adjusted_send normally is the same as 'send', but if adjusted_s0
+ * is set to something other than s0, this points one
+ * beyond its end
+ */
+
+ if (UNLIKELY(possible_problems)) {
+ bool disallowed = FALSE;
+ const U32 orig_problems = possible_problems;
+
+ while (possible_problems) { /* Handle each possible problem */
+ UV pack_warn = 0;
+ char * message = NULL;
+
+ /* Each 'if' clause handles one problem. They are ordered so that
+ * the first ones' messages will be displayed before the later
+ * ones; this is kinda in decreasing severity order */
+ if (possible_problems & UTF8_GOT_OVERFLOW) {
+
+ /* Overflow means also got a super and above 31 bits, but we
+ * handle all three cases here */
+ possible_problems
+ &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+ *errors |= UTF8_GOT_OVERFLOW;
+
+ /* But the API says we flag all errors found */
+ if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
+ *errors |= UTF8_GOT_SUPER;
+ }
+ if (flags
+ & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT))
+ {
+ *errors |= UTF8_GOT_ABOVE_31_BIT;
+ }
+
+ /* Disallow if any of the three categories say to */
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & ( UTF8_DISALLOW_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT)))
+ {
+ disallowed = TRUE;
+ }
+
+
+ /* Likewise, warn if any say to, plus if deprecation warnings
+ * are on, because this code point is above IV_MAX */
+ if ( ckWARN_d(WARN_DEPRECATED)
+ || ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+ {
+
+ /* The warnings code explicitly says it doesn't handle the
+ * case of packWARN2 and two categories which have
+ * parent-child relationship. Even if it works now to
+ * raise the warning if either is enabled, it wouldn't
+ * necessarily do so in the future. We output (only) the
+ * most dire warning*/
+ if (! (flags & UTF8_CHECK_ONLY)) {
+ if (ckWARN_d(WARN_UTF8)) {
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ else if (ckWARN_d(WARN_NON_UNICODE)) {
+ pack_warn = packWARN(WARN_NON_UNICODE);
+ }
+ if (pack_warn) {
+ message = Perl_form(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0, 0));
+ }
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_EMPTY) {
+ possible_problems &= ~UTF8_GOT_EMPTY;
+ *errors |= UTF8_GOT_EMPTY;
+
+ if (! (flags & UTF8_ALLOW_EMPTY)) {
+
+ /* This so-called malformation is now treated as a bug in
+ * the caller. If you have nothing to decode, skip calling
+ * this function */
+ assert(0);
+
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_ "%s (empty string)",
+ malformed_text);
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_CONTINUATION) {
+ possible_problems &= ~UTF8_GOT_CONTINUATION;
+ *errors |= UTF8_GOT_CONTINUATION;
+
+ if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1, 0), *s0);
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SHORT) {
+ possible_problems &= ~UTF8_GOT_SHORT;
+ *errors |= UTF8_GOT_SHORT;
+
+ if (! (flags & UTF8_ALLOW_SHORT)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (too short; %d byte%s available, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0, 0),
+ (int)avail_len,
+ avail_len == 1 ? "" : "s",
+ (int)expectlen);
+ }
+ }
+
+ }
+ else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+ possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+ *errors |= UTF8_GOT_NON_CONTINUATION;
+
+ if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+
+ /* If we don't know for sure that the input length is
+ * valid, avoid as much as possible reading past the
+ * end of the buffer */
+ int printlen = (flags & _UTF8_NO_CONFIDENCE_IN_CURLEN)
+ ? s - s0
+ : send - s0;
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_ "%s",
+ unexpected_non_continuation_text(s0,
+ printlen,
+ s - s0,
+ (int) expectlen));
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_LONG) {
+ possible_problems &= ~UTF8_GOT_LONG;
+ *errors |= UTF8_GOT_LONG;
+
+ if (flags & UTF8_ALLOW_LONG) {
+
+ /* We don't allow the actual overlong value, unless the
+ * special extra bit is also set */
+ if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
+ & ~UTF8_ALLOW_LONG)))
+ {
+ uv = UNICODE_REPLACEMENT;
+ }
+ }
+ else {
+ disallowed = TRUE;
+
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+
+ /* These error types cause 'uv' to be something that
+ * isn't what was intended, so can't use it in the
+ * message. The other error types either can't
+ * generate an overlong, or else the 'uv' is valid */
+ if (orig_problems &
+ (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+ {
+ message = Perl_form(aTHX_
+ "%s: %s (any UTF-8 sequence that starts"
+ " with \"%s\" is overlong which can and"
+ " should be represented with a"
+ " different, shorter sequence)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(s0, curlen, 0));
+ }
+ else {
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+ uv, 0);
+ message = Perl_form(aTHX_
+ "%s: %s (overlong; instead use %s to represent"
+ " U+%0*" UVXf ")",
+ malformed_text,
+ _byte_dump_string(s0, send - s0, 0),
+ _byte_dump_string(tmpbuf, e - tmpbuf, 0),
+ ((uv < 256) ? 2 : 4), /* Field width of 2 for
+ small code points */
+ uv);
+ }
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SURROGATE) {
+ possible_problems &= ~UTF8_GOT_SURROGATE;
+
+ if (flags & UTF8_WARN_SURROGATE) {
+ *errors |= UTF8_GOT_SURROGATE;
+
+ if ( ! (flags & UTF8_CHECK_ONLY)
+ && ckWARN_d(WARN_SURROGATE))
+ {
+ pack_warn = packWARN(WARN_SURROGATE);
+
+ /* These are the only errors that can occur with a
+ * surrogate when the 'uv' isn't valid */
+ if (orig_problems & UTF8_GOT_TOO_SHORT) {
+ message = Perl_form(aTHX_
+ "UTF-16 surrogate (any UTF-8 sequence that"
+ " starts with \"%s\" is for a surrogate)",
+ _byte_dump_string(s0, curlen, 0));
+ }
+ else {
+ message = Perl_form(aTHX_
+ "UTF-16 surrogate U+%04" UVXf, uv);
+ }
+ }
+ }