+ possible_problems |= UTF8_GOT_CONTINUATION;
+ curlen = 1;
+ uv = UNICODE_REPLACEMENT;
+ goto ready_to_handle_errors;
+ }
+
+ /* Here, the byte is neither a continuation byte nor an invariant. The
+ * only thing left is a start byte (possibly for an overlong) */
+
+ /* Convert to I8 on EBCDIC (no-op on ASCII), then remove the leading bits
+ * that indicate the number of bytes in the character's whole UTF-8
+ * sequence, leaving just the bits that are part of the value. */
+ uv = NATIVE_UTF8_TO_I8(uv) & UTF_START_MASK(expectlen);
+
+ /* Now, loop through the remaining bytes in the character's sequence,
+ * accumulating each into the working value as we go. Be sure to not look
+ * past the end of the input string */
+ send = adjusted_send = (U8*) s0 + ((expectlen <= curlen)
+ ? expectlen
+ : curlen);
+ for (s = s0 + 1; s < send; s++) {
+ if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
+ uv = UTF8_ACCUMULATE(uv, *s);
+ continue;
+ }
+
+ /* Here, found a non-continuation before processing all expected bytes.
+ * This byte indicates the beginning of a new character, so quit, even
+ * if allowing this malformation. */
+ curlen = s - s0; /* Save how many bytes we actually got */
+ possible_problems |= UTF8_GOT_NON_CONTINUATION;
+ goto finish_short;
+ } /* End of loop through the character's bytes */
+
+ /* Save how many bytes were actually in the character */
+ curlen = s - s0;
+
+ /* Did we get all the continuation bytes that were expected? Note that we
+ * know this result even without executing the loop above. But we had to
+ * do the loop to see if there are unexpected non-continuations. */
+ if (UNLIKELY(curlen < expectlen)) {
+ possible_problems |= UTF8_GOT_SHORT;
+
+ finish_short:
+ uv_so_far = uv;
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ /* Note that there are two types of too-short malformation. The first is
+ * when there is actual wrong data before the normal termination of the
+ * sequence. The second is when the sequence wasn't complete before the
+ * end of the data we are allowed to look at, based on the input 'curlen'.
+ * In the second case we were passed data for a partial character, but it
+ * is valid as far as we saw. The first case is definitely invalid. This
+ * distinction could be important to a caller, so the two types are kept
+ * separate. */
+
+ /* Check for overflow */
+ if (UNLIKELY(does_utf8_overflow(s0, send))) {
+ possible_problems |= UTF8_GOT_OVERFLOW;
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ /* Check for overlong. If no problems so far, 'uv' is the correct code
+ * point value. Simply see if it is expressible in fewer bytes. Otherwise
+ * we must look at the UTF-8 byte sequence itself to see if it is for an
+ * overlong */
+ if ( ( LIKELY(! possible_problems)
+ && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
+ || ( UNLIKELY( possible_problems)
+ && ( UNLIKELY(! UTF8_IS_START(*s0))
+ || ( curlen > 1
+ && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
+ send - s0))))))
+ {
+ possible_problems |= UTF8_GOT_LONG;
+
+ /* A convenience macro that matches either of the too-short
+ * conditions. */
+# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+ if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+ UV min_uv = uv_so_far;
+ STRLEN i;
+
+ /* Here, the input is both overlong and is missing some trailing
+ * bytes. There is no single code point it could be for, but there
+ * may be enough information present to determine if what we have
+ * so far is for an unallowed code point, such as for a surrogate.
+ * The code below has the intelligence to determine this, but just
+ * for non-overlong UTF-8 sequences. What we do here is calculate
+ * the smallest code point the input could represent if there were
+ * no too short malformation. Then we compute and save the UTF-8
+ * for that, which is what the code below looks at instead of the
+ * raw input. It turns out that the smallest such code point is
+ * all we need. */
+ for (i = curlen; i < expectlen; i++) {
+ min_uv = UTF8_ACCUMULATE(min_uv,
+ I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
+ }
+
+ Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
+ SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
+ to free it ourselves if
+ warnings are made fatal */
+ adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+ }
+ }
+
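+ /* Now check that the input isn't for a problematic code point not allowed
+ * by the input parameters. When there are no malformations, 'uv' can't
+ * be problematic if it is below UNICODE_SURROGATE_FIRST, which is the
+ * first comparison made below. */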
+ if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
+ || ( UNLIKELY(possible_problems)
+ && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
+ && ((flags & ( UTF8_DISALLOW_NONCHAR
+ |UTF8_DISALLOW_SURROGATE
+ |UTF8_DISALLOW_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT
+ |UTF8_WARN_NONCHAR
+ |UTF8_WARN_SURROGATE
+ |UTF8_WARN_SUPER
+ |UTF8_WARN_ABOVE_31_BIT))
+ /* In case of a malformation, 'uv' is not valid, and has
+ * been changed to something in the Unicode range.
+ * Currently we don't output a deprecation message if there
+ * is already a malformation, so we don't have to special
+ * case the test immediately below */
+ || ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+ && ckWARN_d(WARN_DEPRECATED))))
+ {
+ /* If there were no malformations, or the only malformation is an
+ * overlong, 'uv' is valid */
+ if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+ possible_problems |= UTF8_GOT_SURROGATE;
+ }
+ else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+ possible_problems |= UTF8_GOT_NONCHAR;
+ }
+ }
+ else { /* Otherwise, need to look at the source UTF-8, possibly
+ adjusted to be non-overlong */
+
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+ >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
+ {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (curlen > 1) {
+ if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ {
+ possible_problems |= UTF8_GOT_SURROGATE;
+ }
+ }