utf8.c: Generalize static fcn return for indeterminate result

author Karl Williamson <khw@cpan.org>

Sat, 1 Jul 2017 12:43:34 +0000 (06:43 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
author Karl Williamson <khw@cpan.org>
Sat, 1 Jul 2017 12:43:34 +0000 (06:43 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
diff --git a/embed.fnc b/embed.fnc

index 6799895..4af3550 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1723,7 +1723,8 @@ EpM       |char * |_byte_dump_string                                      \
                                 |const bool format
  #if defined(PERL_IN_UTF8_C)
  inR    |bool   |does_utf8_overflow|NN const U8 * const s|NN const U8 * e
-inR    |bool   |is_utf8_overlong_given_start_byte_ok|NN const U8 * const s|const STRLEN len
+inR    |int    |is_utf8_overlong_given_start_byte_ok|NN const U8 * const s \
+                                                    |const STRLEN len
  inR    |int    |isFF_OVERLONG  |NN const U8 * const s|const STRLEN len
  sMR    |char * |unexpected_non_continuation_text                       \
                 |NN const U8 * const s                                  \
diff --git a/proto.h b/proto.h

index dd87483..db8c39c 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -5846,7 +5846,7 @@ PERL_STATIC_INLINE bool   S_is_utf8_common_with_len(pTHX_ const U8 *const p, const
  #endif
  
  #ifndef PERL_NO_INLINE_FUNCTIONS
-PERL_STATIC_INLINE bool        S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
+PERL_STATIC_INLINE int S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
                         __attribute__warn_unused_result__;
  #define PERL_ARGS_ASSERT_IS_UTF8_OVERLONG_GIVEN_START_BYTE_OK  \
         assert(s)
diff --git a/utf8.c b/utf8.c

index c05866a..50ce466 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -558,20 +558,25 @@ S_does_utf8_overflow(const U8 * const s, const U8 * e)
      return FALSE;
  }
  
-PERL_STATIC_INLINE bool
+PERL_STATIC_INLINE int
  S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  {
-    /* Overlongs can occur whenever the number of continuation bytes
-     * changes.  That means whenever the number of leading 1 bits in a start
-     * byte increases from the next lower start byte.  That happens for start
-     * bytes C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following
-     * illegal start bytes have already been excluded, so don't need to be
-     * tested here;
+    /* Returns an int indicating whether or not the UTF-8 sequence from 's' to
+     * 's' + 'len' - 1 is an overlong.  It returns 1 if it is an overlong; 0 if
+     * it isn't, and -1 if there isn't enough information to tell.  This last
+     * return value can happen if the sequence is incomplete, missing some
+     * trailing bytes that would form a complete character.  If there are
+     * enough bytes to make a definitive decision, this function does so.
+     * Usually 2 bytes are sufficient.
+     *
+     * Overlongs can occur whenever the number of continuation bytes changes.
+     * That means whenever the number of leading 1 bits in a start byte
+     * increases from the next lower start byte.  That happens for start bytes
+     * C0, E0, F0, F8, FC, FE, and FF.  On modern perls, the following illegal
+     * start bytes have already been excluded, so don't need to be tested here;
       * ASCII platforms: C0, C1
       * EBCDIC platforms C0, C1, C2, C3, C4, E0
-     *
-     * At least a second byte is required to determine if other sequences will
-     * be an overlong. */
+     */
  
      const U8 s0 = NATIVE_UTF8_TO_I8(s[0]);
      const U8 s1 = NATIVE_UTF8_TO_I8(s[1]);
@@ -596,7 +601,7 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
  #       else
  
      if (s0 == 0xE0 && UNLIKELY(s1 < 0xA0)) {
-        return TRUE;
+        return 1;
      }
  
  #           define F0_ABOVE_OVERLONG 0x90
@@ -612,11 +617,11 @@ S_is_utf8_overlong_given_start_byte_ok(const U8 * const s, const STRLEN len)
          || (s0 == 0xFC && UNLIKELY(s1 < FC_ABOVE_OVERLONG))
          || (s0 == 0xFE && UNLIKELY(s1 < FE_ABOVE_OVERLONG)))
      {
-        return TRUE;
+        return 1;
      }
  
      /* Check for the FF overlong */
-    return isFF_OVERLONG(s, len) > 0;
+    return isFF_OVERLONG(s, len);
  }
  
  PERL_STATIC_INLINE int
@@ -799,7 +804,7 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  
      /* Here is syntactically valid.  Next, make sure this isn't the start of an
       * overlong. */
-    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len)) {
+    if (len > 1 && is_utf8_overlong_given_start_byte_ok(s, len) > 0) {
          return 0;
      }
  
@@ -1303,7 +1308,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
          || (       UNLIKELY(possible_problems)
              && (   UNLIKELY(! UTF8_IS_START(*s0))
                  || (   curlen > 1
-                    && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
+                    && UNLIKELY(0 < is_utf8_overlong_given_start_byte_ok(s0,
                                                                  s - s0))))))
      {
          possible_problems |= UTF8_GOT_LONG;
author	Karl Williamson <khw@cpan.org>
	Sat, 1 Jul 2017 12:43:34 +0000 (06:43 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
embed.fnc		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history