utf8_to_uvchr_buf() make behavior match docs

[perl5.git] / inline.h
diff --git a/inline.h b/inline.h

index d0e60e4..7d81da6 100644 (file)
--- a/inline.h
+++ b/inline.h
@@ -549,8 +549,8 @@ S__variant_byte_number(PERL_UINTMAX_T word)
       *  x..xx01..1      Subtract 1, turns all the trailing zeros into 1's and
       *                  the 1 just to their left into a 0; the remainder is
       *                  untouched
-     *  0..0011..1      The xor with x..xx10..0 clears that remainder, sets
-     *                  bottom to all 1
+     *  0..0011..1      The xor with the original, x..xx10..0, clears that
+     *                  remainder, sets the bottom to all 1
       *  0..0100..0      Add 1 to clear the word except for the bit in 's'
       *
       * Another method is to do 'word &= -word'; but it generates a compiler
@@ -1013,7 +1013,7 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  
  /*
  
-=for apidoc Am|STRLEN|isUTF8_CHAR|const U8 *s|const U8 *e
+=for apidoc isUTF8_CHAR
  
  Evaluates to non-zero if the first few bytes of the string starting at C<s> and
  looking no further than S<C<e - 1>> are well-formed UTF-8, as extended by Perl,
@@ -1022,7 +1022,7 @@ value gives how many bytes starting at C<s> comprise the code point's
  representation.  Any bytes remaining before C<e>, but beyond the ones needed to
  form the first code point in C<s>, are not examined.
  
-The code point can be any that will fit in a UV on this machine, using Perl's
+The code point can be any that will fit in an IV on this machine, using Perl's
  extension to official UTF-8 to represent those higher than the Unicode maximum
  of 0x10FFFF.  That means that this macro is used to efficiently decide if the
  next few bytes in C<s> is legal UTF-8 for a single character.
@@ -1036,12 +1036,8 @@ code points; and C<L</isUTF8_CHAR_flags>> for a more customized definition.
  Use C<L</is_utf8_string>>, C<L</is_utf8_string_loc>>, and
  C<L</is_utf8_string_loclen>> to check entire strings.
  
-Note that it is deprecated to use code points higher than what will fit in an
-IV.  This macro does not raise any warnings for such code points, treating them
-as valid.
-
-Note also that a UTF-8 INVARIANT character (i.e. ASCII on non-EBCDIC machines)
-is a valid UTF-8 character.
+Note also that a UTF-8 "invariant" character (i.e. ASCII on non-EBCDIC
+machines) is a valid UTF-8 character.
  
  =cut
  
@@ -1049,7 +1045,7 @@ This uses an adaptation of the table and algorithm given in
  http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
  documentation of the original version.  A copyright notice for the original
  version is given at the beginning of this file.  The Perl adapation is
-documented at the definition of perl_extended_utf8_dfa_tab[].
+documented at the definition of PL_extended_utf8_dfa_tab[].
  
  */
  
@@ -1068,9 +1064,9 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
       * helper function for the other platforms. */
  
      while (s < e && LIKELY(state != 1)) {
-        state = perl_extended_utf8_dfa_tab[256
+        state = PL_extended_utf8_dfa_tab[256
                                           + state
-                                         + perl_extended_utf8_dfa_tab[*s]];
+                                         + PL_extended_utf8_dfa_tab[*s]];
          if (state != 0) {
              s++;
              continue;
@@ -1092,6 +1088,131 @@ S_isUTF8_CHAR(const U8 * const s0, const U8 * const e)
  
  /*
  
+=for apidoc isSTRICT_UTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode code point completely acceptable for open interchange between all
+applications; otherwise it evaluates to 0.  If non-zero, the value gives how
+many bytes starting at C<s> comprise the code point's representation.  Any
+bytes remaining before C<e>, but beyond the ones needed to form the first code
+point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF, and must not
+be a surrogate nor a non-character code point.  Thus this excludes any code
+point from Perl's extended UTF-8.
+
+This is used to efficiently decide if the next few bytes in C<s> is
+legal Unicode-acceptable UTF-8 for a single character.
+
+Use C<L</isC9_STRICT_UTF8_CHAR>> to use the L<Unicode Corrigendum
+#9|http://www.unicode.org/versions/corrigendum9.html> definition of allowable
+code points; C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8;
+and C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_strict_utf8_string>>, C<L</is_strict_utf8_string_loc>>, and
+C<L</is_strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version.  A copyright notice for the original
+version is given at the beginning of this file.  The Perl adapation is
+documented at the definition of strict_extended_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isSTRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+    const U8 * s = s0;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_ISSTRICT_UTF8_CHAR;
+
+    while (s < e && LIKELY(state != 1)) {
+        state = PL_strict_utf8_dfa_tab[256 + state + PL_strict_utf8_dfa_tab[*s]];
+
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        return s - s0 + 1;
+    }
+
+#ifndef EBCDIC
+
+    /* The dfa above drops out for certain Hanguls; handle them specially */
+    if (is_HANGUL_ED_utf8_safe(s0, e)) {
+        return 3;
+    }
+
+#endif
+
+    return 0;
+}
+
+/*
+
+=for apidoc isC9_STRICT_UTF8_CHAR
+
+Evaluates to non-zero if the first few bytes of the string starting at C<s> and
+looking no further than S<C<e - 1>> are well-formed UTF-8 that represents some
+Unicode non-surrogate code point; otherwise it evaluates to 0.  If non-zero,
+the value gives how many bytes starting at C<s> comprise the code point's
+representation.  Any bytes remaining before C<e>, but beyond the ones needed to
+form the first code point in C<s>, are not examined.
+
+The largest acceptable code point is the Unicode maximum 0x10FFFF.  This
+differs from C<L</isSTRICT_UTF8_CHAR>> only in that it accepts non-character
+code points.  This corresponds to
+L<Unicode Corrigendum #9|http://www.unicode.org/versions/corrigendum9.html>.
+which said that non-character code points are merely discouraged rather than
+completely forbidden in open interchange.  See
+L<perlunicode/Noncharacter code points>.
+
+Use C<L</isUTF8_CHAR>> to check for Perl's extended UTF-8; and
+C<L</isUTF8_CHAR_flags>> for a more customized definition.
+
+Use C<L</is_c9strict_utf8_string>>, C<L</is_c9strict_utf8_string_loc>>, and
+C<L</is_c9strict_utf8_string_loclen>> to check entire strings.
+
+=cut
+
+This uses an adaptation of the tables and algorithm given in
+http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides comprehensive
+documentation of the original version.  A copyright notice for the original
+version is given at the beginning of this file.  The Perl adapation is
+documented at the definition of PL_c9_utf8_dfa_tab[].
+
+*/
+
+PERL_STATIC_INLINE Size_t
+S_isC9_STRICT_UTF8_CHAR(const U8 * const s0, const U8 * const e)
+{
+    const U8 * s = s0;
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_ISC9_STRICT_UTF8_CHAR;
+
+    while (s < e && LIKELY(state != 1)) {
+        state = PL_c9_utf8_dfa_tab[256 + state + PL_c9_utf8_dfa_tab[*s]];
+
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        return s - s0 + 1;
+    }
+
+    return 0;
+}
+
+/*
+
  =for apidoc is_strict_utf8_string_loc
  
  Like C<L</is_strict_utf8_string>> but stores the location of the failure (in the
@@ -1476,9 +1597,9 @@ Perl_utf8_hop_back(const U8 *s, SSize_t off, const U8 *start)
      assert(off <= 0);
  
      while (off++ && s > start) {
-        s--;
-        while (UTF8_IS_CONTINUATION(*s) && s > start)
+        do {
              s--;
+        } while (UTF8_IS_CONTINUATION(*s) && s > start);
      }
      
      GCC_DIAG_IGNORE(-Wcast-qual)
@@ -1658,12 +1779,96 @@ S_is_utf8_fixed_width_buf_loclen_flags(const U8 * const s,
             || is_utf8_valid_partial_char_flags(*ep, s + len, flags);
  }
  
+PERL_STATIC_INLINE UV
+S_utf8n_to_uvchr_msgs(const U8 *s,
+                      STRLEN curlen,
+                      STRLEN *retlen,
+                      const U32 flags,
+                      U32 * errors,
+                      AV ** msgs)
+{
+    /* This is the inlined portion of utf8n_to_uvchr_msgs.  It handles the
+     * simple cases, and, if necessary calls a helper function to deal with the
+     * more complex ones.  Almost all well-formed non-problematic code points
+     * are considered simple, so that it's unlikely that the helper function
+     * will need to be called.
+     *
+     * This is an adaptation of the tables and algorithm given in
+     * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/, which provides
+     * comprehensive documentation of the original version.  A copyright notice
+     * for the original version is given at the beginning of this file.  The
+     * Perl adapation is documented at the definition of PL_strict_utf8_dfa_tab[].
+     */
+
+    const U8 * const s0 = s;
+    const U8 * send = s0 + curlen;
+    UV uv = 0;      /* The 0 silences some stupid compilers */
+    UV state = 0;
+
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
+
+    /* This dfa is fast.  If it accepts the input, it was for a well-formed,
+     * non-problematic code point, which can be returned immediately.
+     * Otherwise we call a helper function to figure out the more complicated
+     * cases. */
+
+    while (s < send && LIKELY(state != 1)) {
+        UV type = PL_strict_utf8_dfa_tab[*s];
+
+        uv = (state == 0)
+             ?  ((0xff >> type) & NATIVE_UTF8_TO_I8(*s))
+             : UTF8_ACCUMULATE(uv, *s);
+        state = PL_strict_utf8_dfa_tab[256 + state + type];
+
+        if (state != 0) {
+            s++;
+            continue;
+        }
+
+        if (retlen) {
+            *retlen = s - s0 + 1;
+        }
+        if (errors) {
+            *errors = 0;
+        }
+        if (msgs) {
+            *msgs = NULL;
+        }
+
+        return uv;
+    }
+
+    /* Here is potentially problematic.  Use the full mechanism */
+    return _utf8n_to_uvchr_msgs_helper(s0, curlen, retlen, flags, errors, msgs);
+}
+
+PERL_STATIC_INLINE UV
+S__utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
+{
+    PERL_ARGS_ASSERT__UTF8_TO_UVCHR_BUF;
+
+    assert(s < send);
+
+    if (! ckWARN_d(WARN_UTF8)) {
+        return utf8n_to_uvchr(s, send - s, retlen,
+                              (UTF8_ALLOW_ANY & ~UTF8_ALLOW_EMPTY));
+    }
+    else {
+        UV ret = utf8n_to_uvchr(s, send - s, retlen, 0);
+        if (retlen && ret == 0 && *s != '\0') {
+            *retlen = (STRLEN) -1;
+        }
+
+        return ret;
+    }
+}
+
  /* ------------------------------- perl.h ----------------------------- */
  
  /*
  =head1 Miscellaneous Functions
  
-=for apidoc AiR|bool|is_safe_syscall|const char *pv|STRLEN len|const char *what|const char *op_name
+=for apidoc is_safe_syscall
  
  Test that the given C<pv> doesn't contain any internal C<NUL> characters.
  If it does, set C<errno> to C<ENOENT>, optionally warn, and return FALSE.
@@ -1729,6 +1934,40 @@ S_should_warn_nl(const char *pv) {
  
  #endif
  
+#if defined(PERL_IN_PP_C) || defined(PERL_IN_PP_HOT_C)
+
+PERL_STATIC_INLINE bool
+S_lossless_NV_to_IV(const NV nv, IV *ivp)
+{
+    /* This function determines if the input NV 'nv' may be converted without
+     * loss of data to an IV.  If not, it returns FALSE taking no other action.
+     * But if it is possible, it does the conversion, returning TRUE, and
+     * storing the converted result in '*ivp' */
+
+    PERL_ARGS_ASSERT_LOSSLESS_NV_TO_IV;
+
+#  if  defined(Perl_isnan)
+
+    if (UNLIKELY(Perl_isnan(nv))) {
+        return FALSE;
+    }
+
+#  endif
+
+    if (UNLIKELY(nv < IV_MIN) || UNLIKELY(nv > IV_MAX)) {
+        return FALSE;
+    }
+
+    if ((IV) nv != nv) {
+        return FALSE;
+    }
+
+    *ivp = (IV) nv;
+    return TRUE;
+}
+
+#endif
+
  /* ------------------ pp.c, regcomp.c, toke.c, universal.c ------------ */
  
  #define MAX_CHARSET_NAME_LENGTH 2
@@ -2163,10 +2402,10 @@ Perl_foldEQ(const char *s1, const char *s2, I32 len)
  PERL_STATIC_INLINE I32
  Perl_foldEQ_latin1(const char *s1, const char *s2, I32 len)
  {
-    /* Compare non-utf8 using Unicode (Latin1) semantics.  Does not work on
-     * MICRO_SIGN, LATIN_SMALL_LETTER_SHARP_S, nor
-     * LATIN_SMALL_LETTER_Y_WITH_DIAERESIS, and does not check for these.  Nor
-     * does it check that the strings each have at least 'len' characters */
+    /* Compare non-UTF-8 using Unicode (Latin1) semantics.  Works on all folds
+     * representable without UTF-8, except for LATIN_SMALL_LETTER_SHARP_S, and
+     * does not check for this.  Nor does it check that the strings each have
+     * at least 'len' characters. */
  
      const U8 *a = (const U8 *)s1;
      const U8 *b = (const U8 *)s2;