Adapt test to accept local uncommitted changes

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index ecbc4ea..a68af53 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -51,6 +51,37 @@ Unicode characters as a variable number of bytes, in such a way that
  characters in the ASCII range are unmodified, and a zero byte never appears
  within non-zero characters.
  
+=cut
+*/
+
+/*
+=for apidoc is_ascii_string
+
+Returns true if first C<len> bytes of the given string are ASCII (i.e. none
+of them even raise the question of UTF-8-ness).
+
+See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
+
+=cut
+*/
+
+bool
+Perl_is_ascii_string(const U8 *s, STRLEN len)
+{
+    const U8* const send = s + (len ? len : strlen((const char *)s));
+    const U8* x = s;
+
+    PERL_ARGS_ASSERT_IS_ASCII_STRING;
+
+    for (; x < send; ++x) {
+       if (!UTF8_IS_INVARIANT(*x))
+           break;
+    }
+
+    return x == send;
+}
+
+/*
  =for apidoc uvuni_to_utf8_flags
  
  Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
@@ -247,18 +278,17 @@ S_is_utf8_char_slow(const U8 *s, const STRLEN len)
  =for apidoc is_utf8_char
  
  Tests if some arbitrary number of bytes begins in a valid UTF-8
-character.  Note that an INVARIANT (i.e. ASCII) character is a valid
-UTF-8 character.  The actual number of bytes in the UTF-8 character
-will be returned if it is valid, otherwise 0.
+character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
+character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
+character will be returned if it is valid, otherwise 0.
  
  =cut */
  STRLEN
-Perl_is_utf8_char(pTHX_ const U8 *s)
+Perl_is_utf8_char(const U8 *s)
  {
      const STRLEN len = UTF8SKIP(s);
  
      PERL_ARGS_ASSERT_IS_UTF8_CHAR;
-    PERL_UNUSED_CONTEXT;
  #ifdef IS_UTF8_CHAR
      if (IS_UTF8_CHAR_FAST(len))
          return IS_UTF8_CHAR(s, len) ? len : 0;
@@ -266,6 +296,7 @@ Perl_is_utf8_char(pTHX_ const U8 *s)
      return is_utf8_char_slow(s, len);
  }
  
+
  /*
  =for apidoc is_utf8_string
  
@@ -274,19 +305,18 @@ UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
  not mean 'a string that contains code points above 0x7F encoded in UTF-8'
  because a valid ASCII string is a valid UTF-8 string.
  
-See also is_utf8_string_loclen() and is_utf8_string_loc().
+See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
  =cut
  */
  
  bool
-Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
+Perl_is_utf8_string(const U8 *s, STRLEN len)
  {
      const U8* const send = s + (len ? len : strlen((const char *)s));
      const U8* x = s;
  
      PERL_ARGS_ASSERT_IS_UTF8_STRING;
-    PERL_UNUSED_CONTEXT;
  
      while (x < send) {
         STRLEN c;
@@ -345,7 +375,7 @@ See also is_utf8_string_loc() and is_utf8_string().
  */
  
  bool
-Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
+Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
  {
      const U8* const send = s + (len ? len : strlen((const char *)s));
      const U8* x = s;
@@ -353,7 +383,6 @@ Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN
      STRLEN outlen = 0;
  
      PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
-    PERL_UNUSED_CONTEXT;
  
      while (x < send) {
          /* Inline the easy bits of is_utf8_char() here for speed... */
@@ -648,7 +677,7 @@ Returns the Unicode code point of the first character in the string C<s>
  which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
  length, in bytes, of that character.
  
-This function should only be used when returned UV is considered
+This function should only be used when the returned UV is considered
  an index into the Unicode semantic tables (e.g. swashes).
  
  If C<s> does not point to a well-formed UTF-8 character, zero is
@@ -682,7 +711,6 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
  {
      dVAR;
      STRLEN len = 0;
-    U8 t = 0;
  
      PERL_ARGS_ASSERT_UTF8_LENGTH;
  
@@ -693,20 +721,23 @@ Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
      if (e < s)
         goto warn_and_return;
      while (s < e) {
-       t = UTF8SKIP(s);
-       if (e - s < t) {
-           warn_and_return:
-           if (ckWARN_d(WARN_UTF8)) {
-               if (PL_op)
-                   Perl_warner(aTHX_ packWARN(WARN_UTF8),
+       if (!UTF8_IS_INVARIANT(*s))
+           s += UTF8SKIP(s);
+       else
+           s++;
+       len++;
+    }
+
+    if (e != s) {
+       len--;
+        warn_and_return:
+       if (ckWARN_d(WARN_UTF8)) {
+           if (PL_op)
+               Perl_warner(aTHX_ packWARN(WARN_UTF8),
                             "%s in %s", unees, OP_DESC(PL_op));
-               else
-                   Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
-           }
-           return len;
+           else
+               Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
         }
-       s += t;
-       len++;
      }
  
      return len;
@@ -772,7 +803,7 @@ Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
  /*
  =for apidoc utf8_to_bytes
  
-Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
+Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
  Unlike C<bytes_to_utf8>, this over-writes the original string, and
  updates len to contain the new length.
  Returns zero on failure, setting C<len> to -1.
@@ -817,12 +848,13 @@ Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
  /*
  =for apidoc bytes_from_utf8
  
-Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
+Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
  Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
  the newly-created string, and updates C<len> to contain the new
  length.  Returns the original string if no conversion occurs, C<len>
  is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
-0 if C<s> is converted or contains all 7bit characters.
+0 if C<s> is converted or consisted entirely of characters that are invariant
+in utf8 (i.e., US-ASCII on non-EBCDIC machines).
  
  =cut
  */
@@ -874,11 +906,14 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
  /*
  =for apidoc bytes_to_utf8
  
-Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
+Converts a string C<s> of length C<len> from the native encoding into UTF-8.
  Returns a pointer to the newly-created string, and sets C<len> to
  reflect the new length.
  
-If you want to convert to UTF-8 from other encodings than ASCII,
+A NUL character will be written after the end of the string.
+
+If you want to convert to UTF-8 from encodings other than
+the native (Latin1 or EBCDIC),
  see sv_recode_to_utf8().
  
  =cut
@@ -1007,14 +1042,6 @@ Perl_is_uni_alnum(pTHX_ UV c)
  }
  
  bool
-Perl_is_uni_alnumc(pTHX_ UV c)
-{
-    U8 tmpbuf[UTF8_MAXBYTES+1];
-    uvchr_to_utf8(tmpbuf, c);
-    return is_utf8_alnumc(tmpbuf);
-}
-
-bool
  Perl_is_uni_idfirst(pTHX_ UV c)
  {
      U8 tmpbuf[UTF8_MAXBYTES+1];
@@ -1155,12 +1182,6 @@ Perl_is_uni_alnum_lc(pTHX_ UV c)
  }
  
  bool
-Perl_is_uni_alnumc_lc(pTHX_ UV c)
-{
-    return is_uni_alnumc(c);   /* XXX no locale support yet */
-}
-
-bool
  Perl_is_uni_idfirst_lc(pTHX_ UV c)
  {
      return is_uni_idfirst(c);  /* XXX no locale support yet */
@@ -1291,16 +1312,6 @@ Perl_is_utf8_alnum(pTHX_ const U8 *p)
  }
  
  bool
-Perl_is_utf8_alnumc(pTHX_ const U8 *p)
-{
-    dVAR;
-
-    PERL_ARGS_ASSERT_IS_UTF8_ALNUMC;
-
-    return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
-}
-
-bool
  Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
  {
      dVAR;
@@ -1494,7 +1505,7 @@ Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
      if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
           /* It might be "special" (sometimes, but not always,
           * a multicharacter mapping) */
-        HV * const hv = get_hv(special, FALSE);
+        HV * const hv = get_hv(special, 0);
          SV **svp;
  
          if (hv &&