Reorganize perlhack.pod

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 019d49f..fa30a67 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -62,6 +62,8 @@ or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines).  That
  is, if they are invariant.  On ASCII-ish machines, only ASCII characters
  fit this definition, hence the function's name.
  
+If C<len> is 0, it will be calculated using C<strlen(s)>.  
+
  See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
  =cut
@@ -303,9 +305,10 @@ Perl_is_utf8_char(const U8 *s)
  =for apidoc is_utf8_string
  
  Returns true if the first C<len> bytes of the given string form a valid
-UTF-8 string, false otherwise.  Note that 'a valid UTF-8 string' does
-not mean 'a string that contains code points above 0x7F encoded in UTF-8'
-because a valid ASCII string is a valid UTF-8 string.
+UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
+using C<strlen(s)>.  Note that 'a valid UTF-8 string' does not mean 'a
+string that contains code points above 0x7F encoded in UTF-8' because a
+valid ASCII string is a valid UTF-8 string.
  
  See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
@@ -836,8 +839,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                 if (u < uend) {
                     U8 c1 = *u++;
                     if (UTF8_IS_CONTINUATION(c1)) {
-                       c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), c1);
-                       c = ASCII_TO_NATIVE(c);
+                       c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, c1));
                     } else {
                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                                          "Malformed UTF-8 character "
@@ -966,8 +968,7 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
         U8 c = *s++;
         if (!UTF8_IS_INVARIANT(c)) {
             /* Then it is two-byte encoded */
-           c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
-           c = ASCII_TO_NATIVE(c);
+           c = UNI_TO_NATIVE(TWO_BYTE_UTF8_TO_UNI(c, *s++));
         }
         *d++ = c;
      }
@@ -979,9 +980,10 @@ Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
  /*
  =for apidoc bytes_to_utf8
  
-Converts a string C<s> of length C<len> from the native encoding into UTF-8.
+Converts a string C<s> of length C<len> bytes from the native encoding into
+UTF-8.
  Returns a pointer to the newly-created string, and sets C<len> to
-reflect the new length.
+reflect the new length in bytes.
  
  A NUL character will be written after the end of the string.
  
@@ -1967,7 +1969,8 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
   * return several Unicode characters for a single Unicode character
   * (see lib/unicore/SpecCase.txt).  The SWASHGET in lib/utf8_heavy.pl is
   * the lower-level routine, and it is similarly broken for returning
- * multiple values.  --jhi */
+ * multiple values.  --jhi
+ * For those, you should use to_utf8_case() instead */
  /* SWASHGET has now been recast as S_swash_get in this file. */
  
  /* Note: