make dNOOP usable outside function in C++

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index c6d08b6..3123bd0 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -952,10 +952,10 @@ Perl__is_utf8_char_helper(const U8 * const s, const U8 * e, const U32 flags)
  }
  
  char *
-Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
+Perl__byte_dump_string(pTHX_ const U8 * const start, const STRLEN len, const bool format)
  {
      /* Returns a mortalized C string that is a displayable copy of the 'len'
-     * bytes starting at 's'.  'format' gives how to display each byte.
+     * bytes starting at 'start'.  'format' gives how to display each byte.
       * Currently, there are only two formats, so it is currently a bool:
       *      0   \xab
       *      1    ab         (that is a space between two hex digit bytes)
@@ -963,7 +963,8 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
  
      const STRLEN output_len = 4 * len + 1;  /* 4 bytes per each input, plus a
                                                 trailing NUL */
-    const U8 * const e = s + len;
+    const U8 * s = start;
+    const U8 * const e = start + len;
      char * output;
      char * d;
  
@@ -973,12 +974,14 @@ Perl__byte_dump_string(pTHX_ const U8 * s, const STRLEN len, const bool format)
      SAVEFREEPV(output);
  
      d = output;
-    for (; s < e; s++) {
+    for (s = start; s < e; s++) {
          const unsigned high_nibble = (*s & 0xF0) >> 4;
          const unsigned low_nibble =  (*s & 0x0F);
  
          if (format) {
-            *d++ = ' ';
+            if (s > start) {
+                *d++ = ' ';
+            }
          }
          else {
              *d++ = '\\';
@@ -2001,9 +2004,12 @@ Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
  /*
  =for apidoc utf8_length
  
-Return the length of the UTF-8 char encoded string C<s> in characters.
-Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
-up past C<e>, croaks.
+Returns the number of characters in the sequence of UTF-8-encoded bytes starting
+at C<s> and ending at the byte just before C<e>.  If C<s> and C<e> point to the
+same place, it returns 0 with no warning raised.
+
+If C<e E<lt> s> or if the scan would end up past C<e>, it raises a UTF8 warning
+and returns the number of valid characters.
  
  =cut
  */
@@ -2303,8 +2309,8 @@ Perl_bytes_from_utf8_loc(const U8 *s, STRLEN *lenp, bool *is_utf8p, const U8** f
      }
  
    finish_and_return:
-        *d = '\0';
-        *lenp = d - converted_start;
+    *d = '\0';
+    *lenp = d - converted_start;
  
      /* Trim unused space */
      Renew(converted_start, *lenp + 1, U8);
@@ -2361,10 +2367,19 @@ Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *lenp)
  }
  
  /*
- * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
+ * Convert native (big-endian) UTF-16 to UTF-8.  For reversed (little-endian),
+ * use utf16_to_utf8_reversed().
+ *
+ * UTF-16 requires 2 bytes for every code point below 0x10000; otherwise 4 bytes.
+ * UTF-8 requires 1-3 bytes for every code point below 0x10000; otherwise 4 bytes.
+ * UTF-EBCDIC requires 1-4 bytes for every code point below 0x10000; otherwise 4-5 bytes.
+ *
+ * These functions don't check for overflow.  The worst case is every code
+ * point in the input is 2 bytes, and requires 4 bytes on output.  (If the code
+ * is never going to run in EBCDIC, it is 2 bytes requiring 3 on output.)  Therefore the
+ * destination must be pre-extended to 2 times the source length.
   *
- * Destination must be pre-extended to 3/2 source.  Do not use in-place.
- * We optimize for native, for obvious reasons. */
+ * Do not use in-place.  We optimize for native, for obvious reasons. */
  
  U8*
  Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)