Remove duplicate comment from sv.h

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index d3c3e02..b71ae48 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -323,6 +323,9 @@ character.  Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
  character is a valid UTF-8 character.  The actual number of bytes in the UTF-8
  character will be returned if it is valid, otherwise 0.
  
+WARNING: use only if you *know* that C<s> has at least either UTF8_MAXBYTES or
+UTF8SKIP(s) bytes.
+
  =cut */
  STRLEN
  Perl_is_utf8_char(const U8 *s)
@@ -343,9 +346,9 @@ Perl_is_utf8_char(const U8 *s)
  
  Returns true if first C<len> bytes of the given string form a valid
  UTF-8 string, false otherwise.  If C<len> is 0, it will be calculated
-using C<strlen(s)>.  Note that 'a valid UTF-8 string' does not mean 'a
-string that contains code points above 0x7F encoded in UTF-8' because a
-valid ASCII string is a valid UTF-8 string.
+using C<strlen(s)> (which means that if you use this option, C<s> must have a
+terminating NUL byte).  Note that a string consisting entirely of ASCII
+characters is 'a valid UTF-8 string'.
  
  See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
  
@@ -361,35 +364,32 @@ Perl_is_utf8_string(const U8 *s, STRLEN len)
      PERL_ARGS_ASSERT_IS_UTF8_STRING;
  
      while (x < send) {
-       STRLEN c;
          /* Inline the easy bits of is_utf8_char() here for speed... */
-        if (UTF8_IS_INVARIANT(*x))
-             c = 1;
+        if (UTF8_IS_INVARIANT(*x)) {
+           x++;
+        }
          else if (!UTF8_IS_START(*x))
-            goto out;
+            return FALSE;
          else {
               /* ... and call is_utf8_char() only if really needed. */
-#ifdef IS_UTF8_CHAR
-            c = UTF8SKIP(x);
+            const STRLEN c = UTF8SKIP(x);
+            const U8* const next_char_ptr = x + c;
+
+            if (next_char_ptr > send) {
+                return FALSE;
+            }
+
              if (IS_UTF8_CHAR_FAST(c)) {
                  if (!IS_UTF8_CHAR(x, c))
-                    c = 0;
+                    return FALSE;
              }
-            else
-               c = is_utf8_char_slow(x, c);
-#else
-            c = is_utf8_char(x);
-#endif /* #ifdef IS_UTF8_CHAR */
-             if (!c)
-                 goto out;
+            else if (! is_utf8_char_slow(x, c)) {
+                return FALSE;
+            }
+            x = next_char_ptr;
          }
-        x += c;
      }
  
- out:
-    if (x != send)
-       return FALSE;
-
      return TRUE;
  }
  
@@ -427,27 +427,29 @@ Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
      PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
  
      while (x < send) {
+        const U8* next_char_ptr;
+
          /* Inline the easy bits of is_utf8_char() here for speed... */
          if (UTF8_IS_INVARIANT(*x))
-            c = 1;
+            next_char_ptr = x + 1;
          else if (!UTF8_IS_START(*x))
              goto out;
          else {
              /* ... and call is_utf8_char() only if really needed. */
-#ifdef IS_UTF8_CHAR
              c = UTF8SKIP(x);
+            next_char_ptr = c + x;
+            if (next_char_ptr > send) {
+                goto out;
+            }
              if (IS_UTF8_CHAR_FAST(c)) {
                  if (!IS_UTF8_CHAR(x, c))
                      c = 0;
              } else
                  c = is_utf8_char_slow(x, c);
-#else
-            c = is_utf8_char(x);
-#endif /* #ifdef IS_UTF8_CHAR */
              if (!c)
                  goto out;
          }
-         x += c;
+         x = next_char_ptr;
          outlen++;
      }
  
@@ -493,7 +495,7 @@ C<retlen> to C<-1> and return zero.
  Certain code points are considered problematic.  These are Unicode surrogates,
  Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
  By default these are considered regular code points, but certain situations
-warrant special handling for them.  if C<flags> contains
+warrant special handling for them.  If C<flags> contains
  UTF8_DISALLOW_ILLEGAL_INTERCHANGE, all three classes are treated as
  malformations and handled as such.  The flags UTF8_DISALLOW_SURROGATE,
  UTF8_DISALLOW_NONCHAR, and UTF8_DISALLOW_SUPER (meaning above the legal Unicode
@@ -511,7 +513,7 @@ Very large code points (above 0x7FFF_FFFF) are considered more problematic than
  the others that are above the Unicode legal maximum.  There are several
  reasons, one of which is that the original UTF-8 specification never went above
  this number (the current 0x10FFFF limit was imposed later).  The UTF-8 encoding
-on ASCII platforms for these large code point begins with a byte containing
+on ASCII platforms for these large code points begins with a byte containing
  0xFE or 0xFF.  The UTF8_DISALLOW_FE_FF flag will cause them to be treated as
  malformations, while allowing smaller above-Unicode code points.  (Of course
  UTF8_DISALLOW_SUPER will treat all above-Unicode code points, including these,
@@ -2230,8 +2232,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
  }
  
  /* Note:
- * A "swash" is a swatch hash.
- * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
+ * Returns a "swash" which is a hash described in utf8.c:S_swash_fetch().
   * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
   * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
   */
@@ -2321,6 +2322,34 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
   * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
   * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
   * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
+ *
+ * A "swash" is a hash which initially contains the keys/values set up by
+ * SWASHNEW.  The purpose is to be able to completely represent a Unicode
+ * property for all possible code points.  Things are stored in a compact form
+ * (see utf8_heavy.pl) so that calculation is required to find the actual
+ * property value for a given code point.  As code points are looked up, new
+ * key/value pairs are added to the hash, so that the calculation doesn't have
+ * to ever be re-done.  Further, each calculation is done, not just for the
+ * desired one, but for a whole block of code points adjacent to that one.
+ * For binary properties on ASCII machines, the block is usually for 64 code
+ * points, starting with a code point evenly divisible by 64.  Thus if the
+ * property value for code point 257 is requested, the code goes out and
+ * calculates the property values for all 64 code points between 256 and 319,
+ * and stores these as a single 64-bit long bit vector, called a "swatch",
+ * under the key for code point 256.  The key is the UTF-8 encoding for code
+ * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
+ * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
+ * for code point 258 is then requested, this code realizes that it would be
+ * stored under the key for 256, and would find that value and extract the
+ * relevant bit, offset from 256.
+ *
+ * Non-binary properties are stored in as many bits as necessary to represent
+ * their values (32 currently, though the code is more general than that), not
+ * as single bits, but the principle is the same: the value for each key is a
+ * vector that encompasses the property values for all code points whose UTF-8
+ * representations are represented by the key.  That is, for all code points
+ * whose UTF-8 representations are N bytes long, the key is the first N-1
+ * bytes of that representation.
   */
  UV
  Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
@@ -2363,19 +2392,6 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
        /* If char is encoded then swatch is for the prefix */
         needents = (1 << UTF_ACCUMULATION_SHIFT);
         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
-       if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
-           const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
-
-           /* This outputs warnings for binary properties only, assuming that
-            * to_utf8_case() will output any for non-binary.  Also, surrogates
-            * aren't checked for, as that would warn on things like
-            * /\p{Gc=Cs}/ */
-           SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-           if (SvUV(*bitssvp) == 1) {
-               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
-                   "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
-           }
-       }
      }
  
      /*
@@ -2432,6 +2448,24 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
             Copy(ptr, PL_last_swash_key, klen, U8);
      }
  
+    if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
+       SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+
+       /* This outputs warnings for binary properties only, assuming that
+        * to_utf8_case() will output any for non-binary.  Also, surrogates
+        * aren't checked for, as that would warn on things like /\p{Gc=Cs}/ */
+
+       if (SvUV(*bitssvp) == 1) {
+           /* User-defined properties can silently match above-Unicode */
+           SV** const user_defined_svp = hv_fetchs(hv, "USER_DEFINED", FALSE);
+           if (! user_defined_svp || ! SvUV(*user_defined_svp)) {
+               const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
+               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
+                   "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
+           }
+       }
+    }
+
      switch ((int)((slen << 3) / needents)) {
      case 1:
         bit = 1 << (off & 7);
@@ -2624,7 +2658,8 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      SvCUR_set(swatch, scur);
      s = (U8*)SvPVX(swatch);
  
-    /* read $swash->{LIST} */
+    /* read $swash->{LIST}.  XXX Note that this is a linear scan through a
+     * sorted list.  A binary search would be much more efficient */
      l = (U8*)SvPV(*listsvp, lcur);
      lend = l + lcur;
      while (l < lend) {
@@ -3345,9 +3380,9 @@ bool
  Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
  {
      /* May change: warns if surrogates, non-character code points, or
-     * non-Unicode code points are in s which has length len.  Returns TRUE if
-     * none found; FALSE otherwise.  The only other validity check is to make
-     * sure that this won't exceed the string's length */
+     * non-Unicode code points are in s which has length len bytes.  Returns
+     * TRUE if none found; FALSE otherwise.  The only other validity check is
+     * to make sure that this won't exceed the string's length */
  
      const U8* const e = s + len;
      bool ok = TRUE;
@@ -3360,7 +3395,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
             return FALSE;
         }
-       if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+       if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
             STRLEN char_len;
             if (UTF8_IS_SUPER(s)) {
                 if (ckWARN_d(WARN_NON_UNICODE)) {
@@ -3563,8 +3598,6 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
      STRLEN n1 = 0, n2 = 0;              /* Number of bytes in current char */
      U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
      U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
-    U8 natbuf[2];               /* Holds native 8-bit char converted to utf8;
-                                   these always fit in 2 bytes */
  
      PERL_ARGS_ASSERT_FOLDEQ_UTF8_FLAGS;
  
@@ -3671,9 +3704,8 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
                 else if (u1) {
                     to_utf8_fold(p1, foldbuf1, &n1);
                 }
-               else {  /* Not utf8, convert to it first and then get fold */
-                   uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
-                   to_utf8_fold(natbuf, foldbuf1, &n1);
+               else {  /* Not utf8, get utf8 fold */
+                   to_uni_fold(NATIVE_TO_UNI(*p1), foldbuf1, &n1);
                 }
                 f1 = foldbuf1;
             }
@@ -3720,8 +3752,7 @@ Perl_foldEQ_utf8_flags(pTHX_ const char *s1, char **pe1, register UV l1, bool u1
                     to_utf8_fold(p2, foldbuf2, &n2);
                 }
                 else {
-                   uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
-                   to_utf8_fold(natbuf, foldbuf2, &n2);
+                   to_uni_fold(NATIVE_TO_UNI(*p2), foldbuf2, &n2);
                 }
                 f2 = foldbuf2;
             }