utf8.c: Add compiler hint

[perl5.git] / utf8.c
diff --git a/utf8.c b/utf8.c

index 805d2bf..b873578 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2230,8 +2230,7 @@ Perl__to_utf8_fold_flags(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, U8 flags)
  }
  
  /* Note:
- * A "swash" is a swatch hash.
- * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
+ * Returns a "swash" which is a hash described in utf8.c:Perl_swash_fetch().
   * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
   * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
   */
@@ -2321,6 +2320,34 @@ Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits
   * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
   * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
   * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
+ *
+ * A "swash" is a hash which initially contains the keys/values set up by
+ * SWASHNEW.  The purpose is to be able to completely represent a Unicode
+ * property for all possible code points.  Things are stored in a compact form
+ * (see utf8_heavy.pl) so that calculation is required to find the actual
+ * property value for a given code point.  As code points are looked up, new
+ * key/value pairs are added to the hash, so that the calculation doesn't have
+ * to ever be re-done.  Further, each calculation is done, not just for the
+ * desired one, but for a whole block of code points adjacent to that one.
+ * For binary properties on ASCII machines, the block is usually for 64 code
+ * points, starting with a code point evenly divisible by 64.  Thus if the
+ * property value for code point 257 is requested, the code goes out and
+ * calculates the property values for all 64 code points between 256 and 319,
+ * and stores these as a single 64-bit long bit vector, called a "swatch",
+ * under the key for code point 256.  The key is the UTF-8 encoding for code
+ * point 256, minus the final byte.  Thus, if the length of the UTF-8 encoding
+ * for a code point is 13 bytes, the key will be 12 bytes long.  If the value
+ * for code point 258 is then requested, this code realizes that it would be
+ * stored under the key for 256, and would find that value and extract the
+ * relevant bit, offset from 256.
+ *
+ * Non-binary properties are stored in as many bits as necessary to represent
+ * their values (32 currently, though the code is more general than that), not
+ * as single bits, but the principle is the same: the value for each key is a
+ * vector that encompasses the property values for all code points whose UTF-8
+ * representations are represented by the key.  That is, for all code points
+ * whose UTF-8 representations are N bytes long, and whose first N-1 bytes
+ * are the key.
   */
  UV
  Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
@@ -2363,19 +2390,6 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
        /* If char is encoded then swatch is for the prefix */
         needents = (1 << UTF_ACCUMULATION_SHIFT);
         off      = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
-       if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
-           const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
-
-           /* This outputs warnings for binary properties only, assuming that
-            * to_utf8_case() will output any for non-binary.  Also, surrogates
-            * aren't checked for, as that would warn on things like
-            * /\p{Gc=Cs}/ */
-           SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
-           if (SvUV(*bitssvp) == 1) {
-               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
-                   "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
-           }
-       }
      }
  
      /*
@@ -2432,6 +2446,24 @@ Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
             Copy(ptr, PL_last_swash_key, klen, U8);
      }
  
+    if (UTF8_IS_SUPER(ptr) && ckWARN_d(WARN_NON_UNICODE)) {
+       SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
+
+       /* This outputs warnings for binary properties only, assuming that
+        * to_utf8_case() will output any for non-binary.  Also, surrogates
+        * aren't checked for, as that would warn on things like /\p{Gc=Cs}/ */
+
+       if (SvUV(*bitssvp) == 1) {
+           /* User-defined properties can silently match above-Unicode */
+           SV** const user_defined_svp = hv_fetchs(hv, "USER_DEFINED", FALSE);
+           if (! user_defined_svp || ! SvUV(*user_defined_svp)) {
+               const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0, 0);
+               Perl_warner(aTHX_ packWARN(WARN_NON_UNICODE),
+                   "Code point 0x%04"UVXf" is not Unicode, all \\p{} matches fail; all \\P{} matches succeed", code_point);
+           }
+       }
+    }
+
      switch ((int)((slen << 3) / needents)) {
      case 1:
         bit = 1 << (off & 7);
@@ -2624,7 +2656,8 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
      SvCUR_set(swatch, scur);
      s = (U8*)SvPVX(swatch);
  
-    /* read $swash->{LIST} */
+    /* read $swash->{LIST}.  XXX Note that this is a linear scan through a
+     * sorted list.  A binary search would be much more efficient */
      l = (U8*)SvPV(*listsvp, lcur);
      lend = l + lcur;
      while (l < lend) {
@@ -2651,6 +2684,10 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
                 STRLEN offset;
                 if (key >= end)
                     goto go_out_list;
+               /* XXX If it should ever happen (very unlikely) that we would
+                * want a non-binary result for the code point at UV_MAX,
+                * special handling would need to be inserted here, as is done
+                * below for the binary case */
                 /* offset must be non-negative (start <= min <= key < end) */
                 offset = octets * (key - start);
                 if (bits == 8)
@@ -2674,6 +2711,15 @@ S_swash_get(pTHX_ SV* swash, UV start, UV span)
             UV key;
             if (min < start)
                 min = start;
+
+            /* Special case when the upper-end is the highest possible code
+             * point representable on the platform.  Otherwise, the code below
+             * exits before setting this bit.  Done here to avoid testing for
+             * this extremely unlikely possibility in the loop */
+           if (UNLIKELY(end == UV_MAX && max == UV_MAX)) {
+               const STRLEN offset = (STRLEN)(max - start);
+               s[offset >> 3] |= 1 << (offset & 7);
+           }
             for (key = min; key <= max; key++) {
                 const STRLEN offset = (STRLEN)(key - start);
                 if (key >= end)
@@ -3332,9 +3378,9 @@ bool
  Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
  {
      /* May change: warns if surrogates, non-character code points, or
-     * non-Unicode code points are in s which has length len.  Returns TRUE if
-     * none found; FALSE otherwise.  The only other validity check is to make
-     * sure that this won't exceed the string's length */
+     * non-Unicode code points are in s which has length len bytes.  Returns
+     * TRUE if none found; FALSE otherwise.  The only other validity check is
+     * to make sure that this won't exceed the string's length */
  
      const U8* const e = s + len;
      bool ok = TRUE;
@@ -3347,7 +3393,7 @@ Perl_check_utf8_print(pTHX_ register const U8* s, const STRLEN len)
                            "%s in %s", unees, PL_op ? OP_DESC(PL_op) : "print");
             return FALSE;
         }
-       if (*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE) {
+       if (UNLIKELY(*s >= UTF8_FIRST_PROBLEMATIC_CODE_POINT_FIRST_BYTE)) {
             STRLEN char_len;
             if (UTF8_IS_SUPER(s)) {
                 if (ckWARN_d(WARN_NON_UNICODE)) {