Add utility and .h for character's UTF-8

[perl5.git] / regcomp.c
diff --git a/regcomp.c b/regcomp.c

index 3dcc6d9..921c0e9 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -89,6 +89,7 @@ extern const struct regexp_engine my_reg_engine;
  #include "dquote_static.c"
  #include "charclass_invlists.h"
  #include "inline_invlist.c"
+#include "utf8_strings.h"
  
  #define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
  #define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
@@ -2825,18 +2826,15 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
              * LETTER SHARP S.  We decrease the min length by 1 for each
              * occurrence of 'ss' found */
  
-#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
-#          define U390_first_byte 0xb4
-           const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
-#          define U3B0_first_byte 0xb5
-           const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
-#else
-#          define U390_first_byte 0xce
-           const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
-#          define U3B0_first_byte 0xcf
-           const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
-#endif
-           const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
+#define U390_FIRST_BYTE GREEK_SMALL_LETTER_IOTA_UTF8_FIRST_BYTE
+#define U3B0_FIRST_BYTE GREEK_SMALL_LETTER_UPSILON_UTF8_FIRST_BYTE
+           const U8 U390_tail[] = GREEK_SMALL_LETTER_IOTA_UTF8_TAIL
+                                   COMBINING_DIAERESIS_UTF8
+                                   COMBINING_ACUTE_ACCENT_UTF8;
+           const U8 U3B0_tail[] = GREEK_SMALL_LETTER_UPSILON_UTF8_TAIL
+                                   COMBINING_DIAERESIS_UTF8
+                                   COMBINING_ACUTE_ACCENT_UTF8;
+            const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
                                                  yields a net of 0 */
             /* Examine the string for one of the problematic sequences */
             for (s = s0;
@@ -2866,7 +2864,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                         }
                         break;
  
-                   case U390_first_byte:
+                   case U390_FIRST_BYTE:
                         if (s_end - s >= len
  
                             /* The 1's are because are skipping comparing the
@@ -2877,7 +2875,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, b
                         }
                         break;
  
-                   case U3B0_first_byte:
+                   case U3B0_FIRST_BYTE:
                         if (! (s_end - s >= len
                                && memEQ(s + 1, U3B0_tail, len - 1)))
                         {
@@ -7076,6 +7074,39 @@ S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
       * Note that when inverting, SvCUR shouldn't change */
  }
  
+PERL_STATIC_INLINE IV*
+S_get_invlist_previous_index_addr(pTHX_ SV* invlist)
+{
+    /* Return the address of the slot (read and written as an IV) that is
+     * reserved in <invlist> to hold the cached index of the previous search */
+
+    PERL_ARGS_ASSERT_GET_INVLIST_PREVIOUS_INDEX_ADDR;
+
+    return (IV *) (SvPVX(invlist) + (INVLIST_PREVIOUS_INDEX_OFFSET * sizeof (UV)));
+}
+
+PERL_STATIC_INLINE IV
+S_invlist_previous_index(pTHX_ SV* const invlist)
+{
+    /* Returns cached index of previous search (0 if none has been done yet) */
+
+    PERL_ARGS_ASSERT_INVLIST_PREVIOUS_INDEX;
+
+    return *get_invlist_previous_index_addr(invlist);
+}
+
+PERL_STATIC_INLINE void
+S_invlist_set_previous_index(pTHX_ SV* const invlist, const IV index)
+{
+    /* Caches <index> in <invlist>, for invlist_previous_index() to return */
+
+    PERL_ARGS_ASSERT_INVLIST_SET_PREVIOUS_INDEX;
+    /* 0 is always acceptable; otherwise <index> must be below the length */
+    assert(index == 0 || index < (int) _invlist_len(invlist));
+
+    *get_invlist_previous_index_addr(invlist) = index;
+}
+
  PERL_STATIC_INLINE UV
  S_invlist_max(pTHX_ SV* const invlist)
  {
@@ -7126,8 +7157,9 @@ Perl__new_invlist(pTHX_ IV initial_size)
       * properly */
      *get_invlist_zero_addr(new_list) = UV_MAX;
  
+    *get_invlist_previous_index_addr(new_list) = 0;
      *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
-#if HEADER_LENGTH != 4
+#if HEADER_LENGTH != 5
  #   error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
  #endif
  
@@ -7272,6 +7304,7 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
       * contains <cp> */
  
      IV low = 0;
+    IV mid;
      IV high = _invlist_len(invlist);
      const IV highest_element = high - 1;
      const UV* array;
@@ -7287,8 +7320,42 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
       * can't combine this with the test above, because we can't get the array
       * unless we know the list is non-empty) */
      array = invlist_array(invlist);
-    if (cp < array[0]) {
-        return -1;
+
+    mid = invlist_previous_index(invlist);
+    assert(mid >=0 && mid <= highest_element);
+
+    /* <mid> contains the cache of the result of the previous call to this
+     * function (0 the first time).  See if this call is for the same result,
+     * or if it is for mid-1.  This is under the theory that calls to this
+     * function will often be for related code points that are near each other.
+     * And benchmarks show that caching gives better results.  We also test
+     * here if the code point is within the bounds of the list.  These tests
+     * replace others that would have had to be made anyway to make sure that
+     * the array bounds were not exceeded, and give us extra information at the
+     * same time */
+    if (cp >= array[mid]) {
+        if (cp >= array[highest_element]) {
+            return highest_element;
+        }
+
+        /* Here, array[mid] <= cp < array[highest_element].  This means that
+         * the final element is not the answer, so can exclude it; it also
+         * means that <mid> is not the final element, so can refer to 'mid + 1'
+         * safely */
+        if (cp < array[mid + 1]) {
+            return mid;
+        }
+        high--;
+        low = mid + 1;
+    }
+    else { /* cp < array[mid] */
+        if (cp < array[0]) { /* Fail if outside the array */
+            return -1;
+        }
+        high = mid;
+        if (cp >= array[mid - 1]) {
+            goto found_entry;
+        }
      }
  
      /* Binary search.  What we are looking for is <i> such that
@@ -7296,7 +7363,7 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
       * The loop below converges on the i+1.  Note that there may not be an
       * (i+1)th element in the array, and things work nonetheless */
      while (low < high) {
-       IV mid = (low + high) / 2;
+       mid = (low + high) / 2;
          assert(mid <= highest_element);
         if (array[mid] <= cp) { /* cp >= array[mid] */
             low = mid + 1;
@@ -7312,7 +7379,10 @@ Perl__invlist_search(pTHX_ SV* const invlist, const UV cp)
         }
      }
  
-    return high - 1;
+  found_entry:
+    high--;
+    invlist_set_previous_index(invlist, high);
+    return high;
  }
  
  void
@@ -11469,6 +11539,10 @@ parseit:
             case 'P':
                 {
                 char *e;
+
+                /* This routine will handle any undefined properties */
+                U8 swash_init_flags = _CORE_SWASH_INIT_RETURN_IF_UNDEF;
+
                 if (RExC_parse >= RExC_end)
                     vFAIL2("Empty \\%c{}", (U8)value);
                 if (*RExC_parse == '{') {
@@ -11523,15 +11597,10 @@ parseit:
                      swash = _core_swash_init("utf8", name, &PL_sv_undef,
                                               1, /* binary */
                                               0, /* not tr/// */
-                                             TRUE, /* this routine will handle
-                                                      undefined properties */
-                                             NULL, FALSE /* No inversion list */
+                                             NULL, /* No inversion list */
+                                             &swash_init_flags
                                              );
-                    if (   ! swash
-                        || ! SvROK(swash)
-                        || ! SvTYPE(SvRV(swash)) == SVt_PVHV
-                        || ! (invlist = _get_swash_invlist(swash)))
-                   {
+                    if (! swash || ! (invlist = _get_swash_invlist(swash))) {
                          if (swash) {
                              SvREFCNT_dec(swash);
                              swash = NULL;
@@ -11560,7 +11629,8 @@ parseit:
                           * the swash is from a user-defined property, then this
                           * whole character class should be regarded as such */
                          has_user_defined_property =
-                                                _is_swash_user_defined(swash);
+                                    (swash_init_flags
+                                     & _CORE_SWASH_INIT_USER_DEFINED_PROPERTY);
  
                          /* Invert if asking for the complement */
                          if (value == 'P') {
@@ -12248,9 +12318,8 @@ parseit:
                          U8 dummy[UTF8_MAXBYTES+1];
                          STRLEN dummy_len;
  
-                        /* This particular string is above \xff in both UTF-8
-                         * and UTFEBCDIC */
-                        to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
+                        /* This string is just one above \xff with a short name */
+                        to_utf8_fold((U8*) HYPHEN_UTF8, dummy, &dummy_len);
                          assert(PL_utf8_tofold); /* Verify that worked */
                      }
                      PL_utf8_foldclosures =