regen/mk_invlists.pl: Add new table
author Karl Williamson <khw@cpan.org>
Thu, 6 Dec 2018 23:57:17 +0000 (16:57 -0700)
committer Karl Williamson <khw@cpan.org>
Fri, 7 Dec 2018 17:55:20 +0000 (10:55 -0700)
This table contains all the code points that are in any multi-character
fold (not the folded-from character, but what that character folds to).

It will be used in a future commit.

charclass_invlists.h
embedvar.h
perlapi.h
perlvars.h
regcomp.c
regen/mk_invlists.pl
uni_keywords.h

index 1e021e3..2ec681d 100644 (file)
@@ -28984,6 +28984,286 @@ static const GCB_enum _Perl_GCB_invmap[] = {  /* for EBCDIC 037 */
 
 #  if 'A' == 65 /* ASCII/Latin1 */
 
+static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = {  /* for ASCII/Latin1 */
+       79,     /* Number of elements */
+       148565664, /* Version and data structure type */
+       1,      /* 0 if the list starts at 0;
+                  1 if it starts at the element beyond 0 */
+       0x0,
+       0x61,
+       0x62,
+       0x66,
+       0x67,
+       0x68,
+       0x6B,
+       0x6C,
+       0x6D,
+       0x6E,
+       0x6F,
+       0x73,
+       0x75,
+       0x77,
+       0x78,
+       0x79,
+       0x7A,
+       0x2BC,
+       0x2BD,
+       0x2BE,
+       0x2BF,
+       0x300,
+       0x302,
+       0x307,
+       0x309,
+       0x30A,
+       0x30B,
+       0x30C,
+       0x30D,
+       0x313,
+       0x314,
+       0x331,
+       0x332,
+       0x342,
+       0x343,
+       0x3AC,
+       0x3AD,
+       0x3AE,
+       0x3AF,
+       0x3B1,
+       0x3B2,
+       0x3B7,
+       0x3B8,
+       0x3B9,
+       0x3BA,
+       0x3C1,
+       0x3C2,
+       0x3C5,
+       0x3C6,
+       0x3C9,
+       0x3CA,
+       0x3CE,
+       0x3CF,
+       0x565,
+       0x566,
+       0x56B,
+       0x56C,
+       0x56D,
+       0x56E,
+       0x574,
+       0x575,
+       0x576,
+       0x577,
+       0x57E,
+       0x57F,
+       0x582,
+       0x583,
+       0x1F00,
+       0x1F08,
+       0x1F20,
+       0x1F28,
+       0x1F60,
+       0x1F68,
+       0x1F70,
+       0x1F71,
+       0x1F74,
+       0x1F75,
+       0x1F7C,
+       0x1F7D
+};
+
+#  endif       /* ASCII/Latin1 */
+
+#  if 'A' == 193 /* EBCDIC 1047 */ \
+     && '\\' == 224 && '[' == 173 && ']' == 189 && '{' == 192 && '}' == 208 \
+     && '^' == 95 && '~' == 161 && '!' == 90 && '#' == 123 && '|' == 79 \
+     && '$' == 91 && '@' == 124 && '`' == 121
+
+static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = {  /* for EBCDIC 1047 */
+       81,     /* Number of elements */
+       148565664, /* Version and data structure type */
+       1,      /* 0 if the list starts at 0;
+                  1 if it starts at the element beyond 0 */
+       0x0,
+       0x81,
+       0x82,
+       0x86,
+       0x87,
+       0x88,
+       0x8A,
+       0x91,
+       0x92,
+       0x93,
+       0x94,
+       0x95,
+       0x96,
+       0xA2,
+       0xA4,
+       0xA6,
+       0xA7,
+       0xA8,
+       0xA9,
+       0x2BC,
+       0x2BD,
+       0x2BE,
+       0x2BF,
+       0x300,
+       0x302,
+       0x307,
+       0x309,
+       0x30A,
+       0x30B,
+       0x30C,
+       0x30D,
+       0x313,
+       0x314,
+       0x331,
+       0x332,
+       0x342,
+       0x343,
+       0x3AC,
+       0x3AD,
+       0x3AE,
+       0x3AF,
+       0x3B1,
+       0x3B2,
+       0x3B7,
+       0x3B8,
+       0x3B9,
+       0x3BA,
+       0x3C1,
+       0x3C2,
+       0x3C5,
+       0x3C6,
+       0x3C9,
+       0x3CA,
+       0x3CE,
+       0x3CF,
+       0x565,
+       0x566,
+       0x56B,
+       0x56C,
+       0x56D,
+       0x56E,
+       0x574,
+       0x575,
+       0x576,
+       0x577,
+       0x57E,
+       0x57F,
+       0x582,
+       0x583,
+       0x1F00,
+       0x1F08,
+       0x1F20,
+       0x1F28,
+       0x1F60,
+       0x1F68,
+       0x1F70,
+       0x1F71,
+       0x1F74,
+       0x1F75,
+       0x1F7C,
+       0x1F7D
+};
+
+#  endif       /* EBCDIC 1047 */
+
+#  if 'A' == 193 /* EBCDIC 037 */ \
+     && '\\' == 224 && '[' == 186 && ']' == 187 && '{' == 192 && '}' == 208 \
+     && '^' == 176 && '~' == 161 && '!' == 90 && '#' == 123 && '|' == 79 \
+     && '$' == 91 && '@' == 124 && '`' == 121
+
+static const UV _Perl_Is_In_Multi_Char_Fold_invlist[] = {  /* for EBCDIC 037 */
+       81,     /* Number of elements */
+       148565664, /* Version and data structure type */
+       1,      /* 0 if the list starts at 0;
+                  1 if it starts at the element beyond 0 */
+       0x0,
+       0x81,
+       0x82,
+       0x86,
+       0x87,
+       0x88,
+       0x8A,
+       0x91,
+       0x92,
+       0x93,
+       0x94,
+       0x95,
+       0x96,
+       0xA2,
+       0xA4,
+       0xA6,
+       0xA7,
+       0xA8,
+       0xA9,
+       0x2BC,
+       0x2BD,
+       0x2BE,
+       0x2BF,
+       0x300,
+       0x302,
+       0x307,
+       0x309,
+       0x30A,
+       0x30B,
+       0x30C,
+       0x30D,
+       0x313,
+       0x314,
+       0x331,
+       0x332,
+       0x342,
+       0x343,
+       0x3AC,
+       0x3AD,
+       0x3AE,
+       0x3AF,
+       0x3B1,
+       0x3B2,
+       0x3B7,
+       0x3B8,
+       0x3B9,
+       0x3BA,
+       0x3C1,
+       0x3C2,
+       0x3C5,
+       0x3C6,
+       0x3C9,
+       0x3CA,
+       0x3CE,
+       0x3CF,
+       0x565,
+       0x566,
+       0x56B,
+       0x56C,
+       0x56D,
+       0x56E,
+       0x574,
+       0x575,
+       0x576,
+       0x577,
+       0x57E,
+       0x57F,
+       0x582,
+       0x583,
+       0x1F00,
+       0x1F08,
+       0x1F20,
+       0x1F28,
+       0x1F60,
+       0x1F68,
+       0x1F70,
+       0x1F71,
+       0x1F74,
+       0x1F75,
+       0x1F7C,
+       0x1F7D
+};
+
+#  endif       /* EBCDIC 037 */
+
+#  if 'A' == 65 /* ASCII/Latin1 */
+
 static const UV _Perl_IVCF_invlist[] = {  /* for ASCII/Latin1 */
        1297,   /* Number of elements */
        148565664, /* Version and data structure type */
@@ -383428,5 +383708,5 @@ static const U8 WB_table[23][23] = {
  * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version
  * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl
  * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl
- * 0c7d00d798e6c7b743b6843cb4b524bb55155fb78a736f708577b82324919423 regen/mk_invlists.pl
+ * 35eecb67dfc9b89a150036e4dcd76de5d46f20d6ddd6976188e1df94a4055b7b regen/mk_invlists.pl
  * ex: set ro: */
index 5bd4a4e..8743da7 100644 (file)
 #define PL_GHasMultiCharFold   (my_vars->GHasMultiCharFold)
 #define PL_InBitmap            (my_vars->GInBitmap)
 #define PL_GInBitmap           (my_vars->GInBitmap)
+#define PL_InMultiCharFold     (my_vars->GInMultiCharFold)
+#define PL_GInMultiCharFold    (my_vars->GInMultiCharFold)
 #define PL_LB_invlist          (my_vars->GLB_invlist)
 #define PL_GLB_invlist         (my_vars->GLB_invlist)
 #define PL_Latin1              (my_vars->GLatin1)
index af5b042..bd1d434 100644 (file)
--- a/perlapi.h
+++ b/perlapi.h
@@ -111,6 +111,8 @@ END_EXTERN_C
 #define PL_HasMultiCharFold    (*Perl_GHasMultiCharFold_ptr(NULL))
 #undef  PL_InBitmap
 #define PL_InBitmap            (*Perl_GInBitmap_ptr(NULL))
+#undef  PL_InMultiCharFold
+#define PL_InMultiCharFold     (*Perl_GInMultiCharFold_ptr(NULL))
 #undef  PL_LB_invlist
 #define PL_LB_invlist          (*Perl_GLB_invlist_ptr(NULL))
 #undef  PL_Latin1
index 82bce27..4f0b6c0 100644 (file)
@@ -276,6 +276,7 @@ PERLVAR(G, AboveLatin1,     SV *)
 PERLVAR(G, Assigned_invlist, SV *)
 PERLVAR(G, GCB_invlist, SV *)
 PERLVAR(G, HasMultiCharFold,   SV *)
+PERLVAR(G, InMultiCharFold,   SV *)
 PERLVAR(G, Latin1,     SV *)
 PERLVAR(G, LB_invlist, SV *)
 PERLVAR(G, NonL1NonFinalFold,   SV *)
index f4d7af2..a0cd4d4 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -21427,6 +21427,7 @@ Perl_init_uniprops(pTHX)
     PL_utf8_foldable = _new_invlist_C_array(uni_prop_ptrs[UNI__PERL_ANY_FOLDS]);
     PL_HasMultiCharFold = _new_invlist_C_array(uni_prop_ptrs[
                                             UNI__PERL_FOLDS_TO_MULTI_CHAR]);
+    PL_InMultiCharFold = _new_invlist_C_array(_Perl_Is_In_Multi_Char_Fold_invlist);
     PL_NonL1NonFinalFold = _new_invlist_C_array(
                                             NonL1_Perl_Non_Final_Folds_invlist);
 
index 7f1347c..980b90c 100644 (file)
@@ -887,17 +887,28 @@ die "Could not find inversion map for Case_Folding" unless defined $format;
 die "Incorrect format '$format' for Case_Folding inversion map"
                                                     unless $format eq 'al'
                                                            || $format eq 'a';
+my @is_in_multi_char_fold;
 my @is_non_final_fold;
 
 for my $i (0 .. @$folds_ref - 1) {
     next unless ref $folds_ref->[$i];   # Skip single-char folds
 
-    # Add to the non-finals list each code point that is in a non-final
-    # position
-    for my $j (0 .. @{$folds_ref->[$i]} - 2) {
+    # Add to the is_in_multis list each code point that is in a
+    # multi-character fold, and to the non-finals list each code point that is
+    # in a non-final position
+    for my $j (0 .. @{$folds_ref->[$i]} - 1) {
+        push @is_in_multi_char_fold, $folds_ref->[$i][$j];
+        last if $j == @{$folds_ref->[$i]} - 1;
         push @is_non_final_fold, $folds_ref->[$i][$j];
     }
     @is_non_final_fold = uniques @is_non_final_fold;
+    @is_in_multi_char_fold = uniques @is_in_multi_char_fold;
+}
+
+sub _Perl_Is_In_Multi_Char_Fold {
+    @is_in_multi_char_fold = sort { $a <=> $b } @is_in_multi_char_fold;
+    my @return = mk_invlist_from_sorted_cp_list(\@is_in_multi_char_fold);
+    return \@return;
 }
 
 sub _Perl_Non_Final_Folds {
@@ -2338,6 +2349,7 @@ no warnings 'qw';
 my @props;
 push @props, sort { prop_name_for_cmp($a) cmp prop_name_for_cmp($b) } qw(
                     &NonL1_Perl_Non_Final_Folds
+                    &_Perl_Is_In_Multi_Char_Fold
                     &UpperLatin1
                     _Perl_GCB,EDGE,E_Base,E_Base_GAZ,E_Modifier,Glue_After_Zwj,LV,Prepend,Regional_Indicator,SpacingMark,ZWJ,XPG_XX
                     _Perl_LB,EDGE,Close_Parenthesis,Hebrew_Letter,Next_Line,Regional_Indicator,ZWJ,Contingent_Break,E_Base,E_Modifier,H2,H3,JL,JT,JV,Word_Joiner
index 1ab03f3..54ae6ad 100644 (file)
@@ -6994,6 +6994,6 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) {
  * 7bd6bcbe3813e0cd55e0998053d182b7bc8c97dcfd0b85028e9f7f55af4ad61b lib/unicore/version
  * 4bb677187a1a64e39d48f2e341b5ecb6c99857e49d7a79cf503bd8a3c709999b regen/charset_translations.pl
  * 03e51b0f07beebd5da62ab943899aa4934eee1f792fa27c1fb638c33bf4ac6ea regen/mk_PL_charclass.pl
- * 0c7d00d798e6c7b743b6843cb4b524bb55155fb78a736f708577b82324919423 regen/mk_invlists.pl
+ * 35eecb67dfc9b89a150036e4dcd76de5d46f20d6ddd6976188e1df94a4055b7b regen/mk_invlists.pl
  * c42c035b18a0426443184e9f889aa2b16bef5a9add9805cd853c4e2a783712ff regen/mph.pl
  * ex: set ro: */