mktables: Make sure a \p{ID_Continue} is also \p{Word}

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 5e0fc25..422d279 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -9113,7 +9113,6 @@ END
  
          # This first set is in the original old-style proplist.
          push @return, split /\n/, <<'END';
-Alpha     ; Alphabetic
  Bidi_C    ; Bidi_Control
  Dash      ; Dash
  Dia       ; Diacritic
@@ -9184,6 +9183,7 @@ END
      }
      if (-e 'DCoreProperties.txt') {
          push @return, split /\n/, <<'END';
+Alpha     ; Alphabetic
  IDS       ; ID_Start
  XIDC      ; XID_Continue
  XIDS      ; XID_Start
@@ -11717,26 +11717,26 @@ sub filter_blocks_lines {
          # PropList.txt has been in Unicode since version 2.0.  Until 3.1, it
          # was in a completely different syntax.  Ken Whistler of Unicode says
          # that it was something he used as an aid for his own purposes, but
-        # was never an official part of the standard.  However, comments in
-        # DAge.txt indicate that non-character code points were available in
-        # the UCD as of 3.1.  It is unclear to me (khw) how they could be
-        # there except through this file (but on the other hand, they first
-        # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
-        # not.  But the claim is that it was published as an aid to others who
-        # might want some more information than was given in the official UCD
-        # of the time.  Many of the properties in it were incorporated into
-        # the later PropList.txt, but some were not.  This program uses this
-        # early file to generate property tables that are otherwise not
-        # accessible in the early UCD's, and most were probably not really
-        # official at that time, so one could argue that it should be ignored,
-        # and you can easily modify things to skip this.  And there are bugs
-        # in this file in various versions.  (For example, the 2.1.9 version
-        # removes from Alphabetic the CJK range starting at 4E00, and they
-        # weren't added back in until 3.1.0.)  Many of this file's properties
-        # were later sanctioned, so this code generates tables for those
-        # properties that aren't otherwise in the UCD of the time but
-        # eventually did become official, and throws away the rest.  Here is a
-        # list of all the ones that are thrown away:
+        # was never an official part of the standard.  Many of the properties
+        # in it were incorporated into the later PropList.txt, but some were
+        # not.  This program uses this early file to generate property tables
+        # that are otherwise not accessible in the early UCD's.  It does this
+        # for the ones that eventually became official, and don't appear to be
+        # too different in their contents from the later official version, and
+        # throws away the rest.  It could be argued that the ones it generates
+        # were probably not really official at that time, so should be
+        # ignored.  You can easily modify things to skip all of them by
+        # changing this function to just set $_ to "", and return; and to skip
+        # certain of them by by simply removing their declarations from
+        # get_old_property_aliases().
+        #
+        # Here is a list of all the ones that are thrown away:
+        #   Alphabetic                   The definitions for this are very
+        #                                defective, so better to not mislead
+        #                                people into thinking it works.
+        #                                Instead the Perl extension of the
+        #                                same name is constructed from first
+        #                                principles.
          #   Bidi=*                       duplicates UnicodeData.txt
          #   Combining                    never made into official property;
          #                                is \P{ccc=0}
@@ -12646,7 +12646,8 @@ sub compile_perl() {
  
      # Get the best available case definitions.  Early Unicode versions didn't
      # have Uppercase and Lowercase defined, so use the general category
-    # instead for them.
+    # instead for them, modified by hard-coding in the code points each is
+    # missing.
      my $Lower = $perl->add_match_table('Lower');
      my $Unicode_Lower = property_ref('Lowercase');
      if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
@@ -12654,8 +12655,36 @@ sub compile_perl() {
  
      }
      else {
-        $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
-                                                                Related => 1);
+        $Lower += $gc->table('Lowercase_Letter');
+
+        # There are quite a few code points in Lower, that aren't in gc=lc,
+        # and not all are in all releases.
+        foreach my $code_point (    0x00AA,
+                                    0x00BA,
+                                    0x02B0 .. 0x02B8,
+                                    0x02C0 .. 0x02C1,
+                                    0x02E0 .. 0x02E4,
+                                    0x0345,
+                                    0x037A,
+                                    0x1D2C .. 0x1D6A,
+                                    0x1D78,
+                                    0x1D9B .. 0x1DBF,
+                                    0x2071,
+                                    0x207F,
+                                    0x2090 .. 0x209C,
+                                    0x2170 .. 0x217F,
+                                    0x24D0 .. 0x24E9,
+                                    0x2C7C .. 0x2C7D,
+                                    0xA770,
+                                    0xA7F8 .. 0xA7F9,
+        ) {
+            # Don't include the code point unless it is assigned in this
+            # release
+            my $category = $gc->value_of(hex $code_point);
+            next if ! defined $category || $category eq 'Cn';
+
+            $Lower += $code_point;
+        }
      }
      $Lower->add_alias('XPosixLower');
      my $Posix_Lower = $perl->add_match_table("PosixLower",
@@ -12669,8 +12698,12 @@ sub compile_perl() {
          $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
      }
      else {
-        $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
-                                                                Related => 1);
+
+        # Unlike Lower, there are only two ranges in Upper that aren't in
+        # gc=Lu, and all code points were assigned in all releases.
+        $Upper += $gc->table('Uppercase_Letter');
+        $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
+        $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
      }
      $Upper->add_alias('XPosixUpper');
      my $Posix_Upper = $perl->add_match_table("PosixUpper",
@@ -12878,6 +12911,7 @@ sub compile_perl() {
              }
          }
          $Alpha->add_description('Alphabetic');
+        $Alpha->add_alias('Alphabetic');
      }
      $Alpha->add_alias('XPosixAlpha');
      my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
@@ -13055,6 +13089,8 @@ sub compile_perl() {
      }
      else {
          $PosixXDigit->initialize($Xdigit & $ASCII);
+        $PosixXDigit->add_alias('AHex');
+        $PosixXDigit->add_alias('Ascii_Hex_Digit');
      }
      $PosixXDigit->add_description('[0-9A-Fa-f]');
  
@@ -13108,16 +13144,115 @@ sub compile_perl() {
      }
      else {
          # For Unicode versions that don't have the property, construct our own
-        # from first principles.  The actual definition is: Letters + letter
-        # numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code
-        # points, plus stability extensions.  PatSyn and PatWS are not defined
-        # in releases that don't have XIDS defined, so are irrelevant.
-        $perl_xids += $gc->table('Letter');
-        my $nl = $gc->table('Letter_Number');
-        $perl_xids += $nl if defined $nl;
+        # from first principles.  The actual definition is:
+        #     Letters
+        #   + letter numbers (Nl)
+        #   - Pattern_Syntax
+        #   - Pattern_White_Space
+        #   + stability extensions
+        #   - NKFC modifications
+        #
+        # What we do in the code below is to include the identical code points
+        # that are in the first release that had Unicode's version of this
+        # property, essentially extrapolating backwards.  There were no
+        # stability extensions until v4.1, so none are included; likewise in
+        # no Unicode version so far do subtracting PatSyn and PatWS make any
+        # difference, so those also are ignored.
+        $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
+
+        # We do subtract the NFKC modifications that are in the first version
+        # that had this property.  We don't bother to test if they are in the
+        # version in question, because if they aren't, the operation is a
+        # no-op.  The NKFC modifications are discussed in
+        # http://www.unicode.org/reports/tr31/#NFKC_Modifications
+        foreach my $range ( 0x037A,
+                            0x0E33,
+                            0x0EB3,
+                            [ 0xFC5E, 0xFC63 ],
+                            [ 0xFDFA, 0xFE70 ],
+                            [ 0xFE72, 0xFE76 ],
+                            0xFE78,
+                            0xFE7A,
+                            0xFE7C,
+                            0xFE7E,
+                            [ 0xFF9E, 0xFF9F ],
+        ) {
+            if (ref $range) {
+                $perl_xids->delete_range($range->[0], $range->[1]);
+            }
+            else {
+                $perl_xids->delete_range($range, $range);
+            }
+        }
      }
+
      $perl_xids &= $Word;
  
+    my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
+                                        Perl_Extension => 1,
+                                        Fate => $INTERNAL_ONLY);
+    my $XIDC = property_ref('XID_Continue');
+    if (defined $XIDC
+        || defined ($XIDC = property_ref('ID_Continue')))
+    {
+        $perl_xidc += $XIDC->table('Y');
+    }
+    else {
+        # Similarly, we construct our own XIDC if necessary for early Unicode
+        # versions.  The definition is:
+        #     everything in XIDS
+        #   + Gc=Mn
+        #   + Gc=Mc
+        #   + Gc=Nd
+        #   + Gc=Pc
+        #   - Pattern_Syntax
+        #   - Pattern_White_Space
+        #   + stability extensions
+        #   - NFKC modifications
+        #
+        # The same thing applies to this as with XIDS for the PatSyn, PatWS,
+        # and stability extensions.  There is a somewhat different set of NFKC
+        # mods to remove (and add in this case).  The ones below make this
+        # have identical code points as in the first release that defined it.
+        $perl_xidc += $perl_xids
+                    + $gc->table('L')
+                    + $gc->table('Mn')
+                    + $gc->table('Mc')
+                    + $gc->table('Nd')
+                    + 0x00B7
+                    ;
+        if (defined (my $pc = $gc->table('Pc'))) {
+            $perl_xidc += $pc;
+        }
+        else {  # 1.1.5 didn't have Pc, but these should have been in it
+            $perl_xidc += 0xFF3F;
+            $perl_xidc->add_range(0x203F, 0x2040);
+            $perl_xidc->add_range(0xFE33, 0xFE34);
+            $perl_xidc->add_range(0xFE4D, 0xFE4F);
+        }
+
+        # Subtract the NFKC mods
+        foreach my $range ( 0x037A,
+                            [ 0xFC5E, 0xFC63 ],
+                            [ 0xFDFA, 0xFE1F ],
+                            0xFE70,
+                            [ 0xFE72, 0xFE76 ],
+                            0xFE78,
+                            0xFE7A,
+                            0xFE7C,
+                            0xFE7E,
+        ) {
+            if (ref $range) {
+                $perl_xidc->delete_range($range->[0], $range->[1]);
+            }
+            else {
+                $perl_xidc->delete_range($range, $range);
+            }
+        }
+    }
+
+    $perl_xidc &= $Word;
+
      my $gcb = property_ref('Grapheme_Cluster_Break');
      # These are used in Unicode's definition of \X
      my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,