Unicode::UCD: Add charprops_all() public function

[perl5.git] / lib / unicore / mktables
diff --git a/lib/unicore/mktables b/lib/unicore/mktables

index 3c5e060..4a16d83 100644 (file)
--- a/lib/unicore/mktables
+++ b/lib/unicore/mktables
@@ -528,6 +528,9 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
  #   0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
  # Without this change, there are casing problems for this character.
  #
+# Search for $string_compare_versions to see how to compare changes to
+# properties between Unicode versions.
+#
  ##############################################################################
  
  my $UNDEF = ':UNDEF:';  # String to print out for undefined values in tracing
@@ -953,7 +956,7 @@ my %why_obsolete;    # Documentation only
          # contains the same information, but without the algorithmically
          # determinable Hangul syllables'.  This file is not published, so its
          # existence is not noted in the comment.
-        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+        'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
  
          'Indic_Matra_Category' => "Provisional",
          'Indic_Syllabic_Category' => "Provisional",
@@ -962,14 +965,14 @@ my %why_obsolete;    # Documentation only
          # to differentiate between it and gc=c, which can be written as 'isc',
          # which is the same characters as ISO_Comment's short name.
  
-        'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
+        'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
  
-        'Simple_Case_Folding' => "$simple.  Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
-        'Simple_Lowercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
-        'Simple_Titlecase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
-        'Simple_Uppercase_Mapping' => "$simple.  Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+        'Simple_Case_Folding' => "$simple.  Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
+        'Simple_Lowercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+        'Simple_Titlecase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+        'Simple_Uppercase_Mapping' => "$simple.  Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
  
-        FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
+        FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
      );
  
      foreach my $property (
@@ -2509,17 +2512,27 @@ END
              }
              $handle{$addr} = $file_handle; # Cache the open file handle
  
-            if ($v_version ge v3.2.0
-                && lc($file) ne 'unicodedata.txt'
-
-                    # Unihan files used another format until v7
-                && ($v_version ge v7.0.0 || $file !~ /^Unihan/i))
-            {
-                $_ = <$file_handle>;
-                if ($_ !~ / - $string_version \. /x) {
-                    chomp;
-                    $_ =~ s/^#\s*//;
-                    die Carp::my_carp("File '$file' is version '$_'.  It should be version $string_version");
+            if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
+                if ($file !~ /^Unihan/i) {
+                    $_ = <$file_handle>;
+                    if ($_ !~ / - $string_version \. /x) {
+                        chomp;
+                        $_ =~ s/^#\s*//;
+                        die Carp::my_carp("File '$file' is version '$_'.  It should be version $string_version");
+                    }
+                }
+                else {
+                    while (<$file_handle>) {
+                        if ($_ !~ /^#/) {
+                            Carp::my_carp_bug("Could not find the expected version info in file '$file'");
+                            last;
+                        }
+                        chomp;
+                        $_ =~ s/^#\s*//;
+                        next if $_ !~ / version: /x;
+                        last if $_ =~ /$string_version/;
+                        die Carp::my_carp("File '$file' is '$_'.  It should be version $string_version");
+                    }
                  }
              }
          }
@@ -6714,7 +6727,8 @@ sub trace { return main::trace(@_); }
  # but its format and even its name or existence are subject to change without
  # notice in a future Perl version.  Don't use it directly.  Instead, its
  # contents are now retrievable through a stable API in the Unicode::UCD
-# module: Unicode::UCD::prop_invmap('$property_name').
+# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
+# code points can be retrieved via Unicode::UCD::charprop());
  END
          }
          return $return;
@@ -6841,7 +6855,7 @@ END
          }
          $comment .= "\nwhere 'cp' is $cp.";
          if ($ucd_accessible_name) {
-            $comment .= "  Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
+            $comment .= "  Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
          }
  
          # And append any commentary already set from the actual property.
@@ -6996,11 +7010,14 @@ END
                      }
                  }
  
-                # I (khw) have never waded through this line to
-                # understand it well enough to comment it.
+                # The unpack yields a list of the bytes that comprise the
+                # UTF-8 of $code_point, which are each placed in \xZZ format
+                # and output in the %s to map to $tostr, so the result looks
+                # like:
+                # "\xC4\xB0" => "\x{0069}\x{0307}",
                  my $utf8 = sprintf(qq["%s" => "$tostr",],
                          join("", map { sprintf "\\x%02X", $_ }
-                            unpack("U0C*", pack("U", $code_point))));
+                            unpack("U0C*", chr $code_point)));
  
                  # Add a comment so that a human reader can more easily
                  # see what's going on.
@@ -9257,7 +9274,7 @@ sub utf8_heavy_name ($$) {
  {   # Closure
  
      my $indent_increment = " " x (($debugging_build) ? 2 : 0);
-    my %already_output;
+    %main::already_output = ();
  
      $main::simple_dumper_nesting = 0;
  
@@ -9277,7 +9294,7 @@ sub utf8_heavy_name ($$) {
          # nesting level is localized, so that as the call stack pops, it goes
          # back to the prior value.
          local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
-        undef %already_output if $main::simple_dumper_nesting == 0;
+        local %main::already_output = %main::already_output;
          $main::simple_dumper_nesting++;
          #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
  
@@ -9308,10 +9325,10 @@ sub utf8_heavy_name ($$) {
  
              # Keep track of cycles in the input, and refuse to infinitely loop
              my $addr = do { no overloading; pack 'J', $item; };
-            if (defined $already_output{$addr}) {
+            if (defined $main::already_output{$addr}) {
                  return "${indent}ALREADY OUTPUT: $item\n";
              }
-            $already_output{$addr} = $item;
+            $main::already_output{$addr} = $item;
  
              if (ref $item eq 'ARRAY') {
                  my $using_brackets;
@@ -13402,7 +13419,7 @@ sub compile_perl() {
      # have Uppercase and Lowercase defined, so use the general category
      # instead for them, modified by hard-coding in the code points each is
      # missing.
-    my $Lower = $perl->add_match_table('Lower');
+    my $Lower = $perl->add_match_table('XPosixLower');
      my $Unicode_Lower = property_ref('Lowercase');
      if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
          $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
@@ -13440,13 +13457,12 @@ sub compile_perl() {
              $Lower += $code_point;
          }
      }
-    $Lower->add_alias('XPosixLower');
      my $Posix_Lower = $perl->add_match_table("PosixLower",
                              Description => "[a-z]",
                              Initialize => $Lower & $ASCII,
                              );
  
-    my $Upper = $perl->add_match_table('Upper');
+    my $Upper = $perl->add_match_table("XPosixUpper");
      my $Unicode_Upper = property_ref('Uppercase');
      if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
          $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
@@ -13459,7 +13475,6 @@ sub compile_perl() {
          $Upper->add_range(0x2160, 0x216F);  # Uppercase Roman numerals
          $Upper->add_range(0x24B6, 0x24CF);  # Circled Latin upper case letters
      }
-    $Upper->add_alias('XPosixUpper');
      my $Posix_Upper = $perl->add_match_table("PosixUpper",
                              Description => "[A-Z]",
                              Initialize => $Upper & $ASCII,
@@ -13581,7 +13596,7 @@ sub compile_perl() {
      # one whose name generally begins with Posix that is posix-compliant, and
      # one that matches Unicode characters beyond the Posix, ASCII range
  
-    my $Alpha = $perl->add_match_table('Alpha');
+    my $Alpha = $perl->add_match_table('XPosixAlpha');
  
      # Alphabetic was not present in early releases
      my $Alphabetic = property_ref('Alphabetic');
@@ -13668,7 +13683,6 @@ sub compile_perl() {
          $Alpha->add_description('Alphabetic');
          $Alpha->add_alias('Alphabetic');
      }
-    $Alpha->add_alias('XPosixAlpha');
      my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
                              Description => "[A-Za-z]",
                              Initialize => $Alpha & $ASCII,
@@ -13676,22 +13690,20 @@ sub compile_perl() {
      $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
      $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
  
-    my $Alnum = $perl->add_match_table('Alnum',
+    my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
                          Description => 'Alphabetic and (decimal) Numeric',
                          Initialize => $Alpha + $gc->table('Decimal_Number'),
                          );
-    $Alnum->add_alias('XPosixAlnum');
      $perl->add_match_table("PosixAlnum",
                              Description => "[A-Za-z0-9]",
                              Initialize => $Alnum & $ASCII,
                              );
  
-    my $Word = $perl->add_match_table('Word',
+    my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
                                  Description => '\w, including beyond ASCII;'
                                              . ' = \p{Alnum} + \pM + \p{Pc}',
                                  Initialize => $Alnum + $gc->table('Mark'),
                                  );
-    $Word->add_alias('XPosixWord');
      my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
      if (defined $Pc) {
          $Word += $Pc;
@@ -13708,13 +13720,13 @@ sub compile_perl() {
      }
  
      # A Perl extension; named PosixWord, with PerlWord kept as an alias.
-    my $PerlWord = $perl->add_match_table('PerlWord',
+    my $PerlWord = $perl->add_match_table('PosixWord',
                      Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
                      Initialize => $Word & $ASCII,
                      );
-    $PerlWord->add_alias('PosixWord');
+    $PerlWord->add_alias('PerlWord');
  
-    my $Blank = $perl->add_match_table('Blank',
+    my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
                                  Description => '\h, Horizontal white space',
  
                                  # 200B is Zero Width Space which is for line
@@ -13725,7 +13737,6 @@ sub compile_perl() {
                                              -   0x200B, # ZWSP
                                  );
      $Blank->add_alias('HorizSpace');        # Another name for it.
-    $Blank->add_alias('XPosixBlank');
      $perl->add_match_table("PosixBlank",
                              Description => "\\t and ' '",
                              Initialize => $Blank & $ASCII,
@@ -13744,34 +13755,22 @@ sub compile_perl() {
                      );
      # No Posix equivalent for vertical space
  
-    my $Space = $perl->add_match_table('Space',
+    my $Space = $perl->add_match_table('XPosixSpace',
                  Description => '\s including beyond ASCII and vertical tab',
                  Initialize => $Blank + $VertSpace,
      );
-    $Space->add_alias('XPosixSpace');
-    my $posix_space = $perl->add_match_table("PosixSpace",
+    $Space->add_alias('XPerlSpace');    # Pre-existing synonyms
+    $Space->add_alias('SpacePerl');
+
+    my $Posix_space = $perl->add_match_table("PosixSpace",
                              Description => "\\t, \\n, \\cK, \\f, \\r, and ' '.  (\\cK is vertical tab)",
                              Initialize => $Space & $ASCII,
                              );
+    $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
  
-    # Perl's traditional space doesn't include Vertical Tab prior to v5.18
-    my $XPerlSpace = $perl->add_match_table('XPerlSpace',
-                                  Description => '\s, including beyond ASCII',
-                                  Initialize => $Space,
-                                  #Initialize => $Space
-                                  # - utf8::unicode_to_native(0x0B]
-                                );
-    $XPerlSpace->add_alias('SpacePerl');    # A pre-existing synonym
-    my $PerlSpace = $perl->add_match_table('PerlSpace',
-                        Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
-                        Initialize => $XPerlSpace & $ASCII,
-                            );
-
-
-    my $Cntrl = $perl->add_match_table('Cntrl',
+    my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
                                          Description => 'Control characters');
      $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
-    $Cntrl->add_alias('XPosixCntrl');
      $perl->add_match_table("PosixCntrl",
                              Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
                              Initialize => $Cntrl & $ASCII,
@@ -13784,22 +13783,20 @@ sub compile_perl() {
      $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
  
      # Graph is  ~space &  ~(Cc|Cs|Cn) = ~(space + $controls)
-    my $Graph = $perl->add_match_table('Graph',
+    my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
                          Description => 'Characters that are graphical',
                          Initialize => ~ ($Space + $controls),
                          );
-    $Graph->add_alias('XPosixGraph');
      $perl->add_match_table("PosixGraph",
                              Description =>
                                  '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
                              Initialize => $Graph & $ASCII,
                              );
  
-    $print = $perl->add_match_table('Print',
+    $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
                          Description => 'Characters that are graphical plus space characters (but no controls)',
                          Initialize => $Blank + $Graph - $gc->table('Control'),
                          );
-    $print->add_alias('XPosixPrint');
      $perl->add_match_table("PosixPrint",
                              Description =>
                                '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
@@ -13821,18 +13818,16 @@ sub compile_perl() {
          Initialize => $ASCII & $XPosixPunct,
          );
  
-    my $Digit = $perl->add_match_table('Digit',
+    my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
                              Description => '[0-9] + all other decimal digits');
      $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
-    $Digit->add_alias('XPosixDigit');
      my $PosixDigit = $perl->add_match_table("PosixDigit",
                                              Description => '[0-9]',
                                              Initialize => $Digit & $ASCII,
                                              );
  
      # Hex_Digit was not present in first release
-    my $Xdigit = $perl->add_match_table('XDigit');
-    $Xdigit->add_alias('XPosixXDigit');
+    my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
      my $Hex = property_ref('Hex_Digit');
      if (defined $Hex && ! $Hex->is_empty) {
          $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
@@ -15408,7 +15403,7 @@ sub make_re_pod_entries($) {
                      }
                  }
  
-                # Ouput both short and single in the same parenthesized
+                # Output both short and single in the same parenthesized
                  # expression, but with only one of 'Single', 'Short' if there
                  # are both items.
                  if ($short_name || $single_form || $table->conflicting) {
@@ -16181,9 +16176,14 @@ $zero_matches
  
  =head1 Properties accessible through Unicode::UCD
  
-All the Unicode character properties mentioned above (except for those marked
-as for internal use by Perl) are also accessible by
-L<Unicode::UCD/prop_invlist()>.
+The value of any Unicode (not including Perl extensions) character
+property mentioned above for any single code point is available through
+L<Unicode::UCD/charprop()>.  L<Unicode::UCD/charprops_all()> returns the
+values of all the Unicode properties for a given code point.
+
+Besides these, all the Unicode character properties mentioned above
+(except for those marked as for internal use by Perl) are also
+accessible by L<Unicode::UCD/prop_invlist()>.
  
  Due to their nature, not all Unicode character properties are suitable for
  regular expression matches, nor C<prop_invlist()>.  The remaining
@@ -17273,8 +17273,6 @@ sub write_all_tables() {
                      # Similarly, we create for Unicode::UCD a list of
                      # property-value aliases.
  
-                    my $property_full_name = $property->full_name;
-
                      # Look at each table in the property...
                      foreach my $table ($property->tables) {
                          my @values_list;
@@ -17313,7 +17311,7 @@ sub write_all_tables() {
                          }
  
                          # To save memory, unlike the similar list for property
-                        # aliases above, only the standard forms hve the list.
+                        # aliases above, only the standard forms have the list.
                          # This forces an extra step of converting from input
                          # name to standard name, but the savings are
                          # considerable.  (There is only marginal savings if we
@@ -18600,11 +18598,6 @@ __DATA__
  use strict;
  use warnings;
  
-# If run outside the normal test suite on an ASCII platform, you can
-# just create a latin1_to_native() function that just returns its
-# inputs, because that's the only function used from test.pl
-require "test.pl";
-
  # Test qr/\X/ and the \p{} regular expression constructs.  This file is
  # constructed by mktables from the tables it generates, so if mktables is
  # buggy, this won't necessarily catch those bugs.  Tests are generated for all
@@ -18754,7 +18747,7 @@ sub Test_X($) {
          my $this_string = "";
          my $this_display = "";
          foreach my $code_point (@code_points) {
-            $this_string .= latin1_to_native(chr(hex $code_point));
+            $this_string .= chr utf8::unicode_to_native(hex $code_point);
              $this_display .= "\\x{$code_point}";
          }