X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/60e471b3c5d78fa99162421059b37772f3ad7090..0eac1e20b62b903f96f87a1a657203b8d3ae7d9e:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 4d1512b..9a99f48 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -31,6 +31,7 @@ use File::Find; use File::Path; use File::Spec; use Text::Tabs; +use re "/aa"; sub DEBUG () { 0 } # Set to 0 for production; 1 for development my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; @@ -844,12 +845,18 @@ my %global_to_output_map = ( Present_In => 0, # Suppress, as easily computed from Age Block => 0, # Suppress, as Blocks.txt is retained. + + # Suppress, as mapping can be found instead from the + # Perl_Decomposition_Mapping file + Decomposition_Type => 0, ); # Properties that this program ignores. -my @unimplemented_properties = ( -'Unicode_Radical_Stroke' # Remove if changing to handle this one. -); +my @unimplemented_properties; + +# With this release, it is automatically handled if the Unihan db is +# downloaded +push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0; # There are several types of obsolete properties defined by Unicode. These # must be hand-edited for every new Unicode release. @@ -877,6 +884,10 @@ my %why_obsolete; # Documentation only 'Other_Lowercase' => $contributory, 'Other_Math' => $contributory, 'Other_Uppercase' => $contributory, + 'Expands_On_NFC' => $why_no_expand, + 'Expands_On_NFD' => $why_no_expand, + 'Expands_On_NFKC' => $why_no_expand, + 'Expands_On_NFKD' => $why_no_expand, ); %why_suppressed = ( @@ -884,28 +895,31 @@ my %why_obsolete; # Documentation only # contains the same information, but without the algorithmically # determinable Hangul syllables'. This file is not published, so it's # existence is not noted in the comment. - 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize', + 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()', - 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2', + # Don't suppress ISO_Comment, as otherwise special handling is needed + # to differentiate between it and gc=c, which can be written as 'isc', + # which is the same characters as ISO_Comment's short name. - 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold", - 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo", - 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo", - 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo", + 'Name' => "Accessible via 'use charnames;' or Unicode::UCD::prop_invmap()", - 'Name' => "Accessible via 'use charnames;'", - 'Name_Alias' => "Accessible via 'use charnames;'", + 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()", + 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", + 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", + 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful', - Expands_On_NFC => $why_no_expand, - Expands_On_NFD => $why_no_expand, - Expands_On_NFKC => $why_no_expand, - Expands_On_NFKD => $why_no_expand, ); # The following are suppressed because they were made contributory or # deprecated by Unicode before Perl ever thought about supporting them. - foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') { + foreach my $property ('Jamo_Short_Name', + 'Grapheme_Link', + 'Expands_On_NFC', + 'Expands_On_NFD', + 'Expands_On_NFKC', + 'Expands_On_NFKD' + ) { $why_suppressed{$property} = $why_deprecated{$property}; } @@ -925,7 +939,7 @@ if ($v_version ge 4.0.0) { if ($v_version ge 5.2.0 && $v_version lt 6.0.0) { $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed'; if ($v_version ge 6.0.0) { - $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed'; + $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed'; } } @@ -1043,18 +1057,21 @@ my %default_mapping = ( # Below are files that Unicode furnishes, but this program ignores, and why my %ignored_files = ( - 'CJKRadicals.txt' => 'Unihan data', - 'Index.txt' => 'An index, not actual data', - 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.', - 'NamesList.txt' => 'Just adds commentary', - 'NormalizationCorrections.txt' => 'Data is already in other files.', - 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases', - 'ReadMe.txt' => 'Just comments', - 'README.TXT' => 'Just comments', - 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped', - 'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications', - 'IndicMatraCategory.txt' => 'Provisional', - 'IndicSyllabicCategory.txt' => 'Provisional', + 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points', + 'Index.txt' => 'Alphabetical index of Unicode characters', + 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F and recompile perl', + 'NamesList.txt' => 'Annotated list of characters', + 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base', + 'Props.txt' => 'Only in very early releases; is a subset of F (which is used instead)', + 'ReadMe.txt' => 'Documentation', + 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L', + 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', + 'IndicMatraCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts', + 'IndicSyllabicCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts', + 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', + 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', + 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', + 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests', ); ### End of externally interesting definitions, except for @input_file_objects @@ -1065,7 +1082,7 @@ my $HEADER=<<"EOF"; # database, Version $string_version. Any changes made here will be lost! EOF -my $INTERNAL_ONLY=<<"EOF"; +my $INTERNAL_ONLY_HEADER = <<"EOF"; # !!!!!!! INTERNAL PERL USE ONLY !!!!!!! # This file is for internal use by core Perl only. The format and even the @@ -1082,16 +1099,16 @@ my $DEVELOPMENT_ONLY=<<"EOF"; EOF -my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF"; -my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING; -my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1; +my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF"; +my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; +my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; # Matches legal code point. 4-6 hex numbers, If there are 6, the first # two must be 10; if there are 5, the first must not be a 0. Written this way -# to decrease backtracking. The first one allows the code point to be at the -# end of a word, but to work properly, the word shouldn't end with a valid hex -# character. The second one won't match a code point at the end of a word, -# and doesn't have the run-on issue +# to decrease backtracking. The first regex allows the code point to be at +# the end of a word, but to work properly, the word shouldn't end with a valid +# hex character. The second one won't match a code point at the end of a +# word, and doesn't have the run-on issue my $run_on_code_point_re = qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; my $code_point_re = qr/\b$run_on_code_point_re/; @@ -1101,15 +1118,19 @@ my $code_point_re = qr/\b$run_on_code_point_re/; # depends on this ending with a semi-colon, so it can assume it is a valid # field when the line is split() by semi-colons my $missing_defaults_prefix = - qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/; + qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/; # Property types. Unicode has more types, but these are sufficient for our # purposes. my $UNKNOWN = -1; # initialized to illegal value my $NON_STRING = 1; # Either binary or enum my $BINARY = 2; -my $ENUM = 3; # Include catalog -my $STRING = 4; # Anything else: string or misc +my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal + # tables, additional true and false tables are + # generated so that false is anything matching the + # default value, and true is everything else. +my $ENUM = 4; # Include catalog +my $STRING = 5; # Anything else: string or misc # Some input files have lines that give default values for code points not # contained in the file. Sometimes these should be ignored. @@ -1156,12 +1177,6 @@ my $CROAK = 5; # Die with an error if is already there # if the flag is changed, the indefinite article referring to it in the # documentation may need to be as well. my $NORMAL = ""; -my $SUPPRESSED = 'z'; # The character should never actually be seen, since - # it is suppressed -my $PLACEHOLDER = 'P'; # A property that is defined as a placeholder in a - # Unicode version that doesn't have it, but we need it - # to be defined, if empty, to have things work. - # Implies no pod entry generated my $DEPRECATED = 'D'; my $a_bold_deprecated = "a 'B<$DEPRECATED>'"; my $A_bold_deprecated = "A 'B<$DEPRECATED>'"; @@ -1180,12 +1195,25 @@ my $A_bold_obsolete = "An 'B<$OBSOLETE>'"; my %status_past_participles = ( $DISCOURAGED => 'discouraged', - $SUPPRESSED => 'should never be generated', $STABILIZED => 'stabilized', $OBSOLETE => 'obsolete', $DEPRECATED => 'deprecated', ); +# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be +# externally documented. +my $ORDINARY = 0; # The normal fate. +my $MAP_PROXIED = 1; # The map table for the property isn't written out, + # but there is a file written that can be used to + # reconstruct this table +my $SUPPRESSED = 3; # The file for this table is not written out. +my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is + # for Perl's internal use only +my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a + # Unicode version that doesn't have it, but we need it + # to be defined, if empty, to have things work. + # Implies no pod entry generated + # The format of the values of the tables: my $EMPTY_FORMAT = ""; my $BINARY_FORMAT = 'b'; @@ -1196,6 +1224,7 @@ my $HEX_FORMAT = 'x'; my $RATIONAL_FORMAT = 'r'; my $STRING_FORMAT = 's'; my $DECOMP_STRING_FORMAT = 'c'; +my $STRING_WHITE_SPACE_LIST = 'sw'; my %map_table_formats = ( $BINARY_FORMAT => 'binary', @@ -1206,6 +1235,7 @@ my %map_table_formats = ( $RATIONAL_FORMAT => 'rational: an integer or a fraction', $STRING_FORMAT => 'string', $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', + $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' ); # Unicode didn't put such derived files in a separate directory at first. @@ -1214,13 +1244,33 @@ my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; my $AUXILIARY = 'auxiliary'; # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl +# and into UCD.pl for the use of UCD.pm my %loose_to_file_of; # loosely maps table names to their respective # files my %stricter_to_file_of; # same; but for stricter mapping. +my %loose_property_to_file_of; # Maps a loose property name to its map file +my %file_to_swash_name; # Maps the file name to its corresponding key name + # in the hash %utf8::SwashInfo my %nv_floating_to_rational; # maps numeric values floating point numbers to # their rational equivalent my %loose_property_name_of; # Loosely maps (non_string) property names to # standard form +my %string_property_loose_to_name; # Same, for string properties. +my %loose_defaults; # keys are of form "prop=value", where 'prop' is + # the property name in standard loose form, and + # 'value' is the default value for that property, + # also in standard loose form. +my %loose_to_standard_value; # loosely maps table names to the canonical + # alias for them +my %ambiguous_names; # keys are alias names (in standard form) that + # have more than one possible meaning. +my %prop_aliases; # Keys are standard property name; values are each + # one's aliases +my %prop_value_aliases; # Keys of top level are standard property name; + # values are keys to another hash, Each one is + # one of the property's values, in standard form. + # The values are that prop-val's aliases. +my %ucd_pod; # Holds entries that will go into the UCD section of the pod # Most properties are immune to caseless matching, otherwise you would get # nonsensical results, as properties are a function of a code point, not @@ -1259,6 +1309,28 @@ my %Jamo_L; # Leading consonants my %Jamo_V; # Vowels my %Jamo_T; # Trailing consonants +# For code points whose name contains its ordinal as a '-ABCD' suffix. +# The key is the base name of the code point, and the value is an +# array giving all the ranges that use this base name. Each range +# is actually a hash giving the 'low' and 'high' values of it. +my %names_ending_in_code_point; +my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes + # removed from the names +# Inverse mapping. The list of ranges that have these kinds of +# names. Each element contains the low, high, and base names in an +# anonymous hash. +my @code_points_ending_in_code_point; + +# Boolean: does this Unicode version have the hangul syllables, and are we +# writing out a table for them? +my $has_hangul_syllables = 0; + +# Does this Unicode version have code points whose names end in their +# respective code points, and are we writing out a table for them? 0 for no; +# otherwise points to first property that a table is needed for them, so that +# if multiple tables are needed, we don't create duplicates +my $needing_code_points_ending_in_code_point = 0; + my @backslash_X_tests; # List of tests read in for testing \X my @unhandled_properties; # Will contain a list of properties found in # the input that we didn't process. @@ -1932,12 +2004,15 @@ sub trace { return main::trace(@_); } main::set_access('non_skip', \%non_skip, 'c'); my %skip; - # This is used to skip processing of this input file semi-permanently. - # It is used for files that we aren't planning to process anytime soon, - # but want to allow to be in the directory and not raise a message that we - # are not handling. Mostly for test files. This is in contrast to the - # non_skip element, which is supposed to be used very temporarily for - # debugging. Sets 'optional' to 1 + # This is used to skip processing of this input file semi-permanently, + # when it evaluates to true. The value should be the reason the file is + # being skipped. It is used for files that we aren't planning to process + # anytime soon, but want to allow to be in the directory and not raise a + # message that we are not handling. Mostly for test files. This is in + # contrast to the non_skip element, which is supposed to be used very + # temporarily for debugging. Sets 'optional' to 1. Also, files that we + # pretty much will never look at can be placed in the global + # %ignored_files instead. Ones used here will be added to that list. main::set_access('skip', \%skip, 'c'); my %each_line_handler; @@ -2060,7 +2135,12 @@ sub trace { return main::trace(@_); } print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n"; } - $optional{$addr} = 1 if $skip{$addr}; + # If skipping, set to optional, and add to list of ignored files, + # including its reason + if ($skip{$addr}) { + $optional{$addr} = 1; + $ignored_files{$file{$addr}} = $skip{$addr} + } return $self; } @@ -2623,21 +2703,26 @@ package Alias; # Should this name match loosely or not. main::set_access('loose_match', \%loose_match, 'r'); - my %make_pod_entry; - # Some aliases should not get their own entries because they are covered - # by a wild-card, and some we want to discourage use of. Binary - main::set_access('make_pod_entry', \%make_pod_entry, 'r'); + my %make_re_pod_entry; + # Some aliases should not get their own entries in the re section of the + # pod, because they are covered by a wild-card, and some we want to + # discourage use of. Binary + main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's'); + + my %ucd; + # Is this documented to be accessible via Unicode::UCD + main::set_access('ucd', \%ucd, 'r', 's'); my %status; # Aliases have a status, like deprecated, or even suppressed (which means # they don't appear in documentation). Enum main::set_access('status', \%status, 'r'); - my %externally_ok; + my %ok_as_filename; # Similarly, some aliases should not be considered as usable ones for # external use, such as file names, or we don't want documentation to # recommend them. Boolean - main::set_access('externally_ok', \%externally_ok, 'r'); + main::set_access('ok_as_filename', \%ok_as_filename, 'r'); sub new { my $class = shift; @@ -2647,14 +2732,15 @@ package Alias; $name{$addr} = shift; $loose_match{$addr} = shift; - $make_pod_entry{$addr} = shift; - $externally_ok{$addr} = shift; + $make_re_pod_entry{$addr} = shift; + $ok_as_filename{$addr} = shift; $status{$addr} = shift; + $ucd{$addr} = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; # Null names are never ok externally - $externally_ok{$addr} = 0 if $name{$addr} eq ""; + $ok_as_filename{$addr} = 0 if $name{$addr} eq ""; return $self; } @@ -3024,7 +3110,7 @@ sub trace { return main::trace(@_); } # If the range list is empty, return a large value that isn't adjacent # to any that could be in the range list, for simpler tests - return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; + return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; return $ranges{$addr}->[0]->start; } @@ -4013,8 +4099,8 @@ sub trace { return main::trace(@_); } # And finally, add the gap from the end of the table to the max # possible code point - if ($max < $LAST_UNICODE_CODEPOINT) { - $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT); + if ($max < $MAX_UNICODE_CODEPOINT) { + $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT); } return $new; } @@ -4274,7 +4360,7 @@ sub trace { return main::trace(@_); } return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF - return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range + return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate return 1; @@ -4423,9 +4509,11 @@ sub trace { return main::trace(@_); } # files. main::set_access('note', \%note, 'readable_array'); - my %internal_only; - # Boolean; if set this table is for internal core Perl only use. - main::set_access('internal_only', \%internal_only); + my %fate; + # Enum; there are a number of possibilities for what happens to this + # table: it could be normal, or suppressed, or not for external use. See + # values at definition for $SUPPRESSED. + main::set_access('fate', \%fate, 'r'); my %find_table_from_alias; # The parent property passes this pointer to a hash which this class adds @@ -4480,8 +4568,8 @@ sub trace { return main::trace(@_); } sub new { # All arguments are key => value pairs, which you can see below, most - # of which match fields documented above. Otherwise: Pod_Entry, - # Externally_Ok, and Fuzzy apply to the names of the table, and are + # of which match fields documented above. Otherwise: Re_Pod_Entry, + # OK_as_Filename, and Fuzzy apply to the names of the table, and are # documented in the Alias package return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; @@ -4499,7 +4587,6 @@ sub trace { return main::trace(@_); } my $complete_name = $complete_name{$addr} = delete $args{'Complete_Name'}; $format{$addr} = delete $args{'Format'}; - $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0; $output_range_counts{$addr} = delete $args{'Output_Range_Counts'}; $property{$addr} = delete $args{'_Property'}; $range_list{$addr} = delete $args{'_Range_List'}; @@ -4507,12 +4594,14 @@ sub trace { return main::trace(@_); } $status_info{$addr} = delete $args{'_Status_Info'} || ""; $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; + $fate{$addr} = delete $args{'Fate'} || $ORDINARY; + my $ucd = delete $args{'UCD'}; my $description = delete $args{'Description'}; - my $externally_ok = delete $args{'Externally_Ok'}; + my $ok_as_filename = delete $args{'OK_as_Filename'}; my $loose_match = delete $args{'Fuzzy'}; my $note = delete $args{'Note'}; - my $make_pod_entry = delete $args{'Pod_Entry'}; + my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; my $perl_extension = delete $args{'Perl_Extension'}; # Shouldn't have any left over @@ -4534,28 +4623,40 @@ sub trace { return main::trace(@_); } push @{$description{$addr}}, $description if $description; push @{$note{$addr}}, $note if $note; - if ($status{$addr} eq $PLACEHOLDER) { + if ($fate{$addr} == $PLACEHOLDER) { # A placeholder table doesn't get documented, is a perl extension, # and quite likely will be empty - $make_pod_entry = 0 if ! defined $make_pod_entry; + $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; $perl_extension = 1 if ! defined $perl_extension; + $ucd = 0 if ! defined $ucd; push @tables_that_may_be_empty, $complete_name{$addr}; + $self->add_comment(<complete_name}) + { + Carp::my_carp_bug("There is no current capability to set the reason for suppressing."); + # perhaps Fate => [ $SUPPRESSED, "reason" ] + } + + # If hasn't set its status already, see if it is on one of the + # lists of properties or tables that have particular statuses; if + # not, is normal. The lists are prioritized so the most serious + # ones are checked first + if (! $status{$addr}) { + if (exists $why_deprecated{$complete_name}) { $status{$addr} = $DEPRECATED; } elsif (exists $why_stabilized{$complete_name}) { @@ -4568,11 +4669,7 @@ sub trace { return main::trace(@_); } # Existence above doesn't necessarily mean there is a message # associated with it. Use the most serious message. if ($status{$addr}) { - if ($why_suppressed{$complete_name}) { - $status_info{$addr} - = $why_suppressed{$complete_name}; - } - elsif ($why_deprecated{$complete_name}) { + if ($why_deprecated{$complete_name}) { $status_info{$addr} = $why_deprecated{$complete_name}; } @@ -4589,24 +4686,35 @@ sub trace { return main::trace(@_); } $perl_extension{$addr} = $perl_extension || 0; + # Don't list a property by default that is internal only + if ($fate{$addr} > $MAP_PROXIED) { + $make_re_pod_entry = 0 if ! defined $make_re_pod_entry; + $ucd = 0 if ! defined $ucd; + } + else { + $ucd = 1 if ! defined $ucd; + } + # By convention what typically gets printed only or first is what's # first in the list, so put the full name there for good output # clarity. Other routines rely on the full name being first on the # list $self->add_alias($full_name{$addr}, - Externally_Ok => $externally_ok, + OK_as_Filename => $ok_as_filename, Fuzzy => $loose_match, - Pod_Entry => $make_pod_entry, + Re_Pod_Entry => $make_re_pod_entry, Status => $status{$addr}, + UCD => $ucd, ); # Then comes the other name, if meaningfully different. if (standardize($full_name{$addr}) ne standardize($name{$addr})) { $self->add_alias($name{$addr}, - Externally_Ok => $externally_ok, + OK_as_Filename => $ok_as_filename, Fuzzy => $loose_match, - Pod_Entry => $make_pod_entry, + Re_Pod_Entry => $make_re_pod_entry, Status => $status{$addr}, + UCD => $ucd, ); } @@ -4667,15 +4775,17 @@ sub trace { return main::trace(@_); } my %args = @_; my $loose_match = delete $args{'Fuzzy'}; - my $make_pod_entry = delete $args{'Pod_Entry'}; - $make_pod_entry = $YES unless defined $make_pod_entry; + my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}; + $make_re_pod_entry = $YES unless defined $make_re_pod_entry; - my $externally_ok = delete $args{'Externally_Ok'}; - $externally_ok = 1 unless defined $externally_ok; + my $ok_as_filename = delete $args{'OK_as_Filename'}; + $ok_as_filename = 1 unless defined $ok_as_filename; my $status = delete $args{'Status'}; $status = $NORMAL unless defined $status; + my $ucd = delete $args{'UCD'} // 1; + Carp::carp_extra_args(\%args) if main::DEBUG && %args; # Capitalize the first letter of the alias unless it is one of the CJK @@ -4742,8 +4852,8 @@ sub trace { return main::trace(@_); } splice @$list, $insert_position, 0, - Alias->new($name, $loose_match, $make_pod_entry, - $externally_ok, $status); + Alias->new($name, $loose_match, $make_re_pod_entry, + $ok_as_filename, $status, $ucd); # This name may be shorter than any existing ones, so clear the cache # of the shortest, so will have to be recalculated. @@ -4787,7 +4897,7 @@ sub trace { return main::trace(@_); } foreach my $alias ($self->aliases()) { # Don't use an alias that isn't ok to use for an external name. - next if ! $alias->externally_ok; + next if ! $alias->ok_as_filename; my $name = main::Standardize($alias->name); trace $self, $name if main::DEBUG && $to_trace; @@ -4959,8 +5069,6 @@ sub trace { return main::trace(@_); } my $return = ""; $return .= $DEVELOPMENT_ONLY if $compare_versions; $return .= $HEADER; - no overloading; - $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self}; return $return; } @@ -5022,7 +5130,8 @@ sub trace { return main::trace(@_); } # certain number of blocks, might as well output the whole # thing if it all will fit in one block. The number of # ranges below is an approximate number for that. - && $self->property->type == $BINARY + && ($self->property->type == $BINARY + || $self->property->type == $FORCED_BINARY) # && $self->property->tables == 2 Can't do this because the # non-binary properties, like NFDQC aren't specifiable # by the notation @@ -5296,6 +5405,41 @@ sub trace { return main::trace(@_); } return; } + sub set_fate { # Set the fate of a table + my $self = shift; + my $fate = shift; + my $reason = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + + return if $fate{$addr} == $fate; # If no-op + + # Can only change the ordinary fate, except if going to $MAP_PROXIED + return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED; + + $fate{$addr} = $fate; + + # Don't document anything to do with a non-normal fated table + if ($fate != $ORDINARY) { + my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0; + foreach my $alias ($self->aliases) { + $alias->set_ucd($put_in_pod); + + # MAP_PROXIED doesn't affect the match tables + next if $fate == $MAP_PROXIED; + $alias->set_make_re_pod_entry($put_in_pod); + } + } + + # Save the reason for suppression for output + if ($fate == $SUPPRESSED && defined $reason) { + $why_suppressed{$complete_name{$addr}} = $reason; + } + + return; + } + sub lock { # Don't allow changes to the table from now on. This stores a stack # trace of where it was called, so that later attempts to modify it @@ -5370,8 +5514,7 @@ sub trace { return main::trace(@_); } *$sub = sub { use strict "refs"; my $self = shift; - no overloading; - return $range_list{pack 'J', $self}->$sub(@_); + return $self->_range_list->$sub(@_); } } @@ -5387,7 +5530,7 @@ sub trace { return main::trace(@_); } return if $self->carp_if_locked; no overloading; - return $range_list{pack 'J', $self}->$sub(@_); + return $self->_range_list->$sub(@_); } } @@ -5442,11 +5585,6 @@ sub trace { return main::trace(@_); } \%anomalous_entries, 'readable_array'); - my %core_access; - # This is a string, solely for documentation, indicating how one can get - # access to this property via the Perl core. - main::set_access('core_access', \%core_access, 'r', 's'); - my %to_output_map; # Enum as to whether or not to write out this map table: # 0 don't output @@ -5468,7 +5606,6 @@ sub trace { return main::trace(@_); } # Optional initialization data for the table. my $initialize = delete $args{'Initialize'}; - my $core_access = delete $args{'Core_Access'}; my $default_map = delete $args{'Default_Map'}; my $property = delete $args{'_Property'}; my $full_name = delete $args{'Full_Name'}; @@ -5488,7 +5625,6 @@ sub trace { return main::trace(@_); } my $addr = do { no overloading; pack 'J', $self; }; $anomalous_entries{$addr} = []; - $core_access{$addr} = $core_access; $default_map{$addr} = $default_map; $self->initialize($initialize) if defined $initialize; @@ -5663,8 +5799,10 @@ sub trace { return main::trace(@_); } if defined $global_to_output_map{$full_name}; # If table says to output, do so; if says to suppress it, do so. + my $fate = $self->fate; + return $INTERNAL_MAP if $fate == $INTERNAL_ONLY; return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; - return 0 if $self->status eq $SUPPRESSED; + return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED; my $type = $self->property->type; @@ -5696,7 +5834,23 @@ sub trace { return main::trace(@_); } my $return = $self->SUPER::header(); - $return .= $INTERNAL_ONLY if $self->to_output_map == $INTERNAL_MAP; + if ($self->to_output_map == $INTERNAL_MAP) { + $return .= $INTERNAL_ONLY_HEADER; + } + else { + my $property_name = $self->property->full_name; + $return .= <externally_ok } $self->aliases; + my @property_aliases = grep { $_->ok_as_filename } $self->aliases; my $count = $self->count; my $default_map = $default_map{$addr}; @@ -5811,16 +5965,8 @@ END $property_aliases[$i]->name . '(cp)' ); } - $comment .= - "\nwhere 'cp' is $cp. Note that $these_mappings $are "; - - my $access = $core_access{$addr}; - if ($access) { - $comment .= "accessible through the Perl core via $access."; - } - else { - $comment .= "not accessible through the Perl core directly."; - } + my $full_name = $self->property->full_name; + $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD"; # And append any commentary already set from the actual property. $comment .= "\n\n" . $self->comment if $self->comment; @@ -5876,20 +6022,8 @@ END # The remaining variables are temporaries used while writing each table, # to output special ranges. - my $has_hangul_syllables; my @multi_code_point_maps; # Map is to more than one code point. - # The key is the base name of the code point, and the value is an - # array giving all the ranges that use this base name. Each range - # is actually a hash giving the 'low' and 'high' values of it. - my %names_ending_in_code_point; - my %loose_names_ending_in_code_point; - - # Inverse mapping. The list of ranges that have these kinds of - # names. Each element contains the low, high, and base names in an - # anonymous hash. - my @code_points_ending_in_code_point; - sub handle_special_range { # Called in the middle of write when it finds a range it doesn't know # how to handle. @@ -5909,32 +6043,47 @@ END # No need to output the range if it maps to the default. return if $map eq $default_map{$addr}; + my $property = $self->property; + # Switch based on the map type... if ($type == $HANGUL_SYLLABLE) { # These are entirely algorithmically determinable based on # some constants furnished by Unicode; for now, just set a # flag to indicate that have them. After everything is figured - # out, we will output the code that does the algorithm. - $has_hangul_syllables = 1; + # out, we will output the code that does the algorithm. (Don't + # output them if not needed because we are suppressing this + # property.) + $has_hangul_syllables = 1 if $property->to_output_map; } elsif ($type == $CP_IN_NAME) { - # Code points whose the name ends in their code point are also + # Code points whose name ends in their code point are also # algorithmically determinable, but need information about the map # to do so. Both the map and its inverse are stored in data - # structures output in the file. - push @{$names_ending_in_code_point{$map}->{'low'}}, $low; - push @{$names_ending_in_code_point{$map}->{'high'}}, $high; - - my $squeezed = $map =~ s/[-\s]+//gr; - push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, $low; - push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, $high; - - push @code_points_ending_in_code_point, { low => $low, - high => $high, - name => $map - }; + # structures output in the file. They are stored in the mean time + # in global lists The lists will be written out later into Name.pm, + # which is created only if needed. In order to prevent duplicates + # in the list, only add to them for one property, should multiple + # ones need them. + if ($needing_code_points_ending_in_code_point == 0) { + $needing_code_points_ending_in_code_point = $property; + } + if ($property == $needing_code_points_ending_in_code_point) { + push @{$names_ending_in_code_point{$map}->{'low'}}, $low; + push @{$names_ending_in_code_point{$map}->{'high'}}, $high; + + my $squeezed = $map =~ s/[-\s]+//gr; + push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, + $low; + push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, + $high; + + push @code_points_ending_in_code_point, { low => $low, + high => $high, + name => $map + }; + } } elsif ($range->type == $MULTI_CP || $range->type == $NULL) { @@ -6015,6 +6164,10 @@ END my $name = $self->property->swash_name; + # Currently there is nothing in the pre_body unless a swash is being + # generated. + return unless defined $name; + if (defined $swash_keys{$name}) { Carp::my_carp(join_lines(<{$base}; - - # Look through the list of ranges that apply to this name to see if - # the code point is in one of them. - for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) { - return if $names_ref->{$base}{'low'}->[$i] > $code_point; - next if $names_ref->{$base}{'high'}->[$i] < $code_point; - - # Here, the code point is in the range. - return $code_point; - } - - # Here, looked like the name had a code point number in it, but - # did not match one of the valid ones. - return; - } - - sub code_point_to_name_special { - my $code_point = shift; - - # Returns the name of a code point if algorithmically determinable; - # undef if not -END - if ($has_hangul_syllables) { - $pre_body .= << 'END'; - - # If in the Hangul range, calculate the name based on Unicode's - # algorithm - if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) { - use integer; - my $SIndex = $code_point - $SBase; - my $L = $LBase + $SIndex / $NCount; - my $V = $VBase + ($SIndex % $NCount) / $TCount; - my $T = $TBase + $SIndex % $TCount; - $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}"; - $name .= $Jamo{$T} if $T != $TBase; - return $name; - } -END - } - $pre_body .= << 'END'; - - # Look through list of these code points for one in range. - foreach my $hash (@code_points_ending_in_code_point) { - return if $code_point < $hash->{'low'}; - if ($code_point <= $hash->{'high'}) { - return sprintf("%s-%04X", $hash->{'name'}, $code_point); - } - } - return; # None found - } -} # End closure - -END - } # End of has hangul or code point in name maps. - my $format = $self->format; my $return = <format; @@ -6520,7 +6436,7 @@ sub trace { return main::trace(@_); } my %complement; # Points to the complement that this table is expressed in terms of; 0 if # none. - main::set_access('complement', \%complement, 'r', 's' ); + main::set_access('complement', \%complement, 'r'); sub new { my $class = shift; @@ -6647,6 +6563,20 @@ sub trace { return main::trace(@_); } return "Table '$name'"; } + sub _range_list { + # Returns the range list associated with this table, which will be the + # complement's if it has one. + + my $self = shift; + my $complement; + if (($complement = $self->complement) != 0) { + return ~ $complement->_range_list; + } + else { + return $self->SUPER::_range_list; + } + } + sub add_alias { # Add a synonym for this table. See the comments in the base class @@ -6793,11 +6723,12 @@ sub trace { return main::trace(@_); } # Any tables that are equivalent to or children of this table must now # instead be equivalent to or (children) to the new leader (parent), # still equivalent. The equivalency includes their matches_all info, - # and for related tables, their status + # and for related tables, their fate and status. # All related tables are of necessity equivalent, but the converse # isn't necessarily true my $status = $other->status; my $status_info = $other->status_info; + my $fate = $other->fate; my $matches_all = $matches_all{other_addr}; my $caseless_equivalent = $other->caseless_equivalent; foreach my $table ($current_leader, @{$equivalents{$leader}}) { @@ -6813,6 +6744,11 @@ sub trace { return main::trace(@_); } $parent{$table_addr} = $other; push @{$children{$other_addr}}, $table; $table->set_status($status, $status_info); + + # This reason currently doesn't get exposed outside; otherwise + # would have to look up the parent's reason and use it instead. + $table->set_fate($fate, "Parent's fate"); + $self->set_caseless_equivalent($caseless_equivalent); } } @@ -6824,22 +6760,65 @@ sub trace { return main::trace(@_); } return; } - sub add_range { # Add a range to the list for this table. + sub set_complement { + # Set $self to be the complement of the parameter table. $self is + # locked, as what it contains should all come from the other table. + my $self = shift; - # Rest of parameters passed on + my $other = shift; - return if $self->carp_if_locked; - return $self->_range_list->add_range(@_); - } + my %args = @_; + Carp::carp_extra_args(\%args) if main::DEBUG && %args; - sub pre_body { # Does nothing for match tables. - return - } + if ($other->complement != 0) { + Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement); + return; + } + my $addr = do { no overloading; pack 'J', $self; }; + $complement{$addr} = $other; + $self->lock; + return; + } + + sub add_range { # Add a range to the list for this table. + my $self = shift; + # Rest of parameters passed on + + return if $self->carp_if_locked; + return $self->_range_list->add_range(@_); + } + + sub header { + my $self = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + # All match tables are to be used only by the Perl core. + return $self->SUPER::header() . $INTERNAL_ONLY_HEADER; + } + + sub pre_body { # Does nothing for match tables. + return + } sub append_to_body { # Does nothing for match tables. return } + sub set_fate { + my $self = shift; + my $fate = shift; + my $reason = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + $self->SUPER::set_fate($fate, $reason); + + # All children share this fate + foreach my $child ($self->children) { + $child->set_fate($fate, $reason); + } + return; + } + sub write { my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -7004,17 +6983,7 @@ END my $flag = $property->status || $table->status || $table_alias_object->status; - if ($flag) { - if ($flag ne $PLACEHOLDER) { - $flags{$flag} = $status_past_participles{$flag}; - } else { - $flags{$flag} = < $table_ref{$addr}, _Property => $self, - # gets property's status by default + # gets property's fate and status by default + Fate => $self->fate, Status => $self->status, _Status_Info => $self->status_info, - %args, - Internal_Only_Warning => 1); # Override any - # input param + %args); return unless defined $table; } @@ -7503,7 +7470,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News."); $type{$addr} = $NON_STRING; } - elsif ($type{$addr} != $ENUM) { + elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) { if (scalar main::uniques(values %{$table_ref{$addr}}) > 2 && $type{$addr} == $BINARY) { @@ -7593,6 +7560,11 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my $addr = do { no overloading; pack 'J', $self; }; + # Swash names are used only on regular map tables; otherwise there + # should be no access to the property map table from other parts of + # Perl. + return if $map{$addr}->fate != $ORDINARY; + return $file{$addr} if defined $file{$addr}; return $map{$addr}->external_name; } @@ -7645,6 +7617,23 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other}); } + sub set_proxy_for { + # Certain tables are not generally written out to files, but + # Unicode::UCD has the intelligence to know that the file for $self + # can be used to reconstruct those tables. This routine just changes + # things so that UCD pod entries for those suppressed tables are + # generated, so the fact that a proxy is used is invisible to the + # user. + + my $self = shift; + + foreach my $property_name (@_) { + my $ref = property_ref($property_name); + next if $ref->to_output_map; + $ref->set_fate($MAP_PROXIED); + } + } + sub set_type { # Set the type of the property. Mostly this is figured out by the # data in the table. But this is used to set it explicitly. The @@ -7656,36 +7645,39 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } my $type = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; - if ($type != $ENUM && $type != $BINARY && $type != $STRING) { + if ($type != $ENUM + && $type != $BINARY + && $type != $FORCED_BINARY + && $type != $STRING) + { Carp::my_carp("Unrecognized type '$type'. Type not set"); return; } { no overloading; $type{pack 'J', $self} = $type; } - return if $type != $BINARY; + return if $type != $BINARY && $type != $FORCED_BINARY; my $yes = $self->table('Y'); $yes = $self->table('Yes') if ! defined $yes; $yes = $self->add_match_table('Y', Full_Name => 'Yes') if ! defined $yes; - # Add aliases in order wanted, duplicates will be ignored. Note, that - # could run into problems in outputting things in that we don't - # distinguish between the name and full name of these. Hopefully, if - # the table was already created before this code is executed, it was - # done with these set properly. - $yes->add_alias('Y'); - $yes->add_alias('Yes'); - $yes->add_alias('T'); - $yes->add_alias('True'); - + # Add aliases in order wanted, duplicates will be ignored. We use a + # binary property present in all releases for its ordered lists of + # true/false aliases. Note, that could run into problems in + # outputting things in that we don't distinguish between the name and + # full name of these. Hopefully, if the table was already created + # before this code is executed, it was done with these set properly. + my $bm = property_ref("Bidi_Mirrored"); + foreach my $alias ($bm->table("Y")->aliases) { + $yes->add_alias($alias->name); + } my $no = $self->table('N'); $no = $self->table('No') if ! defined $no; $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no; - $no->add_alias('N'); - $no->add_alias('No'); - $no->add_alias('F'); - $no->add_alias('False'); + foreach my $alias ($bm->table("N")->aliases) { + $no->add_alias($alias->name); + } return; } @@ -7760,7 +7752,9 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # If already have figured these out, no need to do so again, but we do # a double check on ENUMS to make sure that a string property hasn't # improperly been classified as an ENUM, so continue on with those. - return if $type == $STRING || $type == $BINARY; + return if $type == $STRING + || $type == $BINARY + || $type == $FORCED_BINARY; # If every map is to a code point, is a string property. if ($type == $UNKNOWN @@ -7805,6 +7799,29 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } return; } + sub set_fate { + my $self = shift; + my $fate = shift; + my $reason = shift; # Ignored unless suppressing + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $addr = do { no overloading; pack 'J', $self; }; + if ($fate == $SUPPRESSED) { + $why_suppressed{$self->complete_name} = $reason; + } + + # Each table shares the property's fate, except that MAP_PROXIED + # doesn't affect match tables + $map{$addr}->set_fate($fate, $reason); + if ($fate != $MAP_PROXIED) { + foreach my $table ($map{$addr}, $self->tables) { + $table->set_fate($fate, $reason); + } + } + return; + } + + # Most of the accessors for a property actually apply to its map table. # Setup up accessor functions for those, referring to %map for my $sub (qw( @@ -7819,13 +7836,13 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } comment complete_name containing_range - core_access count default_map delete_range description each_range external_name + fate file_path format initialize @@ -7840,7 +7857,6 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } range_size_1 reset_each_range set_comment - set_core_access set_default_map set_file_path set_final_comment @@ -8242,7 +8258,7 @@ sub utf8_heavy_name ($$) { { # Closure - my $indent_increment = " " x 2; + my $indent_increment = " " x (($debugging_build) ? 2 : 0); my %already_output; $main::simple_dumper_nesting = 0; @@ -8256,7 +8272,7 @@ sub utf8_heavy_name ($$) { my $item = shift; my $indent = shift; - $indent = "" if ! defined $indent; + $indent = "" if ! $debugging_build || ! defined $indent; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -8281,9 +8297,8 @@ sub utf8_heavy_name ($$) { my $copy = $item; $copy = $UNDEF unless defined $copy; - # Quote non-numbers (numbers also have optional leading '-' and - # fractions) - if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) { + # Quote non-integers (integers also have optional leading '-') + if ($copy eq "" || $copy !~ /^ -? \d+ $/x) { # Escape apostrophe and backslash $copy =~ s/ ( ['\\] ) /\\$1/xg; @@ -8322,6 +8337,7 @@ sub utf8_heavy_name ($$) { # Indent array elements one level $output .= &simple_dumper($item->[$i], $next_indent); + next if ! $debugging_build; $output =~ s/\n$//; # Remove any trailing nl so $output .= " # [$i]\n"; # as to add a comment giving # the array index @@ -8541,17 +8557,10 @@ sub finish_property_setup { $gc->add_alias('Category'); # For backwards compatibility, these property files have particular names. - my $upper = property_ref('Uppercase_Mapping'); - $upper->set_core_access('uc()'); - $upper->set_file('Upper'); # This is what utf8.c calls it - - my $lower = property_ref('Lowercase_Mapping'); - $lower->set_core_access('lc()'); - $lower->set_file('Lower'); - - my $title = property_ref('Titlecase_Mapping'); - $title->set_core_access('ucfirst()'); - $title->set_file('Title'); + property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what + # utf8.c calls it + property_ref('Lowercase_Mapping')->set_file('Lower'); + property_ref('Titlecase_Mapping')->set_file('Title'); my $fold = property_ref('Case_Folding'); $fold->set_file('Fold') if defined $fold; @@ -9288,6 +9297,14 @@ sub output_perl_charnames_line ($$) { # the little used $compare_versions feature is enabled. my $compare_versions_range_list; + # These are constants to the $property_info hash in this subroutine, to + # avoid using a quoted-string which might have a typo. + my $TYPE = 'type'; + my $DEFAULT_MAP = 'default_map'; + my $DEFAULT_TABLE = 'default_table'; + my $PSEUDO_MAP_TYPE = 'pseudo_map_type'; + my $MISSINGS = 'missings'; + sub process_generic_property_file { # This processes a file containing property mappings and puts them # into internal map tables. It should be used to handle any property @@ -9466,22 +9483,22 @@ sub output_perl_charnames_line ($$) { # If not the first time for this property, retrieve info about # it from the cache - if (defined ($property_info{$property_addr}{'type'})) { - $property_type = $property_info{$property_addr}{'type'}; - $default_map = $property_info{$property_addr}{'default'}; + if (defined ($property_info{$property_addr}{$TYPE})) { + $property_type = $property_info{$property_addr}{$TYPE}; + $default_map = $property_info{$property_addr}{$DEFAULT_MAP}; $map_type - = $property_info{$property_addr}{'pseudo_map_type'}; + = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}; $default_table - = $property_info{$property_addr}{'default_table'}; + = $property_info{$property_addr}{$DEFAULT_TABLE}; } else { # Here, is the first time for this property. Set up the # cache. - $property_type = $property_info{$property_addr}{'type'} + $property_type = $property_info{$property_addr}{$TYPE} = $property_object->type; $map_type - = $property_info{$property_addr}{'pseudo_map_type'} + = $property_info{$property_addr}{$PSEUDO_MAP_TYPE} = $property_object->pseudo_map_type; # The Unicode files are set up so that if the map is not @@ -9495,7 +9512,7 @@ sub output_perl_charnames_line ($$) { else { $property_object->set_type($BINARY); $property_type - = $property_info{$property_addr}{'type'} + = $property_info{$property_addr}{$TYPE} = $BINARY; } } @@ -9520,17 +9537,17 @@ sub output_perl_charnames_line ($$) { if ($property_type == $STRING || $property_type == $UNKNOWN) { - $property_info{$addr}{'missings'} = $default; + $property_info{$addr}{$MISSINGS} = $default; } else { - $property_info{$addr}{'missings'} + $property_info{$addr}{$MISSINGS} = $property_object->table($default); } } # Finished storing all the @missings defaults in the input # file so far. Get the one for the current property. - my $missings = $property_info{$property_addr}{'missings'}; + my $missings = $property_info{$property_addr}{$MISSINGS}; # But we likely have separately stored what the default # should be. (This is to accommodate versions of the @@ -9594,7 +9611,7 @@ END $default_table = $missings; $default_map = $missings->full_name; } - $property_info{$property_addr}{'default_table'} + $property_info{$property_addr}{$DEFAULT_TABLE} = $default_table; } elsif ($default_map ne $missings) { @@ -9607,7 +9624,7 @@ END } } - $property_info{$property_addr}{'default'} + $property_info{$property_addr}{$DEFAULT_MAP} = $default_map; # If haven't done so already, find the table corresponding @@ -9617,7 +9634,7 @@ END && $property_type != $UNKNOWN) { $default_table = $property_info{$property_addr} - {'default_table'} + {$DEFAULT_TABLE} = $property_object->table($default_map); } } # End of is first time for this property @@ -9859,21 +9876,21 @@ END # first.) A comment for it will later be constructed based on the # actual properties present and used $perl_charname = Property->new('Perl_Charnames', - Core_Access => '\N{...} and "use charnames"', Default_Map => "", Directory => File::Spec->curdir(), File => 'Name', - Internal_Only_Warning => 1, + Fate => $INTERNAL_ONLY, Perl_Extension => 1, Range_Size_1 => \&output_perl_charnames_line, Type => $STRING, ); + $perl_charname->set_proxy_for('Name', 'Name_Alias'); my $Perl_decomp = Property->new('Perl_Decomposition_Mapping', Directory => File::Spec->curdir(), File => 'Decomposition', Format => $DECOMP_STRING_FORMAT, - Internal_Only_Warning => 1, + Fate => $INTERNAL_ONLY, Perl_Extension => 1, Default_Map => $CODE_POINT, @@ -9889,10 +9906,11 @@ END Map_Type => $COMPUTE_NO_MULTI_CP, Type => $STRING, ); + $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type'); $Perl_decomp->add_comment(join_lines(<initialize($full) if $simple->to_output_map(); my $simple_only = Property->new("_s$case", Type => $STRING, Default_Map => $CODE_POINT, Perl_Extension => 1, - Description => "The simple mappings for $case for code points that have full mappings as well"); + Fate => $INTERNAL_ONLY, + Description => "This contains the simple mappings for $case for just the code points that have different full mappings"); $simple_only->set_to_output_map($INTERNAL_MAP); $simple_only->add_comment(join_lines( <to_output_map()) { + $simple_only->set_proxy_for($simple_name); + } } return; @@ -10785,6 +10809,18 @@ sub filter_old_style_case_folding { $to_output_simple = property_ref('Simple_Case_Folding')->to_output_map; + if (! $to_output_simple) { + property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding'); + } + + # If we ever wanted to show that these tables were combined, a new + # property method could be created, like set_combined_props() + property_ref('Case_Folding')->add_comment(join_lines( <set_type($BINARY); - - # We have to change the default map, because the @missing line is - # misleading, given that we are treating it as binary. - $iicore->set_default_map('N'); - $iicore->table("Y") - ->add_note("Converted to a binary property as per unicode.org UAX #38."); + $iicore->set_type($FORCED_BINARY); + $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38."); + + # Unicode doesn't include the maps for this property, so don't + # warn that they are missing. + $iicore->set_pre_declared_maps(0); + $iicore->add_comment(join_lines( < $sc, Default_Map => $sc->default_map, Pre_Declared_Maps => 0, + Format => $STRING_WHITE_SPACE_LIST, ); $scx->add_comment(join_lines( <add_map(7, 7, "ALERT"); +} + sub finish_Unicode() { # This routine should be called after all the Unicode files have been read # in. It: @@ -11407,15 +11446,14 @@ sub finish_Unicode() { } $default_table->set_complement($non_default_table); } + else { - # This fills in any missing values with the default. It's - # tempting to save some time and memory in running this program - # by skipping this step for binary tables where the default - # is easily calculated. But it is needed for generating - # the test file, and other changes would also be required to do - # so. - $property->add_map(0, $LAST_UNICODE_CODEPOINT, - $default_map, Replace => $NO); + # This fills in any missing values with the default. It's not + # necessary to do this with binary properties, as the default + # is defined completely in terms of the Y table. + $property->add_map(0, $MAX_UNICODE_CODEPOINT, + $default_map, Replace => $NO); + } } # Have all we need to populate the match tables. @@ -11437,9 +11475,27 @@ sub finish_Unicode() { $table = $property->add_match_table($map); } + next if $table->complement != 0; # Don't need to populate these $table->add_range($range->start, $range->end); } + # A forced binary property has additional true/false tables which + # should have been set up when it was forced into binary. The false + # table matches exactly the same set as the property's default table. + # The true table matches the complement of that. The false table is + # not the same as an additional set of aliases on top of the default + # table, so use 'set_equivalent_to'. If it were implemented as + # additional aliases, various things would have to be adjusted, but + # especially, if the user wants to get a list of names for the table + # using Unicode::UCD::prop_value_aliases(), s/he should get a + # different set depending on whether they want the default table or + # the false table. + if ($property_type == $FORCED_BINARY) { + $property->table('N')->set_equivalent_to($default_table, + Related => 1); + $property->table('Y')->set_complement($default_table); + } + # For Perl 5.6 compatibility, all properties matchable in regexes can # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl. # But warn if this creates a conflict with a (new) Unicode property @@ -11590,7 +11646,7 @@ sub compile_perl() { # 'Any' is all code points. As an error check, instead of just setting it # to be that, construct it to be the union of all the major categories $Any = $perl->add_match_table('Any', - Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]", + Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]", Matches_All => 1); foreach my $major_table ($gc->tables) { @@ -11601,10 +11657,10 @@ sub compile_perl() { $Any += $major_table; } - if ($Any->max != $LAST_UNICODE_CODEPOINT) { + if ($Any->max != $MAX_UNICODE_CODEPOINT) { Carp::my_carp_bug("Generated highest code point (" . sprintf("%X", $Any->max) - . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.") + . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.") } if ($Any->range_count != 1 || $Any->min != 0) { Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.") @@ -11619,8 +11675,9 @@ sub compile_perl() { ); # Our internal-only property should be treated as more than just a - # synonym. - $perl->add_match_table('_CombAbove') + # synonym; grandfather it in to the pod. + $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1, + Fate => $INTERNAL_ONLY, Status => $DISCOURAGED) ->set_equivalent_to(property_ref('ccc')->table('Above'), Related => 1); @@ -11711,8 +11768,12 @@ sub compile_perl() { # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), # Modifier_Letter (Lm), or Modifier_Symbol (Sk). - # Perl has long had an internal-only alias for this property. - my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable'); + # Perl has long had an internal-only alias for this property; grandfather + # it in to the pod, but discourage its use. + my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable', + Re_Pod_Entry => 1, + Fate => $INTERNAL_ONLY, + Status => $DISCOURAGED); my $case_ignorable = property_ref('Case_Ignorable'); if (defined $case_ignorable && ! $case_ignorable->is_empty) { $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'), @@ -11905,8 +11966,9 @@ sub compile_perl() { Description => '\p{Punct} + ASCII-range \p{Symbol}', Initialize => $gc->table('Punctuation') + ($ASCII & $gc->table('Symbol')), + Perl_Extension => 1 ); - $perl->add_match_table('PosixPunct', + $perl->add_match_table('PosixPunct', Perl_Extension => 1, Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]', Initialize => $ASCII & $XPosixPunct, ); @@ -11955,8 +12017,11 @@ sub compile_perl() { # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier # than SD appeared, construct it ourselves, based on the first release SD - # was in. - my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ'); + # was in. A pod entry is grandfathered in for it + my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1, + Perl_Extension => 1, + Fate => $INTERNAL_ONLY, + Status => $DISCOURAGED); my $soft_dotted = property_ref('Soft_Dotted'); if (defined $soft_dotted && ! $soft_dotted->is_empty) { $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1); @@ -11977,15 +12042,17 @@ sub compile_perl() { } # These are used in Unicode's definition of \X - my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1); - my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1); + my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1, + Fate => $INTERNAL_ONLY); + my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1, + Fate => $INTERNAL_ONLY); # For backward compatibility, Perl has its own definition for IDStart # First, we include the underscore, and then the regular XID_Start also # have to be Words $perl->add_match_table('_Perl_IDStart', Perl_Extension => 1, - Internal_Only => 1, + Fate => $INTERNAL_ONLY, Initialize => ord('_') + (property_ref('XID_Start')->table('Y') & $Word) @@ -12053,7 +12120,9 @@ sub compile_perl() { # More GCB. If we found some hangul syllables, populate a combined # table. - my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V'); + my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V', + Perl_Extension => 1, + Fate => $INTERNAL_ONLY); my $LV = $gcb->table('LV'); if ($LV->is_empty) { push @tables_that_may_be_empty, $lv_lvt_v->complete_name; @@ -12104,6 +12173,13 @@ This file is for charnames.pm. It is the union of the $comment properties. Unicode_1_Name entries are used only for otherwise nameless code points. $alias_sentence +This file doesn't include the algorithmically determinable names. For those, +use 'unicore/Name.pm' +END + )); + property_ref('Name')->add_comment(join_lines( <new('In', Default_Map => $default_map, Full_Name => "Present_In", - Internal_Only_Warning => 1, Perl_Extension => 1, Type => $ENUM, Initialize => $age, @@ -12193,9 +12268,10 @@ END foreach my $alias ($table->aliases) { next if $alias->name =~ /^_/; $table->add_alias('Is_' . $alias->name, - Pod_Entry => 0, + Re_Pod_Entry => 0, + UCD => 0, Status => $alias->status, - Externally_Ok => 0); + OK_as_Filename => 0); } } @@ -12212,7 +12288,7 @@ END Initialize => $gc->table('Unassigned') & property_ref('Noncharacter_Code_Point')->table('N')); - for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) { + for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) { $i = populate_char_info($i); # Note sets $i so may cause skips } } @@ -12235,7 +12311,8 @@ sub add_perl_synonyms() { # Construct the list of tables to get synonyms for. Start with all the # binary and the General_Category ones. - my @tables = grep { $_->type == $BINARY } property_ref('*'); + my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY } + property_ref('*'); push @tables, $gc->tables; # If the version of Unicode includes the Script property, add its tables @@ -12304,8 +12381,8 @@ sub add_perl_synonyms() { # No name collision, so ok to add the perl synonym. - my $make_pod_entry; - my $externally_ok; + my $make_re_pod_entry; + my $ok_as_filename; my $status = $alias->status; if ($nominal_property == $block) { @@ -12315,36 +12392,36 @@ sub add_perl_synonyms() { # we don't want people using the name without the # 'In', so discourage that. if ($prefix eq "") { - $make_pod_entry = 1; + $make_re_pod_entry = 1; $status = $status || $DISCOURAGED; - $externally_ok = 0; + $ok_as_filename = 0; } elsif ($prefix eq 'In_') { - $make_pod_entry = 0; + $make_re_pod_entry = 0; $status = $status || $NORMAL; - $externally_ok = 1; + $ok_as_filename = 1; } else { - $make_pod_entry = 0; + $make_re_pod_entry = 0; $status = $status || $DISCOURAGED; - $externally_ok = 0; + $ok_as_filename = 0; } } elsif ($prefix ne "") { # The 'Is' prefix is handled in the pod by a wild # card, and we won't use it for an external name - $make_pod_entry = 0; + $make_re_pod_entry = 0; $status = $status || $NORMAL; - $externally_ok = 0; + $ok_as_filename = 0; } else { # Here, is an empty prefix, non block. This gets its # own pod entry and can be used for an external name. - $make_pod_entry = 1; + $make_re_pod_entry = 1; $status = $status || $NORMAL; - $externally_ok = 1; + $ok_as_filename = 1; } # Here, there isn't a perl pre-existing table with the @@ -12356,9 +12433,15 @@ sub add_perl_synonyms() { # Here, have found a table for $perl. Add this alias # to it, and are done with this prefix. $equivalent->add_alias($proposed_name, - Pod_Entry => $make_pod_entry, + Re_Pod_Entry => $make_re_pod_entry, + + # Currently don't output these in the + # ucd pod, as are strongly discouraged + # from being used + UCD => 0, + Status => $status, - Externally_Ok => $externally_ok); + OK_as_Filename => $ok_as_filename); trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace; next PREFIX; } @@ -12366,9 +12449,13 @@ sub add_perl_synonyms() { # Here, $perl doesn't already have a table that is a # synonym for this property, add one. my $added_table = $perl->add_match_table($proposed_name, - Pod_Entry => $make_pod_entry, + Re_Pod_Entry => $make_re_pod_entry, + + # See UCD comment just above + UCD => 0, + Status => $status, - Externally_Ok => $externally_ok); + OK_as_Filename => $ok_as_filename); # And it will be related to the actual table, since it is # based on it. $added_table->set_equivalent_to($actual, Related => 1); @@ -12454,7 +12541,10 @@ END # unless they are the same table. For example, N meaning Number or # Neutral is not likely to cause confusion, so don't add caveats to things # like them. - foreach my $property (grep { $_->type != $BINARY } property_ref('*')) { + foreach my $property (grep { $_->type != $BINARY + && $_->type != $FORCED_BINARY } + property_ref('*')) + { my $yes = $property->table('Yes'); if (defined $yes) { my $y = $property->table('Y'); @@ -12491,8 +12581,29 @@ sub register_file_for_name($$$) { if ($table->isa('Property')) { $table->set_file_path(@$directory_ref, $file); - push @map_properties, $table - if $directory_ref->[0] eq $map_directory; + push @map_properties, $table; + + # No swash means don't do the rest of this. + return if $table->fate != $ORDINARY; + + # Get the path to the file + my @path = $table->file_path; + + # Use just the file name if no subdirectory. + shift @path if $path[0] eq File::Spec->curdir(); + + my $file = join '/', @path; + + # Create a hash entry for utf8_heavy to get the file that stores this + # property's map table + foreach my $alias ($table->aliases) { + my $name = $alias->name; + $loose_property_to_file_of{standardize($name)} = $file; + } + + # And a way for utf8_heavy to find the proper key in the SwashInfo + # hash for this property. + $file_to_swash_name{$file} = "To" . $table->swash_name; return; } @@ -12530,12 +12641,38 @@ sub register_file_for_name($$$) { # Associate it with its file internally. Don't include the # $matches_directory first component $table->set_file_path(@$directory_ref, $file); + + # No swash means don't do the rest of this. + next if $table->isa('Map_Table') && $table->fate != $ORDINARY; + my $sub_filename = join('/', $directory_ref->[1, -1], $file); my $property = $table->property; - $property = ($property == $perl) - ? "" # 'perl' is never explicitly stated - : standardize($property->name) . '='; + my $property_name = ($property == $perl) + ? "" # 'perl' is never explicitly stated + : standardize($property->name) . '='; + + my $is_default = 0; # Is this table the default one for the property? + + # To calculate $is_default, we find if this table is the same as the + # default one for the property. But this is complicated by the + # possibility that there is a master table for this one, and the + # information is stored there instead of here. + my $parent = $table->parent; + my $leader_prop = $parent->property; + my $default_map = $leader_prop->default_map; + if (defined $default_map) { + my $default_table = $leader_prop->table($default_map); + $is_default = 1 if defined $default_table && $parent == $default_table; + } + + # Calculate the loose name for this table. Mostly it's just its name, + # standardized. But in the case of Perl tables that are single-form + # equivalents to Unicode properties, it is the latter's name. + my $loose_table_name = + ($property != $perl || $leader_prop == $perl) + ? standardize($table->name) + : standardize($parent->name); my $deprecated = ($table->status eq $DEPRECATED) ? $table->status_info @@ -12575,12 +12712,25 @@ sub register_file_for_name($$$) { if ((my $integer_name = $alias->name) =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) { - $stricter_to_file_of{$property . $integer_name} + $stricter_to_file_of{$property_name . $integer_name} = $sub_filename; } } } + # For Unicode::UCD, create a mapping of the prop=value to the + # canonical =value for that property. + if ($standard =~ /=/) { + + # This could happen if a strict name mapped into an existing + # loose name. In that event, the strict names would have to + # be moved to a new hash. + if (exists($loose_to_standard_value{$standard})) { + Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway"); + } + $loose_to_standard_value{$standard} = $loose_table_name; + } + # Keep a list of the deprecated properties and their filenames if ($deprecated && $complement == 0) { $utf8::why_deprecated{$sub_filename} = $deprecated; @@ -12590,6 +12740,10 @@ sub register_file_for_name($$$) { if ($caseless_equivalent != 0) { $caseless_equivalent_to{$standard} = $caseless_equivalent; } + + # Add to defaults list if the table this alias belongs to is the + # default one + $loose_defaults{$standard} = 1 if $is_default; } } @@ -12771,7 +12925,7 @@ sub format_pod_line ($$$;$$) { my @zero_match_tables; # List of tables that have no matches in this release -sub make_table_pod_entries($) { +sub make_re_pod_entries($) { # This generates the entries for the pod file for a given table. # Also done at this time are any children tables. The output looks like: # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178) @@ -12830,7 +12984,7 @@ sub make_table_pod_entries($) { foreach my $alias ($table->aliases) { # Skip if not to go in pod. - next unless $alias->make_pod_entry; + next unless $alias->make_re_pod_entry; # Start gathering all the components for the entry my $name = $alias->name; @@ -12858,9 +13012,31 @@ sub make_table_pod_entries($) { next if $name ne 'N' && $name ne 'Y'; $rhs = "$name*"; } - else { + elsif ($type != $FORCED_BINARY) { $rhs = $name; } + else { + + # Forced binary properties require special handling. It + # has two sets of tables, one set is true/false; and the + # other set is everything else. Entries are generated for + # each set. Use the Bidi_Mirrored property (which appears + # in all Unicode versions) to get a list of the aliases + # for the true/false tables. Of these, only output the N + # and Y ones, the same as, a regular binary property. And + # output all the rest, same as a non-binary property. + my $bm = property_ref("Bidi_Mirrored"); + if ($name eq 'N' || $name eq 'Y') { + $rhs = "$name*"; + } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases, + $bm->table("N")->aliases) + { + next; + } + else { + $rhs = $name; + } + } # Colon-space is used to give a little more space to be easier # to read; @@ -12963,10 +13139,17 @@ sub make_table_pod_entries($) { # Special case the binary N tables, so that will print # \P{single}, but use the Y table values to populate # 'single', as we haven't likewise populated the N table. + # For forced binary tables, we can't just look at the N + # table, but must see if this table is equivalent to the N + # one, as there are two equivalent beasts in these + # properties. my $test_table; my $p; - if ($type == $BINARY - && $input_table == $property->table('No')) + if ( ($type == $BINARY + && $input_table == $property->table('No')) + || ($type == $FORCED_BINARY + && $property->table('No')-> + is_set_equivalent_to($input_table))) { $test_table = $property->table('Yes'); $p = 'P'; @@ -13064,6 +13247,179 @@ sub make_table_pod_entries($) { return; } +sub make_ucd_table_pod_entries { + my $table = shift; + + # Generate the entries for the UCD section of the pod for $table. This + # also calculates if names are ambiguous, so has to be called even if the + # pod is not being output + + my $short_name = $table->name; + my $standard_short_name = standardize($short_name); + my $full_name = $table->full_name; + my $standard_full_name = standardize($full_name); + + my $full_info = ""; # Text of info column for full-name entries + my $other_info = ""; # Text of info column for short-name entries + my $short_info = ""; # Text of info column for other entries + my $meaning = ""; # Synonym of this table + + my $property = ($table->isa('Property')) + ? $table + : $table->parent->property; + + my $perl_extension = $table->perl_extension; + + # Get the more official name for for perl extensions that aren't + # stand-alone properties + if ($perl_extension && $property != $table) { + if ($property == $perl ||$property->type == $BINARY) { + $meaning = $table->complete_name; + } + else { + $meaning = $property->full_name . "=$full_name"; + } + } + + # There are three types of info column. One for the short name, one for + # the full name, and one for everything else. They mostly are the same, + # so initialize in the same loop. + foreach my $info_ref (\$full_info, \$short_info, \$other_info) { + if ($perl_extension && $property != $table) { + + # Add the synonymous name for the non-full name entries; and to + # the full-name entry if it adds extra information + if ($info_ref == \$other_info + || ($info_ref == \$short_info + && $standard_short_name ne $standard_full_name) + || standardize($meaning) ne $standard_full_name + ) { + $$info_ref .= "$meaning."; + } + } + elsif ($info_ref != \$full_info) { + + # Otherwise, the non-full name columns include the full name + $$info_ref .= $full_name; + } + + # And the full-name entry includes the short name, if different + if ($info_ref == \$full_info + && $standard_short_name ne $standard_full_name) + { + $full_info =~ s/\.\Z//; + $full_info .= " " if $full_info; + $full_info .= "(Short: $short_name)"; + } + + if ($table->perl_extension) { + $$info_ref =~ s/\.\Z//; + $$info_ref .= ". " if $$info_ref; + $$info_ref .= "(Perl extension)"; + } + } + + # Add any extra annotations to the full name entry + foreach my $more_info ($table->description, + $table->note, + $table->status_info) + { + next unless $more_info; + $full_info =~ s/\.\Z//; + $full_info .= ". " if $full_info; + $full_info .= $more_info; + } + + # These keep track if have created full and short name pod entries for the + # property + my $done_full = 0; + my $done_short = 0; + + # Every possible name is kept track of, even those that aren't going to be + # output. This way we can be sure to find the ambiguities. + foreach my $alias ($table->aliases) { + my $name = $alias->name; + my $standard = standardize($name); + my $info; + my $output_this = $alias->ucd; + + # If the full and short names are the same, we want to output the full + # one's entry, so it has priority. + if ($standard eq $standard_full_name) { + next if $done_full; + $done_full = 1; + $info = $full_info; + } + elsif ($standard eq $standard_short_name) { + next if $done_short; + $done_short = 1; + next if $standard_short_name eq $standard_full_name; + $info = $short_info; + } + else { + $info = $other_info; + } + + # Here, we have set up the two columns for this entry. But if an + # entry already exists for this name, we have to decide which one + # we're going to later output. + if (exists $ucd_pod{$standard}) { + + # If the two entries refer to the same property, it's not going to + # be ambiguous. (Likely it's because the names when standardized + # are the same.) But that means if they are different properties, + # there is ambiguity. + if ($ucd_pod{$standard}->{'property'} != $property) { + + # Here, we have an ambiguity. This code assumes that one is + # scheduled to be output and one not and that one is a perl + # extension (which is not to be output) and the other isn't. + # If those assumptions are wrong, things have to be rethought. + if ($ucd_pod{$standard}{'output_this'} == $output_this + || $ucd_pod{$standard}{'perl_extension'} == $perl_extension + || $output_this == $perl_extension) + { + Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway."); + } + + # We modifiy the info column of the one being output to + # indicate the ambiguity. Set $which to point to that one's + # info. + my $which; + if ($ucd_pod{$standard}{'output_this'}) { + $which = \$ucd_pod{$standard}->{'info'}; + } + else { + $which = \$info; + $meaning = $ucd_pod{$standard}{'meaning'}; + } + + chomp $$which; + $$which =~ s/\.\Z//; + $$which .= "; NOT '$standard' meaning '$meaning'"; + + $ambiguous_names{$standard} = 1; + } + + # Use the non-perl-extension variant + next unless $ucd_pod{$standard}{'perl_extension'}; + } + + # Store enough information about this entry that we can later look for + # ambiguities, and output it properly. + $ucd_pod{$standard} = { 'name' => $name, + 'info' => $info, + 'meaning' => $meaning, + 'output_this' => $output_this, + 'perl_extension' => $perl_extension, + 'property' => $property, + 'status' => $alias->status, + }; + } # End of looping through all this table's aliases + + return; +} + sub pod_alphanumeric_sort { # Sort pod entries alphanumerically. @@ -13115,6 +13471,8 @@ sub make_pod () { # Create the .pod file. This generates the various subsections and then # combines them in one big HERE document. + my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; + return unless defined $pod_directory; print "Making pod file\n" if $verbosity >= $PROGRESS; @@ -13146,7 +13504,7 @@ e.g., C<\\p{blk:latin1}>. See L for more information about this. END } - my $text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)"; + my $text = $Is_flags_text; $text = "$exception_message $text" if $has_Is_conflicts; # And the 'Is_ line'; @@ -13223,10 +13581,7 @@ END foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] } keys %why_list) { - # Add to the output, all the properties that have that reason. Start - # with an empty line. - push @bad_re_properties, "\n\n"; - + # Add to the output, all the properties that have that reason. my $has_item = 0; # Flag if actually output anything. foreach my $name (@{$why_list{$why}}) { @@ -13252,6 +13607,9 @@ END my $short_name = $property->name; $short_name .= '=' . $property->table($table)->name if $table; + # Start with an empty line. + push @bad_re_properties, "\n\n" unless $has_item; + # And add the property as an item for the reason. push @bad_re_properties, "\n=item I<$name> ($short_name)\n"; $has_item = 1; @@ -13265,57 +13623,68 @@ END } # End of looping through each reason. - # Generate a list of the properties whose map table we output, from the - # global @map_properties. - my @map_tables_actually_output; - my $info_indent = 20; # Left column is narrower than \p{} table. - foreach my $property (@map_properties) { - - # Get the path to the file; don't output any not in the standard - # directory. - my @path = $property->file_path; - next if $path[0] ne $map_directory; - - # Don't mention map tables that are for internal-use only - next if $property->to_output_map == $INTERNAL_MAP; - - shift @path; # Remove the standard name + if (! @bad_re_properties) { + push @bad_re_properties, + "*** This installation accepts ALL non-Unihan properties ***"; + } + else { + # Add =over only if non-empty to avoid an empty =over/=back section, + # which is considered bad form. + unshift @bad_re_properties, "\n=over 4\n"; + push @bad_re_properties, "\n=back\n"; + } - my $file = join '/', @path; # In case is in sub directory - my $info = $property->full_name; - my $short_name = $property->name; - if ($info ne $short_name) { - $info .= " ($short_name)"; - } - foreach my $more_info ($property->description, - $property->note, - $property->status_info) - { - next unless $more_info; - $info =~ s/\.\Z//; - $info .= ". $more_info"; - } - push @map_tables_actually_output, format_pod_line($info_indent, - $file, - $info, - $property->status); + # Similiarly, generate a list of files that we don't use, grouped by the + # reasons why. First, create a hash whose keys are the reasons, and whose + # values are anonymous arrays of all the files that share that reason. + my %grouped_by_reason; + foreach my $file (keys %ignored_files) { + push @{$grouped_by_reason{$ignored_files{$file}}}, $file; } - # Sort alphabetically, and fold for output - @map_tables_actually_output = sort - pod_alphanumeric_sort @map_tables_actually_output; - @map_tables_actually_output - = simple_fold(\@map_tables_actually_output, - ' ', - $info_indent, - $automatic_pod_indent); + # Then, sort each group. + foreach my $group (keys %grouped_by_reason) { + @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b } + @{$grouped_by_reason{$group}} ; + } - # Generate a list of the formats that can appear in the map tables. - my @map_table_formats; - foreach my $format (sort keys %map_table_formats) { - push @map_table_formats, " $format $map_table_formats{$format}\n"; + # Finally, create the output text. For each reason (sorted by the + # alphabetically first file that has that reason)... + my @unused_files; + foreach my $reason (sort { lc $grouped_by_reason{$a}->[0] + cmp lc $grouped_by_reason{$b}->[0] + } + keys %grouped_by_reason) + { + # Add all the files that have that reason to the output. Start + # with an empty line. + push @unused_files, "\n\n"; + push @unused_files, map { "\n=item F<$_> \n" } + @{$grouped_by_reason{$reason}}; + # And add the reason under the list of files + push @unused_files, "\n$reason\n"; + } + + # Similarly, create the output text for the UCD section of the pod + my @ucd_pod; + foreach my $key (keys %ucd_pod) { + next unless $ucd_pod{$key}->{'output_this'}; + push @ucd_pod, format_pod_line($indent_info_column, + $ucd_pod{$key}->{'name'}, + $ucd_pod{$key}->{'info'}, + $ucd_pod{$key}->{'status'}, + ); } + # Sort alphabetically, and fold for output + @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod; + my $ucd_pod = simple_fold(\@ucd_pod, + ' ', + $indent_info_column, + $automatic_pod_indent); + $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') + . "\n" + . $ucd_pod; local $" = ""; # Everything is ready to assemble. @@ -13330,25 +13699,39 @@ To change this file, edit $0 instead. =head1 NAME -$pod_file - Index of Unicode Version $string_version properties in Perl +$pod_file - Index of Unicode Version $string_version character properties in Perl =head1 DESCRIPTION -There are many properties in Unicode, and Perl provides access to almost all of -them, as well as some additional extensions and short-cut synonyms. +This document provides information about the portion of the Unicode database +that deals with character properties, that is the portion that is defined on +single code points. (L +below briefly mentions other data that Unicode provides.) + +Perl can provide access to all non-provisional Unicode character properties, +though not all are enabled by default. The omitted ones are the Unihan +properties (accessible via the CPAN module L) and certain +deprecated or Unicode-internal properties. (An installation may choose to +recompile Perl's tables to change this. See L.) -And just about all of the few that aren't accessible through the Perl -core are accessible through the modules: L and -L, and for Unihan properties, via the CPAN module -L. +For most purposes, access to Unicode properties from the Perl core is through +regular expression matches, as described in the next section. +For some special purposes, and to access the properties that are not suitable +for regular expression matching, all the Unicode character properties that +Perl handles are accessible via the standard L module, as +described in the section L. + +Perl also provides some additional extensions and short-cut synonyms +for Unicode properties. This document merely lists all available properties and does not attempt to explain what each property really means. There is a brief description of each -Perl extension. There is some detail about Blocks, Scripts, General_Category, +Perl extension; see L for more information on +these. There is some detail about Blocks, Scripts, General_Category, and Bidi_Class in L, but to find out about the intricacies of the -Unicode properties, refer to the Unicode standard. A good starting place is -L<$unicode_reference_url>. More information on the Perl extensions is in -L. +official Unicode properties, refer to the Unicode standard. A good starting +place is L<$unicode_reference_url>. Note that you can define your own properties; see L. @@ -13420,7 +13803,7 @@ There are several varieties of obsolescence: =item Stabilized -Obsolete properties may be stabilized. Such a determination does not indicate +A property may be stabilized. Such a determination does not indicate that the property should or should not be used; instead it is a declaration that the property will not be maintained nor extended for newly encoded characters. Such properties are marked with $a_bold_stabilized in the @@ -13428,7 +13811,7 @@ table. =item Deprecated -An obsolete property may be deprecated, perhaps because its original intent +A property may be deprecated, perhaps because its original intent has been replaced by another property, or because its specification was somehow defective. This means that its use is strongly discouraged, so much so that a warning will be issued if used, unless the @@ -13548,23 +13931,67 @@ $formatted_properties $zero_matches -=head1 Properties not accessible through \\p{} and \\P{} - -A few properties are accessible in Perl via various function calls only. -These are: +=head1 Properties accessible through Unicode::UCD + +All the Unicode character properties mentioned above (except for those marked +as for internal use by Perl) are also accessible by +L. + +Due to their nature, not all Unicode character properties are suitable for +regular expression matches, nor C. The remaining +non-provisional, non-internal ones are accessible via +L (except for those that this Perl installation +hasn't included; see L). + +For compatibility with other parts of Perl, all the single forms given in the +table in the L
+are recognized. BUT, there are some ambiguities between some Perl extensions +and the Unicode properties, all of which are silently resolved in favor of the +official Unicode property. To avoid surprises, you should only use +C for forms listed in the table below, which omits the +non-recommended ones. The affected forms are the Perl single form equivalents +of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of +C<\\p{gc=sc}>, which is treated by C as the C