use File::Path;
use File::Spec;
use Text::Tabs;
+use re "/aa";
sub DEBUG () { 0 } # Set to 0 for production; 1 for development
my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
);
# Properties that this program ignores.
-my @unimplemented_properties = (
-'Unicode_Radical_Stroke' # Remove if changing to handle this one.
-);
+my @unimplemented_properties;
+
+# With this release, it is automatically handled if the Unihan db is
+# downloaded
+push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
# There are several types of obsolete properties defined by Unicode. These
# must be hand-edited for every new Unicode release.
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so it's
# existence is not noted in the comment.
- 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
+ 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+
+ 'Indic_Matra_Category' => "Provisional",
+ 'Indic_Syllabic_Category' => "Provisional",
- 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
+ # Don't suppress ISO_Comment, as otherwise special handling is needed
+ # to differentiate between it and gc=c, which can be written as 'isc',
+ # which is the same characters as ISO_Comment's short name.
- 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
- 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
- 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
- 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
- 'Name' => "Accessible via 'use charnames;'",
- 'Name_Alias' => "Accessible via 'use charnames;'",
+ 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
+ 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
);
- # The following are suppressed because they were made contributory or
- # deprecated by Unicode before Perl ever thought about supporting them.
- foreach my $property ('Jamo_Short_Name',
- 'Grapheme_Link',
- 'Expands_On_NFC',
- 'Expands_On_NFD',
- 'Expands_On_NFKC',
- 'Expands_On_NFKD'
+ foreach my $property (
+
+ # The following are suppressed because they were made contributory
+ # or deprecated by Unicode before Perl ever thought about
+ # supporting them.
+ 'Jamo_Short_Name',
+ 'Grapheme_Link',
+ 'Expands_On_NFC',
+ 'Expands_On_NFD',
+ 'Expands_On_NFKC',
+ 'Expands_On_NFKD',
+
+ # The following are suppressed because they have been marked
+ # as deprecated for a sufficient amount of time
+ 'Other_Alphabetic',
+ 'Other_Default_Ignorable_Code_Point',
+ 'Other_Grapheme_Extend',
+ 'Other_ID_Continue',
+ 'Other_ID_Start',
+ 'Other_Lowercase',
+ 'Other_Math',
+ 'Other_Uppercase',
) {
$why_suppressed{$property} = $why_deprecated{$property};
}
'ReadMe.txt' => 'Documentation',
'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
- 'IndicMatraCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
- 'IndicSyllabicCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
);
+my %skipped_files; # List of files that we skip
+
### End of externally interesting definitions, except for @input_file_objects
my $HEADER=<<"EOF";
# if the flag is changed, the indefinite article referring to it in the
# documentation may need to be as well.
my $NORMAL = "";
-my $SUPPRESSED = 'z'; # The character should never actually be seen, since
- # it is suppressed
-my $PLACEHOLDER = 'P'; # A property that is defined as a placeholder in a
- # Unicode version that doesn't have it, but we need it
- # to be defined, if empty, to have things work.
- # Implies no pod entry generated
my $DEPRECATED = 'D';
my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
my %status_past_participles = (
$DISCOURAGED => 'discouraged',
- $SUPPRESSED => 'should never be generated',
$STABILIZED => 'stabilized',
$OBSOLETE => 'obsolete',
$DEPRECATED => 'deprecated',
);
+# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
+# externally documented.
+my $ORDINARY = 0; # The normal fate.
+my $MAP_PROXIED = 1; # The map table for the property isn't written out,
+ # but there is a file written that can be used to
+ # reconstruct this table
+my $SUPPRESSED = 3; # The file for this table is not written out.
+my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
+ # for Perl's internal use only
+my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
+ # Unicode version that doesn't have it, but we need it
+ # to be defined, if empty, to have things work.
+ # Implies no pod entry generated
+
# The format of the values of the tables:
my $EMPTY_FORMAT = "";
my $BINARY_FORMAT = 'b';
my $AUXILIARY = 'auxiliary';
# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
+# and into UCD.pl for the use of UCD.pm
my %loose_to_file_of; # loosely maps table names to their respective
# files
my %stricter_to_file_of; # same; but for stricter mapping.
+my %loose_property_to_file_of; # Maps a loose property name to its map file
+my %file_to_swash_name; # Maps the file name to its corresponding key name
+ # in the hash %utf8::SwashInfo
my %nv_floating_to_rational; # maps numeric values floating point numbers to
# their rational equivalent
my %loose_property_name_of; # Loosely maps (non_string) property names to
# standard form
+my %string_property_loose_to_name; # Same, for string properties.
+my %loose_defaults; # keys are of form "prop=value", where 'prop' is
+ # the property name in standard loose form, and
+ # 'value' is the default value for that property,
+ # also in standard loose form.
+my %loose_to_standard_value; # loosely maps table names to the canonical
+ # alias for them
+my %ambiguous_names; # keys are alias names (in standard form) that
+ # have more than one possible meaning.
+my %prop_aliases; # Keys are standard property name; values are each
+ # one's aliases
+my %prop_value_aliases; # Keys of top level are standard property name;
+ # values are keys to another hash, Each one is
+ # one of the property's values, in standard form.
+ # The values are that prop-val's aliases.
+my %ucd_pod; # Holds entries that will go into the UCD section of the pod
# Most properties are immune to caseless matching, otherwise you would get
# nonsensical results, as properties are a function of a code point, not
# contrast to the non_skip element, which is supposed to be used very
# temporarily for debugging. Sets 'optional' to 1. Also, files that we
# pretty much will never look at can be placed in the global
- # %ignored_files instead. Ones used here will be added to that list.
+ # %ignored_files instead. Ones used here will be added to %skipped files
main::set_access('skip', \%skip, 'c');
my %each_line_handler;
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $ignored_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr}
}
return $self;
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
my $fkey = File::Spec->rel2abs($file);
- my $expecting = delete $potential_files{$fkey};
- $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
+ my $expecting = delete $potential_files{lc($fkey)};
+
Carp::my_carp("Was not expecting '$file'.") if
! $expecting
&& ! defined $handle{$addr};
# Some aliases should not get their own entries in the re section of the
# pod, because they are covered by a wild-card, and some we want to
# discourage use of. Binary
- main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r');
+ main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
+
+ my %ucd;
+ # Is this documented to be accessible via Unicode::UCD
+ main::set_access('ucd', \%ucd, 'r', 's');
my %status;
# Aliases have a status, like deprecated, or even suppressed (which means
# they don't appear in documentation). Enum
main::set_access('status', \%status, 'r');
- my %externally_ok;
+ my %ok_as_filename;
# Similarly, some aliases should not be considered as usable ones for
# external use, such as file names, or we don't want documentation to
# recommend them. Boolean
- main::set_access('externally_ok', \%externally_ok, 'r');
+ main::set_access('ok_as_filename', \%ok_as_filename, 'r');
sub new {
my $class = shift;
$name{$addr} = shift;
$loose_match{$addr} = shift;
$make_re_pod_entry{$addr} = shift;
- $externally_ok{$addr} = shift;
+ $ok_as_filename{$addr} = shift;
$status{$addr} = shift;
+ $ucd{$addr} = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Null names are never ok externally
- $externally_ok{$addr} = 0 if $name{$addr} eq "";
+ $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
return $self;
}
# files.
main::set_access('note', \%note, 'readable_array');
- my %internal_only;
- # Boolean; if set this table is for internal core Perl only use.
- main::set_access('internal_only', \%internal_only, 'r');
+ my %fate;
+ # Enum; there are a number of possibilities for what happens to this
+ # table: it could be normal, or suppressed, or not for external use. See
+ # values at definition for $SUPPRESSED.
+ main::set_access('fate', \%fate, 'r');
my %find_table_from_alias;
# The parent property passes this pointer to a hash which this class adds
sub new {
# All arguments are key => value pairs, which you can see below, most
# of which match fields documented above. Otherwise: Re_Pod_Entry,
- # Externally_Ok, and Fuzzy apply to the names of the table, and are
+ # OK_as_Filename, and Fuzzy apply to the names of the table, and are
# documented in the Alias package
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
my $complete_name = $complete_name{$addr}
= delete $args{'Complete_Name'};
$format{$addr} = delete $args{'Format'};
- $internal_only{$addr} = delete $args{'Internal_Only'} || 0;
$output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
$property{$addr} = delete $args{'_Property'};
$range_list{$addr} = delete $args{'_Range_List'};
$status_info{$addr} = delete $args{'_Status_Info'} || "";
$range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
$caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
+ $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
+ my $ucd = delete $args{'UCD'};
my $description = delete $args{'Description'};
- my $externally_ok = delete $args{'Externally_Ok'};
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
my $loose_match = delete $args{'Fuzzy'};
my $note = delete $args{'Note'};
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
push @{$description{$addr}}, $description if $description;
push @{$note{$addr}}, $note if $note;
- if ($status{$addr} eq $PLACEHOLDER) {
+ if ($fate{$addr} == $PLACEHOLDER) {
# A placeholder table doesn't get documented, is a perl extension,
# and quite likely will be empty
$make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
$perl_extension = 1 if ! defined $perl_extension;
+ $ucd = 0 if ! defined $ucd;
push @tables_that_may_be_empty, $complete_name{$addr};
+ $self->add_comment(<<END);
+This is a placeholder because it is not in Version $string_version of Unicode,
+but is needed by the Perl core to work gracefully. Because it is not in this
+version of Unicode, it will not be listed in $pod_file.pod
+END
}
- elsif (! $status{$addr}) {
-
- # If hasn't set its status already, see if it is on one of the
- # lists of properties or tables that have particular statuses; if
- # not, is normal. The lists are prioritized so the most serious
- # ones are checked first
- if (exists $why_suppressed{$complete_name}
+ elsif (exists $why_suppressed{$complete_name}
# Don't suppress if overridden
&& ! grep { $_ eq $complete_name{$addr} }
@output_mapped_properties)
- {
- $status{$addr} = $SUPPRESSED;
- }
- elsif (exists $why_deprecated{$complete_name}) {
+ {
+ $fate{$addr} = $SUPPRESSED;
+ }
+ elsif ($fate{$addr} == $SUPPRESSED
+ && ! exists $why_suppressed{$property{$addr}->complete_name})
+ {
+ Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
+ # perhaps Fate => [ $SUPPRESSED, "reason" ]
+ }
+
+ # If hasn't set its status already, see if it is on one of the
+ # lists of properties or tables that have particular statuses; if
+ # not, is normal. The lists are prioritized so the most serious
+ # ones are checked first
+ if (! $status{$addr}) {
+ if (exists $why_deprecated{$complete_name}) {
$status{$addr} = $DEPRECATED;
}
elsif (exists $why_stabilized{$complete_name}) {
# Existence above doesn't necessarily mean there is a message
# associated with it. Use the most serious message.
if ($status{$addr}) {
- if ($why_suppressed{$complete_name}) {
- $status_info{$addr}
- = $why_suppressed{$complete_name};
- }
- elsif ($why_deprecated{$complete_name}) {
+ if ($why_deprecated{$complete_name}) {
$status_info{$addr}
= $why_deprecated{$complete_name};
}
$perl_extension{$addr} = $perl_extension || 0;
# Don't list a property by default that is internal only
- $make_re_pod_entry = 0 if ! defined $make_re_pod_entry
- && $internal_only{$addr};
+ if ($fate{$addr} > $MAP_PROXIED) {
+ $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
+ $ucd = 0 if ! defined $ucd;
+ }
+ else {
+ $ucd = 1 if ! defined $ucd;
+ }
# By convention what typically gets printed only or first is what's
# first in the list, so put the full name there for good output
# clarity. Other routines rely on the full name being first on the
# list
$self->add_alias($full_name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
+ UCD => $ucd,
);
# Then comes the other name, if meaningfully different.
if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
$self->add_alias($name{$addr},
- Externally_Ok => $externally_ok,
+ OK_as_Filename => $ok_as_filename,
Fuzzy => $loose_match,
Re_Pod_Entry => $make_re_pod_entry,
Status => $status{$addr},
+ UCD => $ucd,
);
}
my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
$make_re_pod_entry = $YES unless defined $make_re_pod_entry;
- my $externally_ok = delete $args{'Externally_Ok'};
- $externally_ok = 1 unless defined $externally_ok;
+ my $ok_as_filename = delete $args{'OK_as_Filename'};
+ $ok_as_filename = 1 unless defined $ok_as_filename;
my $status = delete $args{'Status'};
$status = $NORMAL unless defined $status;
+ my $ucd = delete $args{'UCD'} // 1;
+
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
# Capitalize the first letter of the alias unless it is one of the CJK
$insert_position,
0,
Alias->new($name, $loose_match, $make_re_pod_entry,
- $externally_ok, $status);
+ $ok_as_filename, $status, $ucd);
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
foreach my $alias ($self->aliases()) {
# Don't use an alias that isn't ok to use for an external name.
- next if ! $alias->externally_ok;
+ next if ! $alias->ok_as_filename;
my $name = main::Standardize($alias->name);
trace $self, $name if main::DEBUG && $to_trace;
return;
}
+ sub set_fate { # Set the fate of a table
+ my $self = shift;
+ my $fate = shift;
+ my $reason = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $addr = do { no overloading; pack 'J', $self; };
+
+ return if $fate{$addr} == $fate; # If no-op
+
+ # Can only change the ordinary fate, except if going to $MAP_PROXIED
+ return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
+
+ $fate{$addr} = $fate;
+
+ # Don't document anything to do with a non-normal fated table
+ if ($fate != $ORDINARY) {
+ my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
+ foreach my $alias ($self->aliases) {
+ $alias->set_ucd($put_in_pod);
+
+ # MAP_PROXIED doesn't affect the match tables
+ next if $fate == $MAP_PROXIED;
+ $alias->set_make_re_pod_entry($put_in_pod);
+ }
+ }
+
+ # Save the reason for suppression for output
+ if ($fate == $SUPPRESSED && defined $reason) {
+ $why_suppressed{$complete_name{$addr}} = $reason;
+ }
+
+ return;
+ }
+
sub lock {
# Don't allow changes to the table from now on. This stores a stack
# trace of where it was called, so that later attempts to modify it
\%anomalous_entries,
'readable_array');
- my %core_access;
- # This is a string, solely for documentation, indicating how one can get
- # access to this property via the Perl core.
- main::set_access('core_access', \%core_access, 'r', 's');
-
my %to_output_map;
# Enum as to whether or not to write out this map table:
# 0 don't output
# Optional initialization data for the table.
my $initialize = delete $args{'Initialize'};
- my $core_access = delete $args{'Core_Access'};
my $default_map = delete $args{'Default_Map'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
my $addr = do { no overloading; pack 'J', $self; };
$anomalous_entries{$addr} = [];
- $core_access{$addr} = $core_access;
$default_map{$addr} = $default_map;
$self->initialize($initialize) if defined $initialize;
if defined $global_to_output_map{$full_name};
# If table says to output, do so; if says to suppress it, do so.
- return $INTERNAL_MAP if $self->internal_only;
+ my $fate = $self->fate;
+ return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
- return 0 if $self->status eq $SUPPRESSED;
+ return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
my $type = $self->property->type;
my $return = $self->SUPER::header();
- $return .= $INTERNAL_ONLY_HEADER if $self->to_output_map == $INTERNAL_MAP;
+ if ($self->to_output_map == $INTERNAL_MAP) {
+ $return .= $INTERNAL_ONLY_HEADER;
+ }
+ else {
+ my $property_name = $self->property->full_name;
+ $return .= <<END;
+
+# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
+
+# This file is for internal use by core Perl only. It is retained for
+# backwards compatibility with applications that may have come to rely on it,
+# but its format and even its name or existence are subject to change without
+# notice in a future Perl version. Don't use it directly. Instead, its
+# contents are now retrievable through a stable API in the Unicode::UCD
+# module: Unicode::UCD::prop_invmap('$property_name').
+END
+ }
return $return;
}
# have our own flag for just this purpose; but it works now to exclude
# Perl generated synonyms from the lists for properties, where the
# name is always the proper Unicode one.
- my @property_aliases = grep { $_->externally_ok } $self->aliases;
+ my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
my $count = $self->count;
my $default_map = $default_map{$addr};
$property_aliases[$i]->name . '(cp)'
);
}
- $comment .=
- "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
-
- my $access = $core_access{$addr};
- if ($access) {
- $comment .= "accessible through the Perl core via $access.";
- }
- else {
- $comment .= "not accessible through the Perl core directly.";
- }
+ my $full_name = $self->property->full_name;
+ $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
# And append any commentary already set from the actual property.
$comment .= "\n\n" . $self->comment if $self->comment;
my $name = $self->property->swash_name;
+ # Currently there is nothing in the pre_body unless a swash is being
+ # generated.
+ return unless defined $name;
+
if (defined $swash_keys{$name}) {
Carp::my_carp(join_lines(<<END
Already created a swash name '$name' for $swash_keys{$name}. This means that
# Any tables that are equivalent to or children of this table must now
# instead be equivalent to or (children) to the new leader (parent),
# still equivalent. The equivalency includes their matches_all info,
- # and for related tables, their status
+ # and for related tables, their fate and status.
# All related tables are of necessity equivalent, but the converse
# isn't necessarily true
my $status = $other->status;
my $status_info = $other->status_info;
+ my $fate = $other->fate;
my $matches_all = $matches_all{other_addr};
my $caseless_equivalent = $other->caseless_equivalent;
foreach my $table ($current_leader, @{$equivalents{$leader}}) {
$parent{$table_addr} = $other;
push @{$children{$other_addr}}, $table;
$table->set_status($status, $status_info);
+
+ # This reason currently doesn't get exposed outside; otherwise
+ # would have to look up the parent's reason and use it instead.
+ $table->set_fate($fate, "Parent's fate");
+
$self->set_caseless_equivalent($caseless_equivalent);
}
}
return
}
+ sub set_fate {
+ my $self = shift;
+ my $fate = shift;
+ my $reason = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ $self->SUPER::set_fate($fate, $reason);
+
+ # All children share this fate
+ foreach my $child ($self->children) {
+ $child->set_fate($fate, $reason);
+ }
+ return;
+ }
+
sub write {
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $flag = $property->status
|| $table->status
|| $table_alias_object->status;
- if ($flag) {
- if ($flag ne $PLACEHOLDER) {
- $flags{$flag} = $status_past_participles{$flag};
- } else {
- $flags{$flag} = <<END;
-a placeholder because it is not in Version $string_version of Unicode, but is
-needed by the Perl core to work gracefully. Because it is not in this version
-of Unicode, it will not be listed in $pod_file.pod
-END
- }
- }
+ $flags{$flag} = $status_past_participles{$flag} if $flag;
$loose_count++;
$any_of_these = 'any of these'
}
- my $comment = "";
+ my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
if ($has_unrelated) {
$comment .= <<END;
This file is for tables that are not necessarily related: To conserve
foreach my $flag (sort keys %flags) {
$comment .= <<END;
'$flag' below means that this form is $flags{$flag}.
+Consult $pod_file.pod
END
- next if $flag eq $PLACEHOLDER;
- $comment .= "Consult $pod_file.pod\n";
}
$comment .= "\n";
}
_Alias_Hash => $table_ref{$addr},
_Property => $self,
- # gets property's status by default
+ # gets property's fate and status by default
+ Fate => $self->fate,
Status => $self->status,
_Status_Info => $self->status_info,
%args);
my $addr = do { no overloading; pack 'J', $self; };
+ # Swash names are used only on regular map tables; otherwise there
+ # should be no access to the property map table from other parts of
+ # Perl.
+ return if $map{$addr}->fate != $ORDINARY;
+
return $file{$addr} if defined $file{$addr};
return $map{$addr}->external_name;
}
return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
}
+ sub set_proxy_for {
+ # Certain tables are not generally written out to files, but
+ # Unicode::UCD has the intelligence to know that the file for $self
+ # can be used to reconstruct those tables. This routine just changes
+ # things so that UCD pod entries for those suppressed tables are
+ # generated, so the fact that a proxy is used is invisible to the
+ # user.
+
+ my $self = shift;
+
+ foreach my $property_name (@_) {
+ my $ref = property_ref($property_name);
+ next if $ref->to_output_map;
+ $ref->set_fate($MAP_PROXIED);
+ }
+ }
+
sub set_type {
# Set the type of the property. Mostly this is figured out by the
# data in the table. But this is used to set it explicitly. The
return;
}
+ sub set_fate {
+ my $self = shift;
+ my $fate = shift;
+ my $reason = shift; # Ignored unless suppressing
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $addr = do { no overloading; pack 'J', $self; };
+ if ($fate == $SUPPRESSED) {
+ $why_suppressed{$self->complete_name} = $reason;
+ }
+
+ # Each table shares the property's fate, except that MAP_PROXIED
+ # doesn't affect match tables
+ $map{$addr}->set_fate($fate, $reason);
+ if ($fate != $MAP_PROXIED) {
+ foreach my $table ($map{$addr}, $self->tables) {
+ $table->set_fate($fate, $reason);
+ }
+ }
+ return;
+ }
+
+
# Most of the accessors for a property actually apply to its map table.
# Setup up accessor functions for those, referring to %map
for my $sub (qw(
comment
complete_name
containing_range
- core_access
count
default_map
delete_range
description
each_range
external_name
+ fate
file_path
format
initialize
range_size_1
reset_each_range
set_comment
- set_core_access
set_default_map
set_file_path
set_final_comment
{ # Closure
- my $indent_increment = " " x 2;
+ my $indent_increment = " " x (($debugging_build) ? 2 : 0);
my %already_output;
$main::simple_dumper_nesting = 0;
my $item = shift;
my $indent = shift;
- $indent = "" if ! defined $indent;
+ $indent = "" if ! $debugging_build || ! defined $indent;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Indent array elements one level
$output .= &simple_dumper($item->[$i], $next_indent);
+ next if ! $debugging_build;
$output =~ s/\n$//; # Remove any trailing nl so
$output .= " # [$i]\n"; # as to add a comment giving
# the array index
$gc->add_alias('Category');
# For backwards compatibility, these property files have particular names.
- my $upper = property_ref('Uppercase_Mapping');
- $upper->set_core_access('uc()');
- $upper->set_file('Upper'); # This is what utf8.c calls it
-
- my $lower = property_ref('Lowercase_Mapping');
- $lower->set_core_access('lc()');
- $lower->set_file('Lower');
-
- my $title = property_ref('Titlecase_Mapping');
- $title->set_core_access('ucfirst()');
- $title->set_file('Title');
+ property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
+ # utf8.c calls it
+ property_ref('Lowercase_Mapping')->set_file('Lower');
+ property_ref('Titlecase_Mapping')->set_file('Title');
my $fold = property_ref('Case_Folding');
$fold->set_file('Fold') if defined $fold;
# the code point and name on each line. This was actually the hardest
# thing to design around. The code points in those ranges may actually
# have real maps not given by these two lines. These maps will either
- # be algorithmically determinable, or in the extracted files furnished
+ # be algorithmically determinable, or be in the extracted files furnished
# with the UCD. In the event of conflicts between these extracted files,
# and this one, Unicode says that this one prevails. But it shouldn't
# prevail for conflicts that occur in these ranges. The data from the
# first.) A comment for it will later be constructed based on the
# actual properties present and used
$perl_charname = Property->new('Perl_Charnames',
- Core_Access => '\N{...} and "use charnames"',
Default_Map => "",
Directory => File::Spec->curdir(),
File => 'Name',
- Internal_Only => 1,
+ Fate => $INTERNAL_ONLY,
Perl_Extension => 1,
Range_Size_1 => \&output_perl_charnames_line,
Type => $STRING,
);
+ $perl_charname->set_proxy_for('Name', 'Name_Alias');
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
Directory => File::Spec->curdir(),
File => 'Decomposition',
Format => $DECOMP_STRING_FORMAT,
- Internal_Only => 1,
+ Fate => $INTERNAL_ONLY,
Perl_Extension => 1,
Default_Map => $CODE_POINT,
Map_Type => $COMPUTE_NO_MULTI_CP,
Type => $STRING,
);
+ $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
$Perl_decomp->add_comment(join_lines(<<END
This mapping is a combination of the Unicode 'Decomposition_Type' and
'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
# The simple version's name in each mapping merely has an 's' in
# front of the full one's
- my $simple = property_ref('s' . $case);
+ my $simple_name = 's' . $case;
+ my $simple = property_ref($simple_name);
$simple->initialize($full) if $simple->to_output_map();
my $simple_only = Property->new("_s$case",
Type => $STRING,
Default_Map => $CODE_POINT,
Perl_Extension => 1,
- Internal_Only => 1,
- Description => "The simple mappings for $case for code points that have full mappings as well");
+ Fate => $INTERNAL_ONLY,
+ Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
$simple_only->set_to_output_map($INTERNAL_MAP);
$simple_only->add_comment(join_lines( <<END
This file is for UCD.pm so that it can construct simple mappings that would
otherwise be lost because they are overridden by full mappings.
END
));
+
+ unless ($simple->to_output_map()) {
+ $simple_only->set_proxy_for($simple_name);
+ }
}
return;
$to_output_simple
= property_ref('Simple_Case_Folding')->to_output_map;
+ if (! $to_output_simple) {
+ property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
+ }
+
# If we ever wanted to show that these tables were combined, a new
# property method could be created, like set_combined_props()
property_ref('Case_Folding')->add_comment(join_lines( <<END
# need to be finished up.
next if $property == $perl;
+ # Nor do we need to do anything with properties that aren't going to
+ # be output.
+ next if $property->fate == $SUPPRESSED;
+
# Handle the properties that have more than one possible default
if (ref $property->default_map) {
my $default_map = $property->default_map;
# Our internal-only property should be treated as more than just a
# synonym; grandfather it in to the pod.
- $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1)
+ $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
+ Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
->set_equivalent_to(property_ref('ccc')->table('Above'),
Related => 1);
# Perl has long had an internal-only alias for this property; grandfather
# it in to the pod, but discourage its use.
my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
- Re_Pod_Entry => 1);
+ Re_Pod_Entry => 1,
+ Fate => $INTERNAL_ONLY,
+ Status => $DISCOURAGED);
my $case_ignorable = property_ref('Case_Ignorable');
if (defined $case_ignorable && ! $case_ignorable->is_empty) {
$perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
my $Alnum = $perl->add_match_table('Alnum',
- Description => 'Alphabetic and (Decimal) Numeric',
+ Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
$Alnum->add_alias('XPosixAlnum');
# than SD appeared, construct it ourselves, based on the first release SD
# was in. A pod entry is grandfathered in for it
my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
- Perl_Extension => 1, Internal_Only => 1);
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY,
+ Status => $DISCOURAGED);
my $soft_dotted = property_ref('Soft_Dotted');
if (defined $soft_dotted && ! $soft_dotted->is_empty) {
$CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
# These are used in Unicode's definition of \X
my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
- Internal_Only => 1);
+ Fate => $INTERNAL_ONLY);
my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
- Internal_Only => 1);
+ Fate => $INTERNAL_ONLY);
# For backward compatibility, Perl has its own definition for IDStart
# First, we include the underscore, and then the regular XID_Start also
# have to be Words
$perl->add_match_table('_Perl_IDStart',
Perl_Extension => 1,
- Internal_Only => 1,
+ Fate => $INTERNAL_ONLY,
Initialize =>
ord('_')
+ (property_ref('XID_Start')->table('Y') & $Word)
# More GCB. If we found some hangul syllables, populate a combined
# table.
- my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V', Perl_Extension => 1, Internal_Only => 1);
+ my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
my $LV = $gcb->table('LV');
if ($LV->is_empty) {
push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
next if $alias->name =~ /^_/;
$table->add_alias('Is_' . $alias->name,
Re_Pod_Entry => 0,
+ UCD => 0,
Status => $alias->status,
- Externally_Ok => 0);
+ OK_as_Filename => 0);
}
}
# No name collision, so ok to add the perl synonym.
my $make_re_pod_entry;
- my $externally_ok;
+ my $ok_as_filename;
my $status = $alias->status;
if ($nominal_property == $block) {
if ($prefix eq "") {
$make_re_pod_entry = 1;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
elsif ($prefix eq 'In_') {
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
else {
$make_re_pod_entry = 0;
$status = $status || $DISCOURAGED;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
}
elsif ($prefix ne "") {
# card, and we won't use it for an external name
$make_re_pod_entry = 0;
$status = $status || $NORMAL;
- $externally_ok = 0;
+ $ok_as_filename = 0;
}
else {
# own pod entry and can be used for an external name.
$make_re_pod_entry = 1;
$status = $status || $NORMAL;
- $externally_ok = 1;
+ $ok_as_filename = 1;
}
# Here, there isn't a perl pre-existing table with the
# to it, and are done with this prefix.
$equivalent->add_alias($proposed_name,
Re_Pod_Entry => $make_re_pod_entry,
+
+ # Currently don't output these in the
+ # ucd pod, as are strongly discouraged
+ # from being used
+ UCD => 0,
+
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
next PREFIX;
}
# synonym for this property, add one.
my $added_table = $perl->add_match_table($proposed_name,
Re_Pod_Entry => $make_re_pod_entry,
+
+ # See UCD comment just above
+ UCD => 0,
+
Status => $status,
- Externally_Ok => $externally_ok);
+ OK_as_Filename => $ok_as_filename);
# And it will be related to the actual table, since it is
# based on it.
$added_table->set_equivalent_to($actual, Related => 1);
if ($table->isa('Property')) {
$table->set_file_path(@$directory_ref, $file);
- push @map_properties, $table
- if $directory_ref->[0] eq $map_directory;
+ push @map_properties, $table;
+
+ # No swash means don't do the rest of this.
+ return if $table->fate != $ORDINARY;
+
+ # Get the path to the file
+ my @path = $table->file_path;
+
+ # Use just the file name if no subdirectory.
+ shift @path if $path[0] eq File::Spec->curdir();
+
+ my $file = join '/', @path;
+
+ # Create a hash entry for utf8_heavy to get the file that stores this
+ # property's map table
+ foreach my $alias ($table->aliases) {
+ my $name = $alias->name;
+ $loose_property_to_file_of{standardize($name)} = $file;
+ }
+
+ # And a way for utf8_heavy to find the proper key in the SwashInfo
+ # hash for this property.
+ $file_to_swash_name{$file} = "To" . $table->swash_name;
return;
}
# Associate it with its file internally. Don't include the
# $matches_directory first component
$table->set_file_path(@$directory_ref, $file);
+
+ # No swash means don't do the rest of this.
+ next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
+
my $sub_filename = join('/', $directory_ref->[1, -1], $file);
my $property = $table->property;
- $property = ($property == $perl)
- ? "" # 'perl' is never explicitly stated
- : standardize($property->name) . '=';
+ my $property_name = ($property == $perl)
+ ? "" # 'perl' is never explicitly stated
+ : standardize($property->name) . '=';
+
+ my $is_default = 0; # Is this table the default one for the property?
+
+ # To calculate $is_default, we find if this table is the same as the
+ # default one for the property. But this is complicated by the
+ # possibility that there is a master table for this one, and the
+ # information is stored there instead of here.
+ my $parent = $table->parent;
+ my $leader_prop = $parent->property;
+ my $default_map = $leader_prop->default_map;
+ if (defined $default_map) {
+ my $default_table = $leader_prop->table($default_map);
+ $is_default = 1 if defined $default_table && $parent == $default_table;
+ }
+
+ # Calculate the loose name for this table. Mostly it's just its name,
+ # standardized. But in the case of Perl tables that are single-form
+ # equivalents to Unicode properties, it is the latter's name.
+ my $loose_table_name =
+ ($property != $perl || $leader_prop == $perl)
+ ? standardize($table->name)
+ : standardize($parent->name);
my $deprecated = ($table->status eq $DEPRECATED)
? $table->status_info
if ((my $integer_name = $alias->name)
=~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
{
- $stricter_to_file_of{$property . $integer_name}
+ $stricter_to_file_of{$property_name . $integer_name}
= $sub_filename;
}
}
}
+ # For Unicode::UCD, create a mapping of the prop=value to the
+ # canonical =value for that property.
+ if ($standard =~ /=/) {
+
+ # This could happen if a strict name mapped into an existing
+ # loose name. In that event, the strict names would have to
+ # be moved to a new hash.
+ if (exists($loose_to_standard_value{$standard})) {
+ Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
+ }
+ $loose_to_standard_value{$standard} = $loose_table_name;
+ }
+
# Keep a list of the deprecated properties and their filenames
if ($deprecated && $complement == 0) {
$utf8::why_deprecated{$sub_filename} = $deprecated;
if ($caseless_equivalent != 0) {
$caseless_equivalent_to{$standard} = $caseless_equivalent;
}
+
+ # Add to defaults list if the table this alias belongs to is the
+ # default one
+ $loose_defaults{$standard} = 1 if $is_default;
}
}
my @zero_match_tables; # List of tables that have no matches in this release
-sub make_table_pod_entries($) {
+sub make_re_pod_entries($) {
# This generates the entries for the pod file for a given table.
# Also done at this time are any children tables. The output looks like:
# \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
return;
}
+sub make_ucd_table_pod_entries {
+ my $table = shift;
+
+ # Generate the entries for the UCD section of the pod for $table. This
+ # also calculates if names are ambiguous, so has to be called even if the
+ # pod is not being output
+
+ my $short_name = $table->name;
+ my $standard_short_name = standardize($short_name);
+ my $full_name = $table->full_name;
+ my $standard_full_name = standardize($full_name);
+
+ my $full_info = ""; # Text of info column for full-name entries
+ my $other_info = ""; # Text of info column for short-name entries
+ my $short_info = ""; # Text of info column for other entries
+ my $meaning = ""; # Synonym of this table
+
+ my $property = ($table->isa('Property'))
+ ? $table
+ : $table->parent->property;
+
+ my $perl_extension = $table->perl_extension;
+
+ # Get the more official name for for perl extensions that aren't
+ # stand-alone properties
+ if ($perl_extension && $property != $table) {
+ if ($property == $perl ||$property->type == $BINARY) {
+ $meaning = $table->complete_name;
+ }
+ else {
+ $meaning = $property->full_name . "=$full_name";
+ }
+ }
+
+ # There are three types of info column. One for the short name, one for
+ # the full name, and one for everything else. They mostly are the same,
+ # so initialize in the same loop.
+ foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
+ if ($perl_extension && $property != $table) {
+
+ # Add the synonymous name for the non-full name entries; and to
+ # the full-name entry if it adds extra information
+ if ($info_ref == \$other_info
+ || ($info_ref == \$short_info
+ && $standard_short_name ne $standard_full_name)
+ || standardize($meaning) ne $standard_full_name
+ ) {
+ $$info_ref .= "$meaning.";
+ }
+ }
+ elsif ($info_ref != \$full_info) {
+
+ # Otherwise, the non-full name columns include the full name
+ $$info_ref .= $full_name;
+ }
+
+ # And the full-name entry includes the short name, if different
+ if ($info_ref == \$full_info
+ && $standard_short_name ne $standard_full_name)
+ {
+ $full_info =~ s/\.\Z//;
+ $full_info .= " " if $full_info;
+ $full_info .= "(Short: $short_name)";
+ }
+
+ if ($table->perl_extension) {
+ $$info_ref =~ s/\.\Z//;
+ $$info_ref .= ". " if $$info_ref;
+ $$info_ref .= "(Perl extension)";
+ }
+ }
+
+ # Add any extra annotations to the full name entry
+ foreach my $more_info ($table->description,
+ $table->note,
+ $table->status_info)
+ {
+ next unless $more_info;
+ $full_info =~ s/\.\Z//;
+ $full_info .= ". " if $full_info;
+ $full_info .= $more_info;
+ }
+
+ # These keep track if have created full and short name pod entries for the
+ # property
+ my $done_full = 0;
+ my $done_short = 0;
+
+ # Every possible name is kept track of, even those that aren't going to be
+ # output. This way we can be sure to find the ambiguities.
+ foreach my $alias ($table->aliases) {
+ my $name = $alias->name;
+ my $standard = standardize($name);
+ my $info;
+ my $output_this = $alias->ucd;
+
+ # If the full and short names are the same, we want to output the full
+ # one's entry, so it has priority.
+ if ($standard eq $standard_full_name) {
+ next if $done_full;
+ $done_full = 1;
+ $info = $full_info;
+ }
+ elsif ($standard eq $standard_short_name) {
+ next if $done_short;
+ $done_short = 1;
+ next if $standard_short_name eq $standard_full_name;
+ $info = $short_info;
+ }
+ else {
+ $info = $other_info;
+ }
+
+ # Here, we have set up the two columns for this entry. But if an
+ # entry already exists for this name, we have to decide which one
+ # we're going to later output.
+ if (exists $ucd_pod{$standard}) {
+
+ # If the two entries refer to the same property, it's not going to
+ # be ambiguous. (Likely it's because the names when standardized
+ # are the same.) But that means if they are different properties,
+ # there is ambiguity.
+ if ($ucd_pod{$standard}->{'property'} != $property) {
+
+ # Here, we have an ambiguity. This code assumes that one is
+ # scheduled to be output and one not and that one is a perl
+ # extension (which is not to be output) and the other isn't.
+ # If those assumptions are wrong, things have to be rethought.
+ if ($ucd_pod{$standard}{'output_this'} == $output_this
+ || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
+ || $output_this == $perl_extension)
+ {
+ Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
+ }
+
+ # We modifiy the info column of the one being output to
+ # indicate the ambiguity. Set $which to point to that one's
+ # info.
+ my $which;
+ if ($ucd_pod{$standard}{'output_this'}) {
+ $which = \$ucd_pod{$standard}->{'info'};
+ }
+ else {
+ $which = \$info;
+ $meaning = $ucd_pod{$standard}{'meaning'};
+ }
+
+ chomp $$which;
+ $$which =~ s/\.\Z//;
+ $$which .= "; NOT '$standard' meaning '$meaning'";
+
+ $ambiguous_names{$standard} = 1;
+ }
+
+ # Use the non-perl-extension variant
+ next unless $ucd_pod{$standard}{'perl_extension'};
+ }
+
+ # Store enough information about this entry that we can later look for
+ # ambiguities, and output it properly.
+ $ucd_pod{$standard} = { 'name' => $name,
+ 'info' => $info,
+ 'meaning' => $meaning,
+ 'output_this' => $output_this,
+ 'perl_extension' => $perl_extension,
+ 'property' => $property,
+ 'status' => $alias->status,
+ };
+ } # End of looping through all this table's aliases
+
+ return;
+}
+
sub pod_alphanumeric_sort {
# Sort pod entries alphanumerically.
# Create the .pod file. This generates the various subsections and then
# combines them in one big HERE document.
+ my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
+
return unless defined $pod_directory;
print "Making pod file\n" if $verbosity >= $PROGRESS;
about this.
END
}
- my $text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
+ my $text = $Is_flags_text;
$text = "$exception_message $text" if $has_Is_conflicts;
# And the 'Is_ line';
foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
keys %why_list)
{
- # Add to the output, all the properties that have that reason. Start
- # with an empty line.
- push @bad_re_properties, "\n\n";
-
+ # Add to the output, all the properties that have that reason.
my $has_item = 0; # Flag if actually output anything.
foreach my $name (@{$why_list{$why}}) {
my $short_name = $property->name;
$short_name .= '=' . $property->table($table)->name if $table;
+ # Start with an empty line.
+ push @bad_re_properties, "\n\n" unless $has_item;
+
# And add the property as an item for the reason.
push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
$has_item = 1;
} # End of looping through each reason.
+ if (! @bad_re_properties) {
+ push @bad_re_properties,
+ "*** This installation accepts ALL non-Unihan properties ***";
+ }
+ else {
+ # Add =over only if non-empty to avoid an empty =over/=back section,
+ # which is considered bad form.
+ unshift @bad_re_properties, "\n=over 4\n";
+ push @bad_re_properties, "\n=back\n";
+ }
+
# Similiarly, generate a list of files that we don't use, grouped by the
# reasons why. First, create a hash whose keys are the reasons, and whose
# values are anonymous arrays of all the files that share that reason.
foreach my $file (keys %ignored_files) {
push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
}
+ foreach my $file (keys %skipped_files) {
+ push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
+ }
# Then, sort each group.
foreach my $group (keys %grouped_by_reason) {
push @unused_files, "\n$reason\n";
}
- # Generate a list of the properties whose map table we output, from the
- # global @map_properties.
- my @map_tables_actually_output;
- my $info_indent = 20; # Left column is narrower than \p{} table.
- foreach my $property (@map_properties) {
-
- # Get the path to the file; don't output any not in the standard
- # directory.
- my @path = $property->file_path;
- next if $path[0] ne $map_directory;
-
- # Don't mention map tables that are for internal-use only
- next if $property->to_output_map == $INTERNAL_MAP;
-
- shift @path; # Remove the standard name
-
- my $file = join '/', @path; # In case is in sub directory
- my $info = $property->full_name;
- my $short_name = $property->name;
- if ($info ne $short_name) {
- $info .= " ($short_name)";
- }
- foreach my $more_info ($property->description,
- $property->note,
- $property->status_info)
- {
- next unless $more_info;
- $info =~ s/\.\Z//;
- $info .= ". $more_info";
- }
- push @map_tables_actually_output, format_pod_line($info_indent,
- $file,
- $info,
- $property->status);
+ # Similarly, create the output text for the UCD section of the pod
+ my @ucd_pod;
+ foreach my $key (keys %ucd_pod) {
+ next unless $ucd_pod{$key}->{'output_this'};
+ push @ucd_pod, format_pod_line($indent_info_column,
+ $ucd_pod{$key}->{'name'},
+ $ucd_pod{$key}->{'info'},
+ $ucd_pod{$key}->{'status'},
+ );
}
# Sort alphabetically, and fold for output
- @map_tables_actually_output = sort
- pod_alphanumeric_sort @map_tables_actually_output;
- @map_tables_actually_output
- = simple_fold(\@map_tables_actually_output,
- ' ',
- $info_indent,
- $automatic_pod_indent);
-
- # Generate a list of the formats that can appear in the map tables.
- my @map_table_formats;
- foreach my $format (sort keys %map_table_formats) {
- push @map_table_formats,
- Text::Tabs::expand("$format\t$map_table_formats{$format}\n");
- }
- @map_table_formats = simple_fold(\@map_table_formats,
- ' ',
- 8,
- $automatic_pod_indent);
+ @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
+ my $ucd_pod = simple_fold(\@ucd_pod,
+ ' ',
+ $indent_info_column,
+ $automatic_pod_indent);
+ $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
+ . "\n"
+ . $ucd_pod;
local $" = "";
# Everything is ready to assemble.
though not all are enabled by default. The omitted ones are the Unihan
properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
deprecated or Unicode-internal properties. (An installation may choose to
-recompile Perl's tables to change this. See L<Unicode regular expression
+recompile Perl's tables to change this. See L<Unicode character
properties that are NOT accepted by Perl>.)
+For most purposes, access to Unicode properties from the Perl core is through
+regular expression matches, as described in the next section.
+For some special purposes, and to access the properties that are not suitable
+for regular expression matching, all the Unicode character properties that
+Perl handles are accessible via the standard L<Unicode::UCD> module, as
+described in the section L</Properties accessible through Unicode::UCD>.
+
Perl also provides some additional extensions and short-cut synonyms
for Unicode properties.
$zero_matches
-=head1 Properties not accessible through \\p{} and \\P{}
-
-A few properties are accessible in Perl via various function calls only.
-These are:
+=head1 Properties accessible through Unicode::UCD
+
+All the Unicode character properties mentioned above (except for those marked
+as for internal use by Perl) are also accessible by
+L<Unicode::UCD/prop_invlist()>.
+
+Due to their nature, not all Unicode character properties are suitable for
+regular expression matches, nor C<prop_invlist()>. The remaining
+non-provisional, non-internal ones are accessible via
+L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
+hasn't included; see L<below for which those are|/Unicode character properties
+that are NOT accepted by Perl>).
+
+For compatibility with other parts of Perl, all the single forms given in the
+table in the L<section above|/Properties accessible through \\p{} and \\P{}>
+are recognized. BUT, there are some ambiguities between some Perl extensions
+and the Unicode properties, all of which are silently resolved in favor of the
+official Unicode property. To avoid surprises, you should only use
+C<prop_invmap()> for forms listed in the table below, which omits the
+non-recommended ones. The affected forms are the Perl single form equivalents
+of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
+C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
+whose short name is C<sc>. The table indicates the current ambiguities in the
+INFO column, beginning with the word C<"NOT">.
+
+The standard Unicode properties listed below are documented in
+L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
+L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
+L<perlunicode/Other Properties>;
+
+The first column in the table is a name for the property; the second column is
+an alternative name, if any, plus possibly some annotations. The alternative
+name is the property's full name, unless that would simply repeat the first
+column, in which case the second column indicates the property's short name
+(if different). The annotations are given only in the entry for the full
+name. If a property is obsolete, etc, the entry will be flagged with the same
+characters used in the table in the L<section above|/Properties accessible
+through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
+
+$ucd_pod
+
+=head1 Properties accessible through other means
+
+Certain properties are accessible also via core function calls. These are:
Lowercase_Mapping lc() and lcfirst()
Titlecase_Mapping ucfirst()
expressions.
And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
-interpolation in double-quoted strings and regular expressions, but both
-usages require a L<use charnames;|charnames> to be specified, which also
-contains related functions viacode(), vianame(), and string_vianame().
+interpolation in double-quoted strings and regular expressions; and functions
+C<charnames::viacode()>, C<charnames::vianame()>, and
+C<charnames::string_vianame()> (which require a C<use charnames ();> to be
+specified.
-=head1 Unicode regular expression properties that are NOT accepted by Perl
+Finally, most properties related to decomposition are accessible via
+L<Unicode::Normalize>.
+
+=head1 Unicode character properties that are NOT accepted by Perl
Perl will generate an error for a few character properties in Unicode when
used in a regular expression. The non-Unihan ones are listed below, with the
to accept any of these. The list is machine generated based on the
choices made for the installation that generated this document.
-=over 4
-
@bad_re_properties
-=back
-
An installation can choose to allow any of these to be matched by downloading
the Unicode database from L<http://www.unicode.org/Public/> to
C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
-=head1 Files in the I<To> directory (for serious hackers only)
-
-All Unicode properties are really mappings (in the mathematical sense) from
-code points to their respective values. As part of its build process,
-Perl constructs tables containing these mappings for all properties that it
-deals with. Some, but not all, of these are written out into files.
-Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
-(C<%Config> is available from the C<Config> module).
-
-Perl reserves the right to change the format and even the existence of any of
-those files without notice, except the ones that were in existence prior to
-release 5.14. If those change, a deprecation cycle will be done first. These
-are:
-
-@map_tables_actually_output
-
-Each of the files in this directory defines several hash entries to help
-reading programs decipher it. One of them looks like this:
-
- \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
-
-where "NAME" is a name to indicate the property. For backwards compatibility,
-this is not necessarily the property's official Unicode name. (The "To" is
-also for backwards compatibility.) The hash entry gives the format of the
-mapping fields of the table, currently one of the following:
-
-@map_table_formats
-
-This format applies only to the entries in the main body of the table.
-Entries defined in hashes or ones that are missing from the list can have a
-different format.
-
-The value that the missing entries have is given by another SwashInfo hash
-entry line; it looks like this:
-
- \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
-
-This example line says that any Unicode code points not explicitly listed in
-the file have the value "NaN" under the property indicated by NAME. If the
-value is the special string C<< <code point> >>, it means that the value for
-any missing code point is the code point itself. This happens, for example,
-in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
-character "A", are missing because the uppercase of "A" is itself.
-
-Finally, if the file contains a hash for special case entries, its name is
-specified by an entry that looks like this:
-
- \$utf8::SwashInfo{'ToNAME'}{'specials_name'} = 'utf8::ToSpecNAME';
-
-
=head1 Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version
# Create and write Heavy.pl, which passes info about the tables to
# utf8_heavy.pl
+ # Stringify structures for output
+ my $loose_property_name_of
+ = simple_dumper(\%loose_property_name_of, ' ' x 4);
+ chomp $loose_property_name_of;
+
+ my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
+ chomp $stricter_to_file_of;
+
+ my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
+ chomp $loose_to_file_of;
+
+ my $nv_floating_to_rational
+ = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
+ chomp $nv_floating_to_rational;
+
+ my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
+ chomp $why_deprecated;
+
+ # We set the key to the file when we associated files with tables, but we
+ # couldn't do the same for the value then, as we might not have the file
+ # for the alternate table figured out at that time.
+ foreach my $cased (keys %caseless_equivalent_to) {
+ my @path = $caseless_equivalent_to{$cased}->file_path;
+ my $path = join '/', @path[1, -1];
+ $caseless_equivalent_to{$cased} = $path;
+ }
+ my $caseless_equivalent_to
+ = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
+ chomp $caseless_equivalent_to;
+
+ my $loose_property_to_file_of
+ = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
+ chomp $loose_property_to_file_of;
+
+ my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
+ chomp $file_to_swash_name;
+
my @heavy = <<END;
$HEADER
$INTERNAL_ONLY_HEADER
-# This file is for the use of utf8_heavy.pl
+# This file is for the use of utf8_heavy.pl and Unicode::UCD
# Maps Unicode (not Perl single-form extensions) property names in loose
# standard form to their corresponding standard names
\%utf8::loose_property_name_of = (
-END
-
- push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
- push @heavy, <<END;
+$loose_property_name_of
);
# Maps property, table to file for those using stricter matching
\%utf8::stricter_to_file_of = (
-END
- push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
- push @heavy, <<END;
+$stricter_to_file_of
);
# Maps property, table to file for those using loose matching
\%utf8::loose_to_file_of = (
-END
- push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
- push @heavy, <<END;
+$loose_to_file_of
);
# Maps floating point to fractional form
\%utf8::nv_floating_to_rational = (
-END
- push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
- push @heavy, <<END;
+$nv_floating_to_rational
);
# If a floating point number doesn't have enough digits in it to get this
# the table, so as to avoid duplication, as many property names can map to the
# file, but we only need one entry for all of them.
\%utf8::why_deprecated = (
-END
-
- push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
- push @heavy, <<END;
+$why_deprecated
);
-# A few properties have different behavior under /i matching. This maps the
+# A few properties have different behavior under /i matching. This maps
# those to substitute files to use under /i.
\%utf8::caseless_equivalent = (
-END
+$caseless_equivalent_to
+);
- # We set the key to the file when we associated files with tables, but we
- # couldn't do the same for the value then, as we might not have the file
- # for the alternate table figured out at that time.
- foreach my $cased (keys %caseless_equivalent_to) {
- my @path = $caseless_equivalent_to{$cased}->file_path;
- my $path = join '/', @path[1, -1];
- $utf8::caseless_equivalent_to{$cased} = $path;
- }
- push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
- push @heavy, <<END;
+# Property names to mapping files
+\%utf8::loose_property_to_file_of = (
+$loose_property_to_file_of
+);
+
+# Files to the swash names within them.
+\%utf8::file_to_swash_name = (
+$file_to_swash_name
);
1;
push @name, <<END;
+package charnames;
+
# This module contains machine-generated tables and code for the
# algorithmically-determinable Unicode character names. The following
# routines can be used to translate between name and code point and vice versa
return;
}
+sub make_UCD () {
+ # Create and write UCD.pl, which passes info about the tables to
+ # Unicode::UCD
+
+ # Create a mapping from each alias of Perl single-form extensions to all
+ # its equivalent aliases, for quick look-up.
+ my %perlprop_to_aliases;
+ foreach my $table ($perl->tables) {
+
+ # First create the list of the aliases of each extension
+ my @aliases_list; # List of legal aliases for this extension
+
+ my $table_name = $table->name;
+ my $standard_table_name = standardize($table_name);
+ my $table_full_name = $table->full_name;
+ my $standard_table_full_name = standardize($table_full_name);
+
+ # Make sure that the list has both the short and full names
+ push @aliases_list, $table_name, $table_full_name;
+
+ my $found_ucd = 0; # ? Did we actually get an alias that should be
+ # output for this table
+
+ # Go through all the aliases (including the two just added), and add
+ # any new unique ones to the list
+ foreach my $alias ($table->aliases) {
+
+ # Skip non-legal names
+ next unless $alias->ok_as_filename;
+ next unless $alias->ucd;
+
+ $found_ucd = 1; # have at least one legal name
+
+ my $name = $alias->name;
+ my $standard = standardize($name);
+
+ # Don't repeat a name that is equivalent to one already on the
+ # list
+ next if $standard eq $standard_table_name;
+ next if $standard eq $standard_table_full_name;
+
+ push @aliases_list, $name;
+ }
+
+ # If there were no legal names, don't output anything.
+ next unless $found_ucd;
+
+ # To conserve memory in the program reading these in, omit full names
+ # that are identical to the short name, when those are the only two
+ # aliases for the property.
+ if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
+ pop @aliases_list;
+ }
+
+ # Here, @aliases_list is the list of all the aliases that this
+ # extension legally has. Now can create a map to it from each legal
+ # standardized alias
+ foreach my $alias ($table->aliases) {
+ next unless $alias->ucd;
+ next unless $alias->ok_as_filename;
+ push @{$perlprop_to_aliases{standardize($alias->name)}},
+ @aliases_list;
+ }
+ }
+
+ # Make a list of all combinations of properties/values that are suppressed.
+ my @suppressed;
+ foreach my $property_name (keys %why_suppressed) {
+
+ # Just the value
+ my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
+
+ # The hash may contain properties not in this release of Unicode
+ next unless defined (my $property = property_ref($property_name));
+
+ # Find all combinations
+ foreach my $prop_alias ($property->aliases) {
+ my $prop_alias_name = standardize($prop_alias->name);
+
+ # If no =value, there's just one combination possibe for this
+ if (! $value_name) {
+
+ # The property may be suppressed, but there may be a proxy for
+ # it, so it shouldn't be listed as suppressed
+ next if $prop_alias->ucd;
+ push @suppressed, $prop_alias_name;
+ }
+ else { # Otherwise
+ foreach my $value_alias ($property->table($value_name)->aliases)
+ {
+ next if $value_alias->ucd;
+
+ push @suppressed, "$prop_alias_name="
+ . standardize($value_alias->name);
+ }
+ }
+ }
+ }
+
+ # Convert the structure below (designed for Name.pm) to a form that UCD
+ # wants, so it doesn't have to modify it at all; i.e. so that it includes
+ # an element for the Hangul syllables in the appropriate place, and
+ # otherwise changes the name to include the "-<code point>" suffix.
+ my @algorithm_names;
+ my $done_hangul = 0;
+
+ # Copy it linearly.
+ for my $i (0 .. @code_points_ending_in_code_point - 1) {
+
+ # Insert the hanguls in the correct place.
+ if (! $done_hangul
+ && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
+ {
+ $done_hangul = 1;
+ push @algorithm_names, { low => $SBase,
+ high => $SBase + $SCount - 1,
+ name => '<hangul syllable>',
+ };
+ }
+
+ # Copy the current entry, modified.
+ push @algorithm_names, {
+ low => $code_points_ending_in_code_point[$i]->{'low'},
+ high => $code_points_ending_in_code_point[$i]->{'high'},
+ name =>
+ "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
+ };
+ }
+
+ # Serialize these structures for output.
+ my $loose_to_standard_value
+ = simple_dumper(\%loose_to_standard_value, ' ' x 4);
+ chomp $loose_to_standard_value;
+
+ my $string_property_loose_to_name
+ = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
+ chomp $string_property_loose_to_name;
+
+ my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
+ chomp $perlprop_to_aliases;
+
+ my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
+ chomp $prop_aliases;
+
+ my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
+ chomp $prop_value_aliases;
+
+ my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
+ chomp $suppressed;
+
+ my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
+ chomp $algorithm_names;
+
+ my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
+ chomp $ambiguous_names;
+
+ my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
+ chomp $loose_defaults;
+
+ my @ucd = <<END;
+$HEADER
+$INTERNAL_ONLY_HEADER
+
+# This file is for the use of Unicode::UCD
+
+# Highest legal Unicode code point
+\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
+
+# Hangul syllables
+\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
+\$Unicode::UCD::HANGUL_COUNT = $SCount;
+
+# Keys are all the possible "prop=value" combinations, in loose form; values
+# are the standard loose name for the 'value' part of the key
+\%Unicode::UCD::loose_to_standard_value = (
+$loose_to_standard_value
+);
+
+# String property loose names to standard loose name
+\%Unicode::UCD::string_property_loose_to_name = (
+$string_property_loose_to_name
+);
+
+# Keys are Perl extensions in loose form; values are each one's list of
+# aliases
+\%Unicode::UCD::loose_perlprop_to_name = (
+$perlprop_to_aliases
+);
+
+# Keys are standard property name; values are each one's aliases
+\%Unicode::UCD::prop_aliases = (
+$prop_aliases
+);
+
+# Keys of top level are standard property name; values are keys to another
+# hash, Each one is one of the property's values, in standard form. The
+# values are that prop-val's aliases. If only one specified, the short and
+# long alias are identical.
+\%Unicode::UCD::prop_value_aliases = (
+$prop_value_aliases
+);
+
+# Ordered (by code point ordinal) list of the ranges of code points whose
+# names are algorithmically determined. Each range entry is an anonymous hash
+# of the start and end points and a template for the names within it.
+\@Unicode::UCD::algorithmic_named_code_points = (
+$algorithm_names
+);
+
+# The properties that as-is have two meanings, and which must be disambiguated
+\%Unicode::UCD::ambiguous_names = (
+$ambiguous_names
+);
+
+# Keys are the prop-val combinations which are the default values for the
+# given property, expressed in standard loose form
+\%Unicode::UCD::loose_defaults = (
+$loose_defaults
+);
+
+# All combinations of names that are suppressed.
+# This is actually for UCD.t, so it knows which properties shouldn't have
+# entries. If it got any bigger, would probably want to put it in its own
+# file to use memory only when it was needed, in testing.
+\@Unicode::UCD::suppressed_properties = (
+$suppressed
+);
+
+1;
+END
+
+ main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
+ return;
+}
sub write_all_tables() {
# Write out all the tables generated by this program to files, as well as
# See if should suppress the table if is empty, but warn if it
# contains something.
- my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
- keys %why_suppress_if_empty_warn_if_not;
+ my $suppress_if_empty_warn_if_not
+ = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
# Calculate if this table should have any code points associated
# with it or not.
|| ($table == $property->table('N')
&& $property->table('Y')->is_empty));
-
- # Some tables should match everything
- my $expected_full =
- ($is_property)
- ? # All these types of map tables will be full because
- # they will have been populated with defaults
- ($type == $ENUM || $type == $FORCED_BINARY)
-
- : # A match table should match everything if its method
- # shows it should
- ($table->matches_all
-
- # The complement of an empty binary table will match
- # everything
- || $is_complement_of_empty_binary
- )
- ;
-
if ($table->is_empty) {
if ($suppress_if_empty_warn_if_not) {
- $table->set_status($SUPPRESSED,
- $why_suppress_if_empty_warn_if_not{$complete_name});
+ $table->set_fate($SUPPRESSED,
+ $suppress_if_empty_warn_if_not);
}
# Suppress (by skipping them) expected empty tables.
# this table is a child of another one to avoid duplicating
# the warning that should come from the parent one.
if (($table == $property || $table->parent == $table)
- && $table->status ne $SUPPRESSED
+ && $table->fate != $SUPPRESSED
+ && $table->fate != $MAP_PROXIED
&& ! grep { $complete_name =~ /^$_$/ }
@tables_that_may_be_empty)
{
elsif ($expected_empty) {
my $because = "";
if ($suppress_if_empty_warn_if_not) {
- $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
+ $because = " because $suppress_if_empty_warn_if_not";
}
Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
}
+ # Some tables should match everything
+ my $expected_full =
+ ($table->fate == $SUPPRESSED)
+ ? 0
+ : ($is_property)
+ ? # All these types of map tables will be full because
+ # they will have been populated with defaults
+ ($type == $ENUM || $type == $FORCED_BINARY)
+
+ : # A match table should match everything if its method
+ # shows it should
+ ($table->matches_all
+
+ # The complement of an empty binary table will match
+ # everything
+ || $is_complement_of_empty_binary
+ )
+ ;
+
my $count = $table->count;
if ($expected_full) {
if ($count != $MAX_UNICODE_CODEPOINTS) {
}
}
- if ($table->status eq $SUPPRESSED) {
+ if ($table->fate == $SUPPRESSED) {
if (! $is_property) {
my @children = $table->children;
foreach my $child (@children) {
- if ($child->status ne $SUPPRESSED) {
+ if ($child->fate != $SUPPRESSED) {
Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
}
}
next TABLE;
}
+
if (! $is_property) {
+ make_ucd_table_pod_entries($table) if $table->property == $perl;
+
# Several things need to be done just once for each related
# group of match tables. Do them on the parent.
if ($table->parent == $table) {
# Add an entry in the pod file for the table; it also does
# the children.
- make_table_pod_entries($table) if defined $pod_directory;
+ make_re_pod_entries($table) if defined $pod_directory;
# See if the the table matches identical code points with
# something that has already been output. In that case,
# Don't write out or make references to the $perl property
next if $table == $perl;
- if ($type != $STRING) {
-
- # There is a mapping stored of the various synonyms to the
- # standardized name of the property for utf8_heavy.pl.
- # Also, the pod file contains entries of the form:
- # \p{alias: *} \p{full: *}
- # rather than show every possible combination of things.
+ make_ucd_table_pod_entries($table);
+
+ # There is a mapping stored of the various synonyms to the
+ # standardized name of the property for utf8_heavy.pl.
+ # Also, the pod file contains entries of the form:
+ # \p{alias: *} \p{full: *}
+ # rather than show every possible combination of things.
+
+ my @property_aliases = $property->aliases;
+
+ my $full_property_name = $property->full_name;
+ my $property_name = $property->name;
+ my $standard_property_name = standardize($property_name);
+ my $standard_property_full_name
+ = standardize($full_property_name);
+
+ # We also create for Unicode::UCD a list of aliases for
+ # the property. The list starts with the property name;
+ # then its full name.
+ my @property_list;
+ my @standard_list;
+ if ( $property->fate <= $MAP_PROXIED) {
+ @property_list = ($property_name, $full_property_name);
+ @standard_list = ($standard_property_name,
+ $standard_property_full_name);
+ }
- my @property_aliases = $property->aliases;
+ # For each synonym ...
+ for my $i (0 .. @property_aliases - 1) {
+ my $alias = $property_aliases[$i];
+ my $alias_name = $alias->name;
+ my $alias_standard = standardize($alias_name);
- # The full name of this property is stored by convention
- # first in the alias array
- my $full_property_name =
- '\p{' . $property_aliases[0]->name . ': *}';
- my $standard_property_name = standardize($table->name);
- # For each synonym ...
- for my $i (0 .. @property_aliases - 1) {
- my $alias = $property_aliases[$i];
- my $alias_name = $alias->name;
- my $alias_standard = standardize($alias_name);
+ # Add other aliases to the list of property aliases
+ if ($property->fate <= $MAP_PROXIED
+ && ! grep { $alias_standard eq $_ } @standard_list)
+ {
+ push @property_list, $alias_name;
+ push @standard_list, $alias_standard;
+ }
- # For utf8_heavy, set the mapping of the alias to the
- # property
+ # For utf8_heavy, set the mapping of the alias to the
+ # property
+ if ($type == $STRING) {
+ if ($property->fate <= $MAP_PROXIED) {
+ $string_property_loose_to_name{$alias_standard}
+ = $standard_property_name;
+ }
+ }
+ else {
if (exists ($loose_property_name_of{$alias_standard}))
{
Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
= $standard_property_name;
}
- # Now for the pod entry for this alias. Skip if not
+ # Now for the re pod entry for this alias. Skip if not
# outputting a pod; skip the first one, which is the
# full name so won't have an entry like: '\p{full: *}
# \p{full: *}', and skip if don't want an entry for
|| ! defined $pod_directory
|| ! $alias->make_re_pod_entry;
- my $rhs = $full_property_name;
+ my $rhs = "\\p{$full_property_name: *}";
if ($property != $perl && $table->perl_extension) {
$rhs .= ' (Perl extension)';
}
$rhs,
$alias->status);
}
- } # End of non-string-like property code
+ }
+
+ # The list of all possible names is attached to each alias, so
+ # lookup is easy
+ if (@property_list) {
+ push @{$prop_aliases{$standard_list[0]}}, @property_list;
+ }
+ if ($property->fate <= $MAP_PROXIED) {
+
+ # Similarly, we create for Unicode::UCD a list of
+ # property-value aliases.
+
+ my $property_full_name = $property->full_name;
+
+ # Look at each table in the property...
+ foreach my $table ($property->tables) {
+ my @values_list;
+ my $table_full_name = $table->full_name;
+ my $standard_table_full_name
+ = standardize($table_full_name);
+ my $table_name = $table->name;
+ my $standard_table_name = standardize($table_name);
+
+ # The list starts with the table name and its full
+ # name.
+ push @values_list, $table_name, $table_full_name;
+
+ # We add to the table each unique alias that isn't
+ # discouraged from use.
+ foreach my $alias ($table->aliases) {
+ next if $alias->status
+ && $alias->status eq $DISCOURAGED;
+ my $name = $alias->name;
+ my $standard = standardize($name);
+ next if $standard eq $standard_table_name;
+ next if $standard eq $standard_table_full_name;
+ push @values_list, $name;
+ }
+
+ # Here @values_list is a list of all the aliases for
+ # the table. That is, all the property-values given
+ # by this table. By agreement with Unicode::UCD,
+ # if the name and full name are identical, and there
+ # are no other names, drop the duplcate entry to save
+ # memory.
+ if (@values_list == 2
+ && $values_list[0] eq $values_list[1])
+ {
+ pop @values_list
+ }
+
+ # To save memory, unlike the similar list for property
+ # aliases above, only the standard forms hve the list.
+ # This forces an extra step of converting from input
+ # name to standard name, but the savings are
+ # considerable. (There is only marginal savings if we
+ # did this with the property aliases.)
+ push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
+ }
+ }
# Don't write out a mapping file if not desired.
next if ! $property->to_output_map;
# Write out the pod file
make_pod;
- # And Heavy.pl, Name.pm
+ # And Heavy.pl, Name.pm, UCD.pl
make_Heavy;
make_Name_pm;
+ make_UCD;
make_property_test_script() if $make_test_script;
return;
push @property_aliases, map { Alias->new("Is_" . $_->name,
$_->loose_match,
$_->make_re_pod_entry,
- $_->externally_ok,
- $_->status)
+ $_->ok_as_filename,
+ $_->status,
+ $_->ucd,
+ )
} @property_aliases;
my $max = max(scalar @table_aliases, scalar @property_aliases);
for my $j (0 .. $max - 1) {
Pre_Handler => \&setup_script_extensions,
Each_Line_Handler => \&filter_script_extensions_line,
),
+ # The two Indic files are actually available starting in v6.0.0, but their
+ # property values are missing from PropValueAliases.txt in that release,
+ # so that further work would have to be done to get them to work properly
+ # for that release.
+ Input_file->new('IndicMatraCategory.txt', v6.1.0,
+ Property => 'Indic_Matra_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
+ ),
+ Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
+ Property => 'Indic_Syllabic_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ Skip => "Provisional; for the analysis and processing of Indic scripts",
+ ),
);
# End of all the preliminaries.
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
- $potential_files{$full} = 1
- if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
+ $potential_files{lc $full} = 1
+ if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
}
}
}
my @unknown_input_files;
- foreach my $file (keys %potential_files) {
- next if grep { lc($file) eq lc($_) } @known_files;
+ foreach my $file (keys %potential_files) { # The keys are stored in lc
+ next if grep { $file eq lc($_) } @known_files;
# Here, the file is unknown to us. Get relative path name
$file = File::Spec->abs2rel($file);