}
}
+# Enum values for to_output_map() method in the Map_Table package.
+# A value of 0 (false) means the map table is not written out at all.
+my $EXTERNAL_MAP = 1;
+my $INTERNAL_MAP = 2;
+
+# To override computed values for writing the map tables for these properties.
+# The default for enum map tables is to write them out, so that the Unicode
+# .txt files can be removed, but all the data to compute any property value
+# for any code point is available in a more compact form.
+# Keys are full property names; values are 0 (suppress the table) or one of
+# the enum values above.
+my %global_to_output_map = (
+ # Needed by UCD.pm, but don't want to publicize that it exists, so won't
+ # get stuck supporting it if things change. Since it is a STRING property,
+ # it normally would be listed in the pod, but INTERNAL_MAP suppresses
+ # that.
+ Unicode_1_Name => $INTERNAL_MAP,
+
+ Present_In => 0, # Suppress, as easily computed from Age
+ Canonical_Combining_Class => 0, # Duplicate of CombiningClass.pl
+ Block => 0, # Suppress, as Blocks.txt is retained.
+);
+
# Properties that this program ignores.
my @unimplemented_properties = (
'Unicode_Radical_Stroke' # Remove if changing to handle this one.
my $other_properties = 'other properties';
my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
- my $why_no_expand = "Deprecated by Unicode: less useful than UTF-specific calculations",
+ my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
%why_deprecated = (
'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
- 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames",
'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
# their rational equivalent
my %loose_property_name_of; # Loosely maps property names to standard form
+# Most properties are immune to caseless matching, otherwise you would get
+# nonsensical results, as properties are a function of a code point, not
+# everything that is caselessly equivalent to that code point. For example,
+# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
+# be true because 's' and 'S' are equivalent caselessly. However,
+# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
+# extend that concept to those very few properties that are like this. Each
+# such property will match the full range caselessly. They are hard-coded in
+# the program; it's not worth trying to make it general as it's extremely
+# unlikely that they will ever change.
+my %caseless_equivalent_to;
+
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
# syllables. The '_string' versions are so generated tables can retain the
# the character very frequently used.
return $try_hard if $code == 0x0000;
- return 0 if $try_hard; # XXX Temporary until fix utf8.c
-
# shun non-character code points.
return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
# A comment about its being obsolete, or whatever non normal status it has
main::set_access('status_info', \%status_info, 'r');
+ my %caseless_equivalent;
+ # The table this is equivalent to under /i matching, if any.
+ main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
+
my %range_size_1;
# Is the table to be output with each range only a single code point?
# This is done to avoid breaking existing code that may have come to rely
$status{$addr} = delete $args{'Status'} || $NORMAL;
$status_info{$addr} = delete $args{'_Status_Info'} || "";
$range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
+ $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
my $description = delete $args{'Description'};
my $externally_ok = delete $args{'Externally_Ok'};
main::set_access('core_access', \%core_access, 'r', 's');
my %to_output_map;
- # Boolean as to whether or not to write out this map table
+ # Enum as to whether or not to write out this map table:
+ # $EXTERNAL_MAP means its existence is noted in the documentation, and
+ # it should not be removed nor its format changed. This
+ # is done for those files that have traditionally been
+ # output.
+ # $INTERNAL_MAP means Perl reserves the right to do anything it wants
+ # with this file
main::set_access('to_output_map', \%to_output_map, 's');
my $default_map = delete $args{'Default_Map'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
+
# Rest of parameters passed on
my $range_list = Range_Map->new(Owner => $property);
return $to_output_map{$addr} if defined $to_output_map{$addr};
my $full_name = $self->full_name;
+ return $global_to_output_map{$full_name}
+ if defined $global_to_output_map{$full_name};
- # If table says to output, do so; if says to suppress it, do do.
- return 1 if grep { $_ eq $full_name } @output_mapped_properties;
+ # If table says to output, do so; if says to suppress it, do so.
+ return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
return 0 if $self->status eq $SUPPRESSED;
my $type = $self->property->type;
return 0 if $type == $BINARY;
# But do want to output string ones.
- return 1 if $type == $STRING;
+ return $EXTERNAL_MAP if $type == $STRING;
- # Otherwise is an $ENUM, don't output it
- return 0;
+ # Otherwise is an $ENUM, do output it, for Perl's purposes
+ return $INTERNAL_MAP;
}
sub inverse_list {
return ~ $current;
}
+ sub header {
+     # Returns the text that heads this map table's output file: whatever
+     # the parent class generates, with the $INTERNAL_ONLY notice appended
+     # when this table is marked as an internal-use-only map.
+
+     my $self = shift;
+     Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+     my $header = $self->SUPER::header();
+     if ($self->to_output_map == $INTERNAL_MAP) {
+         $header .= $INTERNAL_ONLY;
+     }
+
+     return $header;
+ }
+
sub set_final_comment {
# Just before output, create the comment that heads the file
# containing this table.
$self->_set_format($format);
+ # Core Perl has a different definition of mapping ranges than we do,
+ # that is applicable mainly to mapping code points, so for tables
+ # where it is possible that core Perl could be used to read it,
+ # make it range size 1 to prevent possible confusion
+ $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
+
return $self->SUPER::write(
($self->property == $block)
? 7 # block file needs more tab stops
my $status = $other->status;
my $status_info = $other->status_info;
my $matches_all = $matches_all{other_addr};
+ my $caseless_equivalent = $other->caseless_equivalent;
foreach my $table ($current_leader, @{$equivalents{$leader}}) {
next if $table == $other;
trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
$parent{$table_addr} = $other;
push @{$children{$other_addr}}, $table;
$table->set_status($status, $status_info);
+ $self->set_caseless_equivalent($caseless_equivalent);
}
}
return lc $name;
}
+sub utf8_heavy_name ($$) {
+    # Returns the name that utf8_heavy.pl will use to find a table.  The
+    # name is the standardized property name followed by '=' (omitted
+    # entirely for the 'perl' property, which is never explicitly stated),
+    # followed by the alias name.  Loosely matched aliases have their name
+    # standardized; strictly matched ones are merely lowercased.
+    #
+    # XXX perhaps this function should be placed somewhere, like Heavy.pl,
+    # so that utf8_heavy can use it directly without duplicating code that
+    # can get out of sync.
+
+    my $table = shift;
+    my $alias = shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    my $owner = $table->property;
+    my $prefix = "";
+    $prefix = standardize($owner->name) . '=' unless $owner == $perl;
+
+    return $prefix . standardize($alias->name) if $alias->loose_match;
+
+    return lc ($prefix . $alias->name);
+}
+
{ # Closure
my $indent_increment = " " x 2;
my $fold = property_ref('Case_Folding');
$fold->set_file('Fold') if defined $fold;
- # utf8.c can't currently cope with non range-size-1 for these, and even if
- # it were changed to do so, someone else may be using them, expecting the
- # old style
+ # utf8.c has a different meaning for non range-size-1 for map properties
+ # that this program doesn't currently handle; and even if it were changed
+ # to do so, some other code may be using them expecting range size 1.
foreach my $property (qw {
Case_Folding
Lowercase_Mapping
return;
}
-sub setup_special_casing {
- # SpecialCasing.txt contains the non-simple case change mappings. The
- # simple ones are in UnicodeData.txt, which should already have been read
- # in to the full property data structures, so as to initialize these with
- # the simple ones. Then the SpecialCasing.txt entries overwrite the ones
- # which have different full mappings.
-
- # This routine sees if the simple mappings are to be output, and if so,
- # copies what has already been put into the full mapping tables, while
- # they still contain only the simple mappings.
-
- # The reason it is done this way is that the simple mappings are probably
- # not going to be output, so it saves work to initialize the full tables
- # with the simple mappings, and then overwrite those relatively few
- # entries in them that have different full mappings, and thus skip the
- # simple mapping tables altogether.
-
- my $file= shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+{ # Closure
+ my $lc; # Table for lowercase mapping
+ my $tc;
+ my $uc;
+
+ sub setup_special_casing {
+ # SpecialCasing.txt contains the non-simple case change mappings. The
+ # simple ones are in UnicodeData.txt, which should already have been
+ # read in to the full property data structures, so as to initialize
+ # these with the simple ones. Then the SpecialCasing.txt entries
+ # overwrite the ones which have different full mappings.
+
+ # This routine sees if the simple mappings are to be output, and if
+ # so, copies what has already been put into the full mapping tables,
+ # while they still contain only the simple mappings.
+
+ # The reason it is done this way is that the simple mappings are
+ # probably not going to be output, so it saves work to initialize the
+ # full tables with the simple mappings, and then overwrite those
+ # relatively few entries in them that have different full mappings,
+ # and thus skip the simple mapping tables altogether.
+
+ my $file= shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- # For each of the case change mappings...
- foreach my $case ('lc', 'tc', 'uc') {
- my $full = property_ref($case);
- unless (defined $full && ! $full->is_empty) {
- Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
+ $lc = property_ref('lc');
+ $tc = property_ref('tc');
+ $uc = property_ref('uc');
+
+ # For each of the case change mappings...
+ foreach my $case_table ($lc, $tc, $uc) {
+ my $case = $case_table->name;
+ my $full = property_ref($case);
+ unless (defined $full && ! $full->is_empty) {
+ Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
+ }
+
+ # The simple version's name in each mapping merely has an 's' in
+ # front of the full one's
+ my $simple = property_ref('s' . $case);
+ $simple->initialize($full) if $simple->to_output_map();
+
+ my $simple_only = Property->new("_s$case",
+ Type => $STRING,
+ Default_Map => $CODE_POINT,
+ Perl_Extension => 1,
+ Description => "The simple mappings for $case for code points that have full mappings as well");
+ $simple_only->set_to_output_map($INTERNAL_MAP);
+ $simple_only->add_comment(join_lines( <<END
+This file is for UCD.pm so that it can construct simple mappings that would
+otherwise be lost because they are overridden by full mappings.
+END
+ ));
}
- # The simple version's name in each mapping merely has an 's' in front
- # of the full one's
- my $simple = property_ref('s' . $case);
- $simple->initialize($full) if $simple->to_output_map();
+ return;
}
- return;
-}
+ sub filter_special_casing_line {
+ # Change the format of $_ from SpecialCasing.txt into something that
+ # the generic handler understands. Each input line contains three
+ # case mappings. This will generate three lines to pass to the
+ # generic handler for each of those.
-sub filter_special_casing_line {
- # Change the format of $_ from SpecialCasing.txt into something that the
- # generic handler understands. Each input line contains three case
- # mappings. This will generate three lines to pass to the generic handler
- # for each of those.
-
- # The input syntax (after stripping comments and trailing white space is
- # like one of the following (with the final two being entries that we
- # ignore):
- # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
- # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
- # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
- # Note the trailing semi-colon, unlike many of the input files. That
- # means that there will be an extra null field generated by the split
+ # The input syntax (after stripping comments and trailing white space)
+ # is like one of the following (with the final two being entries that
+ # we ignore):
+ # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
+ # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
+ # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
+ # Note the trailing semi-colon, unlike many of the input files. That
+ # means that there will be an extra null field generated by the split
- my $file = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+ my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
+ # fields
+
+ # field #4 is when this mapping is conditional. If any of these get
+ # implemented, it would be by hard-coding in the casing functions in
+ # the Perl core, not through tables. But if there is a new condition
+ # we don't know about, output a warning. We know about all the
+ # conditions through 6.0
+ if ($fields[4] ne "") {
+ my @conditions = split ' ', $fields[4];
+ if ($conditions[0] ne 'tr' # We know that these languages have
+ # conditions, and some are multiple
+ && $conditions[0] ne 'az'
+ && $conditions[0] ne 'lt'
+
+ # And, we know about a single condition Final_Sigma, but
+ # nothing else.
+ && ($v_version gt v5.2.0
+ && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
+ {
+ $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
+ }
+ elsif ($conditions[0] ne 'Final_Sigma') {
- # field #4 is when this mapping is conditional. If any of these get
- # implemented, it would be by hard-coding in the casing functions in the
- # Perl core, not through tables. But if there is a new condition we don't
- # know about, output a warning. We know about all the conditions through
- # 6.0
- if ($fields[4] ne "") {
- my @conditions = split ' ', $fields[4];
- if ($conditions[0] ne 'tr' # We know that these languages have
- # conditions, and some are multiple
- && $conditions[0] ne 'az'
- && $conditions[0] ne 'lt'
-
- # And, we know about a single condition Final_Sigma, but
- # nothing else.
- && ($v_version gt v5.2.0
- && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
- {
- $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
+ # Don't print out a message for Final_Sigma, because we
+ # have hard-coded handling for it. (The standard could
+ # change what the rule should be, but that wouldn't
+ # show up here anyway.)
+
+ print "# SKIPPING Special Casing: $_\n"
+ if $verbosity >= $VERBOSE;
+ }
+ $_ = "";
+ return;
+ }
+ elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
+ $file->carp_bad_line('Extra fields');
+ $_ = "";
+ return;
}
- elsif ($conditions[0] ne 'Final_Sigma') {
- # Don't print out a message for Final_Sigma, because we have
- # hard-coded handling for it. (But the standard could change
- # what the rule should be, but it wouldn't show up here
- # anyway.
+ $_ = "$fields[0]; lc; $fields[1]";
+ $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
+ $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
- print "# SKIPPING Special Casing: $_\n"
- if $verbosity >= $VERBOSE;
+ # Copy any simple case change to the special tables constructed if
+ # being overridden by a multi-character case change.
+ if ($fields[1] ne $fields[0]
+ && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
+ {
+ $file->insert_adjusted_lines("$fields[0]; _slc; $value");
}
- $_ = "";
- return;
- }
- elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
- $file->carp_bad_line('Extra fields');
- $_ = "";
+ if ($fields[2] ne $fields[0]
+ && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
+ {
+ $file->insert_adjusted_lines("$fields[0]; _stc; $value");
+ }
+ if ($fields[3] ne $fields[0]
+ && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
+ {
+ $file->insert_adjusted_lines("$fields[0]; _suc; $value");
+ }
+
return;
}
-
- $_ = "$fields[0]; lc; $fields[1]";
- $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
- $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
-
- return;
}
sub filter_old_style_case_folding {
# it takes no part in anything we do.
my $to_output_simple;
- # XXX
- # These are experimental, perhaps will need these to pass to regcomp.c to
- # handle the cases where for example the Kelvin sign character folds to k,
- # and in regcomp, we need to know which of the characters can have a
- # non-latin1 char fold to it, so it doesn't do the optimizations it might
- # otherwise.
- my @latin1_singly_folded;
- my @latin1_folded;
-
sub setup_case_folding($) {
# Read in the case foldings in CaseFolding.txt. This handles both
# simple and full case folding.
}
# C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
- # I are all full foldings
- if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
- $_ = "$range; Case_Folding; $map";
+ # I are all full foldings; S is single-char. For S, there is always
+ # an F entry, so we must allow multiple values for the same code
+ # point. Fortunately this table doesn't need further manipulation
+ # which would preclude using multiple-values. The S is now included
+ # so that _swash_inversion_hash() is able to construct closures
+ # without having to worry about F mappings.
+ if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
+ $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
}
else {
$_ = "";
- if ($type ne 'S') {
- $file->carp_bad_line('Expecting C F I S or T in second field');
- return;
- }
+ $file->carp_bad_line('Expecting C F I S or T in second field');
}
# C and S are simple foldings, but simple case folding is not needed
$file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
}
- # XXX Experimental, see comment above
- if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
- my @folded = split ' ', $map;
- if (hex $folded[0] < 256 && @folded == 1) {
- push @latin1_singly_folded, hex $folded[0];
- }
- foreach my $folded (@folded) {
- push @latin1_folded, hex $folded if hex $folded < 256;
- }
- }
-
return;
}
- sub post_fold {
- # XXX Experimental, see comment above
- return;
-
- #local $to_trace = 1 if main::DEBUG;
- @latin1_singly_folded = uniques(@latin1_singly_folded);
- @latin1_folded = uniques(@latin1_folded);
- trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
- trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
- return;
- }
} # End case fold closure
sub filter_jamo_line {
$LC->initialize($gc->table('Ll') + $gc->table('Lu'));
# Lt not in release 1.
- $LC += $gc->table('Lt') if defined $gc->table('Lt');
+ if (defined $gc->table('Lt')) {
+ $LC += $gc->table('Lt');
+ $gc->table('Lt')->set_caseless_equivalent($LC);
+ }
}
$LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
+ $gc->table('Ll')->set_caseless_equivalent($LC);
+ $gc->table('Lu')->set_caseless_equivalent($LC);
+
my $Cs = $gc->table('Cs');
- if (defined $Cs) {
- $Cs->add_note('Mostly not usable in Perl.');
- $Cs->add_comment(join_lines(<<END
-Surrogates are used exclusively for I/O in UTF-16, and should not appear in
-Unicode text, and hence their use will generate (usually fatal) messages
-END
- ));
- }
# Folding information was introduced later into Unicode data. To get
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
$Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
+ $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
+ $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
+ $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
+
}
else {
$Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
Related => 1);
}
$Lower->add_alias('XPosixLower');
- $perl->add_match_table("PosixLower",
+ my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Lower & $ASCII,
);
my $Unicode_Upper = property_ref('Uppercase');
if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
+ $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
+ $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
+ $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
}
else {
$Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
Related => 1);
}
$Upper->add_alias('XPosixUpper');
- $perl->add_match_table("PosixUpper",
+ my $Posix_Upper = $perl->add_match_table("PosixUpper",
Description => "[A-Z]",
Initialize => $Upper & $ASCII,
);
# Earliest releases didn't have title case. Initialize it to empty if not
# otherwise present
my $Title = $perl->add_match_table('Title');
+ $Title->add_alias('Titlecase');
my $lt = $gc->table('Lt');
- if (defined $lt) {
- $Title->set_equivalent_to($lt, Related => 1);
- }
+
+ # Earlier versions of mktables had this related to $lt since they have
+ # identical code points, but their casefolds are not equivalent, and so
+ # now must be kept as separate entities.
+ $Title += $lt if defined $lt;
# If this Unicode version doesn't have Cased, set up our own. From
# Unicode 5.1: Definition D120: A character C is defined to be cased if
# and only if C has the Lowercase or Uppercase property or has a
# General_Category value of Titlecase_Letter.
- unless (defined property_ref('Cased')) {
+ my $Unicode_Cased = property_ref('Cased');
+ unless (defined $Unicode_Cased) {
my $cased = $perl->add_match_table('Cased',
Initialize => $Lower + $Upper + $Title,
Description => 'Uppercase or Lowercase or Titlecase',
);
+ $Unicode_Cased = $cased;
}
+ $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
# Similarly, set up our own Case_Ignorable property if this Unicode
# version doesn't have it. From Unicode 5.1: Definition D121: A character
$Alpha->add_description('Alphabetic');
}
$Alpha->add_alias('XPosixAlpha');
- $perl->add_match_table("PosixAlpha",
+ my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
Description => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
);
+ $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
+ $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
my $Alnum = $perl->add_match_table('Alnum',
Description => 'Alphabetic and (Decimal) Numeric',
File => 'CombiningClass',
Directory => File::Spec->curdir(),
);
- $perl_ccc->set_to_output_map(1);
+ $perl_ccc->set_to_output_map($EXTERNAL_MAP);
$perl_ccc->add_comment(join_lines(<<END
This mapping is for normalize.pm. It is currently identical to the Unicode
Canonical_Combining_Class property.
my $deprecated = ($table->status eq $DEPRECATED)
? $table->status_info
: "";
+ my $caseless_equivalent = $table->caseless_equivalent;
# And for each of the table's aliases... This inner loop eventually
# goes through all aliases in the UCD that we generate regex match
# files for
foreach my $alias ($table->aliases) {
- my $name = $alias->name;
+ my $standard = utf8_heavy_name($table, $alias);
# Generate an entry in either the loose or strict hashes, which
# will translate the property and alias names combination into the
# file where the table for them is stored.
- my $standard;
if ($alias->loose_match) {
- $standard = $property . standardize($alias->name);
if (exists $loose_to_file_of{$standard}) {
Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
}
}
}
else {
- $standard = lc ($property . $name);
if (exists $stricter_to_file_of{$standard}) {
Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
}
# will work. Also note that this assumes that such a
# number is matched strictly; so if that were to change,
# this would be wrong.
- if ((my $integer_name = $name)
+ if ((my $integer_name = $alias->name)
=~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
{
$stricter_to_file_of{$property . $integer_name}
if ($deprecated) {
$utf8::why_deprecated{$sub_filename} = $deprecated;
}
+
+ # And a substitute table, if any, for case-insensitive matching
+ if ($caseless_equivalent != 0) {
+ $caseless_equivalent_to{$standard} = $caseless_equivalent;
+ }
}
}
my $string_count = clarify_number($count);
my $status = $input_table->status;
my $status_info = $input_table->status_info;
+ my $caseless_equivalent = $input_table->caseless_equivalent;
my $entry_for_first_table; # The entry for the first table output.
# Almost certainly, it is the parent.
# expression, but with only one of 'Single', 'Short' if there
# are both items.
if ($short_name || $single_form || $table->conflicting) {
- $parenthesized .= '(';
$parenthesized .= "Short: $short_name" if $short_name;
if ($short_name && $single_form) {
$parenthesized .= ', ';
}
}
+ if ($caseless_equivalent != 0) {
+ $parenthesized .= '; ' if $parenthesized ne "";
+ $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
+ }
+
# Warn if this property isn't the same as one that a
# semi-casual user might expect. The other components of this
# to go on every entry.
my $conflicting = join " NOR ", $table->conflicting;
if ($conflicting) {
- $parenthesized .= '(' if ! $parenthesized;
- $parenthesized .= '; ' if $parenthesized ne '(';
+ $parenthesized .= '; ' if $parenthesized ne "";
$parenthesized .= "NOT $conflicting";
}
- $parenthesized .= ')' if $parenthesized;
- push @info, $parenthesized if $parenthesized;
+ push @info, "($parenthesized)" if $parenthesized;
if ($table_property != $perl && $table->perl_extension) {
push @info, '(Perl extension)';
# directory.
my @path = $property->file_path;
next if $path[0] ne $map_directory;
+
+ # Don't mention map tables that are for internal-use only
+ next if $property->to_output_map == $INTERNAL_MAP;
+
shift @path; # Remove the standard name
my $file = join '/', @path; # In case is in sub directory
# Generate a list of the formats that can appear in the map tables.
my @map_table_formats;
foreach my $format (sort keys %map_table_formats) {
- push @map_table_formats, " $format $map_table_formats{$format}\n";
+ push @map_table_formats, " $format $map_table_formats{$format}\n";
}
+ local $" = "";
+
# Everything is ready to assemble.
my @OUT = << "END";
=begin comment
and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
Unicode properties, refer to the Unicode standard. A good starting place is
L<$unicode_reference_url>. More information on the Perl extensions is in
-L<perlrecharclass>.
+L<perlunicode/Other Properties>.
Note that you can define your own properties; see
L<perlunicode/"User-Defined Character Properties">.
=back
-Some properties are considered obsolete, but still available. There are
-several varieties of obsolescence:
+Some properties are considered obsolete by Unicode, but still available.
+There are several varieties of obsolescence:
=over 4
=item Deprecated
An obsolete property may be deprecated, perhaps because its original intent
-has been replaced by another property or because its specification was somehow
-defective. This means that its use is strongly
+has been replaced by another property, or because its specification was
+somehow defective. This means that its use is strongly
discouraged, so much so that a warning will be issued if used, unless the
regular expression is in the scope of a C<S<no warnings 'deprecated'>>
statement. $A_bold_deprecated flags each such entry in the table, and
the property. For emphasis, those properties that match no code points at all
are listed as well in a separate section following the table.
+Most properties match the same code points regardless of whether C<"/i">
+case-insensitive matching is specified or not. But a few properties are
+affected. These are shown with the notation
+
+ (/i= other_property)
+
+in the second column. Under case-insensitive matching they match the
+same code points as the property "other_property".
+
There is no description given for most non-Perl defined properties (See
$unicode_reference_url for that).
A few properties are accessible in Perl via various function calls only.
These are:
+
Lowercase_Mapping lc() and lcfirst()
Titlecase_Mapping ucfirst()
Uppercase_Mapping uc()
=back
-An installation can choose to allow any of these to be matched by changing the
+An installation can choose to allow any of these to be matched by downloading
+the Unicode database from L<http://www.unicode.org/Public/> to
+C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
controlling lists contained in the program
-C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
+C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
=head1 Files in the I<To> directory (for serious hackers only)
All Unicode properties are really mappings (in the mathematical sense) from
code points to their respective values. As part of its build process,
Perl constructs tables containing these mappings for all properties that it
-deals with. But only a few of these are written out into files.
+deals with. Some, but not all, of these are written out into files.
Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
(%Config is available from the Config module).
-Those ones written are ones needed by Perl internally during execution, or for
-which there is some demand, and those for which there is no access through the
-Perl core. Generally, properties that can be used in regular expression
-matching do not have their map tables written, like Script. Nor are the
-simplistic properties that have a better, more complete version, such as
-Simple_Uppercase_Mapping (Uppercase_Mapping is written instead).
-
-None of the properties in the I<To> directory are currently directly
-accessible through the Perl core, although some may be accessed indirectly.
-For example, the uc() function implements the Uppercase_Mapping property and
-uses the F<Upper.pl> file found in this directory.
-
-The available files in the current installation, with their properties (short
-names in parentheses), and any flags or comments about them, are:
+Perl reserves the right to change the format and even the existence of any of
+those files without notice, except the ones that were in existence prior to
+release 5.13. If those change, a deprecation cycle will be done first. These
+are:
@map_tables_actually_output
-An installation can choose to change which files are generated by changing the
-controlling lists contained in the program
-C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
-
-Each of these files defines two hash entries to help reading programs decipher
-it. One of them looks like this:
+Each of the files in this directory defines two hash entries to help reading
+programs decipher it. One of them looks like this:
\$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
also for backwards compatibility.) The hash entry gives the format of the
mapping fields of the table, currently one of the following:
- @map_table_formats
+@map_table_formats
This format applies only to the entries in the main body of the table.
Entries defined in hashes or ones that are missing from the list can have a
push @heavy, <<END;
);
+# A few properties have different behavior under /i matching. This maps
+# those to substitute files to use under /i.
+\%utf8::caseless_equivalent = (
+END
+
+
+ # We set the key to the file when we associated files with tables, but we
+ # couldn't do the same for the value then, as we might not have the file
+ # for the alternate table figured out at that time.
+ foreach my $cased (keys %caseless_equivalent_to) {
+ my @path = $caseless_equivalent_to{$cased}->file_path;
+ my $path = join '/', @path[1, -1];
+ $path =~ s/\.pl//;
+ $utf8::caseless_equivalent_to{$cased} = $path;
+ }
+ push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
+ push @heavy, <<END;
+);
+
1;
END
: undef,
\&filter_case_folding_line
],
- Post_Handler => \&post_fold,
),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file