X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/dc85bd38b5243856f77df4583119e0e85bd741ac..c85f591ab16d6f032ffc8369fe6a6b4eea7e77df:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index b4f75af..9b2d25a 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -17,7 +17,7 @@ # changed 0+$self to pack 'J', $self.) my $start_time; -BEGIN { # Get the time the script started running; do it at compiliation to +BEGIN { # Get the time the script started running; do it at compilation to # get it as close as possible $start_time= time; } @@ -478,7 +478,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # # Here are some observations about some of the issues in early versions: # -# The number of code points in \p{alpha} halve in 2.1.9. It turns out that +# The number of code points in \p{alpha} halved in 2.1.9. It turns out that # the reason is that the CJK block starting at 4E00 was removed from PropList, # and was not put back in until 3.1.0 # @@ -1136,6 +1136,7 @@ my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in my $UNCONDITIONALLY = 2; # Replace without conditions. my $MULTIPLE = 4; # Don't replace, but add a duplicate record if # already there +my $CROAK = 5; # Die with an error if is already there # Flags to give property statuses. The phrases are to remind maintainers that # if the flag is changed, the indefinite article referring to it in the @@ -1295,7 +1296,7 @@ my @printable; # boolean: And are those characters printable? my @annotate_char_type; # Contains a type of those characters, specifically # for the purposes of annotation. my $annotate_ranges; # A map of ranges of code points that have the same - # name for the purposes of annoation. They map to the + # name for the purposes of annotation. They map to the # upper edge of the range, so that the end point can # be immediately found. This is used to skip ahead to # the end of a range, and avoid processing each @@ -1382,10 +1383,11 @@ sub populate_char_info ($) { $end = min($block->containing_range($i)->end, $unassigned_sans_noncharacters-> containing_range($i)-> end); - } else { - my_carp_bug("Can't figure out how to annotate" - . sprintf("U+%04X", $i) - . "Proceeding anyway."); + } + else { + Carp::my_carp_bug("Can't figure out how to annotate " + . sprintf("U+%04X", $i) + . ". Proceeding anyway."); $viacode[$i] = 'UNKNOWN'; $annotate_char_type[$i] = $UNKNOWN_TYPE; $printable[$i] = 0; @@ -3522,6 +3524,13 @@ sub trace { return main::trace(@_); } if ($clean_insert) { if ($r->[$j]->standard_form ne $standard_form) { $clean_insert = 0; + if ($replace == $CROAK) { + main::croak("The range to add " + . sprintf("%04X", $start) + . '-' + . sprintf("%04X", $end) + . " with value '$value' overlaps an existing range $r->[$j]"); + } } else { @@ -3814,7 +3823,7 @@ sub trace { return main::trace(@_); } trace "i =[", $i, "]", $r->[$i]; trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1; trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2; - trace "removed @return"; + trace "removed ", @return if @return; } # An actual deletion could have changed the maximum in the list. @@ -4213,8 +4222,6 @@ sub trace { return main::trace(@_); } # the character very frequently used. return $try_hard if $code == 0x0000; - return 0 if $try_hard; # XXX Temporary until fix utf8.c - # shun non-character code points. return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF; return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF @@ -4490,7 +4497,7 @@ sub trace { return main::trace(@_); } # not, is normal. The lists are prioritized so the most serious # ones are checked first if (exists $why_suppressed{$complete_name} - # Don't suppress if overriden + # Don't suppress if overridden && ! grep { $_ eq $complete_name{$addr} } @output_mapped_properties) { @@ -5814,7 +5821,7 @@ END # The pack() below can't cope with surrogates. if ($code_point >= 0xD800 && $code_point <= 0xDFFF) { - Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created"); + Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created"); next; } @@ -6554,7 +6561,7 @@ sub trace { return main::trace(@_); } # not quite so many. # If they are related, one must be a perl extension. This is because # we can't guarantee that Unicode won't change one or the other in a - # later release even if they are idential now. + # later release even if they are identical now. my $self = shift; my $other = shift; @@ -7061,7 +7068,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } # each of them is stored in %alias_to_property_of as they are defined. # But it's possible that this subroutine will be called with some # variant, so if the initial lookup fails, it is repeated with the - # standarized form of the input name. If found, besides returning the + # standardized form of the input name. If found, besides returning the # result, the input name is added to the list so future calls won't # have to do the conversion again. @@ -7215,7 +7222,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } . " argument to '-='. Subtraction ignored."); return $self; } - elsif ($reversed) { # Shouldnt happen in a -=, but just in case + elsif ($reversed) { # Shouldn't happen in a -=, but just in case Carp::my_carp_bug("Can't cope with a " . __PACKAGE__ . " being the first parameter in a '-='. Subtraction ignored."); @@ -7638,7 +7645,7 @@ sub join_lines($) { # A blank separates the joined lines except if there is a break; an extra # blank is inserted after a period ending a line. - # Intialize the return with the first line. + # Initialize the return with the first line. my ($return, @lines) = split "\n", shift; # If the first line is null, it was an empty line, add the \n back in @@ -7940,7 +7947,7 @@ sub Standardize($) { $name =~ s/^\s+//g; $name =~ s/\s+$//g; - # Convert interior white space and hypens into underscores. + # Convert interior white space and hyphens into underscores. $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg; # Capitalize the letter following an underscore, and convert a sequence of @@ -7968,6 +7975,30 @@ sub standardize ($) { return lc $name; } +sub utf8_heavy_name ($$) { + # Returns the name that utf8_heavy.pl will use to find a table. XXX + # perhaps this function should be placed somewhere, like Heavy.pl so that + # utf8_heavy can use it directly without duplicating code that can get + # out-of sync. + + my $table = shift; + my $alias = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + my $property = $table->property; + $property = ($property == $perl) + ? "" # 'perl' is never explicitly stated + : standardize($property->name) . '='; + if ($alias->loose_match) { + return $property . standardize($alias->name); + } + else { + return lc ($property . $alias->name); + } + + return; +} + { # Closure my $indent_increment = " " x 2; @@ -8283,9 +8314,9 @@ sub finish_property_setup { my $fold = property_ref('Case_Folding'); $fold->set_file('Fold') if defined $fold; - # utf8.c can't currently cope with non range-size-1 for these, and even if - # it were changed to do so, someone else may be using them, expecting the - # old style + # utf8.c has a different meaning for non range-size-1 for map properties + # that this program doesn't currently handle; and even if it were changed + # to do so, some other code may be using them expecting range size 1. foreach my $property (qw { Case_Folding Lowercase_Mapping @@ -9007,7 +9038,7 @@ sub output_perl_charnames_line ($$) { # # meaning the codepoints in the range all have the value 'map' under # 'property'. - # Beginning and trailing white space in each field are not signficant. + # Beginning and trailing white space in each field are not significant. # Note there is not a trailing semi-colon in the above. A trailing # semi-colon means the map is a null-string. An omitted map, as # opposed to a null-string, is assumed to be 'Y', based on Unicode @@ -9027,8 +9058,8 @@ sub output_perl_charnames_line ($$) { # file, in any order, interspersed in any way. The first time a # property is seen, it gets information about that property and # caches it for quick retrieval later. It also normalizes the maps - # so that only one of many synonym is stored. The Unicode input files - # do use some multiple synonyms. + # so that only one of many synonyms is stored. The Unicode input + # files do use some multiple synonyms. my $file = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -9337,19 +9368,17 @@ END # If the map begins with a special command to us (enclosed in # delimiters), extract the command(s). - if (substr($map, 0, 1) eq $CMD_DELIM) { - while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { - my $command = $1; - if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { - $replace = $1; - } - elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { - $map_type = $1; - } - else { - $file->carp_bad_line("Unknown command line: '$1'"); - next LINE; - } + while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) { + my $command = $1; + if ($command =~ / ^ $REPLACE_CMD= (.*) /x) { + $replace = $1; + } + elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) { + $map_type = $1; + } + else { + $file->carp_bad_line("Unknown command line: '$1'"); + next LINE; } } } @@ -9541,7 +9570,7 @@ END # the code point and name on each line. This was actually the hardest # thing to design around. The code points in those ranges may actually # have real maps not given by these two lines. These maps will either - # be algorthimically determinable, or in the extracted files furnished + # be algorithmically determinable, or in the extracted files furnished # with the UCD. In the event of conflicts between these extracted files, # and this one, Unicode says that this one prevails. But it shouldn't # prevail for conflicts that occur in these ranges. The data from the @@ -10853,7 +10882,7 @@ sub filter_blocks_lines { # one. # Titlecase duplicates UnicodeData.txt: gc=lt # Unassigned Code Value duplicates UnicodeData.txt: gc=cc - # Zero-width never made into offical property; + # Zero-width never made into official property; # subset of gc=cf # Most of the properties have the same names in this file as in later # versions, but a couple do not. @@ -10994,7 +11023,8 @@ sub finish_Unicode() { # Add mappings to the property for each code point in the list foreach my $range ($list->ranges) { - $property->add_map($range->start, $range->end, $default); + $property->add_map($range->start, $range->end, $default, + Replace => $CROAK); } } @@ -11021,11 +11051,12 @@ sub finish_Unicode() { } # Add any remaining code points to the mapping, using the default for - # missing code points + # missing code points. if (defined (my $default_map = $property->default_map)) { - foreach my $range ($property->inverse_list->ranges) { - $property->add_map($range->start, $range->end, $default_map); - } + + # This fills in any missing values with the default. + $property->add_map(0, $LAST_UNICODE_CODEPOINT, + $default_map, Replace => $NO); # Make sure there is a match table for the default if (! defined $property->table($default_map)) { @@ -11123,14 +11154,6 @@ END $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]'); my $Cs = $gc->table('Cs'); - if (defined $Cs) { - $Cs->add_note('Mostly not usable in Perl.'); - $Cs->add_comment(join_lines(<add_description($description_start . $first_age->name); - # To construct the accumlated values, for each of the age tables + # To construct the accumulated values, for each of the age tables # starting with the 2nd earliest, merge the earliest with it, to get # all those code points existing in the 2nd earliest. Repeat merging # the new 2nd earliest with the 3rd earliest to get all those existing @@ -12068,7 +12091,7 @@ END sub register_file_for_name($$$) { # Given info about a table and a datafile that it should be associated - # with, register that assocation + # with, register that association my $table = shift; my $directory_ref = shift; # Array of the directory path for the file @@ -12118,14 +12141,12 @@ sub register_file_for_name($$$) { # goes through all aliases in the UCD that we generate regex match # files for foreach my $alias ($table->aliases) { - my $name = $alias->name; + my $standard = utf8_heavy_name($table, $alias); # Generate an entry in either the loose or strict hashes, which # will translate the property and alias names combination into the # file where the table for them is stored. - my $standard; if ($alias->loose_match) { - $standard = $property . standardize($alias->name); if (exists $loose_to_file_of{$standard}) { Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'."); } @@ -12134,7 +12155,6 @@ sub register_file_for_name($$$) { } } else { - $standard = lc ($property . $name); if (exists $stricter_to_file_of{$standard}) { Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'."); } @@ -12147,7 +12167,7 @@ sub register_file_for_name($$$) { # will work. Also note that this assumes that such a # number is matched strictly; so if that were to change, # this would be wrong. - if ((my $integer_name = $name) + if ((my $integer_name = $alias->name) =~ s/^ ( -? \d+ ) \.0+ $ /$1/x) { $stricter_to_file_of{$property . $integer_name} @@ -12572,7 +12592,6 @@ sub make_table_pod_entries($) { # expression, but with only one of 'Single', 'Short' if there # are both items. if ($short_name || $single_form || $table->conflicting) { - $parenthesized .= '('; $parenthesized .= "Short: $short_name" if $short_name; if ($short_name && $single_form) { $parenthesized .= ', '; @@ -12592,18 +12611,16 @@ sub make_table_pod_entries($) { # to go on every entry. my $conflicting = join " NOR ", $table->conflicting; if ($conflicting) { - $parenthesized .= '(' if ! $parenthesized; - $parenthesized .= '; ' if $parenthesized ne '('; + $parenthesized .= '; ' if $parenthesized ne ""; $parenthesized .= "NOT $conflicting"; } - $parenthesized .= ')' if $parenthesized; - push @info, $parenthesized if $parenthesized; + push @info, "($parenthesized)" if $parenthesized; if ($table_property != $perl && $table->perl_extension) { push @info, '(Perl extension)'; } - push @info, "($string_count)" if $output_range_counts; + push @info, "($string_count)"; # Now, we have both the entry and info so add them to the # list of all the properties. @@ -12966,7 +12983,7 @@ adjacent to (but within) the braces and the colon or equal sign. =back Some properties are considered obsolete, but still available. There are -several varieties of obsolesence: +several varieties of obsolescence: =over 4 @@ -13010,7 +13027,7 @@ flags each such entry in the table. @block_warning The table below has two columns. The left column contains the \\p{} -constructs to look up, possibly preceeded by the flags mentioned above; and +constructs to look up, possibly preceded by the flags mentioned above; and the right column contains information about them, like a description, or synonyms. It shows both the single and compound forms for each property that has them. If the left column is a short name for a property, the right column @@ -13563,7 +13580,7 @@ sub write_all_tables() { $filename = $table->file; } - # Use specified filename if avaliable, or default to property's + # Use specified filename if available, or default to property's # shortest name. We need an 8.3 safe filename (which means "an 8 # safe" filename, since after the dot is only 'pl', which is < 3) # The 2nd parameter is if the filename shouldn't be changed, and @@ -14531,7 +14548,7 @@ if ( $file_list and $make_list ) { # # - First section is input files # ($0 itself is not listed but is automatically considered an input) -# - Section seperator is /^=+\$/ +# - Section separator is /^=+\$/ # - Second section is a list of output files. # - Lines matching /^\\s*#/ are treated as comments # which along with blank lines are ignored.