X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/ae92a9ae5fdd29415eca30d6749c94b11c488f41..2e2778b218a143b698b7844d136cb7df300c9ab3:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 2638c24..58f3678 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -608,6 +608,7 @@ sub uniques { $0 = File::Spec->canonpath($0); my $make_test_script = 0; # ? Should we output a test script +my $make_norm_test_script = 0; # ? Should we output a normalization test script my $write_unchanged_files = 0; # ? Should we update the output files even if # we don't think they have changed my $use_directory = ""; # ? Should we chdir somewhere. @@ -668,6 +669,10 @@ while (@ARGV) { { $make_test_script = 1; } + elsif ($arg eq '-makenormtest') + { + $make_norm_test_script = 1; + } elsif ($arg eq '-makelist') { $make_list = 1; } @@ -716,8 +721,8 @@ usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ] -makelist : Rewrite the file list $file_list based on current setup -annotate : Output an annotation for each character in the table files; useful for debugging mktables, looking at diffs; but is slow, - memory intensive; resulting tables are usable but slow and - very large. + memory intensive; resulting tables are usable but are slow and + very large (and currently fail the Unicode::UCD.t tests). -check A B : Executes $0 only if A and B are the same END } @@ -768,6 +773,8 @@ push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana' if $v_version ge v4.1.0; push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana' if $v_version ge v6.0.0; +push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend' + if $v_version ge v6.1.0; # The lists below are hashes, so the key is the item in the list, and the # value is the reason why it is in the list. This makes generation of @@ -831,6 +838,7 @@ if ($v_version ge v5.2.0) { # Enum values for to_output_map() method in the Map_Table package. my $EXTERNAL_MAP = 1; my $INTERNAL_MAP = 2; +my $OUTPUT_ADJUSTED = 3; # To override computed values for writing the map tables for these properties. # The default for enum map tables is to write them out, so that the Unicode @@ -897,11 +905,14 @@ my %why_obsolete; # Documentation only # existence is not noted in the comment. 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()', + 'Indic_Matra_Category' => "Provisional", + 'Indic_Syllabic_Category' => "Provisional", + # Don't suppress ISO_Comment, as otherwise special handling is needed # to differentiate between it and gc=c, which can be written as 'isc', # which is the same characters as ISO_Comment's short name. - 'Name' => "Accessible via 'use charnames;' or Unicode::UCD::prop_invmap()", + 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()", 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()", 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", @@ -911,14 +922,28 @@ my %why_obsolete; # Documentation only FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful', ); - # The following are suppressed because they were made contributory or - # deprecated by Unicode before Perl ever thought about supporting them. - foreach my $property ('Jamo_Short_Name', - 'Grapheme_Link', - 'Expands_On_NFC', - 'Expands_On_NFD', - 'Expands_On_NFKC', - 'Expands_On_NFKD' + foreach my $property ( + + # The following are suppressed because they were made contributory + # or deprecated by Unicode before Perl ever thought about + # supporting them. + 'Jamo_Short_Name', + 'Grapheme_Link', + 'Expands_On_NFC', + 'Expands_On_NFD', + 'Expands_On_NFKC', + 'Expands_On_NFKD', + + # The following are suppressed because they have been marked + # as deprecated for a sufficient amount of time + 'Other_Alphabetic', + 'Other_Default_Ignorable_Code_Point', + 'Other_Grapheme_Extend', + 'Other_ID_Continue', + 'Other_ID_Start', + 'Other_Lowercase', + 'Other_Math', + 'Other_Uppercase', ) { $why_suppressed{$property} = $why_deprecated{$property}; } @@ -1066,14 +1091,14 @@ my %ignored_files = ( 'ReadMe.txt' => 'Documentation', 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L', 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values', - 'IndicMatraCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts', - 'IndicSyllabicCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts', 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests', 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests', 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests', 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests', ); +my %skipped_files; # List of files that we skip + ### End of externally interesting definitions, except for @input_file_objects my $HEADER=<<"EOF"; @@ -1169,9 +1194,11 @@ my $YES = 1; my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in # the comments at the subroutine definition. my $UNCONDITIONALLY = 2; # Replace without conditions. -my $MULTIPLE = 4; # Don't replace, but add a duplicate record if +my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if + # already there +my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if # already there -my $CROAK = 5; # Die with an error if is already there +my $CROAK = 6; # Die with an error if is already there # Flags to give property statuses. The phrases are to remind maintainers that # if the flag is changed, the indefinite article referring to it in the @@ -1223,6 +1250,7 @@ my $INTEGER_FORMAT = 'i'; my $HEX_FORMAT = 'x'; my $RATIONAL_FORMAT = 'r'; my $STRING_FORMAT = 's'; +my $ADJUST_FORMAT = 'a'; my $DECOMP_STRING_FORMAT = 'c'; my $STRING_WHITE_SPACE_LIST = 'sw'; @@ -1234,6 +1262,7 @@ my %map_table_formats = ( $HEX_FORMAT => 'non-negative hex whole number; a code point', $RATIONAL_FORMAT => 'rational: an integer or a fraction', $STRING_FORMAT => 'string', + $ADJUST_FORMAT => 'some entries need adjustment', $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping', $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators' ); @@ -1321,6 +1350,9 @@ my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes # anonymous hash. my @code_points_ending_in_code_point; +# To hold Unicode's normalization test suite +my @normalization_tests; + # Boolean: does this Unicode version have the hangul syllables, and are we # writing out a table for them? my $has_hangul_syllables = 0; @@ -2012,7 +2044,7 @@ sub trace { return main::trace(@_); } # contrast to the non_skip element, which is supposed to be used very # temporarily for debugging. Sets 'optional' to 1. Also, files that we # pretty much will never look at can be placed in the global - # %ignored_files instead. Ones used here will be added to that list. + # %ignored_files instead. Ones used here will be added to %skipped files main::set_access('skip', \%skip, 'c'); my %each_line_handler; @@ -2139,7 +2171,7 @@ sub trace { return main::trace(@_); } # including its reason if ($skip{$addr}) { $optional{$addr} = 1; - $ignored_files{$file{$addr}} = $skip{$addr} + $skipped_files{$file{$addr}} = $skip{$addr} } return $self; @@ -2150,6 +2182,7 @@ sub trace { return main::trace(@_); } fallback => 0, qw("") => "_operator_stringify", "." => \&main::_operator_dot, + ".=" => \&main::_operator_dot_equal, ; sub _operator_stringify { @@ -2227,7 +2260,7 @@ sub trace { return main::trace(@_); } # its name if ($seen_non_extracted_non_age) { if ($file =~ /$EXTRACTED/i) { - Carp::my_carp_bug(join_lines(<rel2abs($file); - my $expecting = delete $potential_files{$fkey}; - $expecting = delete $potential_files{lc($fkey)} unless defined $expecting; + my $expecting = delete $potential_files{lc($fkey)}; + Carp::my_carp("Was not expecting '$file'.") if ! $expecting && ! defined $handle{$addr}; @@ -2415,7 +2448,8 @@ END || @defaults > 2 || ($default =~ /^$/i - && $default !~ /^$/i)) + && $default !~ /^$/i + && $default !~ /^