From b5c66e73f5dfdc3424c469a7407d517635040a9c Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Wed, 19 Feb 2020 14:55:50 -0700 Subject: [PATCH] mktables: Handle versioning of non-UCD files Unicode has lately been asking implementations to support non-Unicode Character Database properties. Files for these contain a different versioning syntax than the UCD files. Previously I was hand-editing those files before committing so that they used a consistent style. But that is tedious, and I decided to invest a little time to be able to handle all the current versioning syntaxes automatically, to save having to manually update in the future. This was complicated by the fact that some Unicode non-UCD files have byte order marks (BOMs) on many comment lines. I submitted a trouble report to them. --- charclass_invlists.h | 2 +- lib/unicore/mktables | 77 +++++++++++++++++++++++++++++++++++---------- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- uni_keywords.h | 2 +- 5 files changed, 65 insertions(+), 20 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index b9a17d3..f010188 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -419812,7 +419812,7 @@ static const U8 WB_table[24][24] = { * 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt - * 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables + * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl diff 
--git a/lib/unicore/mktables b/lib/unicore/mktables index 52c680f..1820ad3 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -2375,6 +2375,11 @@ sub trace { return main::trace(@_); } # giving the first release without this file. main::set_access('withdrawn', \%withdrawn, 'c'); + my %ucd; + # Some files are not actually part of the Unicode Character Database. + # These typically have a different way of indicating their version + main::set_access('ucd', \%ucd, 'c'); + my %in_this_release; # Calculated value from %first_released and %withdrawn. Are we compiling # a Unicode release which includes this file? @@ -2404,6 +2409,7 @@ sub trace { return main::trace(@_); } $missings{$addr} = [ ]; $early{$addr} = [ ]; $optional{$addr} = [ ]; + $ucd{$addr} = 1; # Two positional parameters. return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2; @@ -2839,6 +2845,8 @@ END && ! $early{$addr}[0] && lc($file) ne 'unicodedata.txt') { + my $this_version; + if ($file !~ /^Unihan/i) { # The non-Unihan files started getting version numbers in @@ -2849,17 +2857,39 @@ END # numbers are correct. if ($v_version ge v4.0.1) { $_ = <$file_handle>; # The version number is in the - # very first line - if ($_ !~ / - $string_version \. /x) { - chomp; + # very first line if it is a + # UCD file; otherwise, it + # might be + goto valid_version if $_ =~ / - $string_version \. /x; + chomp; + if ($ucd{$addr}) { $_ =~ s/^#\s*//; # 4.0.1 had some valid files that weren't updated. - if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) { - die Carp::my_carp("File '$file' is version " - . "'$_'. It should be " - . "version $string_version"); + goto valid_version + if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/; + $this_version = $_; + goto wrong_version; + } + else { + my $BOM = "\x{FEFF}"; + utf8::encode($BOM); + my $BOM_re = qr/ ^ (?:$BOM)? /x; + + while ($_ =~ s/$BOM_re//) { # BOM; seems to be on + # many lines in some files!! 
+ $_ = <$file_handle>; + chomp; + if ($_ =~ /^# Version: (.*)/) { + $this_version = $1; + goto valid_version + if $this_version eq $string_version; + goto valid_version + if "$this_version.0" eq $string_version; + goto wrong_version; + } } + goto no_version; } } } @@ -2869,23 +2899,30 @@ END # 6.0. The version is somewhere in the first comment # block while (<$file_handle>) { - if ($_ !~ /^#/) { - Carp::my_carp_bug("Could not find the expected " - . "version info in file '$file'"); - last; - } + goto no_version if $_ !~ /^#/; chomp; $_ =~ s/^#\s*//; next if $_ !~ / version: /x; - last if $_ =~ /$string_version/; - die Carp::my_carp("File '$file' is version " - . "'$_'. It should be " - . "version $string_version"); + goto valid_version if $_ =~ /$string_version/; + goto wrong_version; } + goto no_version; + } + else { # Old Unihan; have to assume is valid + goto valid_version; } + + wrong_version: + die Carp::my_carp("File '$file' is version " + . "'$this_version'. It should be " + . "version $string_version"); + no_version: + Carp::my_carp_bug("Could not find the expected " + . "version info in file '$file'"); } } + valid_version: print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS; # Call any special handler for before the file. 
@@ -20167,18 +20204,26 @@ my @input_file_objects = ( Pre_Handler => \&setup_emojidata, Has_Missings_Defaults => $NOT_IGNORED, Each_Line_Handler => \&filter_emojidata_line, + UCD => 0, ), Input_file->new("$EMOJI/emoji.txt", v13.0.0, Has_Missings_Defaults => $NOT_IGNORED, + UCD => 0, + ), + Input_file->new("$EMOJI/ReadMe.txt", v13.0.0, + Skip => $Documentation, + UCD => 0, ), Input_file->new('IdStatus.txt', v13.0.0, Pre_Handler => \&setup_IdStatus, Property => 'Identifier_Status', + UCD => 0, ), Input_file->new('IdType.txt', v13.0.0, Pre_Handler => \&setup_IdType, Each_Line_Handler => \&filter_IdType_line, Property => 'Identifier_Type', + UCD => 0, ), ); diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index e222178..7fd25b2 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1295,7 +1295,7 @@ # 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt # 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt # 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt -# 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables +# 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables # 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version # 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl # 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index f315cb4..f8e9f0a 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -2247,7 +2247,7 @@ * 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt * 
5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt - * 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables + * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index f754c9d..be271a1 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7540,7 +7540,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * 0fea35394151afefbb4121b6380db1b480be6f9bafb4eba3382dc292dcf68526 lib/unicore/extracted/DLineBreak.txt * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt - * 45e23c57b8ddcfed895b1b7b8869e79f2336b9c3b2432b55f051b426ab5a15c6 lib/unicore/mktables + * 93f508a690aa8949f213d50b573710f0b4a4e843c17283938035ecf19e0220e2 lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl -- 1.8.3.1