From 5f270787832458b6f6e80d68a820cd42a983fbb2 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Mon, 24 Feb 2020 13:35:01 -0700 Subject: [PATCH] mktables: Calculate legal chars in algorithmic names Many ideographic character names are of the form 'prefix-code_point'. For these, we know that the legal characters are just the ones in the prefix, the dash, and uppercase hex digits. For each series of these types of names, this commit figures out what characters are legal in that series, and adds that info to the hash describing the series. This will be used in a later commit to rule out entire series when matching under some circumstances, without having to try any individual matches within it. --- charclass_invlists.h | 2 +- lib/unicore/mktables | 16 +++++++++++++++- lib/unicore/uni_keywords.pl | 2 +- regcharclass.h | 2 +- uni_keywords.h | 2 +- 5 files changed, 19 insertions(+), 5 deletions(-) diff --git a/charclass_invlists.h b/charclass_invlists.h index f615586..9768196 100644 --- a/charclass_invlists.h +++ b/charclass_invlists.h @@ -419864,7 +419864,7 @@ static const U8 WB_table[23][23] = { * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt - * 91977d5f417fa9252fe9bfebeb61bb28bda9273b630a0e333b6c7b94c8445bca lib/unicore/mktables + * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 2126268..d1fb8e4 100644 --- a/lib/unicore/mktables +++
b/lib/unicore/mktables @@ -7505,9 +7505,23 @@ END push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, $high; + # Calculate the set of legal characters in names of this + # series. It includes every character in the name prefix. + my %legal; + $legal{$_} = 1 for split //, $map; + + # Plus the hex code point chars, blank, and minus. Also \n + # can show up as being required due to anchoring + for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") { + $legal{$i} = 1; + } + my $legal = join "", sort { $a cmp $b } keys %legal; + + # The legal chars can be used in match optimizations push @code_points_ending_in_code_point, { low => $low, high => $high, - name => $map + name => $map, + legal => $legal, }; } } diff --git a/lib/unicore/uni_keywords.pl b/lib/unicore/uni_keywords.pl index bcfe12e..3a21ccf 100644 --- a/lib/unicore/uni_keywords.pl +++ b/lib/unicore/uni_keywords.pl @@ -1295,7 +1295,7 @@ # baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt # 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt # 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt -# 91977d5f417fa9252fe9bfebeb61bb28bda9273b630a0e333b6c7b94c8445bca lib/unicore/mktables +# 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables # 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version # 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl # 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl diff --git a/regcharclass.h b/regcharclass.h index 8dd2bf7..38bce8d 100644 --- a/regcharclass.h +++ b/regcharclass.h @@ -2247,7 +2247,7 @@ * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d 
lib/unicore/extracted/DNumType.txt * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt - * 91977d5f417fa9252fe9bfebeb61bb28bda9273b630a0e333b6c7b94c8445bca lib/unicore/mktables + * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * f9a393e7add8c7c2728356473ce5b52246d51295b2da0c48fb6f0aa21799e2bb regen/regcharclass.pl diff --git a/uni_keywords.h b/uni_keywords.h index 7264512..f6c6408 100644 --- a/uni_keywords.h +++ b/uni_keywords.h @@ -7537,7 +7537,7 @@ MPH_VALt match_uniprop( const unsigned char * const key, const U16 key_len ) { * baba9dfc133e3cb770a89aaf0973b1341fa61c2da6c176baf6428898b3b568d8 lib/unicore/extracted/DLineBreak.txt * 6d4a8c945dd7db83ed617cbb7d937de7f4ecf016ff22970d846e996a7c9a2a5d lib/unicore/extracted/DNumType.txt * 5b7c14380d5cceeaffcfbc18db1ed936391d2af2d51f5a41f1a17b692c77e59b lib/unicore/extracted/DNumValues.txt - * 91977d5f417fa9252fe9bfebeb61bb28bda9273b630a0e333b6c7b94c8445bca lib/unicore/mktables + * 3e37ae63c1a4f3084bba787a2c6ca020dad9d0d56e115c118fe8c68ac290ea7a lib/unicore/mktables * 50b85a67451145545a65cea370dab8d3444fbfe07e9c34cef560c5b7da9d3eef lib/unicore/version * 2680b9254eb236c5c090f11b149605043e8c8433661b96efc4a42fb4709342a5 regen/charset_translations.pl * 6bbad21de0848e0236b02f34f5fa0edd3cdae9ba8173cc9469a5513936b9e728 regen/mk_PL_charclass.pl -- 1.8.3.1