# With this release, it is automatically handled if the Unihan db is
# downloaded
-push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
+push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version lt v5.2.0;
# There are several types of obsolete properties defined by Unicode. These
# must be hand-edited for every new Unicode release.
my $perl_charname;
my $print;
my $All;
+my $Assigned; # All assigned characters in this Unicode release
my $script;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
if (! $optional{$addr} # File could be optional
&& $v_version ge $first_released{$addr})
{
- print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
+ print STDERR "Skipping processing input file '$file' because not found\n";
}
return;
}
'AL');
$lb->set_default_map($default);
}
+ }
- # If has the URS property, make sure that the standard aliases are in
- # it, since not in the input tables in some versions.
- my $urs = property_ref('Unicode_Radical_Stroke');
- if (defined $urs) {
- $urs->add_alias('cjkRSUnicode');
- $urs->add_alias('kRSUnicode');
- }
+ # If has the URS property, make sure that the standard aliases are in
+ # it, since not in the input tables in some versions.
+ my $urs = property_ref('Unicode_Radical_Stroke');
+ if (defined $urs) {
+ $urs->add_alias('cjkRSUnicode');
+ $urs->add_alias('kRSUnicode');
}
# For backwards compatibility with applications that may read the mapping
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my @fields = split /\s*;\s*/;
- if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
- $fields[1] = 'I';
- }
- elsif ($fields[1] eq 'L') {
+
+ if ($fields[1] eq 'L') {
$fields[1] = 'C'; # L => C always
}
elsif ($fields[1] eq 'E') {
# Change hyphens and blanks in the block name field only
$fields[1] =~ s/[ -]/_/g;
- $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
+ $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
$_ = join("; ", @fields);
return;
return;
}
+sub filter_all_caps_script_names {
+
+ # Some early Unicode releases had the script names in all CAPS. This
+ # converts them to just the first letter of each word being capital.
+
+ my ($range, $script, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+ my @words = split "_", $script;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $script = join "_", @words;
+ $_ = join ";", $range, $script, @remainder;
+}
+
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
return $Nl;
}
+sub calculate_Assigned() { # Calculate the gc != Cn code points; may be
+ # called before the Cn's are completely filled.
+ # Works on Unicodes earlier than ones that
+ # explicitly specify Cn.
+ return if defined $Assigned;
+
+ if (! defined $gc || $gc->is_empty()) {
+ Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
+ }
+
+ $Assigned = $perl->add_match_table('Assigned',
+ Description => "All assigned code points",
+ );
+ while (defined (my $range = $gc->each_range())) {
+ my $standard_value = standardize($range->value);
+ next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
+ $Assigned->add_range($range->start, $range->end);
+ }
+}
+
sub compile_perl() {
# Create perl-defined tables. Almost all are part of the pseudo-property
# named 'perl' internally to this program. Many of these are recommended
$Any->add_range(0, $MAX_UNICODE_CODEPOINT);
$Any->add_alias('Unicode');
- # Assigned is the opposite of gc=unassigned
- my $Assigned = $perl->add_match_table('Assigned',
- Description => "All assigned code points",
- Initialize => ~ $gc->table('Unassigned'),
- );
+ calculate_Assigned();
# Our internal-only property should be treated as more than just a
# synonym; grandfather it in to the pod.
# There are quite a few code points in Lower, that aren't in gc=lc,
# and not all are in all releases.
- foreach my $code_point ( utf8::unicode_to_native(0xAA),
- utf8::unicode_to_native(0xBA),
- 0x02B0 .. 0x02B8,
- 0x02C0 .. 0x02C1,
- 0x02E0 .. 0x02E4,
- 0x0345,
- 0x037A,
- 0x1D2C .. 0x1D6A,
- 0x1D78,
- 0x1D9B .. 0x1DBF,
- 0x2071,
- 0x207F,
- 0x2090 .. 0x209C,
- 0x2170 .. 0x217F,
- 0x24D0 .. 0x24E9,
- 0x2C7C .. 0x2C7D,
- 0xA770,
- 0xA7F8 .. 0xA7F9,
- ) {
- # Don't include the code point unless it is assigned in this
- # release
- my $category = $gc->value_of($code_point);
- next if ! defined $category || $category eq 'Cn';
-
- $Lower += $code_point;
- }
+ my $temp = Range_List->new(Initialize => [
+ utf8::unicode_to_native(0xAA),
+ utf8::unicode_to_native(0xBA),
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ]);
+ $Lower += $temp & $Assigned;
}
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Description =>
"Code points whose fold is a string of more than one character",
);
+ if ($v_version lt v3.0.1) {
+ push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
+ }
# Look through all the known folds to populate these tables.
foreach my $range ($cf->ranges) {
$unassigned->set_equivalent_to($age_default, Related => 1);
}
+ my $patws = $perl->add_match_table('_Perl_PatWS',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
+ $patws->initialize($off_patws->table('Y'));
+ }
+ else {
+ $patws->initialize([ ord("\t"),
+ ord("\n"),
+ utf8::unicode_to_native(0x0B), # VT
+ ord("\f"),
+ ord("\r"),
+ ord(" "),
+ utf8::unicode_to_native(0x85), # NEL
+ 0x200E..0x200F, # Left, Right marks
+ 0x2028..0x2029 # Line, Paragraph seps
+ ] );
+ }
+
# See L<perlfunc/quotemeta>
my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
Perl_Extension => 1,
# The first few character columns are filler, plus the '\p{'; and get rid
# of all the trailing stuff, starting with the trailing '}', so as to sort
# on just 'Name=Value'
- (my $a = lc $a) =~ s/^ .*? { //x;
+ (my $a = lc $a) =~ s/^ .*? \{ //x;
$a =~ s/}.*//;
- (my $b = lc $b) =~ s/^ .*? { //x;
+ (my $b = lc $b) =~ s/^ .*? \{ //x;
$b =~ s/}.*//;
# Determine if the two operands are both internal only or both not.
),
Input_file->new('Scripts.txt', v3.1.0,
Property => 'Script',
+ Each_Line_Handler => (($v_version le v4.0.0)
+ ? \&filter_all_caps_script_names
+ : undef),
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('DNormalizationProps.txt', v3.1.0,