# Any files created or read by this program should be listed in 'mktables.lst'
# Use -makelist to regenerate it.
-# Needs 'no overloading' to run faster on miniperl. Code commented out at the
-# subroutine objaddr can be used instead to work as far back (untested) as
-# 5.8: needs pack "U". But almost all occurrences of objaddr have been
-# removed in favor of using 'no overloading'. You also would have to go
-# through and replace occurrences like:
-# my $addr = do { no overloading; pack 'J', $self; }
-# with
-# my $addr = main::objaddr $self;
-# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
-# that instituted the change to main::objaddr, and subsequent commits that
-# changed 0+$self to pack 'J', $self.)
+# There was an attempt when this was first rewritten to make it 5.8
+# compatible, but that has now been abandoned, and newer constructs are used
+# as convenient.
my $start_time;
BEGIN { # Get the time the script started running; do it at compilation to
# As mentioned earlier, some properties are given in more than one file. In
# particular, the files in the extracted directory are supposedly just
# reformattings of the others. But they contain information not easily
-# derivable from the other files, including results for Unihan, which this
-# program doesn't ordinarily look at, and for unassigned code points. They
+# derivable from the other files, including results for Unihan (which isn't
+# usually available to this program) and for unassigned code points. They
# also have historically had errors or been incomplete. In an attempt to
# create the best possible data, this program thus processes them first to
# glean information missing from the other files; then processes those other
#
# There is a bug in the 3.2 data file in which some values for the
# kPrimaryNumeric property have commas and an unexpected comment. A filter
-# could be added for these; or for a particular installation, the Unihan.txt
-# file could be edited to fix them.
+# could be added to correct these; or for a particular installation, the
+# Unihan.txt file could be edited to fix them.
#
# HOW TO ADD A FILE TO BE PROCESSED
#
# handled by Unicode::Normalize, nor will it compile when presented a version
# that has them. However, you can trivially get it to compile by simply
# ignoring those decompositions, by changing the croak to a carp. At the time
-# of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
+# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
+# cpan/Unicode-Normalize/mkheader) reads
#
# croak("Weird Canonical Decomposition of U+$h");
#
-# Simply change to a carp. It will compile, but will not know about any three
-# character decomposition.
+# Simply comment it out. It will compile, but will not know about any three
+# character decompositions.
# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
# that the reason is that the CJK block starting at 4E00 was removed from
# name for the class, it would not have been affected, but if it used the
# mnemonic, it would have been.
#
-# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
+# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
# points which eventually came to have this script property value, instead
# mapped to "Unknown". But in the next release all these code points were
# moved to \p{sc=common} instead.
+
+# The tests furnished by Unicode for testing WordBreak and SentenceBreak
+# generate errors in 5.0 and earlier.
#
# The default for missing code points for BidiClass is complicated. Starting
# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
|| $caller_name eq 'trace');
my $output = "";
+ #print STDERR __LINE__, ": ", join ", ", @input, "\n";
foreach my $string (@input) {
- #print STDERR __LINE__, ": ", join ", ", @input, "\n";
if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
$output .= simple_dumper($string);
}
# This is for a rarely used development feature that allows you to compare two
# versions of the Unicode standard without having to deal with changes caused
-# by the code points introduced in the later version. Change the 0 to a
-# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
-# code points introduced in that release and earlier will be used; later ones
-# are thrown away. You use the version number of the earliest one you want to
+# by the code points introduced in the later version. You probably also want
+# to use the -annotate option when using this. Change the 0 to a string
+# containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only code
+# points introduced in that release and earlier will be used; later ones are
+# thrown away. You use the version number of the earliest one you want to
# compare; then run this program on directory structures containing each
# release, and compare the outputs. These outputs will therefore include only
# the code points common to both releases, and you can see the changes caused
}
}
-# Enum values for to_output_map() method in the Map_Table package.
+# Enum values for to_output_map() method in the Map_Table package. (0 is don't
+# output)
my $EXTERNAL_MAP = 1;
my $INTERNAL_MAP = 2;
my $OUTPUT_ADJUSTED = 3;
# With this release, it is automatically handled if the Unihan db is
# downloaded
-push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
+push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version lt v5.2.0;
# There are several types of obsolete properties defined by Unicode. These
# must be hand-edited for every new Unicode release.
# existence is not noted in the comment.
'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
- 'Indic_Matra_Category' => "Provisional",
- 'Indic_Syllabic_Category' => "Provisional",
+ 'Indic_Matra_Category' => "Withdrawn by Unicode while still provisional",
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
EOF
-my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
+my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
+ ? "10FFFF"
+ : "FFFF";
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
-my $missing_defaults_prefix =
- qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
+my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
# Property types. Unicode has more types, but these are sufficient for our
# purposes.
# files
my %stricter_to_file_of; # same; but for stricter mapping.
my %loose_property_to_file_of; # Maps a loose property name to its map file
+my %strict_property_to_file_of; # Same, but strict
my @inline_definitions = "V0"; # Each element gives a definition of a unique
# inversion list. When a definition is inlined,
# its value in the hash it's in (one of the two
# defined just above) will include an index into
# this array. The 0th element is initialized to
- # the definition for a zero length invwersion list
+ # the definition for a zero length inversion list
my %file_to_swash_name; # Maps the file name to its corresponding key name
# in the hash %utf8::SwashInfo
my %nv_floating_to_rational; # maps numeric values floating point numbers to
# their rational equivalent
my %loose_property_name_of; # Loosely maps (non_string) property names to
# standard form
+my %strict_property_name_of; # Strictly maps (non_string) property names to
+ # standard form
my %string_property_loose_to_name; # Same, for string properties.
my %loose_defaults; # keys are of form "prop=value", where 'prop' is
# the property name in standard loose form, and
my $needing_code_points_ending_in_code_point = 0;
my @backslash_X_tests; # List of tests read in for testing \X
+my @SB_tests; # List of tests read in for testing \b{sb}
+my @WB_tests; # List of tests read in for testing \b{wb}
my @unhandled_properties; # Will contain a list of properties found in
# the input that we didn't process.
my @match_properties; # Properties that have match tables, to be
my $perl_charname;
my $print;
my $All;
+my $Assigned; # All assigned characters in this Unicode release
my $script;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
containing_range($i)->end);
}
}
- elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
- $viacode[$i] = $gc->value_of($i);
- $annotate_char_type[$i] = $UNKNOWN_TYPE;
- $printable[$i] = 0;
- }
- elsif ($gc-> table('Surrogate')->contains($i)) {
+ elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
$viacode[$i] = 'Surrogate';
$annotate_char_type[$i] = $SURROGATE_TYPE;
$printable[$i] = 0;
# file line is read. This allows the EOF handler to flush buffers, for
# example. The difference between the two routines is that the lines inserted
# by insert_lines() are subjected to the each_line_handler()s. (So if you
-# called it from such a handler, you would get infinite recursion.) Lines
-# inserted by insert_adjusted_lines() go directly to the main handler without
-# any adjustments. If the post-processing handler calls any of these, there
-# will be no effect. Some error checking for these conditions could be added,
-# but it hasn't been done.
+# called it from such a handler, you would get infinite recursion without some
+# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go
+# directly to the main handler without any adjustments. If the
+# post-processing handler calls any of these, there will be no effect. Some
+# error checking for these conditions could be added, but it hasn't been done.
#
# carp_bad_line() should be called to warn of bad input lines, which clears $_
# to prevent further processing of the line. This routine will output the
my %post_handler;
# Subroutine to call after all the lines of the file are read in and
- # processed. If undef, no such handler is called.
+ # processed. If undef, no such handler is called. Note that this cannot
+ # add lines to be processed; instead use eof_handler
main::set_access('post_handler', \%post_handler, qw{ c });
my %progress_message;
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
+ undef $file{$addr} if $first_released{$addr} gt $v_version;
+
# The rest of the arguments are key => value pairs
# %constructor_fields has been set up earlier to list all possible
# ones. Either set or push, depending on how the default has been set
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $skipped_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr};
}
elsif ($properties{$addr}) {
- # Add a handler for each line in the input so that it creates a
- # separate input line for each property in those input lines, thus
- # making them suitable for process_generic_property_file().
+ # Similarly, there may be more than one property represented on
+ # each line, with no clue but the constructor input what those
+ # might be. Add a handler for each line in the input so that it
+ # creates a separate input line for each property in those input
+ # lines, thus making them suitable to handle generically.
push @{$each_line_handler{$addr}},
sub {
};
}
- { # On non-ascii platforms, we use a special handler
+ { # On non-ascii platforms, we use a special pre-handler
no strict;
no warnings 'once';
*next_line = (main::NON_ASCII_PLATFORM)
# than this Unicode version), and isn't there. This means if someone
# copies it into an earlier version's directory, we will go ahead and
# process it.
- return if $first_released{$addr} gt $v_version && ! -e $file;
+ return if $first_released{$addr} gt $v_version
+ && (! defined $file || ! -e $file);
# If in debugging mode and this file doesn't have the non-skip
# flag set, and isn't one of the critical files, skip it.
if (! $optional{$addr} # File could be optional
&& $v_version ge $first_released{$addr})
{
- print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
+ print STDERR "Skipping processing input file '$file' because not found\n";
}
return;
}
}
else {
-
- # Here, the file exists. Some platforms may change the case of
- # its name
if ($seen_non_extracted_non_age) {
- if ($file =~ /$EXTRACTED/i) {
+ if ($file =~ /$EXTRACTED/i) # Some platforms may change the
+ # case of the file's name
+ {
Carp::my_carp_bug(main::join_lines(<<END
$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
$seen_non_extracted_non_age = 1;
}
- # And mark the file as having being processed, and warn if it
+            # Mark the file as having been processed, and warn if it
# isn't a file we are expecting. As we process the files,
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
return;
}
- # Open the file, converting the slashes used in this program
- # into the proper form for the OS
+ # Here, we are going to process the file. Open it, converting the
+ # slashes used in this program into the proper form for the OS
my $file_handle;
if (not open $file_handle, "<", $file) {
Carp::my_carp("Can't open $file. Skipping: $!");
$handle{$addr} = $file_handle; # Cache the open file handle
if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
+
+ # UnicodeData.txt has no version marker; the others started
+ # getting it in 3.2. Unihan files have the version somewhere
+ # in the first comment block; the other files have it as the
+ # very first line
if ($file !~ /^Unihan/i) {
$_ = <$file_handle>;
if ($_ !~ / - $string_version \. /x) {
# => $MULTIPLE_BEFORE means that if this range duplicates an
# existing one, but has a different value,
# don't replace the existing one, but insert
- # this, one so that the same range can occur
+ # this one so that the same range can occur
# multiple times. They are stored LIFO, so
# that the final one inserted is the first one
# returned in an ordered search of the table.
# existing range, this one is discarded
# (leaving the existing one in its original,
# higher priority position
+ # => $CROAK Die with an error if is already there
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
# "same value" means identical for non-type-0 ranges, and it means
# Here, the new range starts just after the current highest in
# the range list, and they have the same type and value.
- # Extend the current range to incorporate the new one.
+ # Extend the existing range to incorporate the new one.
@{$r}[-1]->set_end($end);
}
# In other words,
# r[$i-1]->end < $start <= r[$i]->end
# And:
- # r[$i-1]->end < $start <= $end <= r[$j]->end
+ # r[$i-1]->end < $start <= $end < r[$j+1]->start
#
# Also:
# $clean_insert is a boolean which is set true if and only if
# We now have enough information to decide if this call is a no-op
# or not. It is a no-op if this is an insertion of already
- # existing data.
+ # existing data. To be so, it must be contained entirely in one
+ # range.
if (main::DEBUG && $to_trace && $clean_insert
- && $i == $j
- && $start >= $r->[$i]->start)
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end)
{
trace "no-op";
}
return if $clean_insert
- && $i == $j # more than one affected range => not no-op
-
- # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
- # Further, $start and/or $end is >= r[$i]->start
- # The test below hence guarantees that
- # r[$i]->start < $start <= $end <= r[$i]->end
- # This means the input range is contained entirely in
- # the one at $i, so is a no-op
- && $start >= $r->[$i]->start;
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end;
}
# Here, we know that some action will have to be taken. We have
$status = $NORMAL unless defined $status;
# An internal name does not get documented, unless overridden by the
- # input.
+ # input; same for making tests for it.
my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
$insert_position,
0,
Alias->new($name, $loose_match, $make_re_pod_entry,
- $ok_as_filename, $status, $ucd);
+ $ok_as_filename, $status, $ucd);
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
# There are tables which end up only having one element per
# range, but it is not worth keeping track of for making just
# this comment a little better.
- $comment.= <<END;
+ $comment .= <<END;
non-comment portions of the main body of lines of this file is:
START\\tSTOP\\tMAPPING where START is the starting code point of the
range, in hex; STOP is the ending point, or if omitted, the range has just one
main::uniques($leader, @{$equivalents{$addr}});
my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
# tables
-
for my $parent (@parents) {
my $property = $parent->property;
? main::max(scalar @table_aliases,
scalar @property_aliases)
: 0;
- trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
-
+ trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
my $property_had_compound_name = 0;
package main;
- sub display_chr {
- # Converts an ordinal character value to a displayable string, using a
- # NBSP to hold combining characters.
+sub display_chr {
+ # Converts an ordinal printable character value to a displayable string,
+ # using a dotted circle to hold combining characters.
- my $ord = shift;
- my $chr = chr $ord;
- return $chr if $ccc->table(0)->contains($ord);
- return chr(utf8::unicode_to_native(0xA0)) . $chr;
- }
+ my $ord = shift;
+ my $chr = chr $ord;
+ return $chr if $ccc->table(0)->contains($ord);
+ return "\x{25CC}$chr";
+}
sub join_lines($) {
# Returns lines of the input joined together, so that they can be folded
my $object = shift;
my $fields_ref = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $object; };
'AL');
$lb->set_default_map($default);
}
+ }
- # If has the URS property, make sure that the standard aliases are in
- # it, since not in the input tables in some versions.
- my $urs = property_ref('Unicode_Radical_Stroke');
- if (defined $urs) {
- $urs->add_alias('cjkRSUnicode');
- $urs->add_alias('kRSUnicode');
- }
+ # If has the URS property, make sure that the standard aliases are in
+ # it, since not in the input tables in some versions.
+ my $urs = property_ref('Unicode_Radical_Stroke');
+ if (defined $urs) {
+ $urs->add_alias('cjkRSUnicode');
+ $urs->add_alias('kRSUnicode');
}
# For backwards compatibility with applications that may read the mapping
# program generates for this block property value
#$file->insert_lines('blk; n/a; Herited');
-
# Process each line of the file ...
while ($file->next_line) {
# thus shifting the former field 0 to after them.)
splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
+ if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
+ my $new_style = $data[1] =~ s/-/_/gr;
+ splice @data, 1, 0, $new_style;
+ }
+
# Field 0 is a short name unless "n/a"; field 1 is the full name. If
# there is no short name, use the full one in element 1
if ($data[0] eq "n/a") {
$line));
}
- # And process the first range, like any other.
+ # And set things up so that the below will process this first
+ # range, like any other.
$low = $this_range->start;
$high = $this_range->end;
}
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
# Rationals require extra effort.
- register_fraction($fields[$NUMERIC])
- if $fields[$NUMERIC] =~ qr{/};
+ if ($fields[$NUMERIC] =~ qr{/}) {
+ reduce_fraction(\$fields[$NUMERIC]);
+ register_fraction($fields[$NUMERIC])
+ }
}
}
return;
}
+sub process_SB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @SB_tests, $_;
+ }
+
+ return;
+}
+
+sub process_WB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @WB_tests, $_;
+ }
+
+ return;
+}
+
sub process_NamedSequences {
# NamedSequences.txt entries are just added to an array. Because these
# don't look like the other tables, they have their own handler.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my @fields = split /\s*;\s*/;
- if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
- $fields[1] = 'I';
- }
- elsif ($fields[1] eq 'L') {
+
+ if ($fields[1] eq 'L') {
$fields[1] = 'C'; # L => C always
}
elsif ($fields[1] eq 'E') {
return;
}
+sub gcd($$) { # Greatest-common-divisor; from
+ # http://en.wikipedia.org/wiki/Euclidean_algorithm
+ my ($a, $b) = @_;
+
+ use integer;
+
+ while ($b != 0) {
+ my $temp = $b;
+ $b = $a % $b;
+ $a = $temp;
+ }
+ return $a;
+}
+
+sub reduce_fraction($) {
+ my $fraction_ref = shift;
+
+ # Reduce a fraction to lowest terms. The Unicode data may be reducible,
+ # hence this is needed. The argument is a reference to the
+    # string denoting the fraction, which must be of the form: [-]digits/digits
+ if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
+ Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged");
+ return;
+ }
+
+ my $sign = $1;
+ my $numerator = $2;
+ my $denominator = $3;
+
+ use integer;
+
+ # Find greatest common divisor
+ my $gcd = gcd($numerator, $denominator);
+
+ # And reduce using the gcd.
+ if ($gcd != 1) {
+ $numerator /= $gcd;
+ $denominator /= $gcd;
+ $$fraction_ref = "$sign$numerator/$denominator";
+ }
+
+ return;
+}
+
sub filter_numeric_value_line {
# DNumValues contains lines of a different syntax than the typical
# property file:
$_ = "";
return;
}
+ reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
$rational = $fields[3];
+
$_ = join '; ', @fields[ 0, 3 ];
}
else {
# Change hyphens and blanks in the block name field only
$fields[1] =~ s/[ -]/_/g;
- $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
+ $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
$_ = join("; ", @fields);
return;
return;
}
+sub filter_all_caps_script_names {
+
+ # Some early Unicode releases had the script names in all CAPS. This
+ # converts them to just the first letter of each word being capital.
+
+ my ($range, $script, @remainder)
+ = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+ my @words = split "_", $script;
+ for my $word (@words) {
+ $word =
+ ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $script = join "_", @words;
+ $_ = join ";", $range, $script, @remainder;
+}
+
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
# Find the non-default table.
for my $table ($property->tables) {
- next if $table == $default_table;
+ if ($table == $default_table) {
+ if ($v_version le v5.0.0) {
+ $table->add_alias($_) for qw(N No F False);
+ }
+ next;
+ } elsif ($v_version le v5.0.0) {
+ $table->add_alias($_) for qw(Y Yes T True);
+ }
$non_default_table = $table;
}
$default_table->set_complement($non_default_table);
$gc->table('Ll')->set_caseless_equivalent($LC);
$gc->table('Lu')->set_caseless_equivalent($LC);
- my $Cs = $gc->table('Cs');
-
# Create digit and case fold tables with the original file names for
# backwards compatibility with applications that read them directly.
my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
return $Nl;
}
+sub calculate_Assigned() { # Calculate the gc != Cn code points; may be
+ # called before the Cn's are completely filled.
+ # Works on Unicodes earlier than ones that
+ # explicitly specify Cn.
+ return if defined $Assigned;
+
+ if (! defined $gc || $gc->is_empty()) {
+ Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
+ }
+
+ $Assigned = $perl->add_match_table('Assigned',
+ Description => "All assigned code points",
+ );
+ while (defined (my $range = $gc->each_range())) {
+ my $standard_value = standardize($range->value);
+ next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
+ $Assigned->add_range($range->start, $range->end);
+ }
+}
+
sub compile_perl() {
# Create perl-defined tables. Almost all are part of the pseudo-property
# named 'perl' internally to this program. Many of these are recommended
}
my $Any = $perl->add_match_table('Any',
- Description => "All Unicode code points: [\\x{0000}-\\x{10FFFF}]",
+ Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
);
- $Any->add_range(0, 0x10FFFF);
+ $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
$Any->add_alias('Unicode');
- # Assigned is the opposite of gc=unassigned
- my $Assigned = $perl->add_match_table('Assigned',
- Description => "All assigned code points",
- Initialize => ~ $gc->table('Unassigned'),
- );
+ calculate_Assigned();
# Our internal-only property should be treated as more than just a
# synonym; grandfather it in to the pod.
# There are quite a few code points in Lower, that aren't in gc=lc,
# and not all are in all releases.
- foreach my $code_point ( utf8::unicode_to_native(0xAA),
- utf8::unicode_to_native(0xBA),
- 0x02B0 .. 0x02B8,
- 0x02C0 .. 0x02C1,
- 0x02E0 .. 0x02E4,
- 0x0345,
- 0x037A,
- 0x1D2C .. 0x1D6A,
- 0x1D78,
- 0x1D9B .. 0x1DBF,
- 0x2071,
- 0x207F,
- 0x2090 .. 0x209C,
- 0x2170 .. 0x217F,
- 0x24D0 .. 0x24E9,
- 0x2C7C .. 0x2C7D,
- 0xA770,
- 0xA7F8 .. 0xA7F9,
- ) {
- # Don't include the code point unless it is assigned in this
- # release
- my $category = $gc->value_of(hex $code_point);
- next if ! defined $category || $category eq 'Cn';
-
- $Lower += $code_point;
- }
+ my $temp = Range_List->new(Initialize => [
+ utf8::unicode_to_native(0xAA),
+ utf8::unicode_to_native(0xBA),
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ]);
+ $Lower += $temp & $Assigned;
}
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Cntrl & $ASCII,
);
+ my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
+ if (defined (my $Cs = $gc->table('Cs'))) {
+ $perl_surrogate += $Cs;
+ }
+ else {
+ push @tables_that_may_be_empty, '_Perl_Surrogate';
+ }
+
# $controls is a temporary used to construct Graph.
my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
- + $gc->table('Control'));
- # Cs not in release 1
- $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
+ + $gc->table('Control')
+ + $perl_surrogate);
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
# 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
# of the MU issue.
foreach my $range ($loc_problem_folds->ranges) {
- foreach my $code_point($range->start .. $range->end) {
+ foreach my $code_point ($range->start .. $range->end) {
my $fold_range = $cf->containing_range($code_point);
next unless defined $fold_range;
+ # Skip if folds to itself
+ next if $fold_range->value eq $CODE_POINT;
+
my @hex_folds = split " ", $fold_range->value;
- my $start_cp = hex $hex_folds[0];
+ my $start_cp = $hex_folds[0];
+ next if $start_cp eq $CODE_POINT;
+ $start_cp = hex $start_cp;
foreach my $i (0 .. @hex_folds - 1) {
- my $cp = hex $hex_folds[$i];
+ my $cp = $hex_folds[$i];
+ next if $cp eq $CODE_POINT;
+ $cp = hex $cp;
next unless $cp > 255; # Already have the < 256 ones
$loc_problem_folds->add_range($cp, $cp);
Description =>
"Code points whose fold is a string of more than one character",
);
+ if ($v_version lt v3.0.1) {
+ push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
+ }
# Look through all the known folds to populate these tables.
foreach my $range ($cf->ranges) {
+ next if $range->value eq $CODE_POINT;
my $start = $range->start;
my $end = $range->end;
$any_folds->add_range($start, $end);
+ utf8::unicode_to_native(0xA0) # NBSP
);
- # These two tables are for matching \X, which is based on the 'extended'
- # grapheme cluster, which came in 5.1; create empty ones if not already
- # present. The straight 'grapheme cluster' (non-extended) is used prior
- # to 5.1, and differs from the extended (see
- # http://www.unicode.org/reports/tr29/) only by these two tables, so we
- # get the older definition automatically when they are empty.
- my $gcb = property_ref('Grapheme_Cluster_Break');
- my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
- $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
- }
- else {
- push @tables_that_may_be_empty, $perl_prepend->complete_name;
- }
-
- # All the tables with _X_ in their names are used in defining \X handling,
- # and are based on the Unicode GCB property. Basically, \X matches:
- # CR LF
- # | Prepend* Begin Extend*
- # | .
- # Begin is: ( Special_Begin | ! Control )
- # Begin is also: ( Regular_Begin | Special_Begin )
- # where Regular_Begin is defined as ( ! Control - Special_Begin )
- # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- # Extend is: ( Grapheme_Extend | Spacing_Mark )
- # Control is: [ GCB_Control | CR | LF ]
- # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- foreach my $gcb_name (qw{ L V T LV LVT }) {
-
- # The perl internal extension's name is the gcb table name prepended
- # with an '_X_'
- my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table($gcb_name),
- );
- # Version 1 had mostly different Hangul syllables that were removed
- # from later versions, so some of the tables may not apply.
- if ($v_version lt v2.0) {
- push @tables_that_may_be_empty, $perl_table->complete_name;
- }
- }
-
- # More GCB. Populate a combined hangul syllables table
- my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
-
- my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if ($v_version ge v6.2) {
- $ri += $gcb->table('RI');
- }
- else {
- push @tables_that_may_be_empty, $ri->full_name;
- }
-
- my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $lv_lvt_v
- + $gcb->table('L')
- + $gcb->table('T')
- + $ri
- );
- $specials_begin->add_comment(join_lines( <<END
-For use in \\X; matches first (perhaps only) character of potential
-multi-character sequences that can begin an extended grapheme cluster. They
-need special handling because of their complicated nature.
-END
- ));
- my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => ~ $gcb->table('Control')
- - $specials_begin
- - $gcb->table('CR')
- - $gcb->table('LF')
- );
- $regular_begin->add_comment(join_lines( <<END
-For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster, except those that require special handling.
-END
- ));
-
- my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table('Extend')
- );
- if (defined (my $sm = $gcb->table('SpacingMark'))) {
- $extend += $sm;
- }
- $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
-
- # End of GCB \X processing
-
my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
if (@named_sequences) {
END
# Now add the Unicode_1 names for the controls. The Unicode_1 names had
- # precedence before 6.1, so should be first in the file; the other names
- # have precedence starting in 6.1,
+ # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
+ # so should be first in the file; the other names have precedence starting
+# in 6.1.
my $before_or_after = ($v_version lt v6.1.0)
? $MULTIPLE_BEFORE
: $MULTIPLE_AFTER;
$unassigned->set_equivalent_to($age_default, Related => 1);
}
+ my $patws = $perl->add_match_table('_Perl_PatWS',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
+ $patws->initialize($off_patws->table('Y'));
+ }
+ else {
+ $patws->initialize([ ord("\t"),
+ ord("\n"),
+ utf8::unicode_to_native(0x0B), # VT
+ ord("\f"),
+ ord("\r"),
+ ord(" "),
+ utf8::unicode_to_native(0x85), # NEL
+                             0x200E..0x200F,    # LTR, RTL marks
+ 0x2028..0x2029 # Line, Paragraph seps
+ ] );
+ }
+
# See L<perlfunc/quotemeta>
my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
Perl_Extension => 1,
# Initialize to what's common in
# all Unicode releases.
Initialize =>
- $Space
- + $gc->table('Control')
+ $gc->table('Control')
+ + $Space
+ + $patws
+ + ((~ $Word) & $ASCII)
);
- # In early releases without the proper Unicode properties, just set to \W.
- if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
- || ! defined (my $patws = property_ref('Pattern_White_Space'))
- || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
- {
- $quotemeta += ~ $Word;
+ if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
+ $quotemeta += $patsyn->table('Y');
+ }
+ else {
+ $quotemeta += ((~ $Word) & Range->new(0, 255))
+ - utf8::unicode_to_native(0xA8)
+ - utf8::unicode_to_native(0xAF)
+ - utf8::unicode_to_native(0xB2)
+ - utf8::unicode_to_native(0xB3)
+ - utf8::unicode_to_native(0xB4)
+ - utf8::unicode_to_native(0xB7)
+ - utf8::unicode_to_native(0xB8)
+ - utf8::unicode_to_native(0xB9)
+ - utf8::unicode_to_native(0xBC)
+ - utf8::unicode_to_native(0xBD)
+ - utf8::unicode_to_native(0xBE);
+ $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
+ # same in all releases
+ 0x2010 .. 0x2027,
+ 0x2030 .. 0x203E,
+ 0x2041 .. 0x2053,
+ 0x2055 .. 0x205E,
+ 0x2190 .. 0x245F,
+ 0x2500 .. 0x2775,
+ 0x2794 .. 0x2BFF,
+ 0x2E00 .. 0x2E7F,
+ 0x3001 .. 0x3003,
+ 0x3008 .. 0x3020,
+ 0x3030 .. 0x3030,
+ 0xFD3E .. 0xFD3F,
+ 0xFE45 .. 0xFE46
+ ];
+ }
+
+ if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
+ $quotemeta += $di->table('Y')
}
else {
- $quotemeta += $patsyn->table('Y')
- + $patws->table('Y')
- + $di->table('Y')
- + ((~ $Word) & $ASCII);
+ if ($v_version ge v2.0) {
+ $quotemeta += $gc->table('Cf')
+ + $gc->table('Cs');
+ }
+ $quotemeta += $gc->table('Cc')
+ - $Space;
+ my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
+ 0x2060 .. 0x206F,
+ 0xFE00 .. 0xFE0F,
+ 0xFFF0 .. 0xFFFB,
+ 0xE0000 .. 0xE0FFF,
+ ]);
+ $quotemeta += $temp & $Assigned;
+ }
+
+ my $nchar = $perl->add_match_table('_Perl_Nchar',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ if (defined (my $off_nchar = property_ref('Nchar'))) {
+ $nchar->initialize($off_nchar->table('Y'));
+ }
+ else {
+ $nchar->initialize([ 0xFFFE .. 0xFFFF ]);
+ if ($v_version ge v2.0) { # First release with these nchars
+ for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
+ $nchar += [ $i .. $i+1 ];
+ }
+ }
}
# Finished creating all the perl properties. All non-internal non-string
my $file = shift; # The file name in the final directory.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
+ trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
if ($table->isa('Property')) {
$table->set_file_path(@$directory_ref, $file);
# property's map table
foreach my $alias ($table->aliases) {
my $name = $alias->name;
- $loose_property_to_file_of{standardize($name)} = $file;
+ if ($name =~ /^_/) {
+ $strict_property_to_file_of{lc $name} = $file;
+ }
+ else {
+ $loose_property_to_file_of{standardize($name)} = $file;
+ }
}
# And a way for utf8_heavy to find the proper key in the SwashInfo
# The first few character columns are filler, plus the '\p{'; and get rid
# of all the trailing stuff, starting with the trailing '}', so as to sort
# on just 'Name=Value'
- (my $a = lc $a) =~ s/^ .*? { //x;
+ (my $a = lc $a) =~ s/^ .*? \{ //x;
$a =~ s/}.*//;
- (my $b = lc $b) =~ s/^ .*? { //x;
+ (my $b = lc $b) =~ s/^ .*? \{ //x;
$b =~ s/}.*//;
# Determine if the two operands are both internal only or both not.
C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
+Also, perl can be recompiled to operate on an earlier version of the Unicode
+standard. Further information is at
+C<\$Config{privlib}>/F<unicore/README.perl>.
+
=head1 Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version
= simple_dumper(\%loose_property_name_of, ' ' x 4);
chomp $loose_property_name_of;
+ my $strict_property_name_of
+ = simple_dumper(\%strict_property_name_of, ' ' x 4);
+ chomp $strict_property_name_of;
+
my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
chomp $stricter_to_file_of;
= simple_dumper(\%loose_property_to_file_of, ' ' x 4);
chomp $loose_property_to_file_of;
+ my $strict_property_to_file_of
+ = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
+ chomp $strict_property_to_file_of;
+
my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
chomp $file_to_swash_name;
$loose_property_name_of
);
+# Same, but strict names
+\%utf8::strict_property_name_of = (
+$strict_property_name_of
+);
+
# Gives the definitions (in the form of inversion lists) for those properties
# whose definitions aren't kept in files
\@utf8::inline_definitions = (
$loose_property_to_file_of
);
+# Same, but strict property names to mapping files
+\%utf8::strict_property_to_file_of = (
+$strict_property_to_file_of
+);
+
# Files to the swash names within them.
\%utf8::file_to_swash_name = (
$file_to_swash_name
# an element for the Hangul syllables in the appropriate place, and
# otherwise changes the name to include the "-<code point>" suffix.
my @algorithm_names;
- my $done_hangul = 0;
-
+ my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came
+ # along in this version
# Copy it linearly.
for my $i (0 .. @code_points_ending_in_code_point - 1) {
}
}
else {
- if (exists ($loose_property_name_of{$alias_standard}))
- {
- Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
+ my $hash_ref = ($alias_standard =~ /^_/)
+ ? \%strict_property_name_of
+ : \%loose_property_name_of;
+ if (exists $hash_ref->{$alias_standard}) {
+ Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained");
}
else {
- $loose_property_name_of{$alias_standard}
+ $hash_ref->{$alias_standard}
= $standard_property_name;
}
my @output;
# Create a complete set of tests, with complements.
if (defined $valid_code) {
- push @output, <<"EOC"
+ push @output, <<"EOC"
Expect(1, $valid_code, '\\p{$name}', $warning);
Expect(0, $valid_code, '\\p{^$name}', $warning);
Expect(0, $valid_code, '\\P{$name}', $warning);
EOC
}
if (defined $invalid_code) {
- push @output, <<"EOC"
+ push @output, <<"EOC"
Expect(0, $invalid_code, '\\p{$name}', $warning);
Expect(1, $invalid_code, '\\p{^$name}', $warning);
Expect(1, $invalid_code, '\\P{$name}', $warning);
<DATA>,
@output,
(map {"Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {"Test_SB('$_');\n"} @SB_tests),
+ (map {"Test_WB('$_');\n"} @WB_tests),
"Finished();\n"
]);
my @input_file_objects = (
Input_file->new('PropertyAliases.txt', v0,
Handler => \&process_PropertyAliases,
- ),
+ ),
Input_file->new(undef, v0, # No file associated with this
Progress_Message => 'Finishing property setup',
Handler => \&finish_property_setup,
- ),
+ ),
Input_file->new('PropValueAliases.txt', v0,
Handler => \&process_PropValueAliases,
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('DAge.txt', v3.2.0,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Age'
- ),
+ ),
Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
Property => 'General_Category',
- ),
+ ),
Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
Property => 'Canonical_Combining_Class',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
Property => 'Numeric_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
Property => 'East_Asian_Width',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
Property => 'Line_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
Property => 'Bidi_Class',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
Property => 'Decomposition_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
Property => 'Numeric_Value',
Each_Line_Handler => \&filter_numeric_value_line,
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
Property => 'Joining_Group',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
Property => 'Joining_Type',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('Jamo.txt', v2.0.0,
Property => 'Jamo_Short_Name',
Each_Line_Handler => \&filter_jamo_line,
- ),
+ ),
Input_file->new('UnicodeData.txt', v1.1.5,
Pre_Handler => \&setup_UnicodeData,
\&filter_UnicodeData_line,
],
EOF_Handler => \&EOF_UnicodeData,
- ),
+ ),
Input_file->new('ArabicShaping.txt', v2.0.0,
Each_Line_Handler =>
($v_version lt 4.1.0)
# not used by Perl
Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('Blocks.txt', v2.0.0,
Property => 'Block',
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => \&filter_blocks_lines
- ),
+ ),
Input_file->new('PropList.txt', v2.0.0,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_old_style_proplist
: undef),
- ),
+ ),
Input_file->new('Unihan.txt', v2.0.0,
Pre_Handler => \&setup_unihan,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('SpecialCasing.txt', v2.1.8,
Each_Line_Handler => ($v_version eq 2.1.8)
? \&filter_2_1_8_special_casing_line
: \&filter_special_casing_line,
Pre_Handler => \&setup_special_casing,
Has_Missings_Defaults => $IGNORED,
- ),
+ ),
Input_file->new(
'LineBreak.txt', v3.0.0,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_early_ea_lb
: undef),
- ),
+ ),
Input_file->new('EastAsianWidth.txt', v3.0.0,
Property => 'East_Asian_Width',
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v3.1.0)
? \&filter_early_ea_lb
: undef),
- ),
+ ),
Input_file->new('CompositionExclusions.txt', v3.0.0,
Property => 'Composition_Exclusion',
- ),
+ ),
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
Has_Missings_Defaults => ($v_version lt v6.2.0)
# anything to us, we will use the
# null string
: $IGNORED,
-
- ),
+ ),
Input_file->new("NormTest.txt", v3.0.0,
Handler => \&process_NormalizationsTest,
Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
- ),
+ ),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Each_Line_Handler =>
\&filter_case_folding_line
],
Has_Missings_Defaults => $IGNORED,
- ),
+ ),
Input_file->new('DCoreProperties.txt', v3.1.0,
# 5.2 changed this file
Has_Missings_Defaults => (($v_version ge v5.2.0)
? $NOT_IGNORED
: $NO_DEFAULTS),
- ),
+ ),
Input_file->new('Scripts.txt', v3.1.0,
Property => 'Script',
+ Each_Line_Handler => (($v_version le v4.0.0)
+ ? \&filter_all_caps_script_names
+ : undef),
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('DNormalizationProps.txt', v3.1.0,
Has_Missings_Defaults => $NOT_IGNORED,
Each_Line_Handler => (($v_version lt v4.0.1)
? \&filter_old_style_normalization_lines
: undef),
- ),
+ ),
Input_file->new('HangulSyllableType.txt', v0,
Has_Missings_Defaults => $NOT_IGNORED,
Property => 'Hangul_Syllable_Type',
Pre_Handler => ($v_version lt v4.0.0)
? \&generate_hst
: undef,
- ),
+ ),
Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
Property => 'Word_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
Property => 'Grapheme_Cluster_Break',
Has_Missings_Defaults => $NOT_IGNORED,
Pre_Handler => ($v_version lt v4.1.0)
? \&generate_GCB
: undef,
- ),
+ ),
Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
Handler => \&process_GCB_test,
- ),
+ ),
Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
Skip => 'Validation Tests',
- ),
+ ),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
- ),
+ Handler => \&process_SB_test,
+ ),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
- ),
+ Handler => \&process_WB_test,
+ ),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
Has_Missings_Defaults => $NOT_IGNORED,
- ),
+ ),
Input_file->new('NamedSequences.txt', v4.1.0,
Handler => \&process_NamedSequences
- ),
+ ),
Input_file->new('NameAliases.txt', v0,
Property => 'Name_Alias',
Pre_Handler => ($v_version le v6.0.0)
Each_Line_Handler => ($v_version le v6.0.0)
? \&filter_early_version_name_alias_line
: \&filter_later_version_name_alias_line,
- ),
+ ),
Input_file->new("BidiTest.txt", v5.2.0,
Skip => 'Validation Tests',
- ),
+ ),
Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanIRGSources.txt', v5.2.0,
Optional => 1,
Pre_Handler => \&setup_unihan,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanNumericValues.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanOtherMappings.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanReadings.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('UnihanVariants.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
- ),
+ ),
Input_file->new('ScriptExtensions.txt', v6.0.0,
Property => 'Script_Extensions',
Pre_Handler => \&setup_script_extensions,
Has_Missings_Defaults => (($v_version le v6.0.0)
? $NO_DEFAULTS
: $IGNORED),
- ),
- # The two Indic files are actually available starting in v6.0.0, but their
- # property values are missing from PropValueAliases.txt in that release,
- # so that further work would have to be done to get them to work properly
- # for that release.
- Input_file->new('IndicMatraCategory.txt', v6.1.0,
+ ),
+ # These two Indic files are actually not usable as-is until 6.1.0,
+ # because their property values are missing from PropValueAliases.txt
+ # until that release, so that further work would have to be done to get
+    # them to work properly, which isn't worth it because they are
+    # provisional.
+ Input_file->new('IndicMatraCategory.txt', v6.0.0,
Property => 'Indic_Matra_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
- ),
- Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
+ Skip => "Withdrawn by Unicode while still provisional",
+ ),
+ Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
Property => 'Indic_Syllabic_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
- ),
+ Skip => (($v_version lt v8.0.0)
+ ? "Provisional; for the analysis and processing of Indic scripts"
+ : 0),
+ ),
Input_file->new('BidiBrackets.txt', v6.3.0,
- Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
+ Properties => [ 'Bidi_Paired_Bracket',
+ 'Bidi_Paired_Bracket_Type'
+ ],
Has_Missings_Defaults => $NO_DEFAULTS,
- ),
+ ),
Input_file->new("BidiCharacterTest.txt", v6.3.0,
Skip => 'Validation Tests',
- ),
+ ),
+ Input_file->new('IndicPositionalCategory.txt', v8.0.0,
+ Property => 'Indic_Positional_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
);
# End of all the preliminaries.
my $Tests = 0;
my $Fails = 0;
+# loc_tools.pl requires this function to be defined
+sub ok($@) {
+ my ($pass, @msg) = @_;
+ print "not " unless $pass;
+ print "ok ";
+ print ++$Tests;
+ print " - ", join "", @msg if @msg;
+ print "\n";
+}
+
sub Expect($$$$) {
my $expected = shift;
my $ord = shift;
_test_break(shift, 'gcb');
}
+sub Test_SB($) {
+ _test_break(shift, 'sb');
+}
+
+sub Test_WB($) {
+ _test_break(shift, 'wb');
+}
+
sub Finished() {
print "1..$Tests\n";
exit($Fails ? -1 : 0);