#
# croak("Weird Canonical Decomposition of U+$h");
#
-# Simply change to a carp. It will compile, but will not know about any three
-# character decomposition.
+# Simply comment it out. It will compile, but will not know about any three
+# character decompositions. If using the .pm version, there is a similar
+# line.
# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
# that the reason is that the CJK block starting at 4E00 was removed from
# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
# Without this change, there are casing problems for this character.
#
+# Search for $string_compare_versions to see how to compare changes to
+# properties between Unicode versions
+#
##############################################################################
my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
# With this release, it is automatically handled if the Unihan db is
# downloaded
-push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
+push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version lt v5.2.0;
# There are several types of obsolete properties defined by Unicode. These
# must be hand-edited for every new Unicode release.
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so its
# existence is not noted in the comment.
- 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+ 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
- 'Indic_Matra_Category' => "Provisional",
- 'Indic_Syllabic_Category' => "Provisional",
+ 'Indic_Matra_Category' => "Withdrawn by Unicode while still provisional",
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
- 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
- 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
- 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
- FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
+ FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
);
foreach my $property (
EOF
-my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
+my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
+ ? "10FFFF"
+ : "FFFF";
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
+# We work with above-Unicode code points, up to UV_MAX. But when you get
+# that high, above IV_MAX, some operations don't work, and you can easily get
+# overflow. Therefore for internal use, we use a much smaller number,
+# translating it to UV_MAX only for output. The exact number is immaterial
+# (all Unicode code points are treated exactly the same), but the algorithm
+# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
+my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
+my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
+
+my $MAX_PLATFORM_CODEPOINT = ~0;
+
# Matches legal code point. 4-6 hex numbers, If there are 6, the first
# two must be 10; if there are 5, the first must not be a 0. Written this way
# to decrease backtracking. The first regex allows the code point to be at
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
-my $missing_defaults_prefix =
- qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
+my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
# Property types. Unicode has more types, but these are sufficient for our
# purposes.
my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
my $AUXILIARY = 'auxiliary';
-# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
-# and into UCD.pl for the use of UCD.pm
+# Hashes and arrays that will eventually go into Heavy.pl for the use of
+# utf8_heavy.pl and into UCD.pl for the use of UCD.pm
my %loose_to_file_of; # loosely maps table names to their respective
# files
my %stricter_to_file_of; # same; but for stricter mapping.
my %loose_property_to_file_of; # Maps a loose property name to its map file
+my @inline_definitions = "V0"; # Each element gives a definition of a unique
+ # inversion list. When a definition is inlined,
+ # its value in the hash it's in (one of the two
+ # defined just above) will include an index into
+ # this array. The 0th element is initialized to
+                               # the definition for a zero length inversion list
my %file_to_swash_name; # Maps the file name to its corresponding key name
# in the hash %utf8::SwashInfo
my %nv_floating_to_rational; # maps numeric values floating point numbers to
my $needing_code_points_ending_in_code_point = 0;
my @backslash_X_tests; # List of tests read in for testing \X
+my @SB_tests; # List of tests read in for testing \b{sb}
+my @WB_tests; # List of tests read in for testing \b{wb}
my @unhandled_properties; # Will contain a list of properties found in
# the input that we didn't process.
my @match_properties; # Properties that have match tables, to be
my $block;
my $perl_charname;
my $print;
-my $Any;
+my $All;
+my $Assigned; # All assigned characters in this Unicode release
my $script;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
my $PRIVATE_USE_TYPE = -3;
my $NONCHARACTER_TYPE = -4;
my $CONTROL_TYPE = -5;
-my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
+my $ABOVE_UNICODE_TYPE = -6;
+my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program
sub populate_char_info ($) {
# Used only with the $annotate option. Populates the arrays with the
my $end;
if (! $viacode[$i]) {
my $nonchar;
- if ($gc-> table('Private_use')->contains($i)) {
+ if ($i > $MAX_UNICODE_CODEPOINT) {
+ $viacode[$i] = 'Above-Unicode';
+ $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
+ $printable[$i] = 0;
+ $end = $MAX_WORKING_CODEPOINT;
+ }
+ elsif ($gc-> table('Private_use')->contains($i)) {
$viacode[$i] = 'Private Use';
$annotate_char_type[$i] = $PRIVATE_USE_TYPE;
$printable[$i] = 0;
return $number;
}
+sub clarify_code_point_count ($) {
+    # This is like clarify_number(), but the input is assumed to be a count of
+    # code points, rather than a generic number.  A count exceeding the number
+    # of Unicode code points must include the pseudo above-Unicode working
+    # range, so that range's size is subtracted to recover the true Unicode
+    # count, and the returned string is annotated accordingly.
+
+    my $append = "";
+
+    my $number = shift;
+    if ($number > $MAX_UNICODE_CODEPOINTS) {
+        $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
+        # Nothing left => the count was exactly the above-Unicode range
+        return "All above-Unicode code points" if $number == 0;
+        $append = " + all above-Unicode code points";
+    }
+    return clarify_number($number) . $append;
+}
package Carp;
{ # Closure
# This program uses the inside-out method for objects, as recommended in
- # "Perl Best Practices". This closure aids in generating those. There
+ # "Perl Best Practices". (This is the best solution still, since this has
+ # to run under miniperl.) This closure aids in generating those. There
# are two routines. setup_package() is called once per package to set
# things up, and then set_access() is called for each hash representing a
# field in the object. These routines arrange for the object to be
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
+ undef $file{$addr} if $first_released{$addr} gt $v_version;
+
# The rest of the arguments are key => value pairs
# %constructor_fields has been set up earlier to list all possible
# ones. Either set or push, depending on how the default has been set
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $skipped_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr};
}
elsif ($properties{$addr}) {
# than this Unicode version), and isn't there. This means if someone
# copies it into an earlier version's directory, we will go ahead and
# process it.
- return if $first_released{$addr} gt $v_version && ! -e $file;
+ return if $first_released{$addr} gt $v_version
+ && (! defined $file || ! -e $file);
# If in debugging mode and this file doesn't have the non-skip
# flag set, and isn't one of the critical files, skip it.
if (! $optional{$addr} # File could be optional
&& $v_version ge $first_released{$addr})
{
- print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
+ print STDERR "Skipping processing input file '$file' because not found\n";
}
return;
}
$handle{$addr} = $file_handle; # Cache the open file handle
if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
- $_ = <$file_handle>;
- if ($_ !~ / - $string_version \. /x) {
- chomp;
- $_ =~ s/^#\s*//;
- die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ if ($file !~ /^Unihan/i) {
+ $_ = <$file_handle>;
+ if ($_ !~ / - $string_version \. /x) {
+ chomp;
+ $_ =~ s/^#\s*//;
+ die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ }
+ }
+ else {
+ while (<$file_handle>) {
+ if ($_ !~ /^#/) {
+ Carp::my_carp_bug("Could not find the expected version info in file '$file'");
+ last;
+ }
+ chomp;
+ $_ =~ s/^#\s*//;
+ next if $_ !~ / version: /x;
+ last if $_ =~ /$string_version/;
+ die Carp::my_carp("File '$file' is '$_'. It should be version $string_version");
+ }
}
}
}
# If the range list is empty, return a large value that isn't adjacent
# to any that could be in the range list, for simpler tests
- return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
+ return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
return $ranges{$addr}->[0]->start;
}
Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
return;
}
- if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
- Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
- }
#local $to_trace = 1 if main::DEBUG;
if ($operation eq '-') {
# In other words,
# r[$i-1]->end < $start <= r[$i]->end
# And:
- # r[$i-1]->end < $start <= $end <= r[$j]->end
+ # r[$i-1]->end < $start <= $end <= r[$j+1]->start
#
# Also:
# $clean_insert is a boolean which is set true if and only if
# We now have enough information to decide if this call is a no-op
# or not. It is a no-op if this is an insertion of already
- # existing data.
+ # existing data. To be so, it must be contained entirely in one
+ # range.
if (main::DEBUG && $to_trace && $clean_insert
- && $i == $j
- && $start >= $r->[$i]->start)
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end)
{
trace "no-op";
}
return if $clean_insert
- && $i == $j # more than one affected range => not no-op
-
- # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
- # Further, $start and/or $end is >= r[$i]->start
- # The test below hence guarantees that
- # r[$i]->start < $start <= $end <= r[$i]->end
- # This means the input range is contained entirely in
- # the one at $i, so is a no-op
- && $start >= $r->[$i]->start;
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end;
}
# Here, we know that some action will have to be taken. We have
# And finally, add the gap from the end of the table to the max
# possible code point
- if ($max < $MAX_UNICODE_CODEPOINT) {
- $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
+ if ($max < $MAX_WORKING_CODEPOINT) {
+ $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
}
return $new;
}
# range.
my $end = $set->end;
return $end if is_code_point_usable($end, $try_hard);
+ $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
# End point didn't, work. Start at the beginning and try
# every one until find one that does work.
# The constructor can override the global flag of the same name.
main::set_access('output_range_counts', \%output_range_counts, 'r');
+ my %write_as_invlist;
+ # A boolean set iff the output file for this table is to be in the form of
+ # an inversion list/map.
+ main::set_access('write_as_invlist', \%write_as_invlist, 'r');
+
my %format;
# The format of the entries of the table. This is calculated from the
# data in the table (or passed in the constructor). This is an enum e.g.,
$range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
$caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
$fate{$addr} = delete $args{'Fate'} || $ORDINARY;
+ $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
my $ucd = delete $args{'UCD'};
my $description = delete $args{'Description'};
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $self; };
+ my $write_as_invlist = $write_as_invlist{$addr};
# Start with the header
my @HEADER = $self->header;
# This is a kludge for empty tables to silence a warning in
# utf8.c, which can't really deal with empty tables, but it can
- # deal with a table that matches nothing, as the inverse of 'Any'
+ # deal with a table that matches nothing, as the inverse of 'All'
# does.
- push @OUT, "!utf8::Any\n";
+ push @OUT, "!utf8::All\n";
}
elsif ($self->name eq 'N'
}
else {
my $range_size_1 = $range_size_1{$addr};
- my $format; # Used only in $annotate option
- my $include_name; # Used only in $annotate option
- my $include_cp; # Used only in $annotate option
# To make it more readable, use a minimum indentation
- my $comment_indent = 16;
-
- if ($annotate) {
+ my $comment_indent;
+
+ # These are used only in $annotate option
+ my $format; # e.g. $HEX_ADJUST_FORMAT
+ my $include_name; # ? Include the character's name in the
+ # annotation?
+ my $include_cp; # ? Include its code point
+
+ if (! $annotate) {
+ $comment_indent = ($self->isa('Map_Table'))
+ ? 24
+ : ($write_as_invlist)
+ ? 8
+ : 16;
+ }
+ else {
$format = $self->format;
# The name of the character is output only for tables that
# the first column
$include_cp = ! $range_size_1;
- if ($self->isa('Map_Table')) {
+ if (! $self->isa('Map_Table')) {
+ $comment_indent = ($write_as_invlist) ? 8 : 16;
+ }
+ else {
+ $comment_indent = 16;
# There are just a few short ranges in this table, so no
# need to include the code point in the annotation.
my $next_end;
my $next_value;
my $offset = 0;
+ my $invlist_count = 0;
my $output_value_in_hex = $self->isa('Map_Table')
&& ($self->format eq $HEX_ADJUST_FORMAT
$previous_value = $value;
}
- # If there is a range
- if ($start != $end) {
- push @OUT, sprintf "$hex_format\t$hex_format",
- $start, $end;
- if ($value ne "") {
+ if ($write_as_invlist) {
+
+ # Inversion list format has a single number per line,
+ # the starting code point of a range that matches the
+ # property
+ push @OUT, $start, "\n";
+ $invlist_count++;
+
+ # Add a comment with the size of the range, if
+ # requested.
+ if ($output_range_counts{$addr}) {
+ $OUT[-1] = merge_single_annotation_line(
+ $OUT[-1],
+ "# ["
+ . main::clarify_code_point_count($end - $start + 1)
+ . "]\n",
+ $comment_indent);
+ }
+ }
+ elsif ($start != $end) { # If there is a range
+ if ($end == $MAX_WORKING_CODEPOINT) {
+ push @OUT, sprintf "$hex_format\t$hex_format",
+ $start,
+ $MAX_PLATFORM_CODEPOINT;
+ }
+ else {
+ push @OUT, sprintf "$hex_format\t$hex_format",
+ $start, $end;
+ }
+ if (length $value) {
if ($convert_map_to_from_hex) {
$OUT[-1] .= sprintf "\t$hex_format\n", $value;
}
$OUT[-1] = merge_single_annotation_line(
$OUT[-1],
"# ["
- . main::clarify_number($end - $start + 1)
+ . main::clarify_code_point_count($end - $start + 1)
. "]\n",
$comment_indent);
}
}
if ($i != $start || $range_end < $end) {
- $annotation = sprintf "%04X..%04X",
- $i, $range_end;
+ if ($range_end < $MAX_WORKING_CODEPOINT)
+ {
+ $annotation = sprintf "%04X..%04X",
+ $i, $range_end;
+ }
+ else {
+ $annotation = sprintf "%04X..INFINITY",
+ $i;
+ }
}
else { # Indent if not displaying code points
$annotation = " " x 4;
# Include the number of code points in the
# range
my $count =
- main::clarify_number($range_end - $i + 1);
+ main::clarify_code_point_count($range_end - $i + 1);
$annotation .= " [$count]\n";
# Skip to the end of the range
}
}
+ # Add the beginning of the range that doesn't match the
+ # property, except if the just added match range extends
+ # to infinity. We do this after any annotations for the
+ # match range.
+ if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
+ push @OUT, $end + 1, "\n";
+ $invlist_count++;
+ }
+
# If we split the range, set up so the next time through
# we get the remainder, and redo.
if ($next_start) {
} # End of loop through all the table's ranges
push @OUT, @annotation; # Add orphaned annotation, if any
+
+ splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
}
# Add anything that goes after the main body, but within the here
Full_Name => $full_name,
_Property => $property,
_Range_List => $range_list,
+ Write_As_Invlist => 0,
%args);
my $addr = do { no overloading; pack 'J', $self; };
# but its format and even its name or existence are subject to change without
# notice in a future Perl version. Don't use it directly. Instead, its
# contents are now retrievable through a stable API in the Unicode::UCD
-# module: Unicode::UCD::prop_invmap('$property_name').
+# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
+# code points can be retrieved via Unicode::UCD::charprop());
END
}
return $return;
# Get a string version of $count with underscores in large numbers,
# for clarity.
- my $string_count = main::clarify_number($count);
+ my $string_count = main::clarify_code_point_count($count);
my $code_points = ($count == 1)
? 'single code point'
}
$comment .= "\nwhere 'cp' is $cp.";
if ($ucd_accessible_name) {
- $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
+ $comment .= " Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
}
# And append any commentary already set from the actual property.
}
}
- # I (khw) have never waded through this line to
- # understand it well enough to comment it.
+ # The unpack yields a list of the bytes that comprise the
+ # UTF-8 of $code_point, which are each placed in \xZZ format
+ # and output in the %s to map to $tostr, so the result looks
+ # like:
+ # "\xC4\xB0" => "\x{0069}\x{0307}",
my $utf8 = sprintf(qq["%s" => "$tostr",],
join("", map { sprintf "\\x%02X", $_ }
- unpack("U0C*", pack("U", $code_point))));
+ unpack("U0C*", chr $code_point)));
# Add a comment so that a human reader can more easily
# see what's going on.
_Property => $property,
_Range_List => $range_list,
Format => $EMPTY_FORMAT,
+ Write_As_Invlist => 1,
);
my $addr = do { no overloading; pack 'J', $self; };
# Get the number of code points matched by each of the tables in this
# file, and add underscores for clarity.
my $count = $leader->count;
- my $string_count = main::clarify_number($count);
+ my $unicode_count;
+ my $non_unicode_string;
+ if ($count > $MAX_UNICODE_CODEPOINTS) {
+ $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+ - $MAX_UNICODE_CODEPOINT);
+ $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
+ }
+ else {
+ $unicode_count = $count;
+ $non_unicode_string = "";
+ }
+ my $string_count = main::clarify_code_point_count($unicode_count);
my $loose_count = 0; # how many aliases loosely matched
my $compound_name = ""; # ? Are any names compound?, and if so, an
}
} # End of looping through all tables
+ $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
+
my $code_points;
my $match;
my $any_of_these;
- if ($count == 1) {
+ if ($unicode_count == 1) {
$match = 'matches';
$code_points = 'single code point';
}
if ($count) { # The format differs if no code points, and needs no
# explanation in that case
+ if ($leader->write_as_invlist) {
$comment.= <<END;
-The format of the lines of this file is:
+The first data line of this file begins with the letter V to indicate it is in
+inversion list format. The number following the V gives the number of lines
+remaining. Each of those remaining lines is a single number representing the
+starting code point of a range which goes up to but not including the number
+on the next line; The 0th, 2nd, 4th... ranges are for code points that match
+the property; the 1st, 3rd, 5th... are ranges of code points that don't match
+the property. The final line's range extends to the platform's infinity.
END
- $comment.= <<END;
+ }
+ else {
+ $comment.= <<END;
+The format of the lines of this file is:
START\\tSTOP\\twhere START is the starting code point of the range, in hex;
STOP is the ending point, or if omitted, the range has just one code point.
END
+ }
if ($leader->output_range_counts) {
$comment .= <<END;
Numbers in comments in [brackets] indicate how many code points are in the
package main;
- sub display_chr {
- # Converts an ordinal character value to a displayable string, using a
- # NBSP to hold combining characters.
- my $ord = shift;
- my $chr = chr $ord;
- return $chr if $ccc->table(0)->contains($ord);
- return chr(utf8::unicode_to_native(0xA0)) . $chr;
- }
+sub display_chr {
+    # Converts an ordinal printable character value to a displayable string,
+    # using a dotted circle to hold combining characters.
+
+    my $ord = shift;
+    my $chr = chr $ord;
+    # Canonical combining class 0 means it isn't a combining mark, so the
+    # bare character displays fine on its own.
+    return $chr if $ccc->table(0)->contains($ord);
+    # Otherwise prefix U+25CC DOTTED CIRCLE as a base for the mark to
+    # attach to.
+    return "\x{25CC}$chr";
+}
sub join_lines($) {
# Returns lines of the input joined together, so that they can be folded
{ # Closure
my $indent_increment = " " x (($debugging_build) ? 2 : 0);
- my %already_output;
+ %main::already_output = ();
$main::simple_dumper_nesting = 0;
# nesting level is localized, so that as the call stack pops, it goes
# back to the prior value.
local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
- undef %already_output if $main::simple_dumper_nesting == 0;
+ local %main::already_output = %main::already_output;
$main::simple_dumper_nesting++;
#print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
# Keep track of cycles in the input, and refuse to infinitely loop
my $addr = do { no overloading; pack 'J', $item; };
- if (defined $already_output{$addr}) {
+ if (defined $main::already_output{$addr}) {
return "${indent}ALREADY OUTPUT: $item\n";
}
- $already_output{$addr} = $item;
+ $main::already_output{$addr} = $item;
if (ref $item eq 'ARRAY') {
my $using_brackets;
my $object = shift;
my $fields_ref = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $object; };
'AL');
$lb->set_default_map($default);
}
+ }
- # If has the URS property, make sure that the standard aliases are in
- # it, since not in the input tables in some versions.
- my $urs = property_ref('Unicode_Radical_Stroke');
- if (defined $urs) {
- $urs->add_alias('cjkRSUnicode');
- $urs->add_alias('kRSUnicode');
- }
+ # If has the URS property, make sure that the standard aliases are in
+ # it, since not in the input tables in some versions.
+ my $urs = property_ref('Unicode_Radical_Stroke');
+ if (defined $urs) {
+ $urs->add_alias('cjkRSUnicode');
+ $urs->add_alias('kRSUnicode');
}
# For backwards compatibility with applications that may read the mapping
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
# Rationals require extra effort.
- register_fraction($fields[$NUMERIC])
- if $fields[$NUMERIC] =~ qr{/};
+ if ($fields[$NUMERIC] =~ qr{/}) {
+ reduce_fraction(\$fields[$NUMERIC]);
+ register_fraction($fields[$NUMERIC])
+ }
}
}
return;
}
+sub process_SB_test {
+    # Handler for the sentence-break test data file: accumulates each data
+    # line verbatim into @SB_tests, for later use in generating the \b{sb}
+    # tests.  (Presumably fed from auxiliary/SentenceBreakTest.txt -- TODO
+    # confirm against the file-table entry for this handler.)
+
+    my $file = shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    while ($file->next_line) {
+        push @SB_tests, $_;
+    }
+
+    return;
+}
+
+sub process_WB_test {
+    # Handler for the word-break test data file: accumulates each data line
+    # verbatim into @WB_tests, for later use in generating the \b{wb} tests.
+    # (Presumably fed from auxiliary/WordBreakTest.txt -- TODO confirm
+    # against the file-table entry for this handler.)
+
+    my $file = shift;
+    Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+    while ($file->next_line) {
+        push @WB_tests, $_;
+    }
+
+    return;
+}
+
sub process_NamedSequences {
# NamedSequences.txt entries are just added to an array. Because these
# don't look like the other tables, they have their own handler.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my @fields = split /\s*;\s*/;
- if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
- $fields[1] = 'I';
- }
- elsif ($fields[1] eq 'L') {
+
+ if ($fields[1] eq 'L') {
$fields[1] = 'C'; # L => C always
}
elsif ($fields[1] eq 'E') {
return;
}
+sub gcd($$) { # Greatest-common-divisor; from
+              # http://en.wikipedia.org/wiki/Euclidean_algorithm
+    # Takes two integers and returns their greatest common divisor, using
+    # the iterative Euclidean algorithm; gcd($n, 0) returns $n.  The caller
+    # visible here, reduce_fraction(), passes non-negative values.
+    my ($a, $b) = @_;
+
+    # Force integer semantics for '%' and the arithmetic below
+    use integer;
+
+    while ($b != 0) {
+        my $temp = $b;
+        $b = $a % $b;
+        $a = $temp;
+    }
+    return $a;
+}
+
+sub reduce_fraction($) {
+    my $fraction_ref = shift;
+
+    # Reduce a fraction to lowest terms. The Unicode data may be reducible,
+    # hence this is needed. The argument is a reference to the
+    # string denoting the fraction, which must be of the form: an optional
+    # minus sign, then digits, a solidus, then more digits (as matched by
+    # the pattern just below).  The string is reduced in place; nothing is
+    # returned.
+    if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
+        Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged");
+        return;
+    }
+
+    my $sign = $1;
+    my $numerator = $2;
+    my $denominator = $3;
+
+    use integer;
+
+    # Find greatest common divisor
+    my $gcd = gcd($numerator, $denominator);
+
+    # And reduce using the gcd.  A gcd of 1 means the fraction is already in
+    # lowest terms, and the string is left untouched.
+    if ($gcd != 1) {
+        $numerator /= $gcd;
+        $denominator /= $gcd;
+        $$fraction_ref = "$sign$numerator/$denominator";
+    }
+
+    return;
+}
+
sub filter_numeric_value_line {
# DNumValues contains lines of a different syntax than the typical
# property file:
$_ = "";
return;
}
+ reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
$rational = $fields[3];
+
$_ = join '; ', @fields[ 0, 3 ];
}
else {
# Change hyphens and blanks in the block name field only
$fields[1] =~ s/[ -]/_/g;
- $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
+ $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
$_ = join("; ", @fields);
return;
return;
}
+sub filter_all_caps_script_names {
+
+    # Some early Unicode releases had the script names in all CAPS. This
+    # converts them to just the first letter of each word being capital.
+    # Operates on the current input line in $_, rewriting it in place;
+    # 'CJK' is deliberately left fully capitalized.
+
+    my ($range, $script, @remainder)
+                = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
+    my @words = split "_", $script;
+    for my $word (@words) {
+        # $word aliases the element of @words, so this edits the array
+        $word =
+               ucfirst(lc($word)) if $word ne 'CJK';
+    }
+    $script = join "_", @words;
+    # NOTE(review): rejoining with a bare ";" drops whatever whitespace
+    # surrounded the separators on input -- presumably harmless; confirm
+    # downstream parsing re-splits with /\s*;\s*/
+    $_ = join ";", $range, $script, @remainder;
+}
+
sub finish_Unicode() {
# This routine should be called after all the Unicode files have been read
# in. It:
# This fills in any missing values with the default. It's not
# necessary to do this with binary properties, as the default
# is defined completely in terms of the Y table.
- $property->add_map(0, $MAX_UNICODE_CODEPOINT,
+ $property->add_map(0, $MAX_WORKING_CODEPOINT,
$default_map, Replace => $NO);
}
}
$gc->table('Ll')->set_caseless_equivalent($LC);
$gc->table('Lu')->set_caseless_equivalent($LC);
- my $Cs = $gc->table('Cs');
-
# Create digit and case fold tables with the original file names for
# backwards compatibility with applications that read them directly.
my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
return $Nl;
}
+sub calculate_Assigned() { # Calculate the gc != Cn code points; may be
+                           # called before the Cn's are completely filled.
+                           # Works on Unicodes earlier than ones that
+                           # explicitly specify Cn.
+    # Idempotent: a second call is a no-op once $Assigned has been created.
+    return if defined $Assigned;
+
+    if (! defined $gc || $gc->is_empty()) {
+        # NOTE(review): "$gc" interpolates the object's stringification into
+        # the message rather than a literal '$gc'; also execution falls
+        # through to the loop below even after this error -- confirm both
+        # are intended.
+        Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
+    }
+
+    # Assigned = every code point whose General_Category range is anything
+    # other than Cn/Unassigned, gathered range by range from $gc.
+    $Assigned = $perl->add_match_table('Assigned',
+                        Description => "All assigned code points",
+                        );
+    while (defined (my $range = $gc->each_range())) {
+        my $standard_value = standardize($range->value);
+        next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
+        $Assigned->add_range($range->start, $range->end);
+    }
+}
+
sub compile_perl() {
# Create perl-defined tables. Almost all are part of the pseudo-property
# named 'perl' internally to this program. Many of these are recommended
# just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
# to the full Unicode range, by our guesses as to what is appropriate.
- # 'Any' is all code points. As an error check, instead of just setting it
+ # 'All' is all code points. As an error check, instead of just setting it
# to be that, construct it to be the union of all the major categories
- $Any = $perl->add_match_table('Any',
- Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
- Matches_All => 1);
+ $All = $perl->add_match_table('All',
+ Description
+ => "All code points, including those above Unicode. Same as qr/./s",
+ Matches_All => 1);
foreach my $major_table ($gc->tables) {
# Major categories are the ones with single letter names.
next if length($major_table->name) != 1;
- $Any += $major_table;
+ $All += $major_table;
}
- if ($Any->max != $MAX_UNICODE_CODEPOINT) {
+ if ($All->max != $MAX_WORKING_CODEPOINT) {
Carp::my_carp_bug("Generated highest code point ("
- . sprintf("%X", $Any->max)
- . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
+ . sprintf("%X", $All->max)
+ . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
}
- if ($Any->range_count != 1 || $Any->min != 0) {
- Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
+ if ($All->range_count != 1 || $All->min != 0) {
+ Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
}
- $Any->add_alias('All');
+ my $Any = $perl->add_match_table('Any',
+ Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
+ );
+ $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
+ $Any->add_alias('Unicode');
- # Assigned is the opposite of gc=unassigned
- my $Assigned = $perl->add_match_table('Assigned',
- Description => "All assigned code points",
- Initialize => ~ $gc->table('Unassigned'),
- );
+ calculate_Assigned();
# Our internal-only property should be treated as more than just a
# synonym; grandfather it in to the pod.
# have Uppercase and Lowercase defined, so use the general category
# instead for them, modified by hard-coding in the code points each is
# missing.
- my $Lower = $perl->add_match_table('Lower');
+ my $Lower = $perl->add_match_table('XPosixLower');
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
$Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
# There are quite a few code points in Lower, that aren't in gc=lc,
# and not all are in all releases.
- foreach my $code_point ( utf8::unicode_to_native(0xAA),
- utf8::unicode_to_native(0xBA),
- 0x02B0 .. 0x02B8,
- 0x02C0 .. 0x02C1,
- 0x02E0 .. 0x02E4,
- 0x0345,
- 0x037A,
- 0x1D2C .. 0x1D6A,
- 0x1D78,
- 0x1D9B .. 0x1DBF,
- 0x2071,
- 0x207F,
- 0x2090 .. 0x209C,
- 0x2170 .. 0x217F,
- 0x24D0 .. 0x24E9,
- 0x2C7C .. 0x2C7D,
- 0xA770,
- 0xA7F8 .. 0xA7F9,
- ) {
- # Don't include the code point unless it is assigned in this
- # release
- my $category = $gc->value_of(hex $code_point);
- next if ! defined $category || $category eq 'Cn';
-
- $Lower += $code_point;
- }
+ my $temp = Range_List->new(Initialize => [
+ utf8::unicode_to_native(0xAA),
+ utf8::unicode_to_native(0xBA),
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ]);
+ $Lower += $temp & $Assigned;
}
- $Lower->add_alias('XPosixLower');
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Lower & $ASCII,
);
- my $Upper = $perl->add_match_table('Upper');
+ my $Upper = $perl->add_match_table("XPosixUpper");
my $Unicode_Upper = property_ref('Uppercase');
if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
$Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
$Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
- $Upper->add_alias('XPosixUpper');
my $Posix_Upper = $perl->add_match_table("PosixUpper",
Description => "[A-Z]",
Initialize => $Upper & $ASCII,
# one whose name generally begins with Posix that is posix-compliant, and
# one that matches Unicode characters beyond the Posix, ASCII range
- my $Alpha = $perl->add_match_table('Alpha');
+ my $Alpha = $perl->add_match_table('XPosixAlpha');
# Alphabetic was not present in early releases
my $Alphabetic = property_ref('Alphabetic');
$Alpha->add_description('Alphabetic');
$Alpha->add_alias('Alphabetic');
}
- $Alpha->add_alias('XPosixAlpha');
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
Description => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
$Posix_Upper->set_caseless_equivalent($Posix_Alpha);
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
- my $Alnum = $perl->add_match_table('Alnum',
+ my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
- $Alnum->add_alias('XPosixAlnum');
$perl->add_match_table("PosixAlnum",
Description => "[A-Za-z0-9]",
Initialize => $Alnum & $ASCII,
);
- my $Word = $perl->add_match_table('Word',
+ my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
Description => '\w, including beyond ASCII;'
. ' = \p{Alnum} + \pM + \p{Pc}',
Initialize => $Alnum + $gc->table('Mark'),
);
- $Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
if (defined $Pc) {
$Word += $Pc;
}
# This is a Perl extension, so the name doesn't begin with Posix.
- my $PerlWord = $perl->add_match_table('PerlWord',
+ my $PerlWord = $perl->add_match_table('PosixWord',
Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
- $PerlWord->add_alias('PosixWord');
+ $PerlWord->add_alias('PerlWord');
- my $Blank = $perl->add_match_table('Blank',
+ my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
Description => '\h, Horizontal white space',
# 200B is Zero Width Space which is for line
- 0x200B, # ZWSP
);
$Blank->add_alias('HorizSpace'); # Another name for it.
- $Blank->add_alias('XPosixBlank');
$perl->add_match_table("PosixBlank",
Description => "\\t and ' '",
Initialize => $Blank & $ASCII,
);
# No Posix equivalent for vertical space
- my $Space = $perl->add_match_table('Space',
+ my $Space = $perl->add_match_table('XPosixSpace',
Description => '\s including beyond ASCII and vertical tab',
Initialize => $Blank + $VertSpace,
);
- $Space->add_alias('XPosixSpace');
- my $posix_space = $perl->add_match_table("PosixSpace",
+ $Space->add_alias('XPerlSpace'); # Pre-existing synonyms
+ $Space->add_alias('SpacePerl');
+
+ my $Posix_space = $perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
+ $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
- # Perl's traditional space doesn't include Vertical Tab prior to v5.18
- my $XPerlSpace = $perl->add_match_table('XPerlSpace',
- Description => '\s, including beyond ASCII',
- Initialize => $Space,
- #Initialize => $Space
- # - utf8::unicode_to_native(0x0B]
- );
- $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
- my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
- Initialize => $XPerlSpace & $ASCII,
- );
-
-
- my $Cntrl = $perl->add_match_table('Cntrl',
+ my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
- $Cntrl->add_alias('XPosixCntrl');
$perl->add_match_table("PosixCntrl",
Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
Initialize => $Cntrl & $ASCII,
$controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
- my $Graph = $perl->add_match_table('Graph',
+ my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
Description => 'Characters that are graphical',
Initialize => ~ ($Space + $controls),
);
- $Graph->add_alias('XPosixGraph');
$perl->add_match_table("PosixGraph",
Description =>
'[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
Initialize => $Graph & $ASCII,
);
- $print = $perl->add_match_table('Print',
+ $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
- $print->add_alias('XPosixPrint');
$perl->add_match_table("PosixPrint",
Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $ASCII & $XPosixPunct,
);
- my $Digit = $perl->add_match_table('Digit',
+ my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
- $Digit->add_alias('XPosixDigit');
my $PosixDigit = $perl->add_match_table("PosixDigit",
Description => '[0-9]',
Initialize => $Digit & $ASCII,
);
# Hex_Digit was not present in first release
- my $Xdigit = $perl->add_match_table('XDigit');
- $Xdigit->add_alias('XPosixXDigit');
+ my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
my $Hex = property_ref('Hex_Digit');
if (defined $Hex && ! $Hex->is_empty) {
$Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
Description => "Code points that particpate in some fold",
);
- #
- foreach my $range (property_ref('Case_Folding')->ranges) {
- $any_folds->add_range($range->start, $range->end);
- foreach my $hex_code_point (split " ", $range->value) {
- my $code_point = hex $hex_code_point;
- $any_folds->add_range($code_point, $code_point);
+ my $loc_problem_folds = $perl->add_match_table(
+ "_Perl_Problematic_Locale_Folds",
+ Description =>
+ "Code points that are in some way problematic under locale",
+ );
+
+ # This allows regexec.c to skip some work when appropriate. Some of the
+ # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
+ my $loc_problem_folds_start = $perl->add_match_table(
+ "_Perl_Problematic_Locale_Foldeds_Start",
+ Description =>
+ "The first character of every sequence in _Perl_Problematic_Locale_Folds",
+ );
+
+ my $cf = property_ref('Case_Folding');
+
+ # Every character 0-255 is problematic because what each folds to depends
+ # on the current locale
+ $loc_problem_folds->add_range(0, 255);
+ $loc_problem_folds_start += $loc_problem_folds;
+
+ # Also problematic are anything these fold to outside the range. Likely
+ # forever the only thing folded to by these outside the 0-255 range is the
+ # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
+ # completely general, which should catch any unexpected changes or errors.
+ # We look at each code point 0-255, and add its fold (including each part
+ # of a multi-char fold) to the list. See commit message
+ # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
+ # of the MU issue.
+ foreach my $range ($loc_problem_folds->ranges) {
+ foreach my $code_point($range->start .. $range->end) {
+ my $fold_range = $cf->containing_range($code_point);
+ next unless defined $fold_range;
+
+ my @hex_folds = split " ", $fold_range->value;
+ my $start_cp = hex $hex_folds[0];
+ foreach my $i (0 .. @hex_folds - 1) {
+ my $cp = hex $hex_folds[$i];
+ next unless $cp > 255; # Already have the < 256 ones
+
+ $loc_problem_folds->add_range($cp, $cp);
+ $loc_problem_folds_start->add_range($start_cp, $start_cp);
+ }
+ }
+ }
+
+ my $folds_to_multi_char = $perl->add_match_table(
+ "_Perl_Folds_To_Multi_Char",
+ Description =>
+ "Code points whose fold is a string of more than one character",
+ );
+ if ($v_version lt v3.0.1) {
+ push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
+ }
+
+ # Look through all the known folds to populate these tables.
+ foreach my $range ($cf->ranges) {
+ my $start = $range->start;
+ my $end = $range->end;
+ $any_folds->add_range($start, $end);
+
+ my @hex_folds = split " ", $range->value;
+ if (@hex_folds > 1) { # Is multi-char fold
+ $folds_to_multi_char->add_range($start, $end);
+ }
+
+ my $found_locale_problematic = 0;
+
+ # Look at each of the folded-to characters...
+ foreach my $i (0 .. @hex_folds - 1) {
+ my $cp = hex $hex_folds[$i];
+ $any_folds->add_range($cp, $cp);
+
+ # The fold is problematic if any of the folded-to characters is
+ # already considered problematic.
+ if ($loc_problem_folds->contains($cp)) {
+ $loc_problem_folds->add_range($start, $end);
+ $found_locale_problematic = 1;
+ }
+ }
+
+ # If this is a problematic fold, add the folding-from characters and
+ # the first folded-to character to the start chars.
+ if ($found_locale_problematic) {
+ $loc_problem_folds_start->add_range($start, $end);
+ my $cp = hex $hex_folds[0];
+ $loc_problem_folds_start->add_range($cp, $cp);
}
}
+ utf8::unicode_to_native(0xA0) # NBSP
);
- # These two tables are for matching \X, which is based on the 'extended'
- # grapheme cluster, which came in 5.1; create empty ones if not already
- # present. The straight 'grapheme cluster' (non-extended) is used prior
- # to 5.1, and differs from the extended (see
- # http://www.unicode.org/reports/tr29/) only by these two tables, so we
- # get the older definition automatically when they are empty.
- my $gcb = property_ref('Grapheme_Cluster_Break');
- my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
- $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
- }
- else {
- push @tables_that_may_be_empty, $perl_prepend->complete_name;
- }
-
- # All the tables with _X_ in their names are used in defining \X handling,
- # and are based on the Unicode GCB property. Basically, \X matches:
- # CR LF
- # | Prepend* Begin Extend*
- # | .
- # Begin is: ( Special_Begin | ! Control )
- # Begin is also: ( Regular_Begin | Special_Begin )
- # where Regular_Begin is defined as ( ! Control - Special_Begin )
- # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- # Extend is: ( Grapheme_Extend | Spacing_Mark )
- # Control is: [ GCB_Control | CR | LF ]
- # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- foreach my $gcb_name (qw{ L V T LV LVT }) {
-
- # The perl internal extension's name is the gcb table name prepended
- # with an '_X_'
- my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table($gcb_name),
- );
- # Version 1 had mostly different Hangul syllables that were removed
- # from later versions, so some of the tables may not apply.
- if ($v_version lt v2.0) {
- push @tables_that_may_be_empty, $perl_table->complete_name;
- }
- }
-
- # More GCB. Populate a combined hangul syllables table
- my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
-
- my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if ($v_version ge v6.2) {
- $ri += $gcb->table('RI');
- }
- else {
- push @tables_that_may_be_empty, $ri->full_name;
- }
-
- my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $lv_lvt_v
- + $gcb->table('L')
- + $gcb->table('T')
- + $ri
- );
- $specials_begin->add_comment(join_lines( <<END
-For use in \\X; matches first (perhaps only) character of potential
-multi-character sequences that can begin an extended grapheme cluster. They
-need special handling because of their complicated nature.
-END
- ));
- my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => ~ $gcb->table('Control')
- - $specials_begin
- - $gcb->table('CR')
- - $gcb->table('LF')
- );
- $regular_begin->add_comment(join_lines( <<END
-For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster, except those that require special handling.
-END
- ));
-
- my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table('Extend')
- );
- if (defined (my $sm = $gcb->table('SpacingMark'))) {
- $extend += $sm;
- }
- $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
-
- # End of GCB \X processing
-
my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
if (@named_sequences) {
$unassigned_sans_noncharacters &= $nonchars->table('N');
}
- for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
+ for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
$i = populate_char_info($i); # Note sets $i so may cause skips
+
}
}
# Join all the file path components together, using slashes.
my $full_filename = join('/', @$directory_ref, $file);
- # All go in the same subdirectory of unicore
- if ($directory_ref->[0] ne $matches_directory) {
+ # All go in the same subdirectory of unicore, or the special
+ # pseudo-directory '#'
+ if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
Carp::my_carp("Unexpected directory in "
. join('/', @{$directory_ref}, $file));
}
my $full_name = $property->full_name;
my $count = $input_table->count;
- my $string_count = clarify_number($count);
+ my $unicode_count;
+ my $non_unicode_string;
+ if ($count > $MAX_UNICODE_CODEPOINTS) {
+ $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+ - $MAX_UNICODE_CODEPOINT);
+ $non_unicode_string = " plus all above-Unicode code points";
+ }
+ else {
+ $unicode_count = $count;
+ $non_unicode_string = "";
+ }
+ my $string_count = clarify_number($unicode_count) . $non_unicode_string;
my $status = $input_table->status;
my $status_info = $input_table->status_info;
my $caseless_equivalent = $input_table->caseless_equivalent;
# property always use the single form.
if ($table_property == $perl) {
$entry = "\\p{$name}";
+ $entry .= " \\p$name" if length $name == 1; # Show non-braced
+ # form too
$entry_ref = "\\p{$name}";
}
else { # Compound form.
}
}
- # Ouput both short and single in the same parenthesized
+ # Output both short and single in the same parenthesized
# expression, but with only one of 'Single', 'Short' if there
# are both items.
if ($short_name || $single_form || $table->conflicting) {
# The first few character columns are filler, plus the '\p{'; and get rid
# of all the trailing stuff, starting with the trailing '}', so as to sort
# on just 'Name=Value'
- (my $a = lc $a) =~ s/^ .*? { //x;
+ (my $a = lc $a) =~ s/^ .*? \{ //x;
$a =~ s/}.*//;
- (my $b = lc $b) =~ s/^ .*? { //x;
+ (my $b = lc $b) =~ s/^ .*? \{ //x;
$b =~ s/}.*//;
# Determine if the two operands are both internal only or both not.
column is the longest name, the right column will show any equivalent shortest
name, in both single and compound forms if applicable.
+If braces are not needed to specify a property (e.g., C<\\pL>), the left
+column contains both forms, with and without braces.
+
The right column will also caution you if a property means something different
than what might normally be expected.
=head1 Properties accessible through Unicode::UCD
-All the Unicode character properties mentioned above (except for those marked
-as for internal use by Perl) are also accessible by
-L<Unicode::UCD/prop_invlist()>.
+The value of any Unicode (not including Perl extensions) character
+property mentioned above for any single code point is available through
+L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the
+values of all the Unicode properties for a given code point.
+
+Besides these, all the Unicode character properties mentioned above
+(except for those marked as for internal use by Perl) are also
+accessible by L<Unicode::UCD/prop_invlist()>.
Due to their nature, not all Unicode character properties are suitable for
regular expression matches, nor C<prop_invlist()>. The remaining
C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
+Also, perl can be recompiled to operate on an earlier version of the Unicode
+standard. Further information is at
+C<\$Config{privlib}>/F<unicore/README.perl>.
+
=head1 Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version
my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
chomp $stricter_to_file_of;
+ my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
+ chomp $inline_definitions;
+
my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
chomp $loose_to_file_of;
# for the alternate table figured out at that time.
foreach my $cased (keys %caseless_equivalent_to) {
my @path = $caseless_equivalent_to{$cased}->file_path;
- my $path = join '/', @path[1, -1];
+ my $path;
+ if ($path[0] eq "#") { # Pseudo-directory '#'
+ $path = join '/', @path;
+ }
+ else { # Gets rid of lib/
+ $path = join '/', @path[1, -1];
+ }
$caseless_equivalent_to{$cased} = $path;
}
my $caseless_equivalent_to
$loose_property_name_of
);
-# Maps property, table to file for those using stricter matching
+# Gives the definitions (in the form of inversion lists) for those properties
+# whose definitions aren't kept in files
+\@utf8::inline_definitions = (
+$inline_definitions
+);
+
+# Maps property, table to file for those using stricter matching. For paths
+# whose directory is '#', the file is in the form of a numeric index into
+# \@inline_definitions
\%utf8::stricter_to_file_of = (
$stricter_to_file_of
);
-# Maps property, table to file for those using loose matching
+# Maps property, table to file for those using loose matching. For paths
+# whose directory is '#', the file is in the form of a numeric index into
+# \@inline_definitions
\%utf8::loose_to_file_of = (
$loose_to_file_of
);
# don't care if both defined, as they had better be different anyway.)
# The property named 'Perl' needs to be first (it doesn't have any
# immutable file name) because empty properties are defined in terms of
- # it's table named 'Any'.) We also sort by the property's name. This is
- # just for repeatability of the outputs between runs of this program, but
- # does not affect correctness.
+ # its table named 'All' under the -annotate option.) We also sort by
+ # the property's name. This is just for repeatability of the outputs
+ # between runs of this program, but does not affect correctness.
PROPERTY:
foreach my $property ($perl,
sort { return -1 if defined $a->file;
push @unhandled_properties, "$table";
}
- # An empty table is just the complement of everything.
- $table->set_complement($Any) if $table != $property;
+ # The old way of expressing an empty match list was to
+ # complement the list that matches everything. The new way is
+ # to create an empty inversion list, but this doesn't work for
+ # annotating, so use the old way then.
+ $table->set_complement($All) if $annotate
+ && $table != $property;
}
elsif ($expected_empty) {
my $because = "";
my $count = $table->count;
if ($expected_full) {
- if ($count != $MAX_UNICODE_CODEPOINTS) {
+ if ($count != $MAX_WORKING_CODEPOINTS) {
Carp::my_carp("$table matches only "
. clarify_number($count)
. " Unicode code points but should match "
- . clarify_number($MAX_UNICODE_CODEPOINTS)
+ . clarify_number($MAX_WORKING_CODEPOINTS)
. " (off by "
- . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
+ . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
. "). Proceeding anyway.");
}
}
}
elsif ($count == $MAX_UNICODE_CODEPOINTS
+ && $name ne "Any"
&& ($table == $property || $table->leader == $table)
&& $table->property->status ne $NORMAL)
{
# Similarly, we create for Unicode::UCD a list of
# property-value aliases.
- my $property_full_name = $property->full_name;
-
# Look at each table in the property...
foreach my $table ($property->tables) {
my @values_list;
}
# To save memory, unlike the similar list for property
- # aliases above, only the standard forms hve the list.
+ # aliases above, only the standard forms have the list.
# This forces an extra step of converting from input
# name to standard name, but the savings are
# considerable. (There is only marginal savings if we
my $filename;
my $property = $table->property;
my $is_property = ($table == $property);
- if (! $is_property) {
+ # For very short tables, instead of writing them out to actual files,
+ # we in-line their inversion list definitions into Heavy.pl. The
+ # definition replaces the file name, and the special pseudo-directory
+ # '#' is used to signal this. This significantly cuts down the number
+ # of files written at little extra cost to the hashes in Heavy.pl.
+ # And it means no run-time files need to be read to get the definitions.
+ if (! $is_property
+ && ! $annotate # For annotation, we want to explicitly show
+ # everything, so keep in files
+ && $table->ranges <= 3)
+ {
+ my @ranges = $table->ranges;
+ my $count = @ranges;
+ if ($count == 0) { # 0th index reserved for 0-length lists
+ $filename = 0;
+ }
+ elsif ($table->leader != $table) {
+
+ # Here, is a table that is equivalent to another; code
+ # in register_file_for_name() causes its leader's definition
+ # to be used
+
+ next;
+ }
+ else { # No equivalent table so far.
+
+ # Build up its definition range-by-range.
+ my $definition = "";
+ while (defined (my $range = shift @ranges)) {
+ my $end = $range->end;
+ if ($end < $MAX_WORKING_CODEPOINT) {
+ $count++;
+ $end = "\n" . ($end + 1);
+ }
+ else { # Extends to infinity, hence no 'end'
+ $end = "";
+ }
+ $definition .= "\n" . $range->start . $end;
+ }
+ $definition = "V$count" . $definition;
+ $filename = @inline_definitions;
+ push @inline_definitions, $definition;
+ }
+ @directory = "#";
+ register_file_for_name($table, \@directory, $filename);
+ next;
+ }
+
+ if (! $is_property) {
# Match tables for the property go in lib/$subdirectory, which is
# the property's name. Don't use the standard file name for this,
# as may get an unfamiliar alias
&write($t_path,
0, # Not utf8;
- [<DATA>,
+ [$HEADER,
+ <DATA>,
@output,
- (map {"Test_X('$_');\n"} @backslash_X_tests),
- "Finished();\n"]);
+ (map {"Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {"Test_SB('$_');\n"} @SB_tests),
+ (map {"Test_WB('$_');\n"} @WB_tests),
+ "Finished();\n"
+ ]);
+
return;
}
),
Input_file->new('Scripts.txt', v3.1.0,
Property => 'Script',
+ Each_Line_Handler => (($v_version le v4.0.0)
+ ? \&filter_all_caps_script_names
+ : undef),
Has_Missings_Defaults => $NOT_IGNORED,
),
Input_file->new('DNormalizationProps.txt', v3.1.0,
Skip => 'Validation Tests',
),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
+ Handler => \&process_SB_test,
),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
+ Handler => \&process_WB_test,
),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
Input_file->new('IndicMatraCategory.txt', v6.1.0,
Property => 'Indic_Matra_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
+ Skip => "Withdrawn by Unicode while still provisional",
),
Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
Property => 'Indic_Syllabic_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
+ Skip => (($v_version lt v8.0.0)
+ ? "Provisional; for the analysis and processing of Indic scripts"
+ : 0),
),
Input_file->new('BidiBrackets.txt', v6.3.0,
Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
Input_file->new("BidiCharacterTest.txt", v6.3.0,
Skip => 'Validation Tests',
),
+ Input_file->new('IndicPositionalCategory.txt', v8.0.0,
+ Property => 'Indic_Positional_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
);
# End of all the preliminaries.
use strict;
use warnings;
-# If run outside the normal test suite on an ASCII platform, you can
-# just create a latin1_to_native() function that just returns its
-# inputs, because that's the only function used from test.pl
-require "test.pl";
-
# Test qr/\X/ and the \p{} regular expression constructs. This file is
# constructed by mktables from the tables it generates, so if mktables is
# buggy, this won't necessarily catch those bugs. Tests are generated for all
my $Tests = 0;
my $Fails = 0;
+# loc_tools.pl requires this function to be defined
+sub ok($@) {
+ my ($pass, @msg) = @_;
+ print "not " unless $pass;
+ print "ok ";
+ print ++$Tests;
+ print " - ", join "", @msg if @msg;
+ print "\n";
+}
+
sub Expect($$$$) {
my $expected = shift;
my $ord = shift;
return;
}
-# GCBTest.txt character that separates grapheme clusters
+# Break test files (e.g. GCBTest.txt) character that indicates a break is
+# allowed here
my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
utf8::upgrade($breakable_utf8);
-# GCBTest.txt character that indicates that the adjoining code points are part
-# of the same grapheme cluster
+# Break test files (e.g. GCBTest.txt) character that indicates a break isn't
+# allowed here
my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
utf8::upgrade($nobreak_utf8);
-sub Test_X($) {
+use Config;
+my $utf8_locale;
+chdir 't' if -d 't';
+eval { require "./loc_tools.pl" };
+$utf8_locale = &find_utf8_ctype_locale if defined &find_utf8_ctype_locale;
+
+sub _test_break($$) {
# Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
# Each such line is a sequence of code points given by their hex numbers,
# separated by the two characters defined just before this subroutine that
# Each \X should match the next cluster; and that is what is checked.
my $template = shift;
+ my $break_type = shift;
- my $line = (caller)[2];
+ my $line = (caller 1)[2]; # Line number
# The line contains characters above the ASCII range, but in Latin1. It
# may or may not be in utf8, and if it is, it may or may not know it. So,
$template =~ s/$breakable_utf8/$breakable/g;
}
- # Get rid of the leading and trailing breakables
- $template =~ s/^ \s* $breakable \s* //x;
- $template =~ s/ \s* $breakable \s* $ //x;
+ # The input is just the break/no-break symbols and sequences of Unicode
+ # code points as hex digits separated by spaces for legibility. e.g.:
+ # ÷ 0020 × 0308 ÷ 0020 ÷
+ # Convert to native \x format
+ $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
+ $template =~ s/ \s* //gx; # Probably the line above removed all spaces;
+ # but be sure
- # And no-breaks become just a space.
- $template =~ s/ \s* $nobreak \s* / /xg;
+ # Make a copy of the input with the symbols replaced by \b{} and \B{} as
+ # appropriate
+ my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
+ $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
- # Split the input into segments that are breakable between them.
- my @segments = split /\s*$breakable\s*/, $template;
+ my $display_string = $template =~ s/[$breakable$nobreak]//gr;
+ my $string = eval "\"$display_string\"";
- my $string = "";
- my $display_string = "";
- my @should_match;
- my @should_display;
+ # The remaining massaging of the input is for the \X tests. Get rid of
+ # the leading and trailing breakables
+ $template =~ s/^ \s* $breakable \s* //x;
+ $template =~ s/ \s* $breakable \s* $ //x;
- # Convert the code point sequence in each segment into a Perl string of
- # characters
- foreach my $segment (@segments) {
- my @code_points = split /\s+/, $segment;
- my $this_string = "";
- my $this_display = "";
- foreach my $code_point (@code_points) {
- $this_string .= latin1_to_native(chr(hex $code_point));
- $this_display .= "\\x{$code_point}";
- }
+ # Delete no-breaks
+ $template =~ s/ \s* $nobreak \s* //xg;
- # The next cluster should match the string in this segment.
- push @should_match, $this_string;
- push @should_display, $this_display;
- $string .= $this_string;
- $display_string .= $this_display;
- }
+ # Split the input into segments that are breakable between them.
+ my @should_display = split /\s*$breakable\s*/, $template;
+ my @should_match = map { eval "\"$_\"" } @should_display;
# If a string can be represented in both non-ut8 and utf8, test both cases
+ my $display_upgrade = "";
UPGRADE:
for my $to_upgrade (0 .. 1) {
next UPGRADE if utf8::is_utf8($string);
utf8::upgrade($string);
+ $display_upgrade = " (utf8-upgraded)";
+ }
+
+ # A /l modifier is immediately followed by the name of the locale to set
+ # for that test (so "lC" below means /l under the C locale)
+ my @modifiers = qw(a aa d lC u i);
+ push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+ # Test for each of the regex modifiers.
+ for my $modifier (@modifiers) {
+ my $display_locale = "";
+
+ # For /l, set the locale to what it says to.
+ if ($modifier =~ / ^ l (.*) /x) {
+ my $locale = $1;
+ $display_locale = "(locale = $locale)";
+ use Config;
+ if (defined $Config{d_setlocale}) {
+ eval { require POSIX; import POSIX 'locale_h'; };
+ if (defined &POSIX::LC_CTYPE) {
+ POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
+ }
+ }
+ $modifier = 'l';
+ }
+
+ no warnings qw(locale regexp surrogate);
+ my $pattern = "(?$modifier:$break_pattern)";
+
+ # Actually do the test
+ my $matched = $string =~ qr/$pattern/;
+ print "not " unless $matched;
+
+ # Fancy display of test results
+ $matched = ($matched) ? "matched" : "failed to match";
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+
+ # Repeat with the pattern truncated so that (after the modifiers) it
+ # begins at its first \B{}. This makes sure the code in
+ # regexec.c:find_byclass() for \B gets executed
+ if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
+ my $B_pattern = "$1$2";
+ $matched = $string =~ qr/$B_pattern/;
+ print "not " unless $matched;
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+ }
}
+ next if $break_type ne 'gcb';
+
# Finally, do the \X match.
my @matches = $string =~ /(\X)/g;
print " correctly matched $should_display[$i]; line $line\n";
} else {
$matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
- unpack("U*", $matches[$i]));
+ split "", $matches[$i]);
print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
$i + 1,
" should have matched $should_display[$i]",
return;
}
+sub Test_GCB($) {
+ _test_break(shift, 'gcb');
+}
+
+sub Test_SB($) {
+ _test_break(shift, 'sb');
+}
+
+sub Test_WB($) {
+ _test_break(shift, 'wb');
+}
+
sub Finished() {
print "1..$Tests\n";
exit($Fails ? -1 : 0);
}
Error('\p{Script=InGreek}'); # Bug #69018
-Test_X("1100 $nobreak 1161"); # Bug #70940
+Test_GCB("1100 $nobreak 1161"); # Bug #70940
Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726