# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
# Without this change, there are casing problems for this character.
#
+# Search for $string_compare_versions to see how to compare changes to
+# properties between Unicode versions
+#
##############################################################################
my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so it's
# existence is not noted in the comment.
- 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+ 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
'Indic_Matra_Category' => "Provisional",
'Indic_Syllabic_Category' => "Provisional",
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
- 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
- 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
- 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
- FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
+ FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
);
foreach my $property (
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
+# We work with above-Unicode code points, up to UV_MAX. But when you get
+# that high, above IV_MAX, some operations don't work, and you can easily get
+# overflow. Therefore for internal use, we use a much smaller number,
+# translating it to UV_MAX only for output. The exact number is immaterial
+# (all Unicode code points are treated exactly the same), but the algorithm
+# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
+my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
+my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
+my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
+
+my $MAX_PLATFORM_CODEPOINT = ~0;
+
# Matches legal code point. 4-6 hex numbers, If there are 6, the first
# two must be 10; if there are 5, the first must not be a 0. Written this way
# to decrease backtracking. The first regex allows the code point to be at
my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
my $AUXILIARY = 'auxiliary';
-# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
-# and into UCD.pl for the use of UCD.pm
+# Hashes and arrays that will eventually go into Heavy.pl for the use of
+# utf8_heavy.pl and into UCD.pl for the use of UCD.pm
my %loose_to_file_of; # loosely maps table names to their respective
# files
my %stricter_to_file_of; # same; but for stricter mapping.
my %loose_property_to_file_of; # Maps a loose property name to its map file
+my @inline_definitions = "V0"; # Each element gives a definition of a unique
+ # inversion list. When a definition is inlined,
+ # its value in the hash it's in (one of the two
+ # defined just above) will include an index into
+ # this array. The 0th element is initialized to
+ # the definition for a zero length invwersion list
my %file_to_swash_name; # Maps the file name to its corresponding key name
# in the hash %utf8::SwashInfo
my %nv_floating_to_rational; # maps numeric values floating point numbers to
my $PRIVATE_USE_TYPE = -3;
my $NONCHARACTER_TYPE = -4;
my $CONTROL_TYPE = -5;
-my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
+my $ABOVE_UNICODE_TYPE = -6;
+my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program
sub populate_char_info ($) {
# Used only with the $annotate option. Populates the arrays with the
my $end;
if (! $viacode[$i]) {
my $nonchar;
- if ($gc-> table('Private_use')->contains($i)) {
+ if ($i > $MAX_UNICODE_CODEPOINT) {
+ $viacode[$i] = 'Above-Unicode';
+ $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
+ $printable[$i] = 0;
+ $end = $MAX_WORKING_CODEPOINT;
+ }
+ elsif ($gc-> table('Private_use')->contains($i)) {
$viacode[$i] = 'Private Use';
$annotate_char_type[$i] = $PRIVATE_USE_TYPE;
$printable[$i] = 0;
# This is like clarify_number(), but the input is assumed to be a count of
# code points, rather than a generic number.
- return clarify_number(shift);
+ my $append = "";
+
+ my $number = shift;
+ if ($number > $MAX_UNICODE_CODEPOINTS) {
+ $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
+ return "All above-Unicode code points" if $number == 0;
+ $append = " + all above-Unicode code points";
+ }
+ return clarify_number($number) . $append;
}
package Carp;
{ # Closure
# This program uses the inside-out method for objects, as recommended in
- # "Perl Best Practices". This closure aids in generating those. There
+ # "Perl Best Practices". (This is the best solution still, since this has
+ # to run under miniperl.) This closure aids in generating those. There
# are two routines. setup_package() is called once per package to set
# things up, and then set_access() is called for each hash representing a
# field in the object. These routines arrange for the object to be
$handle{$addr} = $file_handle; # Cache the open file handle
if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
- $_ = <$file_handle>;
- if ($_ !~ / - $string_version \. /x) {
- chomp;
- $_ =~ s/^#\s*//;
- die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ if ($file !~ /^Unihan/i) {
+ $_ = <$file_handle>;
+ if ($_ !~ / - $string_version \. /x) {
+ chomp;
+ $_ =~ s/^#\s*//;
+ die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ }
+ }
+ else {
+ while (<$file_handle>) {
+ if ($_ !~ /^#/) {
+ Carp::my_carp_bug("Could not find the expected version info in file '$file'");
+ last;
+ }
+ chomp;
+ $_ =~ s/^#\s*//;
+ next if $_ !~ / version: /x;
+ last if $_ =~ /$string_version/;
+ die Carp::my_carp("File '$file' is '$_'. It should be version $string_version");
+ }
}
}
}
# If the range list is empty, return a large value that isn't adjacent
# to any that could be in the range list, for simpler tests
- return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
+ return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
return $ranges{$addr}->[0]->start;
}
Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
return;
}
- if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
- Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
- }
#local $to_trace = 1 if main::DEBUG;
if ($operation eq '-') {
# And finally, add the gap from the end of the table to the max
# possible code point
- if ($max < $MAX_UNICODE_CODEPOINT) {
- $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
+ if ($max < $MAX_WORKING_CODEPOINT) {
+ $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
}
return $new;
}
# range.
my $end = $set->end;
return $end if is_code_point_usable($end, $try_hard);
+ $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
# End point didn't, work. Start at the beginning and try
# every one until find one that does work.
# The constructor can override the global flag of the same name.
main::set_access('output_range_counts', \%output_range_counts, 'r');
+ my %write_as_invlist;
+ # A boolean set iff the output file for this table is to be in the form of
+ # an inversion list/map.
+ main::set_access('write_as_invlist', \%write_as_invlist, 'r');
+
my %format;
# The format of the entries of the table. This is calculated from the
# data in the table (or passed in the constructor). This is an enum e.g.,
$range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
$caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
$fate{$addr} = delete $args{'Fate'} || $ORDINARY;
+ $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
my $ucd = delete $args{'UCD'};
my $description = delete $args{'Description'};
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $self; };
+ my $write_as_invlist = $write_as_invlist{$addr};
# Start with the header
my @HEADER = $self->header;
else {
my $range_size_1 = $range_size_1{$addr};
+ # To make it more readable, use a minimum indentation
+ my $comment_indent;
+
# These are used only in $annotate option
my $format; # e.g. $HEX_ADJUST_FORMAT
my $include_name; # ? Include the character's name in the
# annotation?
my $include_cp; # ? Include its code point
- # To make it more readable, use a minimum indentation
- my $comment_indent = 16;
-
- if ($annotate) {
+ if (! $annotate) {
+ $comment_indent = ($self->isa('Map_Table'))
+ ? 24
+ : ($write_as_invlist)
+ ? 8
+ : 16;
+ }
+ else {
$format = $self->format;
# The name of the character is output only for tables that
# the first column
$include_cp = ! $range_size_1;
- if ($self->isa('Map_Table')) {
+ if (! $self->isa('Map_Table')) {
+ $comment_indent = ($write_as_invlist) ? 8 : 16;
+ }
+ else {
+ $comment_indent = 16;
# There are just a few short ranges in this table, so no
# need to include the code point in the annotation.
my $next_end;
my $next_value;
my $offset = 0;
+ my $invlist_count = 0;
my $output_value_in_hex = $self->isa('Map_Table')
&& ($self->format eq $HEX_ADJUST_FORMAT
$previous_value = $value;
}
- # If there is a range
- if ($start != $end) {
- push @OUT, sprintf "$hex_format\t$hex_format",
- $start, $end;
- if ($value ne "") {
+ if ($write_as_invlist) {
+
+ # Inversion list format has a single number per line,
+ # the starting code point of a range that matches the
+ # property
+ push @OUT, $start, "\n";
+ $invlist_count++;
+
+ # Add a comment with the size of the range, if
+ # requested.
+ if ($output_range_counts{$addr}) {
+ $OUT[-1] = merge_single_annotation_line(
+ $OUT[-1],
+ "# ["
+ . main::clarify_code_point_count($end - $start + 1)
+ . "]\n",
+ $comment_indent);
+ }
+ }
+ elsif ($start != $end) { # If there is a range
+ if ($end == $MAX_WORKING_CODEPOINT) {
+ push @OUT, sprintf "$hex_format\t$hex_format",
+ $start,
+ $MAX_PLATFORM_CODEPOINT;
+ }
+ else {
+ push @OUT, sprintf "$hex_format\t$hex_format",
+ $start, $end;
+ }
+ if (length $value) {
if ($convert_map_to_from_hex) {
$OUT[-1] .= sprintf "\t$hex_format\n", $value;
}
}
if ($i != $start || $range_end < $end) {
- $annotation = sprintf "%04X..%04X",
- $i, $range_end;
+ if ($range_end < $MAX_WORKING_CODEPOINT)
+ {
+ $annotation = sprintf "%04X..%04X",
+ $i, $range_end;
+ }
+ else {
+ $annotation = sprintf "%04X..INFINITY",
+ $i;
+ }
}
else { # Indent if not displaying code points
$annotation = " " x 4;
}
}
+ # Add the beginning of the range that doesn't match the
+ # property, except if the just added match range extends
+ # to infinity. We do this after any annotations for the
+ # match range.
+ if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
+ push @OUT, $end + 1, "\n";
+ $invlist_count++;
+ }
+
# If we split the range, set up so the next time through
# we get the remainder, and redo.
if ($next_start) {
} # End of loop through all the table's ranges
push @OUT, @annotation; # Add orphaned annotation, if any
+
+ splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
}
# Add anything that goes after the main body, but within the here
Full_Name => $full_name,
_Property => $property,
_Range_List => $range_list,
+ Write_As_Invlist => 0,
%args);
my $addr = do { no overloading; pack 'J', $self; };
# but its format and even its name or existence are subject to change without
# notice in a future Perl version. Don't use it directly. Instead, its
# contents are now retrievable through a stable API in the Unicode::UCD
-# module: Unicode::UCD::prop_invmap('$property_name').
+# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
+# code points can be retrieved via Unicode::UCD::charprop());
END
}
return $return;
}
$comment .= "\nwhere 'cp' is $cp.";
if ($ucd_accessible_name) {
- $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
+ $comment .= " Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
}
# And append any commentary already set from the actual property.
}
}
- # I (khw) have never waded through this line to
- # understand it well enough to comment it.
+ # The unpack yields a list of the bytes that comprise the
+ # UTF-8 of $code_point, which are each placed in \xZZ format
+ # and output in the %s to map to $tostr, so the result looks
+ # like:
+ # "\xC4\xB0" => "\x{0069}\x{0307}",
my $utf8 = sprintf(qq["%s" => "$tostr",],
join("", map { sprintf "\\x%02X", $_ }
- unpack("U0C*", pack("U", $code_point))));
+ unpack("U0C*", chr $code_point)));
# Add a comment so that a human reader can more easily
# see what's going on.
_Property => $property,
_Range_List => $range_list,
Format => $EMPTY_FORMAT,
+ Write_As_Invlist => 1,
);
my $addr = do { no overloading; pack 'J', $self; };
# Get the number of code points matched by each of the tables in this
# file, and add underscores for clarity.
my $count = $leader->count;
- my $string_count = main::clarify_code_point_count($count);
+ my $unicode_count;
+ my $non_unicode_string;
+ if ($count > $MAX_UNICODE_CODEPOINTS) {
+ $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+ - $MAX_UNICODE_CODEPOINT);
+ $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
+ }
+ else {
+ $unicode_count = $count;
+ $non_unicode_string = "";
+ }
+ my $string_count = main::clarify_code_point_count($unicode_count);
my $loose_count = 0; # how many aliases loosely matched
my $compound_name = ""; # ? Are any names compound?, and if so, an
}
} # End of looping through all tables
+ $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
+
my $code_points;
my $match;
my $any_of_these;
- if ($count == 1) {
+ if ($unicode_count == 1) {
$match = 'matches';
$code_points = 'single code point';
}
if ($count) { # The format differs if no code points, and needs no
# explanation in that case
+ if ($leader->write_as_invlist) {
$comment.= <<END;
-The format of the lines of this file is:
+The first data line of this file begins with the letter V to indicate it is in
+inversion list format. The number following the V gives the number of lines
+remaining. Each of those remaining lines is a single number representing the
+starting code point of a range which goes up to but not including the number
+on the next line; The 0th, 2nd, 4th... ranges are for code points that match
+the property; the 1st, 3rd, 5th... are ranges of code points that don't match
+the property. The final line's range extends to the platform's infinity.
END
- $comment.= <<END;
+ }
+ else {
+ $comment.= <<END;
+The format of the lines of this file is:
START\\tSTOP\\twhere START is the starting code point of the range, in hex;
STOP is the ending point, or if omitted, the range has just one code point.
END
+ }
if ($leader->output_range_counts) {
$comment .= <<END;
Numbers in comments in [brackets] indicate how many code points are in the
{ # Closure
my $indent_increment = " " x (($debugging_build) ? 2 : 0);
- my %already_output;
+ %main::already_output = ();
$main::simple_dumper_nesting = 0;
# nesting level is localized, so that as the call stack pops, it goes
# back to the prior value.
local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
- undef %already_output if $main::simple_dumper_nesting == 0;
+ local %main::already_output = %main::already_output;
$main::simple_dumper_nesting++;
#print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
# Keep track of cycles in the input, and refuse to infinitely loop
my $addr = do { no overloading; pack 'J', $item; };
- if (defined $already_output{$addr}) {
+ if (defined $main::already_output{$addr}) {
return "${indent}ALREADY OUTPUT: $item\n";
}
- $already_output{$addr} = $item;
+ $main::already_output{$addr} = $item;
if (ref $item eq 'ARRAY') {
my $using_brackets;
# This fills in any missing values with the default. It's not
# necessary to do this with binary properties, as the default
# is defined completely in terms of the Y table.
- $property->add_map(0, $MAX_UNICODE_CODEPOINT,
+ $property->add_map(0, $MAX_WORKING_CODEPOINT,
$default_map, Replace => $NO);
}
}
# 'All' is all code points. As an error check, instead of just setting it
# to be that, construct it to be the union of all the major categories
$All = $perl->add_match_table('All',
- Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
- Matches_All => 1);
+ Description
+ => "All code points, including those above Unicode. Same as qr/./s",
+ Matches_All => 1);
foreach my $major_table ($gc->tables) {
$All += $major_table;
}
- if ($All->max != $MAX_UNICODE_CODEPOINT) {
+ if ($All->max != $MAX_WORKING_CODEPOINT) {
Carp::my_carp_bug("Generated highest code point ("
. sprintf("%X", $All->max)
- . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
+ . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
}
if ($All->range_count != 1 || $All->min != 0) {
Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
# have Uppercase and Lowercase defined, so use the general category
# instead for them, modified by hard-coding in the code points each is
# missing.
- my $Lower = $perl->add_match_table('Lower');
+ my $Lower = $perl->add_match_table('XPosixLower');
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
$Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
$Lower += $code_point;
}
}
- $Lower->add_alias('XPosixLower');
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Lower & $ASCII,
);
- my $Upper = $perl->add_match_table('Upper');
+ my $Upper = $perl->add_match_table("XPosixUpper");
my $Unicode_Upper = property_ref('Uppercase');
if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
$Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
$Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
- $Upper->add_alias('XPosixUpper');
my $Posix_Upper = $perl->add_match_table("PosixUpper",
Description => "[A-Z]",
Initialize => $Upper & $ASCII,
# one whose name generally begins with Posix that is posix-compliant, and
# one that matches Unicode characters beyond the Posix, ASCII range
- my $Alpha = $perl->add_match_table('Alpha');
+ my $Alpha = $perl->add_match_table('XPosixAlpha');
# Alphabetic was not present in early releases
my $Alphabetic = property_ref('Alphabetic');
$Alpha->add_description('Alphabetic');
$Alpha->add_alias('Alphabetic');
}
- $Alpha->add_alias('XPosixAlpha');
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
Description => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
$Posix_Upper->set_caseless_equivalent($Posix_Alpha);
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
- my $Alnum = $perl->add_match_table('Alnum',
+ my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
- $Alnum->add_alias('XPosixAlnum');
$perl->add_match_table("PosixAlnum",
Description => "[A-Za-z0-9]",
Initialize => $Alnum & $ASCII,
);
- my $Word = $perl->add_match_table('Word',
+ my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
Description => '\w, including beyond ASCII;'
. ' = \p{Alnum} + \pM + \p{Pc}',
Initialize => $Alnum + $gc->table('Mark'),
);
- $Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
if (defined $Pc) {
$Word += $Pc;
}
# This is a Perl extension, so the name doesn't begin with Posix.
- my $PerlWord = $perl->add_match_table('PerlWord',
+ my $PerlWord = $perl->add_match_table('PosixWord',
Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
- $PerlWord->add_alias('PosixWord');
+ $PerlWord->add_alias('PerlWord');
- my $Blank = $perl->add_match_table('Blank',
+ my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
Description => '\h, Horizontal white space',
# 200B is Zero Width Space which is for line
- 0x200B, # ZWSP
);
$Blank->add_alias('HorizSpace'); # Another name for it.
- $Blank->add_alias('XPosixBlank');
$perl->add_match_table("PosixBlank",
Description => "\\t and ' '",
Initialize => $Blank & $ASCII,
);
# No Posix equivalent for vertical space
- my $Space = $perl->add_match_table('Space',
+ my $Space = $perl->add_match_table('XPosixSpace',
Description => '\s including beyond ASCII and vertical tab',
Initialize => $Blank + $VertSpace,
);
- $Space->add_alias('XPosixSpace');
- my $posix_space = $perl->add_match_table("PosixSpace",
+ $Space->add_alias('XPerlSpace'); # Pre-existing synonyms
+ $Space->add_alias('SpacePerl');
+
+ my $Posix_space = $perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
+ $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
- # Perl's traditional space doesn't include Vertical Tab prior to v5.18
- my $XPerlSpace = $perl->add_match_table('XPerlSpace',
- Description => '\s, including beyond ASCII',
- Initialize => $Space,
- #Initialize => $Space
- # - utf8::unicode_to_native(0x0B]
- );
- $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
- my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
- Initialize => $XPerlSpace & $ASCII,
- );
-
-
- my $Cntrl = $perl->add_match_table('Cntrl',
+ my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
- $Cntrl->add_alias('XPosixCntrl');
$perl->add_match_table("PosixCntrl",
Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
Initialize => $Cntrl & $ASCII,
$controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
- my $Graph = $perl->add_match_table('Graph',
+ my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
Description => 'Characters that are graphical',
Initialize => ~ ($Space + $controls),
);
- $Graph->add_alias('XPosixGraph');
$perl->add_match_table("PosixGraph",
Description =>
'[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
Initialize => $Graph & $ASCII,
);
- $print = $perl->add_match_table('Print',
+ $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
- $print->add_alias('XPosixPrint');
$perl->add_match_table("PosixPrint",
Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $ASCII & $XPosixPunct,
);
- my $Digit = $perl->add_match_table('Digit',
+ my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
- $Digit->add_alias('XPosixDigit');
my $PosixDigit = $perl->add_match_table("PosixDigit",
Description => '[0-9]',
Initialize => $Digit & $ASCII,
);
# Hex_Digit was not present in first release
- my $Xdigit = $perl->add_match_table('XDigit');
- $Xdigit->add_alias('XPosixXDigit');
+ my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
my $Hex = property_ref('Hex_Digit');
if (defined $Hex && ! $Hex->is_empty) {
$Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
Description => "Code points that particpate in some fold",
);
- #
- foreach my $range (property_ref('Case_Folding')->ranges) {
- $any_folds->add_range($range->start, $range->end);
- foreach my $hex_code_point (split " ", $range->value) {
- my $code_point = hex $hex_code_point;
- $any_folds->add_range($code_point, $code_point);
+ my $loc_problem_folds = $perl->add_match_table(
+ "_Perl_Problematic_Locale_Folds",
+ Description =>
+ "Code points that are in some way problematic under locale",
+ );
+
+ # This allows regexec.c to skip some work when appropriate. Some of the
+ # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
+ my $loc_problem_folds_start = $perl->add_match_table(
+ "_Perl_Problematic_Locale_Foldeds_Start",
+ Description =>
+ "The first character of every sequence in _Perl_Problematic_Locale_Folds",
+ );
+
+ my $cf = property_ref('Case_Folding');
+
+ # Every character 0-255 is problematic because what each folds to depends
+ # on the current locale
+ $loc_problem_folds->add_range(0, 255);
+ $loc_problem_folds_start += $loc_problem_folds;
+
+ # Also problematic are anything these fold to outside the range. Likely
+ # forever the only thing folded to by these outside the 0-255 range is the
+ # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
+ # completely general, which should catch any unexpected changes or errors.
+ # We look at each code point 0-255, and add its fold (including each part
+ # of a multi-char fold) to the list. See commit message
+ # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
+ # of the MU issue.
+ foreach my $range ($loc_problem_folds->ranges) {
+ foreach my $code_point($range->start .. $range->end) {
+ my $fold_range = $cf->containing_range($code_point);
+ next unless defined $fold_range;
+
+ my @hex_folds = split " ", $fold_range->value;
+ my $start_cp = hex $hex_folds[0];
+ foreach my $i (0 .. @hex_folds - 1) {
+ my $cp = hex $hex_folds[$i];
+ next unless $cp > 255; # Already have the < 256 ones
+
+ $loc_problem_folds->add_range($cp, $cp);
+ $loc_problem_folds_start->add_range($start_cp, $start_cp);
+ }
+ }
+ }
+
+ my $folds_to_multi_char = $perl->add_match_table(
+ "_Perl_Folds_To_Multi_Char",
+ Description =>
+ "Code points whose fold is a string of more than one character",
+ );
+
+ # Look through all the known folds to populate these tables.
+ foreach my $range ($cf->ranges) {
+ my $start = $range->start;
+ my $end = $range->end;
+ $any_folds->add_range($start, $end);
+
+ my @hex_folds = split " ", $range->value;
+ if (@hex_folds > 1) { # Is multi-char fold
+ $folds_to_multi_char->add_range($start, $end);
+ }
+
+ my $found_locale_problematic = 0;
+
+ # Look at each of the folded-to characters...
+ foreach my $i (0 .. @hex_folds - 1) {
+ my $cp = hex $hex_folds[$i];
+ $any_folds->add_range($cp, $cp);
+
+ # The fold is problematic if any of the folded-to characters is
+ # already considered problematic.
+ if ($loc_problem_folds->contains($cp)) {
+ $loc_problem_folds->add_range($start, $end);
+ $found_locale_problematic = 1;
+ }
+ }
+
+ # If this is a problematic fold, add to the start chars the
+ # folding-from characters and first folded-to character.
+ if ($found_locale_problematic) {
+ $loc_problem_folds_start->add_range($start, $end);
+ my $cp = hex $hex_folds[0];
+ $loc_problem_folds_start->add_range($cp, $cp);
}
}
$unassigned_sans_noncharacters &= $nonchars->table('N');
}
- for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
+ for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
$i = populate_char_info($i); # Note sets $i so may cause skips
+
}
}
# Join all the file path components together, using slashes.
my $full_filename = join('/', @$directory_ref, $file);
- # All go in the same subdirectory of unicore
- if ($directory_ref->[0] ne $matches_directory) {
+ # All go in the same subdirectory of unicore, or the special
+ # pseudo-directory '#'
+ if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
Carp::my_carp("Unexpected directory in "
. join('/', @{$directory_ref}, $file));
}
my $full_name = $property->full_name;
my $count = $input_table->count;
- my $string_count = clarify_number($count);
+ my $unicode_count;
+ my $non_unicode_string;
+ if ($count > $MAX_UNICODE_CODEPOINTS) {
+ $unicode_count = $count - ($MAX_WORKING_CODEPOINT
+ - $MAX_UNICODE_CODEPOINT);
+ $non_unicode_string = " plus all above-Unicode code points";
+ }
+ else {
+ $unicode_count = $count;
+ $non_unicode_string = "";
+ }
+ my $string_count = clarify_number($unicode_count) . $non_unicode_string;
my $status = $input_table->status;
my $status_info = $input_table->status_info;
my $caseless_equivalent = $input_table->caseless_equivalent;
# property always use the single form.
if ($table_property == $perl) {
$entry = "\\p{$name}";
+ $entry .= " \\p$name" if length $name == 1; # Show non-braced
+ # form too
$entry_ref = "\\p{$name}";
}
else { # Compound form.
}
}
- # Ouput both short and single in the same parenthesized
+ # Output both short and single in the same parenthesized
# expression, but with only one of 'Single', 'Short' if there
# are both items.
if ($short_name || $single_form || $table->conflicting) {
column is the longest name, the right column will show any equivalent shortest
name, in both single and compound forms if applicable.
+If braces are not needed to specify a property (e.g., C<\\pL>), the left
+column contains both forms, with and without braces.
+
The right column will also caution you if a property means something different
than what might normally be expected.
=head1 Properties accessible through Unicode::UCD
-All the Unicode character properties mentioned above (except for those marked
-as for internal use by Perl) are also accessible by
-L<Unicode::UCD/prop_invlist()>.
+The value of any Unicode (not including Perl extensions) character
+property mentioned above for any single code point is available through
+L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the
+values of all the Unicode properties for a given code point.
+
+Besides these, all the Unicode character properties mentioned above
+(except for those marked as for internal use by Perl) are also
+accessible by L<Unicode::UCD/prop_invlist()>.
Due to their nature, not all Unicode character properties are suitable for
regular expression matches, nor C<prop_invlist()>. The remaining
my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
chomp $stricter_to_file_of;
+ my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
+ chomp $inline_definitions;
+
my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
chomp $loose_to_file_of;
# for the alternate table figured out at that time.
foreach my $cased (keys %caseless_equivalent_to) {
my @path = $caseless_equivalent_to{$cased}->file_path;
- my $path = join '/', @path[1, -1];
+ my $path;
+ if ($path[0] eq "#") { # Pseudo-directory '#'
+ $path = join '/', @path;
+ }
+ else { # Gets rid of lib/
+ $path = join '/', @path[1, -1];
+ }
$caseless_equivalent_to{$cased} = $path;
}
my $caseless_equivalent_to
$loose_property_name_of
);
-# Maps property, table to file for those using stricter matching
+# Gives the definitions (in the form of inversion lists) for those properties
+# whose definitions aren't kept in files
+\@utf8::inline_definitions = (
+$inline_definitions
+);
+
+# Maps property, table to file for those using stricter matching. For paths
+# whose directory is '#', the file is in the form of a numeric index into
+# \@inline_definitions
\%utf8::stricter_to_file_of = (
$stricter_to_file_of
);
-# Maps property, table to file for those using loose matching
+# Maps property, table to file for those using loose matching. For paths
+# whose directory is '#', the file is in the form of a numeric index into
+# \@inline_definitions
\%utf8::loose_to_file_of = (
$loose_to_file_of
);
# don't care if both defined, as they had better be different anyway.)
# The property named 'Perl' needs to be first (it doesn't have any
# immutable file name) because empty properties are defined in terms of
- # it's table named 'All'.) We also sort by the property's name. This is
- # just for repeatability of the outputs between runs of this program, but
- # does not affect correctness.
+ # its table named 'All' under the -annotate option.) We also sort by
+ # the property's name. This is just for repeatability of the outputs
+ # between runs of this program, but does not affect correctness.
PROPERTY:
foreach my $property ($perl,
sort { return -1 if defined $a->file;
push @unhandled_properties, "$table";
}
- # An empty table is just the complement of everything.
- $table->set_complement($All) if $table != $property;
+ # The old way of expressing an empty match list was to
+ # complement the list that matches everything. The new way is
+ # to create an empty inversion list, but this doesn't work for
+ # annotating, so use the old way then.
+ $table->set_complement($All) if $annotate
+ && $table != $property;
}
elsif ($expected_empty) {
my $because = "";
my $count = $table->count;
if ($expected_full) {
- if ($count != $MAX_UNICODE_CODEPOINTS) {
+ if ($count != $MAX_WORKING_CODEPOINTS) {
Carp::my_carp("$table matches only "
. clarify_number($count)
. " Unicode code points but should match "
- . clarify_number($MAX_UNICODE_CODEPOINTS)
+ . clarify_number($MAX_WORKING_CODEPOINTS)
. " (off by "
- . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
+ . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
. "). Proceeding anyway.");
}
# Similarly, we create for Unicode::UCD a list of
# property-value aliases.
- my $property_full_name = $property->full_name;
-
# Look at each table in the property...
foreach my $table ($property->tables) {
my @values_list;
}
# To save memory, unlike the similar list for property
- # aliases above, only the standard forms hve the list.
+ # aliases above, only the standard forms have the list.
# This forces an extra step of converting from input
# name to standard name, but the savings are
# considerable. (There is only marginal savings if we
my $filename;
my $property = $table->property;
my $is_property = ($table == $property);
- if (! $is_property) {
+ # For very short tables, instead of writing them out to actual files,
+ # we in-line their inversion list definitions into Heavy.pl. The
+ # definition replaces the file name, and the special pseudo-directory
+ # '#' is used to signal this. This significantly cuts down the number
+ # of files written at little extra cost to the hashes in Heavy.pl.
+ # And it means, no run-time files to read to get the definitions.
+ if (! $is_property
+ && ! $annotate # For annotation, we want to explicitly show
+ # everything, so keep in files
+ && $table->ranges <= 3)
+ {
+ my @ranges = $table->ranges;
+ my $count = @ranges;
+ if ($count == 0) { # 0th index reserved for 0-length lists
+ $filename = 0;
+ }
+ elsif ($table->leader != $table) {
+
+ # Here, is a table that is equivalent to another; code
+ # in register_file_for_name() causes its leader's definition
+ # to be used
+
+ next;
+ }
+ else { # No equivalent table so far.
+
+ # Build up its definition range-by-range.
+ my $definition = "";
+ while (defined (my $range = shift @ranges)) {
+ my $end = $range->end;
+ if ($end < $MAX_WORKING_CODEPOINT) {
+ $count++;
+ $end = "\n" . ($end + 1);
+ }
+ else { # Extends to infinity, hence no 'end'
+ $end = "";
+ }
+ $definition .= "\n" . $range->start . $end;
+ }
+ $definition = "V$count" . $definition;
+ $filename = @inline_definitions;
+ push @inline_definitions, $definition;
+ }
+ @directory = "#";
+ register_file_for_name($table, \@directory, $filename);
+ next;
+ }
+
+ if (! $is_property) {
# Match tables for the property go in lib/$subdirectory, which is
# the property's name. Don't use the standard file name for this,
# as may get an unfamiliar alias
&write($t_path,
0, # Not utf8;
- [<DATA>,
+ [$HEADER,
+ <DATA>,
@output,
(map {"Test_X('$_');\n"} @backslash_X_tests),
- "Finished();\n"]);
+ "Finished();\n"
+ ]);
return;
}
use strict;
use warnings;
-# If run outside the normal test suite on an ASCII platform, you can
-# just create a latin1_to_native() function that just returns its
-# inputs, because that's the only function used from test.pl
-require "test.pl";
-
# Test qr/\X/ and the \p{} regular expression constructs. This file is
# constructed by mktables from the tables it generates, so if mktables is
# buggy, this won't necessarily catch those bugs. Tests are generated for all
my $this_string = "";
my $this_display = "";
foreach my $code_point (@code_points) {
- $this_string .= latin1_to_native(chr(hex $code_point));
+ $this_string .= chr utf8::unicode_to_native(hex $code_point);
$this_display .= "\\x{$code_point}";
}