X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/731cb813cb160baa0c7969c70e0864d4da074738..40416981c6fd7d3e5668285c2762ac534f4d1e06:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index f3d5c83..4a16d83 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -528,6 +528,9 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F # Without this change, there are casing problems for this character. # +# Search for $string_compare_versions to see how to compare changes to +# properties between Unicode versions +# ############################################################################## my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing @@ -953,7 +956,7 @@ my %why_obsolete; # Documentation only # contains the same information, but without the algorithmically # determinable Hangul syllables'. This file is not published, so it's # existence is not noted in the comment. - 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()', + 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::', 'Indic_Matra_Category' => "Provisional", 'Indic_Syllabic_Category' => "Provisional", @@ -962,14 +965,14 @@ my %why_obsolete; # Documentation only # to differentiate between it and gc=c, which can be written as 'isc', # which is the same characters as ISO_Comment's short name. - 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()", + 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::", - 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()", - 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", - 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", - 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()", + 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD", + 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", + 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", + 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD", - FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful', + FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful', ); foreach my $property ( @@ -1200,6 +1203,18 @@ my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF"; my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; +# We work with above-Unicode code points, up to UV_MAX. But when you get +# that high, above IV_MAX, some operations don't work, and you can easily get +# overflow. Therefore for internal use, we use a much smaller number, +# translating it to UV_MAX only for output. The exact number is immaterial +# (all Unicode code points are treated exactly the same), but the algorithm +# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1; +my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; +my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; +my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); + +my $MAX_PLATFORM_CODEPOINT = ~0; + # Matches legal code point. 4-6 hex numbers, If there are 6, the first # two must be 10; if there are 5, the first must not be a 0. Written this way # to decrease backtracking. The first regex allows the code point to be at @@ -1353,12 +1368,18 @@ my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : ""; my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : ""; my $AUXILIARY = 'auxiliary'; -# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl -# and into UCD.pl for the use of UCD.pm +# Hashes and arrays that will eventually go into Heavy.pl for the use of +# utf8_heavy.pl and into UCD.pl for the use of UCD.pm my %loose_to_file_of; # loosely maps table names to their respective # files my %stricter_to_file_of; # same; but for stricter mapping. my %loose_property_to_file_of; # Maps a loose property name to its map file +my @inline_definitions = "V0"; # Each element gives a definition of a unique + # inversion list. When a definition is inlined, + # its value in the hash it's in (one of the two + # defined just above) will include an index into + # this array. The 0th element is initialized to + # the definition for a zero length invwersion list my %file_to_swash_name; # Maps the file name to its corresponding key name # in the hash %utf8::SwashInfo my %nv_floating_to_rational; # maps numeric values floating point numbers to @@ -1531,7 +1552,8 @@ my $UNASSIGNED_TYPE = -2; my $PRIVATE_USE_TYPE = -3; my $NONCHARACTER_TYPE = -4; my $CONTROL_TYPE = -5; -my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program +my $ABOVE_UNICODE_TYPE = -6; +my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program sub populate_char_info ($) { # Used only with the $annotate option. Populates the arrays with the @@ -1562,7 +1584,13 @@ sub populate_char_info ($) { my $end; if (! $viacode[$i]) { my $nonchar; - if ($gc-> table('Private_use')->contains($i)) { + if ($i > $MAX_UNICODE_CODEPOINT) { + $viacode[$i] = 'Above-Unicode'; + $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE; + $printable[$i] = 0; + $end = $MAX_WORKING_CODEPOINT; + } + elsif ($gc-> table('Private_use')->contains($i)) { $viacode[$i] = 'Private Use'; $annotate_char_type[$i] = $PRIVATE_USE_TYPE; $printable[$i] = 0; @@ -1715,7 +1743,15 @@ sub clarify_code_point_count ($) { # This is like clarify_number(), but the input is assumed to be a count of # code points, rather than a generic number. - return clarify_number(shift); + my $append = ""; + + my $number = shift; + if ($number > $MAX_UNICODE_CODEPOINTS) { + $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS); + return "All above-Unicode code points" if $number == 0; + $append = " + all above-Unicode code points"; + } + return clarify_number($number) . $append; } package Carp; @@ -1824,7 +1860,8 @@ package main; { # Closure # This program uses the inside-out method for objects, as recommended in - # "Perl Best Practices". This closure aids in generating those. There + # "Perl Best Practices". (This is the best solution still, since this has + # to run under miniperl.) This closure aids in generating those. There # are two routines. setup_package() is called once per package to set # things up, and then set_access() is called for each hash representing a # field in the object. These routines arrange for the object to be @@ -2476,11 +2513,26 @@ END $handle{$addr} = $file_handle; # Cache the open file handle if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') { - $_ = <$file_handle>; - if ($_ !~ / - $string_version \. /x) { - chomp; - $_ =~ s/^#\s*//; - die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version"); + if ($file !~ /^Unihan/i) { + $_ = <$file_handle>; + if ($_ !~ / - $string_version \. /x) { + chomp; + $_ =~ s/^#\s*//; + die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version"); + } + } + else { + while (<$file_handle>) { + if ($_ !~ /^#/) { + Carp::my_carp_bug("Could not find the expected version info in file '$file'"); + last; + } + chomp; + $_ =~ s/^#\s*//; + next if $_ !~ / version: /x; + last if $_ =~ /$string_version/; + die Carp::my_carp("File '$file' is '$_'. It should be version $string_version"); + } } } } @@ -3450,7 +3502,7 @@ sub trace { return main::trace(@_); } # If the range list is empty, return a large value that isn't adjacent # to any that could be in the range list, for simpler tests - return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; + return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}}; return $ranges{$addr}->[0]->start; } @@ -3729,9 +3781,6 @@ sub trace { return main::trace(@_); } Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken."); return; } - if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') { - Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway"); - } #local $to_trace = 1 if main::DEBUG; if ($operation eq '-') { @@ -4529,8 +4578,8 @@ sub trace { return main::trace(@_); } # And finally, add the gap from the end of the table to the max # possible code point - if ($max < $MAX_UNICODE_CODEPOINT) { - $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT); + if ($max < $MAX_WORKING_CODEPOINT) { + $new->add_range($max + 1, $MAX_WORKING_CODEPOINT); } return $new; } @@ -4819,6 +4868,7 @@ sub trace { return main::trace(@_); } # range. my $end = $set->end; return $end if is_code_point_usable($end, $try_hard); + $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT; # End point didn't, work. Start at the beginning and try # every one until find one that does work. @@ -4994,6 +5044,11 @@ sub trace { return main::trace(@_); } # The constructor can override the global flag of the same name. main::set_access('output_range_counts', \%output_range_counts, 'r'); + my %write_as_invlist; + # A boolean set iff the output file for this table is to be in the form of + # an inversion list/map. + main::set_access('write_as_invlist', \%write_as_invlist, 'r'); + my %format; # The format of the entries of the table. This is calculated from the # data in the table (or passed in the constructor). This is an enum e.g., @@ -5030,6 +5085,7 @@ sub trace { return main::trace(@_); } $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0; $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0; $fate{$addr} = delete $args{'Fate'} || $ORDINARY; + $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default my $ucd = delete $args{'UCD'}; my $description = delete $args{'Description'}; @@ -5547,6 +5603,7 @@ END Carp::carp_extra_args(\@_) if main::DEBUG && @_; my $addr = do { no overloading; pack 'J', $self; }; + my $write_as_invlist = $write_as_invlist{$addr}; # Start with the header my @HEADER = $self->header; @@ -5599,16 +5656,23 @@ END else { my $range_size_1 = $range_size_1{$addr}; + # To make it more readable, use a minimum indentation + my $comment_indent; + # These are used only in $annotate option my $format; # e.g. $HEX_ADJUST_FORMAT my $include_name; # ? Include the character's name in the # annotation? my $include_cp; # ? Include its code point - # To make it more readable, use a minimum indentation - my $comment_indent = 16; - - if ($annotate) { + if (! $annotate) { + $comment_indent = ($self->isa('Map_Table')) + ? 24 + : ($write_as_invlist) + ? 8 + : 16; + } + else { $format = $self->format; # The name of the character is output only for tables that @@ -5626,7 +5690,11 @@ END # the first column $include_cp = ! $range_size_1; - if ($self->isa('Map_Table')) { + if (! $self->isa('Map_Table')) { + $comment_indent = ($write_as_invlist) ? 8 : 16; + } + else { + $comment_indent = 16; # There are just a few short ranges in this table, so no # need to include the code point in the annotation. @@ -5722,6 +5790,7 @@ END my $next_end; my $next_value; my $offset = 0; + my $invlist_count = 0; my $output_value_in_hex = $self->isa('Map_Table') && ($self->format eq $HEX_ADJUST_FORMAT @@ -5853,11 +5922,36 @@ END $previous_value = $value; } - # If there is a range - if ($start != $end) { - push @OUT, sprintf "$hex_format\t$hex_format", - $start, $end; - if ($value ne "") { + if ($write_as_invlist) { + + # Inversion list format has a single number per line, + # the starting code point of a range that matches the + # property + push @OUT, $start, "\n"; + $invlist_count++; + + # Add a comment with the size of the range, if + # requested. + if ($output_range_counts{$addr}) { + $OUT[-1] = merge_single_annotation_line( + $OUT[-1], + "# [" + . main::clarify_code_point_count($end - $start + 1) + . "]\n", + $comment_indent); + } + } + elsif ($start != $end) { # If there is a range + if ($end == $MAX_WORKING_CODEPOINT) { + push @OUT, sprintf "$hex_format\t$hex_format", + $start, + $MAX_PLATFORM_CODEPOINT; + } + else { + push @OUT, sprintf "$hex_format\t$hex_format", + $start, $end; + } + if (length $value) { if ($convert_map_to_from_hex) { $OUT[-1] .= sprintf "\t$hex_format\n", $value; } @@ -5958,8 +6052,15 @@ END } if ($i != $start || $range_end < $end) { - $annotation = sprintf "%04X..%04X", - $i, $range_end; + if ($range_end < $MAX_WORKING_CODEPOINT) + { + $annotation = sprintf "%04X..%04X", + $i, $range_end; + } + else { + $annotation = sprintf "%04X..INFINITY", + $i; + } } else { # Indent if not displaying code points $annotation = " " x 4; @@ -6076,6 +6177,15 @@ END } } + # Add the beginning of the range that doesn't match the + # property, except if the just added match range extends + # to infinity. We do this after any annotations for the + # match range. + if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) { + push @OUT, $end + 1, "\n"; + $invlist_count++; + } + # If we split the range, set up so the next time through # we get the remainder, and redo. if ($next_start) { @@ -6089,6 +6199,8 @@ END } # End of loop through all the table's ranges push @OUT, @annotation; # Add orphaned annotation, if any + + splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count; } # Add anything that goes after the main body, but within the here @@ -6368,6 +6480,7 @@ sub trace { return main::trace(@_); } Full_Name => $full_name, _Property => $property, _Range_List => $range_list, + Write_As_Invlist => 0, %args); my $addr = do { no overloading; pack 'J', $self; }; @@ -6614,7 +6727,8 @@ sub trace { return main::trace(@_); } # but its format and even its name or existence are subject to change without # notice in a future Perl version. Don't use it directly. Instead, its # contents are now retrievable through a stable API in the Unicode::UCD -# module: Unicode::UCD::prop_invmap('$property_name'). +# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual +# code points can be retrieved via Unicode::UCD::charprop()); END } return $return; @@ -6741,7 +6855,7 @@ END } $comment .= "\nwhere 'cp' is $cp."; if ($ucd_accessible_name) { - $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD"; + $comment .= " Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD"; } # And append any commentary already set from the actual property. @@ -6896,11 +7010,14 @@ END } } - # I (khw) have never waded through this line to - # understand it well enough to comment it. + # The unpack yields a list of the bytes that comprise the + # UTF-8 of $code_point, which are each placed in \xZZ format + # and output in the %s to map to $tostr, so the result looks + # like: + # "\xC4\xB0" => "\x{0069}\x{0307}", my $utf8 = sprintf(qq["%s" => "$tostr",], join("", map { sprintf "\\x%02X", $_ } - unpack("U0C*", pack("U", $code_point)))); + unpack("U0C*", chr $code_point))); # Add a comment so that a human reader can more easily # see what's going on. @@ -7290,6 +7407,7 @@ sub trace { return main::trace(@_); } _Property => $property, _Range_List => $range_list, Format => $EMPTY_FORMAT, + Write_As_Invlist => 1, ); my $addr = do { no overloading; pack 'J', $self; }; @@ -7696,7 +7814,18 @@ END # Get the number of code points matched by each of the tables in this # file, and add underscores for clarity. my $count = $leader->count; - my $string_count = main::clarify_code_point_count($count); + my $unicode_count; + my $non_unicode_string; + if ($count > $MAX_UNICODE_CODEPOINTS) { + $unicode_count = $count - ($MAX_WORKING_CODEPOINT + - $MAX_UNICODE_CODEPOINT); + $non_unicode_string = "All above-Unicode code points match as well, and are also returned"; + } + else { + $unicode_count = $count; + $non_unicode_string = ""; + } + my $string_count = main::clarify_code_point_count($unicode_count); my $loose_count = 0; # how many aliases loosely matched my $compound_name = ""; # ? Are any names compound?, and if so, an @@ -7894,11 +8023,13 @@ END } } # End of looping through all tables + $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string; + my $code_points; my $match; my $any_of_these; - if ($count == 1) { + if ($unicode_count == 1) { $match = 'matches'; $code_points = 'single code point'; } @@ -7998,14 +8129,25 @@ END if ($count) { # The format differs if no code points, and needs no # explanation in that case + if ($leader->write_as_invlist) { $comment.= <output_range_counts) { $comment .= <add_map(0, $MAX_UNICODE_CODEPOINT, + $property->add_map(0, $MAX_WORKING_CODEPOINT, $default_map, Replace => $NO); } } @@ -13211,8 +13353,9 @@ sub compile_perl() { # 'All' is all code points. As an error check, instead of just setting it # to be that, construct it to be the union of all the major categories $All = $perl->add_match_table('All', - Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]", - Matches_All => 1); + Description + => "All code points, including those above Unicode. Same as qr/./s", + Matches_All => 1); foreach my $major_table ($gc->tables) { @@ -13222,10 +13365,10 @@ sub compile_perl() { $All += $major_table; } - if ($All->max != $MAX_UNICODE_CODEPOINT) { + if ($All->max != $MAX_WORKING_CODEPOINT) { Carp::my_carp_bug("Generated highest code point (" . sprintf("%X", $All->max) - . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.") + . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.") } if ($All->range_count != 1 || $All->min != 0) { Carp::my_carp_bug("Generated table 'All' doesn't match all code points.") @@ -13276,7 +13419,7 @@ sub compile_perl() { # have Uppercase and Lowercase defined, so use the general category # instead for them, modified by hard-coding in the code points each is # missing. - my $Lower = $perl->add_match_table('Lower'); + my $Lower = $perl->add_match_table('XPosixLower'); my $Unicode_Lower = property_ref('Lowercase'); if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) { $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1); @@ -13314,13 +13457,12 @@ sub compile_perl() { $Lower += $code_point; } } - $Lower->add_alias('XPosixLower'); my $Posix_Lower = $perl->add_match_table("PosixLower", Description => "[a-z]", Initialize => $Lower & $ASCII, ); - my $Upper = $perl->add_match_table('Upper'); + my $Upper = $perl->add_match_table("XPosixUpper"); my $Unicode_Upper = property_ref('Uppercase'); if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) { $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1); @@ -13333,7 +13475,6 @@ sub compile_perl() { $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters } - $Upper->add_alias('XPosixUpper'); my $Posix_Upper = $perl->add_match_table("PosixUpper", Description => "[A-Z]", Initialize => $Upper & $ASCII, @@ -13455,7 +13596,7 @@ sub compile_perl() { # one whose name generally begins with Posix that is posix-compliant, and # one that matches Unicode characters beyond the Posix, ASCII range - my $Alpha = $perl->add_match_table('Alpha'); + my $Alpha = $perl->add_match_table('XPosixAlpha'); # Alphabetic was not present in early releases my $Alphabetic = property_ref('Alphabetic'); @@ -13542,7 +13683,6 @@ sub compile_perl() { $Alpha->add_description('Alphabetic'); $Alpha->add_alias('Alphabetic'); } - $Alpha->add_alias('XPosixAlpha'); my $Posix_Alpha = $perl->add_match_table("PosixAlpha", Description => "[A-Za-z]", Initialize => $Alpha & $ASCII, @@ -13550,22 +13690,20 @@ sub compile_perl() { $Posix_Upper->set_caseless_equivalent($Posix_Alpha); $Posix_Lower->set_caseless_equivalent($Posix_Alpha); - my $Alnum = $perl->add_match_table('Alnum', + my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum', Description => 'Alphabetic and (decimal) Numeric', Initialize => $Alpha + $gc->table('Decimal_Number'), ); - $Alnum->add_alias('XPosixAlnum'); $perl->add_match_table("PosixAlnum", Description => "[A-Za-z0-9]", Initialize => $Alnum & $ASCII, ); - my $Word = $perl->add_match_table('Word', + my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord', Description => '\w, including beyond ASCII;' . ' = \p{Alnum} + \pM + \p{Pc}', Initialize => $Alnum + $gc->table('Mark'), ); - $Word->add_alias('XPosixWord'); my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 if (defined $Pc) { $Word += $Pc; @@ -13582,13 +13720,13 @@ sub compile_perl() { } # This is a Perl extension, so the name doesn't begin with Posix. - my $PerlWord = $perl->add_match_table('PerlWord', + my $PerlWord = $perl->add_match_table('PosixWord', Description => '\w, restricted to ASCII = [A-Za-z0-9_]', Initialize => $Word & $ASCII, ); - $PerlWord->add_alias('PosixWord'); + $PerlWord->add_alias('PerlWord'); - my $Blank = $perl->add_match_table('Blank', + my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank', Description => '\h, Horizontal white space', # 200B is Zero Width Space which is for line @@ -13599,7 +13737,6 @@ sub compile_perl() { - 0x200B, # ZWSP ); $Blank->add_alias('HorizSpace'); # Another name for it. - $Blank->add_alias('XPosixBlank'); $perl->add_match_table("PosixBlank", Description => "\\t and ' '", Initialize => $Blank & $ASCII, @@ -13618,34 +13755,22 @@ sub compile_perl() { ); # No Posix equivalent for vertical space - my $Space = $perl->add_match_table('Space', + my $Space = $perl->add_match_table('XPosixSpace', Description => '\s including beyond ASCII and vertical tab', Initialize => $Blank + $VertSpace, ); - $Space->add_alias('XPosixSpace'); - my $posix_space = $perl->add_match_table("PosixSpace", + $Space->add_alias('XPerlSpace'); # Pre-existing synonyms + $Space->add_alias('SpacePerl'); + + my $Posix_space = $perl->add_match_table("PosixSpace", Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", Initialize => $Space & $ASCII, ); + $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym - # Perl's traditional space doesn't include Vertical Tab prior to v5.18 - my $XPerlSpace = $perl->add_match_table('XPerlSpace', - Description => '\s, including beyond ASCII', - Initialize => $Space, - #Initialize => $Space - # - utf8::unicode_to_native(0x0B] - ); - $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym - my $PerlSpace = $perl->add_match_table('PerlSpace', - Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab', - Initialize => $XPerlSpace & $ASCII, - ); - - - my $Cntrl = $perl->add_match_table('Cntrl', + my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl', Description => 'Control characters'); $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); - $Cntrl->add_alias('XPosixCntrl'); $perl->add_match_table("PosixCntrl", Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL", Initialize => $Cntrl & $ASCII, @@ -13658,22 +13783,20 @@ sub compile_perl() { $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate'); # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls) - my $Graph = $perl->add_match_table('Graph', + my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph', Description => 'Characters that are graphical', Initialize => ~ ($Space + $controls), ); - $Graph->add_alias('XPosixGraph'); $perl->add_match_table("PosixGraph", Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]', Initialize => $Graph & $ASCII, ); - $print = $perl->add_match_table('Print', + $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint', Description => 'Characters that are graphical plus space characters (but no controls)', Initialize => $Blank + $Graph - $gc->table('Control'), ); - $print->add_alias('XPosixPrint'); $perl->add_match_table("PosixPrint", Description => '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]', @@ -13695,18 +13818,16 @@ sub compile_perl() { Initialize => $ASCII & $XPosixPunct, ); - my $Digit = $perl->add_match_table('Digit', + my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit', Description => '[0-9] + all other decimal digits'); $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); - $Digit->add_alias('XPosixDigit'); my $PosixDigit = $perl->add_match_table("PosixDigit", Description => '[0-9]', Initialize => $Digit & $ASCII, ); # Hex_Digit was not present in first release - my $Xdigit = $perl->add_match_table('XDigit'); - $Xdigit->add_alias('XPosixXDigit'); + my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit'); my $Hex = property_ref('Hex_Digit'); if (defined $Hex && ! $Hex->is_empty) { $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1); @@ -13735,12 +13856,90 @@ sub compile_perl() { my $any_folds = $perl->add_match_table("_Perl_Any_Folds", Description => "Code points that particpate in some fold", ); - # - foreach my $range (property_ref('Case_Folding')->ranges) { - $any_folds->add_range($range->start, $range->end); - foreach my $hex_code_point (split " ", $range->value) { - my $code_point = hex $hex_code_point; - $any_folds->add_range($code_point, $code_point); + my $loc_problem_folds = $perl->add_match_table( + "_Perl_Problematic_Locale_Folds", + Description => + "Code points that are in some way problematic under locale", + ); + + # This allows regexec.c to skip some work when appropriate. Some of the + # entries in _Perl_Problematic_Locale_Folds are multi-character folds, + my $loc_problem_folds_start = $perl->add_match_table( + "_Perl_Problematic_Locale_Foldeds_Start", + Description => + "The first character of every sequence in _Perl_Problematic_Locale_Folds", + ); + + my $cf = property_ref('Case_Folding'); + + # Every character 0-255 is problematic because what each folds to depends + # on the current locale + $loc_problem_folds->add_range(0, 255); + $loc_problem_folds_start += $loc_problem_folds; + + # Also problematic are anything these fold to outside the range. Likely + # forever the only thing folded to by these outside the 0-255 range is the + # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code + # completely general, which should catch any unexpected changes or errors. + # We look at each code point 0-255, and add its fold (including each part + # of a multi-char fold) to the list. See commit message + # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description + # of the MU issue. + foreach my $range ($loc_problem_folds->ranges) { + foreach my $code_point($range->start .. $range->end) { + my $fold_range = $cf->containing_range($code_point); + next unless defined $fold_range; + + my @hex_folds = split " ", $fold_range->value; + my $start_cp = hex $hex_folds[0]; + foreach my $i (0 .. @hex_folds - 1) { + my $cp = hex $hex_folds[$i]; + next unless $cp > 255; # Already have the < 256 ones + + $loc_problem_folds->add_range($cp, $cp); + $loc_problem_folds_start->add_range($start_cp, $start_cp); + } + } + } + + my $folds_to_multi_char = $perl->add_match_table( + "_Perl_Folds_To_Multi_Char", + Description => + "Code points whose fold is a string of more than one character", + ); + + # Look through all the known folds to populate these tables. + foreach my $range ($cf->ranges) { + my $start = $range->start; + my $end = $range->end; + $any_folds->add_range($start, $end); + + my @hex_folds = split " ", $range->value; + if (@hex_folds > 1) { # Is multi-char fold + $folds_to_multi_char->add_range($start, $end); + } + + my $found_locale_problematic = 0; + + # Look at each of the folded-to characters... + foreach my $i (0 .. @hex_folds - 1) { + my $cp = hex $hex_folds[$i]; + $any_folds->add_range($cp, $cp); + + # The fold is problematic if any of the folded-to characters is + # already considered problematic. + if ($loc_problem_folds->contains($cp)) { + $loc_problem_folds->add_range($start, $end); + $found_locale_problematic = 1; + } + } + + # If this is a problematic fold, add to the start chars the + # folding-from characters and first folded-to character. + if ($found_locale_problematic) { + $loc_problem_folds_start->add_range($start, $end); + my $cp = hex $hex_folds[0]; + $loc_problem_folds_start->add_range($cp, $cp); } } @@ -14284,8 +14483,9 @@ END $unassigned_sans_noncharacters &= $nonchars->table('N'); } - for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) { + for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) { $i = populate_char_info($i); # Note sets $i so may cause skips + } } @@ -14625,8 +14825,9 @@ sub register_file_for_name($$$) { # Join all the file path components together, using slashes. my $full_filename = join('/', @$directory_ref, $file); - # All go in the same subdirectory of unicore - if ($directory_ref->[0] ne $matches_directory) { + # All go in the same subdirectory of unicore, or the special + # pseudo-directory '#' + if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) { Carp::my_carp("Unexpected directory in " . join('/', @{$directory_ref}, $file)); } @@ -14937,7 +15138,18 @@ sub make_re_pod_entries($) { my $full_name = $property->full_name; my $count = $input_table->count; - my $string_count = clarify_number($count); + my $unicode_count; + my $non_unicode_string; + if ($count > $MAX_UNICODE_CODEPOINTS) { + $unicode_count = $count - ($MAX_WORKING_CODEPOINT + - $MAX_UNICODE_CODEPOINT); + $non_unicode_string = " plus all above-Unicode code points"; + } + else { + $unicode_count = $count; + $non_unicode_string = ""; + } + my $string_count = clarify_number($unicode_count) . $non_unicode_string; my $status = $input_table->status; my $status_info = $input_table->status_info; my $caseless_equivalent = $input_table->caseless_equivalent; @@ -15004,6 +15216,8 @@ sub make_re_pod_entries($) { # property always use the single form. if ($table_property == $perl) { $entry = "\\p{$name}"; + $entry .= " \\p$name" if length $name == 1; # Show non-braced + # form too $entry_ref = "\\p{$name}"; } else { # Compound form. @@ -15189,7 +15403,7 @@ sub make_re_pod_entries($) { } } - # Ouput both short and single in the same parenthesized + # Output both short and single in the same parenthesized # expression, but with only one of 'Single', 'Short' if there # are both items. if ($short_name || $single_form || $table->conflicting) { @@ -15870,6 +16084,9 @@ the right column will give its longer, more descriptive name; and if the left column is the longest name, the right column will show any equivalent shortest name, in both single and compound forms if applicable. +If braces are not needed to specify a property (e.g., C<\\pL>), the left +column contains both forms, with and without braces. + The right column will also caution you if a property means something different than what might normally be expected. @@ -15959,9 +16176,14 @@ $zero_matches =head1 Properties accessible through Unicode::UCD -All the Unicode character properties mentioned above (except for those marked -as for internal use by Perl) are also accessible by -L. +The value of any Unicode (not including Perl extensions) character +property mentioned above for any single code point is available through +L. L returns the +values of all the Unicode properties for a given code point. + +Besides these, all the Unicode character properties mentioned above +(except for those marked as for internal use by Perl) are also +accessible by L. Due to their nature, not all Unicode character properties are suitable for regular expression matches, nor C. The remaining @@ -16091,6 +16313,9 @@ sub make_Heavy () { my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4); chomp $stricter_to_file_of; + my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4); + chomp $inline_definitions; + my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4); chomp $loose_to_file_of; @@ -16106,7 +16331,13 @@ sub make_Heavy () { # for the alternate table figured out at that time. foreach my $cased (keys %caseless_equivalent_to) { my @path = $caseless_equivalent_to{$cased}->file_path; - my $path = join '/', @path[1, -1]; + my $path; + if ($path[0] eq "#") { # Pseudo-directory '#' + $path = join '/', @path; + } + else { # Gets rid of lib/ + $path = join '/', @path[1, -1]; + } $caseless_equivalent_to{$cased} = $path; } my $caseless_equivalent_to @@ -16132,12 +16363,22 @@ $INTERNAL_ONLY_HEADER $loose_property_name_of ); -# Maps property, table to file for those using stricter matching +# Gives the definitions (in the form of inversion lists) for those properties +# whose definitions aren't kept in files +\@utf8::inline_definitions = ( +$inline_definitions +); + +# Maps property, table to file for those using stricter matching. For paths +# whose directory is '#', the file is in the form of a numeric index into +# \@inline_definitions \%utf8::stricter_to_file_of = ( $stricter_to_file_of ); -# Maps property, table to file for those using loose matching +# Maps property, table to file for those using loose matching. For paths +# whose directory is '#', the file is in the form of a numeric index into +# \@inline_definitions \%utf8::loose_to_file_of = ( $loose_to_file_of ); @@ -16689,9 +16930,9 @@ sub write_all_tables() { # don't care if both defined, as they had better be different anyway.) # The property named 'Perl' needs to be first (it doesn't have any # immutable file name) because empty properties are defined in terms of - # it's table named 'All'.) We also sort by the property's name. This is - # just for repeatability of the outputs between runs of this program, but - # does not affect correctness. + # its table named 'All' under the -annotate option.) We also sort by + # the property's name. This is just for repeatability of the outputs + # between runs of this program, but does not affect correctness. PROPERTY: foreach my $property ($perl, sort { return -1 if defined $a->file; @@ -16807,8 +17048,12 @@ sub write_all_tables() { push @unhandled_properties, "$table"; } - # An empty table is just the complement of everything. - $table->set_complement($All) if $table != $property; + # The old way of expressing an empty match list was to + # complement the list that matches everything. The new way is + # to create an empty inversion list, but this doesn't work for + # annotating, so use the old way then. + $table->set_complement($All) if $annotate + && $table != $property; } elsif ($expected_empty) { my $because = ""; @@ -16840,13 +17085,13 @@ sub write_all_tables() { my $count = $table->count; if ($expected_full) { - if ($count != $MAX_UNICODE_CODEPOINTS) { + if ($count != $MAX_WORKING_CODEPOINTS) { Carp::my_carp("$table matches only " . clarify_number($count) . " Unicode code points but should match " - . clarify_number($MAX_UNICODE_CODEPOINTS) + . clarify_number($MAX_WORKING_CODEPOINTS) . " (off by " - . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count)) + . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count)) . "). Proceeding anyway."); } @@ -17028,8 +17273,6 @@ sub write_all_tables() { # Similarly, we create for Unicode::UCD a list of # property-value aliases. - my $property_full_name = $property->full_name; - # Look at each table in the property... foreach my $table ($property->tables) { my @values_list; @@ -17068,7 +17311,7 @@ sub write_all_tables() { } # To save memory, unlike the similar list for property - # aliases above, only the standard forms hve the list. + # aliases above, only the standard forms have the list. # This forces an extra step of converting from input # name to standard name, but the savings are # considerable. (There is only marginal savings if we @@ -17096,8 +17339,56 @@ sub write_all_tables() { my $filename; my $property = $table->property; my $is_property = ($table == $property); - if (! $is_property) { + # For very short tables, instead of writing them out to actual files, + # we in-line their inversion list definitions into Heavy.pl. The + # definition replaces the file name, and the special pseudo-directory + # '#' is used to signal this. This significantly cuts down the number + # of files written at little extra cost to the hashes in Heavy.pl. + # And it means, no run-time files to read to get the definitions. + if (! $is_property + && ! $annotate # For annotation, we want to explicitly show + # everything, so keep in files + && $table->ranges <= 3) + { + my @ranges = $table->ranges; + my $count = @ranges; + if ($count == 0) { # 0th index reserved for 0-length lists + $filename = 0; + } + elsif ($table->leader != $table) { + + # Here, is a table that is equivalent to another; code + # in register_file_for_name() causes its leader's definition + # to be used + + next; + } + else { # No equivalent table so far. + + # Build up its definition range-by-range. + my $definition = ""; + while (defined (my $range = shift @ranges)) { + my $end = $range->end; + if ($end < $MAX_WORKING_CODEPOINT) { + $count++; + $end = "\n" . ($end + 1); + } + else { # Extends to infinity, hence no 'end' + $end = ""; + } + $definition .= "\n" . $range->start . $end; + } + $definition = "V$count" . $definition; + $filename = @inline_definitions; + push @inline_definitions, $definition; + } + @directory = "#"; + register_file_for_name($table, \@directory, $filename); + next; + } + + if (! $is_property) { # Match tables for the property go in lib/$subdirectory, which is # the property's name. Don't use the standard file name for this, # as may get an unfamiliar alias @@ -17588,10 +17879,12 @@ sub make_property_test_script() { &write($t_path, 0, # Not utf8; - [, + [$HEADER, + , @output, (map {"Test_X('$_');\n"} @backslash_X_tests), - "Finished();\n"]); + "Finished();\n" + ]); return; } @@ -18305,11 +18598,6 @@ __DATA__ use strict; use warnings; -# If run outside the normal test suite on an ASCII platform, you can -# just create a latin1_to_native() function that just returns its -# inputs, because that's the only function used from test.pl -require "test.pl"; - # Test qr/\X/ and the \p{} regular expression constructs. This file is # constructed by mktables from the tables it generates, so if mktables is # buggy, this won't necessarily catch those bugs. Tests are generated for all @@ -18459,7 +18747,7 @@ sub Test_X($) { my $this_string = ""; my $this_display = ""; foreach my $code_point (@code_points) { - $this_string .= latin1_to_native(chr(hex $code_point)); + $this_string .= chr utf8::unicode_to_native(hex $code_point); $this_display .= "\\x{$code_point}"; }