X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1254636bfb1e1850e3455fb1ac48a1211d12f341..8a69b36ef9b27e18d28584a74afb636e41890096:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 8ff762d..e597a2a 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -8,6 +8,11 @@ # compatible, but that has now been abandoned, and newer constructs are used # as convenient. +# NOTE: this script can run quite slowly in older/slower systems. +# It can also consume a lot of memory (128 MB or more), you may need +# to raise your process resource limits (e.g. in bash, "ulimit -a" +# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set) + my $start_time; BEGIN { # Get the time the script started running; do it at compilation to # get it as close as possible @@ -31,6 +36,17 @@ my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/; sub NON_ASCII_PLATFORM { ord("A") != 65 } +# When a new version of Unicode is published, unfortunately the algorithms for +# dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated +# manually. The changes may or may not be backward compatible with older +# releases. The code is in regen/mk_invlist.pl and regexec.c. Make the +# changes, then come back here and set the variable below to what version the +# code is expecting. If a newer version of Unicode is being compiled than +# expected, a warning will be generated. If an older version is being +# compiled, any bounds tests that fail in the generated test file (-maketest +# option) will be marked as TODO. +my $version_of_mk_invlist_bounds = v10.0.0; + ########################################################################## # # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl), @@ -119,7 +135,7 @@ my $map_directory = 'To'; # Where map files go. # each one of the tens of thousands individually. # # In a match table, the value of a range is irrelevant (and hence the type as -# well, which will always be 0), and arbitrarily set to the null string. +# well, which will always be 0), and arbitrarily set to the empty string. # Using the example above, there would be two match tables for those two # entries, one named Upper would contain the 0x41..0x5A range, and the other # named Lower would contain 0x61..0x7A. @@ -347,6 +363,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # # trace ... if main::DEBUG && $to_trace; # +# main::stack_trace() will display what its name implies +# # If there is just one or a few files that you're debugging, you can easily # cause most everything else to be skipped. Change the line # @@ -404,7 +422,7 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # # A NOTE ON UNIHAN # -# This program can generate tables from the Unihan database. But that db +# This program can generate tables from the Unihan database. But that DB # isn't normally available, so it is marked as optional. Prior to version # 5.2, this database was in a single file, Unihan.txt. In 5.2 the database # was split into 8 different files, all beginning with the letters 'Unihan'. @@ -472,8 +490,8 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # handled by Unicode::Normalize, nor will it compile when presented a version # that has them. However, you can trivially get it to compile by simply # ignoring those decompositions, by changing the croak to a carp. At the time -# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or -# cpan/Unicode-Normalize/mkheader) reads +# of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or +# dist/Unicode-Normalize/mkheader) reads # # croak("Weird Canonical Decomposition of U+$h"); # @@ -612,19 +630,37 @@ our $to_trace = 0; } } +sub stack_trace() { + local $to_trace = 1 if main::DEBUG; + my $line = (caller(0))[2]; + my $i = 1; + + # Accumulate the stack trace + while (1) { + my ($pkg, $file, $caller_line, $caller) = caller $i++; + + last unless defined $caller; + + trace "called from $caller() at line $line"; + $line = $caller_line; + } +} + # This is for a rarely used development feature that allows you to compare two # versions of the Unicode standard without having to deal with changes caused # by the code points introduced in the later version. You probably also want -# to use the -annotate option when using this. Change the 0 to a string -# containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only code -# points introduced in that release and earlier will be used; later ones are -# thrown away. You use the version number of the earliest one you want to -# compare; then run this program on directory structures containing each -# release, and compare the outputs. These outputs will therefore include only -# the code points common to both releases, and you can see the changes caused -# just by the underlying release semantic changes. For versions earlier than -# 3.2, you must copy a version of DAge.txt into the directory. -my $string_compare_versions = DEBUG && 0; # e.g., "2.1"; +# to use the -annotate option when using this. Run this program on a unicore +# containing the starting release you want to compare. Save that output +# structure. Then, switching to a unicore with the ending release, change the +# 0 in the $string_compare_versions definition just below to a string +# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding +# to the starting release. This program will then compile, but throw away all +# code points introduced after the starting release. Finally use a diff tool +# to compare the two directory structures. They include only the code points +# common to both releases, and you can see the changes caused just by the +# underlying release semantic changes. For versions earlier than 3.2, you +# must copy a version of DAge.txt into the directory. +my $string_compare_versions = DEBUG && ""; # e.g., "2.1"; my $compare_versions = DEBUG && $string_compare_versions && pack "C*", split /\./, $string_compare_versions; @@ -695,7 +731,8 @@ while (@ARGV) { $verbosity = 0; } elsif ($arg eq '-w') { - $write_unchanged_files = 1; # update the files even if havent changed + # update the files even if they haven't changed + $write_unchanged_files = 1; } elsif ($arg eq '-check') { my $this = shift @ARGV; @@ -802,6 +839,11 @@ close $VERSION; chomp $string_version; my $v_version = pack "C*", split /\./, $string_version; # v string +my $unicode_version = ($compare_versions) + ? ( "$string_compare_versions (using " + . "$string_version rules)") + : $string_version; + # The following are the complete names of properties with property values that # are known to not match any code points in some versions of Unicode, but that # may change in the future so they should be matchable, hence an empty file is @@ -895,9 +937,9 @@ my %why_obsolete; # Documentation only my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; %why_deprecated = ( - 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)', + 'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)', 'Jamo_Short_Name' => $contributory, - 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', + 'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking', 'Other_Alphabetic' => $contributory, 'Other_Default_Ignorable_Code_Point' => $contributory, 'Other_Grapheme_Extend' => $contributory, @@ -1071,7 +1113,7 @@ my %default_mapping = ( my $HEADER=<<"EOF"; # !!!!!!! DO NOT EDIT THIS FILE !!!!!!! # This file is machine-generated by $0 from the Unicode -# database, Version $string_version. Any changes made here will be lost! +# database, Version $unicode_version. Any changes made here will be lost! EOF my $INTERNAL_ONLY_HEADER = <<"EOF"; @@ -1098,17 +1140,17 @@ my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0) my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING; my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1; -# We work with above-Unicode code points, up to UV_MAX. But when you get -# that high, above IV_MAX, some operations don't work, and you can easily get -# overflow. Therefore for internal use, we use a much smaller number, -# translating it to UV_MAX only for output. The exact number is immaterial -# (all Unicode code points are treated exactly the same), but the algorithm -# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1; +# We work with above-Unicode code points, up to IV_MAX, but we may want to use +# sentinels above that number. Therefore for internal use, we use a much +# smaller number, translating it to IV_MAX only for output. The exact number +# is immaterial (all above-Unicode code points are treated exactly the same), +# but the algorithm requires it to be at least +# 2 * $MAX_UNICODE_CODEPOINTS + 1 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8; my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1; my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT); -my $MAX_PLATFORM_CODEPOINT = ~0; +my $MAX_PLATFORM_CODEPOINT = ~0 >> 1; # Matches legal code point. 4-6 hex numbers, If there are 6, the first # two must be 10; if there are 5, the first must not be a 0. Written this way @@ -1120,7 +1162,7 @@ my $run_on_code_point_re = qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x; my $code_point_re = qr/\b$run_on_code_point_re/; -# This matches the beginning of the line in the Unicode db files that give the +# This matches the beginning of the line in the Unicode DB files that give the # defaults for code points not listed (i.e., missing) in the file. The code # depends on this ending with a semi-colon, so it can assume it is a valid # field when the line is split() by semi-colons @@ -1377,6 +1419,7 @@ my $has_hangul_syllables = 0; my $needing_code_points_ending_in_code_point = 0; my @backslash_X_tests; # List of tests read in for testing \X +my @LB_tests; # List of tests read in for testing \b{lb} my @SB_tests; # List of tests read in for testing \b{sb} my @WB_tests; # List of tests read in for testing \b{wb} my @unhandled_properties; # Will contain a list of properties found in @@ -1393,10 +1436,10 @@ my @missing_early_files; # Generated list of absent files that we need to my @files_actually_output; # List of files we generated. my @more_Names; # Some code point names are compound; this is used # to store the extra components of them. -my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at - # the minimum before we consider it equivalent to a - # candidate rational -my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms +my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal + # point of a normalized floating point number + # needed to match before we consider it equivalent + # to a candidate rational # These store references to certain commonly used property objects my $age; @@ -1411,6 +1454,7 @@ my $Assigned; # All assigned characters in this Unicode release my $DI; # Default_Ignorable_Code_Point property my $NChar; # Noncharacter_Code_Point property my $script; +my $scx; # Script_Extensions property # Are there conflicting names because of beginning with 'In_', or 'Is_' my $has_In_conflicts = 0; @@ -1486,7 +1530,7 @@ sub populate_char_info ($) { $viacode[$i] = $perl_charname->value_of($i) || ""; $age[$i] = (defined $age) - ? (($age->value_of($i) =~ / ^ \d \. \d $ /x) + ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x) ? $age->value_of($i) : "") : ""; @@ -2062,7 +2106,7 @@ package Input_file; # 1) call before the first line is read, for pre processing # 2) call to adjust each line of the input before the main handler gets # them. This can be automatically generated, if appropriately simple -# enough, by specifiying a Properties parameter in the constructor. +# enough, by specifying a Properties parameter in the constructor. # 3) call upon EOF before the main handler exits its loop # 4) call at the end, for post processing # @@ -2070,7 +2114,7 @@ package Input_file; # each_line_handler()s. So, if the format of the line is not in the desired # format for the main handler, these are used to do that adjusting. They can # be stacked (by enclosing them in an [ anonymous array ] in the constructor, -# so the $_ output of one is used as the input to the next. The eof handler +# so the $_ output of one is used as the input to the next. The EOF handler # is also stackable, but none of the others are, but could easily be changed # to be so. # @@ -2154,7 +2198,7 @@ sub trace { return main::trace(@_); } # not otherwise be processed, and to not raise a warning about not being # handled. In the constructor call, any value that evaluates to a numeric # 0 or undef means don't skip. Any other value is a string giving the - # reason it is being skippped, and this will appear in generated pod. + # reason it is being skipped, and this will appear in generated pod. # However, an empty string reason will suppress the pod entry. # Internally, calls that evaluate to numeric 0 are changed into undef to # distinguish them from an empty string call. @@ -2167,11 +2211,20 @@ sub trace { return main::trace(@_); } # 'handler' main::set_access('each_line_handler', \%each_line_handler, 'c'); + my %retain_trailing_comments; + # This is used to not discard the comments that end data lines. This + # would be used only for files with non-typical syntax, and most code here + # assumes that comments have been stripped, so special handlers would have + # to be written. It is assumed that the code will use these in + # single-quoted contexts, and so any "'" marks in the comment will be + # prefixed by a backslash. + main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c'); + my %properties; # Optional ordered list of the properties that occur in each # meaningful line of the input file. If present, an appropriate # each_line_handler() is automatically generated and pushed onto the stack # of such handlers. This is useful when a file contains multiple - # proerties per line, but no other special considerations are necessary. + # properties per line, but no other special considerations are necessary. # The special value "" means to discard the corresponding input # field. # Any @missing lines in the file should also match this syntax; no such @@ -2272,6 +2325,15 @@ sub trace { return main::trace(@_); } # may not be the actual reality, but probably nobody cares anyway for # these obsolete characters.) # + # [3] if present is the default value for the property to assign for code + # points not given in the input. If not present, the default from the + # normal property is used + # + # [-1] If there is an extra final element that is the string 'ONLY_EARLY'. + # it means to not add the name in [1] as an alias to the property name + # used for these. Normally, when compiling Unicode versions that don't + # invoke the early handling, the name is added as a synonym. + # # Not all files can be handled in the above way, and so the code ref # alternative is available. It can do whatever it needs to. The other # array elements are optional in this case, and the code is free to use or @@ -2282,6 +2344,9 @@ sub trace { return main::trace(@_); } # makes for easier testing later on. main::set_access('early', \%early, 'c'); + my %only_early; + main::set_access('only_early', \%only_early, 'c'); + my %required_even_in_debug_skip; # debug_skip is used to speed up compilation during debugging by skipping # processing files that are not needed for the task at hand. However, @@ -2312,6 +2377,7 @@ sub trace { return main::trace(@_); } # Set defaults $handler{$addr} = \&main::process_generic_property_file; + $retain_trailing_comments{$addr} = 0; $non_skip{$addr} = 0; $skip{$addr} = undef; $has_missings_defaults{$addr} = $NO_DEFAULTS; @@ -2392,6 +2458,11 @@ sub trace { return main::trace(@_); } my $progress; my $function_instead_of_file = 0; + if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') { + $only_early{$addr} = 1; + pop $early{$addr}->@*; + } + # If we are compiling a Unicode release earlier than the file became # available, the constructor may have supplied a substitute if ($first_released{$addr} gt $v_version && $early{$addr}->@*) { @@ -2400,6 +2471,8 @@ sub trace { return main::trace(@_); } unshift $early{$addr}->@*, 1; # See the definition of %early for what the array elements mean. + # Note that we have just unshifted onto the array, so the numbers + # below are +1 of those in the %early description. # If we have a property this defines, create a table and default # map for it now (at essentially compile time), so that it will be # available for the whole of run time. (We will want to add this @@ -2416,10 +2489,13 @@ sub trace { return main::trace(@_); } Perl_Extension => 1, ); - # Use the default mapping for the regular property for this - # substitute one. - if ( defined $property{$addr} - && defined $default_mapping{$property{$addr}}) + # If not specified by the constructor, use the default mapping + # for the regular property for this substitute one. + if ($early{$addr}[4]) { + $prop_object->set_default_map($early{$addr}[4]); + } + elsif ( defined $property{$addr} + && defined $default_mapping{$property{$addr}}) { $prop_object ->set_default_map($default_mapping{$property{$addr}}); @@ -2625,7 +2701,7 @@ END # once per file, as it destroy's the EOF handlers # flag to make sure extracted files are processed early - state $seen_non_extracted_non_age = 0; + state $seen_non_extracted = 0; my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -2638,7 +2714,7 @@ END $handle{$addr} = 'pretend_is_open'; } else { - if ($seen_non_extracted_non_age) { + if ($seen_non_extracted) { if ($file =~ /$EXTRACTED/i) # Some platforms may change the # case of the file's name { @@ -2655,13 +2731,12 @@ END # We only do this check for generic property files && $handler{$addr} == \&main::process_generic_property_file - && $file !~ /$EXTRACTED/i - && lc($file) ne 'dage.txt') + && $file !~ /$EXTRACTED/i) { # We don't set this (by the 'if' above) if we have no # extracted directory, so if running on an early version, # this test won't work. Not worth worrying about. - $seen_non_extracted_non_age = 1; + $seen_non_extracted = 1; } # Mark the file as having being processed, and warn if it @@ -2688,7 +2763,10 @@ END # official property, we still have to allow the publicly # inaccessible early name so that the core code which uses it # will work regardless. - if (! $early{$addr}[0] && $early{$addr}->@* > 2) { + if ( ! $only_early{$addr} + && ! $early{$addr}[0] + && $early{$addr}->@* > 2) + { my $early_property_name = $early{$addr}[2]; if ($property{$addr} ne $early_property_name) { main::property_ref($property{$addr}) @@ -2965,9 +3043,21 @@ END next; } - # Remove comments and trailing space, and skip this line if the - # result is empty - s/#.*//; + # Unless to keep, remove comments. If to keep, ignore + # comment-only lines + if ($retain_trailing_comments{$addr}) { + next if / ^ \s* \# /x; + + # But escape any single quotes (done in both the comment and + # non-comment portion; this could be a bug someday, but not + # likely) + s/'/\\'/g; + } + else { + s/#.*//; + } + + # Remove trailing space, and skip this line if the result is empty s/\s+$//; next if /^$/; @@ -3089,7 +3179,7 @@ END # Not currently used, not fully tested. # sub peek { -# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank +# # Non-destructive lookahead one non-adjusted, non-comment, non-blank # # record. Not callable from an each_line_handler(), nor does it call # # an each_line_handler() on the line. # @@ -3464,7 +3554,7 @@ sub trace { return main::trace(@_); } main::set_access('end', \%end, 'r', 's'); my %value; - main::set_access('value', \%value, 'r'); + main::set_access('value', \%value, 'r', 's'); my %type; main::set_access('type', \%type, 'r'); @@ -5235,6 +5325,14 @@ use parent '-norequire', '_Range_List_Base'; return $self->_add_delete('+', @_); } + sub replace_map { + # Replace a range + + my $self = shift; + + return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY); + } + sub add_duplicate { # Adds entry to a range list which can duplicate an existing entry @@ -5375,6 +5473,15 @@ sub trace { return main::trace(@_); } # used to override calculations. main::set_access('format', \%format, 'r', 'p_s'); + my %has_dependency; + # A boolean that gives whether some other table in this property is + # defined as the complement of this table. This is a crude, but currently + # sufficient, mechanism to make this table not get destroyed before what + # is dependent on it is. Other dependencies could be added, so the name + # was chosen to reflect a more general situation than actually is + # currently the case. + main::set_access('has_dependency', \%has_dependency, 'r', 's'); + sub new { # All arguments are key => value pairs, which you can see below, most # of which match fields documented above. Otherwise: Re_Pod_Entry, @@ -5430,6 +5537,7 @@ sub trace { return main::trace(@_); } $note{$addr} = [ ]; $file_path{$addr} = [ ]; $locked{$addr} = ""; + $has_dependency{$addr} = 0; push @{$description{$addr}}, $description if $description; push @{$note{$addr}}, $note if $note; @@ -5710,6 +5818,9 @@ END } # Look at each alias + my $is_last_resort = 0; + my $deprecated_or_discouraged + = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x; foreach my $alias ($self->aliases()) { # Don't use an alias that isn't ok to use for an external name. @@ -5718,10 +5829,13 @@ END my $name = main::Standardize($alias->name); trace $self, $name if main::DEBUG && $to_trace; - # Take the first one, or a shorter one that isn't numeric. This + # Take the first one, or any non-deprecated non-discouraged one + # over one that is, or a shorter one that isn't numeric. This # relies on numeric aliases always being last in the array # returned by aliases(). Any alpha one will have precedence. - if (! defined $short_name{$addr} + if ( ! defined $short_name{$addr} + || ( $is_last_resort + && $alias->status !~ $deprecated_or_discouraged) || ($name =~ /\D/ && length($name) < length($short_name{$addr}))) { @@ -5729,14 +5843,16 @@ END ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg; $nominal_short_name_length{$addr} = length $name; + $is_last_resort = $alias->status =~ $deprecated_or_discouraged; } } # If the short name isn't a nice one, perhaps an equivalent table has # a better one. - if (! defined $short_name{$addr} - || $short_name{$addr} eq "" - || $short_name{$addr} eq "_") + if ( $self->can('children') + && ( ! defined $short_name{$addr} + || $short_name{$addr} eq "" + || $short_name{$addr} eq "_")) { my $return; foreach my $follower ($self->children) { # All equivalents @@ -6245,6 +6361,22 @@ END } if ($write_as_invlist) { + if ( $previous_end > 0 + && $output_range_counts{$addr}) + { + my $complement_count = $start - $previous_end - 1; + if ($complement_count > 1) { + $OUT[-1] = merge_single_annotation_line( + $OUT[-1], + "#" + . (" " x 17) + . "[" + . main::clarify_code_point_count( + $complement_count) + . "] in complement\n", + $comment_indent); + } + } # Inversion list format has a single number per line, # the starting code point of a range that matches the @@ -6373,7 +6505,15 @@ END $range_name = "Hangul Syllable"; } - if ($i != $start || $range_end < $end) { + # If the annotation would just repeat what's + # already being output as the range, skip it. + # (When an inversion list is being written, it + # isn't a repeat, as that always is in + # decimal) + if ( $write_as_invlist + || $i != $start + || $range_end < $end) + { if ($range_end < $MAX_WORKING_CODEPOINT) { $annotation = sprintf "%04X..%04X", @@ -6387,6 +6527,7 @@ END else { # Indent if not displaying code points $annotation = " " x 4; } + if ($range_name) { $annotation .= " $age[$i]" if $age[$i]; $annotation .= " $range_name"; @@ -6929,18 +7070,29 @@ sub trace { return main::trace(@_); } sub set_default_map { # Define what code points that are missing from the input files should - # map to + # map to. The optional second parameter 'full_name' indicates to + # force using the full name of the map instead of its standard name. my $self = shift; my $map = shift; + my $use_full_name = shift // 0; Carp::carp_extra_args(\@_) if main::DEBUG && @_; + if ($use_full_name && $use_full_name ne 'full_name') { + Carp::my_carp_bug("Second parameter to set_default_map() if" + . " present, must be 'full_name'"); + } + my $addr = do { no overloading; pack 'J', $self; }; # Convert the input to the standard equivalent, if any (won't have any # for $STRING properties) - my $standard = $self->_find_table_from_alias->{$map}; - $map = $standard->name if defined $standard; + my $standard = $self->property->table($map); + if (defined $standard) { + $map = ($use_full_name) + ? $standard->full_name + : $standard->name; + } # Warn if there already is a non-equivalent default map for this # property. Note that a default map can be a ref, which means that @@ -7147,7 +7299,7 @@ END else { $cp = "one of the $code_points"; } - $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to"; + $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to"; } my $comment = ""; @@ -7590,6 +7742,7 @@ END # Accessors for the underlying list that should fail if locked. for my $sub (qw( add_duplicate + replace_map )) { no strict "refs"; @@ -7657,6 +7810,12 @@ use parent '-norequire', '_Base_Table'; # version. But manual intervention to decide what the actual behavior # should be may be required should this happen. The introductory comments # have more to say about this. +# +# 4) Definition. This is a string for human consumption that specifies the +# code points that this table matches. This is used only for the generated +# pod file. It may be specified explicitly, or automatically computed. +# Only the first portion of complicated definitions is computed and +# displayed. sub standardize { return main::standardize($_[0]); } sub trace { return main::trace(@_); } @@ -7701,6 +7860,11 @@ sub trace { return main::trace(@_); } # none. main::set_access('complement', \%complement, 'r'); + my %definition; + # Human readable string of the first few ranges of code points matched by + # this table + main::set_access('definition', \%definition, 'r', 's'); + sub new { my $class = shift; @@ -7717,6 +7881,7 @@ sub trace { return main::trace(@_); } my $initialize = delete $args{'Initialize'}; my $matches_all = delete $args{'Matches_All'} || 0; my $format = delete $args{'Format'}; + my $definition = delete $args{'Definition'} // ""; # Rest of parameters passed on. my $range_list = Range_List->new(Initialize => $initialize, @@ -7751,6 +7916,7 @@ sub trace { return main::trace(@_); } $leader{$addr} = $self; $parent{$addr} = $self; $complement{$addr} = 0; + $definition{$addr} = $definition; if (defined $format && $format ne $EMPTY_FORMAT) { Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'"); @@ -7861,13 +8027,23 @@ sub trace { return main::trace(@_); } # complement's if it has one. my $self = shift; - my $complement; - if (($complement = $self->complement) != 0) { - return ~ $complement->_range_list; - } - else { - return $self->SUPER::_range_list; + my $complement = $self->complement; + + # In order to avoid re-complementing on each access, only do the + # complement the first time, and store the result in this table's + # range list to use henceforth. However, this wouldn't work if the + # controlling (complement) table changed after we do this, so lock it. + # Currently, the value of the complement isn't needed until after it + # is fully constructed, so this works. If this were to change, the + # each_range iteration functionality would no longer work on this + # complement. + if ($complement != 0 && $self->SUPER::_range_list->count == 0) { + $self->_set_range_list($self->SUPER::_range_list + + ~ $complement->_range_list); + $complement->lock; } + + return $self->SUPER::_range_list; } sub add_alias { @@ -7905,7 +8081,7 @@ sub trace { return main::trace(@_); } # disambiguate with). if (defined $conflicting_object) { foreach my $alias ($self->aliases) { - if ($alias->name eq $conflicting_name) { + if (standardize($alias->name) eq standardize($conflicting_name)) { # Here, there is an exact match. This results in # ambiguous comments, so disambiguate by changing the @@ -7998,7 +8174,19 @@ sub trace { return main::trace(@_); } # add_alias() # instead for same # property - && ! $other->perl_extension) + && ! $other->perl_extension + + # We allow the sc and scx properties to be marked as + # related. They are in fact related, and this allows + # the pod to show that better. This test isn't valid + # if this is an early Unicode release without the scx + # property (having that also implies the sc property + # exists, so don't have to test for no 'sc') + && ( ! defined $scx + && ! ( ( $self->property == $script + || $self->property == $scx) + && ( $self->property == $script + || $self->property == $scx)))) { Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other"); $related = 0; @@ -8069,6 +8257,15 @@ sub trace { return main::trace(@_); } } my $addr = do { no overloading; pack 'J', $self; }; $complement{$addr} = $other; + + # Be sure the other property knows we are depending on them; or the + # other table if it is one in the current property. + if ($self->property != $other->property) { + $other->property->set_has_dependency(1); + } + else { + $other->set_has_dependency(1); + } $self->lock; return; } @@ -8112,6 +8309,235 @@ sub trace { return main::trace(@_); } return; } + sub calculate_table_definition + { + # Returns a human-readable string showing some or all of the code + # points matched by this table. The string will include a + # bracketed-character class for all characters matched in the 00-FF + # range, and the first few ranges matched beyond that. + my $max_ranges = 6; + + my $self = shift; + my $definition = $self->definition || ""; + + # Skip this if already have a definition. + return $definition if $definition; + + my $lows_string = ""; # The string representation of the 0-FF + # characters + my $string_range = ""; # The string rep. of the above FF ranges + my $range_count = 0; # How many ranges in $string_rage + + my @lows_invlist; # The inversion list of the 0-FF code points + my $first_non_control = ord(" "); # Everything below this is a + # control, on ASCII or EBCDIC + my $max_table_code_point = $self->max; + + # On ASCII platforms, the range 80-FF contains no printables. + my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126); + + + # Look through the first few ranges matched by this table. + $self->reset_each_range; # Defensive programming + while (defined (my $range = $self->each_range())) { + my $start = $range->start; + my $end = $range->end; + + # Accumulate an inversion list of the 00-FF code points + if ($start < 256 && ($start > 0 || $end < 256)) { + push @lows_invlist, $start; + push @lows_invlist, 1 + (($end < 256) ? $end : 255); + + # Get next range if there are more ranges below 256 + next if $end < 256 && $end < $max_table_code_point; + + # If the range straddles the 255/256 boundary, we split it + # there. We already added above the low portion to the + # inversion list + $start = 256 if $end > 256; + } + + # Here, @lows_invlist contains the code points below 256, and + # there is no other range, or the current one starts at or above + # 256. Generate the [char class] for the 0-255 ones. + while (@lows_invlist) { + + # If this range (necessarily the first one, by the way) starts + # at 0 ... + if ($lows_invlist[0] == 0) { + + # If it ends within the block of controls, that means that + # some controls are in it and some aren't. Since Unicode + # properties pretty much only know about a few of the + # controls, like \n, \t, this means that its one of them + # that isn't in the range. Complement the inversion list + # which will likely cause these to be output using their + # mnemonics, hence being clearer. + if ($lows_invlist[1] < $first_non_control) { + $lows_string .= '^'; + shift @lows_invlist; + push @lows_invlist, 256; + } + elsif ($lows_invlist[1] <= $highest_printable) { + + # Here, it extends into the printables block. Split + # into two ranges so that the controls are separate. + $lows_string .= sprintf "\\x00-\\x%02x", + $first_non_control - 1; + $lows_invlist[0] = $first_non_control; + } + } + + # If the range completely contains the printables, don't + # individually spell out the printables. + if ( $lows_invlist[0] <= $first_non_control + && $lows_invlist[1] > $highest_printable) + { + $lows_string .= sprintf "\\x%02x-\\x%02x", + $lows_invlist[0], $lows_invlist[1] - 1; + shift @lows_invlist; + shift @lows_invlist; + next; + } + + # Here, the range may include some but not all printables. + # Look at each one individually + foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) { + my $char = chr $ord; + + # If there is already something in the list, an + # alphanumeric char could be the next in sequence. If so, + # we start or extend a range. That is, we could have so + # far something like 'a-c', and the next char is a 'd', so + # we change it to 'a-d'. We use native_to_unicode() + # because a-z on EBCDIC means 26 chars, and excludes the + # gap ones. + if ($lows_string ne "" && $char =~ /[[:alnum:]]/) { + my $prev = substr($lows_string, -1); + if ( $prev !~ /[[:alnum:]]/ + || utf8::native_to_unicode(ord $prev) + 1 + != utf8::native_to_unicode(ord $char)) + { + # Not extending the range + $lows_string .= $char; + } + elsif ( length $lows_string > 1 + && substr($lows_string, -2, 1) eq '-') + { + # We had a sequence like '-c' and the current + # character is 'd'. Extend the range. + substr($lows_string, -1, 1) = $char; + } + else { + # We had something like 'd' and this is 'e'. + # Start a range. + $lows_string .= "-$char"; + } + } + elsif ($char =~ /[[:graph:]]/) { + + # We output a graphic char as-is, preceded by a + # backslash if it is a metacharacter + $lows_string .= '\\' + if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/; + $lows_string .= $char; + } # Otherwise use mnemonic for any that have them + elsif ($char =~ /[\a]/) { + $lows_string .= '\a'; + } + elsif ($char =~ /[\b]/) { + $lows_string .= '\b'; + } + elsif ($char eq "\e") { + $lows_string .= '\e'; + } + elsif ($char eq "\f") { + $lows_string .= '\f'; + } + elsif ($char eq "\cK") { + $lows_string .= '\cK'; + } + elsif ($char eq "\n") { + $lows_string .= '\n'; + } + elsif ($char eq "\r") { + $lows_string .= '\r'; + } + elsif ($char eq "\t") { + $lows_string .= '\t'; + } + else { + + # Here is a non-graphic without a mnemonic. We use \x + # notation. But if the ordinal of this is one above + # the previous, create or extend the range + my $hex_representation = sprintf("%02x", ord $char); + if ( length $lows_string >= 4 + && substr($lows_string, -4, 2) eq '\\x' + && hex(substr($lows_string, -2)) + 1 == ord $char) + { + if ( length $lows_string >= 5 + && substr($lows_string, -5, 1) eq '-' + && ( length $lows_string == 5 + || substr($lows_string, -6, 1) ne '\\')) + { + substr($lows_string, -2) = $hex_representation; + } + else { + $lows_string .= '-\\x' . $hex_representation; + } + } + else { + $lows_string .= '\\x' . $hex_representation; + } + } + } + } + + # Done with assembling the string of all lows. If there are only + # lows in the property, are completely done. + if ($max_table_code_point < 256) { + $self->reset_each_range; + last; + } + + # Otherwise, quit if reached max number of non-lows ranges. If + # there are lows, count them as one unit towards the maximum. + $range_count++; + if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) { + $string_range .= " ..."; + $self->reset_each_range; + last; + } + + # Otherwise add this range. + $string_range .= ", " if $string_range ne ""; + if ($start == $end) { + $string_range .= sprintf("U+%04X", $start); + } + elsif ($end >= $MAX_WORKING_CODEPOINT) { + $string_range .= sprintf("U+%04X..infinity", $start); + } + else { + $string_range .= sprintf("U+%04X..%04X", + $start, $end); + } + } + + # Done with all the ranges we're going to look at. Assemble the + # definition from the lows + non-lows. + + if ($lows_string ne "" || $string_range ne "") { + if ($lows_string ne "") { + $definition .= "[$lows_string]"; + $definition .= ", " if $string_range; + } + $definition .= $string_range; + } + + return $definition; + } + sub write { my $self = shift; Carp::carp_extra_args(\@_) if main::DEBUG && @_; @@ -8393,7 +8819,7 @@ resources, every table that matches the identical set of code points in this version of Unicode uses this file. Each one is listed in a separate group below. It could be that the tables will match the same set of code points in other Unicode releases, or it could be purely coincidence that they happen to -be the same in Unicode $string_version, and hence may not in other versions. +be the same in Unicode $unicode_version, and hence may not in other versions. END } @@ -8418,7 +8844,7 @@ END Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway."); $comment .= < value pairs. See the documentation just @@ -8693,6 +9128,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } $has_only_code_point_maps{$addr} = 1; $table_ref{$addr} = { }; $unique_maps{$addr} = { }; + $has_dependency{$addr} = 0; $map{$addr} = Map_Table->new($name, Full_Name => $full_name{$addr}, @@ -9187,6 +9623,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } containing_range count default_map + definition delete_range description each_range @@ -9205,6 +9642,7 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } range_count ranges range_size_1 + replace_map reset_each_range set_comment set_default_map @@ -10556,8 +10994,8 @@ ea ; W ; Wide END } - if (-e 'LineBreak.txt') { - push @return, split /\n/, <<'END'; + if (-e 'LineBreak.txt' || -e 'LBsubst.txt') { + my @lb = split /\n/, <<'END'; lb ; AI ; Ambiguous lb ; AL ; Alphabetic lb ; B2 ; Break_Both @@ -10588,6 +11026,12 @@ lb ; SY ; Break_Symbols lb ; XX ; Unknown lb ; ZW ; ZWSpace END + # If this Unicode version predates the lb property, we use our + # substitute one + if (-e 'LBsubst.txt') { + $_ = s/^lb/_Perl_LB/r for @lb; + } + push @return, @lb; } if (-e 'DNormalizationProps.txt') { @@ -10735,9 +11179,6 @@ sub output_perl_charnames_line ($$) { } { # Closure - # This is used to store the range list of all the code points usable when - # the little used $compare_versions feature is enabled. - my $compare_versions_range_list; # These are constants to the $property_info hash in this subroutine, to # avoid using a quoted-string which might have a typo. @@ -10842,73 +11283,6 @@ sub output_perl_charnames_line ($$) { my $low = hex $1; my $high = (defined $2) ? hex $2 : $low; - # For the very specialized case of comparing two Unicode - # versions... - if (DEBUG && $compare_versions) { - if ($property_name eq 'Age') { - - # Only allow code points at least as old as the version - # specified. - my $age = pack "C*", split(/\./, $map); # v string - next LINE if $age gt $compare_versions; - } - else { - - # Again, we throw out code points younger than those of - # the specified version. By now, the Age property is - # populated. We use the intersection of each input range - # with this property to find what code points in it are - # valid. To do the intersection, we have to convert the - # Age property map to a Range_list. We only have to do - # this once. - if (! defined $compare_versions_range_list) { - my $age = property_ref('Age'); - if (! -e 'DAge.txt') { - croak "Need to have 'DAge.txt' file to do version comparison"; - } - elsif ($age->count == 0) { - croak "The 'Age' table is empty, but its file exists"; - } - $compare_versions_range_list - = Range_List->new(Initialize => $age); - } - - # An undefined map is always 'Y' - $map = 'Y' if ! defined $map; - - # Calculate the intersection of the input range with the - # code points that are known in the specified version - my @ranges = ($compare_versions_range_list - & Range->new($low, $high))->ranges; - - # If the intersection is empty, throw away this range - next LINE unless @ranges; - - # Only examine the first range this time through the loop. - my $this_range = shift @ranges; - - # Put any remaining ranges in the queue to be processed - # later. Note that there is unnecessary work here, as we - # will do the intersection again for each of these ranges - # during some future iteration of the LINE loop, but this - # code is not used in production. The later intersections - # are guaranteed to not splinter, so this will not become - # an infinite loop. - my $line = join ';', $property_name, $map; - foreach my $range (@ranges) { - $file->insert_adjusted_lines(sprintf("%04X..%04X; %s", - $range->start, - $range->end, - $line)); - } - - # And set things up so that the below will process this first - # range, like any other. - $low = $this_range->start; - $high = $this_range->end; - } - } # End of $compare_versions - # If changing to a new property, get the things constant per # property if ($previous_property_name ne $property_name) { @@ -11644,7 +12018,16 @@ END . $CMD_DELIM . $fields[$CHARNAME]; } - elsif ($fields[$CHARNAME] =~ /^CJK/) { + elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter + + # All the CJK ranges like this have the name given as a + # special case in the next code line. And for the others, we + # hope that Unicode continues to use the correct name in + # future releases, so we don't have to make further special + # cases. + my $name = ($fields[$CHARNAME] =~ /^CJK/) + ? 'CJK UNIFIED IDEOGRAPH' + : uc $fields[$CHARNAME]; # The name for these contains the code point itself, and all # are defined to have the same base name, regardless of what @@ -11656,7 +12039,7 @@ END . '=' . $CP_IN_NAME . $CMD_DELIM - . 'CJK UNIFIED IDEOGRAPH'; + . $name; } elsif ($fields[$CATEGORY] eq 'Co' @@ -12074,6 +12457,18 @@ sub process_GCB_test { return; } +sub process_LB_test { + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + while ($file->next_line) { + push @LB_tests, $_; + } + + return; +} + sub process_SB_test { my $file = shift; @@ -12159,6 +12554,24 @@ sub process_NamedSequences { } } +sub filter_substitute_lb { + # Used on Unicodes that predate the LB property, where there is a + # substitute file. This just does the regular ea_lb handling for such + # files, and then substitutes the long property value name for the short + # one that comes with the file. (The other break files have the long + # names in them, so this is the odd one out.) The reason for doing this + # kludge is that regen/mk_invlists.pl is expecting the long name. This + # also fixes the typo 'Inseperable' that leads to problems. + + filter_early_ea_lb; + return unless $_; + + my @fields = split /\s*;\s*/; + $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name; + $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable'; + $_ = join '; ', @fields; +} + sub filter_old_style_arabic_shaping { # Early versions used a different term for the later one. @@ -12542,6 +12955,15 @@ sub register_fraction($) { my $rational = shift; my $float = eval $rational; + $float = sprintf "%.*e", $E_FLOAT_PRECISION, $float; + if ( defined $nv_floating_to_rational{$float} + && $nv_floating_to_rational{$float} ne $rational) + { + die Carp::my_carp_bug("Both '$rational' and" + . " '$nv_floating_to_rational{$float}' evaluate to" + . " the same floating point number." + . " \$E_FLOAT_PRECISION must be increased"); + } $nv_floating_to_rational{$float} = $rational; return; } @@ -13004,9 +13426,9 @@ sub setup_script_extensions { # The Script_Extensions property starts out with a clone of the Script # property. - my $scx = property_ref("Script_Extensions"); - $scx = Property->new("scx", Full_Name => "Script_Extensions") - if ! defined $scx; + $scx = property_ref("Script_Extensions"); + return unless defined $scx; + $scx->_set_format($STRING_WHITE_SPACE_LIST); $scx->initialize($script); $scx->set_default_map($script->default_map); @@ -13242,7 +13664,7 @@ sub filter_all_caps_script_names { my ($range, $script, @remainder) = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields - my @words = split "_", $script; + my @words = split /[_-]/, $script; for my $word (@words) { $word = ucfirst(lc($word)) if $word ne 'CJK'; @@ -13316,9 +13738,9 @@ END # For each property, fill in any missing mappings, and calculate the re # match tables. If a property has more than one missing mapping, the - # default is a reference to a data structure, and requires data from other - # properties to resolve. The sort is used to cause these to be processed - # last, after all the other properties have been calculated. + # default is a reference to a data structure, and may require data from + # other properties to resolve. The sort is used to cause these to be + # processed last, after all the other properties have been calculated. # (Fortunately, the missing properties so far don't depend on each other.) foreach my $property (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 } @@ -13562,6 +13984,18 @@ numerals. END )); + # Make sure this assumption in perl core code is valid in this Unicode + # release, with known exceptions + foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) { + next if $range->end - $range->start == 9; + next if $range->start == 0x1D7CE; # This whole range was added in 3.1 + next if $range->end == 0x19DA && $v_version eq v5.2.0; + next if $range->end - $range->start < 9 && $v_version le 4.0.0; + Carp::my_carp("Range $range unexpectedly doesn't contain 10" + . " decimal digits. Code in regcomp.c assumes it does," + . " and will have to be fixed. Proceeding anyway."); + } + Property->new('Legacy_Case_Folding', File => "Fold", Directory => $map_directory, @@ -13587,7 +14021,6 @@ END # data is retained in the map table for reference, but the spurious match # tables are deleted. - my $scx = property_ref("Script_Extensions"); if (defined $scx) { foreach my $table ($scx->tables) { next unless $table->name =~ /\s/; # All the new and only the new @@ -13600,6 +14033,21 @@ END } $scx->delete_match_table($table); } + + # Mark the scx table as the parent of the corresponding sc table for + # those which are identical. This causes the pod for the script table + # to refer to the corresponding scx one. + # + # This has to be in a separate loop from above, so as to wait until + # the tables are stabilized before checking for equivalency. + if (defined $pod_directory) { + foreach my $table ($scx->tables) { + my $plain_sc_equiv = $script->table($table->name); + if ($table->matches_identically_to($plain_sc_equiv)) { + $plain_sc_equiv->set_equivalent_to($table, Related => 1); + } + } + } } return; @@ -13628,7 +14076,7 @@ sub pre_3_dot_1_Nl () { return $Nl; } -sub calculate_Assigned() { # Calculate the gc != Cn code points; may be +sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be # called before the Cn's are completely filled. # Works on Unicodes earlier than ones that # explicitly specify Cn. @@ -13702,6 +14150,290 @@ sub calculate_NChar() { # Create a Perl extension match table which is the } } +sub handle_compare_versions () { + # This fixes things up for the $compare_versions capability, where we + # compare Unicode version X with version Y (with Y > X), and we are + # running it on the Unicode Data for version Y. + # + # It works by calculating the code points whose meaning has been specified + # after release X, by using the Age property. The complement of this set + # is the set of code points whose meaning is unchanged between the + # releases. This is the set the program restricts itself to. It includes + # everything whose meaning has been specified by the time version X came + # along, plus those still unassigned by the time of version Y. (We will + # continue to use the word 'assigned' to mean 'meaning has been + # specified', as it's shorter and is accurate in all cases except the + # Noncharacter code points.) + # + # This function is run after all the properties specified by Unicode have + # been calculated for release Y. This makes sure we get all the nuances + # of Y's rules. (It is done before the Perl extensions are calculated, as + # those are based entirely on the Unicode ones.) But doing it after the + # Unicode table calculations means we have to fix up the Unicode tables. + # We do this by subtracting the code points that have been assigned since + # X (which is actually done by ANDing each table of assigned code points + # with the set of unchanged code points). Most Unicode properties are of + # the form such that all unassigned code points have a default, grab-bag, + # property value which is changed when the code point gets assigned. For + # these, we just remove the changed code points from the table for the + # latter property value, and add them back in to the grab-bag one. A few + # other properties are not entirely of this form and have values for some + # or all unassigned code points that are not the grab-bag one. These have + # to be handled specially, and are hard-coded in to this routine based on + # manual inspection of the Unicode character database. A list of the + # outlier code points is made for each of these properties, and those + # outliers are excluded from adding and removing from tables. + # + # Note that there are glitches when comparing against Unicode 1.1, as some + # Hangul syllables in it were later ripped out and eventually replaced + # with other things. + + print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS; + + my $after_first_version = "All matching code points were added after " + . "Unicode $string_compare_versions"; + + # Calculate the delta as those code points that have been newly assigned + # since the first compare version. + my $delta = Range_List->new(); + foreach my $table ($age->tables) { + use version; + next if $table == $age->table('Unassigned'); + next if version->parse($table->name) + le version->parse($string_compare_versions); + $delta += $table; + } + if ($delta->is_empty) { + die ("No changes; perhaps you need a 'DAge.txt' file?"); + } + + my $unchanged = ~ $delta; + + calculate_Assigned() if ! defined $Assigned; + $Assigned &= $unchanged; + + # $Assigned now contains the code points that were assigned as of Unicode + # version X. + + # A block is all or nothing. If nothing is assigned in it, it all goes + # back to the No_Block pool; but if even one code point is assigned, the + # block is retained. + my $no_block = $block->table('No_Block'); + foreach my $this_block ($block->tables) { + next if $this_block == $no_block + || ! ($this_block & $Assigned)->is_empty; + $this_block->set_fate($SUPPRESSED, $after_first_version); + foreach my $range ($this_block->ranges) { + $block->replace_map($range->start, $range->end, 'No_Block') + } + $no_block += $this_block; + } + + my @special_delta_properties; # List of properties that have to be + # handled specially. + my %restricted_delta; # Keys are the entries in + # @special_delta_properties; values + # are the range list of the code points + # that behave normally when they get + # assigned. + + # In the next three properties, the Default Ignorable code points are + # outliers. + calculate_DI(); + $DI &= $unchanged; + + push @special_delta_properties, property_ref('_Perl_GCB'); + $restricted_delta{$special_delta_properties[-1]} = ~ $DI; + + if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded'))) + { + push @special_delta_properties, $cwnfkcc; + $restricted_delta{$special_delta_properties[-1]} = ~ $DI; + } + + calculate_NChar(); # Non-character code points + $NChar &= $unchanged; + + # This may have to be updated from time-to-time to get the most accurate + # results. + my $default_BC_non_LtoR = Range_List->new(Initialize => + # These came from the comments in v8.0 DBidiClass.txt + [ # AL + 0x0600 .. 0x07BF, + 0x08A0 .. 0x08FF, + 0xFB50 .. 0xFDCF, + 0xFDF0 .. 0xFDFF, + 0xFE70 .. 0xFEFF, + 0x1EE00 .. 0x1EEFF, + # R + 0x0590 .. 0x05FF, + 0x07C0 .. 0x089F, + 0xFB1D .. 0xFB4F, + 0x10800 .. 0x10FFF, + 0x1E800 .. 0x1EDFF, + 0x1EF00 .. 0x1EFFF, + # ET + 0x20A0 .. 0x20CF, + ] + ); + $default_BC_non_LtoR += $DI + $NChar; + push @special_delta_properties, property_ref('BidiClass'); + $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR; + + if (defined (my $eaw = property_ref('East_Asian_Width'))) { + + my $default_EA_width_W = Range_List->new(Initialize => + # From comments in v8.0 EastAsianWidth.txt + [ + 0x3400 .. 0x4DBF, + 0x4E00 .. 0x9FFF, + 0xF900 .. 0xFAFF, + 0x20000 .. 0x2A6DF, + 0x2A700 .. 0x2B73F, + 0x2B740 .. 0x2B81F, + 0x2B820 .. 0x2CEAF, + 0x2F800 .. 0x2FA1F, + 0x20000 .. 0x2FFFD, + 0x30000 .. 0x3FFFD, + ] + ); + push @special_delta_properties, $eaw; + $restricted_delta{$special_delta_properties[-1]} + = ~ $default_EA_width_W; + + # Line break came along in the same release as East_Asian_Width, and + # the non-grab-bag default set is a superset of the EAW one. + if (defined (my $lb = property_ref('Line_Break'))) { + my $default_LB_non_XX = Range_List->new(Initialize => + # From comments in v8.0 LineBreak.txt + [ 0x20A0 .. 0x20CF ]); + $default_LB_non_XX += $default_EA_width_W; + push @special_delta_properties, $lb; + $restricted_delta{$special_delta_properties[-1]} + = ~ $default_LB_non_XX; + } + } + + # Go through every property, skipping those we've already worked on, those + # that are immutable, and the perl ones that will be calculated after this + # routine has done its fixup. + foreach my $property (property_ref('*')) { + next if $property == $perl # Done later in the program + || $property == $block # Done just above + || $property == $DI # Done just above + || $property == $NChar # Done just above + + # The next two are invariant across Unicode versions + || $property == property_ref('Pattern_Syntax') + || $property == property_ref('Pattern_White_Space'); + + # Find the grab-bag value. + my $default_map = $property->default_map; + + if (! $property->to_create_match_tables) { + + # Here there aren't any match tables. So far, all such properties + # have a default map, and don't require special handling. Just + # change each newly assigned code point back to the default map, + # as if they were unassigned. + foreach my $range ($delta->ranges) { + $property->add_map($range->start, + $range->end, + $default_map, + Replace => $UNCONDITIONALLY); + } + } + else { # Here there are match tables. Find the one (if any) for the + # grab-bag value that unassigned code points go to. + my $default_table; + if (defined $default_map) { + $default_table = $property->table($default_map); + } + + # If some code points don't go back to the the grab-bag when they + # are considered unassigned, exclude them from the list that does + # that. + my $this_delta = $delta; + my $this_unchanged = $unchanged; + if (grep { $_ == $property } @special_delta_properties) { + $this_delta = $delta & $restricted_delta{$property}; + $this_unchanged = ~ $this_delta; + } + + # Fix up each match table for this property. + foreach my $table ($property->tables) { + if (defined $default_table && $table == $default_table) { + + # The code points assigned after release X (the ones we + # are excluding in this routine) go back on to the default + # (grab-bag) table. However, some of these tables don't + # actually exist, but are specified solely by the other + # tables. (In a binary property, we don't need to + # actually have an 'N' table, as it's just the complement + # of the 'Y' table.) Such tables will be locked, so just + # skip those. + $table += $this_delta unless $table->locked; + } + else { + + # Here the table is not for the default value. We need to + # subtract the code points we are ignoring for this + # comparison (the deltas) from it. But if the table + # started out with nothing, no need to exclude anything, + # and want to skip it here anyway, so it gets listed + # properly in the pod. + next if $table->is_empty; + + # Save the deltas for later, before we do the subtraction + my $deltas = $table & $this_delta; + + $table &= $this_unchanged; + + # Suppress the table if the subtraction left it with + # nothing in it + if ($table->is_empty) { + if ($property->type == $BINARY) { + push @tables_that_may_be_empty, $table->complete_name; + } + else { + $table->set_fate($SUPPRESSED, $after_first_version); + } + } + + # Now we add the removed code points to the property's + # map, as they should now map to the grab-bag default + # property (which they did in the first comparison + # version). But we don't have to do this if the map is + # only for internal use. + if (defined $default_map && $property->to_output_map) { + + # The gc property has pseudo property values whose names + # have length 1. These are the union of all the + # property values whose name is longer than 1 and + # whose first letter is all the same. The replacement + # is done once for the longer-named tables. + next if $property == $gc && length $table->name == 1; + + foreach my $range ($deltas->ranges) { + $property->add_map($range->start, + $range->end, + $default_map, + Replace => $UNCONDITIONALLY); + } + } + } + } + } + } + + # The above code doesn't work on 'gc=C', as it is a superset of the default + # ('Cn') table. It's easiest to just special case it here. + my $C = $gc->table('C'); + $C += $gc->table('Cn'); + + return; +} + sub compile_perl() { # Create perl-defined tables. Almost all are part of the pseudo-property # named 'perl' internally to this program. Many of these are recommended @@ -13740,21 +14472,13 @@ sub compile_perl() { } my $Any = $perl->add_match_table('Any', - Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]", - ); + Description => "All Unicode code points"); $Any->add_range(0, $MAX_UNICODE_CODEPOINT); $Any->add_alias('Unicode'); calculate_Assigned(); - # Our internal-only property should be treated as more than just a - # synonym; grandfather it in to the pod. - $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1, - Fate => $INTERNAL_ONLY, Status => $DISCOURAGED) - ->set_equivalent_to(property_ref('ccc')->table('Above'), - Related => 1); - - my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]'); + my $ASCII = $perl->add_match_table('ASCII'); if (defined $block) { # This is equivalent to the block if have it. my $Unicode_ASCII = $block->table('Basic_Latin'); if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) { @@ -13814,7 +14538,6 @@ sub compile_perl() { $Lower += $temp & $Assigned; } my $Posix_Lower = $perl->add_match_table("PosixLower", - Description => "[a-z]", Initialize => $Lower & $ASCII, ); @@ -13832,7 +14555,6 @@ sub compile_perl() { $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters } my $Posix_Upper = $perl->add_match_table("PosixUpper", - Description => "[A-Z]", Initialize => $Upper & $ASCII, ); @@ -13897,56 +14619,6 @@ sub compile_perl() { $Lower->set_caseless_equivalent($cased); } - # Similarly, set up our own Case_Ignorable property if this Unicode - # version doesn't have it. From Unicode 5.1: Definition D121: A character - # C is defined to be case-ignorable if C has the value MidLetter or the - # value MidNumLet for the Word_Break property or its General_Category is - # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf), - # Modifier_Letter (Lm), or Modifier_Symbol (Sk). - - # Perl has long had an internal-only alias for this property; grandfather - # it in to the pod, but discourage its use. - my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable', - Re_Pod_Entry => 1, - Fate => $INTERNAL_ONLY, - Status => $DISCOURAGED); - my $case_ignorable = property_ref('Case_Ignorable'); - if (defined $case_ignorable && ! $case_ignorable->is_empty) { - $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'), - Related => 1); - } - else { - - $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm')); - - # The following three properties are not in early releases - $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me'); - $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf'); - $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk'); - - # For versions 4.1 - 5.0, there is no MidNumLet property, and - # correspondingly the case-ignorable definition lacks that one. For - # 4.0, it appears that it was meant to be the same definition, but was - # inadvertently omitted from the standard's text, so add it if the - # property actually is there - my $wb = property_ref('Word_Break'); - if (defined $wb) { - my $midlet = $wb->table('MidLetter'); - $perl_case_ignorable += $midlet if defined $midlet; - my $midnumlet = $wb->table('MidNumLet'); - $perl_case_ignorable += $midnumlet if defined $midnumlet; - } - else { - - # In earlier versions of the standard, instead of the above two - # properties , just the following characters were used: - $perl_case_ignorable += - ord("'") - + utf8::unicode_to_native(0xAD) # SOFT HYPHEN (SHY) - + 0x2019; # RIGHT SINGLE QUOTATION MARK - } - } - # The remaining perl defined tables are mostly based on Unicode TR 18, # "Annex C: Compatibility Properties". All of these have two versions, # one whose name generally begins with Posix that is posix-compliant, and @@ -14040,7 +14712,6 @@ sub compile_perl() { $Alpha->add_alias('Alphabetic'); } my $Posix_Alpha = $perl->add_match_table("PosixAlpha", - Description => "[A-Za-z]", Initialize => $Alpha & $ASCII, ); $Posix_Upper->set_caseless_equivalent($Posix_Alpha); @@ -14051,13 +14722,13 @@ sub compile_perl() { Initialize => $Alpha + $gc->table('Decimal_Number'), ); $perl->add_match_table("PosixAlnum", - Description => "[A-Za-z0-9]", Initialize => $Alnum & $ASCII, ); my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord', Description => '\w, including beyond ASCII;' - . ' = \p{Alnum} + \pM + \p{Pc}', + . ' = \p{Alnum} + \pM + \p{Pc}' + . ' + \p{Join_Control}', Initialize => $Alnum + $gc->table('Mark'), ); my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1 @@ -14077,7 +14748,7 @@ sub compile_perl() { # This is a Perl extension, so the name doesn't begin with Posix. my $PerlWord = $perl->add_match_table('PosixWord', - Description => '\w, restricted to ASCII = [A-Za-z0-9_]', + Description => '\w, restricted to ASCII', Initialize => $Word & $ASCII, ); $PerlWord->add_alias('PerlWord'); @@ -14094,7 +14765,6 @@ sub compile_perl() { ); $Blank->add_alias('HorizSpace'); # Another name for it. $perl->add_match_table("PosixBlank", - Description => "\\t and ' '", Initialize => $Blank & $ASCII, ); @@ -14120,7 +14790,6 @@ sub compile_perl() { $Space->add_alias('Space') if $v_version lt v4.1.0; my $Posix_space = $perl->add_match_table("PosixSpace", - Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)", Initialize => $Space & $ASCII, ); $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym @@ -14129,7 +14798,12 @@ sub compile_perl() { Description => 'Control characters'); $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1); $perl->add_match_table("PosixCntrl", - Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL", + Description => "ASCII control characters", + Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2," + . " DC3, DC4, DEL, DLE, ENQ, EOM," + . " EOT, ESC, ETB, ETX, FF, FS, GS," + . " HT, LF, NAK, NUL, RS, SI, SO," + . " SOH, STX, SUB, SYN, US, VT", Initialize => $Cntrl & $ASCII, ); @@ -14153,8 +14827,6 @@ sub compile_perl() { Initialize => ~ ($Space + $controls), ); $perl->add_match_table("PosixGraph", - Description => - '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]', Initialize => $Graph & $ASCII, ); @@ -14163,8 +14835,6 @@ sub compile_perl() { Initialize => $Blank + $Graph - $gc->table('Control'), ); $perl->add_match_table("PosixPrint", - Description => - '[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]', Initialize => $print & $ASCII, ); @@ -14179,7 +14849,6 @@ sub compile_perl() { Perl_Extension => 1 ); $perl->add_match_table('PosixPunct', Perl_Extension => 1, - Description => '[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]', Initialize => $ASCII & $XPosixPunct, ); @@ -14187,7 +14856,6 @@ sub compile_perl() { Description => '[0-9] + all other decimal digits'); $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1); my $PosixDigit = $perl->add_match_table("PosixDigit", - Description => '[0-9]', Initialize => $Digit & $ASCII, ); @@ -14202,7 +14870,6 @@ sub compile_perl() { ord('A') .. ord('F'), ord('a') .. ord('f'), 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); - $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO'); } # AHex was not present in early releases @@ -14216,7 +14883,6 @@ sub compile_perl() { $PosixXDigit->add_alias('AHex'); $PosixXDigit->add_alias('Ascii_Hex_Digit'); } - $PosixXDigit->add_description('[0-9A-Fa-f]'); my $any_folds = $perl->add_match_table("_Perl_Any_Folds", Description => "Code points that particpate in some fold", @@ -14326,33 +14992,6 @@ sub compile_perl() { Note => 'Union of all non-canonical decompositions', ); - # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier - # than SD appeared, construct it ourselves, based on the first release SD - # was in. A pod entry is grandfathered in for it - my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1, - Perl_Extension => 1, - Fate => $INTERNAL_ONLY, - Status => $DISCOURAGED); - my $soft_dotted = property_ref('Soft_Dotted'); - if (defined $soft_dotted && ! $soft_dotted->is_empty) { - $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1); - } - else { - - # This list came from 3.2 Soft_Dotted; all of these code points are in - # all releases - $CanonDCIJ->initialize([ ord('i'), - ord('j'), - 0x012F, - 0x0268, - 0x0456, - 0x0458, - 0x1E2D, - 0x1ECB, - ]); - $CanonDCIJ = $CanonDCIJ & $Assigned; - } - # For backward compatibility, Perl has its own definition for IDStart. # It is regular XID_Start plus the underscore, but all characters must be # Word characters as well @@ -14492,7 +15131,6 @@ sub compile_perl() { + ord("(") + ord(")") + ord("-") - + utf8::unicode_to_native(0xA0) # NBSP ); my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias'); @@ -14673,6 +15311,9 @@ END . $current_age->name . ' or earlier', ); + foreach my $alias ($current_age->aliases) { + $current_in->add_alias($alias->name); + } $previous_in = $current_in; # Add clarifying material for the corresponding age file. This is @@ -14782,8 +15423,8 @@ END 0x2060 .. 0x206F, 0xFE00 .. 0xFE0F, 0xFFF0 .. 0xFFFB, - 0xE0000 .. 0xE0FFF, ]); + $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0; $quotemeta += $temp; } calculate_DI(); @@ -14805,6 +15446,160 @@ END } } + # Perl tailors the WordBreak property so that \b{wb} doesn't split + # adjacent spaces into separate words. First create a copy of the regular + # WB property as '_Perl_WB'. (On Unicode releases earlier than when WB + # was defined for, this will already have been done by the substitute file + # portion for 'Input_file' code for WB.) + my $perl_wb = property_ref('_Perl_WB'); + if (! defined $perl_wb) { + $perl_wb = Property->new('_Perl_WB', + Fate => $INTERNAL_ONLY, + Perl_Extension => 1, + Directory => $map_directory, + Type => $STRING); + my $wb = property_ref('Word_Break'); + $perl_wb->initialize($wb); + $perl_wb->set_default_map($wb->default_map); + } + + # And simply replace the mappings of horizontal space characters that + # otherwise would map to the default to instead map to our tailoring. + my $default = $perl_wb->default_map; + for my $range ($Blank->ranges) { + for my $i ($range->start .. $range->end) { + next unless $perl_wb->value_of($i) eq $default; + $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace', + Replace => $UNCONDITIONALLY); + } + } + + # Create a version of the LineBreak property with the mappings that are + # omitted in the default algorithm remapped to what + # http://www.unicode.org/reports/tr14 says they should be. + # + # Original Resolved General_Category + # AI, SG, XX AL Any + # SA CM Only Mn or Mc + # SA AL Any except Mn and Mc + # CJ NS Any + # + # All property values are also written out in their long form, as + # regen/mk_invlist.pl expects that. This also fixes occurrences of the + # typo in early Unicode versions: 'inseperable'. + my $perl_lb = property_ref('_Perl_LB'); + if (! defined $perl_lb) { + $perl_lb = Property->new('_Perl_LB', + Fate => $INTERNAL_ONLY, + Perl_Extension => 1, + Directory => $map_directory, + Type => $STRING); + my $lb = property_ref('Line_Break'); + + # Populate from $lb, but use full name and fix typo. + foreach my $range ($lb->ranges) { + my $full_name = $lb->table($range->value)->full_name; + $full_name = 'Inseparable' + if standardize($full_name) eq 'inseperable'; + $perl_lb->add_map($range->start, $range->end, $full_name); + } + } + + $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL + + for my $range ($perl_lb->ranges) { + my $value = standardize($range->value); + if ( $value eq standardize('Unknown') + || $value eq standardize('Ambiguous') + || $value eq standardize('Surrogate')) + { + $perl_lb->add_map($range->start, $range->end, 'Alphabetic', + Replace => $UNCONDITIONALLY); + } + elsif ($value eq standardize('Conditional_Japanese_Starter')) { + $perl_lb->add_map($range->start, $range->end, 'Nonstarter', + Replace => $UNCONDITIONALLY); + } + elsif ($value eq standardize('Complex_Context')) { + for my $i ($range->start .. $range->end) { + my $gc_val = $gc->value_of($i); + if ($gc_val eq 'Mn' || $gc_val eq 'Mc') { + $perl_lb->add_map($i, $i, 'Combining_Mark', + Replace => $UNCONDITIONALLY); + } + else { + $perl_lb->add_map($i, $i, 'Alphabetic', + Replace => $UNCONDITIONALLY); + } + } + } + } + + # This property is a modification of the scx property + my $perl_scx = Property->new('_Perl_SCX', + Fate => $INTERNAL_ONLY, + Perl_Extension => 1, + Directory => $map_directory, + Type => $ENUM); + my $source; + + # Use scx if available; otherwise sc; if neither is there (a very old + # Unicode version, just say that everything is 'Common' + if (defined $scx) { + $source = $scx; + $perl_scx->set_default_map('Unknown'); + } + elsif (defined $script) { + $source = $script; + + # Early versions of 'sc', had everything be 'Common' + if (defined $script->table('Unknown')) { + $perl_scx->set_default_map('Unknown'); + } + else { + $perl_scx->set_default_map('Common'); + } + } else { + $perl_scx->add_match_table('Common'); + $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common'); + + $perl_scx->add_match_table('Unknown'); + $perl_scx->set_default_map('Unknown'); + } + + $perl_scx->_set_format($STRING_WHITE_SPACE_LIST); + $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these + + if (defined $source) { + $perl_scx->initialize($source); + + # UTS 39 says that the scx property should be modified for these + # countries where certain mixed scripts are commonly used. + for my $range ($perl_scx->ranges) { + my $value = $range->value; + my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi; + $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi; + $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi; + $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) } + {$1 Katakana Hiragana Jpan}xi; + $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi; + $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi; + + if ($changed) { + $value = join " ", uniques split " ", $value; + $range->set_value($value) + } + } + + foreach my $table ($source->tables) { + my $scx_table = $perl_scx->add_match_table($table->name, + Full_Name => $table->full_name); + foreach my $alias ($table->aliases) { + $scx_table->add_alias($alias->name); + } + } + } + # Here done with all the basic stuff. Ready to populate the information # about each character if annotating them. if ($annotate) { @@ -14832,7 +15627,7 @@ sub add_perl_synonyms() { # the single-form, \p{name}. These are: # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and # \p{Is_Name} as synonyms - # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms + # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no # conflict, \p{Value} and \p{Is_Value} as well @@ -14846,8 +15641,14 @@ sub add_perl_synonyms() { property_ref('*'); push @tables, $gc->tables; - # If the version of Unicode includes the Script property, add its tables - push @tables, $script->tables if defined $script; + # If the version of Unicode includes the Script Extensions (preferably), + # or Script property, add its tables + if (defined $scx) { + push @tables, $scx->tables; + } + else { + push @tables, $script->tables if defined $script; + } # The Block tables are kept separate because they are treated differently. # And the earliest versions of Unicode didn't include them, so add only if @@ -14910,18 +15711,19 @@ sub add_perl_synonyms() { if (! defined $pre_existing) { - # No name collision, so ok to add the perl synonym. + # No name collision, so OK to add the perl synonym. my $make_re_pod_entry; my $ok_as_filename; my $status = $alias->status; if ($nominal_property == $block) { - # For block properties, the 'In' form is preferred for - # external use; the pod file contains wild cards for - # this and the 'Is' form so no entries for those; and - # we don't want people using the name without the - # 'In', so discourage that. + # For block properties, only the compound form is + # preferred for external use; the others are + # discouraged. The pod file contains wild cards for + # the 'In' and 'Is' forms so no entries for those; and + # we don't want people using the name without any + # prefix, so discourage that. if ($prefix eq "") { $make_re_pod_entry = 1; $status = $status || $DISCOURAGED; @@ -14929,7 +15731,7 @@ sub add_perl_synonyms() { } elsif ($prefix eq 'In_') { $make_re_pod_entry = 0; - $status = $status || $NORMAL; + $status = $status || $DISCOURAGED; $ok_as_filename = 1; } else { @@ -15004,7 +15806,7 @@ sub add_perl_synonyms() { next; } - # Here, there is a name collision, but it still could be ok if + # Here, there is a name collision, but it still could be OK if # the tables match the identical set of code points, in which # case, we can combine the names. Compare each table's code # point list to see if they are identical. @@ -15505,7 +16307,18 @@ sub make_re_pod_entries($) { $unicode_count = $count; $non_unicode_string = ""; } + my $string_count = clarify_number($unicode_count) . $non_unicode_string; + + my $definition = $input_table->calculate_table_definition; + if ($definition) { + + # Save the definition for later use. + $input_table->set_definition($definition); + + $definition = ": $definition"; + } + my $status = $input_table->status; my $status_info = $input_table->status_info; my $caseless_equivalent = $input_table->caseless_equivalent; @@ -15708,7 +16521,7 @@ sub make_re_pod_entries($) { # And if this is a compound form name, see if there is a # single form equivalent my $single_form; - if ($table_property != $perl) { + if ($table_property != $perl && $table_property != $block) { # Special case the binary N tables, so that will print # \P{single}, but use the Y table values to populate @@ -15800,7 +16613,10 @@ sub make_re_pod_entries($) { if ($table_property != $perl && $table->perl_extension) { push @info, '(Perl extension)'; } - push @info, "($string_count)"; + my $definition = $table->definition // ""; + $definition = "" if $entry_for_first_alias; + $definition = ": $definition" if $definition; + push @info, "($string_count$definition)"; # Now, we have both the entry and info so add them to the # list of all the properties. @@ -15843,39 +16659,50 @@ sub make_ucd_table_pod_entries { : $table->parent->property; my $perl_extension = $table->perl_extension; + my $is_perl_extension_match_table_but_not_dollar_perl + = $property != $perl + && $perl_extension + && $property != $table; # Get the more official name for for perl extensions that aren't # stand-alone properties - if ($perl_extension && $property != $table) { - if ($property == $perl ||$property->type == $BINARY) { - $meaning = $table->complete_name; + if ($is_perl_extension_match_table_but_not_dollar_perl) { + if ($property->type == $BINARY) { + $meaning = $property->full_name; } else { - $meaning = $property->full_name . "=$full_name"; + $meaning = $table->parent->complete_name; } } # There are three types of info column. One for the short name, one for # the full name, and one for everything else. They mostly are the same, # so initialize in the same loop. + foreach my $info_ref (\$full_info, \$short_info, \$other_info) { - if ($perl_extension && $property != $table) { + if ($info_ref != \$full_info) { + + # The non-full name columns include the full name + $$info_ref .= $full_name; + } + + + if ($is_perl_extension_match_table_but_not_dollar_perl) { # Add the synonymous name for the non-full name entries; and to # the full-name entry if it adds extra information - if ($info_ref == \$other_info - || ($info_ref == \$short_info - && $standard_short_name ne $standard_full_name) - || standardize($meaning) ne $standard_full_name - ) { - $$info_ref .= "$meaning."; + if ( standardize($meaning) ne $standard_full_name + || $info_ref == \$other_info + || $info_ref == \$short_info) + { + my $parenthesized = $info_ref != \$full_info; + $$info_ref .= " " if $$info_ref && $parenthesized; + $$info_ref .= "(=" if $parenthesized; + $$info_ref .= "$meaning"; + $$info_ref .= ")" if $parenthesized; + $$info_ref .= "."; } } - elsif ($info_ref != \$full_info) { - - # Otherwise, the non-full name columns include the full name - $$info_ref .= $full_name; - } # And the full-name entry includes the short name, if shorter if ($info_ref == \$full_info @@ -15893,8 +16720,23 @@ sub make_ucd_table_pod_entries { } } + my $definition; + my $definition_table; + my $type = $table->property->type; + if ($type == $BINARY || $type == $FORCED_BINARY) { + $definition_table = $table->property->table('Y'); + } + elsif ($table->isa('Match_Table')) { + $definition_table = $table; + } + + $definition = $definition_table->calculate_table_definition + if defined $definition_table + && $definition_table != 0; + # Add any extra annotations to the full name entry foreach my $more_info ($table->description, + $definition, $table->note, $table->status_info) { @@ -15970,7 +16812,7 @@ sub make_ucd_table_pod_entries { Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway."); } - # We modifiy the info column of the one being output to + # We modify the info column of the one being output to # indicate the ambiguity. Set $which to point to that one's # info. my $which; @@ -16032,27 +16874,69 @@ sub pod_alphanumeric_sort { return -1 } - # Determine if the two operands are numeric property values or not. - # A numeric property will look like xyz: 3. But the number - # can begin with an optional minus sign, and may have a - # fraction or rational component, like xyz: 3/2. If either - # isn't numeric, use alphabetic sort. - my ($a_initial, $a_number) = - ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); - return $a cmp $b unless defined $a_number; - my ($b_initial, $b_number) = - ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix); - return $a cmp $b unless defined $b_number; - - # Here they are both numeric, but use alphabetic sort if the - # initial parts don't match - return $a cmp $b if $a_initial ne $b_initial; + # Determine if the two operands are compound or not, and if so if are + # "numeric" property values or not, like \p{Age: 3.0}. But there are also + # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0}, + # all of which this considers numeric, and for sorting, looks just at the + # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}. + my $split_re = qr/ + ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the + # property name + [:=] \s* # The syntax for the compound form + (?: # followed by ... + ( # $2 gets defined if what follows is a "numeric" + # expression, which is ... + ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational + # number, optionally signed + | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either + # of these go into $3 + | ( V \d+ _ \d+ ) # or a Unicode's Age property version + # number, into $4 + ) + | .* $ # If not "numeric", accept anything so that $1 gets + # defined if it is any compound form + ) /ix; + my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re); + my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re); + + # Sort alphabeticlly on the whole property name if either operand isn't + # compound, or they differ. + return $a cmp $b if ! defined $a_initial + || ! defined $b_initial + || $a_initial ne $b_initial; + + if (! defined $a_numeric) { + + # If neither is numeric, use alpha sort + return $a cmp $b if ! defined $b_numeric; + return 1; # Sort numeric ahead of alpha + } + + # Here $a is numeric + return -1 if ! defined $b_numeric; # Numeric sorts before alpha + + # Here they are both numeric in the same property. + # Convert version numbers into regular numbers + if (defined $a_version) { + ($a_number = $a_version) =~ s/^V//i; + $a_number =~ s/_/./; + } + else { # Otherwise get rid of the, e.g., CCC in CCC9 */ + $a_number =~ s/ ^ [[:alpha:]]+ //x; + } + if (defined $b_version) { + ($b_number = $b_version) =~ s/^V//i; + $b_number =~ s/_/./; + } + else { + $b_number =~ s/ ^ [[:alpha:]]+ //x; + } # Convert rationals to floating for the comparison. $a_number = eval $a_number if $a_number =~ qr{/}; $b_number = eval $b_number if $b_number =~ qr{/}; - return $a_number <=> $b_number; + return $a_number <=> $b_number || $a cmp $b; } sub make_pod () { @@ -16076,20 +16960,22 @@ sub make_pod () { '\p{Block: *}' . (($has_In_conflicts) ? " $exception_message" - : "")); + : ""), + $DISCOURAGED); @block_warning = << "END"; -Matches in the Block property have shortcuts that begin with "In_". For -example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For -backward compatibility, if there is no conflict with another shortcut, these -may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there -are numerous such conflicting shortcuts. Use of these forms for Block is -discouraged, and are flagged as such, not only because of the potential -confusion as to what is meant, but also because a later release of Unicode may -preempt the shortcut, and your program would no longer be correct. Use the -"In_" form instead to avoid this, or even more clearly, use the compound form, -e.g., C<\\p{blk:latin1}>. See L for more information -about this. +In particular, matches in the Block property have single forms +defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at +all, Like all B forms, these are not stable. For example, +C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>, +C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may +come along that would force Perl to change the meaning of one or more of +these, and your program would no longer be correct. Currently there are no +such conflicts with the form that begins C<"In_">, but there are many with the +other two shortcuts, and Unicode continues to define new properties that begin +with C<"In">, so it's quite possible that a conflict will occur in the future. +The compound form is guaranteed to not become obsolete, and its meaning is +clearer anyway. See L for more information about this. END } my $text = $Is_flags_text; @@ -16223,7 +17109,7 @@ END push @bad_re_properties, "\n=back\n"; } - # Similiarly, generate a list of files that we don't use, grouped by the + # Similarly, generate a list of files that we don't use, grouped by the # reasons why (Don't output if the reason is empty). First, create a hash # whose keys are the reasons, and whose values are anonymous arrays of all # the files that share that reason. @@ -16276,6 +17162,7 @@ END $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO') . "\n" . $ucd_pod; + my $space_hex = sprintf("%02x", ord " "); local $" = ""; # Everything is ready to assemble. @@ -16290,7 +17177,7 @@ To change this file, edit $0 instead. =head1 NAME -$pod_file - Index of Unicode Version $string_version character properties in Perl +$pod_file - Index of Unicode Version $unicode_version character properties in Perl =head1 DESCRIPTION @@ -16303,7 +17190,7 @@ Perl can provide access to all non-provisional Unicode character properties, though not all are enabled by default. The omitted ones are the Unihan properties (accessible via the CPAN module L) and certain deprecated or Unicode-internal properties. (An installation may choose to -recompile Perl's tables to change this. See L.) For most purposes, access to Unicode properties from the Perl core is through @@ -16336,14 +17223,16 @@ constructs, both single and compound forms. B consist of two components, separated by an equals sign or a colon. The first component is the property name, and the second component is the particular value of the property to match against, for example, -C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters -whose Script property value is Greek. +C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean +to match characters whose Script_Extensions property value is Greek. +(C is an improved version of the C