X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/0911a63df6aeaa6d22129e5be33256d3ea1235cc..10914c783fe2ea3ee73a870599f30cedb7de96d0:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index c0ad2f1..e15a37e 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -839,6 +839,26 @@ if ($v_version ge v5.2.0) { } } +# Enum values for to_output_map() method in the Map_Table package. +my $EXTERNAL_MAP = 1; +my $INTERNAL_MAP = 2; + +# To override computed values for writing the map tables for these properties. +# The default for enum map tables is to write them out, so that the Unicode +# .txt files can be removed, but all the data to compute any property value +# for any code point is available in a more compact form. +my %global_to_output_map = ( + # Needed by UCD.pm, but don't want to publicize that it exists, so won't + # get stuck supporting it if things change. Since it is a STRING property, + # it normally would be listed in the pod, but INTERNAL_MAP suppresses + # that. + Unicode_1_Name => $INTERNAL_MAP, + + Present_In => 0, # Suppress, as easily computed from Age + Canonical_Combining_Class => 0, # Duplicate of CombiningClass.pl + Block => 0, # Suppress, as Blocks.txt is retained. +); + # Properties that this program ignores. my @unimplemented_properties = ( 'Unicode_Radical_Stroke' # Remove if changing to handle this one. @@ -856,7 +876,7 @@ my %why_obsolete; # Documentation only my $other_properties = 'other properties'; my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone"; - my $why_no_expand = "Deprecated by Unicode: less useful than UTF-specific calculations", + my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. 
For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character."; %why_deprecated = ( 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)', @@ -880,7 +900,6 @@ my %why_obsolete; # Documentation only 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize', 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2', - 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames", 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold", 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo", @@ -1072,10 +1091,6 @@ my $DEVELOPMENT_ONLY=<<"EOF"; EOF -# Enum values for to_output_map() method in the Map_Table package. -my $EXTERNAL_MAP = 1; -my $INTERNAL_MAP = 2; - my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF"; my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING; my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1; @@ -5580,6 +5595,8 @@ sub trace { return main::trace(@_); } return $to_output_map{$addr} if defined $to_output_map{$addr}; my $full_name = $self->full_name; + return $global_to_output_map{$full_name} + if defined $global_to_output_map{$full_name}; # If table says to output, do so; if says to suppress it, do so. 
return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties; @@ -10253,7 +10270,7 @@ END # http://www.unicode.org/versions/corrigendum8.html $fields[$BIDI] = "AL"; } - elsif ($^V lt v5.15.0) { # For 5.16 will convert to use Unicode's name + elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name $fields[$CHARNAME] = ""; } @@ -10371,105 +10388,147 @@ sub filter_arabic_shaping_line { return; } -sub setup_special_casing { - # SpecialCasing.txt contains the non-simple case change mappings. The - # simple ones are in UnicodeData.txt, which should already have been read - # in to the full property data structures, so as to initialize these with - # the simple ones. Then the SpecialCasing.txt entries overwrite the ones - # which have different full mappings. - - # This routine sees if the simple mappings are to be output, and if so, - # copies what has already been put into the full mapping tables, while - # they still contain only the simple mappings. - - # The reason it is done this way is that the simple mappings are probably - # not going to be output, so it saves work to initialize the full tables - # with the simple mappings, and then overwrite those relatively few - # entries in them that have different full mappings, and thus skip the - # simple mapping tables altogether. - - my $file= shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; +{ # Closure + my $lc; # Table for lowercase mapping + my $tc; + my $uc; + + sub setup_special_casing { + # SpecialCasing.txt contains the non-simple case change mappings. The + # simple ones are in UnicodeData.txt, which should already have been + # read in to the full property data structures, so as to initialize + # these with the simple ones. Then the SpecialCasing.txt entries + # overwrite the ones which have different full mappings. 
+ + # This routine sees if the simple mappings are to be output, and if + # so, copies what has already been put into the full mapping tables, + # while they still contain only the simple mappings. + + # The reason it is done this way is that the simple mappings are + # probably not going to be output, so it saves work to initialize the + # full tables with the simple mappings, and then overwrite those + # relatively few entries in them that have different full mappings, + # and thus skip the simple mapping tables altogether. + + my $file= shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; - # For each of the case change mappings... - foreach my $case ('lc', 'tc', 'uc') { - my $full = property_ref($case); - unless (defined $full && ! $full->is_empty) { - Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); + $lc = property_ref('lc'); + $tc = property_ref('tc'); + $uc = property_ref('uc'); + + # For each of the case change mappings... + foreach my $case_table ($lc, $tc, $uc) { + my $case = $case_table->name; + my $full = property_ref($case); + unless (defined $full && ! $full->is_empty) { + Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated."); + } + + # The simple version's name in each mapping merely has an 's' in + # front of the full one's + my $simple = property_ref('s' . 
$case); + $simple->initialize($full) if $simple->to_output_map(); + + my $simple_only = Property->new("_s$case", + Type => $STRING, + Default_Map => $CODE_POINT, + Perl_Extension => 1, + Description => "The simple mappings for $case for code points that have full mappings as well"); + $simple_only->set_to_output_map($INTERNAL_MAP); + $simple_only->add_comment(join_lines( <initialize($full) if $simple->to_output_map(); + return; } - return; -} + sub filter_special_casing_line { + # Change the format of $_ from SpecialCasing.txt into something that + # the generic handler understands. Each input line contains three + # case mappings. This will generate three lines to pass to the + # generic handler for each of those. -sub filter_special_casing_line { - # Change the format of $_ from SpecialCasing.txt into something that the - # generic handler understands. Each input line contains three case - # mappings. This will generate three lines to pass to the generic handler - # for each of those. - - # The input syntax (after stripping comments and trailing white space is - # like one of the following (with the final two being entries that we - # ignore): - # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S - # 03A3; 03C2; 03A3; 03A3; Final_Sigma; - # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE - # Note the trailing semi-colon, unlike many of the input files. That - # means that there will be an extra null field generated by the split + # The input syntax (after stripping comments and trailing white space + # is like one of the following (with the final two being entries that + # we ignore): + # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S + # 03A3; 03C2; 03A3; 03A3; Final_Sigma; + # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE + # Note the trailing semi-colon, unlike many of the input files. 
That + # means that there will be an extra null field generated by the split - my $file = shift; - Carp::carp_extra_args(\@_) if main::DEBUG && @_; + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; - my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields + my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null + # fields + + # field #4 is when this mapping is conditional. If any of these get + # implemented, it would be by hard-coding in the casing functions in + # the Perl core, not through tables. But if there is a new condition + # we don't know about, output a warning. We know about all the + # conditions through 6.0 + if ($fields[4] ne "") { + my @conditions = split ' ', $fields[4]; + if ($conditions[0] ne 'tr' # We know that these languages have + # conditions, and some are multiple + && $conditions[0] ne 'az' + && $conditions[0] ne 'lt' + + # And, we know about a single condition Final_Sigma, but + # nothing else. + && ($v_version gt v5.2.0 + && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) + { + $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); + } + elsif ($conditions[0] ne 'Final_Sigma') { - # field #4 is when this mapping is conditional. If any of these get - # implemented, it would be by hard-coding in the casing functions in the - # Perl core, not through tables. But if there is a new condition we don't - # know about, output a warning. We know about all the conditions through - # 6.0 - if ($fields[4] ne "") { - my @conditions = split ' ', $fields[4]; - if ($conditions[0] ne 'tr' # We know that these languages have - # conditions, and some are multiple - && $conditions[0] ne 'az' - && $conditions[0] ne 'lt' - - # And, we know about a single condition Final_Sigma, but - # nothing else. 
- && ($v_version gt v5.2.0 - && (@conditions > 1 || $conditions[0] ne 'Final_Sigma'))) - { - $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore"); + # Don't print out a message for Final_Sigma, because we + # have hard-coded handling for it. (But the standard + # could change what the rule should be, but it wouldn't + # show up here anyway.) + + print "# SKIPPING Special Casing: $_\n" + if $verbosity >= $VERBOSE; + } + $_ = ""; + return; + } + elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { + $file->carp_bad_line('Extra fields'); + $_ = ""; + return; } - elsif ($conditions[0] ne 'Final_Sigma') { - # Don't print out a message for Final_Sigma, because we have - # hard-coded handling for it. (But the standard could change - # what the rule should be, but it wouldn't show up here - # anyway. + $_ = "$fields[0]; lc; $fields[1]"; + $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]"); + $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]"); - print "# SKIPPING Special Casing: $_\n" - if $verbosity >= $VERBOSE; + # Copy any simple case change to the special tables constructed if + # being overridden by a multi-character case change. 
+ if ($fields[1] ne $fields[0] + && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT) + { + $file->insert_adjusted_lines("$fields[0]; _slc; $value"); } - $_ = ""; - return; - } - elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) { - $file->carp_bad_line('Extra fields'); - $_ = ""; + if ($fields[2] ne $fields[0] + && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT) + { + $file->insert_adjusted_lines("$fields[0]; _stc; $value"); + } + if ($fields[3] ne $fields[0] + && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT) + { + $file->insert_adjusted_lines("$fields[0]; _suc; $value"); + } + return; } - - $_ = "$fields[0]; lc; $fields[1]"; - $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]"); - $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]"); - - return; } sub filter_old_style_case_folding { @@ -10551,16 +10610,18 @@ sub filter_old_style_case_folding { } # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase - # I are all full foldings - if ($type eq 'C' || $type eq 'F' || $type eq 'I') { - $_ = "$range; Case_Folding; $map"; + # I are all full foldings; S is single-char. For S, there is always + # an F entry, so we must allow multiple values for the same code + # point. Fortunately this table doesn't need further manipulation + # which would preclude using multiple-values. The S is now included + # so that _swash_inversion_hash() is able to construct closures + # without having to worry about F mappings. 
+ if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') { + $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map"; } else { $_ = ""; - if ($type ne 'S') { - $file->carp_bad_line('Expecting C F I S or T in second field'); - return; - } + $file->carp_bad_line('Expecting C F I S or T in second field'); } # C and S are simple foldings, but simple case folding is not needed @@ -11491,8 +11552,8 @@ sub compile_perl() { ); $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym my $PerlSpace = $perl->add_match_table('PerlSpace', - Description => '\s, restricted to ASCII', - Initialize => $XPerlSpace & $ASCII, + Description => '\s, restricted to ASCII = [ \f\n\r\t]', + Initialize => $XPerlSpace & $ASCII, ); @@ -11571,10 +11632,17 @@ sub compile_perl() { 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]); $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO'); } - $perl->add_match_table('PosixXDigit', - Initialize => $ASCII & $Xdigit, - Description => '[0-9A-Fa-f]', - ); + + # AHex was not present in early releases + my $PosixXDigit = $perl->add_match_table('PosixXDigit'); + my $AHex = property_ref('ASCII_Hex_Digit'); + if (defined $AHex && ! $AHex->is_empty) { + $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1); + } + else { + $PosixXDigit->initialize($Xdigit & $ASCII); + } + $PosixXDigit->add_description('[0-9A-Fa-f]'); my $dt = property_ref('Decomposition_Type'); $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical', @@ -12971,7 +13039,7 @@ Perl extension. There is some detail about Blocks, Scripts, General_Category, and Bidi_Class in L, but to find out about the intricacies of the Unicode properties, refer to the Unicode standard. A good starting place is L<$unicode_reference_url>. More information on the Perl extensions is in -L. +L. Note that you can define your own properties; see L. 
@@ -13036,8 +13104,8 @@ adjacent to (but within) the braces and the colon or equal sign. =back -Some properties are considered obsolete, but still available. There are -several varieties of obsolescence: +Some properties are considered obsolete by Unicode, but still available. +There are several varieties of obsolescence: =over 4 @@ -13057,8 +13125,8 @@ table. =item Deprecated An obsolete property may be deprecated, perhaps because its original intent -has been replaced by another property or because its specification was somehow -defective. This means that its use is strongly +has been replaced by another property, or because its specification was +somehow defective. This means that its use is strongly discouraged, so much so that a warning will be issued if used, unless the regular expression is in the scope of a C> statement. $A_bold_deprecated flags each such entry in the table, and @@ -13167,6 +13235,7 @@ $zero_matches A few properties are accessible in Perl via various function calls only. These are: + Lowercase_Mapping lc() and lcfirst() Titlecase_Mapping ucfirst() Uppercase_Mapping uc() @@ -13191,9 +13260,11 @@ the properties are listed enclosed in (parentheses). =back -An installation can choose to allow any of these to be matched by changing the +An installation can choose to allow any of these to be matched by downloading +the Unicode database from L to +C<\$Config{privlib}>/F in the Perl source tree, changing the controlling lists contained in the program -C<\$Config{privlib}>/F and then re-running F. +C<\$Config{privlib}>/F and then re-compiling and installing. (C<\%Config> is available from the Config module). =head1 Files in the I directory (for serious hackers only) @@ -13201,33 +13272,19 @@ C<\$Config{privlib}>/F and then re-running F. All Unicode properties are really mappings (in the mathematical sense) from code points to their respective values. 
As part of its build process, Perl constructs tables containing these mappings for all properties that it -deals with. But only a few of these are written out into files. +deals with. Some, but not all, of these are written out into files. Those written out are in the directory C<\$Config{privlib}>/F (%Config is available from the Config module). -Those ones written are ones needed by Perl internally during execution, or for -which there is some demand, and those for which there is no access through the -Perl core. Generally, properties that can be used in regular expression -matching do not have their map tables written, like Script. Nor are the -simplistic properties that have a better, more complete version, such as -Simple_Uppercase_Mapping (Uppercase_Mapping is written instead). - -None of the properties in the I directory are currently directly -accessible through the Perl core, although some may be accessed indirectly. -For example, the uc() function implements the Uppercase_Mapping property and -uses the F file found in this directory. - -The available files in the current installation, with their properties (short -names in parentheses), and any flags or comments about them, are: +Perl reserves the right to change the format and even the existence of any of +those files without notice, except the ones that were in existence prior to +release 5.13. If those change, a deprecation cycle will be done first. These +are: @map_tables_actually_output -An installation can choose to change which files are generated by changing the -controlling lists contained in the program -C<\$Config{privlib}>/F and then re-running F. - -Each of these files defines two hash entries to help reading programs decipher -it. One of them looks like this: +Each of the files in this directory defines two hash entries to help reading +programs decipher it. One of them looks like this: \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';