X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b620bb0293209eb0e8f635a6fff1c3ed761df431..961552db402b7504b032a784781af2f00b2540cd:/lib/unicore/mktables diff --git a/lib/unicore/mktables b/lib/unicore/mktables index 2da7bb3..8a4100c 100644 --- a/lib/unicore/mktables +++ b/lib/unicore/mktables @@ -488,8 +488,9 @@ my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/'; # # croak("Weird Canonical Decomposition of U+$h"); # -# Simply change to a carp. It will compile, but will not know about any three -# character decomposition. +# Simply comment it out. It will compile, but will not know about any three +# character decompositions. If using the .pm version, there is a similar +# line. # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out # that the reason is that the CJK block starting at 4E00 was removed from @@ -1466,6 +1467,8 @@ my $has_hangul_syllables = 0; my $needing_code_points_ending_in_code_point = 0; my @backslash_X_tests; # List of tests read in for testing \X +my @SB_tests; # List of tests read in for testing \b{sb} +my @WB_tests; # List of tests read in for testing \b{wb} my @unhandled_properties; # Will contain a list of properties found in # the input that we didn't process. my @match_properties; # Properties that have match tables, to be @@ -2287,6 +2290,8 @@ sub trace { return main::trace(@_); } $file{$addr} = main::internal_file_to_platform(shift); $first_released{$addr} = shift; + undef $file{$addr} if $first_released{$addr} gt $v_version; + # The rest of the arguments are key => value pairs # %constructor_fields has been set up earlier to list all possible # ones. Either set or push, depending on how the default has been set @@ -2335,7 +2340,7 @@ sub trace { return main::trace(@_); } # including its reason if ($skip{$addr}) { $optional{$addr} = 1; - $skipped_files{$file{$addr}} = $skip{$addr} + $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr}; } elsif ($properties{$addr}) { @@ -2416,7 +2421,8 @@ sub trace { return main::trace(@_); } # than this Unicode version), and isn't there. This means if someone # copies it into an earlier version's directory, we will go ahead and # process it. - return if $first_released{$addr} gt $v_version && ! -e $file; + return if $first_released{$addr} gt $v_version + && (! defined $file || ! -e $file); # If in debugging mode and this file doesn't have the non-skip # flag set, and isn't one of the critical files, skip it. @@ -8896,13 +8902,13 @@ sub trace { return main::trace(@_) if main::DEBUG && $to_trace } package main; sub display_chr { - # Converts an ordinal character value to a displayable string, using a - # NBSP to hold combining characters. + # Converts an ordinal printable character value to a displayable + # string, using a dotted circle to hold combining characters. my $ord = shift; my $chr = chr $ord; return $chr if $ccc->table(0)->contains($ord); - return chr(utf8::unicode_to_native(0xA0)) . $chr; + return "\x{25CC}$chr"; } sub join_lines($) { @@ -11721,6 +11727,30 @@ sub process_GCB_test { return; } +sub process_SB_test { + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + while ($file->next_line) { + push @SB_tests, $_; + } + + return; +} + +sub process_WB_test { + + my $file = shift; + Carp::carp_extra_args(\@_) if main::DEBUG && @_; + + while ($file->next_line) { + push @WB_tests, $_; + } + + return; +} + sub process_NamedSequences { # NamedSequences.txt entries are just added to an array. Because these # don't look like the other tables, they have their own handler. @@ -14119,107 +14149,6 @@ sub compile_perl() { + utf8::unicode_to_native(0xA0) # NBSP ); - # These two tables are for matching \X, which is based on the 'extended' - # grapheme cluster, which came in 5.1; create empty ones if not already - # present. The straight 'grapheme cluster' (non-extended) is used prior - # to 5.1, and differs from the extended (see - # http://www.unicode.org/reports/tr29/) only by these two tables, so we - # get the older definition automatically when they are empty. - my $gcb = property_ref('Grapheme_Cluster_Break'); - my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend', - Perl_Extension => 1, - Fate => $INTERNAL_ONLY); - if (defined (my $gcb_prepend = $gcb->table('Prepend'))) { - $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1); - } - else { - push @tables_that_may_be_empty, $perl_prepend->complete_name; - } - - # All the tables with _X_ in their names are used in defining \X handling, - # and are based on the Unicode GCB property. Basically, \X matches: - # CR LF - # | Prepend* Begin Extend* - # | . - # Begin is: ( Special_Begin | ! Control ) - # Begin is also: ( Regular_Begin | Special_Begin ) - # where Regular_Begin is defined as ( ! Control - Special_Begin ) - # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable ) - # Extend is: ( Grapheme_Extend | Spacing_Mark ) - # Control is: [ GCB_Control | CR | LF ] - # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) )) - - foreach my $gcb_name (qw{ L V T LV LVT }) { - - # The perl internal extension's name is the gcb table name prepended - # with an '_X_' - my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name, - Perl_Extension => 1, - Fate => $INTERNAL_ONLY, - Initialize => $gcb->table($gcb_name), - ); - # Version 1 had mostly different Hangul syllables that were removed - # from later versions, so some of the tables may not apply. - if ($v_version lt v2.0) { - push @tables_that_may_be_empty, $perl_table->complete_name; - } - } - - # More GCB. Populate a combined hangul syllables table - my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V', - Perl_Extension => 1, - Fate => $INTERNAL_ONLY); - $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V'); - $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V'); - - my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1, - Fate => $INTERNAL_ONLY); - if ($v_version ge v6.2) { - $ri += $gcb->table('RI'); - } - else { - push @tables_that_may_be_empty, $ri->full_name; - } - - my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start', - Perl_Extension => 1, - Fate => $INTERNAL_ONLY, - Initialize => $lv_lvt_v - + $gcb->table('L') - + $gcb->table('T') - + $ri - ); - $specials_begin->add_comment(join_lines( <add_match_table('_X_Regular_Begin', - Perl_Extension => 1, - Fate => $INTERNAL_ONLY, - Initialize => ~ $gcb->table('Control') - - $specials_begin - - $gcb->table('CR') - - $gcb->table('LF') - ); - $regular_begin->add_comment(join_lines( <add_match_table('_X_Extend', Perl_Extension => 1, - Fate => $INTERNAL_ONLY, - Initialize => $gcb->table('Extend') - ); - if (defined (my $sm = $gcb->table('SpacingMark'))) { - $extend += $sm; - } - $extend->add_comment('For use in \X; matches: Extend | SpacingMark'); - - # End of GCB \X processing - my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias'); if (@named_sequences) { @@ -16260,6 +16189,10 @@ controlling lists contained in the program C<\$Config{privlib}>/F and then re-compiling and installing. (C<\%Config> is available from the Config module). +Also, perl can be recompiled to operate on an earlier version of the Unicode +standard. Further information is at +C<\$Config{privlib}>/F. + =head1 Other information in the Unicode data base The Unicode data base is delivered in two different formats. The XML version @@ -17883,6 +17816,8 @@ sub make_property_test_script() { , @output, (map {"Test_GCB('$_');\n"} @backslash_X_tests), + (map {"Test_SB('$_');\n"} @SB_tests), + (map {"Test_WB('$_');\n"} @WB_tests), "Finished();\n" ]); @@ -18195,10 +18130,10 @@ my @input_file_objects = ( Skip => 'Validation Tests', ), Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0, - Skip => 'Validation Tests', + Handler => \&process_SB_test, ), Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0, - Skip => 'Validation Tests', + Handler => \&process_WB_test, ), Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0, Property => 'Sentence_Break', @@ -18762,6 +18697,7 @@ sub _test_break($$) { my @should_match = map { eval "\"$_\"" } @should_display; # If a string can be represented in both non-ut8 and utf8, test both cases + my $display_upgrade = ""; UPGRADE: for my $to_upgrade (0 .. 1) { @@ -18771,8 +18707,54 @@ sub _test_break($$) { next UPGRADE if utf8::is_utf8($string); utf8::upgrade($string); + $display_upgrade = " (utf8-upgraded)"; + } + + # The /l modifier has C after it to indicate the locale to try + my @modifiers = qw(a aa d lC u i); + push @modifiers, "l$utf8_locale" if defined $utf8_locale; + + # Test for each of the regex modifiers. + for my $modifier (@modifiers) { + my $display_locale = ""; + + # For /l, set the locale to what it says to. + if ($modifier =~ / ^ l (.*) /x) { + my $locale = $1; + $display_locale = "(locale = $locale)"; + use Config; + if (defined $Config{d_setlocale}) { + eval { require POSIX; import POSIX 'locale_h'; }; + if (defined &POSIX::LC_CTYPE) { + POSIX::setlocale(&POSIX::LC_CTYPE, $locale); + } + } + $modifier = 'l'; + } + + no warnings qw(locale regexp surrogate); + my $pattern = "(?$modifier:$break_pattern)"; + + # Actually do the test + my $matched = $string =~ qr/$pattern/; + print "not " unless $matched; + + # Fancy display of test results + $matched = ($matched) ? "matched" : "failed to match"; + print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n"; + + # Repeat with the first \B{} in the pattern. This makes sure the + # code in regexec.c:find_byclass() for \B gets executed + if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) { + my $B_pattern = "$1$2"; + $matched = $string =~ qr/$B_pattern/; + print "not " unless $matched; + print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n"; + } } + next if $break_type ne 'gcb'; + # Finally, do the \X match. my @matches = $string =~ /(\X)/g; @@ -18817,6 +18799,14 @@ sub Test_GCB($) { _test_break(shift, 'gcb'); } +sub Test_SB($) { + _test_break(shift, 'sb'); +} + +sub Test_WB($) { + _test_break(shift, 'wb'); +} + sub Finished() { print "1..$Tests\n"; exit($Fails ? -1 : 0);