#
# croak("Weird Canonical Decomposition of U+$h");
#
-# Simply change to a carp. It will compile, but will not know about any three
-# character decomposition.
+# Simply comment it out. It will compile, but will not know about any three
+# character decompositions. If using the .pm version, there is a similar
+# line.
# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
# that the reason is that the CJK block starting at 4E00 was removed from
my $needing_code_points_ending_in_code_point = 0;
my @backslash_X_tests; # List of tests read in for testing \X
+my @SB_tests; # List of tests read in for testing \b{sb}
+my @WB_tests; # List of tests read in for testing \b{wb}
my @unhandled_properties; # Will contain a list of properties found in
# the input that we didn't process.
my @match_properties; # Properties that have match tables, to be
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
+ undef $file{$addr} if $first_released{$addr} gt $v_version;
+
# The rest of the arguments are key => value pairs
# %constructor_fields has been set up earlier to list all possible
# ones. Either set or push, depending on how the default has been set
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $skipped_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr};
}
elsif ($properties{$addr}) {
# than this Unicode version), and isn't there. This means if someone
# copies it into an earlier version's directory, we will go ahead and
# process it.
- return if $first_released{$addr} gt $v_version && ! -e $file;
+ return if $first_released{$addr} gt $v_version
+ && (! defined $file || ! -e $file);
# If in debugging mode and this file doesn't have the non-skip
# flag set, and isn't one of the critical files, skip it.
package main;
sub display_chr {
- # Converts an ordinal character value to a displayable string, using a
- # NBSP to hold combining characters.
+ # Converts an ordinal printable character value to a displayable
+ # string, using a dotted circle to hold combining characters.
my $ord = shift;
my $chr = chr $ord;
return $chr if $ccc->table(0)->contains($ord);
- return chr(utf8::unicode_to_native(0xA0)) . $chr;
+ return "\x{25CC}$chr";
}
sub join_lines($) {
return;
}
+sub process_SB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @SB_tests, $_;
+ }
+
+ return;
+}
+
+sub process_WB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @WB_tests, $_;
+ }
+
+ return;
+}
+
sub process_NamedSequences {
# NamedSequences.txt entries are just added to an array. Because these
# don't look like the other tables, they have their own handler.
+ utf8::unicode_to_native(0xA0) # NBSP
);
- # These two tables are for matching \X, which is based on the 'extended'
- # grapheme cluster, which came in 5.1; create empty ones if not already
- # present. The straight 'grapheme cluster' (non-extended) is used prior
- # to 5.1, and differs from the extended (see
- # http://www.unicode.org/reports/tr29/) only by these two tables, so we
- # get the older definition automatically when they are empty.
- my $gcb = property_ref('Grapheme_Cluster_Break');
- my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
- $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
- }
- else {
- push @tables_that_may_be_empty, $perl_prepend->complete_name;
- }
-
- # All the tables with _X_ in their names are used in defining \X handling,
- # and are based on the Unicode GCB property. Basically, \X matches:
- # CR LF
- # | Prepend* Begin Extend*
- # | .
- # Begin is: ( Special_Begin | ! Control )
- # Begin is also: ( Regular_Begin | Special_Begin )
- # where Regular_Begin is defined as ( ! Control - Special_Begin )
- # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- # Extend is: ( Grapheme_Extend | Spacing_Mark )
- # Control is: [ GCB_Control | CR | LF ]
- # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- foreach my $gcb_name (qw{ L V T LV LVT }) {
-
- # The perl internal extension's name is the gcb table name prepended
- # with an '_X_'
- my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table($gcb_name),
- );
- # Version 1 had mostly different Hangul syllables that were removed
- # from later versions, so some of the tables may not apply.
- if ($v_version lt v2.0) {
- push @tables_that_may_be_empty, $perl_table->complete_name;
- }
- }
-
- # More GCB. Populate a combined hangul syllables table
- my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
-
- my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if ($v_version ge v6.2) {
- $ri += $gcb->table('RI');
- }
- else {
- push @tables_that_may_be_empty, $ri->full_name;
- }
-
- my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $lv_lvt_v
- + $gcb->table('L')
- + $gcb->table('T')
- + $ri
- );
- $specials_begin->add_comment(join_lines( <<END
-For use in \\X; matches first (perhaps only) character of potential
-multi-character sequences that can begin an extended grapheme cluster. They
-need special handling because of their complicated nature.
-END
- ));
- my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => ~ $gcb->table('Control')
- - $specials_begin
- - $gcb->table('CR')
- - $gcb->table('LF')
- );
- $regular_begin->add_comment(join_lines( <<END
-For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster, except those that require special handling.
-END
- ));
-
- my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table('Extend')
- );
- if (defined (my $sm = $gcb->table('SpacingMark'))) {
- $extend += $sm;
- }
- $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
-
- # End of GCB \X processing
-
my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
if (@named_sequences) {
C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
+Also, perl can be recompiled to operate on an earlier version of the Unicode
+standard. Further information is at
+C<\$Config{privlib}>/F<unicore/README.perl>.
+
=head1 Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version
<DATA>,
@output,
(map {"Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {"Test_SB('$_');\n"} @SB_tests),
+ (map {"Test_WB('$_');\n"} @WB_tests),
"Finished();\n"
]);
Skip => 'Validation Tests',
),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
+ Handler => \&process_SB_test,
),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
+ Handler => \&process_WB_test,
),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
my @should_match = map { eval "\"$_\"" } @should_display;
# If a string can be represented in both non-ut8 and utf8, test both cases
+ my $display_upgrade = "";
UPGRADE:
for my $to_upgrade (0 .. 1) {
next UPGRADE if utf8::is_utf8($string);
utf8::upgrade($string);
+ $display_upgrade = " (utf8-upgraded)";
+ }
+
+ # The /l modifier has C after it to indicate the locale to try
+ my @modifiers = qw(a aa d lC u i);
+ push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+ # Test for each of the regex modifiers.
+ for my $modifier (@modifiers) {
+ my $display_locale = "";
+
+ # For /l, set the locale to what it says to.
+ if ($modifier =~ / ^ l (.*) /x) {
+ my $locale = $1;
+ $display_locale = "(locale = $locale)";
+ use Config;
+ if (defined $Config{d_setlocale}) {
+ eval { require POSIX; import POSIX 'locale_h'; };
+ if (defined &POSIX::LC_CTYPE) {
+ POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
+ }
+ }
+ $modifier = 'l';
+ }
+
+ no warnings qw(locale regexp surrogate);
+ my $pattern = "(?$modifier:$break_pattern)";
+
+ # Actually do the test
+ my $matched = $string =~ qr/$pattern/;
+ print "not " unless $matched;
+
+ # Fancy display of test results
+ $matched = ($matched) ? "matched" : "failed to match";
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+
+ # Repeat with the first \B{} in the pattern. This makes sure the
+ # code in regexec.c:find_byclass() for \B gets executed
+ if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
+ my $B_pattern = "$1$2";
+ $matched = $string =~ qr/$B_pattern/;
+ print "not " unless $matched;
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+ }
}
+ next if $break_type ne 'gcb';
+
# Finally, do the \X match.
my @matches = $string =~ /(\X)/g;
_test_break(shift, 'gcb');
}
+sub Test_SB($) {
+ _test_break(shift, 'sb');
+}
+
+sub Test_WB($) {
+ _test_break(shift, 'wb');
+}
+
sub Finished() {
print "1..$Tests\n";
exit($Fails ? -1 : 0);