#
# croak("Weird Canonical Decomposition of U+$h");
#
-# Simply change to a carp. It will compile, but will not know about any three
-# character decomposition.
+# Simply comment it out. It will compile, but will not know about any three
+# character decompositions. If using the .pm version, there is a similar
+# line.
# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
# that the reason is that the CJK block starting at 4E00 was removed from
# With this release, it is automatically handled if the Unihan db is
# downloaded
-push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
+push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version lt v5.2.0;
# There are several types of obsolete properties defined by Unicode. These
# must be hand-edited for every new Unicode release.
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so it's
# existence is not noted in the comment.
- 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+ 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
- 'Indic_Matra_Category' => "Provisional",
- 'Indic_Syllabic_Category' => "Provisional",
+ 'Indic_Matra_Category' => "Withdrawn by Unicode while still provisional",
# Don't suppress ISO_Comment, as otherwise special handling is needed
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
- 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
- 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
- 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
);
EOF
-my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
+my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
+ ? "10FFFF"
+ : "FFFF";
my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
# defaults for code points not listed (i.e., missing) in the file. The code
# depends on this ending with a semi-colon, so it can assume it is a valid
# field when the line is split() by semi-colons
-my $missing_defaults_prefix =
- qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
+my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
# Property types. Unicode has more types, but these are sufficient for our
# purposes.
my $needing_code_points_ending_in_code_point = 0;
my @backslash_X_tests; # List of tests read in for testing \X
+my @SB_tests; # List of tests read in for testing \b{sb}
+my @WB_tests; # List of tests read in for testing \b{wb}
my @unhandled_properties; # Will contain a list of properties found in
# the input that we didn't process.
my @match_properties; # Properties that have match tables, to be
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
+ undef $file{$addr} if $first_released{$addr} gt $v_version;
+
# The rest of the arguments are key => value pairs
# %constructor_fields has been set up earlier to list all possible
# ones. Either set or push, depending on how the default has been set
# including its reason
if ($skip{$addr}) {
$optional{$addr} = 1;
- $skipped_files{$file{$addr}} = $skip{$addr}
+ $skipped_files{$file{$addr}} = $skip{$addr} if $file{$addr};
}
elsif ($properties{$addr}) {
# than this Unicode version), and isn't there. This means if someone
# copies it into an earlier version's directory, we will go ahead and
# process it.
- return if $first_released{$addr} gt $v_version && ! -e $file;
+ return if $first_released{$addr} gt $v_version
+ && (! defined $file || ! -e $file);
# If in debugging mode and this file doesn't have the non-skip
# flag set, and isn't one of the critical files, skip it.
}
$handle{$addr} = $file_handle; # Cache the open file handle
- if ($v_version ge v3.2.0
- && lc($file) ne 'unicodedata.txt'
-
- # Unihan files used another format until v7
- && ($v_version ge v7.0.0 || $file !~ /^Unihan/i))
- {
- $_ = <$file_handle>;
- if ($_ !~ / - $string_version \. /x) {
- chomp;
- $_ =~ s/^#\s*//;
- die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
+ if ($file !~ /^Unihan/i) {
+ $_ = <$file_handle>;
+ if ($_ !~ / - $string_version \. /x) {
+ chomp;
+ $_ =~ s/^#\s*//;
+ die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ }
+ }
+ else {
+ while (<$file_handle>) {
+ if ($_ !~ /^#/) {
+ Carp::my_carp_bug("Could not find the expected version info in file '$file'");
+ last;
+ }
+ chomp;
+ $_ =~ s/^#\s*//;
+ next if $_ !~ / version: /x;
+ last if $_ =~ /$string_version/;
+ die Carp::my_carp("File '$file' is '$_'. It should be version $string_version");
+ }
}
}
}
# In other words,
# r[$i-1]->end < $start <= r[$i]->end
# And:
- # r[$i-1]->end < $start <= $end <= r[$j]->end
+ # r[$i-1]->end < $start <= $end <= r[$j+1]->start
#
# Also:
# $clean_insert is a boolean which is set true if and only if
# We now have enough information to decide if this call is a no-op
# or not. It is a no-op if this is an insertion of already
- # existing data.
+ # existing data. To be so, it must be contained entirely in one
+ # range.
if (main::DEBUG && $to_trace && $clean_insert
- && $i == $j
- && $start >= $r->[$i]->start)
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end)
{
trace "no-op";
}
return if $clean_insert
- && $i == $j # more than one affected range => not no-op
-
- # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
- # Further, $start and/or $end is >= r[$i]->start
- # The test below hence guarantees that
- # r[$i]->start < $start <= $end <= r[$i]->end
- # This means the input range is contained entirely in
- # the one at $i, so is a no-op
- && $start >= $r->[$i]->start;
+ && $start >= $r->[$i]->start
+ && $end <= $r->[$i]->end;
}
# Here, we know that some action will have to be taken. We have
# but its format and even its name or existence are subject to change without
# notice in a future Perl version. Don't use it directly. Instead, its
# contents are now retrievable through a stable API in the Unicode::UCD
-# module: Unicode::UCD::prop_invmap('$property_name').
+# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
+# code points can be retrieved via Unicode::UCD::charprop());
END
}
return $return;
}
$comment .= "\nwhere 'cp' is $cp.";
if ($ucd_accessible_name) {
- $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
+ $comment .= " Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
}
# And append any commentary already set from the actual property.
package main;
- sub display_chr {
- # Converts an ordinal character value to a displayable string, using a
- # NBSP to hold combining characters.
- my $ord = shift;
- my $chr = chr $ord;
- return $chr if $ccc->table(0)->contains($ord);
- return chr(utf8::unicode_to_native(0xA0)) . $chr;
- }
+sub display_chr {
+ # Converts an ordinal printable character value to a displayable string,
+ # using a dotted circle to hold combining characters.
+
+ my $ord = shift;
+ my $chr = chr $ord;
+ return $chr if $ccc->table(0)->contains($ord);
+ return "\x{25CC}$chr";
+}
sub join_lines($) {
# Returns lines of the input joined together, so that they can be folded
{ # Closure
my $indent_increment = " " x (($debugging_build) ? 2 : 0);
- my %already_output;
+ %main::already_output = ();
$main::simple_dumper_nesting = 0;
# nesting level is localized, so that as the call stack pops, it goes
# back to the prior value.
local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
- undef %already_output if $main::simple_dumper_nesting == 0;
+ local %main::already_output = %main::already_output;
$main::simple_dumper_nesting++;
#print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
# Keep track of cycles in the input, and refuse to infinitely loop
my $addr = do { no overloading; pack 'J', $item; };
- if (defined $already_output{$addr}) {
+ if (defined $main::already_output{$addr}) {
return "${indent}ALREADY OUTPUT: $item\n";
}
- $already_output{$addr} = $item;
+ $main::already_output{$addr} = $item;
if (ref $item eq 'ARRAY') {
my $using_brackets;
my $object = shift;
my $fields_ref = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
my $addr = do { no overloading; pack 'J', $object; };
'AL');
$lb->set_default_map($default);
}
+ }
- # If has the URS property, make sure that the standard aliases are in
- # it, since not in the input tables in some versions.
- my $urs = property_ref('Unicode_Radical_Stroke');
- if (defined $urs) {
- $urs->add_alias('cjkRSUnicode');
- $urs->add_alias('kRSUnicode');
- }
+ # If has the URS property, make sure that the standard aliases are in
+ # it, since not in the input tables in some versions.
+ my $urs = property_ref('Unicode_Radical_Stroke');
+ if (defined $urs) {
+ $urs->add_alias('cjkRSUnicode');
+ $urs->add_alias('kRSUnicode');
}
# For backwards compatibility with applications that may read the mapping
$fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
# Rationals require extra effort.
- register_fraction($fields[$NUMERIC])
- if $fields[$NUMERIC] =~ qr{/};
+ if ($fields[$NUMERIC] =~ qr{/}) {
+ reduce_fraction(\$fields[$NUMERIC]);
+ register_fraction($fields[$NUMERIC])
+ }
}
}
return;
}
+sub process_SB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @SB_tests, $_;
+ }
+
+ return;
+}
+
+sub process_WB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @WB_tests, $_;
+ }
+
+ return;
+}
+
sub process_NamedSequences {
# NamedSequences.txt entries are just added to an array. Because these
# don't look like the other tables, they have their own handler.
return;
}
+sub gcd($$) { # Greatest-common-divisor; from
+ # http://en.wikipedia.org/wiki/Euclidean_algorithm
+ my ($a, $b) = @_;
+
+ use integer;
+
+ while ($b != 0) {
+ my $temp = $b;
+ $b = $a % $b;
+ $a = $temp;
+ }
+ return $a;
+}
+
+sub reduce_fraction($) {
+ my $fraction_ref = shift;
+
+ # Reduce a fraction to lowest terms. The Unicode data may be reducible,
+ # hence this is needed. The argument is a reference to the
+ # string denoting the fraction, which must be of the form:
+ if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
+ Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged");
+ return;
+ }
+
+ my $sign = $1;
+ my $numerator = $2;
+ my $denominator = $3;
+
+ use integer;
+
+ # Find greatest common divisor
+ my $gcd = gcd($numerator, $denominator);
+
+ # And reduce using the gcd.
+ if ($gcd != 1) {
+ $numerator /= $gcd;
+ $denominator /= $gcd;
+ $$fraction_ref = "$sign$numerator/$denominator";
+ }
+
+ return;
+}
+
sub filter_numeric_value_line {
# DNumValues contains lines of a different syntax than the typical
# property file:
$_ = "";
return;
}
+ reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
$rational = $fields[3];
+
$_ = join '; ', @fields[ 0, 3 ];
}
else {
# Change hyphens and blanks in the block name field only
$fields[1] =~ s/[ -]/_/g;
- $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
+ $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
$_ = join("; ", @fields);
return;
$gc->table('Ll')->set_caseless_equivalent($LC);
$gc->table('Lu')->set_caseless_equivalent($LC);
- my $Cs = $gc->table('Cs');
-
# Create digit and case fold tables with the original file names for
# backwards compatibility with applications that read them directly.
my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
}
my $Any = $perl->add_match_table('Any',
- Description => "All Unicode code points: [\\x{0000}-\\x{10FFFF}]",
+ Description => "All Unicode code points: [\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
);
- $Any->add_range(0, 0x10FFFF);
+ $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
$Any->add_alias('Unicode');
# Assigned is the opposite of gc=unassigned
# have Uppercase and Lowercase defined, so use the general category
# instead for them, modified by hard-coding in the code points each is
# missing.
- my $Lower = $perl->add_match_table('Lower');
+ my $Lower = $perl->add_match_table('XPosixLower');
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
$Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
) {
# Don't include the code point unless it is assigned in this
# release
- my $category = $gc->value_of(hex $code_point);
+ my $category = $gc->value_of($code_point);
next if ! defined $category || $category eq 'Cn';
$Lower += $code_point;
}
}
- $Lower->add_alias('XPosixLower');
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Lower & $ASCII,
);
- my $Upper = $perl->add_match_table('Upper');
+ my $Upper = $perl->add_match_table("XPosixUpper");
my $Unicode_Upper = property_ref('Uppercase');
if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
$Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
$Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
- $Upper->add_alias('XPosixUpper');
my $Posix_Upper = $perl->add_match_table("PosixUpper",
Description => "[A-Z]",
Initialize => $Upper & $ASCII,
# one whose name generally begins with Posix that is posix-compliant, and
# one that matches Unicode characters beyond the Posix, ASCII range
- my $Alpha = $perl->add_match_table('Alpha');
+ my $Alpha = $perl->add_match_table('XPosixAlpha');
# Alphabetic was not present in early releases
my $Alphabetic = property_ref('Alphabetic');
$Alpha->add_description('Alphabetic');
$Alpha->add_alias('Alphabetic');
}
- $Alpha->add_alias('XPosixAlpha');
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
Description => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
$Posix_Upper->set_caseless_equivalent($Posix_Alpha);
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
- my $Alnum = $perl->add_match_table('Alnum',
+ my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
- $Alnum->add_alias('XPosixAlnum');
$perl->add_match_table("PosixAlnum",
Description => "[A-Za-z0-9]",
Initialize => $Alnum & $ASCII,
);
- my $Word = $perl->add_match_table('Word',
+ my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
Description => '\w, including beyond ASCII;'
. ' = \p{Alnum} + \pM + \p{Pc}',
Initialize => $Alnum + $gc->table('Mark'),
);
- $Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
if (defined $Pc) {
$Word += $Pc;
}
# This is a Perl extension, so the name doesn't begin with Posix.
- my $PerlWord = $perl->add_match_table('PerlWord',
+ my $PerlWord = $perl->add_match_table('PosixWord',
Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
- $PerlWord->add_alias('PosixWord');
+ $PerlWord->add_alias('PerlWord');
- my $Blank = $perl->add_match_table('Blank',
+ my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
Description => '\h, Horizontal white space',
# 200B is Zero Width Space which is for line
- 0x200B, # ZWSP
);
$Blank->add_alias('HorizSpace'); # Another name for it.
- $Blank->add_alias('XPosixBlank');
$perl->add_match_table("PosixBlank",
Description => "\\t and ' '",
Initialize => $Blank & $ASCII,
);
# No Posix equivalent for vertical space
- my $Space = $perl->add_match_table('Space',
+ my $Space = $perl->add_match_table('XPosixSpace',
Description => '\s including beyond ASCII and vertical tab',
Initialize => $Blank + $VertSpace,
);
- $Space->add_alias('XPosixSpace');
- my $posix_space = $perl->add_match_table("PosixSpace",
+ $Space->add_alias('XPerlSpace'); # Pre-existing synonyms
+ $Space->add_alias('SpacePerl');
+
+ my $Posix_space = $perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
+ $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
- # Perl's traditional space doesn't include Vertical Tab prior to v5.18
- my $XPerlSpace = $perl->add_match_table('XPerlSpace',
- Description => '\s, including beyond ASCII',
- Initialize => $Space,
- #Initialize => $Space
- # - utf8::unicode_to_native(0x0B]
- );
- $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
- my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
- Initialize => $XPerlSpace & $ASCII,
- );
-
-
- my $Cntrl = $perl->add_match_table('Cntrl',
+ my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
- $Cntrl->add_alias('XPosixCntrl');
$perl->add_match_table("PosixCntrl",
Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
Initialize => $Cntrl & $ASCII,
$controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
- my $Graph = $perl->add_match_table('Graph',
+ my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
Description => 'Characters that are graphical',
Initialize => ~ ($Space + $controls),
);
- $Graph->add_alias('XPosixGraph');
$perl->add_match_table("PosixGraph",
Description =>
'[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
Initialize => $Graph & $ASCII,
);
- $print = $perl->add_match_table('Print',
+ $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
- $print->add_alias('XPosixPrint');
$perl->add_match_table("PosixPrint",
Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $ASCII & $XPosixPunct,
);
- my $Digit = $perl->add_match_table('Digit',
+ my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
- $Digit->add_alias('XPosixDigit');
my $PosixDigit = $perl->add_match_table("PosixDigit",
Description => '[0-9]',
Initialize => $Digit & $ASCII,
);
# Hex_Digit was not present in first release
- my $Xdigit = $perl->add_match_table('XDigit');
- $Xdigit->add_alias('XPosixXDigit');
+ my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
my $Hex = property_ref('Hex_Digit');
if (defined $Hex && ! $Hex->is_empty) {
$Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
+ utf8::unicode_to_native(0xA0) # NBSP
);
- # These two tables are for matching \X, which is based on the 'extended'
- # grapheme cluster, which came in 5.1; create empty ones if not already
- # present. The straight 'grapheme cluster' (non-extended) is used prior
- # to 5.1, and differs from the extended (see
- # http://www.unicode.org/reports/tr29/) only by these two tables, so we
- # get the older definition automatically when they are empty.
- my $gcb = property_ref('Grapheme_Cluster_Break');
- my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
- $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
- }
- else {
- push @tables_that_may_be_empty, $perl_prepend->complete_name;
- }
-
- # All the tables with _X_ in their names are used in defining \X handling,
- # and are based on the Unicode GCB property. Basically, \X matches:
- # CR LF
- # | Prepend* Begin Extend*
- # | .
- # Begin is: ( Special_Begin | ! Control )
- # Begin is also: ( Regular_Begin | Special_Begin )
- # where Regular_Begin is defined as ( ! Control - Special_Begin )
- # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
- # Extend is: ( Grapheme_Extend | Spacing_Mark )
- # Control is: [ GCB_Control | CR | LF ]
- # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
-
- foreach my $gcb_name (qw{ L V T LV LVT }) {
-
- # The perl internal extension's name is the gcb table name prepended
- # with an '_X_'
- my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table($gcb_name),
- );
- # Version 1 had mostly different Hangul syllables that were removed
- # from later versions, so some of the tables may not apply.
- if ($v_version lt v2.0) {
- push @tables_that_may_be_empty, $perl_table->complete_name;
- }
- }
-
- # More GCB. Populate a combined hangul syllables table
- my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
- $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
-
- my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY);
- if ($v_version ge v6.2) {
- $ri += $gcb->table('RI');
- }
- else {
- push @tables_that_may_be_empty, $ri->full_name;
- }
-
- my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $lv_lvt_v
- + $gcb->table('L')
- + $gcb->table('T')
- + $ri
- );
- $specials_begin->add_comment(join_lines( <<END
-For use in \\X; matches first (perhaps only) character of potential
-multi-character sequences that can begin an extended grapheme cluster. They
-need special handling because of their complicated nature.
-END
- ));
- my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
- Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => ~ $gcb->table('Control')
- - $specials_begin
- - $gcb->table('CR')
- - $gcb->table('LF')
- );
- $regular_begin->add_comment(join_lines( <<END
-For use in \\X; matches first character of anything that can begin an extended
-grapheme cluster, except those that require special handling.
-END
- ));
-
- my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
- Fate => $INTERNAL_ONLY,
- Initialize => $gcb->table('Extend')
- );
- if (defined (my $sm = $gcb->table('SpacingMark'))) {
- $extend += $sm;
- }
- $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
-
- # End of GCB \X processing
-
my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
if (@named_sequences) {
}
}
- # Ouput both short and single in the same parenthesized
+ # Output both short and single in the same parenthesized
# expression, but with only one of 'Single', 'Short' if there
# are both items.
if ($short_name || $single_form || $table->conflicting) {
=head1 Properties accessible through Unicode::UCD
-All the Unicode character properties mentioned above (except for those marked
-as for internal use by Perl) are also accessible by
-L<Unicode::UCD/prop_invlist()>.
+The value of any Unicode (not including Perl extensions) character
+property mentioned above for any single code point is available through
+L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the
+values of all the Unicode properties for a given code point.
+
+Besides these, all the Unicode character properties mentioned above
+(except for those marked as for internal use by Perl) are also
+accessible by L<Unicode::UCD/prop_invlist()>.
Due to their nature, not all Unicode character properties are suitable for
regular expression matches, nor C<prop_invlist()>. The remaining
C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
(C<\%Config> is available from the Config module).
+Also, perl can be recompiled to operate on an earlier version of the Unicode
+standard. Further information is at
+C<\$Config{privlib}>/F<unicore/README.perl>.
+
=head1 Other information in the Unicode data base
The Unicode data base is delivered in two different formats. The XML version
# Similarly, we create for Unicode::UCD a list of
# property-value aliases.
- my $property_full_name = $property->full_name;
-
# Look at each table in the property...
foreach my $table ($property->tables) {
my @values_list;
}
# To save memory, unlike the similar list for property
- # aliases above, only the standard forms hve the list.
+ # aliases above, only the standard forms have the list.
# This forces an extra step of converting from input
# name to standard name, but the savings are
# considerable. (There is only marginal savings if we
[$HEADER,
<DATA>,
@output,
- (map {"Test_X('$_');\n"} @backslash_X_tests),
+ (map {"Test_GCB('$_');\n"} @backslash_X_tests),
+ (map {"Test_SB('$_');\n"} @SB_tests),
+ (map {"Test_WB('$_');\n"} @WB_tests),
"Finished();\n"
]);
+
return;
}
Skip => 'Validation Tests',
),
Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
+ Handler => \&process_SB_test,
),
Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
- Skip => 'Validation Tests',
+ Handler => \&process_WB_test,
),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
Input_file->new('IndicMatraCategory.txt', v6.1.0,
Property => 'Indic_Matra_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
+ Skip => "Withdrawn by Unicode while still provisional",
),
Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
Property => 'Indic_Syllabic_Category',
Has_Missings_Defaults => $NOT_IGNORED,
- Skip => "Provisional; for the analysis and processing of Indic scripts",
+ Skip => (($v_version lt v8.0.0)
+ ? "Provisional; for the analysis and processing of Indic scripts"
+ : 0),
),
Input_file->new('BidiBrackets.txt', v6.3.0,
Properties => [ 'Bidi_Paired_Bracket', 'Bidi_Paired_Bracket_Type' ],
Input_file->new("BidiCharacterTest.txt", v6.3.0,
Skip => 'Validation Tests',
),
+ Input_file->new('IndicPositionalCategory.txt', v8.0.0,
+ Property => 'Indic_Positional_Category',
+ Has_Missings_Defaults => $NOT_IGNORED,
+ ),
);
# End of all the preliminaries.
my $Tests = 0;
my $Fails = 0;
+# loc_tools.pl requires this function to be defined
+sub ok($@) {
+ my ($pass, @msg) = @_;
+ print "not " unless $pass;
+ print "ok ";
+ print ++$Tests;
+ print " - ", join "", @msg if @msg;
+ print "\n";
+}
+
sub Expect($$$$) {
my $expected = shift;
my $ord = shift;
return;
}
-# GCBTest.txt character that separates grapheme clusters
+# Break test files (e.g. GCBTest.txt) character that break allowed here
my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
utf8::upgrade($breakable_utf8);
-# GCBTest.txt character that indicates that the adjoining code points are part
-# of the same grapheme cluster
+# Break test files (e.g. GCBTest.txt) character that indicates can't break
+# here
my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
utf8::upgrade($nobreak_utf8);
-sub Test_X($) {
+use Config;
+my $utf8_locale;
+chdir 't' if -d 't';
+eval { require "./loc_tools.pl" };
+$utf8_locale = &find_utf8_ctype_locale if defined &find_utf8_ctype_locale;
+
+sub _test_break($$) {
# Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
# Each such line is a sequence of code points given by their hex numbers,
# separated by the two characters defined just before this subroutine that
# Each \X should match the next cluster; and that is what is checked.
my $template = shift;
+ my $break_type = shift;
- my $line = (caller)[2];
+ my $line = (caller 1)[2]; # Line number
# The line contains characters above the ASCII range, but in Latin1. It
# may or may not be in utf8, and if it is, it may or may not know it. So,
$template =~ s/$breakable_utf8/$breakable/g;
}
- # Get rid of the leading and trailing breakables
- $template =~ s/^ \s* $breakable \s* //x;
- $template =~ s/ \s* $breakable \s* $ //x;
+ # The input is just the break/no-break symbols and sequences of Unicode
+ # code points as hex digits separated by spaces for legibility. e.g.:
+ # ÷ 0020 × 0308 ÷ 0020 ÷
+ # Convert to native \x format
+ $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
+ $template =~ s/ \s* //gx; # Probably the line above removed all spaces;
+ # but be sure
- # And no-breaks become just a space.
- $template =~ s/ \s* $nobreak \s* / /xg;
+ # Make a copy of the input with the symbols replaced by \b{} and \B{} as
+ # appropriate
+ my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
+ $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
- # Split the input into segments that are breakable between them.
- my @segments = split /\s*$breakable\s*/, $template;
+ my $display_string = $template =~ s/[$breakable$nobreak]//gr;
+ my $string = eval "\"$display_string\"";
- my $string = "";
- my $display_string = "";
- my @should_match;
- my @should_display;
+ # The remaining massaging of the input is for the \X tests. Get rid of
+ # the leading and trailing breakables
+ $template =~ s/^ \s* $breakable \s* //x;
+ $template =~ s/ \s* $breakable \s* $ //x;
- # Convert the code point sequence in each segment into a Perl string of
- # characters
- foreach my $segment (@segments) {
- my @code_points = split /\s+/, $segment;
- my $this_string = "";
- my $this_display = "";
- foreach my $code_point (@code_points) {
- $this_string .= chr utf8::unicode_to_native(hex $code_point);
- $this_display .= "\\x{$code_point}";
- }
+ # Delete no-breaks
+ $template =~ s/ \s* $nobreak \s* //xg;
- # The next cluster should match the string in this segment.
- push @should_match, $this_string;
- push @should_display, $this_display;
- $string .= $this_string;
- $display_string .= $this_display;
- }
+ # Split the input into segments that are breakable between them.
+ my @should_display = split /\s*$breakable\s*/, $template;
+ my @should_match = map { eval "\"$_\"" } @should_display;
# If a string can be represented in both non-ut8 and utf8, test both cases
+ my $display_upgrade = "";
UPGRADE:
for my $to_upgrade (0 .. 1) {
next UPGRADE if utf8::is_utf8($string);
utf8::upgrade($string);
+ $display_upgrade = " (utf8-upgraded)";
+ }
+
+ # The /l modifier has C after it to indicate the locale to try
+ my @modifiers = qw(a aa d lC u i);
+ push @modifiers, "l$utf8_locale" if defined $utf8_locale;
+
+ # Test for each of the regex modifiers.
+ for my $modifier (@modifiers) {
+ my $display_locale = "";
+
+ # For /l, set the locale to what it says to.
+ if ($modifier =~ / ^ l (.*) /x) {
+ my $locale = $1;
+ $display_locale = "(locale = $locale)";
+ use Config;
+ if (defined $Config{d_setlocale}) {
+ eval { require POSIX; import POSIX 'locale_h'; };
+ if (defined &POSIX::LC_CTYPE) {
+ POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
+ }
+ }
+ $modifier = 'l';
+ }
+
+ no warnings qw(locale regexp surrogate);
+ my $pattern = "(?$modifier:$break_pattern)";
+
+ # Actually do the test
+ my $matched = $string =~ qr/$pattern/;
+ print "not " unless $matched;
+
+ # Fancy display of test results
+ $matched = ($matched) ? "matched" : "failed to match";
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$pattern/$display_upgrade; line $line $display_locale\n";
+
+ # Repeat with the first \B{} in the pattern. This makes sure the
+ # code in regexec.c:find_byclass() for \B gets executed
+ if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
+ my $B_pattern = "$1$2";
+ $matched = $string =~ qr/$B_pattern/;
+ print "not " unless $matched;
+ print "ok ", ++$Tests, " - \"$display_string\" $matched /$B_pattern/$display_upgrade; line $line $display_locale\n";
+ }
}
+ next if $break_type ne 'gcb';
+
# Finally, do the \X match.
my @matches = $string =~ /(\X)/g;
print " correctly matched $should_display[$i]; line $line\n";
} else {
$matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
- unpack("U*", $matches[$i]));
+ split "", $matches[$i]);
print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
$i + 1,
" should have matched $should_display[$i]",
return;
}
+sub Test_GCB($) {
+ _test_break(shift, 'gcb');
+}
+
+sub Test_SB($) {
+ _test_break(shift, 'sb');
+}
+
+sub Test_WB($) {
+ _test_break(shift, 'wb');
+}
+
sub Finished() {
print "1..$Tests\n";
exit($Fails ? -1 : 0);
}
Error('\p{Script=InGreek}'); # Bug #69018
-Test_X("1100 $nobreak 1161"); # Bug #70940
+Test_GCB("1100 $nobreak 1161"); # Bug #70940
Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726