# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
# Without this change, there are casing problems for this character.
#
+# Search for $string_compare_versions to see how to compare changes to
+# properties between Unicode versions
+#
##############################################################################
my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so its
# existence is not noted in the comment.
- 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
+ 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
'Indic_Matra_Category' => "Provisional",
'Indic_Syllabic_Category' => "Provisional",
# to differentiate between it and gc=c, which can be written as 'isc',
# which is the same characters as ISO_Comment's short name.
- 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
+ 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
- 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
- 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
- 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
+ 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
+ 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
- FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
+ FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
);
foreach my $property (
}
$handle{$addr} = $file_handle; # Cache the open file handle
- if ($v_version ge v3.2.0
- && lc($file) ne 'unicodedata.txt'
-
- # Unihan files used another format until v7
- && ($v_version ge v7.0.0 || $file !~ /^Unihan/i))
- {
- $_ = <$file_handle>;
- if ($_ !~ / - $string_version \. /x) {
- chomp;
- $_ =~ s/^#\s*//;
- die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ if ($v_version ge v3.2.0 && lc($file) ne 'unicodedata.txt') {
+ if ($file !~ /^Unihan/i) {
+ $_ = <$file_handle>;
+ if ($_ !~ / - $string_version \. /x) {
+ chomp;
+ $_ =~ s/^#\s*//;
+ die Carp::my_carp("File '$file' is version '$_'. It should be version $string_version");
+ }
+ }
+ else {
+ while (<$file_handle>) {
+ if ($_ !~ /^#/) {
+ Carp::my_carp_bug("Could not find the expected version info in file '$file'");
+ last;
+ }
+ chomp;
+ $_ =~ s/^#\s*//;
+ next if $_ !~ / version: /x;
+ last if $_ =~ /$string_version/;
+ die Carp::my_carp("File '$file' is '$_'. It should be version $string_version");
+ }
}
}
}
# but its format and even its name or existence are subject to change without
# notice in a future Perl version. Don't use it directly. Instead, its
# contents are now retrievable through a stable API in the Unicode::UCD
-# module: Unicode::UCD::prop_invmap('$property_name').
+# module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
+# code points can be retrieved via Unicode::UCD::charprop());
END
}
return $return;
}
$comment .= "\nwhere 'cp' is $cp.";
if ($ucd_accessible_name) {
- $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
+ $comment .= " Note that $these_mappings $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
}
# And append any commentary already set from the actual property.
}
}
- # I (khw) have never waded through this line to
- # understand it well enough to comment it.
+ # The unpack yields a list of the bytes that comprise the
+ # UTF-8 of $code_point, which are each placed in \xZZ format
+ # and output in the %s to map to $tostr, so the result looks
+ # like:
+ # "\xC4\xB0" => "\x{0069}\x{0307}",
my $utf8 = sprintf(qq["%s" => "$tostr",],
join("", map { sprintf "\\x%02X", $_ }
- unpack("U0C*", pack("U", $code_point))));
+ unpack("U0C*", chr $code_point)));
# Add a comment so that a human reader can more easily
# see what's going on.
{ # Closure
my $indent_increment = " " x (($debugging_build) ? 2 : 0);
- my %already_output;
+ %main::already_output = ();
$main::simple_dumper_nesting = 0;
# nesting level is localized, so that as the call stack pops, it goes
# back to the prior value.
local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
- undef %already_output if $main::simple_dumper_nesting == 0;
+ local %main::already_output = %main::already_output;
$main::simple_dumper_nesting++;
#print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
# Keep track of cycles in the input, and refuse to infinitely loop
my $addr = do { no overloading; pack 'J', $item; };
- if (defined $already_output{$addr}) {
+ if (defined $main::already_output{$addr}) {
return "${indent}ALREADY OUTPUT: $item\n";
}
- $already_output{$addr} = $item;
+ $main::already_output{$addr} = $item;
if (ref $item eq 'ARRAY') {
my $using_brackets;
# have Uppercase and Lowercase defined, so use the general category
# instead for them, modified by hard-coding in the code points each is
# missing.
- my $Lower = $perl->add_match_table('Lower');
+ my $Lower = $perl->add_match_table('XPosixLower');
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
$Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
$Lower += $code_point;
}
}
- $Lower->add_alias('XPosixLower');
my $Posix_Lower = $perl->add_match_table("PosixLower",
Description => "[a-z]",
Initialize => $Lower & $ASCII,
);
- my $Upper = $perl->add_match_table('Upper');
+ my $Upper = $perl->add_match_table("XPosixUpper");
my $Unicode_Upper = property_ref('Uppercase');
if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
$Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
$Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
- $Upper->add_alias('XPosixUpper');
my $Posix_Upper = $perl->add_match_table("PosixUpper",
Description => "[A-Z]",
Initialize => $Upper & $ASCII,
# one whose name generally begins with Posix that is posix-compliant, and
# one that matches Unicode characters beyond the Posix, ASCII range
- my $Alpha = $perl->add_match_table('Alpha');
+ my $Alpha = $perl->add_match_table('XPosixAlpha');
# Alphabetic was not present in early releases
my $Alphabetic = property_ref('Alphabetic');
$Alpha->add_description('Alphabetic');
$Alpha->add_alias('Alphabetic');
}
- $Alpha->add_alias('XPosixAlpha');
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
Description => "[A-Za-z]",
Initialize => $Alpha & $ASCII,
$Posix_Upper->set_caseless_equivalent($Posix_Alpha);
$Posix_Lower->set_caseless_equivalent($Posix_Alpha);
- my $Alnum = $perl->add_match_table('Alnum',
+ my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
Description => 'Alphabetic and (decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
- $Alnum->add_alias('XPosixAlnum');
$perl->add_match_table("PosixAlnum",
Description => "[A-Za-z0-9]",
Initialize => $Alnum & $ASCII,
);
- my $Word = $perl->add_match_table('Word',
+ my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
Description => '\w, including beyond ASCII;'
. ' = \p{Alnum} + \pM + \p{Pc}',
Initialize => $Alnum + $gc->table('Mark'),
);
- $Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
if (defined $Pc) {
$Word += $Pc;
}
# This is a Perl extension, so the name doesn't begin with Posix.
- my $PerlWord = $perl->add_match_table('PerlWord',
+ my $PerlWord = $perl->add_match_table('PosixWord',
Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
- $PerlWord->add_alias('PosixWord');
+ $PerlWord->add_alias('PerlWord');
- my $Blank = $perl->add_match_table('Blank',
+ my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
Description => '\h, Horizontal white space',
# 200B is Zero Width Space which is for line
- 0x200B, # ZWSP
);
$Blank->add_alias('HorizSpace'); # Another name for it.
- $Blank->add_alias('XPosixBlank');
$perl->add_match_table("PosixBlank",
Description => "\\t and ' '",
Initialize => $Blank & $ASCII,
);
# No Posix equivalent for vertical space
- my $Space = $perl->add_match_table('Space',
+ my $Space = $perl->add_match_table('XPosixSpace',
Description => '\s including beyond ASCII and vertical tab',
Initialize => $Blank + $VertSpace,
);
- $Space->add_alias('XPosixSpace');
- my $posix_space = $perl->add_match_table("PosixSpace",
+ $Space->add_alias('XPerlSpace'); # Pre-existing synonyms
+ $Space->add_alias('SpacePerl');
+
+ my $Posix_space = $perl->add_match_table("PosixSpace",
Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
Initialize => $Space & $ASCII,
);
+ $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
- # Perl's traditional space doesn't include Vertical Tab prior to v5.18
- my $XPerlSpace = $perl->add_match_table('XPerlSpace',
- Description => '\s, including beyond ASCII',
- Initialize => $Space,
- #Initialize => $Space
- # - utf8::unicode_to_native(0x0B]
- );
- $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
- my $PerlSpace = $perl->add_match_table('PerlSpace',
- Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
- Initialize => $XPerlSpace & $ASCII,
- );
-
-
- my $Cntrl = $perl->add_match_table('Cntrl',
+ my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
- $Cntrl->add_alias('XPosixCntrl');
$perl->add_match_table("PosixCntrl",
Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
Initialize => $Cntrl & $ASCII,
$controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
- my $Graph = $perl->add_match_table('Graph',
+ my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
Description => 'Characters that are graphical',
Initialize => ~ ($Space + $controls),
);
- $Graph->add_alias('XPosixGraph');
$perl->add_match_table("PosixGraph",
Description =>
'[-!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~0-9A-Za-z]',
Initialize => $Graph & $ASCII,
);
- $print = $perl->add_match_table('Print',
+ $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
Description => 'Characters that are graphical plus space characters (but no controls)',
Initialize => $Blank + $Graph - $gc->table('Control'),
);
- $print->add_alias('XPosixPrint');
$perl->add_match_table("PosixPrint",
Description =>
'[- 0-9A-Za-z!"#$%&\'()*+,./:;<=>?@[\\\]^_`{|}~]',
Initialize => $ASCII & $XPosixPunct,
);
- my $Digit = $perl->add_match_table('Digit',
+ my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
- $Digit->add_alias('XPosixDigit');
my $PosixDigit = $perl->add_match_table("PosixDigit",
Description => '[0-9]',
Initialize => $Digit & $ASCII,
);
# Hex_Digit was not present in first release
- my $Xdigit = $perl->add_match_table('XDigit');
- $Xdigit->add_alias('XPosixXDigit');
+ my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
my $Hex = property_ref('Hex_Digit');
if (defined $Hex && ! $Hex->is_empty) {
$Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
}
}
- # Ouput both short and single in the same parenthesized
+ # Output both short and single in the same parenthesized
# expression, but with only one of 'Single', 'Short' if there
# are both items.
if ($short_name || $single_form || $table->conflicting) {
=head1 Properties accessible through Unicode::UCD
-All the Unicode character properties mentioned above (except for those marked
-as for internal use by Perl) are also accessible by
-L<Unicode::UCD/prop_invlist()>.
+The value of any Unicode (not including Perl extensions) character
+property mentioned above for any single code point is available through
+L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the
+values of all the Unicode properties for a given code point.
+
+Besides these, all the Unicode character properties mentioned above
+(except for those marked as for internal use by Perl) are also
+accessible by L<Unicode::UCD/prop_invlist()>.
Due to their nature, not all Unicode character properties are suitable for
regular expression matches, nor C<prop_invlist()>. The remaining
# Similarly, we create for Unicode::UCD a list of
# property-value aliases.
- my $property_full_name = $property->full_name;
-
# Look at each table in the property...
foreach my $table ($property->tables) {
my @values_list;
}
# To save memory, unlike the similar list for property
- # aliases above, only the standard forms hve the list.
+ # aliases above, only the standard forms have the list.
# This forces an extra step of converting from input
# name to standard name, but the savings are
# considerable. (There is only marginal savings if we
use strict;
use warnings;
-# If run outside the normal test suite on an ASCII platform, you can
-# just create a latin1_to_native() function that just returns its
-# inputs, because that's the only function used from test.pl
-require "test.pl";
-
# Test qr/\X/ and the \p{} regular expression constructs. This file is
# constructed by mktables from the tables it generates, so if mktables is
# buggy, this won't necessarily catch those bugs. Tests are generated for all
my $this_string = "";
my $this_display = "";
foreach my $code_point (@code_points) {
- $this_string .= latin1_to_native(chr(hex $code_point));
+ $this_string .= chr utf8::unicode_to_native(hex $code_point);
$this_display .= "\\x{$code_point}";
}