# This first set is in the original old-style proplist.
push @return, split /\n/, <<'END';
-Alpha ; Alphabetic
Bidi_C ; Bidi_Control
Dash ; Dash
Dia ; Diacritic
}
if (-e 'DCoreProperties.txt') {
push @return, split /\n/, <<'END';
+Alpha ; Alphabetic
IDS ; ID_Start
XIDC ; XID_Continue
XIDS ; XID_Start
# PropList.txt has been in Unicode since version 2.0. Until 3.1, it
# was in a completely different syntax. Ken Whistler of Unicode says
# that it was something he used as an aid for his own purposes, but
- # was never an official part of the standard. However, comments in
- # DAge.txt indicate that non-character code points were available in
- # the UCD as of 3.1. It is unclear to me (khw) how they could be
- # there except through this file (but on the other hand, they first
- # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
- # not. But the claim is that it was published as an aid to others who
- # might want some more information than was given in the official UCD
- # of the time. Many of the properties in it were incorporated into
- # the later PropList.txt, but some were not. This program uses this
- # early file to generate property tables that are otherwise not
- # accessible in the early UCD's, and most were probably not really
- # official at that time, so one could argue that it should be ignored,
- # and you can easily modify things to skip this. And there are bugs
- # in this file in various versions. (For example, the 2.1.9 version
- # removes from Alphabetic the CJK range starting at 4E00, and they
- # weren't added back in until 3.1.0.) Many of this file's properties
- # were later sanctioned, so this code generates tables for those
- # properties that aren't otherwise in the UCD of the time but
- # eventually did become official, and throws away the rest. Here is a
- # list of all the ones that are thrown away:
+ # was never an official part of the standard. Many of the properties
+ # in it were incorporated into the later PropList.txt, but some were
+ # not. This program uses this early file to generate property tables
+ # that are otherwise not accessible in the early UCD's. It does this
+ # for the ones that eventually became official, and don't appear to be
+ # too different in their contents from the later official version, and
+ # throws away the rest. It could be argued that the ones it generates
+ # were probably not really official at that time, so should be
+ # ignored. You can easily modify things to skip all of them by
+ # changing this function to just set $_ to "", and return; and to skip
+ # certain of them by by simply removing their declarations from
+ # get_old_property_aliases().
+ #
+ # Here is a list of all the ones that are thrown away:
+ # Alphabetic The definitions for this are very
+ # defective, so better to not mislead
+ # people into thinking it works.
+ # Instead the Perl extension of the
+ # same name is constructed from first
+ # principles.
# Bidi=* duplicates UnicodeData.txt
# Combining never made into official property;
# is \P{ccc=0}
# Get the best available case definitions. Early Unicode versions didn't
# have Uppercase and Lowercase defined, so use the general category
- # instead for them.
+ # instead for them, modified by hard-coding in the code points each is
+ # missing.
my $Lower = $perl->add_match_table('Lower');
my $Unicode_Lower = property_ref('Lowercase');
if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
}
else {
- $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
- Related => 1);
+ $Lower += $gc->table('Lowercase_Letter');
+
+ # There are quite a few code points in Lower, that aren't in gc=lc,
+ # and not all are in all releases.
+ foreach my $code_point ( 0x00AA,
+ 0x00BA,
+ 0x02B0 .. 0x02B8,
+ 0x02C0 .. 0x02C1,
+ 0x02E0 .. 0x02E4,
+ 0x0345,
+ 0x037A,
+ 0x1D2C .. 0x1D6A,
+ 0x1D78,
+ 0x1D9B .. 0x1DBF,
+ 0x2071,
+ 0x207F,
+ 0x2090 .. 0x209C,
+ 0x2170 .. 0x217F,
+ 0x24D0 .. 0x24E9,
+ 0x2C7C .. 0x2C7D,
+ 0xA770,
+ 0xA7F8 .. 0xA7F9,
+ ) {
+ # Don't include the code point unless it is assigned in this
+ # release
+ my $category = $gc->value_of(hex $code_point);
+ next if ! defined $category || $category eq 'Cn';
+
+ $Lower += $code_point;
+ }
}
$Lower->add_alias('XPosixLower');
my $Posix_Lower = $perl->add_match_table("PosixLower",
$Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
}
else {
- $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
- Related => 1);
+
+ # Unlike Lower, there are only two ranges in Upper that aren't in
+ # gc=Lu, and all code points were assigned in all releases.
+ $Upper += $gc->table('Uppercase_Letter');
+ $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
+ $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
}
$Upper->add_alias('XPosixUpper');
my $Posix_Upper = $perl->add_match_table("PosixUpper",
}
}
$Alpha->add_description('Alphabetic');
+ $Alpha->add_alias('Alphabetic');
}
$Alpha->add_alias('XPosixAlpha');
my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
}
else {
$PosixXDigit->initialize($Xdigit & $ASCII);
+ $PosixXDigit->add_alias('AHex');
+ $PosixXDigit->add_alias('Ascii_Hex_Digit');
}
$PosixXDigit->add_description('[0-9A-Fa-f]');
}
else {
# For Unicode versions that don't have the property, construct our own
- # from first principles. The actual definition is: Letters + letter
- # numbers (Nl), minus Pattern_Syntax and Pattern_White_Space code
- # points, plus stability extensions. PatSyn and PatWS are not defined
- # in releases that don't have XIDS defined, so are irrelevant.
- $perl_xids += $gc->table('Letter');
- my $nl = $gc->table('Letter_Number');
- $perl_xids += $nl if defined $nl;
+ # from first principles. The actual definition is:
+ # Letters
+ # + letter numbers (Nl)
+ # - Pattern_Syntax
+ # - Pattern_White_Space
+ # + stability extensions
+ # - NKFC modifications
+ #
+ # What we do in the code below is to include the identical code points
+ # that are in the first release that had Unicode's version of this
+ # property, essentially extrapolating backwards. There were no
+ # stability extensions until v4.1, so none are included; likewise in
+ # no Unicode version so far do subtracting PatSyn and PatWS make any
+ # difference, so those also are ignored.
+ $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
+
+ # We do subtract the NFKC modifications that are in the first version
+ # that had this property. We don't bother to test if they are in the
+ # version in question, because if they aren't, the operation is a
+ # no-op. The NKFC modifications are discussed in
+ # http://www.unicode.org/reports/tr31/#NFKC_Modifications
+ foreach my $range ( 0x037A,
+ 0x0E33,
+ 0x0EB3,
+ [ 0xFC5E, 0xFC63 ],
+ [ 0xFDFA, 0xFE70 ],
+ [ 0xFE72, 0xFE76 ],
+ 0xFE78,
+ 0xFE7A,
+ 0xFE7C,
+ 0xFE7E,
+ [ 0xFF9E, 0xFF9F ],
+ ) {
+ if (ref $range) {
+ $perl_xids->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $perl_xids->delete_range($range, $range);
+ }
+ }
}
+
$perl_xids &= $Word;
+ my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
+ Perl_Extension => 1,
+ Fate => $INTERNAL_ONLY);
+ my $XIDC = property_ref('XID_Continue');
+ if (defined $XIDC
+ || defined ($XIDC = property_ref('ID_Continue')))
+ {
+ $perl_xidc += $XIDC->table('Y');
+ }
+ else {
+ # Similarly, we construct our own XIDC if necessary for early Unicode
+ # versions. The definition is:
+ # everything in XIDS
+ # + Gc=Mn
+ # + Gc=Mc
+ # + Gc=Nd
+ # + Gc=Pc
+ # - Pattern_Syntax
+ # - Pattern_White_Space
+ # + stability extensions
+ # - NFKC modifications
+ #
+ # The same thing applies to this as with XIDS for the PatSyn, PatWS,
+ # and stability extensions. There is a somewhat different set of NFKC
+ # mods to remove (and add in this case). The ones below make this
+ # have identical code points as in the first release that defined it.
+ $perl_xidc += $perl_xids
+ + $gc->table('L')
+ + $gc->table('Mn')
+ + $gc->table('Mc')
+ + $gc->table('Nd')
+ + 0x00B7
+ ;
+ if (defined (my $pc = $gc->table('Pc'))) {
+ $perl_xidc += $pc;
+ }
+ else { # 1.1.5 didn't have Pc, but these should have been in it
+ $perl_xidc += 0xFF3F;
+ $perl_xidc->add_range(0x203F, 0x2040);
+ $perl_xidc->add_range(0xFE33, 0xFE34);
+ $perl_xidc->add_range(0xFE4D, 0xFE4F);
+ }
+
+ # Subtract the NFKC mods
+ foreach my $range ( 0x037A,
+ [ 0xFC5E, 0xFC63 ],
+ [ 0xFDFA, 0xFE1F ],
+ 0xFE70,
+ [ 0xFE72, 0xFE76 ],
+ 0xFE78,
+ 0xFE7A,
+ 0xFE7C,
+ 0xFE7E,
+ ) {
+ if (ref $range) {
+ $perl_xidc->delete_range($range->[0], $range->[1]);
+ }
+ else {
+ $perl_xidc->delete_range($range, $range);
+ }
+ }
+ }
+
+ $perl_xidc &= $Word;
+
my $gcb = property_ref('Grapheme_Cluster_Break');
# These are used in Unicode's definition of \X
my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,