# Needs 'no overloading' to run faster on miniperl. Code commented out at the
# subroutine objaddr can be used instead to work as far back (untested) as
-# 5.8: needs pack "U".
+# 5.8: needs pack "U". But almost all occurrences of objaddr have been
+# removed in favor of using 'no overloading'. You also would have to go
+# through and replace occurrences like:
+# my $addr = do { no overloading; pack 'J', $self; }
+# with
+# my $addr = main::objaddr $self;
+# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
+# that instituted the change to main::objaddr, and subsequent commits that
+# changed 0+$self to pack 'J', $self.)
+
+my $start_time;
+BEGIN { # Get the time the script started running; do it at compilation to
+ # get it as close as possible
+ $start_time= time;
+}
+
+
require 5.010_001;
use strict;
use warnings;
use Carp;
+use Config;
use File::Find;
use File::Path;
use File::Spec;
use Text::Tabs;
sub DEBUG () { 0 } # Set to 0 for production; 1 for development
+my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
##########################################################################
#
# the small actual loop to process the input files and finish up; then
# a __DATA__ section, for the .t tests
#
-# This program works on all releases of Unicode through at least 5.2. The
+# This program works on all releases of Unicode through at least 6.0. The
# outputs have been scrutinized most intently for release 5.1. The others
# have been checked for somewhat more than just sanity. It can handle all
# existing Unicode character properties in those releases.
#
-# This program needs to be able to run under miniperl. Therefore, it uses a
-# minimum of other modules, and hence implements some things itself that could
-# be gotten from CPAN
-#
-# This program uses inputs published by the Unicode Consortium. These can
-# change incompatibly between releases without the Perl maintainers realizing
-# it. Therefore this program is now designed to try to flag these. It looks
-# at the directories where the inputs are, and flags any unrecognized files.
-# It keeps track of all the properties in the files it handles, and flags any
-# that it doesn't know how to handle. It also flags any input lines that
-# don't match the expected syntax, among other checks.
-# It is also designed so if a new input file matches one of the known
-# templates, one hopefully just needs to add it to a list to have it
-# processed.
-#
-# It tries to keep fatal errors to a minimum, to generate something usable for
-# testing purposes. It always looks for files that could be inputs, and will
-# warn about any that it doesn't know how to handle (the -q option suppresses
-# the warning).
-#
# This program is mostly about Unicode character (or code point) properties.
# A property describes some attribute or quality of a code point, like if it
# is lowercase or not, its name, what version of Unicode it was first defined
# writing, such as the path to each one's file. There is a heading in each
# map table that gives the format of its entries, and what the map is for all
# the code points missing from it. (This allows tables to be more compact.)
-
+#
# The Property data structure contains one or more tables. All properties
# contain a map table (except the $perl property which is a
# pseudo-property containing only match tables), and any properties that
# out. But all the ones which can be used in regular expression \p{} and \P{}
# constructs will. Generally a property will have either its map table or its
# match tables written but not both. Again, what gets written is controlled
-# by lists which can easily be changed.
-
+# by lists which can easily be changed. Properties have a 'Type', like
+# binary, or string, or enum depending on how many match tables there are and
+# the content of the maps. This 'Type' is different than a range 'Type', so
+# don't get confused by the two concepts having the same name.
+#
# For information about the Unicode properties, see Unicode's UAX44 document:
my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
# More information on Unicode version glitches is further down in these
# introductory comments.
#
-# This program works on all properties as of 5.2, though the files for some
-# are suppressed from apparent lack of demand for. You can change which are
-# output by changing lists in this program.
-
-# The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
+# This program works on all non-provisional properties as of 6.0, though the
+# files for some are suppressed from apparent lack of demand for them. You
+# can change which are output by changing lists in this program.
+#
+# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
# loose matchings rules (from Unicode TR18):
#
# The recommended names for UCD properties and property values are in
# The program still allows Fuzzy to override its determination of if loose
# matching should be used, but it isn't currently used, as it is no longer
# needed; the calculations it makes are good enough.
-
+#
# SUMMARY OF HOW IT WORKS:
#
# Process arguments
# The Perl-defined properties are created and populated. Many of these
# require data determined from the earlier steps
# Any Perl-defined synonyms are created, and name clashes between Perl
-# and Unicode are reconciled.
+# and Unicode are reconciled and warned about.
# All the properties are written to files
# Any other files are written, and final warnings issued.
-
-# As mentioned above, some properties are given in more than one file. In
-# particular, the files in the extracted directory are supposedly just
-# reformattings of the others. But they contain information not easily
-# derivable from the other files, including results for Unihan, which this
-# program doesn't ordinarily look at, and for unassigned code points. They
-# also have historically had errors or been incomplete. In an attempt to
-# create the best possible data, this program thus processes them first to
-# glean information missing from the other files; then processes those other
-# files to override any errors in the extracted ones.
-
+#
# For clarity, a number of operators have been overloaded to work on tables:
# ~ means invert (take all characters not in the set). The more
# conventional '!' is not used because of the possibility of confusing
# Operations are done on references and affect the underlying structures, so
# that the copy constructors for them have been overloaded to not return a new
# clone, but the input object itself.
-
+#
# The bool operator is deliberately not overloaded to avoid confusion with
# "should it mean if the object merely exists, or also is non-empty?".
-
#
# WHY CERTAIN DESIGN DECISIONS WERE MADE
-
-# XXX These comments need more work.
+#
+# This program needs to be able to run under miniperl. Therefore, it uses a
+# minimum of other modules, and hence implements some things itself that could
+# be gotten from CPAN
+#
+# This program uses inputs published by the Unicode Consortium. These can
+# change incompatibly between releases without the Perl maintainers realizing
+# it. Therefore this program is now designed to try to flag these. It looks
+# at the directories where the inputs are, and flags any unrecognized files.
+# It keeps track of all the properties in the files it handles, and flags any
+# that it doesn't know how to handle. It also flags any input lines that
+# don't match the expected syntax, among other checks.
+#
+# It is also designed so if a new input file matches one of the known
+# templates, one hopefully just needs to add it to a list to have it
+# processed.
+#
+# As mentioned earlier, some properties are given in more than one file. In
+# particular, the files in the extracted directory are supposedly just
+# reformattings of the others. But they contain information not easily
+# derivable from the other files, including results for Unihan, which this
+# program doesn't ordinarily look at, and for unassigned code points. They
+# also have historically had errors or been incomplete. In an attempt to
+# create the best possible data, this program thus processes them first to
+# glean information missing from the other files; then processes those other
+# files to override any errors in the extracted ones. Much of the design was
+# driven by this need to store things and then possibly override them.
+#
+# It tries to keep fatal errors to a minimum, to generate something usable for
+# testing purposes. It always looks for files that could be inputs, and will
+# warn about any that it doesn't know how to handle (the -q option suppresses
+# the warning).
#
# Why have files written out for binary 'N' matches?
# For binary properties, if you know the mapping for either Y or N; the
-# other is trivial to construct, so could be done at Perl run-time instead
-# of having a file for it. That is, if someone types in \p{foo: N}, Perl
-# could translate that to \P{foo: Y} and not need a file. The problem is
-# communicating to Perl that a given property is binary. Perl can't figure
-# it out from looking at the N (or No), as some non-binary properties have
-# these as property values.
-# Why
-# There are several types of properties, based on what form their values can
-# take on. These are described in more detail below in the DATA STRUCTURES
-# section of these comments, but for now, you should know that there are
-# string properties, whose values are strings of one or more code points (such
-# as the Uppercase_mapping property); every other property maps to some other
-# form, like true or false, or a number, or a name, etc. The reason there are
-# two directories for map files is because of the way utf8.c works. It
-# expects that any files there are string properties, that is that the
-# mappings are each to one code point, with mappings in multiple code points
-# handled specially in an extra hash data structure. Digit.pl is a table that
-# is written there for historical reasons, even though it doesn't fit that
-# mold. Thus it can't currently be looked at by the Perl core.
+# other is trivial to construct, so could be done at Perl run-time by just
+# complementing the result, instead of having a file for it. That is, if
+# someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
+# not need a file. The problem is communicating to Perl that a given
+# property is binary. Perl can't figure it out from looking at the N (or
+# No), as some non-binary properties have these as property values. So
+# rather than inventing a way to communicate this info back to the core,
+# which would have required changes there as well, it was simpler just to
+# add the extra tables.
+#
+# Why is there more than one type of range?
+# This simplified things. There are some very specialized code points that
+# have to be handled specially for output, such as Hangul syllable names.
+# By creating a range type (done late in the development process), it
+# allowed this to be stored with the range, and overridden by other input.
+# Originally these were stored in another data structure, and it became a
+# mess trying to decide if a second file that was for the same property was
+# overriding the earlier one or not.
+#
+# Why are there two kinds of tables, match and map?
+# (And there is a base class shared by the two as well.) As stated above,
+# they actually are for different things. Development proceeded much more
+# smoothly when I (khw) realized the distinction. Map tables are used to
+# give the property value for every code point (actually every code point
+# that doesn't map to a default value). Match tables are used for regular
+# expression matches, and are essentially the inverse mapping. Separating
+# the two allows more specialized methods, and error checks so that one
+# can't just take the intersection of two map tables, for example, as that
+# is nonsensical.
#
# There are no match tables generated for matches of the null string. These
-# would like like \p{JSN=}. Perhaps something like them could be added if
-# necessary. The JSN does have a real code point U+110B that maps to the null
-# string, but it is a contributory property, and therefore not output by
-# default.
+# would look like qr/\p{JSN=}/ currently without modifying the regex code.
+# Perhaps something like them could be added if necessary. The JSN does have
+# a real code point U+110B that maps to the null string, but it is a
+# contributory property, and therefore not output by default. And it's easily
+# handled so far by making the null string the default where it is a
+# possibility.
#
# DEBUGGING
#
-# XXX Add more stuff here. use perl instead of miniperl to find problems with
-# Scalar::Util
-
+# This program is written so it will run under miniperl. Occasionally changes
+# will cause an error where the backtrace doesn't work well under miniperl.
+# To diagnose the problem, you can instead run it under regular perl, if you
+# have one compiled.
+#
+# There is a good trace facility. To enable it, first sub DEBUG must be set
+# to return true. Then a line like
+#
+# local $to_trace = 1 if main::DEBUG;
+#
+# can be added to enable tracing in its lexical scope or until you insert
+# another line:
+#
+# local $to_trace = 0 if main::DEBUG;
+#
+# then use a line like "trace $a, @b, %c, ...;
+#
+# Some of the more complex subroutines already have trace statements in them.
+# Permanent trace statements should be like:
+#
+# trace ... if main::DEBUG && $to_trace;
+#
+# If there is just one or a few files that you're debugging, you can easily
+# cause most everything else to be skipped. Change the line
+#
+# my $debug_skip = 0;
+#
+# to 1, and every file whose object is in @input_file_objects and doesn't have
+# a, 'non_skip => 1,' in its constructor will be skipped.
+#
+# To compare the output tables, it may be useful to specify the -annotate
+# flag. This causes the tables to expand so there is one entry for each
+# non-algorithmically named code point giving, currently its name, and its
+# graphic representation if printable (and you have a font that knows about
+# it). This makes it easier to see what the particular code points are in
+# each output table. The tables are usable, but because they don't have
+# ranges (for the most part), a Perl using them will run slower. Non-named
+# code points are annotated with a description of their status, and contiguous
+# ones with the same description will be output as a range rather than
+# individually. Algorithmically named characters are also output as ranges,
+# except when there are just a few contiguous ones.
+#
# FUTURE ISSUES
#
# The program would break if Unicode were to change its names so that
# synonym would have to be used for the new property. This is ugly, and
# manual intervention would certainly be easier to do in the short run; lets
# hope it never comes to this.
-
+#
# A NOTE ON UNIHAN
#
# This program can generate tables from the Unihan database. But it doesn't
# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
# is present in the directory, a table will be generated for that property.
# In 5.2, several more properties were added. For your convenience, the two
-# arrays are initialized with all the 5.2 listed properties that are also in
+# arrays are initialized with all the 6.0 listed properties that are also in
# earlier releases. But these are commented out. You can just uncomment the
# ones you want, or use them as a template for adding entries for other
# properties.
# kPrimaryNumeric property have commas and an unexpected comment. A filter
# could be added for these; or for a particular installation, the Unihan.txt
# file could be edited to fix them.
-# have to be
#
-# HOW TO ADD A FILE
-
-# Unicode Versions Notes
-
-# alpha's numbers halve in 2.1.9, answer cjk block at 4E00 were removed from PropList; not changed, could add gc Letter, put back in in 3.1.0
-# Some versions of 2.1.x Jamo.txt have the wrong value for 1105, which causes
-# real problems for the algorithms for Jamo calculations, so it is changed
-# here.
-# White space vs Space. in 3.2 perl has +205F=medium math space, fixed in 4.0, and ok in 3.1.1 because not there in unicode. synonym introduced in 4.1
-# ATBL = 202. 202 changed to ATB, and all code points stayed there. So if you were useing ATBL you were out of luck.
-# Hrkt Katakana_Or_Hiragana came in 4.01, before was Unknown.
+# HOW TO ADD A FILE TO BE PROCESSED
+#
+# A new file from Unicode needs to have an object constructed for it in
+# @input_file_objects, probably at the end or at the end of the extracted
+# ones. The program should warn you if its name will clash with others on
+# restrictive file systems, like DOS. If so, figure out a better name, and
+# add lines to the README.perl file giving that. If the file is a character
+# property, it should be in the format that Unicode has by default
+# standardized for such files for the more recently introduced ones.
+# If so, the Input_file constructor for @input_file_objects can just be the
+# file name and release it first appeared in. If not, then it should be
+# possible to construct an each_line_handler() to massage the line into the
+# standardized form.
+#
+# For non-character properties, more code will be needed. You can look at
+# the existing entries for clues.
+#
+# UNICODE VERSIONS NOTES
+#
+# The Unicode UCD has had a number of errors in it over the versions. And
+# these remain, by policy, in the standard for that version. Therefore it is
+# risky to correct them, because code may be expecting the error. So this
+# program doesn't generally make changes, unless the error breaks the Perl
+# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
+# for U+1105, which causes real problems for the algorithms for Jamo
+# calculations, so it is changed here.
+#
+# But it isn't so clear cut as to what to do about concepts that are
+# introduced in a later release; should they extend back to earlier releases
+# where the concept just didn't exist? It was easier to do this than to not,
+# so that's what was done. For example, the default value for code points not
+# in the files for various properties was probably undefined until changed by
+# some version. No_Block for blocks is such an example. This program will
+# assign No_Block even in Unicode versions that didn't have it. This has the
+# benefit that code being written doesn't have to special case earlier
+# versions; and the detriment that it doesn't match the Standard precisely for
+# the affected versions.
+#
+# Here are some observations about some of the issues in early versions:
+#
+# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
+# the reason is that the CJK block starting at 4E00 was removed from PropList,
+# and was not put back in until 3.1.0
+#
+# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
+# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
+# reason is that 3.2 introduced U+205F=medium math space, which was not
+# classed as white space, but Perl figured out that it should have been. 4.0
+# reclassified it correctly.
+#
+# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
+# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
+# was left with no code points, as all the ones that mapped to 202 stayed
+# mapped to 202. Thus if your program used the numeric name for the class,
+# it would not have been affected, but if it used the mnemonic, it would have
+# been.
+#
+# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
+# points which eventually came to have this script property value, instead
+# mapped to "Unknown". But in the next release all these code points were
+# moved to \p{sc=common} instead.
#
# The default for missing code points for BidiClass is complicated. Starting
# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
}
}
- if ($print_caller) {
- if (defined $line_number) {
- print STDERR sprintf "%4d: ", $line_number;
- }
- else {
- print STDERR " ";
- }
- $caller_name .= ": ";
- print STDERR $caller_name;
- }
-
+ print STDERR sprintf "%4d: ", $line_number if defined $line_number;
+ print STDERR "$caller_name: " if $print_caller;
print STDERR $output, "\n";
return;
}
# Encapsulated Cleverness". p. 455 in first edition.
my %seen;
+ # Arguably this breaks encapsulation, if the goal is to permit multiple
+ # distinct objects to stringify to the same value, and be interchangeable.
+ # However, for this program, no two objects stringify identically, and all
+ # lists passed to this function are either objects or strings. So this
+ # doesn't affect correctness, but it does give a couple of percent speedup.
+ no overloading;
return grep { ! $seen{$_}++ } @_;
}
# special things
my $glob_list = 0; # ? Should we try to include unknown .txt files
# in the input.
-my $output_range_counts = 1; # ? Should we include the number of code points
- # in ranges in the output
+my $output_range_counts = $debugging_build; # ? Should we include the number
+ # of code points in ranges in
+ # the output
+my $annotate = 0; # ? Should character names be in the output
+
# Verbosity levels; 0 is quiet
my $NORMAL_VERBOSITY = 1;
my $PROGRESS = 2;
elsif ($arg eq '-c') {
$output_range_counts = ! $output_range_counts
}
+ elsif ($arg eq '-annotate') {
+ $annotate = 1;
+ $debugging_build = 1;
+ $output_range_counts = 1;
+ }
else {
my $with_c = 'with';
$with_c .= 'out' if $output_range_counts; # Complements the state
-maketest : Make test script 'TestProp.pl' in current (or -C directory),
overrides -T
-makelist : Rewrite the file list $file_list based on current setup
+ -annotate : Output an annotation for each character in the table files;
+ useful for debugging mktables, looking at diffs; but is slow,
+ memory intensive; resulting tables are usable but slow and
+ very large.
-check A B : Executes $0 only if A and B are the same
END
}
# Stores the most-recently changed file. If none have changed, can skip the
# build
-my $youngest = -M $0; # Do this before the chdir!
+my $most_recent = (stat $0)[9]; # Do this before the chdir!
# Change directories now, because need to read 'version' early.
if ($use_directory) {
'Canonical_Combining_Class=Attached_Below_Left'
}
-# These are listed in the Property aliases file in 5.2, but Unihan is ignored
+# These are listed in the Property aliases file in 6.0, but Unihan is ignored
# unless explicitly added.
if ($v_version ge v5.2.0) {
my $unihan = 'Unihan; remove from list if using Unihan';
- foreach my $table qw (
+ foreach my $table (qw (
kAccountingNumeric
kOtherNumeric
kPrimaryNumeric
kIRG_USource
kIRG_VSource
kRSUnicode
- )
+ ))
{
$why_suppress_if_empty_warn_if_not{$table} = $unihan;
}
my $other_properties = 'other properties';
my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
- my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
+ my $why_no_expand = "Deprecated by Unicode: less useful than UTF-specific calculations",
%why_deprecated = (
- 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead',
+ 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
'Jamo_Short_Name' => $contributory,
'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
'Other_Alphabetic' => $contributory,
);
%why_suppressed = (
- # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
+ # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
# contains the same information, but without the algorithmically
# determinable Hangul syllables'. This file is not published, so it's
# existence is not noted in the comment.
'Name' => "Accessible via 'use charnames;'",
'Name_Alias' => "Accessible via 'use charnames;'",
- # These are sort of jumping the gun; deprecation is proposed for
- # Unicode version 6.0, but they have never been exposed by Perl, and
- # likely are soon to be deprecated, so best not to expose them.
- FC_NFKC_Closure => 'Use NFKC_Casefold instead',
+ FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
Expands_On_NFC => $why_no_expand,
Expands_On_NFD => $why_no_expand,
Expands_On_NFKC => $why_no_expand,
if ($v_version ge 4.0.0) {
$why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
+ if ($v_version ge 6.0.0) {
+ $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
+ }
}
-if ($v_version ge 5.2.0) {
+if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
$why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
+ if ($v_version ge 6.0.0) {
+ $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed';
+ }
}
# Probably obsolete forever
# If you are using the Unihan database, you need to add the properties that
# you want to extract from it to this table. For your convenience, the
-# properties in the 5.2 PropertyAliases.txt file are listed, commented out
+# properties in the 6.0 PropertyAliases.txt file are listed, commented out
my @cjk_properties = split "\n", <<'END';
#cjkAccountingNumeric; kAccountingNumeric
#cjkOtherNumeric; kOtherNumeric
END
# Similarly for the property values. For your convenience, the lines in the
-# 5.2 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
+# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
# '#' marks
my @cjk_property_values = split "\n", <<'END';
## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
'ReadMe.txt' => 'Just comments',
'README.TXT' => 'Just comments',
'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
+ 'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications',
+ 'IndicMatraCategory.txt' => 'Provisional',
+ 'IndicSyllabicCategory.txt' => 'Provisional',
+ 'ScriptExtensions.txt' => 'Provisional',
);
-################ End of externally interesting definitions ###############
+### End of externally interesting definitions, except for @input_file_objects
my $HEADER=<<"EOF";
# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
my $UNCONDITIONALLY = 2; # Replace without conditions.
my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
# already there
+my $CROAK = 5; # Die with an error if is already there
# Flags to give property statuses. The phrases are to remind maintainers that
# if the flag is changed, the indefinite article referring to it in the
my $NORMAL = "";
my $SUPPRESSED = 'z'; # The character should never actually be seen, since
# it is suppressed
+my $PLACEHOLDER = 'P'; # Implies no pod entry generated
my $DEPRECATED = 'D';
my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
$SUPPRESSED => 'should never be generated',
$STABILIZED => 'stabilized',
$OBSOLETE => 'obsolete',
- $DEPRECATED => 'deprecated'
+ $DEPRECATED => 'deprecated',
);
-# The format of the values of the map tables:
+# The format of the values of the tables:
+my $EMPTY_FORMAT = "";
my $BINARY_FORMAT = 'b';
my $DECIMAL_FORMAT = 'd';
my $FLOAT_FORMAT = 'f';
my $HEX_FORMAT = 'x';
my $RATIONAL_FORMAT = 'r';
my $STRING_FORMAT = 's';
+my $DECOMP_STRING_FORMAT = 'c';
my %map_table_formats = (
$BINARY_FORMAT => 'binary',
$INTEGER_FORMAT => 'integer',
$HEX_FORMAT => 'positive hex whole number; a code point',
$RATIONAL_FORMAT => 'rational: an integer or a fraction',
- $STRING_FORMAT => 'arbitrary string',
+ $STRING_FORMAT => 'string',
+ $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
);
# Unicode didn't put such derived files in a separate directory at first.
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
-# syllables
-my $SBase = 0xAC00;
-my $LBase = 0x1100;
-my $VBase = 0x1161;
-my $TBase = 0x11A7;
+# syllables. The '_string' versions are so generated tables can retain the
+# hex format, which is the more familiar value
+my $SBase_string = "0xAC00";
+my $SBase = CORE::hex $SBase_string;
+my $LBase_string = "0x1100";
+my $LBase = CORE::hex $LBase_string;
+my $VBase_string = "0x1161";
+my $VBase = CORE::hex $VBase_string;
+my $TBase_string = "0x11A7";
+my $TBase = CORE::hex $TBase_string;
my $SCount = 11172;
my $LCount = 19;
my $VCount = 21;
my %Jamo_V; # Vowels
my %Jamo_T; # Trailing consonants
+my @backslash_X_tests; # List of tests read in for testing \X
my @unhandled_properties; # Will contain a list of properties found in
# the input that we didn't process.
my @match_properties; # Properties that have match tables, to be
my $gc;
my $perl;
my $block;
+my $perl_charname;
+my $print;
# Are there conflicting names because of beginning with 'In_', or 'Is_'
my $has_In_conflicts = 0;
no overloading; # If overloaded, numifying below won't work.
# Numifying a ref gives its address.
- return 0 + $_[0];
+ return pack 'J', $_[0];
+}
+
+# These are used only if $annotate is true.
+# The entire range of Unicode characters is examined to populate these
+# after all the input has been processed. But most can be skipped, as they
+# have the same descriptive phrases, such as being unassigned
+my @viacode; # Contains the 1 million character names
+my @printable; # boolean: And are those characters printable?
+my @annotate_char_type; # Contains a type of those characters, specifically
+ # for the purposes of annotation.
+my $annotate_ranges; # A map of ranges of code points that have the same
+ # name for the purposes of annotation. They map to the
+ # upper edge of the range, so that the end point can
+ # be immediately found. This is used to skip ahead to
+ # the end of a range, and avoid processing each
+ # individual code point in it.
+my $unassigned_sans_noncharacters; # A Range_List of the unassigned
+ # characters, but excluding those which are
+ # also noncharacter code points
+
+# The annotation types are an extension of the regular range types, though
+# some of the latter are folded into one. Make the new types negative to
+# avoid conflicting with the regular types
+my $SURROGATE_TYPE = -1;
+my $UNASSIGNED_TYPE = -2;
+my $PRIVATE_USE_TYPE = -3;
+my $NONCHARACTER_TYPE = -4;
+my $CONTROL_TYPE = -5;
+my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
+
+sub populate_char_info ($) {
+ # Used only with the $annotate option. Populates the arrays with the
+ # input code point's info that are needed for outputting more detailed
+ # comments. If calling context wants a return, it is the end point of
+ # any contiguous range of characters that share essentially the same info
+
+ my $i = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ $viacode[$i] = $perl_charname->value_of($i) || "";
+
+ # A character is generally printable if Unicode says it is,
+ # but below we make sure that most Unicode general category 'C' types
+ # aren't.
+ $printable[$i] = $print->contains($i);
+
+ $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
+
+ # Only these two regular types are treated specially for annotations
+ # purposes
+ $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
+ && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
+
+ # Give a generic name to all code points that don't have a real name.
+ # We output ranges, if applicable, for these. Also calculate the end
+ # point of the range.
+ my $end;
+ if (! $viacode[$i]) {
+ if ($gc-> table('Surrogate')->contains($i)) {
+ $viacode[$i] = 'Surrogate';
+ $annotate_char_type[$i] = $SURROGATE_TYPE;
+ $printable[$i] = 0;
+ $end = $gc->table('Surrogate')->containing_range($i)->end;
+ }
+ elsif ($gc-> table('Private_use')->contains($i)) {
+ $viacode[$i] = 'Private Use';
+ $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
+ $printable[$i] = 0;
+ $end = $gc->table('Private_Use')->containing_range($i)->end;
+ }
+ elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
+ contains($i))
+ {
+ $viacode[$i] = 'Noncharacter';
+ $annotate_char_type[$i] = $NONCHARACTER_TYPE;
+ $printable[$i] = 0;
+ $end = property_ref('Noncharacter_Code_Point')->table('Y')->
+ containing_range($i)->end;
+ }
+ elsif ($gc-> table('Control')->contains($i)) {
+ $viacode[$i] = 'Control';
+ $annotate_char_type[$i] = $CONTROL_TYPE;
+ $printable[$i] = 0;
+ $end = 0x81 if $i == 0x80; # Hard-code this one known case
+ }
+ elsif ($gc-> table('Unassigned')->contains($i)) {
+ $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
+ $annotate_char_type[$i] = $UNASSIGNED_TYPE;
+ $printable[$i] = 0;
+
+ # Because we name the unassigned by the blocks they are in, it
+ # can't go past the end of that block, and it also can't go past
+ # the unassigned range it is in. The special table makes sure
+ # that the non-characters, which are unassigned, are separated
+ # out.
+ $end = min($block->containing_range($i)->end,
+ $unassigned_sans_noncharacters-> containing_range($i)->
+ end);
+ }
+ else {
+ Carp::my_carp_bug("Can't figure out how to annotate "
+ . sprintf("U+%04X", $i)
+ . ". Proceeding anyway.");
+ $viacode[$i] = 'UNKNOWN';
+ $annotate_char_type[$i] = $UNKNOWN_TYPE;
+ $printable[$i] = 0;
+ }
+ }
+
+ # Here, has a name, but if it's one in which the code point number is
+ # appended to the name, do that.
+ elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
+ $viacode[$i] .= sprintf("-%04X", $i);
+ $end = $perl_charname->containing_range($i)->end;
+ }
+
+ # And here, has a name, but if it's a hangul syllable one, replace it with
+ # the correct name from the Unicode algorithm
+ elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
+ use integer;
+ my $SIndex = $i - $SBase;
+ my $L = $LBase + $SIndex / $NCount;
+ my $V = $VBase + ($SIndex % $NCount) / $TCount;
+ my $T = $TBase + $SIndex % $TCount;
+ $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
+ $viacode[$i] .= $Jamo{$T} if $T != $TBase;
+ $end = $perl_charname->containing_range($i)->end;
+ }
+
+ return if ! defined wantarray;
+ return $i if ! defined $end; # If not a range, return the input
+
+ # Save this whole range so can find the end point quickly
+ $annotate_ranges->add_map($i, $end, $end);
+
+ return $end;
}
# Commented code below should work on Perl 5.8.
# bless $_[0], 'main::Fake';
#
# # Numifying a ref gives its address.
-# my $addr = 0 + $_[0];
+# my $addr = pack 'J', $_[0];
#
# # Return to original class
# bless $_[0], $pkg;
our $Verbose = 1 if main::DEBUG; # Useful info when debugging
+# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
+# and overload trying to load Scalar:Util under miniperl. See
+# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
+undef $overload::VERSION;
+
sub my_carp {
my $message = shift || "";
my $nofold = shift || 0;
# Use typeglob to give the anonymous subroutine the name we want
*$destroy_name = sub {
my $self = shift;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
$self->$destroy_callback if $destroy_callback;
foreach my $field (keys %{$package_fields{$package}}) {
# "protection" is only by convention. All that happens is that the
# accessor functions' names begin with an underscore. So instead of
# calling set_foo, the call is _set_foo. (Real protection could be
- # accomplished by having a new subroutine, end_package called at the
+ # accomplished by having a new subroutine, end_package, called at the
# end of each package, and then storing the __LINE__ ranges and
# checking them on every accessor. But that is way overkill.)
return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
my $self = shift;
my $value = shift;
+ my $addr = do { no overloading; pack 'J', $self; };
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
if (ref $value) {
- return if grep { $value == $_ }
- @{$field->{main::objaddr $self}};
+ return if grep { $value == $_ } @{$field->{$addr}};
}
else {
- return if grep { $value eq $_ }
- @{$field->{main::objaddr $self}};
+ return if grep { $value eq $_ } @{$field->{$addr}};
}
- push @{$field->{main::objaddr $self}}, $value;
+ push @{$field->{$addr}}, $value;
return;
}
}
*$subname = sub {
use strict "refs";
Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
- my $addr = main::objaddr $_[0];
+ my $addr = do { no overloading; pack 'J', $_[0]; };
if (ref $field->{$addr} ne 'ARRAY') {
my $type = ref $field->{$addr};
$type = 'scalar' unless $type;
*$subname = sub {
use strict "refs";
Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
- return $field->{main::objaddr $_[0]};
+ no overloading;
+ return $field->{pack 'J', $_[0]};
}
}
}
Carp::carp_extra_args(\@_) if @_ > 2;
}
# $self is $_[0]; $value is $_[1]
- $field->{main::objaddr $_[0]} = $_[1];
+ no overloading;
+ $field->{pack 'J', $_[0]} = $_[1];
return;
}
}
sub trace { return main::trace(@_); }
-
{ # Closure
# Keep track of fields that are to be put into the constructor.
my %constructor_fields;
# processed when you set the $debug_skip global.
main::set_access('non_skip', \%non_skip, 'c');
+ my %skip;
+ # This is used to skip processing of this input file semi-permanently.
+ # It is used for files that we aren't planning to process anytime soon,
+ # but want to allow to be in the directory and not raise a message that we
+ # are not handling. Mostly for test files. This is in contrast to the
+ # non_skip element, which is supposed to be used very temporarily for
+ # debugging. Sets 'optional' to 1
+ main::set_access('skip', \%skip, 'c');
+
my %each_line_handler;
# list of subroutines to look at and filter each non-comment line in the
# file. defaults to none. The subroutines are called in order, each is
my $class = shift;
my $self = bless \do{ my $anonymous_scalar }, $class;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
# Set defaults
$handler{$addr} = \&main::process_generic_property_file;
$non_skip{$addr} = 0;
+ $skip{$addr} = 0;
$has_missings_defaults{$addr} = $NO_DEFAULTS;
$handle{$addr} = undef;
$added_lines{$addr} = [ ];
$missings{$addr} = [ ];
# Two positional parameters.
+ return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
$file{$addr} = main::internal_file_to_platform(shift);
$first_released{$addr} = shift;
print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
}
+ $optional{$addr} = 1 if $skip{$addr};
+
return $self;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
my $file = $file{$addr};
}
# File could be optional
- if ($optional{$addr}){
+ if ($optional{$addr}) {
return unless -e $file;
my $result = eval $optional{$addr};
if (! defined $result) {
}
else {
- # Here, the file exists
+ # Here, the file exists. Some platforms may change the case of
+ # its name
if ($seen_non_extracted_non_age) {
- if ($file =~ /$EXTRACTED/) {
+ if ($file =~ /$EXTRACTED/i) {
Carp::my_carp_bug(join_lines(<<END
-$file should be processed just after the 'Prop..Alias' files, and before
+$file should be processed just after the 'Prop...Alias' files, and before
anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
have subtle problems
END
}
elsif ($EXTRACTED_DIR
&& $first_released{$addr} ne v0
- && $file !~ /$EXTRACTED/
- && $file ne 'DAge.txt')
+ && $file !~ /$EXTRACTED/i
+ && lc($file) ne 'dage.txt')
{
# We don't set this (by the 'if' above) if we have no
# extracted directory, so if running on an early version,
# isn't a file we are expecting. As we process the files,
# they are deleted from the hash, so any that remain at the
# end of the program are files that we didn't process.
+ my $fkey = File::Spec->rel2abs($file);
+ my $expecting = delete $potential_files{$fkey};
+ $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
Carp::my_carp("Was not expecting '$file'.") if
- ! delete $potential_files{File::Spec->rel2abs($file)}
+ ! $expecting
&& ! defined $handle{$addr};
+ # Having deleted from expected files, we can quit if not to do
+ # anything. Don't print progress unless really want verbosity
+ if ($skip{$addr}) {
+ print "Skipping $file.\n" if $verbosity >= $VERBOSE;
+ return;
+ }
+
# Open the file, converting the slashes used in this program
# into the proper form for the OS
my $file_handle;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# Here the file is open (or if the handle is not a ref, is an open
# 'virtual' file). Get the next line; any inserted lines get priority
# # an each_line_handler() on the line.
#
# my $self = shift;
-# my $addr = main::objaddr $self;
+# my $addr = do { no overloading; pack 'J', $self; };
#
# foreach my $inserted_ref (@{$added_lines{$addr}}) {
# my ($adjusted, $line) = @{$inserted_ref};
# Each inserted line is an array, with the first element being 0 to
# indicate that this line hasn't been adjusted, and needs to be
# processed.
- push @{$added_lines{main::objaddr $self}}, map { [ 0, $_ ] } @_;
+ no overloading;
+ push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
return;
}
# Each inserted line is an array, with the first element being 1 to
# indicate that this line has been adjusted
- push @{$added_lines{main::objaddr $self}}, map { [ 1, $_ ] } @_;
+ no overloading;
+ push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
return;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# If not accepting a list return, just return the first one.
return shift @{$missings{$addr}} unless wantarray;
sub _insert_property_into_line {
# Add a property field to $_, if this file requires it.
- my $property = $property{main::objaddr shift};
+ my $self = shift;
+ my $addr = do { no overloading; pack 'J', $self; };
+ my $property = $property{$addr};
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$_ =~ s/(;|$)/; $property$1/;
my $message = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
$message = 'Unexpected line' unless $message;
# increment the count of how many times it has occurred
unless ($errors{$addr}->{$message}) {
Carp::my_carp("$message in '$_' in "
- . $file{main::objaddr $self}
+ . $file{$addr}
. " at line $.. Skipping this line;");
$errors{$addr}->{$message} = 1;
}
my $class = shift;
my $self = bless \do{my $anonymous_scalar}, $class;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
while (@_ > 1) {
my $default = shift;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
return each %{$class_defaults{$addr}};
}
my $class = shift;
my $self = bless \do { my $anonymous_scalar }, $class;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
$name{$addr} = shift;
$loose_match{$addr} = shift;
my $class = shift;
my $self = bless \do { my $anonymous_scalar }, $class;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
$start{$addr} = shift;
$end{$addr} = shift;
sub _operator_stringify {
my $self = shift;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# Output it like '0041..0065 (value)'
my $return = sprintf("%04X", $start{$addr})
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
return $standard_form{$addr} if defined $standard_form{$addr};
return $value{$addr};
my $indent = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
my $return = $indent
. sprintf("%04X", $start{$addr})
# There are a number of methods to manipulate range lists, and some operators
# are overloaded to handle them.
-# Because of the slowness of pure Perl objaddr() on miniperl, and measurements
-# showing this package was using a lot of real time calculating that, the code
-# was changed to only calculate it once per call stack. This is done by
-# consistently using the package variable $addr in routines, and only calling
-# objaddr() if it isn't defined, and setting that to be local, so that callees
-# will have it already. It would be a good thing to change this. XXX
-
sub trace { return main::trace(@_); }
{ # Closure
return _union($class, $initialize, %args) if defined $initialize;
$self = bless \do { my $anonymous_scalar }, $class;
- local $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
# Optional parent object, only for debug info.
$owner_name_of{$addr} = delete $args{'Owner'};
sub _operator_stringify {
my $self = shift;
- local $addr = main::objaddr($self) if !defined $addr;
+ my $addr = do { no overloading; pack 'J', $self; };
return "Range_List attached to '$owner_name_of{$addr}'"
if $owner_name_of{$addr};
if (! defined $arg) {
my $message = "";
if (defined $self) {
- $message .= $owner_name_of{main::objaddr $self};
+ no overloading;
+ $message .= $owner_name_of{pack 'J', $self};
}
Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
return;
else {
my $message = "";
if (defined $self) {
- $message .= $owner_name_of{main::objaddr $self};
+ no overloading;
+ $message .= $owner_name_of{pack 'J', $self};
}
Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
return;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr($self) if ! defined $addr;
-
- return scalar @{$ranges{$addr}};
+ no overloading;
+ return scalar @{$ranges{pack 'J', $self}};
}
sub min {
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr($self) if ! defined $addr;
+ my $addr = do { no overloading; pack 'J', $self; };
# If the range list is empty, return a large value that isn't adjacent
# to any that could be in the range list, for simpler tests
my $codepoint = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr $self if ! defined $addr;
-
my $i = $self->_search_ranges($codepoint);
return 0 unless defined $i;
# range[$i-1]->end < $codepoint <= range[$i]->end
# So is in the table if and only iff it is at least the start position
# of range $i.
- return 0 if $ranges{$addr}->[$i]->start > $codepoint;
+ no overloading;
+ return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
return $i + 1;
}
- sub value_of {
- # Returns the value associated with the code point, undef if none
+ sub containing_range {
+ # Returns the range object that contains the code point, undef if none
my $self = shift;
my $codepoint = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr $self if ! defined $addr;
-
my $i = $self->contains($codepoint);
return unless $i;
# contains() returns 1 beyond where we should look
- return $ranges{$addr}->[$i-1]->value;
+ no overloading;
+ return $ranges{pack 'J', $self}->[$i-1];
+ }
+
+ sub value_of {
+ # Returns the value associated with the code point, undef if none
+
+ my $self = shift;
+ my $codepoint = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $range = $self->containing_range($codepoint);
+ return unless defined $range;
+
+ return $range->value;
+ }
+
+ sub type_of {
+ # Returns the type of the range containing the code point, undef if
+ # the code point is not in the table
+
+ my $self = shift;
+ my $codepoint = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $range = $self->containing_range($codepoint);
+ return unless defined $range;
+
+ return $range->type;
}
sub _search_ranges {
my $code_point = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr $self if ! defined $addr;
+ my $addr = do { no overloading; pack 'J', $self; };
return if $code_point > $max{$addr};
my $r = $ranges{$addr}; # The current list of ranges
#
# The range list is kept sorted so that the range with the lowest
# starting position is first in the list, and generally, adjacent
- # ranges with the same values are merged into single larger one (see
+ # ranges with the same values are merged into a single larger one (see
# exceptions below).
#
- # There are more parameters, all are key => value pairs:
+ # There are more parameters; all are key => value pairs:
# Type gives the type of the value. It is only valid for '+'.
# All ranges have types; if this parameter is omitted, 0 is
# assumed. Ranges with type 0 are assumed to obey the
# => $IF_NOT_EQUIVALENT means to replace the existing values
# with this one if they are not equivalent.
# Ranges are equivalent if their types are the
- # same, and they are the same string, or if
+ # same, and they are the same string; or if
# both are type 0 ranges, if their Unicode
# standard forms are identical. In this last
# case, the routine chooses the more "modern"
# existing one, but has a different value,
# don't replace the existing one, but insert
# this, one so that the same range can occur
- # multiple times.
+ # multiple times. They are stored LIFO, so
+ # that the final one inserted is the first one
+ # returned in an ordered search of the table.
# => anything else is the same as => $IF_NOT_EQUIVALENT
#
- # "same value" means identical for type-0 ranges, and it means having
- # the same standard forms for non-type-0 ranges.
+ # "same value" means identical for non-type-0 ranges, and it means
+ # having the same standard forms for type-0 ranges.
return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
- local $addr = main::objaddr($self) if ! defined $addr;
+ my $addr = do { no overloading; pack 'J', $self; };
if ($operation ne '+' && $operation ne '-') {
Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
return;
}
- # Here, we have taken care of the case where $replace is $NO, which
- # means that whatever action we now take is done unconditionally. It
- # still could be that this call will result in a no-op, if duplicates
- # aren't allowed, and we are inserting a range that merely duplicates
- # data already in the range list; or also if deleting a non-existent
- # range.
- # $i still points to the first potential affected range. Now find the
- # highest range affected, which will determine the length parameter to
- # splice. (The input range can span multiple existing ones.) While
- # we are looking through the range list, see also if this is an
- # insertion that will change the values of at least one of the
- # affected ranges. We don't need to do this check unless this is an
- # insertion of non-multiples, and also since this is a boolean, we
- # don't need to do it if have already determined that it will make a
- # change; just unconditionally change them. $cdm is created to be 1
- # if either of these is true. (The 'c' in the name comes from below)
- my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
+ # Here, we have taken care of the case where $replace is $NO.
+ # Remember that here, r[$i-1]->end < $start <= r[$i]->end
+ # If inserting a multiple record, this is where it goes, before the
+ # first (if any) existing one. This implies an insertion, and no
+ # change to any existing ranges. Note that $i can be -1 if this new
+ # range doesn't actually duplicate any existing, and comes at the
+ # beginning of the list.
+ if ($replace == $MULTIPLE) {
+
+ if ($start != $end) {
+ Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
+ return;
+ }
+
+ # Don't add an exact duplicate, as it isn't really a multiple
+ if ($end >= $r->[$i]->start) {
+ if ($r->[$i]->start != $r->[$i]->end) {
+ Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the other range ($r->[$i]) contains more than one code point. No action taken.");
+ return;
+ }
+ return if $value eq $r->[$i]->value && $type eq $r->[$i]->type;
+ }
+
+ trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
+ my @return = splice @$r,
+ $i,
+ 0,
+ Range->new($start,
+ $end,
+ Value => $value,
+ Type => $type);
+ if (main::DEBUG && $to_trace) {
+ trace "After splice:";
+ trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
+ trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
+ trace "i =[", $i, "]", $r->[$i] if $i >= 0;
+ trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
+ trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
+ trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
+ }
+ return @return;
+ }
+
+ # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
+ # delete, insert, and replace either unconditionally or if not
+ # equivalent. $i still points to the first potential affected range.
+ # Now find the highest range affected, which will determine the length
+ # parameter to splice. (The input range can span multiple existing
+ # ones.) If this isn't a deletion, while we are looking through the
+ # range list, see also if this is a replacement rather than a clean
+ # insertion; that is if it will change the values of at least one
+ # existing range. Start off assuming it is an insert, until find it
+ # isn't.
+ my $clean_insert = $operation eq '+';
my $j; # This will point to the highest affected range
# For non-zero types, the standard form is the value itself;
# searching
last if $end < $r->[$j]->start;
- # Here, overlaps the range at $j. If the value's don't match,
- # and this is supposedly an insertion, it becomes a change
- # instead. This is what the 'c' stands for in $cdm.
- if (! $cdm) {
+ # Here, overlaps the range at $j. If the values don't match,
+ # and so far we think this is a clean insertion, it becomes a
+ # non-clean insertion, i.e., a 'change' or 'replace' instead.
+ if ($clean_insert) {
if ($r->[$j]->standard_form ne $standard_form) {
- $cdm = 1;
+ $clean_insert = 0;
+ if ($replace == $CROAK) {
+ main::croak("The range to add "
+ . sprintf("%04X", $start)
+ . '-'
+ . sprintf("%04X", $end)
+ . " with value '$value' overlaps an existing range $r->[$j]");
+ }
}
else {
# same, but the non-standardized values aren't. If
# replacing unconditionally, then replace
if( $replace == $UNCONDITIONALLY) {
- $cdm = 1;
+ $clean_insert = 0;
}
else {
&& $pre_existing =~ /[a-z]/;
if ($old_mixed != $new_mixed) {
- $cdm = 1 if $new_mixed;
+ $clean_insert = 0 if $new_mixed;
if (main::DEBUG && $to_trace) {
- if ($cdm) {
- trace "Replacing $pre_existing with $value";
+ if ($clean_insert) {
+ trace "Retaining $pre_existing over $value";
}
else {
- trace "Retaining $pre_existing over $value";
+ trace "Replacing $pre_existing with $value";
}
}
}
my $old_punct = $pre_existing =~ /[-_]/;
if ($old_punct != $new_punct) {
- $cdm = 1 if $new_punct;
+ $clean_insert = 0 if $new_punct;
if (main::DEBUG && $to_trace) {
- if ($cdm) {
- trace "Replacing $pre_existing with $value";
+ if ($clean_insert) {
+ trace "Retaining $pre_existing over $value";
}
else {
- trace "Retaining $pre_existing over $value";
+ trace "Replacing $pre_existing with $value";
}
}
} # else existing one is just as "good";
$j--; # $j now points to the highest affected range.
trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
- # If inserting a multiple record, this is where it goes, after all the
- # existing ones for this range. This implies an insertion, and no
- # change to any existing ranges. Note that $j can be -1 if this new
- # range doesn't actually duplicate any existing, and comes at the
- # beginning of the list, in which case we can handle it like any other
- # insertion, and is easier to do so.
- if ($replace == $MULTIPLE && $j >= 0) {
-
- # This restriction could be remedied with a little extra work, but
- # it won't hopefully ever be necessary
- if ($r->[$j]->start != $r->[$j]->end) {
- Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken.");
- return;
- }
-
- # Don't add an exact duplicate, as it isn't really a multiple
- return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
-
- trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
- my @return = splice @$r,
- $j+1,
- 0,
- Range->new($start,
- $end,
- Value => $value,
- Type => $type);
- if (main::DEBUG && $to_trace) {
- trace "After splice:";
- trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
- trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
- trace "j =[", $j, "]", $r->[$j] if $j >= 0;
- trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
- trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
- trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
- }
- return @return;
- }
-
# Here, have taken care of $NO and $MULTIPLE replaces.
# $j points to the highest affected range. But it can be < $i or even
# -1. These happen only if the insertion is entirely in the gap
}
else {
- # Here the entire input range is not in the gap before $i. There
- # is an affected one, and $j points to the highest such one.
+ # Here part of the input range is not in the gap before $i. Thus,
+ # there is at least one affected one, and $j points to the highest
+ # such one.
# At this point, here is the situation:
# This is not an insertion of a multiple, nor of tentative ($NO)
# r[$i-1]->end < $start <= $end <= r[$j]->end
#
# Also:
- # $cdm is a boolean which is set true if and only if this is a
- # change or deletion (multiple was handled above). In
- # other words, it could be renamed to be just $cd.
+ # $clean_insert is a boolean which is set true if and only if
+ # this is a "clean insertion", i.e., not a change nor a
+ # deletion (multiple was handled above).
# We now have enough information to decide if this call is a no-op
- # or not. It is a no-op if it is a deletion of a non-existent
- # range, or an insertion of already existing data.
+ # or not. It is a no-op if this is an insertion of already
+ # existing data.
- if (main::DEBUG && $to_trace && ! $cdm
+ if (main::DEBUG && $to_trace && $clean_insert
&& $i == $j
&& $start >= $r->[$i]->start)
{
trace "no-op";
}
- return if ! $cdm # change or delete => not no-op
+ return if $clean_insert
&& $i == $j # more than one affected range => not no-op
# Here, r[$i-1]->end < $start <= $end <= r[$i]->end
$extends_above = ($j+1 < $range_list_size
&& $r->[$j+1]->start == $end +1
&& $r->[$j+1]->standard_form eq $standard_form
- && $r->[$j-1]->type == $type);
+ && $r->[$j+1]->type == $type);
}
if ($extends_below && $extends_above) { # Adds to both
$splice_start--; # start replace at element below
# Here the new element adds to the one below, but not to the
# one above. If inserting, and only to that one range, can
# just change its ending to include the new one.
- if ($length == 0 && ! $cdm) {
+ if ($length == 0 && $clean_insert) {
$r->[$i-1]->set_end($end);
trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
return;
# Here the new element adds to the one above, but not below.
# Mirror the code above
- if ($length == 0 && ! $cdm) {
+ if ($length == 0 && $clean_insert) {
$r->[$j+1]->set_start($start);
trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
return;
trace "i =[", $i, "]", $r->[$i];
trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
- trace "removed @return";
+ trace "removed ", @return if @return;
}
# An actual deletion could have changed the maximum in the list.
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr $self if ! defined $addr;
-
- undef $each_range_iterator{$addr};
+ no overloading;
+ undef $each_range_iterator{pack 'J', $self};
return;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr($self) if ! defined $addr;
+ my $addr = do { no overloading; pack 'J', $self; };
return if $self->is_empty;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr($self) if ! defined $addr;
+ my $addr = do { no overloading; pack 'J', $self; };
my $count = 0;
foreach my $range (@{$ranges{$addr}}) {
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr($self) if ! defined $addr;
- return scalar @{$ranges{$addr}} == 0;
+ no overloading;
+ return scalar @{$ranges{pack 'J', $self}} == 0;
}
sub hash {
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- local $addr = main::objaddr($self) if ! defined $addr;
+ my $addr = do { no overloading; pack 'J', $self; };
# These are quickly computable. Return looks like 'min..max;count'
return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
return $self->_add_delete('+', $start, $end, "");
}
- my $non_ASCII = (ord('A') == 65); # Assumes test on same platform
+ sub matches_identically_to {
+ # Return a boolean as to whether or not two Range_Lists match identical
+ # sets of code points.
+
+ my $self = shift;
+ my $other = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ # These are ordered in increasing real time to figure out (at least
+ # until a patch changes that and doesn't change this)
+ return 0 if $self->max != $other->max;
+ return 0 if $self->min != $other->min;
+ return 0 if $self->range_count != $other->range_count;
+ return 0 if $self->count != $other->count;
+
+ # Here they could be identical because all the tests above passed.
+ # The loop below is somewhat simpler since we know they have the same
+ # number of elements. Compare range by range, until reach the end or
+ # find something that differs.
+ my @a_ranges = $self->ranges;
+ my @b_ranges = $other->ranges;
+ for my $i (0 .. @a_ranges - 1) {
+ my $a = $a_ranges[$i];
+ my $b = $b_ranges[$i];
+ trace "self $a; other $b" if main::DEBUG && $to_trace;
+ return 0 if $a->start != $b->start || $a->end != $b->end;
+ }
+ return 1;
+ }
sub is_code_point_usable {
# This used only for making the test script. See if the input
return 0 if $code < 0; # Never use a negative
- # For non-ASCII, we shun the characters that don't have Perl encoding-
- # independent symbols for them. 'A' is such a symbol, so is "\n".
- # Note, this program hopefully will work on 5.8 Perls, and \v is not
- # such a symbol in them.
- return $try_hard if $non_ASCII
- && $code <= 0xFF
- && ($code >= 0x7F
- || ($code >= 0x0E && $code <= 0x1F)
- || ($code >= 0x01 && $code <= 0x06)
- || $code == 0x0B); # \v introduced after 5.8
-
# shun null. I'm (khw) not sure why this was done, but NULL would be
# the character very frequently used.
return $try_hard if $code == 0x0000;
- return 0 if $try_hard; # XXX Temporary until fix utf8.c
-
# shun non-character code points.
return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
# On first pass, don't choose less desirable code points; if no good
# one is found, repeat, allowing a less desirable one to be selected.
# standard.
main::set_access('perl_extension', \%perl_extension, 'r');
+ my %output_range_counts;
+ # A boolean set iff this table is to have comments written in the
+ # output file that contain the number of code points in the range.
+ # The constructor can override the global flag of the same name.
+ main::set_access('output_range_counts', \%output_range_counts, 'r');
+
+ my %format;
+ # The format of the entries of the table. This is calculated from the
+ # data in the table (or passed in the constructor). This is an enum e.g.,
+ # $STRING_FORMAT
+ main::set_access('format', \%format, 'r', 'p_s');
+
sub new {
# All arguments are key => value pairs, which you can see below, most
# of which match fields documented above. Otherwise: Pod_Entry,
my $class = shift;
my $self = bless \do { my $anonymous_scalar }, $class;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
my %args = @_;
$full_name{$addr} = delete $args{'Full_Name'};
my $complete_name = $complete_name{$addr}
= delete $args{'Complete_Name'};
+ $format{$addr} = delete $args{'Format'};
$internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
- $perl_extension{$addr} = delete $args{'Perl_Extension'} || 0;
+ $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
$property{$addr} = delete $args{'_Property'};
$range_list{$addr} = delete $args{'_Range_List'};
$status{$addr} = delete $args{'Status'} || $NORMAL;
my $loose_match = delete $args{'Fuzzy'};
my $note = delete $args{'Note'};
my $make_pod_entry = delete $args{'Pod_Entry'};
+ my $perl_extension = delete $args{'Perl_Extension'};
# Shouldn't have any left over
Carp::carp_extra_args(\%args) if main::DEBUG && %args;
# Can't use || above because conceivably the name could be 0, and
# can't use // operator in case this program gets used in Perl 5.8
$full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
+ $output_range_counts{$addr} = $output_range_counts if
+ ! defined $output_range_counts{$addr};
$aliases{$addr} = [ ];
$comment{$addr} = [ ];
push @{$description{$addr}}, $description if $description;
push @{$note{$addr}}, $note if $note;
- # If hasn't set its status already, see if it is on one of the lists
- # of properties or tables that have particular statuses; if not, is
- # normal. The lists are prioritized so the most serious ones are
- # checked first
- if (! $status{$addr}) {
- if (exists $why_suppressed{$complete_name}) {
+ if ($status{$addr} eq $PLACEHOLDER) {
+
+ # A placeholder table doesn't get documented, is a perl extension,
+ # and quite likely will be empty
+ $make_pod_entry = 0 if ! defined $make_pod_entry;
+ $perl_extension = 1 if ! defined $perl_extension;
+ push @tables_that_may_be_empty, $complete_name{$addr};
+ }
+ elsif (! $status{$addr}) {
+
+ # If hasn't set its status already, see if it is on one of the
+ # lists of properties or tables that have particular statuses; if
+ # not, is normal. The lists are prioritized so the most serious
+ # ones are checked first
+ if (exists $why_suppressed{$complete_name}
+ # Don't suppress if overridden
+ && ! grep { $_ eq $complete_name{$addr} }
+ @output_mapped_properties)
+ {
$status{$addr} = $SUPPRESSED;
}
elsif (exists $why_deprecated{$complete_name}) {
}
}
+ $perl_extension{$addr} = $perl_extension || 0;
+
# By convention what typically gets printed only or first is what's
# first in the list, so put the full name there for good output
# clarity. Other routines rely on the full name being first on the
# Here are the methods that are required to be defined by any derived
# class
- for my $sub qw(
+ for my $sub (qw(
+ handle_special_range
append_to_body
pre_body
- )
- # append_to_body and pre_body are called in the write() method
- # to add stuff after the main body of the table, but before
- # its close; and to prepend stuff before the beginning of the
+ ))
+ # write() knows how to write out normal ranges, but it calls
+ # handle_special_range() when it encounters a non-normal one.
+ # append_to_body() is called by it after it has handled all
+ # ranges to add anything after the main portion of the table.
+ # And finally, pre_body() is called after all this to build up
+ # anything that should appear before the main portion of the
+ # table. Doing it this way allows things in the middle to
+ # affect what should appear before the main portion of the
# table.
{
no strict "refs";
sub ranges {
# Returns the array of ranges associated with this table.
- return $range_list{main::objaddr shift}->ranges;
+ no overloading;
+ return $range_list{pack 'J', shift}->ranges;
}
sub add_alias {
# release
$name = ucfirst($name) unless $name =~ /^k[A-Z]/;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# Figure out if should be loosely matched if not already specified.
if (! defined $loose_match) {
# This name may be shorter than any existing ones, so clear the cache
# of the shortest, so will have to be recalculated.
- undef $short_name{main::objaddr $self};
+ no overloading;
+ undef $short_name{pack 'J', $self};
return;
}
my $nominal_length_ptr = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# For efficiency, don't recalculate, but this means that adding new
# aliases could change what the shortest is, so the code that does
chomp $description;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- push @{$description{main::objaddr $self}}, $description;
+ no overloading;
+ push @{$description{pack 'J', $self}}, $description;
return;
}
chomp $note;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- push @{$note{main::objaddr $self}}, $note;
+ no overloading;
+ push @{$note{pack 'J', $self}}, $note;
return;
}
sub add_comment { # Adds the parameter as a comment.
+ return unless $debugging_build;
+
my $self = shift;
my $comment = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
chomp $comment;
- push @{$comment{main::objaddr $self}}, $comment;
+
+ no overloading;
+ push @{$comment{pack 'J', $self}}, $comment;
return;
}
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my @list = @{$comment{main::objaddr $self}};
+ my $addr = do { no overloading; pack 'J', $self; };
+ my @list = @{$comment{$addr}};
return @list if wantarray;
my $return = "";
foreach my $sentence (@list) {
# initialization for range lists.
my $self = shift;
+ my $addr = do { no overloading; pack 'J', $self; };
my $initialization = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# Replace the current range list with a new one of the same exact
# type.
- my $class = ref $range_list{main::objaddr $self};
- $range_list{main::objaddr $self} = $class->new(Owner => $self,
+ my $class = ref $range_list{$addr};
+ $range_list{$addr} = $class->new(Owner => $self,
Initialize => $initialization);
return;
my $return = "";
$return .= $DEVELOPMENT_ONLY if $compare_versions;
$return .= $HEADER;
- $return .= $INTERNAL_ONLY if $internal_only{main::objaddr $self};
+ no overloading;
+ $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
return $return;
}
sub write {
- # Write a representation of the table to its file.
+ # Write a representation of the table to its file. It calls several
+ # functions furnished by sub-classes of this abstract base class to
+ # handle non-normal ranges, to add stuff before the table, and at its
+ # end.
my $self = shift;
my $tab_stops = shift; # The number of tab stops over to put any
# the range
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
# Start with the header
- my @OUT = $self->header;
+ my @HEADER = $self->header;
# Then the comments
- push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
+ push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
if $comment{$addr};
- # Then any pre-body stuff.
- my $pre_body = $self->pre_body;
- push @OUT, $pre_body, "\n" if $pre_body;
-
- # The main body looks like a 'here' document
- push @OUT, "return <<'END';\n";
+ # Things discovered processing the main body of the document may
+ # affect what gets output before it, therefore pre_body() isn't called
+ # until after all other processing of the table is done.
+
+ # The main body looks like a 'here' document. If annotating, get rid
+ # of the comments before passing to the caller, as some callers, such
+ # as charnames.pm, can't cope with them. (Outputting range counts
+ # also introduces comments, but these don't show up in the tables that
+ # can't cope with comments, and there aren't that many of them that
+ # it's worth the extra real time to get rid of them).
+ my @OUT;
+ if ($annotate) {
+ # Use the line below in Perls that don't have /r
+ #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
+ push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
+ } else {
+ push @OUT, "return <<'END';\n";
+ }
if ($range_list{$addr}->is_empty) {
}
else {
my $range_size_1 = $range_size_1{$addr};
+ my $format; # Used only in $annotate option
+ my $include_name; # Used only in $annotate option
+
+ if ($annotate) {
+
+ # if annotating each code point, must print 1 per line.
+ # The variable could point to a subroutine, and we don't want
+ # to lose that fact, so only set if not set already
+ $range_size_1 = 1 if ! $range_size_1;
+
+ $format = $self->format;
+
+ # The name of the character is output only for tables that
+ # don't already include the name in the output.
+ my $property = $self->property;
+ $include_name =
+ ! ($property == $perl_charname
+ || $property == main::property_ref('Unicode_1_Name')
+ || $property == main::property_ref('Name')
+ || $property == main::property_ref('Name_Alias')
+ );
+ }
# Output each range as part of the here document.
+ RANGE:
for my $set ($range_list{$addr}->ranges) {
+ if ($set->type != 0) {
+ $self->handle_special_range($set);
+ next RANGE;
+ }
my $start = $set->start;
my $end = $set->end;
my $value = $set->value;
# Don't output ranges whose value is the one to suppress
- next if defined $suppress_value && $value eq $suppress_value;
+ next RANGE if defined $suppress_value
+ && $value eq $suppress_value;
- # If has or wants a single point range output
- if ($start == $end || $range_size_1) {
- for my $i ($start .. $end) {
- push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
- }
- }
- else {
- push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
+ # If there is a range and doesn't need a single point range
+ # output
+ if ($start != $end && ! $range_size_1) {
+ push @OUT, sprintf "%04X\t%04X", $start, $end;
+ $OUT[-1] .= "\t$value" if $value ne "";
# Add a comment with the size of the range, if requested.
# Expand Tabs to make sure they all start in the same
# column, and then unexpand to use mostly tabs.
- if (! $output_range_counts) {
+ if (! $output_range_counts{$addr}) {
$OUT[-1] .= "\n";
}
else {
$count);
$OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
}
+ next RANGE;
+ }
+
+ # Here to output a single code point per line
+
+ # If not to annotate, use the simple formats
+ if (! $annotate) {
+
+ # Use any passed in subroutine to output.
+ if (ref $range_size_1 eq 'CODE') {
+ for my $i ($start .. $end) {
+ push @OUT, &{$range_size_1}($i, $value);
+ }
+ }
+ else {
+
+ # Here, caller is ok with default output.
+ for (my $i = $start; $i <= $end; $i++) {
+ push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
+ }
+ }
+ next RANGE;
+ }
+
+ # Here, wants annotation.
+ for (my $i = $start; $i <= $end; $i++) {
+
+ # Get character information if don't have it already
+ main::populate_char_info($i)
+ if ! defined $viacode[$i];
+ my $type = $annotate_char_type[$i];
+
+ # Figure out if should output the next code points as part
+ # of a range or not. If this is not in an annotation
+ # range, then won't output as a range, so returns $i.
+ # Otherwise use the end of the annotation range, but no
+ # further than the maximum possible end point of the loop.
+ my $range_end = main::min($annotate_ranges->value_of($i)
+ || $i,
+ $end);
+
+ # Use a range if it is a range, and either is one of the
+ # special annotation ranges, or the range is at most 3
+ # long. This last case causes the algorithmically named
+ # code points to be output individually in spans of at
+ # most 3, as they are the ones whose $type is > 0.
+ if ($range_end != $i
+ && ( $type < 0 || $range_end - $i > 2))
+ {
+ # Here is to output a range. We don't allow a
+ # caller-specified output format--just use the
+ # standard one.
+ push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
+ $range_end,
+ $value;
+ my $range_name = $viacode[$i];
+
+ # For the code points which end in their hex value, we
+ # eliminate that from the output annotation, and
+ # capitalize only the first letter of each word.
+ if ($type == $CP_IN_NAME) {
+ my $hex = sprintf "%04X", $i;
+ $range_name =~ s/-$hex$//;
+ my @words = split " ", $range_name;
+ for my $word (@words) {
+ $word = ucfirst(lc($word)) if $word ne 'CJK';
+ }
+ $range_name = join " ", @words;
+ }
+ elsif ($type == $HANGUL_SYLLABLE) {
+ $range_name = "Hangul Syllable";
+ }
+
+ $OUT[-1] .= " $range_name" if $range_name;
+
+ # Include the number of code points in the range
+ my $count = main::clarify_number($range_end - $i + 1);
+ $OUT[-1] .= " [$count]\n";
+
+ # Skip to the end of the range
+ $i = $range_end;
+ }
+ else { # Not in a range.
+ my $comment = "";
+
+ # When outputting the names of each character, use
+ # the character itself if printable
+ $comment .= "'" . chr($i) . "' " if $printable[$i];
+
+ # To make it more readable, use a minimum indentation
+ my $comment_indent;
+
+ # Determine the annotation
+ if ($format eq $DECOMP_STRING_FORMAT) {
+
+ # This is very specialized, with the type of
+ # decomposition beginning the line enclosed in
+ # <...>, and the code points that the code point
+ # decomposes to separated by blanks. Create two
+ # strings, one of the printable characters, and
+ # one of their official names.
+ (my $map = $value) =~ s/ \ * < .*? > \ +//x;
+ my $tostr = "";
+ my $to_name = "";
+ my $to_chr = "";
+ foreach my $to (split " ", $map) {
+ $to = CORE::hex $to;
+ $to_name .= " + " if $to_name;
+ $to_chr .= chr($to);
+ main::populate_char_info($to)
+ if ! defined $viacode[$to];
+ $to_name .= $viacode[$to];
+ }
+
+ $comment .=
+ "=> '$to_chr'; $viacode[$i] => $to_name";
+ $comment_indent = 25; # Determined by experiment
+ }
+ else {
+
+ # Assume that any table that has hex format is a
+ # mapping of one code point to another.
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ main::populate_char_info($decimal_value)
+ if ! defined $viacode[$decimal_value];
+ $comment .= "=> '"
+ . chr($decimal_value)
+ . "'; " if $printable[$decimal_value];
+ }
+ $comment .= $viacode[$i] if $include_name
+ && $viacode[$i];
+ if ($format eq $HEX_FORMAT) {
+ my $decimal_value = CORE::hex $value;
+ $comment .= " => $viacode[$decimal_value]"
+ if $viacode[$decimal_value];
+ }
+
+ # If including the name, no need to indent, as the
+ # name will already be way across the line.
+ $comment_indent = ($include_name) ? 0 : 60;
+ }
+
+ # Use any passed in routine to output the base part of
+ # the line.
+ if (ref $range_size_1 eq 'CODE') {
+ my $base_part = &{$range_size_1}($i, $value);
+ chomp $base_part;
+ push @OUT, $base_part;
+ }
+ else {
+ push @OUT, sprintf "%04X\t\t%s", $i, $value;
+ }
+
+ # And add the annotation.
+ $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
+ $OUT[-1],
+ $comment if $comment;
+ $OUT[-1] .= "\n";
+ }
}
} # End of loop through all the table's ranges
}
# And finish the here document.
push @OUT, "END\n";
+ # Done with the main portion of the body. Can now figure out what
+ # should appear before it in the file.
+ my $pre_body = $self->pre_body;
+ push @HEADER, $pre_body, "\n" if $pre_body;
+
# All these files have a .pl suffix
$file_path{$addr}->[-1] .= '.pl';
- main::write($file_path{$addr}, \@OUT);
+ main::write($file_path{$addr},
+ $annotate, # utf8 iff annotating
+ \@HEADER,
+ \@OUT);
return;
}
my $info = shift; # Any message associated with it.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
$status{$addr} = $status;
$status_info{$addr} = $info;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
$locked{$addr} = "";
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
return 0 if ! $locked{$addr};
Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
my $self = shift;
# Rest of parameters passed on
- @{$file_path{main::objaddr $self}} = @_;
+ no overloading;
+ @{$file_path{pack 'J', $self}} = @_;
return
}
# Accessors for the range list stored in this table. First for
# unconditional
- for my $sub qw(
+ for my $sub (qw(
+ containing_range
contains
count
each_range
hash
is_empty
+ matches_identically_to
max
min
range_count
reset_each_range
+ type_of
value_of
- )
+ ))
{
no strict "refs";
*$sub = sub {
use strict "refs";
my $self = shift;
- return $range_list{main::objaddr $self}->$sub(@_);
+ no overloading;
+ return $range_list{pack 'J', $self}->$sub(@_);
}
}
# Then for ones that should fail if locked
- for my $sub qw(
+ for my $sub (qw(
delete_range
- )
+ ))
{
no strict "refs";
*$sub = sub {
my $self = shift;
return if $self->carp_if_locked;
- return $range_list{main::objaddr $self}->$sub(@_);
+ no overloading;
+ return $range_list{pack 'J', $self}->$sub(@_);
}
}
\%anomalous_entries,
'readable_array');
- my %format;
- # The format of the entries of the table. This is calculated from the
- # data in the table (or passed in the constructor). This is an enum e.g.,
- # $STRING_FORMAT
- main::set_access('format', \%format);
-
my %core_access;
# This is a string, solely for documentation, indicating how one can get
# access to this property via the Perl core.
main::set_access('core_access', \%core_access, 'r', 's');
- my %has_specials;
- # Boolean set when non-zero map-type ranges are added to this table,
- # which happens in only a few tables. This is purely for performance, to
- # avoid having to search through every table upon output, so if all the
- # non-zero maps got deleted before output, this would remain set, and the
- # only penalty would be performance. Currently, most map tables that get
- # output have specials in them, so this doesn't help that much anyway.
- main::set_access('has_specials', \%has_specials);
-
my %to_output_map;
# Boolean as to whether or not to write out this map table
main::set_access('to_output_map', \%to_output_map, 's');
my $core_access = delete $args{'Core_Access'};
my $default_map = delete $args{'Default_Map'};
- my $format = delete $args{'Format'};
my $property = delete $args{'_Property'};
my $full_name = delete $args{'Full_Name'};
# Rest of parameters passed on
_Range_List => $range_list,
%args);
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
$anomalous_entries{$addr} = [];
$core_access{$addr} = $core_access;
$default_map{$addr} = $default_map;
- $format{$addr} = $format;
$self->initialize($initialize) if defined $initialize;
# Can't change the table if locked.
return if $self->carp_if_locked;
- my $addr = main::objaddr $self;
-
- $has_specials{$addr} = 1 if $type;
+ my $addr = do { no overloading; pack 'J', $self; };
$self->_range_list->add_map($lower, $upper,
$string,
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
return "" unless @{$anomalous_entries{$addr}};
return join("\n", @{$anomalous_entries{$addr}}) . "\n";
return;
}
- my $addr = main::objaddr $self;
- my $other_addr = main::objaddr $other;
+ my $addr = do { no overloading; pack 'J', $self; };
+ my $other_addr = do { no overloading; pack 'J', $other; };
local $to_trace = 0 if main::DEBUG;
Replace => $UNCONDITIONALLY);
}
- # Copy the specials information from the other table to $self
- if ($has_specials{$other_addr}) {
- $has_specials{$addr} = 1;
- }
-
return;
}
my $map = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# Convert the input to the standard equivalent, if any (won't have any
# for $STRING properties)
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# If overridden, use that
return $to_output_map{$addr} if defined $to_output_map{$addr};
# Just before output, create the comment that heads the file
# containing this table.
+ return unless $debugging_build;
+
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# No sense generating a comment if aren't going to write it out.
return if ! $self->to_output_map;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
my $property = $self->property;
range, in hex; STOP is the ending point, or if omitted, the range has just one
code point; MAPPING is what each code point between START and STOP maps to.
END
- if ($output_range_counts) {
+ if ($self->output_range_counts) {
$comment .= <<END;
Numbers in comments in [brackets] indicate how many code points are in the
range (omitted when the range is a single code point or if the mapping is to
my %swash_keys; # Makes sure don't duplicate swash names.
+ # The remaining variables are temporaries used while writing each table,
+ # to output special ranges.
+ my $has_hangul_syllables;
+ my @multi_code_point_maps; # Map is to more than one code point.
+
+ # The key is the base name of the code point, and the value is an
+ # array giving all the ranges that use this base name. Each range
+ # is actually a hash giving the 'low' and 'high' values of it.
+ my %names_ending_in_code_point;
+
+ # Inverse mapping. The list of ranges that have these kinds of
+ # names. Each element contains the low, high, and base names in a
+ # hash.
+ my @code_points_ending_in_code_point;
+
+ sub handle_special_range {
+ # Called in the middle of write when it finds a range it doesn't know
+ # how to handle.
+
+ my $self = shift;
+ my $range = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $addr = do { no overloading; pack 'J', $self; };
+
+ my $type = $range->type;
+
+ my $low = $range->start;
+ my $high = $range->end;
+ my $map = $range->value;
+
+ # No need to output the range if it maps to the default.
+ return if $map eq $default_map{$addr};
+
+ # Switch based on the map type...
+ if ($type == $HANGUL_SYLLABLE) {
+
+ # These are entirely algorithmically determinable based on
+ # some constants furnished by Unicode; for now, just set a
+ # flag to indicate that have them. After everything is figured
+ # out, we will output the code that does the algorithm.
+ $has_hangul_syllables = 1;
+ }
+ elsif ($type == $CP_IN_NAME) {
+
+ # Code points whose the name ends in their code point are also
+ # algorithmically determinable, but need information about the map
+ # to do so. Both the map and its inverse are stored in data
+ # structures output in the file.
+ push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
+ push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
+
+ push @code_points_ending_in_code_point, { low => $low,
+ high => $high,
+ name => $map
+ };
+ }
+ elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
+
+ # Multi-code point maps and null string maps have an entry
+ # for each code point in the range. They use the same
+ # output format.
+ for my $code_point ($low .. $high) {
+
+ # The pack() below can't cope with surrogates.
+ if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
+ Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
+ next;
+ }
+
+ # Generate the hash entries for these in the form that
+ # utf8.c understands.
+ my $tostr = "";
+ my $to_name = "";
+ my $to_chr = "";
+ foreach my $to (split " ", $map) {
+ if ($to !~ /^$code_point_re$/) {
+ Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
+ next;
+ }
+ $tostr .= sprintf "\\x{%s}", $to;
+ $to = CORE::hex $to;
+ if ($annotate) {
+ $to_name .= " + " if $to_name;
+ $to_chr .= chr($to);
+ main::populate_char_info($to)
+ if ! defined $viacode[$to];
+ $to_name .= $viacode[$to];
+ }
+ }
+
+ # I (khw) have never waded through this line to
+ # understand it well enough to comment it.
+ my $utf8 = sprintf(qq["%s" => "$tostr",],
+ join("", map { sprintf "\\x%02X", $_ }
+ unpack("U0C*", pack("U", $code_point))));
+
+ # Add a comment so that a human reader can more easily
+ # see what's going on.
+ push @multi_code_point_maps,
+ sprintf("%-45s # U+%04X", $utf8, $code_point);
+ if (! $annotate) {
+ $multi_code_point_maps[-1] .= " => $map";
+ }
+ else {
+ main::populate_char_info($code_point)
+ if ! defined $viacode[$code_point];
+ $multi_code_point_maps[-1] .= " '"
+ . chr($code_point)
+ . "' => '$to_chr'; $viacode[$code_point] => $to_name";
+ }
+ }
+ }
+ else {
+ Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
+ }
+
+ return;
+ }
+
sub pre_body {
# Returns the string that should be output in the file before the main
- # body of this table. This includes some hash entries identifying the
- # format of the body, and what the single value should be for all
- # ranges missing from it. It also includes any code points which have
- # map_types that don't go in the main table.
+ # body of this table. It isn't called until the main body is
+ # calculated, saving a pass. The string includes some hash entries
+ # identifying the format of the body, and what the single value should
+ # be for all ranges missing from it. It also includes any code points
+ # which have map_types that don't go in the main table.
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
my $name = $self->property->swash_name;
}
$swash_keys{$name} = "$self";
- my $default_map = $default_map{$addr};
-
my $pre_body = "";
- if ($has_specials{$addr}) {
-
- # Here, some maps with non-zero type have been added to the table.
- # Go through the table and handle each of them. None will appear
- # in the body of the table, so delete each one as we go. The
- # code point count has already been calculated, so ok to delete
- # now.
-
- my @multi_code_point_maps;
- my $has_hangul_syllables = 0;
-
- # The key is the base name of the code point, and the value is an
- # array giving all the ranges that use this base name. Each range
- # is actually a hash giving the 'low' and 'high' values of it.
- my %names_ending_in_code_point;
-
- # Inverse mapping. The list of ranges that have these kinds of
- # names. Each element contains the low, high, and base names in a
- # hash.
- my @code_points_ending_in_code_point;
-
- my $range_map = $self->_range_list;
- foreach my $range ($range_map->ranges) {
- next unless $range->type != 0;
- my $low = $range->start;
- my $high = $range->end;
- my $map = $range->value;
- my $type = $range->type;
-
- # No need to output the range if it maps to the default. And
- # the write method won't output it either, so no need to
- # delete it to keep it from being output, and is faster to
- # skip than to delete anyway.
- next if $map eq $default_map;
-
- # Delete the range to keep write() from trying to output it
- $range_map->delete_range($low, $high);
-
- # Switch based on the map type...
- if ($type == $HANGUL_SYLLABLE) {
-
- # These are entirely algorithmically determinable based on
- # some constants furnished by Unicode; for now, just set a
- # flag to indicate that have them. Below we will output
- # the code that does the algorithm.
- $has_hangul_syllables = 1;
- }
- elsif ($type == $CP_IN_NAME) {
-
- # If the name ends in the code point it represents, are
- # also algorithmically determinable, but need information
- # about the map to do so. Both the map and its inverse
- # are stored in data structures output in the file.
- push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
- push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
-
- push @code_points_ending_in_code_point, { low => $low,
- high => $high,
- name => $map
- };
- }
- elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
-
- # Multi-code point maps and null string maps have an entry
- # for each code point in the range. They use the same
- # output format.
- for my $code_point ($low .. $high) {
-
- # The pack() below can't cope with surrogates.
- if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
- Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
- next;
- }
-
- # Generate the hash entries for these in the form that
- # utf8.c understands.
- my $tostr = "";
- foreach my $to (split " ", $map) {
- if ($to !~ /^$code_point_re$/) {
- Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
- next;
- }
- $tostr .= sprintf "\\x{%s}", $to;
- }
-
- # I (khw) have never waded through this line to
- # understand it well enough to comment it.
- my $utf8 = sprintf(qq["%s" => "$tostr",],
- join("", map { sprintf "\\x%02X", $_ }
- unpack("U0C*", pack("U", $code_point))));
-
- # Add a comment so that a human reader can more easily
- # see what's going on.
- push @multi_code_point_maps,
- sprintf("%-45s # U+%04X => %s", $utf8,
- $code_point,
- $map);
- }
- }
- else {
- Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Using type 0 instead");
- $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
- }
- } # End of loop through all ranges
- # Here have gone through the whole file. If actually generated
- # anything for each map type, add its respective header and
- # trailer
- if (@multi_code_point_maps) {
- $pre_body .= <<END;
+ # Here we assume we were called after have gone through the whole
+ # file. If we actually generated anything for each map type, add its
+ # respective header and trailer
+ if (@multi_code_point_maps) {
+ $pre_body .= <<END;
# Some code points require special handling because their mappings are each to
# multiple code points. These do not appear in the main body, but are defined
# in the hash below.
-# The key: UTF-8 _bytes_, the value: UTF-8 (speed hack)
+# Each key is the string of N bytes that together make up the UTF-8 encoding
+# for the code point. (i.e. the same as looking at the code point's UTF-8
+# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
%utf8::ToSpec$name = (
END
- $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
- }
-
- if ($has_hangul_syllables || @code_points_ending_in_code_point) {
-
- # Convert these structures to output format.
- my $code_points_ending_in_code_point =
- main::simple_dumper(\@code_points_ending_in_code_point,
- ' ' x 8);
- my $names = main::simple_dumper(\%names_ending_in_code_point,
- ' ' x 8);
-
- # Do the same with the Hangul names,
- my $jamo;
- my $jamo_l;
- my $jamo_v;
- my $jamo_t;
- my $jamo_re;
- if ($has_hangul_syllables) {
-
- # Construct a regular expression of all the possible
- # combinations of the Hangul syllables.
- my @L_re; # Leading consonants
- for my $i ($LBase .. $LBase + $LCount - 1) {
- push @L_re, $Jamo{$i}
- }
- my @V_re; # Middle vowels
- for my $i ($VBase .. $VBase + $VCount - 1) {
- push @V_re, $Jamo{$i}
- }
- my @T_re; # Trailing consonants
- for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
- push @T_re, $Jamo{$i}
- }
-
- # The whole re is made up of the L V T combination.
- $jamo_re = '('
- . join ('|', sort @L_re)
- . ')('
- . join ('|', sort @V_re)
- . ')('
- . join ('|', sort @T_re)
- . ')?';
-
- # These hashes needed by the algorithm were generated
- # during reading of the Jamo.txt file
- $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
- $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
- $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
- $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
+ $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
+ }
+
+ if ($has_hangul_syllables || @code_points_ending_in_code_point) {
+
+ # Convert these structures to output format.
+ my $code_points_ending_in_code_point =
+ main::simple_dumper(\@code_points_ending_in_code_point,
+ ' ' x 8);
+ my $names = main::simple_dumper(\%names_ending_in_code_point,
+ ' ' x 8);
+
+ # Do the same with the Hangul names,
+ my $jamo;
+ my $jamo_l;
+ my $jamo_v;
+ my $jamo_t;
+ my $jamo_re;
+ if ($has_hangul_syllables) {
+
+ # Construct a regular expression of all the possible
+ # combinations of the Hangul syllables.
+ my @L_re; # Leading consonants
+ for my $i ($LBase .. $LBase + $LCount - 1) {
+ push @L_re, $Jamo{$i}
+ }
+ my @V_re; # Middle vowels
+ for my $i ($VBase .. $VBase + $VCount - 1) {
+ push @V_re, $Jamo{$i}
+ }
+ my @T_re; # Trailing consonants
+ for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
+ push @T_re, $Jamo{$i}
}
- $pre_body .= <<END;
+ # The whole re is made up of the L V T combination.
+ $jamo_re = '('
+ . join ('|', sort @L_re)
+ . ')('
+ . join ('|', sort @V_re)
+ . ')('
+ . join ('|', sort @T_re)
+ . ')?';
+
+ # These hashes needed by the algorithm were generated
+ # during reading of the Jamo.txt file
+ $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
+ $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
+ $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
+ $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
+ }
+
+ $pre_body .= <<END;
# To achieve significant memory savings when this file is read in,
# algorithmically derivable code points are omitted from the main body below.
$code_points_ending_in_code_point
);
END
- # Earlier releases didn't have Jamos. No sense outputting
- # them unless will be used.
- if ($has_hangul_syllables) {
- $pre_body .= <<END;
+ # Earlier releases didn't have Jamos. No sense outputting
+ # them unless will be used.
+ if ($has_hangul_syllables) {
+ $pre_body .= <<END;
# Convert from code point to Jamo short name for use in composing Hangul
# syllable names
# These constants names and values were taken from the Unicode standard,
# version 5.1, section 3.12. They are used in conjunction with Hangul
# syllables
- my \$SBase = 0xAC00;
- my \$LBase = 0x1100;
- my \$VBase = 0x1161;
- my \$TBase = 0x11A7;
- my \$SCount = 11172;
- my \$LCount = 19;
- my \$VCount = 21;
- my \$TCount = 28;
+ my \$SBase = $SBase_string;
+ my \$LBase = $LBase_string;
+ my \$VBase = $VBase_string;
+ my \$TBase = $TBase_string;
+ my \$SCount = $SCount;
+ my \$LCount = $LCount;
+ my \$VCount = $VCount;
+ my \$TCount = $TCount;
my \$NCount = \$VCount * \$TCount;
END
- } # End of has Jamos
+ } # End of has Jamos
- $pre_body .= << 'END';
+ $pre_body .= << 'END';
sub name_to_code_point_special {
my $name = shift;
# Returns undef if not one of the specially handled names; otherwise
# returns the code point equivalent to the input name
END
- if ($has_hangul_syllables) {
- $pre_body .= << 'END';
+ if ($has_hangul_syllables) {
+ $pre_body .= << 'END';
if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
$name = substr($name, $HANGUL_SYLLABLE_LENGTH);
return ($L * $VCount + $V) * $TCount + $T + $SBase;
}
END
- }
- $pre_body .= << 'END';
+ }
+ $pre_body .= << 'END';
# Name must end in '-code_point' for this to handle.
if ($name !~ /^ (.*) - ($code_point_re) $/x) {
# Returns the name of a code point if algorithmically determinable;
# undef if not
END
- if ($has_hangul_syllables) {
- $pre_body .= << 'END';
+ if ($has_hangul_syllables) {
+ $pre_body .= << 'END';
# If in the Hangul range, calculate the name based on Unicode's
# algorithm
my $L = $LBase + $SIndex / $NCount;
my $V = $VBase + ($SIndex % $NCount) / $TCount;
my $T = $TBase + $SIndex % $TCount;
- $name = "$HANGUL_SYLLABLE $Jamo{$L}$Jamo{$V}";
+ $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
$name .= $Jamo{$T} if $T != $TBase;
return $name;
}
END
- }
- $pre_body .= << 'END';
+ }
+ $pre_body .= << 'END';
# Look through list of these code points for one in range.
foreach my $hash (@code_points_ending_in_code_point) {
} # End closure
END
- } # End of has hangul or code point in name maps.
- } # End of has specials
+ } # End of has hangul or code point in name maps.
+
+ my $format = $self->format;
+
+ my $return = <<END;
+# The name this swash is to be known by, with the format of the mappings in
+# the main body of the table, and what all code points missing from this file
+# map to.
+\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
+END
+ my $default_map = $default_map{$addr};
+ $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
+
+ if ($default_map eq $CODE_POINT) {
+ $return .= ' # code point maps to itself';
+ }
+ elsif ($default_map eq "") {
+ $return .= ' # code point maps to the null string';
+ }
+ $return .= "\n";
+
+ $return .= $pre_body;
+
+ return $return;
+ }
+
+ sub write {
+ # Write the table to the file.
+
+ my $self = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ my $addr = do { no overloading; pack 'J', $self; };
+
+ # Clear the temporaries
+ $has_hangul_syllables = 0;
+ undef @multi_code_point_maps;
+ undef %names_ending_in_code_point;
+ undef @code_points_ending_in_code_point;
# Calculate the format of the table if not already done.
- my $format = $format{$addr};
- my $property = $self->property;
- my $type = $property->type;
+ my $format = $self->format;
+ my $type = $self->property->type;
+ my $default_map = $self->default_map;
if (! defined $format) {
if ($type == $BINARY) {
# most restrictive, and so on.
$format = $DECIMAL_FORMAT;
foreach my $range (@ranges) {
+ next if $range->type != 0; # Non-normal ranges don't
+ # affect the main body
my $map = $range->value;
if ($map ne $default_map) {
last if $format eq $STRING_FORMAT; # already at
&& $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
$format = $STRING_FORMAT if $format eq $HEX_FORMAT
&& $map =~ /[^0-9A-F]/;
- }
- }
- }
- }
- } # end of calculating format
-
- my $return = <<END;
-# The name this swash is to be known by, with the format of the mappings in
-# the main body of the table, and what all code points missing from this file
-# map to.
-\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
-END
- my $missing = $default_map;
- if ($missing eq $CODE_POINT
- && $format ne $HEX_FORMAT
- && ! defined $format{$addr}) # Is expected if was manually set
- {
- Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
- }
- $format{$addr} = $format;
- $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
- if ($missing eq $CODE_POINT) {
- $return .= ' # code point maps to itself';
- }
- elsif ($missing eq "") {
- $return .= ' # code point maps to the null string';
- }
- $return .= "\n";
-
- $return .= $pre_body;
-
- return $return;
- }
-
- sub write {
- # Write the table to the file.
+ }
+ }
+ }
+ }
+ } # end of calculating format
- my $self = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ if ($default_map eq $CODE_POINT
+ && $format ne $HEX_FORMAT
+ && ! defined $self->format) # manual settings are always
+ # considered ok
+ {
+ Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
+ }
- my $addr = main::objaddr $self;
+ $self->_set_format($format);
return $self->SUPER::write(
($self->property == $block)
? 7 # block file needs more tab stops
: 3,
- $default_map{$addr}); # don't write defaulteds
+ $default_map); # don't write defaulteds
}
# Accessors for the underlying list that should fail if locked.
- for my $sub qw(
+ for my $sub (qw(
add_duplicate
- )
+ ))
{
no strict "refs";
*$sub = sub {
# Optional
my $initialize = delete $args{'Initialize'};
my $matches_all = delete $args{'Matches_All'} || 0;
+ my $format = delete $args{'Format'};
# Rest of parameters passed on.
my $range_list = Range_List->new(Initialize => $initialize,
# 'table' (If you change the '=' must also change the ':' in lots of
# places in this program that assume an equal sign)
$complete = $property->full_name . "=$complete" if $property != $perl;
-
my $self = $class->SUPER::new(%args,
Name => $name,
Full_Name => $full_name,
_Property => $property,
_Range_List => $range_list,
+ Format => $EMPTY_FORMAT,
);
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
$conflicting{$addr} = [ ];
$equivalents{$addr} = [ ];
$leader{$addr} = $self;
$parent{$addr} = $self;
+ if (defined $format && $format ne $EMPTY_FORMAT) {
+ Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
+ }
+
return $self;
}
return if $self->carp_if_locked;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
if (ref $other) {
# be an optional parameter.
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# Check if the conflicting name is exactly the same as any existing
# alias in this table (as long as there is a real object there to
return;
}
- sub is_equivalent_to {
+ sub is_set_equivalent_to {
# Return boolean of whether or not the other object is a table of this
# type and has been marked equivalent to this one.
unless ($other->isa(__PACKAGE__)) {
my $ref_other = ref $other;
my $ref_self = ref $self;
- Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
+ Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
return 0;
}
# Two tables are equivalent if they have the same leader.
- return $leader{main::objaddr $self}
- == $leader{main::objaddr $other};
+ no overloading;
+ return $leader{pack 'J', $self} == $leader{pack 'J', $other};
return;
}
- sub matches_identically_to {
- # Return a boolean as to whether or not two tables match identical
- # sets of code points.
-
- my $self = shift;
- my $other = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
-
- unless ($other->isa(__PACKAGE__)) {
- my $ref_other = ref $other;
- my $ref_self = ref $self;
- Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
- return 0;
- }
-
- # These are ordered in increasing real time to figure out (at least
- # until a patch changes that and doesn't change this)
- return 0 if $self->max != $other->max;
- return 0 if $self->min != $other->min;
- return 0 if $self->range_count != $other->range_count;
- return 0 if $self->count != $other->count;
-
- # Here they could be identical because all the tests above passed.
- # The loop below is somewhat simpler since we know they have the same
- # number of elements. Compare range by range, until reach the end or
- # find something that differs.
- my @a_ranges = $self->_range_list->ranges;
- my @b_ranges = $other->_range_list->ranges;
- for my $i (0 .. @a_ranges - 1) {
- my $a = $a_ranges[$i];
- my $b = $b_ranges[$i];
- trace "self $a; other $b" if main::DEBUG && $to_trace;
- return 0 if $a->start != $b->start || $a->end != $b->end;
- }
- return 1;
- }
-
sub set_equivalent_to {
# Set $self equivalent to the parameter table.
# The required Related => 'x' parameter is a boolean indicating
# not quite so many.
# If they are related, one must be a perl extension. This is because
# we can't guarantee that Unicode won't change one or the other in a
- # later release even if they are idential now.
+ # later release even if they are identical now.
my $self = shift;
my $other = shift;
# If already are equivalent, no need to re-do it; if subroutine
# returns null, it found an error, also do nothing
- my $are_equivalent = $self->is_equivalent_to($other);
+ my $are_equivalent = $self->is_set_equivalent_to($other);
return if ! defined $are_equivalent || $are_equivalent;
- my $current_leader = ($related)
- ? $parent{main::objaddr $self}
- : $leader{main::objaddr $self};
+ my $addr = do { no overloading; pack 'J', $self; };
+ my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
- if ($related &&
- ! $other->perl_extension
- && ! $current_leader->perl_extension)
- {
- Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
- $related = 0;
+ if ($related) {
+ if ($current_leader->perl_extension) {
+ if ($other->perl_extension) {
+ Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
+ return;
+ }
+ } elsif (! $other->perl_extension) {
+ Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
+ $related = 0;
+ }
+ }
+
+ if (! $self->is_empty && ! $self->matches_identically_to($other)) {
+ Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
+ return;
}
- my $leader = main::objaddr $current_leader;
- my $other_addr = main::objaddr $other;
+ my $leader = do { no overloading; pack 'J', $current_leader; };
+ my $other_addr = do { no overloading; pack 'J', $other; };
# Any tables that are equivalent to or children of this table must now
# instead be equivalent to or (children) to the new leader (parent),
next if $table == $other;
trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
- my $table_addr = main::objaddr $table;
+ my $table_addr = do { no overloading; pack 'J', $table; };
$leader{$table_addr} = $other;
$matches_all{$table_addr} = $matches_all;
$self->_set_range_list($other->_range_list);
# ones that share the same file. It lists all such tables, ordered so
# that related ones are together.
+ return unless $debugging_build;
+
my $leader = shift; # Should only be called on the leader table of
# an equivalent group
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $leader;
+ my $addr = do { no overloading; pack 'J', $leader; };
if ($leader{$addr} != $leader) {
Carp::my_carp_bug(<<END
&& $parent == $property->table('N')
&& defined (my $yes = $property->table('Y')))
{
- my $yes_addr = main::objaddr $yes;
+ my $yes_addr = do { no overloading; pack 'J', $yes; };
@yes_perl_synonyms
= grep { $_->property == $perl }
main::uniques($yes,
my @conflicting; # Will hold the table conflicts.
# Look at the parent, any yes synonyms, and all the children
+ my $parent_addr = do { no overloading; pack 'J', $parent; };
for my $table ($parent,
@yes_perl_synonyms,
- @{$children{main::objaddr $parent}})
+ @{$children{$parent_addr}})
{
- my $table_addr = main::objaddr $table;
+ my $table_addr = do { no overloading; pack 'J', $table; };
my $table_property = $table->property;
# Tables are separated by a blank line to create a grouping.
my $flag = $property->status
|| $table->status
|| $table_alias_object->status;
- $flags{$flag} = $status_past_participles{$flag} if $flag;
+ if ($flag) {
+ if ($flag ne $PLACEHOLDER) {
+ $flags{$flag} = $status_past_participles{$flag};
+ } else {
+ $flags{$flag} = <<END;
+a placeholder because it is not in Version $string_version of Unicode, but is
+needed by the Perl core to work gracefully. Because it is not in this version
+of Unicode, it will not be listed in $pod_file.pod
+END
+ }
+ }
$loose_count++;
push @note, $table->note;
push @conflicting, $table->conflicting;
+ # And this for output after all the tables.
+ push @global_comments, $table->comment;
+
# Compute an alternate compound name using the final property
# synonym and the first table synonym with a colon instead of
# the equal sign used elsewhere.
if (%flags) {
foreach my $flag (sort keys %flags) {
$comment .= <<END;
-'$flag' below means that this form is $flags{$flag}. Consult $pod_file.pod
+'$flag' below means that this form is $flags{$flag}.
END
+ next if $flag eq $PLACEHOLDER;
+ $comment .= "Consult $pod_file.pod\n";
}
$comment .= "\n";
}
$match$synonyms:
$matches_comment
-$pod_file.pod should be consulted for the rules on using $any_of_these,
+$pod_file.pod should be consulted for the syntax rules for $any_of_these,
including if adding or subtracting white space, underscore, and hyphen
characters matters or doesn't matter, and other permissible syntactic
variants. Upper/lower case distinctions never matter.
# And append any comment(s) from the actual tables. They are all
# gathered here, so may not read all that well.
- $comment .= "\n" . join "\n\n", @global_comments if @global_comments;
+ if (@global_comments) {
+ $comment .= "\n" . join("\n\n", @global_comments) . "\n";
+ }
if ($count) { # The format differs if no code points, and needs no
# explanation in that case
START\\tSTOP\\twhere START is the starting code point of the range, in hex;
STOP is the ending point, or if omitted, the range has just one code point.
END
- if ($output_range_counts) {
+ if ($leader->output_range_counts) {
$comment .= <<END;
Numbers in comments in [brackets] indicate how many code points are in the
range.
}
# Accessors for the underlying list
- for my $sub qw(
+ for my $sub (qw(
get_valid_code_point
get_invalid_code_point
- )
+ ))
{
no strict "refs";
*$sub = sub {
# each of them is stored in %alias_to_property_of as they are defined.
# But it's possible that this subroutine will be called with some
# variant, so if the initial lookup fails, it is repeated with the
- # standarized form of the input name. If found, besides returning the
+ # standardized form of the input name. If found, besides returning the
# result, the input name is added to the list so future calls won't
# have to do the conversion again.
my %args = @_;
$self = bless \do { my $anonymous_scalar }, $class;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
$directory{$addr} = delete $args{'Directory'};
$file{$addr} = delete $args{'File'};
. " argument to '-='. Subtraction ignored.");
return $self;
}
- elsif ($reversed) { # Shouldnt happen in a -=, but just in case
+ elsif ($reversed) { # Shouldn't happen in a -=, but just in case
Carp::my_carp_bug("Can't cope with a "
. __PACKAGE__
. " being the first parameter in a '-='. Subtraction ignored.");
return $self;
}
else {
- $map{main::objaddr $self}->delete_range($other, $other);
+ no overloading;
+ $map{pack 'J', $self}->delete_range($other, $other);
}
return $self;
}
my $name = shift;
my %args = @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
my $table = $table_ref{$addr}{$name};
my $standard_name = main::standardize($name);
my $name = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
# Return a list of pointers to all the match tables attached to this
# property
- return main::uniques(values %{$table_ref{main::objaddr shift}});
+ no overloading;
+ return main::uniques(values %{$table_ref{pack 'J', shift}});
}
sub directory {
# priority; 'undef' is returned if the type isn't defined;
# or $map_directory for everything else.
- my $addr = main::objaddr shift;
+ my $addr = do { no overloading; pack 'J', shift; };
return $directory{$addr} if defined $directory{$addr};
return undef if $type{$addr} == $UNKNOWN;
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
return $file{$addr} if defined $file{$addr};
return $map{$addr}->external_name;
# The whole point of this pseudo property is match tables.
return 1 if $self == $perl;
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# Don't generate tables of code points that match the property values
# of a string property. Such a list would most likely have many
return;
}
- return $map{main::objaddr $self}->
- map_add_or_replace_non_nulls($map{main::objaddr $other});
+ no overloading;
+ return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
}
sub set_type {
return;
}
- $type{main::objaddr $self} = $type;
+ { no overloading; $type{pack 'J', $self} = $type; }
return if $type != $BINARY;
my $yes = $self->table('Y');
my $map = shift; # What the range maps to.
# Rest of parameters passed on.
- my $addr = main::objaddr $self;
+ my $addr = do { no overloading; pack 'J', $self; };
# If haven't the type of the property, gather information to figure it
# out.
my $self = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr($self);
+ my $addr = do { no overloading; pack 'J', $self; };
my $type = $type{$addr};
# Most of the accessors for a property actually apply to its map table.
# Setup up accessor functions for those, referring to %map
- for my $sub qw(
+ for my $sub (qw(
add_alias
add_anomalous_entry
add_comment
aliases
comment
complete_name
+ containing_range
core_access
count
default_map
status
status_info
to_output_map
+ type_of
value_of
write
- )
+ ))
# 'property' above is for symmetry, so that one can take
# the property of a property and get itself, and so don't
# have to distinguish between properties and tables in
*$sub = sub {
use strict "refs";
my $self = shift;
- return $map{main::objaddr $self}->$sub(@_);
+ no overloading;
+ return $map{pack 'J', $self}->$sub(@_);
}
}
# A blank separates the joined lines except if there is a break; an extra
# blank is inserted after a period ending a line.
- # Intialize the return with the first line.
+ # Initialize the return with the first line.
my ($return, @lines) = split "\n", shift;
# If the first line is null, it was an empty line, add the \n back in
return;
}
-sub write ($\@) {
- # Given a filename and a reference to an array of lines, write the lines
- # to the file
+sub write ($$@) {
+ # Given a filename and references to arrays of lines, write the lines of
+ # each array to the file
# Filename can be given as an arrayref of directory names
- my $file = shift;
- my $lines_ref = shift;
- Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
- if (! defined $lines_ref) {
- Carp::my_carp("Missing lines to write parameter for $file. Writing skipped;");
- return;
- }
+ my $file = shift;
+ my $use_utf8 = shift;
# Get into a single string if an array, and get rid of, in Unix terms, any
# leading '.'
push @files_actually_output, $file;
- my $text;
- if (@$lines_ref) {
- $text = join "", @$lines_ref;
- }
- else {
- $text = "";
- Carp::my_carp("Output file '$file' is empty; writing it anyway;");
- }
-
force_unlink ($file);
my $OUT;
Carp::my_carp("can't open $file for output. Skipping this file: $!");
return;
}
+
+ binmode $OUT, ":utf8" if $use_utf8;
+
+ while (defined (my $lines_ref = shift)) {
+ unless (@$lines_ref) {
+ Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
+ }
+
+ print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
+ }
+ close $OUT or die Carp::my_carp("close '$file' failed: $!");
+
print "$file written.\n" if $verbosity >= $VERBOSE;
- print $OUT $text;
- close $OUT;
return;
}
$name =~ s/^\s+//g;
$name =~ s/\s+$//g;
- # Convert interior white space and hypens into underscores.
+ # Convert interior white space and hyphens into underscores.
$name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
# Capitalize the letter following an underscore, and convert a sequence of
else {
# Keep track of cycles in the input, and refuse to infinitely loop
- if (defined $already_output{main::objaddr $item}) {
+ my $addr = do { no overloading; pack 'J', $item; };
+ if (defined $already_output{$addr}) {
return "${indent}ALREADY OUTPUT: $item\n";
}
- $already_output{main::objaddr $item} = $item;
+ $already_output{$addr} = $item;
if (ref $item eq 'ARRAY') {
my $using_brackets;
my $fields_ref = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
- my $addr = main::objaddr $object;
+ my $addr = do { no overloading; pack 'J', $object; };
my %hash;
foreach my $key (keys %$fields_ref) {
}
else {
my $ref = ref $$which;
- my $addr = main::objaddr $$which;
+ my $addr = do { no overloading; pack 'J', $$which; };
$$which = "$ref ($addr)";
}
}
return 0 unless defined $other;
return 0 unless ref $other;
- return main::objaddr $self == main::objaddr $other;
+ no overloading;
+ return $self == $other;
}
sub _operator_not_equal {
}
}
- # This entry is still missing as of 5.2, perhaps because no short name for
+ # This entry is still missing as of 6.0, perhaps because no short name for
# it.
if (-e 'NameAliases.txt') {
my $aliases = property_ref('Name_Alias');
my $fold = property_ref('Case_Folding');
$fold->set_file('Fold') if defined $fold;
- # utf8.c can't currently cope with non range-size-1 for these, and even if
- # it were changed to do so, someone else may be using them, expecting the
- # old style
+ # utf8.c has a different meaning for non range-size-1 for map properties
+ # that this program doesn't currently handle; and even if it were changed
+ # to do so, some other code may be using them expecting range size 1.
foreach my $property (qw {
Case_Folding
Lowercase_Mapping
;
# The defaults apply only to unassigned characters
- $default_R .= '$gc->table("Cn") & $default;';
+ $default_R .= '$gc->table("Unassigned") & $default;';
if ($v_version lt v3.0.0) {
$default = Multi_Default->new(R => $default_R, 'L');
if ($v_version ge 3.1.0) {
$default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
}
- $default_AL .= '$gc->table("Cn") & $default';
+ $default_AL .= '$gc->table("Unassigned") & $default';
$default = Multi_Default->new(AL => $default_AL,
R => $default_R,
'L');
return @return;
}
+sub output_perl_charnames_line ($$) {
+
+ # Output the entries in Perl_charnames specially, using 5 digits instead
+ # of four. This makes the entries a constant length, and simplifies
+ # charnames.pm which this table is for. Unicode can have 6 digit
+ # ordinals, but they are all private use or noncharacters which do not
+ # have names, so won't be in this table.
+
+ return sprintf "%05X\t%s\n", $_[0], $_[1];
+}
+
{ # Closure
# This is used to store the range list of all the code points usable when
# the little used $compare_versions feature is enabled.
# 0374 ; NFD_QC; N
# 003C..003E ; Math
#
- # the fields are: "codepoint range ; property; map"
+ # the fields are: "codepoint-range ; property; map"
#
# meaning the codepoints in the range all have the value 'map' under
# 'property'.
- # Beginning and trailing white space in each field are not signficant.
+ # Beginning and trailing white space in each field are not significant.
# Note there is not a trailing semi-colon in the above. A trailing
# semi-colon means the map is a null-string. An omitted map, as
# opposed to a null-string, is assumed to be 'Y', based on Unicode
# file, in any order, interspersed in any way. The first time a
# property is seen, it gets information about that property and
# caches it for quick retrieval later. It also normalizes the maps
- # so that only one of many synonym is stored. The Unicode input files
- # do use some multiple synonyms.
+ # so that only one of many synonyms is stored. The Unicode input
+ # files do use some multiple synonyms.
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
$file->carp_bad_line("Unexpected property '$property_name'. Skipped");
next LINE;
}
- $property_addr = main::objaddr($property_object);
+ { no overloading; $property_addr = pack 'J', $property_object; }
# Defer changing names until have a line that is acceptable
# (the 'next' statement above means is unacceptable)
if $file->has_missings_defaults;
foreach my $default_ref (@missings_list) {
my $default = $default_ref->[0];
- my $addr = objaddr property_ref($default_ref->[1]);
+ my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
# For string properties, the default is just what the
# file says, but non-string properties should already
else {
$default_map = $missings;
}
+
# And store it with the property for outside use.
$property_object->set_default_map($default_map);
}
# If the map begins with a special command to us (enclosed in
# delimiters), extract the command(s).
- if (substr($map, 0, 1) eq $CMD_DELIM) {
- while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
- my $command = $1;
- if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
- $replace = $1;
- }
- elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
- $map_type = $1;
- }
- else {
- $file->carp_bad_line("Unknown command line: '$1'");
- next LINE;
- }
+ while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
+ my $command = $1;
+ if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
+ $replace = $1;
+ }
+ elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
+ $map_type = $1;
+ }
+ else {
+ $file->carp_bad_line("Unknown command line: '$1'");
+ next LINE;
}
}
}
}
}
-# XXX Unused until revise charnames;
-#sub check_and_handle_compound_name {
-# This looks at Name properties for parenthesized components and splits
-# them off. Thus it finds FF as an equivalent to Form Feed.
-# my $code_point = shift;
-# my $name = shift;
-# if ($name =~ /^ ( .*? ) ( \s* ) \( ( [^)]* ) \) (.*) $/x) {
-# #local $to_trace = 1 if main::DEBUG;
-# trace $1, $2, $3, $4 if main::DEBUG && $to_trace;
-# push @more_Names, "$code_point; $1";
-# push @more_Names, "$code_point; $3";
-# Carp::my_carp_bug("Expecting blank space before left parenthesis in '$_'. Proceeding and assuming it was there;") if $2 ne " ";
-# Carp::my_carp_bug("Not expecting anything after the right parenthesis in '$_'. Proceeding and ignoring that;") if $4 ne "";
-# }
-# return;
-#}
-
{ # Closure for UnicodeData.txt handling
# This file was the first one in the UCD; its design leads to some
# 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# The fields in order are:
my $i = 0; # The code point is in field 0, and is shifted off.
- my $NAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
+ my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
my $CATEGORY = $i++; # category (e.g. "Lu")
my $CCC = $i++; # Canonical combining class (e.g. "230")
my $BIDI = $i++; # directional class (e.g. "L")
# This routine in addition outputs these extra fields:
my $DECOMP_TYPE = $i++; # Decomposition type
- my $DECOMP_MAP = $i++; # Must be last; another decomposition mapping
+
+ # These fields are modifications of ones above, and are usually
+ # suppressed; they must come last, as for speed, the loop upper bound is
+ # normally set to ignore them
+ my $NAME = $i++; # This is the strict name field, not the one that
+ # charnames uses.
+ my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
+ # by Unicode::Normalize
my $last_field = $i - 1;
# All these are read into an array for each line, with the indices defined
$field_names[$BIDI] = 'Bidi_Class';
$field_names[$CATEGORY] = 'General_Category';
$field_names[$CCC] = 'Canonical_Combining_Class';
+ $field_names[$CHARNAME] = 'Perl_Charnames';
$field_names[$COMMENT] = 'ISO_Comment';
$field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
$field_names[$DECOMP_TYPE] = 'Decomposition_Type';
- $field_names[$LOWER] = 'Simple_Lowercase_Mapping';
+ $field_names[$LOWER] = 'Lowercase_Mapping';
$field_names[$MIRRORED] = 'Bidi_Mirrored';
$field_names[$NAME] = 'Name';
$field_names[$NUMERIC] = 'Numeric_Value';
$field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
$field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
$field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
- $field_names[$TITLE] = 'Simple_Titlecase_Mapping';
+ $field_names[$TITLE] = 'Titlecase_Mapping';
$field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
- $field_names[$UPPER] = 'Simple_Uppercase_Mapping';
-
- # Some of these need a little more explanation. The $PERL_DECIMAL_DIGIT
- # field does not lead to an official Unicode property, but is used in
- # calculating the Numeric_Type. Perl however, creates a file from this
- # field, so a Perl property is created from it. Similarly, the Other
- # Digit field is used only for calculating the Numeric_Type, and so it can
- # be safely re-used as the place to store the value for Numeric_Type;
- # hence it is referred to as $NUMERIC_TYPE_OTHER_DIGIT. The input field
- # named $PERL_DECOMPOSITION is a combination of both the decomposition
- # mapping and its type. Perl creates a file containing exactly this
- # field, so it is used for that. The two properties are separated into
- # two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
+ $field_names[$UPPER] = 'Uppercase_Mapping';
+
+ # Some of these need a little more explanation:
+ # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
+ # property, but is used in calculating the Numeric_Type. Perl however,
+ # creates a file from this field, so a Perl property is created from it.
+ # Similarly, the Other_Digit field is used only for calculating the
+ # Numeric_Type, and so it can be safely re-used as the place to store
+ # the value for Numeric_Type; hence it is referred to as
+ # $NUMERIC_TYPE_OTHER_DIGIT.
+ # The input field named $PERL_DECOMPOSITION is a combination of both the
+ # decomposition mapping and its type. Perl creates a file containing
+ # exactly this field, so it is used for that. The two properties are
+ # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
+ # $DECOMP_MAP is usually suppressed (unless the lists are changed to
+ # output it), as Perl doesn't use it directly.
+ # The input field named here $CHARNAME is used to construct the
+ # Perl_Charnames property, which is a combination of the Name property
+ # (which the input field contains), and the Unicode_1_Name property, and
+ # others from other files. Since, the strict Name property is not used
+ # by Perl, this field is used for the table that Perl does use. The
+ # strict Name property table is usually suppressed (unless the lists are
+ # changed to output it), so it is accumulated in a separate field,
+ # $NAME, which to save time is discarded unless the table is actually to
+ # be output
# This file is processed like most in this program. Control is passed to
# process_generic_property_file() which calls filter_UnicodeData_line()
# the code point and name on each line. This was actually the hardest
# thing to design around. The code points in those ranges may actually
# have real maps not given by these two lines. These maps will either
- # be algorthimically determinable, or in the extracted files furnished
+ # be algorithmically determinable, or in the extracted files furnished
# with the UCD. In the event of conflicts between these extracted files,
# and this one, Unicode says that this one prevails. But it shouldn't
# prevail for conflicts that occur in these ranges. The data from the
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+ # Create a new property specially located that is a combination of the
+ # various Name properties: Name, Unicode_1_Name, Named Sequences, and
+ # Name_Alias properties. (The final duplicates elements of the
+ # first.) A comment for it will later be constructed based on the
+ # actual properties present and used
+ $perl_charname = Property->new('Perl_Charnames',
+ Core_Access => '\N{...} and "use charnames"',
+ Default_Map => "",
+ Directory => File::Spec->curdir(),
+ File => 'Name',
+ Internal_Only_Warning => 1,
+ Perl_Extension => 1,
+ Range_Size_1 => \&output_perl_charnames_line,
+ Type => $STRING,
+ );
+
my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
- Directory => '.',
+ Directory => File::Spec->curdir(),
File => 'Decomposition',
- Format => $STRING_FORMAT,
+ Format => $DECOMP_STRING_FORMAT,
Internal_Only_Warning => 1,
Perl_Extension => 1,
Default_Map => $CODE_POINT,
+ # normalize.pm can't cope with these
+ Output_Range_Counts => 0,
+
# This is a specially formatted table
# explicitly for normalize.pm, which
# is expecting a particular format,
END
));
- # This property is not used for generating anything else, and is
- # usually not output. By making it last in the list, we can just
+ # These properties are not used for generating anything else, and are
+ # usually not output. By making them last in the list, we can just
# change the high end of the loop downwards to avoid the work of
- # generating a table that is just going to get thrown away.
- if (! property_ref('Decomposition_Mapping')->to_output_map) {
- $last_field--;
+ # generating a table(s) that is/are just going to get thrown away.
+ if (! property_ref('Decomposition_Mapping')->to_output_map
+ && ! property_ref('Name')->to_output_map)
+ {
+ $last_field = min($NAME, $DECOMP_MAP) - 1;
+ } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
+ $last_field = $DECOMP_MAP;
+ } elsif (property_ref('Name')->to_output_map) {
+ $last_field = $NAME;
}
return;
}
# Certain fields just haven't been empty so far in any Unicode
# version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
# $CATEGORY. This leaves just the two fields, and so we hard-code in
- # the defaults; which are verly unlikely to ever change.
+ # the defaults; which are very unlikely to ever change.
$fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
$fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
# D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
# that define ranges. These should be processed after the fields are
# adjusted above, as they may override some of them; but mostly what
- # is left is to possibly adjust the $NAME field. The names of all the
+ # is left is to possibly adjust the $CHARNAME field. The names of all the
# paired lines start with a '<', but this is also true of '<control>,
# which isn't one of these special ones.
- if ($fields[$NAME] eq '<control>') {
+ if ($fields[$CHARNAME] eq '<control>') {
# Some code points in this file have the pseudo-name
# '<control>', but the official name for such ones is the null
- # string.
+ # string. For charnames.pm, we use the Unicode version 1 name
$fields[$NAME] = "";
+ $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
# We had better not be in between range lines.
if ($in_range) {
- $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
+ $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
$in_range = 0;
}
}
- elsif (substr($fields[$NAME], 0, 1) ne '<') {
+ elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
# Here is a non-range line. We had better not be in between range
# lines.
if ($in_range) {
- $file->carp_bad_line("Expecting a closing range line, not a $fields[$NAME]'. Trying anyway");
+ $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
$in_range = 0;
}
- # XXX until charnames catches up.
-# if ($fields[$NAME] =~ s/- $cp $//x) {
-#
-# # These are code points whose names end in their code points,
-# # which means the names are algorithmically derivable from the
-# # code points. To shorten the output Name file, the algorithm
-# # for deriving these is placed in the file instead of each
-# # code point, so they have map type $CP_IN_NAME
-# $fields[$NAME] = $CMD_DELIM
-# . $MAP_TYPE_CMD
-# . '='
-# . $CP_IN_NAME
-# . $CMD_DELIM
-# . $fields[$NAME];
-# }
-
- # Some official names are really two alternate names with one in
- # parentheses. What we do here is use the full official one for
- # the standard property (stored just above), but for the charnames
- # table, we add two more entries, one for each of the alternate
- # ones.
- # elsif name ne ""
- #check_and_handle_compound_name($cp, $fields[$NAME]);
- #check_and_handle_compound_name($cp, $unicode_1_name);
- # XXX until charnames catches up.
- }
- elsif ($fields[$NAME] =~ /^<(.+), First>$/) {
- $fields[$NAME] = $1;
+ if ($fields[$CHARNAME] =~ s/- $cp $//x) {
+
+ # These are code points whose names end in their code points,
+ # which means the names are algorithmically derivable from the
+ # code points. To shorten the output Name file, the algorithm
+ # for deriving these is placed in the file instead of each
+ # code point, so they have map type $CP_IN_NAME
+ $fields[$CHARNAME] = $CMD_DELIM
+ . $MAP_TYPE_CMD
+ . '='
+ . $CP_IN_NAME
+ . $CMD_DELIM
+ . $fields[$CHARNAME];
+ }
+ $fields[$NAME] = $fields[$CHARNAME];
+ }
+ elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
+ $fields[$CHARNAME] = $fields[$NAME] = $1;
# Here we are at the beginning of a range pair.
if ($in_range) {
- $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$NAME]'. Trying anyway");
+ $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
}
$in_range = 1;
$force_output = 1;
}
- elsif ($fields[$NAME] !~ s/^<(.+), Last>$/$1/) {
- $file->carp_bad_line("Unexpected name starting with '<' $fields[$NAME]. Ignoring this line.");
+ elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
+ $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
$_ = "";
return;
}
else { # Here, we are at the last line of a range pair.
if (! $in_range) {
- $file->carp_bad_line("Unexpected end of range $fields[$NAME] when not in one. Ignoring this line.");
+ $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
$_ = "";
return;
}
$in_range = 0;
+ $fields[$NAME] = $fields[$CHARNAME];
+
# Check that the input is valid: that the closing of the range is
# the same as the beginning.
foreach my $i (0 .. $last_field) {
}
# The processing differs depending on the type of range,
- # determined by its $NAME
- if ($fields[$NAME] =~ /^Hangul Syllable/) {
+ # determined by its $CHARNAME
+ if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
# Check that the data looks right.
if ($decimal_previous_cp != $SBase) {
# This range is stored in our internal structure with its
# own map type, different from all others.
- $previous_fields[$NAME] = $CMD_DELIM
+ $previous_fields[$CHARNAME] = $previous_fields[$NAME]
+ = $CMD_DELIM
. $MAP_TYPE_CMD
. '='
. $HANGUL_SYLLABLE
. $CMD_DELIM
- . $fields[$NAME];
+ . $fields[$CHARNAME];
}
- elsif ($fields[$NAME] =~ /^CJK/) {
+ elsif ($fields[$CHARNAME] =~ /^CJK/) {
# The name for these contains the code point itself, and all
# are defined to have the same base name, regardless of what
# is in the file. They are stored in our internal structure
# with a map type of $CP_IN_NAME
- $previous_fields[$NAME] = $CMD_DELIM
+ $previous_fields[$CHARNAME] = $previous_fields[$NAME]
+ = $CMD_DELIM
. $MAP_TYPE_CMD
. '='
. $CP_IN_NAME
# null, as there are no names for the private use and
# surrogate code points.
- $previous_fields[$NAME] = "";
+ $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
}
else {
- $file->carp_bad_line("Unexpected code point range $fields[$NAME] because category is $fields[$CATEGORY]. Attempting to process it.");
+ $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
}
# The first line of the range caused everything else to be output,
# code in this subroutine that does the same thing, but doesn't
# know about these ranges.
$_ = "";
+
return;
}
}
return;
}
+
+ sub filter_v6_ucd {
+
+ # Unicode 6.0 co-opted the name BELL for U+1F514, so change the input
+ # to pretend that U+0007 is ALERT instead, and for Perl 5.14, don't
+ # allow the BELL name for U+1F514, so that the old usage can be
+ # deprecated for one cycle.
+
+ return if $_ !~ /^(?:0007|1F514|070F);/;
+
+ my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
+ if ($code_point eq '0007') {
+ $fields[$CHARNAME] = "ALERT";
+ }
+ elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
+ # http://www.unicode.org/versions/corrigendum8.html
+ $fields[$BIDI] = "AL";
+ }
+ elsif ($^V lt v5.15.0) { # For 5.16 will convert to use Unicode's name
+ $fields[$CHARNAME] = "";
+ }
+
+ $_ = join ';', $code_point, @fields;
+
+ return;
+ }
} # End closure for UnicodeData
+sub process_GCB_test {
+
+ my $file = shift;
+ Carp::carp_extra_args(\@_) if main::DEBUG && @_;
+
+ while ($file->next_line) {
+ push @backslash_X_tests, $_;
+ }
+
+ return;
+}
+
sub process_NamedSequences {
# NamedSequences.txt entries are just added to an array. Because these
# don't look like the other tables, they have their own handler.
#
# This just adds the sequence to an array for later handling
- return; # XXX Until charnames catches up
my $file = shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
"Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
next;
}
- push @named_sequences, "$sequence\t\t$name";
+
+ # Note single \t in keeping with special output format of
+ # Perl_charnames. But it turns out that the code points don't have to
+ # be 5 digits long, like the rest, based on the internal workings of
+ # charnames.pm. This could be easily changed for consistency.
+ push @named_sequences, "$sequence\t$name";
}
return;
}
sub setup_special_casing {
# SpecialCasing.txt contains the non-simple case change mappings. The
- # simple ones are in UnicodeData.txt, and should already have been read
- # in.
- # This routine initializes the full mappings to the simple, then as each
- # line is processed, it overrides the simple ones.
+ # simple ones are in UnicodeData.txt, which should already have been read
+ # in to the full property data structures, so as to initialize these with
+ # the simple ones. Then the SpecialCasing.txt entries overwrite the ones
+ # which have different full mappings.
+
+ # This routine sees if the simple mappings are to be output, and if so,
+ # copies what has already been put into the full mapping tables, while
+ # they still contain only the simple mappings.
+
+ # The reason it is done this way is that the simple mappings are probably
+ # not going to be output, so it saves work to initialize the full tables
+ # with the simple mappings, and then overwrite those relatively few
+ # entries in them that have different full mappings, and thus skip the
+ # simple mapping tables altogether.
my $file= shift;
Carp::carp_extra_args(\@_) if main::DEBUG && @_;
# For each of the case change mappings...
foreach my $case ('lc', 'tc', 'uc') {
+ my $full = property_ref($case);
+ unless (defined $full && ! $full->is_empty) {
+ Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
+ }
# The simple version's name in each mapping merely has an 's' in front
# of the full one's
my $simple = property_ref('s' . $case);
- unless (defined $simple && ! $simple->is_empty) {
- Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
- }
-
- # Initialize the full case mappings with the simple ones.
- property_ref($case)->initialize($simple);
+ $simple->initialize($full) if $simple->to_output_map();
}
return;
# implemented, it would be by hard-coding in the casing functions in the
# Perl core, not through tables. But if there is a new condition we don't
# know about, output a warning. We know about all the conditions through
- # 5.2
+ # 6.0
if ($fields[4] ne "") {
my @conditions = split ' ', $fields[4];
if ($conditions[0] ne 'tr' # We know that these languages have
# it takes no part in anything we do.
my $to_output_simple;
+ # XXX
# These are experimental, perhaps will need these to pass to regcomp.c to
# handle the cases where for example the Kelvin sign character folds to k,
# and in regcomp, we need to know which of the characters can have a
$file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
}
- # Experimental, see comment above
+ # XXX Experimental, see comment above
if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
my @folded = split ' ', $map;
if (hex $folded[0] < 256 && @folded == 1) {
}
sub post_fold {
- # Experimental, see comment above
+ # XXX Experimental, see comment above
return;
#local $to_trace = 1 if main::DEBUG;
# one.
# Titlecase duplicates UnicodeData.txt: gc=lt
# Unassigned Code Value duplicates UnicodeData.txt: gc=cc
- # Zero-width never made into offical property;
+ # Zero-width never made into official property;
# subset of gc=cf
# Most of the properties have the same names in this file as in later
# versions, but a couple do not.
# Add mappings to the property for each code point in the list
foreach my $range ($list->ranges) {
- $property->add_map($range->start, $range->end, $default);
+ $property->add_map($range->start, $range->end, $default,
+ Replace => $CROAK);
}
}
}
# Add any remaining code points to the mapping, using the default for
- # missing code points
+ # missing code points.
if (defined (my $default_map = $property->default_map)) {
- foreach my $range ($property->inverse_list->ranges) {
- $property->add_map($range->start, $range->end, $default_map);
- }
+
+ # This fills in any missing values with the default.
+ $property->add_map(0, $LAST_UNICODE_CODEPOINT,
+ $default_map, Replace => $NO);
# Make sure there is a match table for the default
if (! defined $property->table($default_map)) {
# range, with their names prefaced by 'Posix', to signify that these match
# what the Posix standard says they should match. A couple are
# effectively this, but the name doesn't have 'Posix' in it because there
- # just isn't any Posix equivalent.
+ # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
+ # to the full Unicode range, by our guesses as to what is appropriate.
# 'Any' is all code points. As an error check, instead of just setting it
# to be that, construct it to be the union of all the major categories
$ASCII->initialize([ 0..127 ]);
}
- # A number of the Perl synonyms have a restricted-range synonym whose name
- # begins with Posix. This hash gets filled in with them, so that they can
- # be populated in a small loop.
- my %posix_equivalent;
-
# Get the best available case definitions. Early Unicode versions didn't
# have Uppercase and Lowercase defined, so use the general category
# instead for them.
$Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
Related => 1);
}
- $posix_equivalent{'Lower'} = $Lower;
+ $Lower->add_alias('XPosixLower');
+ $perl->add_match_table("PosixLower",
+ Description => "[a-z]",
+ Initialize => $Lower & $ASCII,
+ );
my $Upper = $perl->add_match_table('Upper');
my $Unicode_Upper = property_ref('Uppercase');
$Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
Related => 1);
}
- $posix_equivalent{'Upper'} = $Upper;
+ $Upper->add_alias('XPosixUpper');
+ $perl->add_match_table("PosixUpper",
+ Description => "[A-Z]",
+ Initialize => $Upper & $ASCII,
+ );
# Earliest releases didn't have title case. Initialize it to empty if not
# otherwise present
# one whose name generally begins with Posix that is posix-compliant, and
# one that matches Unicode characters beyond the Posix, ASCII range
- my $Alpha = $perl->add_match_table('Alpha',
- Description => '[[:Alpha:]] extended beyond ASCII');
+ my $Alpha = $perl->add_match_table('Alpha');
# Alphabetic was not present in early releases
my $Alphabetic = property_ref('Alphabetic');
+ $gc->table('Mn')
+ $gc->table('Mc'));
$Alpha += $gc->table('Nl') if defined $gc->table('Nl');
+ $Alpha->add_description('Alphabetic');
}
- $posix_equivalent{'Alpha'} = $Alpha;
+ $Alpha->add_alias('XPosixAlpha');
+ $perl->add_match_table("PosixAlpha",
+ Description => "[A-Za-z]",
+ Initialize => $Alpha & $ASCII,
+ );
my $Alnum = $perl->add_match_table('Alnum',
- Description => "[[:Alnum:]] extended beyond ASCII",
+ Description => 'Alphabetic and (Decimal) Numeric',
Initialize => $Alpha + $gc->table('Decimal_Number'),
);
- $posix_equivalent{'Alnum'} = $Alnum;
+ $Alnum->add_alias('XPosixAlnum');
+ $perl->add_match_table("PosixAlnum",
+ Description => "[A-Za-z0-9]",
+ Initialize => $Alnum & $ASCII,
+ );
my $Word = $perl->add_match_table('Word',
- Description => '\w, including beyond ASCII',
+ Description => '\w, including beyond ASCII;'
+ . ' = \p{Alnum} + \pM + \p{Pc}',
Initialize => $Alnum + $gc->table('Mark'),
);
+ $Word->add_alias('XPosixWord');
my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
$Word += $Pc if defined $Pc;
- # There is no [[:Word:]], so the name doesn't begin with Posix.
- $perl->add_match_table('PerlWord',
+ # This is a Perl extension, so the name doesn't begin with Posix.
+ my $PerlWord = $perl->add_match_table('PerlWord',
Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
Initialize => $Word & $ASCII,
);
+ $PerlWord->add_alias('PosixWord');
my $Blank = $perl->add_match_table('Blank',
Description => '\h, Horizontal white space',
- 0x200B, # ZWSP
);
$Blank->add_alias('HorizSpace'); # Another name for it.
- $posix_equivalent{'Blank'} = $Blank;
+ $Blank->add_alias('XPosixBlank');
+ $perl->add_match_table("PosixBlank",
+ Description => "\\t and ' '",
+ Initialize => $Blank & $ASCII,
+ );
my $VertSpace = $perl->add_match_table('VertSpace',
Description => '\v',
# No Posix equivalent for vertical space
my $Space = $perl->add_match_table('Space',
- Description => '\s including beyond ASCII plus vertical tab = [[:Space:]]',
- Initialize => $Blank + $VertSpace,
+ Description => '\s including beyond ASCII plus vertical tab',
+ Initialize => $Blank + $VertSpace,
);
- $posix_equivalent{'Space'} = $Space;
+ $Space->add_alias('XPosixSpace');
+ $perl->add_match_table("PosixSpace",
+ Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
+ Initialize => $Space & $ASCII,
+ );
# Perl's traditional space doesn't include Vertical Tab
- my $SpacePerl = $perl->add_match_table('SpacePerl',
+ my $XPerlSpace = $perl->add_match_table('XPerlSpace',
Description => '\s, including beyond ASCII',
Initialize => $Space - 0x000B,
);
- $perl->add_match_table('PerlSpace',
+ $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
+ my $PerlSpace = $perl->add_match_table('PerlSpace',
Description => '\s, restricted to ASCII',
- Initialize => $SpacePerl & $ASCII,
+ Initialize => $XPerlSpace & $ASCII,
);
+
my $Cntrl = $perl->add_match_table('Cntrl',
- Description => "[[:Cntrl:]] extended beyond ASCII");
+ Description => 'Control characters');
$Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
- $posix_equivalent{'Cntrl'} = $Cntrl;
+ $Cntrl->add_alias('XPosixCntrl');
+ $perl->add_match_table("PosixCntrl",
+ Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
+ Initialize => $Cntrl & $ASCII,
+ );
# $controls is a temporary used to construct Graph.
my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
# Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
my $Graph = $perl->add_match_table('Graph',
- Description => "[[:Graph:]] extended beyond ASCII",
+ Description => 'Characters that are graphical',
Initialize => ~ ($Space + $controls),
);
- $posix_equivalent{'Graph'} = $Graph;
+ $Graph->add_alias('XPosixGraph');
+ $perl->add_match_table("PosixGraph",
+ Description =>
+ '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
+ Initialize => $Graph & $ASCII,
+ );
- my $Print = $perl->add_match_table('Print',
- Description => "[[:Print:]] extended beyond ASCII",
- Initialize => $Space + $Graph - $gc->table('Control'),
+ $print = $perl->add_match_table('Print',
+ Description => 'Characters that are graphical plus space characters (but no controls)',
+ Initialize => $Blank + $Graph - $gc->table('Control'),
);
- $posix_equivalent{'Print'} = $Print;
+ $print->add_alias('XPosixPrint');
+ $perl->add_match_table("PosixPrint",
+ Description =>
+ '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
+ Initialize => $print & $ASCII,
+ );
my $Punct = $perl->add_match_table('Punct');
$Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
# \p{punct} doesn't include the symbols, which posix does
+ my $XPosixPunct = $perl->add_match_table('XPosixPunct',
+ Description => '\p{Punct} + ASCII-range \p{Symbol}',
+ Initialize => $gc->table('Punctuation')
+ + ($ASCII & $gc->table('Symbol')),
+ );
$perl->add_match_table('PosixPunct',
- Description => "[[:Punct:]]",
- Initialize => $ASCII & ($gc->table('Punctuation')
- + $gc->table('Symbol')),
- );
+ Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
+ Initialize => $ASCII & $XPosixPunct,
+ );
my $Digit = $perl->add_match_table('Digit',
- Description => '\d, extended beyond just [0-9]');
+ Description => '[0-9] + all other decimal digits');
$Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
- $posix_equivalent{'Digit'} = $Digit;
-
- # AHex was not present in early releases
- my $Xdigit = $perl->add_match_table('XDigit',
- Description => '[0-9A-Fa-f]');
- my $AHex = property_ref('ASCII_Hex_Digit');
- if (defined $AHex && ! $AHex->is_empty) {
- $Xdigit->set_equivalent_to($AHex->table('Y'), Related => 1);
+ $Digit->add_alias('XPosixDigit');
+ my $PosixDigit = $perl->add_match_table("PosixDigit",
+ Description => '[0-9]',
+ Initialize => $Digit & $ASCII,
+ );
+
+ # Hex_Digit was not present in first release
+ my $Xdigit = $perl->add_match_table('XDigit');
+ $Xdigit->add_alias('XPosixXDigit');
+ my $Hex = property_ref('Hex_Digit');
+ if (defined $Hex && ! $Hex->is_empty) {
+ $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
}
else {
- # (Have to use hex because could be running on an non-ASCII machine,
- # and we want the Unicode (ASCII) values)
- $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66 ]);
- }
-
- # Now, add the ASCII-restricted tables that get uniform treatment
- while (my ($name, $table) = each %posix_equivalent) {
- $perl->add_match_table("Posix$name",
- Description => "[[:$name:]]",
- Initialize => $table & $ASCII,
- );
- }
- $perl->table('PosixDigit')->add_description('\d, restricted to ASCII');
- $perl->table('PosixDigit')->add_description('[0-9]');
-
+ # (Have to use hex instead of e.g. '0', because could be running on an
+ # non-ASCII machine, and we want the Unicode (ASCII) values)
+ $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
+ 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
+ $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
+ }
+ $perl->add_match_table('PosixXDigit',
+ Initialize => $ASCII & $Xdigit,
+ Description => '[0-9A-Fa-f]',
+ );
my $dt = property_ref('Decomposition_Type');
$dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
Perl_Extension => 1,
- Note => 'Perl extension consisting of the union of all non-canonical decompositions',
+ Note => 'Union of all non-canonical decompositions',
);
# _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
}
# These are used in Unicode's definition of \X
+ my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
+ my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
+
my $gcb = property_ref('Grapheme_Cluster_Break');
- #my $extend = $perl->add_match_table('_X_Extend');
- my $extend = $perl->add_match_table('_GCB_Extend');
- # XXX until decide what todo my $begin = $perl->add_match_table('_X_Begin');
- if (defined $gcb) {
- $extend += $gcb->table('Extend') + $gcb->table('SpacingMark')
- #$begin += ~ ($gcb->table('Control')
- # + $gcb->table('CR')
- # + $gcb->table('LF'));
+
+ # The 'extended' grapheme cluster came in 5.1. The non-extended
+ # definition differs too much from the traditional Perl one to use.
+ if (defined $gcb && defined $gcb->table('SpacingMark')) {
+
+ # Note that assumes HST is defined; it came in an earlier release than
+ # GCB. In the line below, two negatives means: yes hangul
+ $begin += ~ property_ref('Hangul_Syllable_Type')
+ ->table('Not_Applicable')
+ + ~ ($gcb->table('Control')
+ + $gcb->table('CR')
+ + $gcb->table('LF'));
+ $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
+
+ $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
+ $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
}
else { # Old definition, used on early releases.
$extend += $gc->table('Mark')
- + 0x200C # ZWNJ
- + 0x200D; # ZWJ
- #$begin += ~ $extend;
- }
-
- # Create a new property specially located that is a combination of the
- # various Name properties: Name, Unicode_1_Name, Named Sequences, and
- # Name_Alias properties. (The final duplicates elements of the first.) A
- # comment for it is constructed based on the actual properties present and
- # used
- my $perl_charname = Property->new('Perl_Charnames',
- Core_Access => '\N{...} and charnames.pm',
- Default_Map => "",
- Directory => '.',
- File => 'Name',
- Internal_Only_Warning => 1,
- Perl_Extension => 1,
- Range_Size_1 => 1,
- Type => $STRING,
- Initialize => property_ref('Unicode_1_Name'),
- );
- # Name overrides Unicode_1_Name
- $perl_charname->property_add_or_replace_non_nulls(property_ref('Name'));
+ + 0x200C # ZWNJ
+ + 0x200D; # ZWJ
+ $begin += ~ $extend;
+
+ # Here we may have a release that has the regular grapheme cluster
+ # defined, or a release that doesn't have anything defined.
+ # We set things up so the Perl core degrades gracefully, possibly with
+ # placeholders that match nothing.
+
+ if (! defined $gcb) {
+ $gcb = Property->new('GCB', Status => $PLACEHOLDER);
+ }
+ my $hst = property_ref('HST');
+ if (!defined $hst) {
+ $hst = Property->new('HST', Status => $PLACEHOLDER);
+ $hst->add_match_table('Not_Applicable',
+ Initialize => $Any,
+ Matches_All => 1);
+ }
+
+ # On some releases, here we may not have the needed tables for the
+ # perl core, in some releases we may.
+ foreach my $name (qw{ L LV LVT T V prepend }) {
+ my $table = $gcb->table($name);
+ if (! defined $table) {
+ $table = $gcb->add_match_table($name);
+ push @tables_that_may_be_empty, $table->complete_name;
+ }
+
+ # The HST property predates the GCB one, and has identical tables
+ # for some of them, so use it if we can.
+ if ($table->is_empty
+ && defined $hst
+ && defined $hst->table($name))
+ {
+ $table += $hst->table($name);
+ }
+ }
+ }
+
+ # More GCB. If we found some hangul syllables, populate a combined
+ # table.
+ my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
+ my $LV = $gcb->table('LV');
+ if ($LV->is_empty) {
+ push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
+ } else {
+ $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
+ $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
+ }
+
+ # Was previously constructed to contain both Name and Unicode_1_Name
my @composition = ('Name', 'Unicode_1_Name');
if (@named_sequences) {
$alias_sentence = <<END;
The Name_Alias property adds duplicate code point entries with a corrected
name. The original (less correct, but still valid) name will be physically
-first.
+last.
END
}
my $comment;
$comment .= ", and $composition[-1]";
}
- # Wait for charnames to catch up
-# foreach my $entry (@more_Names,
-# split "\n", <<"END"
-#000A; LF
-#000C; FF
-#000D; CR
-#0085; NEL
-#200C; ZWNJ
-#200D; ZWJ
-#FEFF; BOM
-#FEFF; BYTE ORDER MARK
-#END
-# ) {
-# #local $to_trace = 1 if main::DEBUG;
-# trace $entry if main::DEBUG && $to_trace;
-# my ($code_point, $name) = split /\s*;\s*/, $entry;
-# $code_point = hex $code_point;
-# trace $code_point, $name if main::DEBUG && $to_trace;
-# $perl_charname->add_duplicate($code_point, $name);
-# }
-# #$perl_charname->add_comment("This file is for charnames.pm. It is the union of the $comment properties, plus certain commonly used but unofficial names, such as 'FF' and 'ZWNJ'. Unicode_1_Name entries are used only for otherwise nameless code points.$alias_sentence");
$perl_charname->add_comment(join_lines( <<END
This file is for charnames.pm. It is the union of the $comment properties.
Unicode_1_Name entries are used only for otherwise nameless code
Type => $ENUM,
Initialize => $ccc,
File => 'CombiningClass',
- Directory => '.',
+ Directory => File::Spec->curdir(),
);
$perl_ccc->set_to_output_map(1);
$perl_ccc->add_comment(join_lines(<<END
my $description_start = "Code point's usage introduced in version ";
$first_age->add_description($description_start . $first_age->name);
- # To construct the accumlated values, for each of the age tables
+ # To construct the accumulated values, for each of the age tables
# starting with the 2nd earliest, merge the earliest with it, to get
# all those code points existing in the 2nd earliest. Repeat merging
# the new 2nd earliest with the 3rd earliest to get all those existing
}
}
+ # Here done with all the basic stuff. Ready to populate the information
+ # about each character if annotating them.
+ if ($annotate) {
+
+ # See comments at its declaration
+ $annotate_ranges = Range_Map->new;
+
+ # This separates out the non-characters from the other unassigneds, so
+ # can give different annotations for each.
+ $unassigned_sans_noncharacters = Range_List->new(
+ Initialize => $gc->table('Unassigned')
+ & property_ref('Noncharacter_Code_Point')->table('N'));
+
+ for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) {
+ $i = populate_char_info($i); # Note sets $i so may cause skips
+ }
+ }
+
return;
}
# name. We could be in trouble, but not if this is just a
# synonym for another table that we have already made a child
# of the pre-existing one.
- if ($pre_existing->is_equivalent_to($actual)) {
+ if ($pre_existing->is_set_equivalent_to($actual)) {
trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
$pre_existing->add_alias($proposed_name);
next;
sub register_file_for_name($$$) {
# Given info about a table and a datafile that it should be associated
- # with, register that assocation
+ # with, register that association
my $table = shift;
my $directory_ref = shift; # Array of the directory path for the file
$parenthesized .= ')' if $parenthesized;
push @info, $parenthesized if $parenthesized;
- push @info, "($string_count)" if $output_range_counts;
+
+ if ($table_property != $perl && $table->perl_extension) {
+ push @info, '(Perl extension)';
+ }
+ push @info, "($string_count)";
# Now, we have both the entry and info so add them to the
# list of all the properties.
sub pod_alphanumeric_sort {
# Sort pod entries alphanumerically.
- # The first few character columns are filler; and get rid of all the
- # trailing stuff, starting with the trailing '}', so as to sort on just
- # '\p{Name=Value'
- my $a = lc substr($a, $FILLER);
+ # The first few character columns are filler, plus the '\p{'; and get rid
+ # of all the trailing stuff, starting with the trailing '}', so as to sort
+ # on just 'Name=Value'
+ (my $a = lc $a) =~ s/^ .*? { //x;
$a =~ s/}.*//;
- my $b = lc substr($b, $FILLER);
+ (my $b = lc $b) =~ s/^ .*? { //x;
$b =~ s/}.*//;
+ # Determine if the two operands are both internal only or both not.
+ # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
+ # should be the underscore that begins internal only
+ my $a_is_internal = (substr($a, 0, 1) eq '_');
+ my $b_is_internal = (substr($b, 0, 1) eq '_');
+
+ # Sort so the internals come last in the table instead of first (which the
+ # leading underscore would otherwise indicate).
+ if ($a_is_internal != $b_is_internal) {
+ return 1 if $a_is_internal;
+ return -1
+ }
+
# Determine if the two operands are numeric property values or not.
- # A numeric property will look like \p{xyz: 3}. But the number
+ # A numeric property will look like xyz: 3. But the number
# can begin with an optional minus sign, and may have a
- # fraction or rational component, like \p{xyz: 3/2}. If either
+ # fraction or rational component, like xyz: 3/2. If either
# isn't numeric, use alphabetic sort.
my ($a_initial, $a_number) =
- ($a =~ /^\\p{ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
+ ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
return $a cmp $b unless defined $a_number;
my ($b_initial, $b_number) =
- ($b =~ /^\\p{ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
+ ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
return $a cmp $b unless defined $b_number;
# Here they are both numeric, but use alphabetic sort if the
=head1 NAME
-$pod_file - Complete index of Unicode Version $string_version properties
+$pod_file - Index of Unicode Version $string_version properties in Perl
=head1 DESCRIPTION
B<Compound forms> consist of two components, separated by an equals sign or a
colon. The first component is the property name, and the second component is
the particular value of the property to match against, for example,
-'\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
+'\\p{Script: Greek}' and '\\p{Script=Greek}' both mean to match characters
whose Script property is Greek.
B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
=back
Some properties are considered obsolete, but still available. There are
-several varieties of obsolesence:
+several varieties of obsolescence:
=over 4
=item Obsolete
Properties marked with $a_bold_obsolete in the table are considered
-obsolete. At the time of this writing (Unicode version 5.2) there is no
-information in the Unicode standard about the implications of a property being
obsolete.
=item Stabilized
-Obsolete properties may be stabilized. This means that they are not actively
-maintained by Unicode, and will not be extended as new characters are added to
-the standard. Such properties are marked with $a_bold_stabilized in the
-table. At the time of this writing (Unicode version 5.2) there is no further
-information in the Unicode standard about the implications of a property being
-stabilized.
+Obsolete properties may be stabilized. Such a determination does not indicate
+that the property should or should not be used; instead it is a declaration
+that the property will not be maintained nor extended for newly encoded
+characters. Such properties are marked with $a_bold_stabilized in the
+table.
=item Deprecated
-Obsolete properties may be deprecated. This means that their use is strongly
+An obsolete property may be deprecated, perhaps because its original intent
+has been replaced by another property or because its specification was somehow
+defective. This means that its use is strongly
discouraged, so much so that a warning will be issued if used, unless the
regular expression is in the scope of a C<S<no warnings 'deprecated'>>
statement. $A_bold_deprecated flags each such entry in the table, and
@block_warning
The table below has two columns. The left column contains the \\p{}
-constructs to look up, possibly preceeded by the flags mentioned above; and
+constructs to look up, possibly preceded by the flags mentioned above; and
the right column contains information about them, like a description, or
synonyms. It shows both the single and compound forms for each property that
has them. If the left column is a short name for a property, the right column
The right column will also caution you if a property means something different
than what might normally be expected.
+All single forms are Perl extensions; a few compound forms are as well, and
+are noted as such.
+
Numbers in (parentheses) indicate the total number of code points matched by
the property. For emphasis, those properties that match no code points at all
are listed as well in a separate section following the table.
The Name property is accessible through the \\N{} interpolation in
double-quoted strings and regular expressions, but both usages require a C<use
-charnames;> to be specified, which also contains related functions viacode()
-and vianame().
+charnames;> to be specified, which also contains related functions viacode(),
+vianame(), and string_vianame().
=head1 Unicode regular expression properties that are NOT accepted by Perl
=back
An installation can choose to allow any of these to be matched by changing the
-controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
-and then re-running F<$0>. (C<\%Config> is available from the Config module).
+controlling lists contained in the program
+C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
+(C<\%Config> is available from the Config module).
=head1 Files in the I<To> directory (for serious hackers only)
For example, the uc() function implements the Uppercase_Mapping property and
uses the F<Upper.pl> file found in this directory.
-The available files with their properties (short names in parentheses),
-and any flags or comments about them, are:
+The available files in the current installation, with their properties (short
+names in parentheses), and any flags or comments about them, are:
@map_tables_actually_output
An installation can choose to change which files are generated by changing the
-controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
-and then re-running F<$0>.
+controlling lists contained in the program
+C<\$Config{privlib}>/F<unicore/mktables> and then re-running F<mktables>.
Each of these files defines two hash entries to help reading programs decipher
it. One of them looks like this:
END
- # And write it.
- main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
+ # And write it. The 0 means no utf8.
+ main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
return;
}
1;
END
- main::write("Heavy.pl", @heavy);
+ main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
return;
}
|| ! defined $pod_directory
|| ! $alias->make_pod_entry;
+ my $rhs = $full_property_name;
+ if ($property != $perl && $table->perl_extension) {
+ $rhs .= ' (Perl extension)';
+ }
push @match_properties,
format_pod_line($indent_info_column,
'\p{' . $alias->name . ': *}',
- $full_property_name,
+ $rhs,
$alias->status);
}
} # End of non-string-like property code
$filename = $table->file;
}
- # Use specified filename if avaliable, or default to property's
+ # Use specified filename if available, or default to property's
# shortest name. We need an 8.3 safe filename (which means "an 8
# safe" filename, since after the dot is only 'pl', which is < 3)
# The 2nd parameter is if the filename shouldn't be changed, and
. $spaces_after;
}
-sub generate_tests($$$$$$) {
+sub generate_tests($$$$$) {
# This used only for making the test script. It generates test cases that
# are expected to compile successfully in perl. Note that the lhs and
# rhs are assumed to already be as randomized as the caller wants.
- my $file_handle = shift; # Where to output the tests
my $lhs = shift; # The property: what's to the left of the colon
# or equals separator
my $rhs = shift; # The property value; what's to the right
# The whole 'property=value'
my $name = "$lhs$separator$rhs";
+ my @output;
# Create a complete set of tests, with complements.
if (defined $valid_code) {
- printf $file_handle
- qq/Expect(1, $valid_code, '\\p{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(0, $valid_code, '\\p{^$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(0, $valid_code, '\\P{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(1, $valid_code, '\\P{^$name}', $warning);\n/;
+ push @output, <<"EOC"
+Expect(1, $valid_code, '\\p{$name}', $warning);
+Expect(0, $valid_code, '\\p{^$name}', $warning);
+Expect(0, $valid_code, '\\P{$name}', $warning);
+Expect(1, $valid_code, '\\P{^$name}', $warning);
+EOC
}
if (defined $invalid_code) {
- printf $file_handle
- qq/Expect(0, $invalid_code, '\\p{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(1, $invalid_code, '\\p{^$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(1, $invalid_code, '\\P{$name}', $warning);\n/;
- printf $file_handle
- qq/Expect(0, $invalid_code, '\\P{^$name}', $warning);\n/;
- }
- return;
+ push @output, <<"EOC"
+Expect(0, $invalid_code, '\\p{$name}', $warning);
+Expect(1, $invalid_code, '\\p{^$name}', $warning);
+Expect(1, $invalid_code, '\\P{$name}', $warning);
+Expect(0, $invalid_code, '\\P{^$name}', $warning);
+EOC
+ }
+ return @output;
}
-sub generate_error($$$$) {
+sub generate_error($$$) {
# This used only for making the test script. It generates test cases that
# are expected to not only not match, but to be syntax or similar errors
- my $file_handle = shift; # Where to output to.
my $lhs = shift; # The property: what's to the left of the
# colon or equals separator
my $rhs = shift; # The property value; what's to the right
my $property = $lhs . $separator . $rhs;
- print $file_handle qq/Error('\\p{$property}');\n/;
- print $file_handle qq/Error('\\P{$property}');\n/;
- return;
+ return <<"EOC";
+Error('\\p{$property}');
+Error('\\P{$property}');
+EOC
}
# These are used only for making the test script
$t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
- force_unlink ($t_path);
- push @files_actually_output, $t_path;
- my $OUT;
- if (not open $OUT, "> $t_path") {
- Carp::my_carp("Can't open $t_path. Skipping: $!");
- return;
- }
-
# Keep going down an order of magnitude
# until find that adding this quantity to
# 1 remains 1; but put an upper limit on
# use previous one
$min_floating_slop = $next;
}
- print $OUT $HEADER, <DATA>;
+
+ # It doesn't matter whether the elements of this array contain single lines
+ # or multiple lines. main::write doesn't count the lines.
+ my @output;
foreach my $property (property_ref('*')) {
foreach my $table ($property->tables) {
my $already_error = ! $table->file_path;
# Generate error cases for this alias.
- generate_error($OUT,
- $property_name,
- $table_name,
- $already_error);
+ push @output, generate_error($property_name,
+ $table_name,
+ $already_error);
# If the table is guaranteed to always generate an error,
# quit now without generating success cases.
# Don't output duplicate test cases.
if (! exists $test_generated{$test_name}) {
$test_generated{$test_name} = 1;
- generate_tests($OUT,
- $property_name,
- $standard,
- $valid,
- $invalid,
- $warning,
- );
+ push @output, generate_tests($property_name,
+ $standard,
+ $valid,
+ $invalid,
+ $warning,
+ );
}
$random = randomize_loose_name($table_name)
}
my $test_name = "$property_name=$random";
if (! exists $test_generated{$test_name}) {
$test_generated{$test_name} = 1;
- generate_tests($OUT,
- $property_name,
- $random,
- $valid,
- $invalid,
- $warning,
- );
+ push @output, generate_tests($property_name,
+ $random,
+ $valid,
+ $invalid,
+ $warning,
+ );
# If the name is a rational number, add tests for the
# floating point equivalent.
if abs($table_name - $existing)
< $MAX_FLOATING_SLOP;
}
- generate_error($OUT,
- $property_name,
- $table_name,
- 1 # 1 => already an error
- );
+ push @output, generate_error($property_name,
+ $table_name,
+ 1 # 1 => already an error
+ );
}
else {
# Here the number of digits exceeds the
# minimum we think is needed. So generate a
# success test case for it.
- generate_tests($OUT,
- $property_name,
- $table_name,
- $valid,
- $invalid,
- $warning,
- );
+ push @output, generate_tests($property_name,
+ $table_name,
+ $valid,
+ $invalid,
+ $warning,
+ );
}
}
}
}
}
}
- print $OUT "Finished();\n";
- close $OUT;
+
+ &write($t_path,
+ 0, # Not utf8;
+ [<DATA>,
+ @output,
+ (map {"Test_X('$_');\n"} @backslash_X_tests),
+ "Finished();\n"]);
return;
}
? \&filter_v1_ucd
: ($v_version eq v2.1.5)
? \&filter_v2_1_5_ucd
- : undef),
+
+ # And for 5.14 Perls with 6.0,
+ # have to also make changes
+ : ($v_version ge v6.0.0)
+ ? \&filter_v6_ucd
+ : undef),
# And the main filter
\&filter_UnicodeData_line,
Input_file->new('BidiMirroring.txt', v3.0.1,
Property => 'Bidi_Mirroring_Glyph',
),
+ Input_file->new("NormalizationTest.txt", v3.0.1,
+ Skip => 1,
+ ),
Input_file->new('CaseFolding.txt', v3.0.1,
Pre_Handler => \&setup_case_folding,
Each_Line_Handler =>
Property => 'Grapheme_Cluster_Break',
Has_Missings_Defaults => $NOT_IGNORED,
),
+ Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
+ Handler => \&process_GCB_test,
+ ),
+ Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
+ Skip => 1,
+ ),
+ Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
+ Skip => 1,
+ ),
+ Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
+ Skip => 1,
+ ),
Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
Property => 'Sentence_Break',
Has_Missings_Defaults => $NOT_IGNORED,
Input_file->new('NameAliases.txt', v5.0.0,
Property => 'Name_Alias',
),
+ Input_file->new("BidiTest.txt", v5.2.0,
+ Skip => 1,
+ ),
Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
Optional => 1,
Each_Line_Handler => \&filter_unihan_line,
# Put into %potential_files a list of all the files in the directory structure
# that could be inputs to this program, excluding those that we should ignore.
-# Also don't consider test files. Use absolute file names because it makes it
-# easier across machine types.
+# Use absolute file names because it makes it easier across machine types.
my @ignored_files_full_names = map { File::Spec->rel2abs(
internal_file_to_platform($_))
} keys %ignored_files;
File::Find::find({
wanted=>sub {
- return unless /\.txt$/i;
- return if /Test\.txt$/i;
- my $full = File::Spec->rel2abs($_);
+ return unless /\.txt$/i; # Some platforms change the name's case
+ my $full = lc(File::Spec->rel2abs($_));
$potential_files{$full} = 1
- if ! grep { $full eq $_ } @ignored_files_full_names;
+ if ! grep { $full eq lc($_) } @ignored_files_full_names;
return;
}
}, File::Spec->curdir());
my @mktables_list_output_files;
+my $old_start_time = 0;
-if ($write_unchanged_files) {
+if (! -e $file_list) {
+ print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
+ $write_unchanged_files = 1;
+} elsif ($write_unchanged_files) {
print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
}
else {
print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
my $file_handle;
if (! open $file_handle, "<", $file_list) {
- Carp::my_carp("Failed to open '$file_list' (this is expected to be missing the first time); turning on -globlist option instead: $!");
+ Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
$glob_list = 1;
}
else {
for my $list ( \@input, \@mktables_list_output_files ) {
while (<$file_handle>) {
s/^ \s+ | \s+ $//xg;
+ if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
+ $old_start_time = $1;
+ }
next if /^ \s* (?: \# .* )? $/x;
last if /^ =+ $/x;
my ( $file ) = split /\t/;
# The paths are stored with relative names, and with '/' as the
# delimiter; convert to absolute on this machine
- my $full = File::Spec->rel2abs(internal_file_to_platform($input));
+ my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
$potential_files{$full} = 1
- if ! grep { $full eq $_ } @ignored_files_full_names;
+ if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
}
}
my @unknown_input_files;
foreach my $file (keys %potential_files) {
- next if grep { $file eq $_ } @known_files;
+ next if grep { lc($file) eq lc($_) } @known_files;
# Here, the file is unknown to us. Get relative path name
$file = File::Spec->abs2rel($file);
# If the file isn't extracted (meaning none of the directories is the
# extracted one), just add it to the end of the list of inputs.
if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
- push @input_file_objects, Input_file->new($file);
+ push @input_file_objects, Input_file->new($file, v0);
}
else {
# $compare_versions is set.
for (my $i = 0; $i < @input_file_objects; $i++) {
if ($input_file_objects[$i]->first_released ne v0
- && $input_file_objects[$i]->file ne 'DAge.txt'
- && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/)
+ && lc($input_file_objects[$i]->file) ne 'dage.txt'
+ && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
{
- splice @input_file_objects, $i, 0, Input_file->new($file);
+ splice @input_file_objects, $i, 0,
+ Input_file->new($file, v0);
last;
}
}
typical property files. You'll know by later error messages if it worked or
not:
END
- ) . join(", ", @unknown_input_files) . "\n\n");
+ ) . " " . join(", ", @unknown_input_files) . "\n\n");
}
} # End of looking through directory structure for more .txt files.
"Checking ".scalar( @mktables_list_output_files )." output files.\n";
}
-# We set $youngest to be the most recently changed input file, including this
-# program itself (done much earlier in this file)
+# We set $most_recent to be the most recently changed input file, including
+# this program itself (done much earlier in this file)
foreach my $in (@input_files) {
- my $age = -M $in;
- next unless defined $age; # Keep going even if missing a file
- $youngest = $age if $age < $youngest;
+ next unless -e $in; # Keep going even if missing a file
+ my $mod_time = (stat $in)[9];
+ $most_recent = $mod_time if $mod_time > $most_recent;
# See that the input files have distinct names, to warn someone if they
# are adding a new one
}
}
-my $ok = ! $write_unchanged_files
- && scalar @mktables_list_output_files; # If none known, rebuild
+my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
+ || ! scalar @mktables_list_output_files # or if no outputs known
+ || $old_start_time < $most_recent; # or out-of-date
# Now we check to see if any output files are older than youngest, if
# they are, we need to continue on, otherwise we can presumably bail.
-if ($ok) {
+if (! $rebuild) {
foreach my $out (@mktables_list_output_files) {
if ( ! file_exists($out)) {
print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
- $ok = 0;
+ $rebuild = 1;
last;
}
#local $to_trace = 1 if main::DEBUG;
- trace $youngest, -M $out if main::DEBUG && $to_trace;
- if ( -M $out > $youngest ) {
- #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
+ trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
+ if ( (stat $out)[9] <= $most_recent ) {
+ #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
- $ok = 0;
+ $rebuild = 1;
last;
}
}
}
-if ($ok) {
- print "Files seem to be ok, not bothering to rebuild.\n";
+if (! $rebuild) {
+ print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
exit(0);
}
print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
return
}
else {
+ my $localtime = localtime $start_time;
print $ofh <<"END";
#
# $file_list -- File list for $0.
#
-# Autogenerated on @{[scalar localtime]}
+# Autogenerated starting on $start_time ($localtime)
#
# - First section is input files
# ($0 itself is not listed but is automatically considered an input)
-# - Section seperator is /^=+\$/
+# - Section separator is /^=+\$/
# - Second section is a list of output files.
# - Lines matching /^\\s*#/ are treated as comments
# which along with blank lines are ignored.
use strict;
use warnings;
-# Test the \p{} regular expression constructs. This file is constructed by
-# mktables from the tables it generates, so if mktables is buggy, this won't
-# necessarily catch those bugs. Tests are generated for all feasible
-# properties; a few aren't currently feasible; see is_code_point_usable()
-# in mktables for details.
+# If run outside the normal test suite on an ASCII platform, you can
+# just create a latin1_to_native() function that just returns its
+# inputs, because that's the only function used from test.pl
+require "test.pl";
+
+# Test qr/\X/ and the \p{} regular expression constructs. This file is
+# constructed by mktables from the tables it generates, so if mktables is
+# buggy, this won't necessarily catch those bugs. Tests are generated for all
+# feasible properties; a few aren't currently feasible; see
+# is_code_point_usable() in mktables for details.
# Standard test packages are not used because this manipulates SIG_WARN. It
# exits 0 if every non-skipped test succeeded; -1 if any failed.
my $Tests = 0;
my $Fails = 0;
-my $Skips = 0;
-
-my $non_ASCII = (ord('A') == 65);
-
-# The first 127 ASCII characters in ordinal order, with the ones that don't
-# have Perl names (as of 5.8) replaced by dots. The 127th is used as the
-# string delimiter
-my $ascii_to_ebcdic = "\0......\a\b\t\n.\f\r.................. !\"#\$\%&'()*+,-./0123456789:;<=>?\@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~";
-#for my $i (0..126) {
-# print $i, ": ", substr($ascii_to_ebcdic, $i, 1), "\n";
-#}
sub Expect($$$$) {
my $expected = shift;
my $warning_type = shift; # Type of warning message, like 'deprecated'
# or empty if none
my $line = (caller)[2];
+ $ord = ord(latin1_to_native(chr($ord)));
# Convert the code point to hex form
my $string = sprintf "\"\\x{%04X}\"", $ord;
- # Convert the non-ASCII code points expressible as characters in Perl 5.8
- # to their ASCII equivalents, and skip the others.
- if ($non_ASCII && $ord < 255) {
-
- # Dots are used as place holders in the conversion string for the
- # non-convertible ones, so check for it first.
- if ($ord == 0x2E) {
- $ord = ord('.');
- }
- elsif ($ord < 0x7F
- # Any dots returned are non-convertible.
- && ((my $char = substr($ascii_to_ebcdic, $ord, 1)) ne '.'))
- {
- #print STDERR "$ord, $char, \n";
- $ord = ord($char);
- }
- else {
- $Tests++;
- $Skips++;
- print "ok $Tests - $string =~ $regex # Skipped: non-ASCII\n";
- return;
- }
- }
-
- # The first time through, use all warnings.
my @tests = "";
- # If the input should generate a warning, add another time through with
- # them turned off
+ # The first time through, use all warnings. If the input should generate
+ # a warning, add another time through with them turned off
push @tests, "no warnings '$warning_type';" if $warning_type;
foreach my $no_warnings (@tests) {
return;
}
+# GCBTest.txt character that separates grapheme clusters
+my $breakable_utf8 = my $breakable = chr(0xF7);
+utf8::upgrade($breakable_utf8);
+
+# GCBTest.txt character that indicates that the adjoining code points are part
+# of the same grapheme cluster
+my $nobreak_utf8 = my $nobreak = chr(0xD7);
+utf8::upgrade($nobreak_utf8);
+
+sub Test_X($) {
+ # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
+ # Each such line is a sequence of code points given by their hex numbers,
+ # separated by the two characters defined just before this subroutine that
+ # indicate that either there can or cannot be a break between the adjacent
+ # code points. If there isn't a break, that means the sequence forms an
+ # extended grapheme cluster, which means that \X should match the whole
+ # thing. If there is a break, \X should stop there. This is all
+ # converted by this routine into a match:
+ # $string =~ /(\X)/,
+ # Each \X should match the next cluster; and that is what is checked.
+
+ my $template = shift;
+
+ my $line = (caller)[2];
+
+ # The line contains characters above the ASCII range, but in Latin1. It
+ # may or may not be in utf8, and if it is, it may or may not know it. So,
+ # convert these characters to 8 bits. If knows is in utf8, simply
+ # downgrade.
+ if (utf8::is_utf8($template)) {
+ utf8::downgrade($template);
+ } else {
+
+ # Otherwise, if it is in utf8, but doesn't know it, the next lines
+ # convert the two problematic characters to their 8-bit equivalents.
+ # If it isn't in utf8, they don't harm anything.
+ use bytes;
+ $template =~ s/$nobreak_utf8/$nobreak/g;
+ $template =~ s/$breakable_utf8/$breakable/g;
+ }
+
+ # Get rid of the leading and trailing breakables
+ $template =~ s/^ \s* $breakable \s* //x;
+ $template =~ s/ \s* $breakable \s* $ //x;
+
+ # And no-breaks become just a space.
+ $template =~ s/ \s* $nobreak \s* / /xg;
+
+ # Split the input into segments that are breakable between them.
+ my @segments = split /\s*$breakable\s*/, $template;
+
+ my $string = "";
+ my $display_string = "";
+ my @should_match;
+ my @should_display;
+
+ # Convert the code point sequence in each segment into a Perl string of
+ # characters
+ foreach my $segment (@segments) {
+ my @code_points = split /\s+/, $segment;
+ my $this_string = "";
+ my $this_display = "";
+ foreach my $code_point (@code_points) {
+ $this_string .= latin1_to_native(chr(hex $code_point));
+ $this_display .= "\\x{$code_point}";
+ }
+
+ # The next cluster should match the string in this segment.
+ push @should_match, $this_string;
+ push @should_display, $this_display;
+ $string .= $this_string;
+ $display_string .= $this_display;
+ }
+
+ # If a string can be represented in both non-ut8 and utf8, test both cases
+ UPGRADE:
+ for my $to_upgrade (0 .. 1) {
+
+ if ($to_upgrade) {
+
+ # If already in utf8, would just be a repeat
+ next UPGRADE if utf8::is_utf8($string);
+
+ utf8::upgrade($string);
+ }
+
+ # Finally, do the \X match.
+ my @matches = $string =~ /(\X)/g;
+
+ # Look through each matched cluster to verify that it matches what we
+ # expect.
+ my $min = (@matches < @should_match) ? @matches : @should_match;
+ for my $i (0 .. $min - 1) {
+ $Tests++;
+ if ($matches[$i] eq $should_match[$i]) {
+ print "ok $Tests - ";
+ if ($i == 0) {
+ print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
+ } else {
+ print "And \\X #", $i + 1,
+ }
+ print " correctly matched $should_display[$i]; line $line\n";
+ } else {
+ $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
+ unpack("U*", $matches[$i]));
+ print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
+ $i + 1,
+ " should have matched $should_display[$i]",
+ " but instead matched $matches[$i]",
+ ". Abandoning rest of line $line\n";
+ next UPGRADE;
+ }
+ }
+
+ # And the number of matches should equal the number of expected matches.
+ $Tests++;
+ if (@matches == @should_match) {
+ print "ok $Tests - Nothing was left over; line $line\n";
+ } else {
+ print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
+ }
+ }
+
+ return;
+}
+
sub Finished() {
print "1..$Tests\n";
exit($Fails ? -1 : 0);
}
Error('\p{Script=InGreek}'); # Bug #69018
+Test_X("1100 $nobreak 1161"); # Bug #70940
+Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
+Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
+Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726