This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Change function name
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a 19my $start_time;
98dc9551 20BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
21 # get it as close as possible
22 $start_time= time;
23}
24
23e33b60 25require 5.010_001;
d73e5302 26use strict;
99870f4d 27use warnings;
cf25bb62 28use Carp;
bd9ebcfd 29use Config;
99870f4d
KW
30use File::Find;
31use File::Path;
d07a55ed 32use File::Spec;
99870f4d 33use Text::Tabs;
6b64c11c 34use re "/aa";
99870f4d
KW
35
36sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 37my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d
KW
38
39##########################################################################
40#
41# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42# from the Unicode database files (lib/unicore/.../*.txt), It also generates
43# a pod file and a .t file
44#
45# The structure of this file is:
46# First these introductory comments; then
47# code needed for everywhere, such as debugging stuff; then
48# code to handle input parameters; then
49# data structures likely to be of external interest (some of which depend on
50# the input parameters, so follows them; then
51# more data structures and subroutine and package (class) definitions; then
52# the small actual loop to process the input files and finish up; then
53# a __DATA__ section, for the .t tests
54#
5f7264c7 55# This program works on all releases of Unicode through at least 6.0. The
99870f4d
KW
56# outputs have been scrutinized most intently for release 5.1. The others
57# have been checked for somewhat more than just sanity. It can handle all
58# existing Unicode character properties in those releases.
59#
99870f4d
KW
60# This program is mostly about Unicode character (or code point) properties.
61# A property describes some attribute or quality of a code point, like if it
62# is lowercase or not, its name, what version of Unicode it was first defined
63# in, or what its uppercase equivalent is. Unicode deals with these disparate
64# possibilities by making all properties into mappings from each code point
65# into some corresponding value. In the case of it being lowercase or not,
66# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67# property maps each Unicode code point to a single value, called a "property
68# value". (Hence each Unicode property is a true mathematical function with
69# exactly one value per code point.)
70#
71# When using a property in a regular expression, what is desired isn't the
72# mapping of the code point to its property's value, but the reverse (or the
73# mathematical "inverse relation"): starting with the property value, "Does a
74# code point map to it?" These are written in a "compound" form:
75# \p{property=value}, e.g., \p{category=punctuation}. This program generates
76# files containing the lists of code points that map to each such regular
77# expression property value, one file per list
78#
79# There is also a single form shortcut that Perl adds for many of the commonly
80# used properties. This happens for all binary properties, plus script,
81# general_category, and block properties.
82#
83# Thus the outputs of this program are files. There are map files, mostly in
84# the 'To' directory; and there are list files for use in regular expression
85# matching, all in subdirectories of the 'lib' directory, with each
86# subdirectory being named for the property that the lists in it are for.
87# Bookkeeping, test, and documentation files are also generated.
88
89my $matches_directory = 'lib'; # Where match (\p{}) files go.
90my $map_directory = 'To'; # Where map files go.
91
92# DATA STRUCTURES
93#
94# The major data structures of this program are Property, of course, but also
95# Table. There are two kinds of tables, very similar to each other.
96# "Match_Table" is the data structure giving the list of code points that have
97# a particular property value, mentioned above. There is also a "Map_Table"
98# data structure which gives the property's mapping from code point to value.
99# There are two structures because the match tables need to be combined in
100# various ways, such as constructing unions, intersections, complements, etc.,
101# and the map ones don't. And there would be problems, perhaps subtle, if
102# a map table were inadvertently operated on in some of those ways.
103# The use of separate classes with operations defined on one but not the other
104# prevents accidentally confusing the two.
105#
106# At the heart of each table's data structure is a "Range_List", which is just
107# an ordered list of "Ranges", plus ancillary information, and methods to
108# operate on them. A Range is a compact way to store property information.
109# Each range has a starting code point, an ending code point, and a value that
110# is meant to apply to all the code points between the two end points,
111# inclusive. For a map table, this value is the property value for those
112# code points. Two such ranges could be written like this:
113# 0x41 .. 0x5A, 'Upper',
114# 0x61 .. 0x7A, 'Lower'
115#
116# Each range also has a type used as a convenience to classify the values.
117# Most ranges in this program will be Type 0, or normal, but there are some
118# ranges that have a non-zero type. These are used only in map tables, and
119# are for mappings that don't fit into the normal scheme of things. Mappings
120# that require a hash entry to communicate with utf8.c are one example;
121# another example is mappings for charnames.pm to use which indicate a name
122# that is algorithmically determinable from its code point (and vice-versa).
123# These are used to significantly compact these tables, instead of listing
124# each one of the tens of thousands individually.
125#
126# In a match table, the value of a range is irrelevant (and hence the type as
127# well, which will always be 0), and arbitrarily set to the null string.
128# Using the example above, there would be two match tables for those two
129# entries, one named Upper would contain the 0x41..0x5A range, and the other
130# named Lower would contain 0x61..0x7A.
131#
132# Actually, there are two types of range lists, "Range_Map" is the one
133# associated with map tables, and "Range_List" with match tables.
134# Again, this is so that methods can be defined on one and not the other so as
135# to prevent operating on them in incorrect ways.
136#
137# Eventually, most tables are written out to files to be read by utf8_heavy.pl
138# in the perl core. All tables could in theory be written, but some are
139# suppressed because there is no current practical use for them. It is easy
140# to change which get written by changing various lists that are near the top
141# of the actual code in this file. The table data structures contain enough
142# ancillary information to allow them to be treated as separate entities for
143# writing, such as the path to each one's file. There is a heading in each
144# map table that gives the format of its entries, and what the map is for all
145# the code points missing from it. (This allows tables to be more compact.)
678f13d5 146#
99870f4d
KW
147# The Property data structure contains one or more tables. All properties
148# contain a map table (except the $perl property which is a
149# pseudo-property containing only match tables), and any properties that
150# are usable in regular expression matches also contain various matching
151# tables, one for each value the property can have. A binary property can
152# have two values, True and False (or Y and N, which are preferred by Unicode
153# terminology). Thus each of these properties will have a map table that
154# takes every code point and maps it to Y or N (but having ranges cuts the
155# number of entries in that table way down), and two match tables, one
156# which has a list of all the code points that map to Y, and one for all the
157# code points that map to N. (For each of these, a third table is also
158# generated for the pseudo Perl property. It contains the identical code
159# points as the Y table, but can be written, not in the compound form, but in
160# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
161# properties have several possible values, some have many, and properties like
162# Name have a different value for every named code point. Those will not,
163# unless the controlling lists are changed, have their match tables written
164# out. But all the ones which can be used in regular expression \p{} and \P{}
c12f2655
KW
165# constructs will. Prior to 5.14, generally a property would have either its
166# map table or its match tables written but not both. Again, what gets
167# written is controlled by lists which can easily be changed. Starting in
168# 5.14, advantage was taken of this, and all the map tables needed to
169# reconstruct the Unicode db are now written out, while suppressing the
170# Unicode .txt files that contain the data. Our tables are much more compact
171# than the .txt files, so a significant space savings was achieved.
172
173# Properties have a 'Type', like binary, or string, or enum depending on how
174# many match tables there are and the content of the maps. This 'Type' is
175# different than a range 'Type', so don't get confused by the two concepts
176# having the same name.
678f13d5 177#
99870f4d
KW
178# For information about the Unicode properties, see Unicode's UAX44 document:
179
180my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
181
182# As stated earlier, this program will work on any release of Unicode so far.
183# Most obvious problems in earlier data have NOT been corrected except when
184# necessary to make Perl or this program work reasonably. For example, no
c12f2655
KW
185# folding information was given in early releases, so this program substitutes
186# lower case instead, just so that a regular expression with the /i option
187# will do something that actually gives the right results in many cases.
188# There are also a couple other corrections for version 1.1.5, commented at
189# the point they are made. As an example of corrections that weren't made
190# (but could be) is this statement from DerivedAge.txt: "The supplementary
191# private use code points and the non-character code points were assigned in
192# version 2.0, but not specifically listed in the UCD until versions 3.0 and
193# 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
194# on Unicode version glitches is further down in these introductory comments.
99870f4d 195#
5f7264c7
KW
196# This program works on all non-provisional properties as of 6.0, though the
197# files for some are suppressed from apparent lack of demand for them. You
198# can change which are output by changing lists in this program.
678f13d5 199#
dc85bd38 200# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
201# loose matchings rules (from Unicode TR18):
202#
203# The recommended names for UCD properties and property values are in
204# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
205# [PropValue]. There are both abbreviated names and longer, more
206# descriptive names. It is strongly recommended that both names be
207# recognized, and that loose matching of property names be used,
208# whereby the case distinctions, whitespace, hyphens, and underbar
209# are ignored.
210# The program still allows Fuzzy to override its determination of if loose
211# matching should be used, but it isn't currently used, as it is no longer
212# needed; the calculations it makes are good enough.
678f13d5 213#
99870f4d
KW
214# SUMMARY OF HOW IT WORKS:
215#
216# Process arguments
217#
218# A list is constructed containing each input file that is to be processed
219#
220# Each file on the list is processed in a loop, using the associated handler
221# code for each:
222# The PropertyAliases.txt and PropValueAliases.txt files are processed
223# first. These files name the properties and property values.
224# Objects are created of all the property and property value names
225# that the rest of the input should expect, including all synonyms.
226# The other input files give mappings from properties to property
227# values. That is, they list code points and say what the mapping
228# is under the given property. Some files give the mappings for
229# just one property; and some for many. This program goes through
230# each file and populates the properties from them. Some properties
231# are listed in more than one file, and Unicode has set up a
232# precedence as to which has priority if there is a conflict. Thus
233# the order of processing matters, and this program handles the
234# conflict possibility by processing the overriding input files
235# last, so that if necessary they replace earlier values.
236# After this is all done, the program creates the property mappings not
237# furnished by Unicode, but derivable from what it does give.
238# The tables of code points that match each property value in each
239# property that is accessible by regular expressions are created.
240# The Perl-defined properties are created and populated. Many of these
241# require data determined from the earlier steps
242# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 243# and Unicode are reconciled and warned about.
99870f4d
KW
244# All the properties are written to files
245# Any other files are written, and final warnings issued.
678f13d5 246#
99870f4d
KW
247# For clarity, a number of operators have been overloaded to work on tables:
248# ~ means invert (take all characters not in the set). The more
249# conventional '!' is not used because of the possibility of confusing
250# it with the actual boolean operation.
251# + means union
252# - means subtraction
253# & means intersection
254# The precedence of these is the order listed. Parentheses should be
255# copiously used. These are not a general scheme. The operations aren't
256# defined for a number of things, deliberately, to avoid getting into trouble.
257# Operations are done on references and affect the underlying structures, so
258# that the copy constructors for them have been overloaded to not return a new
259# clone, but the input object itself.
678f13d5 260#
99870f4d
KW
261# The bool operator is deliberately not overloaded to avoid confusion with
262# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
263#
264# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
265#
266# This program needs to be able to run under miniperl. Therefore, it uses a
267# minimum of other modules, and hence implements some things itself that could
268# be gotten from CPAN
269#
270# This program uses inputs published by the Unicode Consortium. These can
271# change incompatibly between releases without the Perl maintainers realizing
272# it. Therefore this program is now designed to try to flag these. It looks
273# at the directories where the inputs are, and flags any unrecognized files.
274# It keeps track of all the properties in the files it handles, and flags any
275# that it doesn't know how to handle. It also flags any input lines that
276# don't match the expected syntax, among other checks.
277#
278# It is also designed so if a new input file matches one of the known
279# templates, one hopefully just needs to add it to a list to have it
280# processed.
281#
282# As mentioned earlier, some properties are given in more than one file. In
283# particular, the files in the extracted directory are supposedly just
284# reformattings of the others. But they contain information not easily
285# derivable from the other files, including results for Unihan, which this
286# program doesn't ordinarily look at, and for unassigned code points. They
287# also have historically had errors or been incomplete. In an attempt to
288# create the best possible data, this program thus processes them first to
289# glean information missing from the other files; then processes those other
290# files to override any errors in the extracted ones. Much of the design was
291# driven by this need to store things and then possibly override them.
292#
293# It tries to keep fatal errors to a minimum, to generate something usable for
294# testing purposes. It always looks for files that could be inputs, and will
295# warn about any that it doesn't know how to handle (the -q option suppresses
296# the warning).
99870f4d 297#
678f13d5
KW
298# Why is there more than one type of range?
299# This simplified things. There are some very specialized code points that
300# have to be handled specially for output, such as Hangul syllable names.
301# By creating a range type (done late in the development process), it
302# allowed this to be stored with the range, and overridden by other input.
303# Originally these were stored in another data structure, and it became a
304# mess trying to decide if a second file that was for the same property was
305# overriding the earlier one or not.
306#
307# Why are there two kinds of tables, match and map?
308# (And there is a base class shared by the two as well.) As stated above,
309# they actually are for different things. Development proceeded much more
310# smoothly when I (khw) realized the distinction. Map tables are used to
311# give the property value for every code point (actually every code point
312# that doesn't map to a default value). Match tables are used for regular
313# expression matches, and are essentially the inverse mapping. Separating
314# the two allows more specialized methods, and error checks so that one
315# can't just take the intersection of two map tables, for example, as that
316# is nonsensical.
99870f4d 317#
23e33b60
KW
318# DEBUGGING
319#
678f13d5
KW
320# This program is written so it will run under miniperl. Occasionally changes
321# will cause an error where the backtrace doesn't work well under miniperl.
322# To diagnose the problem, you can instead run it under regular perl, if you
323# have one compiled.
324#
325# There is a good trace facility. To enable it, first sub DEBUG must be set
326# to return true. Then a line like
327#
328# local $to_trace = 1 if main::DEBUG;
329#
330# can be added to enable tracing in its lexical scope or until you insert
331# another line:
332#
333# local $to_trace = 0 if main::DEBUG;
334#
335# then use a line like "trace $a, @b, %c, ...;
336#
337# Some of the more complex subroutines already have trace statements in them.
338# Permanent trace statements should be like:
339#
340# trace ... if main::DEBUG && $to_trace;
341#
342# If there is just one or a few files that you're debugging, you can easily
343# cause most everything else to be skipped. Change the line
344#
345# my $debug_skip = 0;
346#
347# to 1, and every file whose object is in @input_file_objects and doesn't have
348# a, 'non_skip => 1,' in its constructor will be skipped.
349#
b4a0206c 350# To compare the output tables, it may be useful to specify the -annotate
c4019d52
KW
351# flag. This causes the tables to expand so there is one entry for each
352# non-algorithmically named code point giving, currently its name, and its
353# graphic representation if printable (and you have a font that knows about
354# it). This makes it easier to see what the particular code points are in
355# each output table. The tables are usable, but because they don't have
356# ranges (for the most part), a Perl using them will run slower. Non-named
357# code points are annotated with a description of their status, and contiguous
358# ones with the same description will be output as a range rather than
359# individually. Algorithmically named characters are also output as ranges,
360# except when there are just a few contiguous ones.
361#
99870f4d
KW
362# FUTURE ISSUES
363#
364# The program would break if Unicode were to change its names so that
365# interior white space, underscores, or dashes differences were significant
366# within property and property value names.
367#
368# It might be easier to use the xml versions of the UCD if this program ever
369# would need heavy revision, and the ability to handle old versions was not
370# required.
371#
372# There is the potential for name collisions, in that Perl has chosen names
373# that Unicode could decide it also likes. There have been such collisions in
374# the past, with mostly Perl deciding to adopt the Unicode definition of the
375# name. However in the 5.2 Unicode beta testing, there were a number of such
376# collisions, which were withdrawn before the final release, because of Perl's
377# and other's protests. These all involved new properties which began with
378# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
379# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
380# Unicode document, so they are unlikely to be used by Unicode for another
381# purpose. However, they might try something beginning with 'In', or use any
382# of the other Perl-defined properties. This program will warn you of name
383# collisions, and refuse to generate tables with them, but manual intervention
384# will be required in this event. One scheme that could be implemented, if
385# necessary, would be to have this program generate another file, or add a
386# field to mktables.lst that gives the date of first definition of a property.
387# Each new release of Unicode would use that file as a basis for the next
388# iteration. And the Perl synonym addition code could sort based on the age
389# of the property, so older properties get priority, and newer ones that clash
390# would be refused; hence existing code would not be impacted, and some other
391# synonym would have to be used for the new property. This is ugly, and
392# manual intervention would certainly be easier to do in the short run; lets
393# hope it never comes to this.
678f13d5 394#
99870f4d
KW
395# A NOTE ON UNIHAN
396#
397# This program can generate tables from the Unihan database. But it doesn't
398# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
399# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
400# database was split into 8 different files, all beginning with the letters
401# 'Unihan'. This program will read those file(s) if present, but it needs to
402# know which of the many properties in the file(s) should have tables created
403# for them. It will create tables for any properties listed in
404# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
405# @cjk_properties array and the @cjk_property_values array. Thus, if a
406# property you want is not in those files of the release you are building
407# against, you must add it to those two arrays. Starting in 4.0, the
408# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
409# is present in the directory, a table will be generated for that property.
410# In 5.2, several more properties were added. For your convenience, the two
5f7264c7 411# arrays are initialized with all the 6.0 listed properties that are also in
99870f4d
KW
412# earlier releases. But these are commented out. You can just uncomment the
413# ones you want, or use them as a template for adding entries for other
414# properties.
415#
416# You may need to adjust the entries to suit your purposes. setup_unihan(),
417# and filter_unihan_line() are the functions where this is done. This program
418# already does some adjusting to make the lines look more like the rest of the
419# Unicode DB; You can see what that is in filter_unihan_line()
420#
421# There is a bug in the 3.2 data file in which some values for the
422# kPrimaryNumeric property have commas and an unexpected comment. A filter
423# could be added for these; or for a particular installation, the Unihan.txt
424# file could be edited to fix them.
99870f4d 425#
678f13d5
KW
426# HOW TO ADD A FILE TO BE PROCESSED
427#
428# A new file from Unicode needs to have an object constructed for it in
429# @input_file_objects, probably at the end or at the end of the extracted
430# ones. The program should warn you if its name will clash with others on
431# restrictive file systems, like DOS. If so, figure out a better name, and
432# add lines to the README.perl file giving that. If the file is a character
433# property, it should be in the format that Unicode has by default
434# standardized for such files for the more recently introduced ones.
435# If so, the Input_file constructor for @input_file_objects can just be the
436# file name and release it first appeared in. If not, then it should be
437# possible to construct an each_line_handler() to massage the line into the
438# standardized form.
439#
440# For non-character properties, more code will be needed. You can look at
441# the existing entries for clues.
442#
443# UNICODE VERSIONS NOTES
444#
445# The Unicode UCD has had a number of errors in it over the versions. And
446# these remain, by policy, in the standard for that version. Therefore it is
447# risky to correct them, because code may be expecting the error. So this
448# program doesn't generally make changes, unless the error breaks the Perl
449# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
450# for U+1105, which causes real problems for the algorithms for Jamo
451# calculations, so it is changed here.
452#
453# But it isn't so clear cut as to what to do about concepts that are
454# introduced in a later release; should they extend back to earlier releases
455# where the concept just didn't exist? It was easier to do this than to not,
456# so that's what was done. For example, the default value for code points not
457# in the files for various properties was probably undefined until changed by
458# some version. No_Block for blocks is such an example. This program will
459# assign No_Block even in Unicode versions that didn't have it. This has the
460# benefit that code being written doesn't have to special case earlier
461# versions; and the detriment that it doesn't match the Standard precisely for
462# the affected versions.
463#
464# Here are some observations about some of the issues in early versions:
465#
6426c51b 466# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
678f13d5
KW
467# the reason is that the CJK block starting at 4E00 was removed from PropList,
468# and was not put back in until 3.1.0
469#
470# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
471# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
472# reason is that 3.2 introduced U+205F=medium math space, which was not
473# classed as white space, but Perl figured out that it should have been. 4.0
474# reclassified it correctly.
475#
476# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
477# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
478# was left with no code points, as all the ones that mapped to 202 stayed
479# mapped to 202. Thus if your program used the numeric name for the class,
480# it would not have been affected, but if it used the mnemonic, it would have
481# been.
482#
483# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
484# points which eventually came to have this script property value, instead
485# mapped to "Unknown". But in the next release all these code points were
486# moved to \p{sc=common} instead.
99870f4d
KW
487#
488# The default for missing code points for BidiClass is complicated. Starting
489# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
490# tries to do the best it can for earlier releases. It is done in
491# process_PropertyAliases()
492#
493##############################################################################
494
495my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
496 # and errors
497my $MAX_LINE_WIDTH = 78;
498
499# Debugging aid to skip most files so as to not be distracted by them when
500# concentrating on the ones being debugged. Add
501# non_skip => 1,
502# to the constructor for those files you want processed when you set this.
503# Files with a first version number of 0 are special: they are always
c12f2655
KW
504# processed regardless of the state of this flag. Generally, Jamo.txt and
505# UnicodeData.txt must not be skipped if you want this program to not die
506# before normal completion.
99870f4d
KW
507my $debug_skip = 0;
508
509# Set to 1 to enable tracing.
510our $to_trace = 0;
511
512{ # Closure for trace: debugging aid
513 my $print_caller = 1; # ? Include calling subroutine name
514 my $main_with_colon = 'main::';
515 my $main_colon_length = length($main_with_colon);
516
517 sub trace {
518 return unless $to_trace; # Do nothing if global flag not set
519
520 my @input = @_;
521
522 local $DB::trace = 0;
523 $DB::trace = 0; # Quiet 'used only once' message
524
525 my $line_number;
526
527 # Loop looking up the stack to get the first non-trace caller
528 my $caller_line;
529 my $caller_name;
530 my $i = 0;
531 do {
532 $line_number = $caller_line;
533 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
534 $caller = $main_with_colon unless defined $caller;
535
536 $caller_name = $caller;
537
538 # get rid of pkg
539 $caller_name =~ s/.*:://;
540 if (substr($caller_name, 0, $main_colon_length)
541 eq $main_with_colon)
542 {
543 $caller_name = substr($caller_name, $main_colon_length);
544 }
545
546 } until ($caller_name ne 'trace');
547
548 # If the stack was empty, we were called from the top level
549 $caller_name = 'main' if ($caller_name eq ""
550 || $caller_name eq 'trace');
551
552 my $output = "";
553 foreach my $string (@input) {
554 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
555 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
556 $output .= simple_dumper($string);
557 }
558 else {
559 $string = "$string" if ref $string;
560 $string = $UNDEF unless defined $string;
561 chomp $string;
562 $string = '""' if $string eq "";
563 $output .= " " if $output ne ""
564 && $string ne ""
565 && substr($output, -1, 1) ne " "
566 && substr($string, 0, 1) ne " ";
567 $output .= $string;
568 }
569 }
570
99f78760
KW
571 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
572 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
573 print STDERR $output, "\n";
574 return;
575 }
576}
577
578# This is for a rarely used development feature that allows you to compare two
579# versions of the Unicode standard without having to deal with changes caused
c12f2655
KW
580# by the code points introduced in the later version. Change the 0 to a
581# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
582# code points introduced in that release and earlier will be used; later ones
583# are thrown away. You use the version number of the earliest one you want to
584# compare; then run this program on directory structures containing each
585# release, and compare the outputs. These outputs will therefore include only
586# the code points common to both releases, and you can see the changes caused
587# just by the underlying release semantic changes. For versions earlier than
588# 3.2, you must copy a version of DAge.txt into the directory.
589my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
99870f4d
KW
590my $compare_versions = DEBUG
591 && $string_compare_versions
592 && pack "C*", split /\./, $string_compare_versions;
593
594sub uniques {
595 # Returns non-duplicated input values. From "Perl Best Practices:
596 # Encapsulated Cleverness". p. 455 in first edition.
597
598 my %seen;
0e407844
NC
599 # Arguably this breaks encapsulation, if the goal is to permit multiple
600 # distinct objects to stringify to the same value, and be interchangeable.
601 # However, for this program, no two objects stringify identically, and all
602 # lists passed to this function are either objects or strings. So this
603 # doesn't affect correctness, but it does give a couple of percent speedup.
604 no overloading;
99870f4d
KW
605 return grep { ! $seen{$_}++ } @_;
606}
607
608$0 = File::Spec->canonpath($0);
609
610my $make_test_script = 0; # ? Should we output a test script
611my $write_unchanged_files = 0; # ? Should we update the output files even if
612 # we don't think they have changed
613my $use_directory = ""; # ? Should we chdir somewhere.
614my $pod_directory; # input directory to store the pod file.
615my $pod_file = 'perluniprops';
616my $t_path; # Path to the .t test file
617my $file_list = 'mktables.lst'; # File to store input and output file names.
618 # This is used to speed up the build, by not
619 # executing the main body of the program if
620 # nothing on the list has changed since the
621 # previous build
622my $make_list = 1; # ? Should we write $file_list. Set to always
623 # make a list so that when the pumpking is
624 # preparing a release, s/he won't have to do
625 # special things
626my $glob_list = 0; # ? Should we try to include unknown .txt files
627 # in the input.
bd9ebcfd
KW
628my $output_range_counts = $debugging_build; # ? Should we include the number
629 # of code points in ranges in
630 # the output
558712cf 631my $annotate = 0; # ? Should character names be in the output
9ef2b94f 632
99870f4d
KW
633# Verbosity levels; 0 is quiet
634my $NORMAL_VERBOSITY = 1;
635my $PROGRESS = 2;
636my $VERBOSE = 3;
637
638my $verbosity = $NORMAL_VERBOSITY;
639
640# Process arguments
641while (@ARGV) {
cf25bb62
JH
642 my $arg = shift @ARGV;
643 if ($arg eq '-v') {
99870f4d
KW
644 $verbosity = $VERBOSE;
645 }
646 elsif ($arg eq '-p') {
647 $verbosity = $PROGRESS;
648 $| = 1; # Flush buffers as we go.
649 }
650 elsif ($arg eq '-q') {
651 $verbosity = 0;
652 }
653 elsif ($arg eq '-w') {
654 $write_unchanged_files = 1; # update the files even if havent changed
655 }
656 elsif ($arg eq '-check') {
6ae7e459
YO
657 my $this = shift @ARGV;
658 my $ok = shift @ARGV;
659 if ($this ne $ok) {
660 print "Skipping as check params are not the same.\n";
661 exit(0);
662 }
00a8df5c 663 }
99870f4d
KW
664 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
665 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
666 }
3df51b85
KW
667 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
668 {
99870f4d 669 $make_test_script = 1;
99870f4d
KW
670 }
671 elsif ($arg eq '-makelist') {
672 $make_list = 1;
673 }
674 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
675 -d $use_directory or croak "Unknown directory '$use_directory'";
676 }
677 elsif ($arg eq '-L') {
678
679 # Existence not tested until have chdir'd
680 $file_list = shift;
681 }
682 elsif ($arg eq '-globlist') {
683 $glob_list = 1;
684 }
685 elsif ($arg eq '-c') {
686 $output_range_counts = ! $output_range_counts
687 }
b4a0206c 688 elsif ($arg eq '-annotate') {
558712cf 689 $annotate = 1;
bd9ebcfd
KW
690 $debugging_build = 1;
691 $output_range_counts = 1;
9ef2b94f 692 }
99870f4d
KW
693 else {
694 my $with_c = 'with';
695 $with_c .= 'out' if $output_range_counts; # Complements the state
696 croak <<END;
697usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
698 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
699 [-check A B ]
700 -c : Output comments $with_c number of code points in ranges
701 -q : Quiet Mode: Only output serious warnings.
702 -p : Set verbosity level to normal plus show progress.
703 -v : Set Verbosity level high: Show progress and non-serious
704 warnings
705 -w : Write files regardless
706 -C dir : Change to this directory before proceeding. All relative paths
707 except those specified by the -P and -T options will be done
708 with respect to this directory.
709 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 710 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
711 -L filelist : Use alternate 'filelist' instead of standard one
712 -globlist : Take as input all non-Test *.txt files in current and sub
713 directories
3df51b85
KW
714 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
715 overrides -T
99870f4d 716 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 717 -annotate : Output an annotation for each character in the table files;
c4019d52 718 useful for debugging mktables, looking at diffs; but is slow,
b318e5e5
KW
719 memory intensive; resulting tables are usable but are slow and
720 very large (and currently fail the Unicode::UCD.t tests).
99870f4d
KW
721 -check A B : Executes $0 only if A and B are the same
722END
723 }
724}
725
726# Stores the most-recently changed file. If none have changed, can skip the
727# build
aeab6150 728my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
729
730# Change directories now, because need to read 'version' early.
731if ($use_directory) {
3df51b85 732 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
733 $pod_directory = File::Spec->rel2abs($pod_directory);
734 }
3df51b85 735 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 736 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 737 }
99870f4d 738 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 739 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 740 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 741 }
3df51b85 742 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 743 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 744 }
00a8df5c
YO
745}
746
99870f4d
KW
747# Get Unicode version into regular and v-string. This is done now because
748# various tables below get populated based on it. These tables are populated
749# here to be near the top of the file, and so easily seeable by those needing
750# to modify things.
751open my $VERSION, "<", "version"
752 or croak "$0: can't open required file 'version': $!\n";
753my $string_version = <$VERSION>;
754close $VERSION;
755chomp $string_version;
756my $v_version = pack "C*", split /\./, $string_version; # v string
757
758# The following are the complete names of properties with property values that
759# are known to not match any code points in some versions of Unicode, but that
760# may change in the future so they should be matchable, hence an empty file is
761# generated for them.
762my @tables_that_may_be_empty = (
763 'Joining_Type=Left_Joining',
764 );
765push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
766push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
767push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
768 if $v_version ge v4.1.0;
82aed44a
KW
769push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
770 if $v_version ge v6.0.0;
f583b44c
KW
771push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
772 if $v_version ge v6.1.0;
99870f4d
KW
773
774# The lists below are hashes, so the key is the item in the list, and the
775# value is the reason why it is in the list. This makes generation of
776# documentation easier.
777
778my %why_suppressed; # No file generated for these.
779
780# Files aren't generated for empty extraneous properties. This is arguable.
781# Extraneous properties generally come about because a property is no longer
782# used in a newer version of Unicode. If we generated a file without code
783# points, programs that used to work on that property will still execute
784# without errors. It just won't ever match (or will always match, with \P{}).
785# This means that the logic is now likely wrong. I (khw) think its better to
786# find this out by getting an error message. Just move them to the table
787# above to change this behavior
788my %why_suppress_if_empty_warn_if_not = (
789
790 # It is the only property that has ever officially been removed from the
791 # Standard. The database never contained any code points for it.
792 'Special_Case_Condition' => 'Obsolete',
793
794 # Apparently never official, but there were code points in some versions of
795 # old-style PropList.txt
796 'Non_Break' => 'Obsolete',
797);
798
799# These would normally go in the warn table just above, but they were changed
800# a long time before this program was written, so warnings about them are
801# moot.
802if ($v_version gt v3.2.0) {
803 push @tables_that_may_be_empty,
804 'Canonical_Combining_Class=Attached_Below_Left'
805}
806
5f7264c7 807# These are listed in the Property aliases file in 6.0, but Unihan is ignored
99870f4d
KW
808# unless explicitly added.
809if ($v_version ge v5.2.0) {
810 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 811 foreach my $table (qw (
99870f4d
KW
812 kAccountingNumeric
813 kOtherNumeric
814 kPrimaryNumeric
815 kCompatibilityVariant
816 kIICore
817 kIRG_GSource
818 kIRG_HSource
819 kIRG_JSource
820 kIRG_KPSource
821 kIRG_MSource
822 kIRG_KSource
823 kIRG_TSource
824 kIRG_USource
825 kIRG_VSource
826 kRSUnicode
ea25a9b2 827 ))
99870f4d
KW
828 {
829 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
830 }
ca12659b
NC
831}
832
272501f6
KW
833# Enum values for to_output_map() method in the Map_Table package.
834my $EXTERNAL_MAP = 1;
835my $INTERNAL_MAP = 2;
836
fcf1973c
KW
837# To override computed values for writing the map tables for these properties.
838# The default for enum map tables is to write them out, so that the Unicode
839# .txt files can be removed, but all the data to compute any property value
840# for any code point is available in a more compact form.
841my %global_to_output_map = (
842 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
c12f2655
KW
843 # get stuck supporting it if things change. Since it is a STRING
844 # property, it normally would be listed in the pod, but INTERNAL_MAP
845 # suppresses that.
fcf1973c
KW
846 Unicode_1_Name => $INTERNAL_MAP,
847
848 Present_In => 0, # Suppress, as easily computed from Age
fcf1973c 849 Block => 0, # Suppress, as Blocks.txt is retained.
53d34b6c
KW
850
851 # Suppress, as mapping can be found instead from the
852 # Perl_Decomposition_Mapping file
853 Decomposition_Type => 0,
fcf1973c
KW
854);
855
99870f4d 856# Properties that this program ignores.
230e0c16
KW
857my @unimplemented_properties;
858
859# With this release, it is automatically handled if the Unihan db is
860# downloaded
861push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
d73e5302 862
99870f4d
KW
863# There are several types of obsolete properties defined by Unicode. These
864# must be hand-edited for every new Unicode release.
865my %why_deprecated; # Generates a deprecated warning message if used.
866my %why_stabilized; # Documentation only
867my %why_obsolete; # Documentation only
868
869{ # Closure
870 my $simple = 'Perl uses the more complete version of this property';
871 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
872
873 my $other_properties = 'other properties';
874 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 875 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
876
877 %why_deprecated = (
5f7264c7 878 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
879 'Jamo_Short_Name' => $contributory,
880 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
881 'Other_Alphabetic' => $contributory,
882 'Other_Default_Ignorable_Code_Point' => $contributory,
883 'Other_Grapheme_Extend' => $contributory,
884 'Other_ID_Continue' => $contributory,
885 'Other_ID_Start' => $contributory,
886 'Other_Lowercase' => $contributory,
887 'Other_Math' => $contributory,
888 'Other_Uppercase' => $contributory,
e22aaf5c
KW
889 'Expands_On_NFC' => $why_no_expand,
890 'Expands_On_NFD' => $why_no_expand,
891 'Expands_On_NFKC' => $why_no_expand,
892 'Expands_On_NFKD' => $why_no_expand,
99870f4d
KW
893 );
894
895 %why_suppressed = (
5f7264c7 896 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
897 # contains the same information, but without the algorithmically
898 # determinable Hangul syllables'. This file is not published, so it's
899 # existence is not noted in the comment.
e0b29447 900 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
99870f4d 901
3111abc0
KW
902 'Indic_Matra_Category' => "Provisional",
903 'Indic_Syllabic_Category' => "Provisional",
904
5f8d1a89
KW
905 # Don't suppress ISO_Comment, as otherwise special handling is needed
906 # to differentiate between it and gc=c, which can be written as 'isc',
907 # which is the same characters as ISO_Comment's short name.
99870f4d 908
fbb93542 909 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
e0b29447
KW
910
911 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
912 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
913 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
914 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
99870f4d 915
5f7264c7 916 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
917 );
918
1704a0ea
KW
919 foreach my $property (
920
921 # The following are suppressed because they were made contributory
922 # or deprecated by Unicode before Perl ever thought about
923 # supporting them.
924 'Jamo_Short_Name',
925 'Grapheme_Link',
926 'Expands_On_NFC',
927 'Expands_On_NFD',
928 'Expands_On_NFKC',
929 'Expands_On_NFKD',
930
931 # The following are suppressed because they have been marked
932 # as deprecated for a sufficient amount of time
933 'Other_Alphabetic',
934 'Other_Default_Ignorable_Code_Point',
935 'Other_Grapheme_Extend',
936 'Other_ID_Continue',
937 'Other_ID_Start',
938 'Other_Lowercase',
939 'Other_Math',
940 'Other_Uppercase',
e22aaf5c 941 ) {
99870f4d
KW
942 $why_suppressed{$property} = $why_deprecated{$property};
943 }
cf25bb62 944
99870f4d
KW
945 # Customize the message for all the 'Other_' properties
946 foreach my $property (keys %why_deprecated) {
947 next if (my $main_property = $property) !~ s/^Other_//;
948 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
949 }
950}
951
952if ($v_version ge 4.0.0) {
953 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
954 if ($v_version ge 6.0.0) {
955 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
956 }
99870f4d 957}
5f7264c7 958if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 959 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7 960 if ($v_version ge 6.0.0) {
63f74647 961 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
5f7264c7 962 }
99870f4d
KW
963}
964
965# Probably obsolete forever
966if ($v_version ge v4.1.0) {
82aed44a
KW
967 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
968}
969if ($v_version ge v6.0.0) {
2b352efd
KW
970 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
971 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
99870f4d
KW
972}
973
974# This program can create files for enumerated-like properties, such as
975# 'Numeric_Type'. This file would be the same format as for a string
976# property, with a mapping from code point to its value, so you could look up,
977# for example, the script a code point is in. But no one so far wants this
978# mapping, or they have found another way to get it since this is a new
979# feature. So no file is generated except if it is in this list.
980my @output_mapped_properties = split "\n", <<END;
981END
982
c12f2655
KW
983# If you are using the Unihan database in a Unicode version before 5.2, you
984# need to add the properties that you want to extract from it to this table.
985# For your convenience, the properties in the 6.0 PropertyAliases.txt file are
986# listed, commented out
99870f4d
KW
987my @cjk_properties = split "\n", <<'END';
988#cjkAccountingNumeric; kAccountingNumeric
989#cjkOtherNumeric; kOtherNumeric
990#cjkPrimaryNumeric; kPrimaryNumeric
991#cjkCompatibilityVariant; kCompatibilityVariant
992#cjkIICore ; kIICore
993#cjkIRG_GSource; kIRG_GSource
994#cjkIRG_HSource; kIRG_HSource
995#cjkIRG_JSource; kIRG_JSource
996#cjkIRG_KPSource; kIRG_KPSource
997#cjkIRG_KSource; kIRG_KSource
998#cjkIRG_TSource; kIRG_TSource
999#cjkIRG_USource; kIRG_USource
1000#cjkIRG_VSource; kIRG_VSource
1001#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1002END
1003
1004# Similarly for the property values. For your convenience, the lines in the
5f7264c7 1005# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
c12f2655 1006# '#' marks (for Unicode versions before 5.2)
99870f4d
KW
1007my @cjk_property_values = split "\n", <<'END';
1008## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1009## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1010## @missing: 0000..10FFFF; cjkIICore; <none>
1011## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1012## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1013## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1014## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1015## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1016## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1017## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1018## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1019## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1020## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1021## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1022END
1023
1024# The input files don't list every code point. Those not listed are to be
1025# defaulted to some value. Below are hard-coded what those values are for
1026# non-binary properties as of 5.1. Starting in 5.0, there are
1027# machine-parsable comment lines in the files the give the defaults; so this
1028# list shouldn't have to be extended. The claim is that all missing entries
1029# for binary properties will default to 'N'. Unicode tried to change that in
1030# 5.2, but the beta period produced enough protest that they backed off.
1031#
1032# The defaults for the fields that appear in UnicodeData.txt in this hash must
1033# be in the form that it expects. The others may be synonyms.
1034my $CODE_POINT = '<code point>';
1035my %default_mapping = (
1036 Age => "Unassigned",
1037 # Bidi_Class => Complicated; set in code
1038 Bidi_Mirroring_Glyph => "",
1039 Block => 'No_Block',
1040 Canonical_Combining_Class => 0,
1041 Case_Folding => $CODE_POINT,
1042 Decomposition_Mapping => $CODE_POINT,
1043 Decomposition_Type => 'None',
1044 East_Asian_Width => "Neutral",
1045 FC_NFKC_Closure => $CODE_POINT,
1046 General_Category => 'Cn',
1047 Grapheme_Cluster_Break => 'Other',
1048 Hangul_Syllable_Type => 'NA',
1049 ISO_Comment => "",
1050 Jamo_Short_Name => "",
1051 Joining_Group => "No_Joining_Group",
1052 # Joining_Type => Complicated; set in code
1053 kIICore => 'N', # Is converted to binary
1054 #Line_Break => Complicated; set in code
1055 Lowercase_Mapping => $CODE_POINT,
1056 Name => "",
1057 Name_Alias => "",
1058 NFC_QC => 'Yes',
1059 NFD_QC => 'Yes',
1060 NFKC_QC => 'Yes',
1061 NFKD_QC => 'Yes',
1062 Numeric_Type => 'None',
1063 Numeric_Value => 'NaN',
1064 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1065 Sentence_Break => 'Other',
1066 Simple_Case_Folding => $CODE_POINT,
1067 Simple_Lowercase_Mapping => $CODE_POINT,
1068 Simple_Titlecase_Mapping => $CODE_POINT,
1069 Simple_Uppercase_Mapping => $CODE_POINT,
1070 Titlecase_Mapping => $CODE_POINT,
1071 Unicode_1_Name => "",
1072 Unicode_Radical_Stroke => "",
1073 Uppercase_Mapping => $CODE_POINT,
1074 Word_Break => 'Other',
1075);
1076
1077# Below are files that Unicode furnishes, but this program ignores, and why
1078my %ignored_files = (
73ba1144
KW
1079 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1080 'Index.txt' => 'Alphabetical index of Unicode characters',
1081 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1082 'NamesList.txt' => 'Annotated list of characters',
1083 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1084 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1085 'ReadMe.txt' => 'Documentation',
1086 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1087 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
73ba1144
KW
1088 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1089 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1090 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1091 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
99870f4d
KW
1092);
1093
1fec9f60
KW
1094my %skipped_files; # List of files that we skip
1095
678f13d5 1096### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1097
1098my $HEADER=<<"EOF";
1099# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1100# This file is machine-generated by $0 from the Unicode
1101# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1102EOF
1103
126c3d4e 1104my $INTERNAL_ONLY_HEADER = <<"EOF";
99870f4d
KW
1105
1106# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
fac53429
KW
1107# This file is for internal use by core Perl only. The format and even the
1108# name or existence of this file are subject to change without notice. Don't
1109# use it directly.
99870f4d
KW
1110EOF
1111
1112my $DEVELOPMENT_ONLY=<<"EOF";
1113# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1114# This file contains information artificially constrained to code points
1115# present in Unicode release $string_compare_versions.
1116# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1117# not be used for production.
b6922eda
KW
1118
1119EOF
1120
6189eadc
KW
1121my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1122my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1123my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
99870f4d
KW
1124
1125# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1126# two must be 10; if there are 5, the first must not be a 0. Written this way
92199589
KW
1127# to decrease backtracking. The first regex allows the code point to be at
1128# the end of a word, but to work properly, the word shouldn't end with a valid
1129# hex character. The second one won't match a code point at the end of a
1130# word, and doesn't have the run-on issue
8c32d378
KW
1131my $run_on_code_point_re =
1132 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1133my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1134
1135# This matches the beginning of the line in the Unicode db files that give the
1136# defaults for code points not listed (i.e., missing) in the file. The code
1137# depends on this ending with a semi-colon, so it can assume it is a valid
1138# field when the line is split() by semi-colons
1139my $missing_defaults_prefix =
6189eadc 1140 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
99870f4d
KW
1141
1142# Property types. Unicode has more types, but these are sufficient for our
1143# purposes.
1144my $UNKNOWN = -1; # initialized to illegal value
1145my $NON_STRING = 1; # Either binary or enum
1146my $BINARY = 2;
06f26c45
KW
1147my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1148 # tables, additional true and false tables are
1149 # generated so that false is anything matching the
1150 # default value, and true is everything else.
1151my $ENUM = 4; # Include catalog
1152my $STRING = 5; # Anything else: string or misc
99870f4d
KW
1153
1154# Some input files have lines that give default values for code points not
1155# contained in the file. Sometimes these should be ignored.
1156my $NO_DEFAULTS = 0; # Must evaluate to false
1157my $NOT_IGNORED = 1;
1158my $IGNORED = 2;
1159
1160# Range types. Each range has a type. Most ranges are type 0, for normal,
1161# and will appear in the main body of the tables in the output files, but
1162# there are other types of ranges as well, listed below, that are specially
1163# handled. There are pseudo-types as well that will never be stored as a
1164# type, but will affect the calculation of the type.
1165
1166# 0 is for normal, non-specials
1167my $MULTI_CP = 1; # Sequence of more than code point
1168my $HANGUL_SYLLABLE = 2;
1169my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1170my $NULL = 4; # The map is to the null string; utf8.c can't
1171 # handle these, nor is there an accepted syntax
1172 # for them in \p{} constructs
f86864ac 1173my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1174 # otherwise be $MULTI_CP type are instead type 0
1175
1176# process_generic_property_file() can accept certain overrides in its input.
1177# Each of these must begin AND end with $CMD_DELIM.
1178my $CMD_DELIM = "\a";
1179my $REPLACE_CMD = 'replace'; # Override the Replace
1180my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1181
1182my $NO = 0;
1183my $YES = 1;
1184
1185# Values for the Replace argument to add_range.
1186# $NO # Don't replace; add only the code points not
1187 # already present.
1188my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1189 # the comments at the subroutine definition.
1190my $UNCONDITIONALLY = 2; # Replace without conditions.
9470941f 1191my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
99870f4d 1192 # already there
7f4b1e25
KW
1193my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1194 # already there
1195my $CROAK = 6; # Die with an error if is already there
99870f4d
KW
1196
1197# Flags to give property statuses. The phrases are to remind maintainers that
1198# if the flag is changed, the indefinite article referring to it in the
1199# documentation may need to be as well.
1200my $NORMAL = "";
99870f4d
KW
1201my $DEPRECATED = 'D';
1202my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1203my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1204my $DISCOURAGED = 'X';
1205my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1206my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1207my $STRICTER = 'T';
1208my $a_bold_stricter = "a 'B<$STRICTER>'";
1209my $A_bold_stricter = "A 'B<$STRICTER>'";
1210my $STABILIZED = 'S';
1211my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1212my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1213my $OBSOLETE = 'O';
1214my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1215my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1216
1217my %status_past_participles = (
1218 $DISCOURAGED => 'discouraged',
99870f4d
KW
1219 $STABILIZED => 'stabilized',
1220 $OBSOLETE => 'obsolete',
37e2e78e 1221 $DEPRECATED => 'deprecated',
99870f4d
KW
1222);
1223
395dfc19
KW
1224# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1225# externally documented.
301ba948 1226my $ORDINARY = 0; # The normal fate.
395dfc19
KW
1227my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1228 # but there is a file written that can be used to
1229 # reconstruct this table
301ba948
KW
1230my $SUPPRESSED = 3; # The file for this table is not written out.
1231my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
1232 # for Perl's internal use only
1233my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
1234 # Unicode version that doesn't have it, but we need it
1235 # to be defined, if empty, to have things work.
1236 # Implies no pod entry generated
1237
f5817e0a
KW
1238# The format of the values of the tables:
1239my $EMPTY_FORMAT = "";
99870f4d
KW
1240my $BINARY_FORMAT = 'b';
1241my $DECIMAL_FORMAT = 'd';
1242my $FLOAT_FORMAT = 'f';
1243my $INTEGER_FORMAT = 'i';
1244my $HEX_FORMAT = 'x';
1245my $RATIONAL_FORMAT = 'r';
1246my $STRING_FORMAT = 's';
a14f3cb1 1247my $DECOMP_STRING_FORMAT = 'c';
c3ff2976 1248my $STRING_WHITE_SPACE_LIST = 'sw';
99870f4d
KW
1249
1250my %map_table_formats = (
1251 $BINARY_FORMAT => 'binary',
1252 $DECIMAL_FORMAT => 'single decimal digit',
1253 $FLOAT_FORMAT => 'floating point number',
1254 $INTEGER_FORMAT => 'integer',
add63c13 1255 $HEX_FORMAT => 'non-negative hex whole number; a code point',
99870f4d 1256 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1257 $STRING_FORMAT => 'string',
92f9d56c 1258 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
c3ff2976 1259 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
99870f4d
KW
1260);
1261
1262# Unicode didn't put such derived files in a separate directory at first.
1263my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1264my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1265my $AUXILIARY = 'auxiliary';
1266
1267# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
9e4a1e86 1268# and into UCD.pl for the use of UCD.pm
99870f4d
KW
1269my %loose_to_file_of; # loosely maps table names to their respective
1270 # files
1271my %stricter_to_file_of; # same; but for stricter mapping.
315bfd4e 1272my %loose_property_to_file_of; # Maps a loose property name to its map file
89cf10cc
KW
1273my %file_to_swash_name; # Maps the file name to its corresponding key name
1274 # in the hash %utf8::SwashInfo
99870f4d
KW
1275my %nv_floating_to_rational; # maps numeric values floating point numbers to
1276 # their rational equivalent
c12f2655
KW
1277my %loose_property_name_of; # Loosely maps (non_string) property names to
1278 # standard form
86a52d1e 1279my %string_property_loose_to_name; # Same, for string properties.
c15fda25
KW
1280my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1281 # the property name in standard loose form, and
1282 # 'value' is the default value for that property,
1283 # also in standard loose form.
9e4a1e86
KW
1284my %loose_to_standard_value; # loosely maps table names to the canonical
1285 # alias for them
2df7880f
KW
1286my %ambiguous_names; # keys are alias names (in standard form) that
1287 # have more than one possible meaning.
5d1df013
KW
1288my %prop_aliases; # Keys are standard property name; values are each
1289 # one's aliases
1e863613
KW
1290my %prop_value_aliases; # Keys of top level are standard property name;
1291 # values are keys to another hash, Each one is
1292 # one of the property's values, in standard form.
1293 # The values are that prop-val's aliases.
2df7880f 1294my %ucd_pod; # Holds entries that will go into the UCD section of the pod
99870f4d 1295
d867ccfb
KW
1296# Most properties are immune to caseless matching, otherwise you would get
1297# nonsensical results, as properties are a function of a code point, not
1298# everything that is caselessly equivalent to that code point. For example,
1299# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1300# be true because 's' and 'S' are equivalent caselessly. However,
1301# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1302# extend that concept to those very few properties that are like this. Each
1303# such property will match the full range caselessly. They are hard-coded in
1304# the program; it's not worth trying to make it general as it's extremely
1305# unlikely that they will ever change.
1306my %caseless_equivalent_to;
1307
99870f4d
KW
1308# These constants names and values were taken from the Unicode standard,
1309# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1310# syllables. The '_string' versions are so generated tables can retain the
1311# hex format, which is the more familiar value
1312my $SBase_string = "0xAC00";
1313my $SBase = CORE::hex $SBase_string;
1314my $LBase_string = "0x1100";
1315my $LBase = CORE::hex $LBase_string;
1316my $VBase_string = "0x1161";
1317my $VBase = CORE::hex $VBase_string;
1318my $TBase_string = "0x11A7";
1319my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1320my $SCount = 11172;
1321my $LCount = 19;
1322my $VCount = 21;
1323my $TCount = 28;
1324my $NCount = $VCount * $TCount;
1325
1326# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1327# with the above published constants.
1328my %Jamo;
1329my %Jamo_L; # Leading consonants
1330my %Jamo_V; # Vowels
1331my %Jamo_T; # Trailing consonants
1332
bb1dd3da
KW
1333# For code points whose name contains its ordinal as a '-ABCD' suffix.
1334# The key is the base name of the code point, and the value is an
1335# array giving all the ranges that use this base name. Each range
1336# is actually a hash giving the 'low' and 'high' values of it.
1337my %names_ending_in_code_point;
1338my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1339 # removed from the names
1340# Inverse mapping. The list of ranges that have these kinds of
1341# names. Each element contains the low, high, and base names in an
1342# anonymous hash.
1343my @code_points_ending_in_code_point;
1344
1345# Boolean: does this Unicode version have the hangul syllables, and are we
1346# writing out a table for them?
1347my $has_hangul_syllables = 0;
1348
1349# Does this Unicode version have code points whose names end in their
1350# respective code points, and are we writing out a table for them? 0 for no;
1351# otherwise points to first property that a table is needed for them, so that
1352# if multiple tables are needed, we don't create duplicates
1353my $needing_code_points_ending_in_code_point = 0;
1354
37e2e78e 1355my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1356my @unhandled_properties; # Will contain a list of properties found in
1357 # the input that we didn't process.
f86864ac 1358my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1359 # listed in the pod
1360my @map_properties; # Properties that get map files written
1361my @named_sequences; # NamedSequences.txt contents.
1362my %potential_files; # Generated list of all .txt files in the directory
1363 # structure so we can warn if something is being
1364 # ignored.
1365my @files_actually_output; # List of files we generated.
1366my @more_Names; # Some code point names are compound; this is used
1367 # to store the extra components of them.
1368my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1369 # the minimum before we consider it equivalent to a
1370 # candidate rational
1371my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1372
1373# These store references to certain commonly used property objects
1374my $gc;
1375my $perl;
1376my $block;
3e20195b
KW
1377my $perl_charname;
1378my $print;
7fc6cb55 1379my $Any;
359523e2 1380my $script;
99870f4d
KW
1381
1382# Are there conflicting names because of beginning with 'In_', or 'Is_'
1383my $has_In_conflicts = 0;
1384my $has_Is_conflicts = 0;
1385
1386sub internal_file_to_platform ($) {
1387 # Convert our file paths which have '/' separators to those of the
1388 # platform.
1389
1390 my $file = shift;
1391 return undef unless defined $file;
1392
1393 return File::Spec->join(split '/', $file);
d07a55ed 1394}
5beb625e 1395
99870f4d
KW
1396sub file_exists ($) { # platform independent '-e'. This program internally
1397 # uses slash as a path separator.
1398 my $file = shift;
1399 return 0 if ! defined $file;
1400 return -e internal_file_to_platform($file);
1401}
5beb625e 1402
99870f4d 1403sub objaddr($) {
23e33b60
KW
1404 # Returns the address of the blessed input object.
1405 # It doesn't check for blessedness because that would do a string eval
1406 # every call, and the program is structured so that this is never called
1407 # for a non-blessed object.
99870f4d 1408
23e33b60 1409 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1410
1411 # Numifying a ref gives its address.
051df77b 1412 return pack 'J', $_[0];
99870f4d
KW
1413}
1414
558712cf 1415# These are used only if $annotate is true.
c4019d52
KW
1416# The entire range of Unicode characters is examined to populate these
1417# after all the input has been processed. But most can be skipped, as they
1418# have the same descriptive phrases, such as being unassigned
1419my @viacode; # Contains the 1 million character names
1420my @printable; # boolean: And are those characters printable?
1421my @annotate_char_type; # Contains a type of those characters, specifically
1422 # for the purposes of annotation.
1423my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1424 # name for the purposes of annotation. They map to the
c4019d52
KW
1425 # upper edge of the range, so that the end point can
1426 # be immediately found. This is used to skip ahead to
1427 # the end of a range, and avoid processing each
1428 # individual code point in it.
1429my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1430 # characters, but excluding those which are
1431 # also noncharacter code points
1432
1433# The annotation types are an extension of the regular range types, though
1434# some of the latter are folded into one. Make the new types negative to
1435# avoid conflicting with the regular types
1436my $SURROGATE_TYPE = -1;
1437my $UNASSIGNED_TYPE = -2;
1438my $PRIVATE_USE_TYPE = -3;
1439my $NONCHARACTER_TYPE = -4;
1440my $CONTROL_TYPE = -5;
1441my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1442
1443sub populate_char_info ($) {
558712cf 1444 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1445 # input code point's info that are needed for outputting more detailed
1446 # comments. If calling context wants a return, it is the end point of
1447 # any contiguous range of characters that share essentially the same info
1448
1449 my $i = shift;
1450 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1451
1452 $viacode[$i] = $perl_charname->value_of($i) || "";
1453
1454 # A character is generally printable if Unicode says it is,
1455 # but below we make sure that most Unicode general category 'C' types
1456 # aren't.
1457 $printable[$i] = $print->contains($i);
1458
1459 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1460
1461 # Only these two regular types are treated specially for annotations
1462 # purposes
1463 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1464 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1465
1466 # Give a generic name to all code points that don't have a real name.
1467 # We output ranges, if applicable, for these. Also calculate the end
1468 # point of the range.
1469 my $end;
1470 if (! $viacode[$i]) {
1471 if ($gc-> table('Surrogate')->contains($i)) {
1472 $viacode[$i] = 'Surrogate';
1473 $annotate_char_type[$i] = $SURROGATE_TYPE;
1474 $printable[$i] = 0;
1475 $end = $gc->table('Surrogate')->containing_range($i)->end;
1476 }
1477 elsif ($gc-> table('Private_use')->contains($i)) {
1478 $viacode[$i] = 'Private Use';
1479 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1480 $printable[$i] = 0;
1481 $end = $gc->table('Private_Use')->containing_range($i)->end;
1482 }
1483 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1484 contains($i))
1485 {
1486 $viacode[$i] = 'Noncharacter';
1487 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1488 $printable[$i] = 0;
1489 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1490 containing_range($i)->end;
1491 }
1492 elsif ($gc-> table('Control')->contains($i)) {
1493 $viacode[$i] = 'Control';
1494 $annotate_char_type[$i] = $CONTROL_TYPE;
1495 $printable[$i] = 0;
1496 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1497 }
1498 elsif ($gc-> table('Unassigned')->contains($i)) {
1499 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1500 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1501 $printable[$i] = 0;
1502
1503 # Because we name the unassigned by the blocks they are in, it
1504 # can't go past the end of that block, and it also can't go past
1505 # the unassigned range it is in. The special table makes sure
1506 # that the non-characters, which are unassigned, are separated
1507 # out.
1508 $end = min($block->containing_range($i)->end,
1509 $unassigned_sans_noncharacters-> containing_range($i)->
1510 end);
13ca76ff
KW
1511 }
1512 else {
1513 Carp::my_carp_bug("Can't figure out how to annotate "
1514 . sprintf("U+%04X", $i)
1515 . ". Proceeding anyway.");
c4019d52
KW
1516 $viacode[$i] = 'UNKNOWN';
1517 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1518 $printable[$i] = 0;
1519 }
1520 }
1521
1522 # Here, has a name, but if it's one in which the code point number is
1523 # appended to the name, do that.
1524 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1525 $viacode[$i] .= sprintf("-%04X", $i);
1526 $end = $perl_charname->containing_range($i)->end;
1527 }
1528
1529 # And here, has a name, but if it's a hangul syllable one, replace it with
1530 # the correct name from the Unicode algorithm
1531 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1532 use integer;
1533 my $SIndex = $i - $SBase;
1534 my $L = $LBase + $SIndex / $NCount;
1535 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1536 my $T = $TBase + $SIndex % $TCount;
1537 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1538 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1539 $end = $perl_charname->containing_range($i)->end;
1540 }
1541
1542 return if ! defined wantarray;
1543 return $i if ! defined $end; # If not a range, return the input
1544
1545 # Save this whole range so can find the end point quickly
1546 $annotate_ranges->add_map($i, $end, $end);
1547
1548 return $end;
1549}
1550
23e33b60
KW
1551# Commented code below should work on Perl 5.8.
1552## This 'require' doesn't necessarily work in miniperl, and even if it does,
1553## the native perl version of it (which is what would operate under miniperl)
1554## is extremely slow, as it does a string eval every call.
1555#my $has_fast_scalar_util = $\18 !~ /miniperl/
1556# && defined eval "require Scalar::Util";
1557#
1558#sub objaddr($) {
1559# # Returns the address of the blessed input object. Uses the XS version if
1560# # available. It doesn't check for blessedness because that would do a
1561# # string eval every call, and the program is structured so that this is
1562# # never called for a non-blessed object.
1563#
1564# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1565#
1566# # Check at least that is a ref.
1567# my $pkg = ref($_[0]) or return undef;
1568#
1569# # Change to a fake package to defeat any overloaded stringify
1570# bless $_[0], 'main::Fake';
1571#
1572# # Numifying a ref gives its address.
051df77b 1573# my $addr = pack 'J', $_[0];
23e33b60
KW
1574#
1575# # Return to original class
1576# bless $_[0], $pkg;
1577# return $addr;
1578#}
1579
99870f4d
KW
1580sub max ($$) {
1581 my $a = shift;
1582 my $b = shift;
1583 return $a if $a >= $b;
1584 return $b;
1585}
1586
1587sub min ($$) {
1588 my $a = shift;
1589 my $b = shift;
1590 return $a if $a <= $b;
1591 return $b;
1592}
1593
1594sub clarify_number ($) {
1595 # This returns the input number with underscores inserted every 3 digits
1596 # in large (5 digits or more) numbers. Input must be entirely digits, not
1597 # checked.
1598
1599 my $number = shift;
1600 my $pos = length($number) - 3;
1601 return $number if $pos <= 1;
1602 while ($pos > 0) {
1603 substr($number, $pos, 0) = '_';
1604 $pos -= 3;
5beb625e 1605 }
99870f4d 1606 return $number;
99598c8c
JH
1607}
1608
12ac2576 1609
99870f4d 1610package Carp;
7ebf06b3 1611
99870f4d
KW
1612# These routines give a uniform treatment of messages in this program. They
1613# are placed in the Carp package to cause the stack trace to not include them,
1614# although an alternative would be to use another package and set @CARP_NOT
1615# for it.
12ac2576 1616
99870f4d 1617our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1618
99f78760
KW
1619# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1620# and overload trying to load Scalar:Util under miniperl. See
1621# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1622undef $overload::VERSION;
1623
99870f4d
KW
1624sub my_carp {
1625 my $message = shift || "";
1626 my $nofold = shift || 0;
7ebf06b3 1627
99870f4d
KW
1628 if ($message) {
1629 $message = main::join_lines($message);
1630 $message =~ s/^$0: *//; # Remove initial program name
1631 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1632 $message = "\n$0: $message;";
12ac2576 1633
99870f4d
KW
1634 # Fold the message with program name, semi-colon end punctuation
1635 # (which looks good with the message that carp appends to it), and a
1636 # hanging indent for continuation lines.
1637 $message = main::simple_fold($message, "", 4) unless $nofold;
1638 $message =~ s/\n$//; # Remove the trailing nl so what carp
1639 # appends is to the same line
1640 }
12ac2576 1641
99870f4d 1642 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1643
99870f4d
KW
1644 carp $message;
1645 return;
1646}
7ebf06b3 1647
99870f4d
KW
1648sub my_carp_bug {
1649 # This is called when it is clear that the problem is caused by a bug in
1650 # this program.
7ebf06b3 1651
99870f4d
KW
1652 my $message = shift;
1653 $message =~ s/^$0: *//;
1654 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1655 carp $message;
1656 return;
1657}
7ebf06b3 1658
99870f4d
KW
1659sub carp_too_few_args {
1660 if (@_ != 2) {
1661 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1662 return;
12ac2576 1663 }
7ebf06b3 1664
99870f4d
KW
1665 my $args_ref = shift;
1666 my $count = shift;
7ebf06b3 1667
99870f4d
KW
1668 my_carp_bug("Need at least $count arguments to "
1669 . (caller 1)[3]
1670 . ". Instead got: '"
1671 . join ', ', @$args_ref
1672 . "'. No action taken.");
1673 return;
12ac2576
JP
1674}
1675
99870f4d
KW
1676sub carp_extra_args {
1677 my $args_ref = shift;
1678 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1679
99870f4d
KW
1680 unless (ref $args_ref) {
1681 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1682 return;
1683 }
1684 my ($package, $file, $line) = caller;
1685 my $subroutine = (caller 1)[3];
cf25bb62 1686
99870f4d
KW
1687 my $list;
1688 if (ref $args_ref eq 'HASH') {
1689 foreach my $key (keys %$args_ref) {
1690 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1691 }
99870f4d 1692 $list = join ', ', each %{$args_ref};
cf25bb62 1693 }
99870f4d
KW
1694 elsif (ref $args_ref eq 'ARRAY') {
1695 foreach my $arg (@$args_ref) {
1696 $arg = $UNDEF unless defined $arg;
1697 }
1698 $list = join ', ', @$args_ref;
1699 }
1700 else {
1701 my_carp_bug("Can't cope with ref "
1702 . ref($args_ref)
1703 . " . argument to 'carp_extra_args'. Not checking arguments.");
1704 return;
1705 }
1706
1707 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1708 return;
d73e5302
JH
1709}
1710
99870f4d
KW
1711package main;
1712
1713{ # Closure
1714
1715 # This program uses the inside-out method for objects, as recommended in
1716 # "Perl Best Practices". This closure aids in generating those. There
1717 # are two routines. setup_package() is called once per package to set
1718 # things up, and then set_access() is called for each hash representing a
1719 # field in the object. These routines arrange for the object to be
1720 # properly destroyed when no longer used, and for standard accessor
1721 # functions to be generated. If you need more complex accessors, just
1722 # write your own and leave those accesses out of the call to set_access().
1723 # More details below.
1724
1725 my %constructor_fields; # fields that are to be used in constructors; see
1726 # below
1727
1728 # The values of this hash will be the package names as keys to other
1729 # hashes containing the name of each field in the package as keys, and
1730 # references to their respective hashes as values.
1731 my %package_fields;
1732
1733 sub setup_package {
1734 # Sets up the package, creating standard DESTROY and dump methods
1735 # (unless already defined). The dump method is used in debugging by
1736 # simple_dumper().
1737 # The optional parameters are:
1738 # a) a reference to a hash, that gets populated by later
1739 # set_access() calls with one of the accesses being
1740 # 'constructor'. The caller can then refer to this, but it is
1741 # not otherwise used by these two routines.
1742 # b) a reference to a callback routine to call during destruction
1743 # of the object, before any fields are actually destroyed
1744
1745 my %args = @_;
1746 my $constructor_ref = delete $args{'Constructor_Fields'};
1747 my $destroy_callback = delete $args{'Destroy_Callback'};
1748 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1749
1750 my %fields;
1751 my $package = (caller)[0];
1752
1753 $package_fields{$package} = \%fields;
1754 $constructor_fields{$package} = $constructor_ref;
1755
1756 unless ($package->can('DESTROY')) {
1757 my $destroy_name = "${package}::DESTROY";
1758 no strict "refs";
1759
1760 # Use typeglob to give the anonymous subroutine the name we want
1761 *$destroy_name = sub {
1762 my $self = shift;
ffe43484 1763 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1764
1765 $self->$destroy_callback if $destroy_callback;
1766 foreach my $field (keys %{$package_fields{$package}}) {
1767 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1768 delete $package_fields{$package}{$field}{$addr};
1769 }
1770 return;
1771 }
1772 }
1773
1774 unless ($package->can('dump')) {
1775 my $dump_name = "${package}::dump";
1776 no strict "refs";
1777 *$dump_name = sub {
1778 my $self = shift;
1779 return dump_inside_out($self, $package_fields{$package}, @_);
1780 }
1781 }
1782 return;
1783 }
1784
1785 sub set_access {
1786 # Arrange for the input field to be garbage collected when no longer
1787 # needed. Also, creates standard accessor functions for the field
1788 # based on the optional parameters-- none if none of these parameters:
1789 # 'addable' creates an 'add_NAME()' accessor function.
1790 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1791 # function.
1792 # 'settable' creates a 'set_NAME()' accessor function.
1793 # 'constructor' doesn't create an accessor function, but adds the
1794 # field to the hash that was previously passed to
1795 # setup_package();
1796 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1797 # 'add' etc. all mean 'addable'.
1798 # The read accessor function will work on both array and scalar
1799 # values. If another accessor in the parameter list is 'a', the read
1800 # access assumes an array. You can also force it to be array access
1801 # by specifying 'readable_array' instead of 'readable'
1802 #
1803 # A sort-of 'protected' access can be set-up by preceding the addable,
1804 # readable or settable with some initial portion of 'protected_' (but,
1805 # the underscore is required), like 'p_a', 'pro_set', etc. The
1806 # "protection" is only by convention. All that happens is that the
1807 # accessor functions' names begin with an underscore. So instead of
1808 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1809 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1810 # end of each package, and then storing the __LINE__ ranges and
1811 # checking them on every accessor. But that is way overkill.)
1812
1813 # We create anonymous subroutines as the accessors and then use
1814 # typeglobs to assign them to the proper package and name
1815
1816 my $name = shift; # Name of the field
1817 my $field = shift; # Reference to the inside-out hash containing the
1818 # field
1819
1820 my $package = (caller)[0];
1821
1822 if (! exists $package_fields{$package}) {
1823 croak "$0: Must call 'setup_package' before 'set_access'";
1824 }
d73e5302 1825
99870f4d
KW
1826 # Stash the field so DESTROY can get it.
1827 $package_fields{$package}{$name} = $field;
cf25bb62 1828
99870f4d
KW
1829 # Remaining arguments are the accessors. For each...
1830 foreach my $access (@_) {
1831 my $access = lc $access;
cf25bb62 1832
99870f4d 1833 my $protected = "";
cf25bb62 1834
99870f4d
KW
1835 # Match the input as far as it goes.
1836 if ($access =~ /^(p[^_]*)_/) {
1837 $protected = $1;
1838 if (substr('protected_', 0, length $protected)
1839 eq $protected)
1840 {
1841
1842 # Add 1 for the underscore not included in $protected
1843 $access = substr($access, length($protected) + 1);
1844 $protected = '_';
1845 }
1846 else {
1847 $protected = "";
1848 }
1849 }
1850
1851 if (substr('addable', 0, length $access) eq $access) {
1852 my $subname = "${package}::${protected}add_$name";
1853 no strict "refs";
1854
1855 # add_ accessor. Don't add if already there, which we
1856 # determine using 'eq' for scalars and '==' otherwise.
1857 *$subname = sub {
1858 use strict "refs";
1859 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1860 my $self = shift;
1861 my $value = shift;
ffe43484 1862 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1863 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1864 if (ref $value) {
f998e60c 1865 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1866 }
1867 else {
f998e60c 1868 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1869 }
f998e60c 1870 push @{$field->{$addr}}, $value;
99870f4d
KW
1871 return;
1872 }
1873 }
1874 elsif (substr('constructor', 0, length $access) eq $access) {
1875 if ($protected) {
1876 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1877 }
1878 else {
1879 $constructor_fields{$package}{$name} = $field;
1880 }
1881 }
1882 elsif (substr('readable_array', 0, length $access) eq $access) {
1883
1884 # Here has read access. If one of the other parameters for
1885 # access is array, or this one specifies array (by being more
1886 # than just 'readable_'), then create a subroutine that
1887 # assumes the data is an array. Otherwise just a scalar
1888 my $subname = "${package}::${protected}$name";
1889 if (grep { /^a/i } @_
1890 or length($access) > length('readable_'))
1891 {
1892 no strict "refs";
1893 *$subname = sub {
1894 use strict "refs";
23e33b60 1895 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1896 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1897 if (ref $field->{$addr} ne 'ARRAY') {
1898 my $type = ref $field->{$addr};
1899 $type = 'scalar' unless $type;
1900 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1901 return;
1902 }
1903 return scalar @{$field->{$addr}} unless wantarray;
1904
1905 # Make a copy; had problems with caller modifying the
1906 # original otherwise
1907 my @return = @{$field->{$addr}};
1908 return @return;
1909 }
1910 }
1911 else {
1912
1913 # Here not an array value, a simpler function.
1914 no strict "refs";
1915 *$subname = sub {
1916 use strict "refs";
23e33b60 1917 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1918 no overloading;
051df77b 1919 return $field->{pack 'J', $_[0]};
99870f4d
KW
1920 }
1921 }
1922 }
1923 elsif (substr('settable', 0, length $access) eq $access) {
1924 my $subname = "${package}::${protected}set_$name";
1925 no strict "refs";
1926 *$subname = sub {
1927 use strict "refs";
23e33b60
KW
1928 if (main::DEBUG) {
1929 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1930 Carp::carp_extra_args(\@_) if @_ > 2;
1931 }
1932 # $self is $_[0]; $value is $_[1]
f998e60c 1933 no overloading;
051df77b 1934 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1935 return;
1936 }
1937 }
1938 else {
1939 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1940 }
cf25bb62 1941 }
99870f4d 1942 return;
cf25bb62 1943 }
99870f4d
KW
1944}
1945
1946package Input_file;
1947
1948# All input files use this object, which stores various attributes about them,
1949# and provides for convenient, uniform handling. The run method wraps the
1950# processing. It handles all the bookkeeping of opening, reading, and closing
1951# the file, returning only significant input lines.
1952#
1953# Each object gets a handler which processes the body of the file, and is
1954# called by run(). Most should use the generic, default handler, which has
1955# code scrubbed to handle things you might not expect. A handler should
1956# basically be a while(next_line()) {...} loop.
1957#
1958# You can also set up handlers to
1959# 1) call before the first line is read for pre processing
1960# 2) call to adjust each line of the input before the main handler gets them
1961# 3) call upon EOF before the main handler exits its loop
1962# 4) call at the end for post processing
1963#
1964# $_ is used to store the input line, and is to be filtered by the
1965# each_line_handler()s. So, if the format of the line is not in the desired
1966# format for the main handler, these are used to do that adjusting. They can
1967# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1968# so the $_ output of one is used as the input to the next. None of the other
1969# handlers are stackable, but could easily be changed to be so.
1970#
1971# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1972# which insert the parameters as lines to be processed before the next input
1973# file line is read. This allows the EOF handler to flush buffers, for
1974# example. The difference between the two routines is that the lines inserted
1975# by insert_lines() are subjected to the each_line_handler()s. (So if you
1976# called it from such a handler, you would get infinite recursion.) Lines
1977# inserted by insert_adjusted_lines() go directly to the main handler without
1978# any adjustments. If the post-processing handler calls any of these, there
1979# will be no effect. Some error checking for these conditions could be added,
1980# but it hasn't been done.
1981#
1982# carp_bad_line() should be called to warn of bad input lines, which clears $_
1983# to prevent further processing of the line. This routine will output the
1984# message as a warning once, and then keep a count of the lines that have the
1985# same message, and output that count at the end of the file's processing.
1986# This keeps the number of messages down to a manageable amount.
1987#
1988# get_missings() should be called to retrieve any @missing input lines.
1989# Messages will be raised if this isn't done if the options aren't to ignore
1990# missings.
1991
1992sub trace { return main::trace(@_); }
1993
99870f4d
KW
1994{ # Closure
1995 # Keep track of fields that are to be put into the constructor.
1996 my %constructor_fields;
1997
1998 main::setup_package(Constructor_Fields => \%constructor_fields);
1999
2000 my %file; # Input file name, required
2001 main::set_access('file', \%file, qw{ c r });
2002
2003 my %first_released; # Unicode version file was first released in, required
2004 main::set_access('first_released', \%first_released, qw{ c r });
2005
2006 my %handler; # Subroutine to process the input file, defaults to
2007 # 'process_generic_property_file'
2008 main::set_access('handler', \%handler, qw{ c });
2009
2010 my %property;
2011 # name of property this file is for. defaults to none, meaning not
2012 # applicable, or is otherwise determinable, for example, from each line.
2013 main::set_access('property', \%property, qw{ c });
2014
2015 my %optional;
2016 # If this is true, the file is optional. If not present, no warning is
2017 # output. If it is present, the string given by this parameter is
2018 # evaluated, and if false the file is not processed.
2019 main::set_access('optional', \%optional, 'c', 'r');
2020
2021 my %non_skip;
2022 # This is used for debugging, to skip processing of all but a few input
2023 # files. Add 'non_skip => 1' to the constructor for those files you want
2024 # processed when you set the $debug_skip global.
2025 main::set_access('non_skip', \%non_skip, 'c');
2026
37e2e78e 2027 my %skip;
09ca89ce
KW
2028 # This is used to skip processing of this input file semi-permanently,
2029 # when it evaluates to true. The value should be the reason the file is
2030 # being skipped. It is used for files that we aren't planning to process
2031 # anytime soon, but want to allow to be in the directory and not raise a
2032 # message that we are not handling. Mostly for test files. This is in
2033 # contrast to the non_skip element, which is supposed to be used very
2034 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2035 # pretty much will never look at can be placed in the global
1fec9f60 2036 # %ignored_files instead. Ones used here will be added to %skipped files
37e2e78e
KW
2037 main::set_access('skip', \%skip, 'c');
2038
99870f4d
KW
2039 my %each_line_handler;
2040 # list of subroutines to look at and filter each non-comment line in the
2041 # file. defaults to none. The subroutines are called in order, each is
2042 # to adjust $_ for the next one, and the final one adjusts it for
2043 # 'handler'
2044 main::set_access('each_line_handler', \%each_line_handler, 'c');
2045
2046 my %has_missings_defaults;
2047 # ? Are there lines in the file giving default values for code points
2048 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2049 # the norm, but IGNORED means it has such lines, but the handler doesn't
2050 # use them. Having these three states allows us to catch changes to the
2051 # UCD that this program should track
2052 main::set_access('has_missings_defaults',
2053 \%has_missings_defaults, qw{ c r });
2054
2055 my %pre_handler;
2056 # Subroutine to call before doing anything else in the file. If undef, no
2057 # such handler is called.
2058 main::set_access('pre_handler', \%pre_handler, qw{ c });
2059
2060 my %eof_handler;
2061 # Subroutine to call upon getting an EOF on the input file, but before
2062 # that is returned to the main handler. This is to allow buffers to be
2063 # flushed. The handler is expected to call insert_lines() or
2064 # insert_adjusted() with the buffered material
2065 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2066
2067 my %post_handler;
2068 # Subroutine to call after all the lines of the file are read in and
2069 # processed. If undef, no such handler is called.
2070 main::set_access('post_handler', \%post_handler, qw{ c });
2071
2072 my %progress_message;
2073 # Message to print to display progress in lieu of the standard one
2074 main::set_access('progress_message', \%progress_message, qw{ c });
2075
2076 my %handle;
2077 # cache open file handle, internal. Is undef if file hasn't been
2078 # processed at all, empty if has;
2079 main::set_access('handle', \%handle);
2080
2081 my %added_lines;
2082 # cache of lines added virtually to the file, internal
2083 main::set_access('added_lines', \%added_lines);
2084
2085 my %errors;
2086 # cache of errors found, internal
2087 main::set_access('errors', \%errors);
2088
2089 my %missings;
2090 # storage of '@missing' defaults lines
2091 main::set_access('missings', \%missings);
2092
2093 sub new {
2094 my $class = shift;
2095
2096 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2097 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2098
2099 # Set defaults
2100 $handler{$addr} = \&main::process_generic_property_file;
2101 $non_skip{$addr} = 0;
37e2e78e 2102 $skip{$addr} = 0;
99870f4d
KW
2103 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2104 $handle{$addr} = undef;
2105 $added_lines{$addr} = [ ];
2106 $each_line_handler{$addr} = [ ];
2107 $errors{$addr} = { };
2108 $missings{$addr} = [ ];
2109
2110 # Two positional parameters.
99f78760 2111 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2112 $file{$addr} = main::internal_file_to_platform(shift);
2113 $first_released{$addr} = shift;
2114
2115 # The rest of the arguments are key => value pairs
2116 # %constructor_fields has been set up earlier to list all possible
2117 # ones. Either set or push, depending on how the default has been set
2118 # up just above.
2119 my %args = @_;
2120 foreach my $key (keys %args) {
2121 my $argument = $args{$key};
2122
2123 # Note that the fields are the lower case of the constructor keys
2124 my $hash = $constructor_fields{lc $key};
2125 if (! defined $hash) {
2126 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2127 next;
2128 }
2129 if (ref $hash->{$addr} eq 'ARRAY') {
2130 if (ref $argument eq 'ARRAY') {
2131 foreach my $argument (@{$argument}) {
2132 next if ! defined $argument;
2133 push @{$hash->{$addr}}, $argument;
2134 }
2135 }
2136 else {
2137 push @{$hash->{$addr}}, $argument if defined $argument;
2138 }
2139 }
2140 else {
2141 $hash->{$addr} = $argument;
2142 }
2143 delete $args{$key};
2144 };
2145
2146 # If the file has a property for it, it means that the property is not
2147 # listed in the file's entries. So add a handler to the list of line
2148 # handlers to insert the property name into the lines, to provide a
2149 # uniform interface to the final processing subroutine.
2150 # the final code doesn't have to worry about that.
2151 if ($property{$addr}) {
2152 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2153 }
2154
2155 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2156 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 2157 }
99870f4d 2158
09ca89ce
KW
2159 # If skipping, set to optional, and add to list of ignored files,
2160 # including its reason
2161 if ($skip{$addr}) {
2162 $optional{$addr} = 1;
1fec9f60 2163 $skipped_files{$file{$addr}} = $skip{$addr}
09ca89ce 2164 }
37e2e78e 2165
99870f4d 2166 return $self;
d73e5302
JH
2167 }
2168
cf25bb62 2169
99870f4d
KW
2170 use overload
2171 fallback => 0,
2172 qw("") => "_operator_stringify",
2173 "." => \&main::_operator_dot,
2174 ;
cf25bb62 2175
99870f4d
KW
2176 sub _operator_stringify {
2177 my $self = shift;
cf25bb62 2178
99870f4d 2179 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2180 }
d73e5302 2181
99870f4d
KW
2182 # flag to make sure extracted files are processed early
2183 my $seen_non_extracted_non_age = 0;
d73e5302 2184
99870f4d
KW
2185 sub run {
2186 # Process the input object $self. This opens and closes the file and
2187 # calls all the handlers for it. Currently, this can only be called
2188 # once per file, as it destroy's the EOF handler
d73e5302 2189
99870f4d
KW
2190 my $self = shift;
2191 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2192
ffe43484 2193 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2194
99870f4d 2195 my $file = $file{$addr};
d73e5302 2196
99870f4d
KW
2197 # Don't process if not expecting this file (because released later
2198 # than this Unicode version), and isn't there. This means if someone
2199 # copies it into an earlier version's directory, we will go ahead and
2200 # process it.
2201 return if $first_released{$addr} gt $v_version && ! -e $file;
2202
2203 # If in debugging mode and this file doesn't have the non-skip
2204 # flag set, and isn't one of the critical files, skip it.
2205 if ($debug_skip
2206 && $first_released{$addr} ne v0
2207 && ! $non_skip{$addr})
2208 {
2209 print "Skipping $file in debugging\n" if $verbosity;
2210 return;
2211 }
2212
2213 # File could be optional
37e2e78e 2214 if ($optional{$addr}) {
99870f4d
KW
2215 return unless -e $file;
2216 my $result = eval $optional{$addr};
2217 if (! defined $result) {
2218 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2219 return;
2220 }
2221 if (! $result) {
2222 if ($verbosity) {
2223 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2224 }
2225 return;
2226 }
2227 }
2228
2229 if (! defined $file || ! -e $file) {
2230
2231 # If the file doesn't exist, see if have internal data for it
2232 # (based on first_released being 0).
2233 if ($first_released{$addr} eq v0) {
2234 $handle{$addr} = 'pretend_is_open';
2235 }
2236 else {
2237 if (! $optional{$addr} # File could be optional
2238 && $v_version ge $first_released{$addr})
2239 {
2240 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2241 }
2242 return;
2243 }
2244 }
2245 else {
2246
37e2e78e
KW
2247 # Here, the file exists. Some platforms may change the case of
2248 # its name
99870f4d 2249 if ($seen_non_extracted_non_age) {
517956bf 2250 if ($file =~ /$EXTRACTED/i) {
99870f4d 2251 Carp::my_carp_bug(join_lines(<<END
99f78760 2252$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2253anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2254have subtle problems
2255END
2256 ));
2257 }
2258 }
2259 elsif ($EXTRACTED_DIR
2260 && $first_released{$addr} ne v0
517956bf
CB
2261 && $file !~ /$EXTRACTED/i
2262 && lc($file) ne 'dage.txt')
99870f4d
KW
2263 {
2264 # We don't set this (by the 'if' above) if we have no
2265 # extracted directory, so if running on an early version,
2266 # this test won't work. Not worth worrying about.
2267 $seen_non_extracted_non_age = 1;
2268 }
2269
2270 # And mark the file as having being processed, and warn if it
2271 # isn't a file we are expecting. As we process the files,
2272 # they are deleted from the hash, so any that remain at the
2273 # end of the program are files that we didn't process.
517956bf 2274 my $fkey = File::Spec->rel2abs($file);
faf3cf6b
KW
2275 my $expecting = delete $potential_files{lc($fkey)};
2276
678f13d5
KW
2277 Carp::my_carp("Was not expecting '$file'.") if
2278 ! $expecting
99870f4d
KW
2279 && ! defined $handle{$addr};
2280
37e2e78e
KW
2281 # Having deleted from expected files, we can quit if not to do
2282 # anything. Don't print progress unless really want verbosity
2283 if ($skip{$addr}) {
2284 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2285 return;
2286 }
2287
99870f4d
KW
2288 # Open the file, converting the slashes used in this program
2289 # into the proper form for the OS
2290 my $file_handle;
2291 if (not open $file_handle, "<", $file) {
2292 Carp::my_carp("Can't open $file. Skipping: $!");
2293 return 0;
2294 }
2295 $handle{$addr} = $file_handle; # Cache the open file handle
2296 }
2297
2298 if ($verbosity >= $PROGRESS) {
2299 if ($progress_message{$addr}) {
2300 print "$progress_message{$addr}\n";
2301 }
2302 else {
2303 # If using a virtual file, say so.
2304 print "Processing ", (-e $file)
2305 ? $file
2306 : "substitute $file",
2307 "\n";
2308 }
2309 }
2310
2311
2312 # Call any special handler for before the file.
2313 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2314
2315 # Then the main handler
2316 &{$handler{$addr}}($self);
2317
2318 # Then any special post-file handler.
2319 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2320
2321 # If any errors have been accumulated, output the counts (as the first
2322 # error message in each class was output when it was encountered).
2323 if ($errors{$addr}) {
2324 my $total = 0;
2325 my $types = 0;
2326 foreach my $error (keys %{$errors{$addr}}) {
2327 $total += $errors{$addr}->{$error};
2328 delete $errors{$addr}->{$error};
2329 $types++;
2330 }
2331 if ($total > 1) {
2332 my $message
2333 = "A total of $total lines had errors in $file. ";
2334
2335 $message .= ($types == 1)
2336 ? '(Only the first one was displayed.)'
2337 : '(Only the first of each type was displayed.)';
2338 Carp::my_carp($message);
2339 }
2340 }
2341
2342 if (@{$missings{$addr}}) {
2343 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2344 }
2345
2346 # If a real file handle, close it.
2347 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2348 ref $handle{$addr};
2349 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2350 # the file, as opposed to undef
2351 return;
2352 }
2353
2354 sub next_line {
2355 # Sets $_ to be the next logical input line, if any. Returns non-zero
2356 # if such a line exists. 'logical' means that any lines that have
2357 # been added via insert_lines() will be returned in $_ before the file
2358 # is read again.
2359
2360 my $self = shift;
2361 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2362
ffe43484 2363 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2364
2365 # Here the file is open (or if the handle is not a ref, is an open
2366 # 'virtual' file). Get the next line; any inserted lines get priority
2367 # over the file itself.
2368 my $adjusted;
2369
2370 LINE:
2371 while (1) { # Loop until find non-comment, non-empty line
2372 #local $to_trace = 1 if main::DEBUG;
2373 my $inserted_ref = shift @{$added_lines{$addr}};
2374 if (defined $inserted_ref) {
2375 ($adjusted, $_) = @{$inserted_ref};
2376 trace $adjusted, $_ if main::DEBUG && $to_trace;
2377 return 1 if $adjusted;
2378 }
2379 else {
2380 last if ! ref $handle{$addr}; # Don't read unless is real file
2381 last if ! defined ($_ = readline $handle{$addr});
2382 }
2383 chomp;
2384 trace $_ if main::DEBUG && $to_trace;
2385
2386 # See if this line is the comment line that defines what property
2387 # value that code points that are not listed in the file should
2388 # have. The format or existence of these lines is not guaranteed
2389 # by Unicode since they are comments, but the documentation says
2390 # that this was added for machine-readability, so probably won't
2391 # change. This works starting in Unicode Version 5.0. They look
2392 # like:
2393 #
2394 # @missing: 0000..10FFFF; Not_Reordered
2395 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2396 # @missing: 0000..10FFFF; ; NaN
2397 #
2398 # Save the line for a later get_missings() call.
2399 if (/$missing_defaults_prefix/) {
2400 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2401 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2402 }
2403 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2404 my @defaults = split /\s* ; \s*/x, $_;
2405
2406 # The first field is the @missing, which ends in a
2407 # semi-colon, so can safely shift.
2408 shift @defaults;
2409
2410 # Some of these lines may have empty field placeholders
2411 # which get in the way. An example is:
2412 # @missing: 0000..10FFFF; ; NaN
2413 # Remove them. Process starting from the top so the
2414 # splice doesn't affect things still to be looked at.
2415 for (my $i = @defaults - 1; $i >= 0; $i--) {
2416 next if $defaults[$i] ne "";
2417 splice @defaults, $i, 1;
2418 }
2419
2420 # What's left should be just the property (maybe) and the
2421 # default. Having only one element means it doesn't have
2422 # the property.
2423 my $default;
2424 my $property;
2425 if (@defaults >= 1) {
2426 if (@defaults == 1) {
2427 $default = $defaults[0];
2428 }
2429 else {
2430 $property = $defaults[0];
2431 $default = $defaults[1];
2432 }
2433 }
2434
2435 if (@defaults < 1
2436 || @defaults > 2
2437 || ($default =~ /^</
2438 && $default !~ /^<code *point>$/i
09f8d0ac
KW
2439 && $default !~ /^<none>$/i
2440 && $default !~ /^<script>$/i))
99870f4d
KW
2441 {
2442 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2443 }
2444 else {
2445
2446 # If the property is missing from the line, it should
2447 # be the one for the whole file
2448 $property = $property{$addr} if ! defined $property;
2449
2450 # Change <none> to the null string, which is what it
2451 # really means. If the default is the code point
2452 # itself, set it to <code point>, which is what
2453 # Unicode uses (but sometimes they've forgotten the
2454 # space)
2455 if ($default =~ /^<none>$/i) {
2456 $default = "";
2457 }
2458 elsif ($default =~ /^<code *point>$/i) {
2459 $default = $CODE_POINT;
2460 }
09f8d0ac
KW
2461 elsif ($default =~ /^<script>$/i) {
2462
2463 # Special case this one. Currently is from
2464 # ScriptExtensions.txt, and means for all unlisted
2465 # code points, use their Script property values.
2466 # For the code points not listed in that file, the
2467 # default value is 'Unknown'.
2468 $default = "Unknown";
2469 }
99870f4d
KW
2470
2471 # Store them as a sub-arrays with both components.
2472 push @{$missings{$addr}}, [ $default, $property ];
2473 }
2474 }
2475
2476 # There is nothing for the caller to process on this comment
2477 # line.
2478 next;
2479 }
2480
2481 # Remove comments and trailing space, and skip this line if the
2482 # result is empty
2483 s/#.*//;
2484 s/\s+$//;
2485 next if /^$/;
2486
2487 # Call any handlers for this line, and skip further processing of
2488 # the line if the handler sets the line to null.
2489 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2490 &{$sub_ref}($self);
2491 next LINE if /^$/;
2492 }
2493
2494 # Here the line is ok. return success.
2495 return 1;
2496 } # End of looping through lines.
2497
2498 # If there is an EOF handler, call it (only once) and if it generates
2499 # more lines to process go back in the loop to handle them.
2500 if ($eof_handler{$addr}) {
2501 &{$eof_handler{$addr}}($self);
2502 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2503 goto LINE if $added_lines{$addr};
2504 }
2505
2506 # Return failure -- no more lines.
2507 return 0;
2508
2509 }
2510
2511# Not currently used, not fully tested.
2512# sub peek {
2513# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2514# # record. Not callable from an each_line_handler(), nor does it call
2515# # an each_line_handler() on the line.
2516#
2517# my $self = shift;
ffe43484 2518# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2519#
2520# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2521# my ($adjusted, $line) = @{$inserted_ref};
2522# next if $adjusted;
2523#
2524# # Remove comments and trailing space, and return a non-empty
2525# # resulting line
2526# $line =~ s/#.*//;
2527# $line =~ s/\s+$//;
2528# return $line if $line ne "";
2529# }
2530#
2531# return if ! ref $handle{$addr}; # Don't read unless is real file
2532# while (1) { # Loop until find non-comment, non-empty line
2533# local $to_trace = 1 if main::DEBUG;
2534# trace $_ if main::DEBUG && $to_trace;
2535# return if ! defined (my $line = readline $handle{$addr});
2536# chomp $line;
2537# push @{$added_lines{$addr}}, [ 0, $line ];
2538#
2539# $line =~ s/#.*//;
2540# $line =~ s/\s+$//;
2541# return $line if $line ne "";
2542# }
2543#
2544# return;
2545# }
2546
2547
2548 sub insert_lines {
2549 # Lines can be inserted so that it looks like they were in the input
2550 # file at the place it was when this routine is called. See also
2551 # insert_adjusted_lines(). Lines inserted via this routine go through
2552 # any each_line_handler()
2553
2554 my $self = shift;
2555
2556 # Each inserted line is an array, with the first element being 0 to
2557 # indicate that this line hasn't been adjusted, and needs to be
2558 # processed.
f998e60c 2559 no overloading;
051df77b 2560 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2561 return;
2562 }
2563
2564 sub insert_adjusted_lines {
2565 # Lines can be inserted so that it looks like they were in the input
2566 # file at the place it was when this routine is called. See also
2567 # insert_lines(). Lines inserted via this routine are already fully
2568 # adjusted, ready to be processed; each_line_handler()s handlers will
2569 # not be called. This means this is not a completely general
2570 # facility, as only the last each_line_handler on the stack should
2571 # call this. It could be made more general, by passing to each of the
2572 # line_handlers their position on the stack, which they would pass on
2573 # to this routine, and that would replace the boolean first element in
2574 # the anonymous array pushed here, so that the next_line routine could
2575 # use that to call only those handlers whose index is after it on the
2576 # stack. But this is overkill for what is needed now.
2577
2578 my $self = shift;
2579 trace $_[0] if main::DEBUG && $to_trace;
2580
2581 # Each inserted line is an array, with the first element being 1 to
2582 # indicate that this line has been adjusted
f998e60c 2583 no overloading;
051df77b 2584 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2585 return;
2586 }
2587
2588 sub get_missings {
2589 # Returns the stored up @missings lines' values, and clears the list.
2590 # The values are in an array, consisting of the default in the first
2591 # element, and the property in the 2nd. However, since these lines
2592 # can be stacked up, the return is an array of all these arrays.
2593
2594 my $self = shift;
2595 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2596
ffe43484 2597 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2598
2599 # If not accepting a list return, just return the first one.
2600 return shift @{$missings{$addr}} unless wantarray;
2601
2602 my @return = @{$missings{$addr}};
2603 undef @{$missings{$addr}};
2604 return @return;
2605 }
2606
2607 sub _insert_property_into_line {
2608 # Add a property field to $_, if this file requires it.
2609
f998e60c 2610 my $self = shift;
ffe43484 2611 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2612 my $property = $property{$addr};
99870f4d
KW
2613 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2614
2615 $_ =~ s/(;|$)/; $property$1/;
2616 return;
2617 }
2618
2619 sub carp_bad_line {
2620 # Output consistent error messages, using either a generic one, or the
2621 # one given by the optional parameter. To avoid gazillions of the
2622 # same message in case the syntax of a file is way off, this routine
2623 # only outputs the first instance of each message, incrementing a
2624 # count so the totals can be output at the end of the file.
2625
2626 my $self = shift;
2627 my $message = shift;
2628 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2629
ffe43484 2630 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2631
2632 $message = 'Unexpected line' unless $message;
2633
2634 # No trailing punctuation so as to fit with our addenda.
2635 $message =~ s/[.:;,]$//;
2636
2637 # If haven't seen this exact message before, output it now. Otherwise
2638 # increment the count of how many times it has occurred
2639 unless ($errors{$addr}->{$message}) {
2640 Carp::my_carp("$message in '$_' in "
f998e60c 2641 . $file{$addr}
99870f4d
KW
2642 . " at line $.. Skipping this line;");
2643 $errors{$addr}->{$message} = 1;
2644 }
2645 else {
2646 $errors{$addr}->{$message}++;
2647 }
2648
2649 # Clear the line to prevent any further (meaningful) processing of it.
2650 $_ = "";
2651
2652 return;
2653 }
2654} # End closure
2655
2656package Multi_Default;
2657
2658# Certain properties in early versions of Unicode had more than one possible
2659# default for code points missing from the files. In these cases, one
2660# default applies to everything left over after all the others are applied,
2661# and for each of the others, there is a description of which class of code
2662# points applies to it. This object helps implement this by storing the
2663# defaults, and for all but that final default, an eval string that generates
2664# the class that it applies to.
2665
2666
2667{ # Closure
2668
2669 main::setup_package();
2670
2671 my %class_defaults;
2672 # The defaults structure for the classes
2673 main::set_access('class_defaults', \%class_defaults);
2674
2675 my %other_default;
2676 # The default that applies to everything left over.
2677 main::set_access('other_default', \%other_default, 'r');
2678
2679
2680 sub new {
2681 # The constructor is called with default => eval pairs, terminated by
2682 # the left-over default. e.g.
2683 # Multi_Default->new(
2684 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2685 # - 0x200D',
2686 # 'R' => 'some other expression that evaluates to code points',
2687 # .
2688 # .
2689 # .
2690 # 'U'));
2691
2692 my $class = shift;
2693
2694 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2695 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2696
2697 while (@_ > 1) {
2698 my $default = shift;
2699 my $eval = shift;
2700 $class_defaults{$addr}->{$default} = $eval;
2701 }
2702
2703 $other_default{$addr} = shift;
2704
2705 return $self;
2706 }
2707
2708 sub get_next_defaults {
2709 # Iterates and returns the next class of defaults.
2710 my $self = shift;
2711 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2712
ffe43484 2713 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2714
2715 return each %{$class_defaults{$addr}};
2716 }
2717}
2718
2719package Alias;
2720
2721# An alias is one of the names that a table goes by. This class defines them
2722# including some attributes. Everything is currently setup in the
2723# constructor.
2724
2725
2726{ # Closure
2727
2728 main::setup_package();
2729
2730 my %name;
2731 main::set_access('name', \%name, 'r');
2732
2733 my %loose_match;
c12f2655 2734 # Should this name match loosely or not.
99870f4d
KW
2735 main::set_access('loose_match', \%loose_match, 'r');
2736
33e96e72
KW
2737 my %make_re_pod_entry;
2738 # Some aliases should not get their own entries in the re section of the
2739 # pod, because they are covered by a wild-card, and some we want to
2740 # discourage use of. Binary
f82fe4ba 2741 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
99870f4d 2742
fd1e3e84
KW
2743 my %ucd;
2744 # Is this documented to be accessible via Unicode::UCD
2745 main::set_access('ucd', \%ucd, 'r', 's');
2746
99870f4d
KW
2747 my %status;
2748 # Aliases have a status, like deprecated, or even suppressed (which means
2749 # they don't appear in documentation). Enum
2750 main::set_access('status', \%status, 'r');
2751
0eac1e20 2752 my %ok_as_filename;
99870f4d
KW
2753 # Similarly, some aliases should not be considered as usable ones for
2754 # external use, such as file names, or we don't want documentation to
2755 # recommend them. Boolean
0eac1e20 2756 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
99870f4d
KW
2757
2758 sub new {
2759 my $class = shift;
2760
2761 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2762 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2763
2764 $name{$addr} = shift;
2765 $loose_match{$addr} = shift;
33e96e72 2766 $make_re_pod_entry{$addr} = shift;
0eac1e20 2767 $ok_as_filename{$addr} = shift;
99870f4d 2768 $status{$addr} = shift;
fd1e3e84 2769 $ucd{$addr} = shift;
99870f4d
KW
2770
2771 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2772
2773 # Null names are never ok externally
0eac1e20 2774 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
99870f4d
KW
2775
2776 return $self;
2777 }
2778}
2779
2780package Range;
2781
2782# A range is the basic unit for storing code points, and is described in the
2783# comments at the beginning of the program. Each range has a starting code
2784# point; an ending code point (not less than the starting one); a value
2785# that applies to every code point in between the two end-points, inclusive;
2786# and an enum type that applies to the value. The type is for the user's
2787# convenience, and has no meaning here, except that a non-zero type is
2788# considered to not obey the normal Unicode rules for having standard forms.
2789#
2790# The same structure is used for both map and match tables, even though in the
2791# latter, the value (and hence type) is irrelevant and could be used as a
2792# comment. In map tables, the value is what all the code points in the range
2793# map to. Type 0 values have the standardized version of the value stored as
2794# well, so as to not have to recalculate it a lot.
2795
2796sub trace { return main::trace(@_); }
2797
2798{ # Closure
2799
2800 main::setup_package();
2801
2802 my %start;
2803 main::set_access('start', \%start, 'r', 's');
2804
2805 my %end;
2806 main::set_access('end', \%end, 'r', 's');
2807
2808 my %value;
2809 main::set_access('value', \%value, 'r');
2810
2811 my %type;
2812 main::set_access('type', \%type, 'r');
2813
2814 my %standard_form;
2815 # The value in internal standard form. Defined only if the type is 0.
2816 main::set_access('standard_form', \%standard_form);
2817
2818 # Note that if these fields change, the dump() method should as well
2819
2820 sub new {
2821 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2822 my $class = shift;
2823
2824 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2825 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2826
2827 $start{$addr} = shift;
2828 $end{$addr} = shift;
2829
2830 my %args = @_;
2831
2832 my $value = delete $args{'Value'}; # Can be 0
2833 $value = "" unless defined $value;
2834 $value{$addr} = $value;
2835
2836 $type{$addr} = delete $args{'Type'} || 0;
2837
2838 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2839
2840 if (! $type{$addr}) {
2841 $standard_form{$addr} = main::standardize($value);
2842 }
2843
2844 return $self;
2845 }
2846
2847 use overload
2848 fallback => 0,
2849 qw("") => "_operator_stringify",
2850 "." => \&main::_operator_dot,
2851 ;
2852
2853 sub _operator_stringify {
2854 my $self = shift;
ffe43484 2855 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2856
2857 # Output it like '0041..0065 (value)'
2858 my $return = sprintf("%04X", $start{$addr})
2859 . '..'
2860 . sprintf("%04X", $end{$addr});
2861 my $value = $value{$addr};
2862 my $type = $type{$addr};
2863 $return .= ' (';
2864 $return .= "$value";
2865 $return .= ", Type=$type" if $type != 0;
2866 $return .= ')';
2867
2868 return $return;
2869 }
2870
2871 sub standard_form {
2872 # The standard form is the value itself if the standard form is
2873 # undefined (that is if the value is special)
2874
2875 my $self = shift;
2876 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2877
ffe43484 2878 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2879
2880 return $standard_form{$addr} if defined $standard_form{$addr};
2881 return $value{$addr};
2882 }
2883
2884 sub dump {
2885 # Human, not machine readable. For machine readable, comment out this
2886 # entire routine and let the standard one take effect.
2887 my $self = shift;
2888 my $indent = shift;
2889 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2890
ffe43484 2891 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2892
2893 my $return = $indent
2894 . sprintf("%04X", $start{$addr})
2895 . '..'
2896 . sprintf("%04X", $end{$addr})
2897 . " '$value{$addr}';";
2898 if (! defined $standard_form{$addr}) {
2899 $return .= "(type=$type{$addr})";
2900 }
2901 elsif ($standard_form{$addr} ne $value{$addr}) {
2902 $return .= "(standard '$standard_form{$addr}')";
2903 }
2904 return $return;
2905 }
2906} # End closure
2907
2908package _Range_List_Base;
2909
2910# Base class for range lists. A range list is simply an ordered list of
2911# ranges, so that the ranges with the lowest starting numbers are first in it.
2912#
2913# When a new range is added that is adjacent to an existing range that has the
2914# same value and type, it merges with it to form a larger range.
2915#
2916# Ranges generally do not overlap, except that there can be multiple entries
2917# of single code point ranges. This is because of NameAliases.txt.
2918#
2919# In this program, there is a standard value such that if two different
2920# values, have the same standard value, they are considered equivalent. This
2921# value was chosen so that it gives correct results on Unicode data
2922
2923# There are a number of methods to manipulate range lists, and some operators
2924# are overloaded to handle them.
2925
99870f4d
KW
2926sub trace { return main::trace(@_); }
2927
2928{ # Closure
2929
2930 our $addr;
2931
2932 main::setup_package();
2933
2934 my %ranges;
2935 # The list of ranges
2936 main::set_access('ranges', \%ranges, 'readable_array');
2937
2938 my %max;
2939 # The highest code point in the list. This was originally a method, but
2940 # actual measurements said it was used a lot.
2941 main::set_access('max', \%max, 'r');
2942
2943 my %each_range_iterator;
2944 # Iterator position for each_range()
2945 main::set_access('each_range_iterator', \%each_range_iterator);
2946
2947 my %owner_name_of;
2948 # Name of parent this is attached to, if any. Solely for better error
2949 # messages.
2950 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2951
2952 my %_search_ranges_cache;
2953 # A cache of the previous result from _search_ranges(), for better
2954 # performance
2955 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2956
2957 sub new {
2958 my $class = shift;
2959 my %args = @_;
2960
2961 # Optional initialization data for the range list.
2962 my $initialize = delete $args{'Initialize'};
2963
2964 my $self;
2965
2966 # Use _union() to initialize. _union() returns an object of this
2967 # class, which means that it will call this constructor recursively.
2968 # But it won't have this $initialize parameter so that it won't
2969 # infinitely loop on this.
2970 return _union($class, $initialize, %args) if defined $initialize;
2971
2972 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2973 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2974
2975 # Optional parent object, only for debug info.
2976 $owner_name_of{$addr} = delete $args{'Owner'};
2977 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2978
2979 # Stringify, in case it is an object.
2980 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2981
2982 # This is used only for error messages, and so a colon is added
2983 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2984
2985 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2986
2987 # Max is initialized to a negative value that isn't adjacent to 0,
2988 # for simpler tests
2989 $max{$addr} = -2;
2990
2991 $_search_ranges_cache{$addr} = 0;
2992 $ranges{$addr} = [];
2993
2994 return $self;
2995 }
2996
2997 use overload
2998 fallback => 0,
2999 qw("") => "_operator_stringify",
3000 "." => \&main::_operator_dot,
3001 ;
3002
3003 sub _operator_stringify {
3004 my $self = shift;
ffe43484 3005 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3006
3007 return "Range_List attached to '$owner_name_of{$addr}'"
3008 if $owner_name_of{$addr};
3009 return "anonymous Range_List " . \$self;
3010 }
3011
3012 sub _union {
3013 # Returns the union of the input code points. It can be called as
3014 # either a constructor or a method. If called as a method, the result
3015 # will be a new() instance of the calling object, containing the union
3016 # of that object with the other parameter's code points; if called as
3017 # a constructor, the first parameter gives the class the new object
3018 # should be, and the second parameter gives the code points to go into
3019 # it.
3020 # In either case, there are two parameters looked at by this routine;
3021 # any additional parameters are passed to the new() constructor.
3022 #
3023 # The code points can come in the form of some object that contains
3024 # ranges, and has a conventionally named method to access them; or
3025 # they can be an array of individual code points (as integers); or
3026 # just a single code point.
3027 #
3028 # If they are ranges, this routine doesn't make any effort to preserve
3029 # the range values of one input over the other. Therefore this base
3030 # class should not allow _union to be called from other than
3031 # initialization code, so as to prevent two tables from being added
3032 # together where the range values matter. The general form of this
3033 # routine therefore belongs in a derived class, but it was moved here
3034 # to avoid duplication of code. The failure to overload this in this
3035 # class keeps it safe.
3036 #
3037
3038 my $self;
3039 my @args; # Arguments to pass to the constructor
3040
3041 my $class = shift;
3042
3043 # If a method call, will start the union with the object itself, and
3044 # the class of the new object will be the same as self.
3045 if (ref $class) {
3046 $self = $class;
3047 $class = ref $self;
3048 push @args, $self;
3049 }
3050
3051 # Add the other required parameter.
3052 push @args, shift;
3053 # Rest of parameters are passed on to the constructor
3054
3055 # Accumulate all records from both lists.
3056 my @records;
3057 for my $arg (@args) {
3058 #local $to_trace = 0 if main::DEBUG;
3059 trace "argument = $arg" if main::DEBUG && $to_trace;
3060 if (! defined $arg) {
3061 my $message = "";
3062 if (defined $self) {
f998e60c 3063 no overloading;
051df77b 3064 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
3065 }
3066 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
3067 return;
3068 }
3069 $arg = [ $arg ] if ! ref $arg;
3070 my $type = ref $arg;
3071 if ($type eq 'ARRAY') {
3072 foreach my $element (@$arg) {
3073 push @records, Range->new($element, $element);
3074 }
3075 }
3076 elsif ($arg->isa('Range')) {
3077 push @records, $arg;
3078 }
3079 elsif ($arg->can('ranges')) {
3080 push @records, $arg->ranges;
3081 }
3082 else {
3083 my $message = "";
3084 if (defined $self) {
f998e60c 3085 no overloading;
051df77b 3086 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
3087 }
3088 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3089 return;
3090 }
3091 }
3092
3093 # Sort with the range containing the lowest ordinal first, but if
3094 # two ranges start at the same code point, sort with the bigger range
3095 # of the two first, because it takes fewer cycles.
3096 @records = sort { ($a->start <=> $b->start)
3097 or
3098 # if b is shorter than a, b->end will be
3099 # less than a->end, and we want to select
3100 # a, so want to return -1
3101 ($b->end <=> $a->end)
3102 } @records;
3103
3104 my $new = $class->new(@_);
3105
3106 # Fold in records so long as they add new information.
3107 for my $set (@records) {
3108 my $start = $set->start;
3109 my $end = $set->end;
3110 my $value = $set->value;
3111 if ($start > $new->max) {
3112 $new->_add_delete('+', $start, $end, $value);
3113 }
3114 elsif ($end > $new->max) {
3115 $new->_add_delete('+', $new->max +1, $end, $value);
3116 }
3117 }
3118
3119 return $new;
3120 }
3121
3122 sub range_count { # Return the number of ranges in the range list
3123 my $self = shift;
3124 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3125
f998e60c 3126 no overloading;
051df77b 3127 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
3128 }
3129
3130 sub min {
3131 # Returns the minimum code point currently in the range list, or if
3132 # the range list is empty, 2 beyond the max possible. This is a
3133 # method because used so rarely, that not worth saving between calls,
3134 # and having to worry about changing it as ranges are added and
3135 # deleted.
3136
3137 my $self = shift;
3138 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3139
ffe43484 3140 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3141
3142 # If the range list is empty, return a large value that isn't adjacent
3143 # to any that could be in the range list, for simpler tests
6189eadc 3144 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
99870f4d
KW
3145 return $ranges{$addr}->[0]->start;
3146 }
3147
3148 sub contains {
3149 # Boolean: Is argument in the range list? If so returns $i such that:
3150 # range[$i]->end < $codepoint <= range[$i+1]->end
3151 # which is one beyond what you want; this is so that the 0th range
3152 # doesn't return false
3153 my $self = shift;
3154 my $codepoint = shift;
3155 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3156
99870f4d
KW
3157 my $i = $self->_search_ranges($codepoint);
3158 return 0 unless defined $i;
3159
3160 # The search returns $i, such that
3161 # range[$i-1]->end < $codepoint <= range[$i]->end
3162 # So is in the table if and only iff it is at least the start position
3163 # of range $i.
f998e60c 3164 no overloading;
051df77b 3165 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
3166 return $i + 1;
3167 }
3168
2f7a8815
KW
3169 sub containing_range {
3170 # Returns the range object that contains the code point, undef if none
3171
3172 my $self = shift;
3173 my $codepoint = shift;
3174 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3175
3176 my $i = $self->contains($codepoint);
3177 return unless $i;
3178
3179 # contains() returns 1 beyond where we should look
3180 no overloading;
3181 return $ranges{pack 'J', $self}->[$i-1];
3182 }
3183
99870f4d
KW
3184 sub value_of {
3185 # Returns the value associated with the code point, undef if none
3186
3187 my $self = shift;
3188 my $codepoint = shift;
3189 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3190
d69c231b
KW
3191 my $range = $self->containing_range($codepoint);
3192 return unless defined $range;
99870f4d 3193
d69c231b 3194 return $range->value;
99870f4d
KW
3195 }
3196
0a9dbafc
KW
3197 sub type_of {
3198 # Returns the type of the range containing the code point, undef if
3199 # the code point is not in the table
3200
3201 my $self = shift;
3202 my $codepoint = shift;
3203 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3204
3205 my $range = $self->containing_range($codepoint);
3206 return unless defined $range;
3207
3208 return $range->type;
3209 }
3210
99870f4d
KW
3211 sub _search_ranges {
3212 # Find the range in the list which contains a code point, or where it
3213 # should go if were to add it. That is, it returns $i, such that:
3214 # range[$i-1]->end < $codepoint <= range[$i]->end
3215 # Returns undef if no such $i is possible (e.g. at end of table), or
3216 # if there is an error.
3217
3218 my $self = shift;
3219 my $code_point = shift;
3220 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3221
ffe43484 3222 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3223
3224 return if $code_point > $max{$addr};
3225 my $r = $ranges{$addr}; # The current list of ranges
3226 my $range_list_size = scalar @$r;
3227 my $i;
3228
3229 use integer; # want integer division
3230
3231 # Use the cached result as the starting guess for this one, because,
3232 # an experiment on 5.1 showed that 90% of the time the cache was the
3233 # same as the result on the next call (and 7% it was one less).
3234 $i = $_search_ranges_cache{$addr};
3235 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3236 # from an intervening deletion
3237 #local $to_trace = 1 if main::DEBUG;
3238 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3239 return $i if $code_point <= $r->[$i]->end
3240 && ($i == 0 || $r->[$i-1]->end < $code_point);
3241
3242 # Here the cache doesn't yield the correct $i. Try adding 1.
3243 if ($i < $range_list_size - 1
3244 && $r->[$i]->end < $code_point &&
3245 $code_point <= $r->[$i+1]->end)
3246 {
3247 $i++;
3248 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3249 $_search_ranges_cache{$addr} = $i;
3250 return $i;
3251 }
3252
3253 # Here, adding 1 also didn't work. We do a binary search to
3254 # find the correct position, starting with current $i
3255 my $lower = 0;
3256 my $upper = $range_list_size - 1;
3257 while (1) {
3258 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3259
3260 if ($code_point <= $r->[$i]->end) {
3261
3262 # Here we have met the upper constraint. We can quit if we
3263 # also meet the lower one.
3264 last if $i == 0 || $r->[$i-1]->end < $code_point;
3265
3266 $upper = $i; # Still too high.
3267
3268 }
3269 else {
3270
3271 # Here, $r[$i]->end < $code_point, so look higher up.
3272 $lower = $i;
3273 }
3274
3275 # Split search domain in half to try again.
3276 my $temp = ($upper + $lower) / 2;
3277
3278 # No point in continuing unless $i changes for next time
3279 # in the loop.
3280 if ($temp == $i) {
3281
3282 # We can't reach the highest element because of the averaging.
3283 # So if one below the upper edge, force it there and try one
3284 # more time.
3285 if ($i == $range_list_size - 2) {
3286
3287 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3288 $i = $range_list_size - 1;
3289
3290 # Change $lower as well so if fails next time through,
3291 # taking the average will yield the same $i, and we will
3292 # quit with the error message just below.
3293 $lower = $i;
3294 next;
3295 }
3296 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3297 return;
3298 }
3299 $i = $temp;
3300 } # End of while loop
3301
3302 if (main::DEBUG && $to_trace) {
3303 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3304 trace "i= [ $i ]", $r->[$i];
3305 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3306 }
3307
3308 # Here we have found the offset. Cache it as a starting point for the
3309 # next call.
3310 $_search_ranges_cache{$addr} = $i;
3311 return $i;
3312 }
3313
3314 sub _add_delete {
3315 # Add, replace or delete ranges to or from a list. The $type
3316 # parameter gives which:
3317 # '+' => insert or replace a range, returning a list of any changed
3318 # ranges.
3319 # '-' => delete a range, returning a list of any deleted ranges.
3320 #
3321 # The next three parameters give respectively the start, end, and
3322 # value associated with the range. 'value' should be null unless the
3323 # operation is '+';
3324 #
3325 # The range list is kept sorted so that the range with the lowest
3326 # starting position is first in the list, and generally, adjacent
c1739a4a 3327 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3328 # exceptions below).
3329 #
c1739a4a 3330 # There are more parameters; all are key => value pairs:
99870f4d
KW
3331 # Type gives the type of the value. It is only valid for '+'.
3332 # All ranges have types; if this parameter is omitted, 0 is
3333 # assumed. Ranges with type 0 are assumed to obey the
3334 # Unicode rules for casing, etc; ranges with other types are
3335 # not. Otherwise, the type is arbitrary, for the caller's
3336 # convenience, and looked at only by this routine to keep
3337 # adjacent ranges of different types from being merged into
3338 # a single larger range, and when Replace =>
3339 # $IF_NOT_EQUIVALENT is specified (see just below).
3340 # Replace determines what to do if the range list already contains
3341 # ranges which coincide with all or portions of the input
3342 # range. It is only valid for '+':
3343 # => $NO means that the new value is not to replace
3344 # any existing ones, but any empty gaps of the
3345 # range list coinciding with the input range
3346 # will be filled in with the new value.
3347 # => $UNCONDITIONALLY means to replace the existing values with
3348 # this one unconditionally. However, if the
3349 # new and old values are identical, the
3350 # replacement is skipped to save cycles
3351 # => $IF_NOT_EQUIVALENT means to replace the existing values
3352 # with this one if they are not equivalent.
3353 # Ranges are equivalent if their types are the
c1739a4a 3354 # same, and they are the same string; or if
99870f4d
KW
3355 # both are type 0 ranges, if their Unicode
3356 # standard forms are identical. In this last
3357 # case, the routine chooses the more "modern"
3358 # one to use. This is because some of the
3359 # older files are formatted with values that
3360 # are, for example, ALL CAPs, whereas the
3361 # derived files have a more modern style,
3362 # which looks better. By looking for this
3363 # style when the pre-existing and replacement
3364 # standard forms are the same, we can move to
3365 # the modern style
9470941f 3366 # => $MULTIPLE_BEFORE means that if this range duplicates an
99870f4d
KW
3367 # existing one, but has a different value,
3368 # don't replace the existing one, but insert
3369 # this, one so that the same range can occur
53d84487
KW
3370 # multiple times. They are stored LIFO, so
3371 # that the final one inserted is the first one
3372 # returned in an ordered search of the table.
7f4b1e25
KW
3373 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3374 # FIFO, so that this one is inserted after all
3375 # others that currently exist.
99870f4d
KW
3376 # => anything else is the same as => $IF_NOT_EQUIVALENT
3377 #
c1739a4a
KW
3378 # "same value" means identical for non-type-0 ranges, and it means
3379 # having the same standard forms for type-0 ranges.
99870f4d
KW
3380
3381 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3382
3383 my $self = shift;
3384 my $operation = shift; # '+' for add/replace; '-' for delete;
3385 my $start = shift;
3386 my $end = shift;
3387 my $value = shift;
3388
3389 my %args = @_;
3390
3391 $value = "" if not defined $value; # warning: $value can be "0"
3392
3393 my $replace = delete $args{'Replace'};
3394 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3395
3396 my $type = delete $args{'Type'};
3397 $type = 0 unless defined $type;
3398
3399 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3400
ffe43484 3401 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3402
3403 if ($operation ne '+' && $operation ne '-') {
3404 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3405 return;
3406 }
3407 unless (defined $start && defined $end) {
3408 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3409 return;
3410 }
3411 unless ($end >= $start) {
3412 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3413 return;
3414 }
3415 #local $to_trace = 1 if main::DEBUG;
3416
3417 if ($operation eq '-') {
3418 if ($replace != $IF_NOT_EQUIVALENT) {
3419 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3420 $replace = $IF_NOT_EQUIVALENT;
3421 }
3422 if ($type) {
3423 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3424 $type = 0;
3425 }
3426 if ($value ne "") {
3427 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3428 $value = "";
3429 }
3430 }
3431
3432 my $r = $ranges{$addr}; # The current list of ranges
3433 my $range_list_size = scalar @$r; # And its size
3434 my $max = $max{$addr}; # The current high code point in
3435 # the list of ranges
3436
3437 # Do a special case requiring fewer machine cycles when the new range
3438 # starts after the current highest point. The Unicode input data is
3439 # structured so this is common.
3440 if ($start > $max) {
3441
3442 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3443 return if $operation eq '-'; # Deleting a non-existing range is a
3444 # no-op
3445
3446 # If the new range doesn't logically extend the current final one
3447 # in the range list, create a new range at the end of the range
3448 # list. (max cleverly is initialized to a negative number not
3449 # adjacent to 0 if the range list is empty, so even adding a range
3450 # to an empty range list starting at 0 will have this 'if'
3451 # succeed.)
3452 if ($start > $max + 1 # non-adjacent means can't extend.
3453 || @{$r}[-1]->value ne $value # values differ, can't extend.
3454 || @{$r}[-1]->type != $type # types differ, can't extend.
3455 ) {
3456 push @$r, Range->new($start, $end,
3457 Value => $value,
3458 Type => $type);
3459 }
3460 else {
3461
3462 # Here, the new range starts just after the current highest in
3463 # the range list, and they have the same type and value.
3464 # Extend the current range to incorporate the new one.
3465 @{$r}[-1]->set_end($end);
3466 }
3467
3468 # This becomes the new maximum.
3469 $max{$addr} = $end;
3470
3471 return;
3472 }
3473 #local $to_trace = 0 if main::DEBUG;
3474
3475 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3476
3477 # Here, the input range isn't after the whole rest of the range list.
3478 # Most likely 'splice' will be needed. The rest of the routine finds
3479 # the needed splice parameters, and if necessary, does the splice.
3480 # First, find the offset parameter needed by the splice function for
3481 # the input range. Note that the input range may span multiple
3482 # existing ones, but we'll worry about that later. For now, just find
3483 # the beginning. If the input range is to be inserted starting in a
3484 # position not currently in the range list, it must (obviously) come
3485 # just after the range below it, and just before the range above it.
3486 # Slightly less obviously, it will occupy the position currently
3487 # occupied by the range that is to come after it. More formally, we
3488 # are looking for the position, $i, in the array of ranges, such that:
3489 #
3490 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3491 #
3492 # (The ordered relationships within existing ranges are also shown in
3493 # the equation above). However, if the start of the input range is
3494 # within an existing range, the splice offset should point to that
3495 # existing range's position in the list; that is $i satisfies a
3496 # somewhat different equation, namely:
3497 #
3498 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3499 #
3500 # More briefly, $start can come before or after r[$i]->start, and at
3501 # this point, we don't know which it will be. However, these
3502 # two equations share these constraints:
3503 #
3504 # r[$i-1]->end < $start <= r[$i]->end
3505 #
3506 # And that is good enough to find $i.
3507
3508 my $i = $self->_search_ranges($start);
3509 if (! defined $i) {
3510 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3511 return;
3512 }
3513
3514 # The search function returns $i such that:
3515 #
3516 # r[$i-1]->end < $start <= r[$i]->end
3517 #
3518 # That means that $i points to the first range in the range list
3519 # that could possibly be affected by this operation. We still don't
3520 # know if the start of the input range is within r[$i], or if it
3521 # points to empty space between r[$i-1] and r[$i].
3522 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3523
3524 # Special case the insertion of data that is not to replace any
3525 # existing data.
3526 if ($replace == $NO) { # If $NO, has to be operation '+'
3527 #local $to_trace = 1 if main::DEBUG;
3528 trace "Doesn't replace" if main::DEBUG && $to_trace;
3529
3530 # Here, the new range is to take effect only on those code points
3531 # that aren't already in an existing range. This can be done by
3532 # looking through the existing range list and finding the gaps in
3533 # the ranges that this new range affects, and then calling this
3534 # function recursively on each of those gaps, leaving untouched
3535 # anything already in the list. Gather up a list of the changed
3536 # gaps first so that changes to the internal state as new ranges
3537 # are added won't be a problem.
3538 my @gap_list;
3539
3540 # First, if the starting point of the input range is outside an
3541 # existing one, there is a gap from there to the beginning of the
3542 # existing range -- add a span to fill the part that this new
3543 # range occupies
3544 if ($start < $r->[$i]->start) {
3545 push @gap_list, Range->new($start,
3546 main::min($end,
3547 $r->[$i]->start - 1),
3548 Type => $type);
3549 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3550 }
3551
3552 # Then look through the range list for other gaps until we reach
3553 # the highest range affected by the input one.
3554 my $j;
3555 for ($j = $i+1; $j < $range_list_size; $j++) {
3556 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3557 last if $end < $r->[$j]->start;
3558
3559 # If there is a gap between when this range starts and the
3560 # previous one ends, add a span to fill it. Note that just
3561 # because there are two ranges doesn't mean there is a
3562 # non-zero gap between them. It could be that they have
3563 # different values or types
3564 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3565 push @gap_list,
3566 Range->new($r->[$j-1]->end + 1,
3567 $r->[$j]->start - 1,
3568 Type => $type);
3569 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3570 }
3571 }
3572
3573 # Here, we have either found an existing range in the range list,
3574 # beyond the area affected by the input one, or we fell off the
3575 # end of the loop because the input range affects the whole rest
3576 # of the range list. In either case, $j is 1 higher than the
3577 # highest affected range. If $j == $i, it means that there are no
3578 # affected ranges, that the entire insertion is in the gap between
3579 # r[$i-1], and r[$i], which we already have taken care of before
3580 # the loop.
3581 # On the other hand, if there are affected ranges, it might be
3582 # that there is a gap that needs filling after the final such
3583 # range to the end of the input range
3584 if ($r->[$j-1]->end < $end) {
3585 push @gap_list, Range->new(main::max($start,
3586 $r->[$j-1]->end + 1),
3587 $end,
3588 Type => $type);
3589 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3590 }
3591
3592 # Call recursively to fill in all the gaps.
3593 foreach my $gap (@gap_list) {
3594 $self->_add_delete($operation,
3595 $gap->start,
3596 $gap->end,
3597 $value,
3598 Type => $type);
3599 }
3600
3601 return;
3602 }
3603
53d84487
KW
3604 # Here, we have taken care of the case where $replace is $NO.
3605 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3606 # If inserting a multiple record, this is where it goes, before the
7f4b1e25
KW
3607 # first (if any) existing one if inserting LIFO. (If this is to go
3608 # afterwards, FIFO, we below move the pointer to there.) These imply
3609 # an insertion, and no change to any existing ranges. Note that $i
3610 # can be -1 if this new range doesn't actually duplicate any existing,
3611 # and comes at the beginning of the list.
3612 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
53d84487
KW
3613
3614 if ($start != $end) {
3615 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3616 return;
3617 }
3618
19155fcc 3619 # If the new code point is within a current range ...
53d84487 3620 if ($end >= $r->[$i]->start) {
19155fcc
KW
3621
3622 # Don't add an exact duplicate, as it isn't really a multiple
1f6798c4
KW
3623 my $existing_value = $r->[$i]->value;
3624 my $existing_type = $r->[$i]->type;
3625 return if $value eq $existing_value && $type eq $existing_type;
3626
3627 # If the multiple value is part of an existing range, we want
3628 # to split up that range, so that only the single code point
3629 # is affected. To do this, we first call ourselves
3630 # recursively to delete that code point from the table, having
3631 # preserved its current data above. Then we call ourselves
3632 # recursively again to add the new multiple, which we know by
3633 # the test just above is different than the current code
3634 # point's value, so it will become a range containing a single
3635 # code point: just itself. Finally, we add back in the
3636 # pre-existing code point, which will again be a single code
3637 # point range. Because 'i' likely will have changed as a
3638 # result of these operations, we can't just continue on, but
7f4b1e25
KW
3639 # do this operation recursively as well. If we are inserting
3640 # LIFO, the pre-existing code point needs to go after the new
3641 # one, so use MULTIPLE_AFTER; and vice versa.
53d84487 3642 if ($r->[$i]->start != $r->[$i]->end) {
1f6798c4
KW
3643 $self->_add_delete('-', $start, $end, "");
3644 $self->_add_delete('+', $start, $end, $value, Type => $type);
7f4b1e25
KW
3645 return $self->_add_delete('+',
3646 $start, $end,
3647 $existing_value,
3648 Type => $existing_type,
3649 Replace => ($replace == $MULTIPLE_BEFORE)
3650 ? $MULTIPLE_AFTER
3651 : $MULTIPLE_BEFORE);
3652 }
3653 }
3654
3655 # If to place this new record after, move to beyond all existing
3656 # ones.
3657 if ($replace == $MULTIPLE_AFTER) {
3658 while ($i < @$r && $r->[$i]->start == $start) {
3659 $i++;
53d84487 3660 }
53d84487
KW
3661 }
3662
3663 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3664 my @return = splice @$r,
3665 $i,
3666 0,
3667 Range->new($start,
3668 $end,
3669 Value => $value,
3670 Type => $type);
3671 if (main::DEBUG && $to_trace) {
3672 trace "After splice:";
3673 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3674 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3675 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3676 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3677 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3678 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3679 }
3680 return @return;
3681 }
3682
7f4b1e25
KW
3683 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
3684 # leaves delete, insert, and replace either unconditionally or if not
53d84487
KW
3685 # equivalent. $i still points to the first potential affected range.
3686 # Now find the highest range affected, which will determine the length
3687 # parameter to splice. (The input range can span multiple existing
3688 # ones.) If this isn't a deletion, while we are looking through the
3689 # range list, see also if this is a replacement rather than a clean
3690 # insertion; that is if it will change the values of at least one
3691 # existing range. Start off assuming it is an insert, until find it
3692 # isn't.
3693 my $clean_insert = $operation eq '+';
99870f4d
KW
3694 my $j; # This will point to the highest affected range
3695
3696 # For non-zero types, the standard form is the value itself;
3697 my $standard_form = ($type) ? $value : main::standardize($value);
3698
3699 for ($j = $i; $j < $range_list_size; $j++) {
3700 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3701
3702 # If find a range that it doesn't overlap into, we can stop
3703 # searching
3704 last if $end < $r->[$j]->start;
3705
969a34cc
KW
3706 # Here, overlaps the range at $j. If the values don't match,
3707 # and so far we think this is a clean insertion, it becomes a
3708 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3709 if ($clean_insert) {
99870f4d 3710 if ($r->[$j]->standard_form ne $standard_form) {
969a34cc 3711 $clean_insert = 0;
56343c78
KW
3712 if ($replace == $CROAK) {
3713 main::croak("The range to add "
3714 . sprintf("%04X", $start)
3715 . '-'
3716 . sprintf("%04X", $end)
3717 . " with value '$value' overlaps an existing range $r->[$j]");
3718 }
99870f4d
KW
3719 }
3720 else {
3721
3722 # Here, the two values are essentially the same. If the
3723 # two are actually identical, replacing wouldn't change
3724 # anything so skip it.
3725 my $pre_existing = $r->[$j]->value;
3726 if ($pre_existing ne $value) {
3727
3728 # Here the new and old standardized values are the
3729 # same, but the non-standardized values aren't. If
3730 # replacing unconditionally, then replace
3731 if( $replace == $UNCONDITIONALLY) {
969a34cc 3732 $clean_insert = 0;
99870f4d
KW
3733 }
3734 else {
3735
3736 # Here, are replacing conditionally. Decide to
3737 # replace or not based on which appears to look
3738 # the "nicest". If one is mixed case and the
3739 # other isn't, choose the mixed case one.
3740 my $new_mixed = $value =~ /[A-Z]/
3741 && $value =~ /[a-z]/;
3742 my $old_mixed = $pre_existing =~ /[A-Z]/
3743 && $pre_existing =~ /[a-z]/;
3744
3745 if ($old_mixed != $new_mixed) {
969a34cc 3746 $clean_insert = 0 if $new_mixed;
99870f4d 3747 if (main::DEBUG && $to_trace) {
969a34cc
KW
3748 if ($clean_insert) {
3749 trace "Retaining $pre_existing over $value";
99870f4d
KW
3750 }
3751 else {
969a34cc 3752 trace "Replacing $pre_existing with $value";
99870f4d
KW
3753 }
3754 }
3755 }
3756 else {
3757
3758 # Here casing wasn't different between the two.
3759 # If one has hyphens or underscores and the
3760 # other doesn't, choose the one with the
3761 # punctuation.
3762 my $new_punct = $value =~ /[-_]/;
3763 my $old_punct = $pre_existing =~ /[-_]/;
3764
3765 if ($old_punct != $new_punct) {
969a34cc 3766 $clean_insert = 0 if $new_punct;
99870f4d 3767 if (main::DEBUG && $to_trace) {
969a34cc
KW
3768 if ($clean_insert) {
3769 trace "Retaining $pre_existing over $value";
99870f4d
KW
3770 }
3771 else {
969a34cc 3772 trace "Replacing $pre_existing with $value";
99870f4d
KW
3773 }
3774 }
3775 } # else existing one is just as "good";
3776 # retain it to save cycles.
3777 }
3778 }
3779 }
3780 }
3781 }
3782 } # End of loop looking for highest affected range.
3783
3784 # Here, $j points to one beyond the highest range that this insertion
3785 # affects (hence to beyond the range list if that range is the final
3786 # one in the range list).
3787
3788 # The splice length is all the affected ranges. Get it before
3789 # subtracting, for efficiency, so we don't have to later add 1.
3790 my $length = $j - $i;
3791
3792 $j--; # $j now points to the highest affected range.
3793 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3794
7f4b1e25 3795 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
99870f4d
KW
3796 # $j points to the highest affected range. But it can be < $i or even
3797 # -1. These happen only if the insertion is entirely in the gap
3798 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3799 # above exited first time through with $end < $r->[$i]->start. (And
3800 # then we subtracted one from j) This implies also that $start <
3801 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3802 # $start, so the entire input range is in the gap.
3803 if ($j < $i) {
3804
3805 # Here the entire input range is in the gap before $i.
3806
3807 if (main::DEBUG && $to_trace) {
3808 if ($i) {
3809 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3810 }
3811 else {
3812 trace "Entire range is before $r->[$i]";
3813 }
3814 }
3815 return if $operation ne '+'; # Deletion of a non-existent range is
3816 # a no-op
3817 }
3818 else {
3819
969a34cc
KW
3820 # Here part of the input range is not in the gap before $i. Thus,
3821 # there is at least one affected one, and $j points to the highest
3822 # such one.
99870f4d
KW
3823
3824 # At this point, here is the situation:
3825 # This is not an insertion of a multiple, nor of tentative ($NO)
3826 # data.
3827 # $i points to the first element in the current range list that
3828 # may be affected by this operation. In fact, we know
3829 # that the range at $i is affected because we are in
3830 # the else branch of this 'if'
3831 # $j points to the highest affected range.
3832 # In other words,
3833 # r[$i-1]->end < $start <= r[$i]->end
3834 # And:
3835 # r[$i-1]->end < $start <= $end <= r[$j]->end
3836 #
3837 # Also:
969a34cc
KW
3838 # $clean_insert is a boolean which is set true if and only if
3839 # this is a "clean insertion", i.e., not a change nor a
3840 # deletion (multiple was handled above).
99870f4d
KW
3841
3842 # We now have enough information to decide if this call is a no-op
969a34cc
KW
3843 # or not. It is a no-op if this is an insertion of already
3844 # existing data.
99870f4d 3845
969a34cc 3846 if (main::DEBUG && $to_trace && $clean_insert
99870f4d
KW
3847 && $i == $j
3848 && $start >= $r->[$i]->start)
3849 {
3850 trace "no-op";
3851 }
969a34cc 3852 return if $clean_insert
99870f4d
KW
3853 && $i == $j # more than one affected range => not no-op
3854
3855 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3856 # Further, $start and/or $end is >= r[$i]->start
3857 # The test below hence guarantees that
3858 # r[$i]->start < $start <= $end <= r[$i]->end
3859 # This means the input range is contained entirely in
3860 # the one at $i, so is a no-op
3861 && $start >= $r->[$i]->start;
3862 }
3863
3864 # Here, we know that some action will have to be taken. We have
3865 # calculated the offset and length (though adjustments may be needed)
3866 # for the splice. Now start constructing the replacement list.
3867 my @replacement;
3868 my $splice_start = $i;
3869
3870 my $extends_below;
3871 my $extends_above;
3872
3873 # See if should extend any adjacent ranges.
3874 if ($operation eq '-') { # Don't extend deletions
3875 $extends_below = $extends_above = 0;
3876 }
3877 else { # Here, should extend any adjacent ranges. See if there are
3878 # any.
3879 $extends_below = ($i > 0
3880 # can't extend unless adjacent
3881 && $r->[$i-1]->end == $start -1
3882 # can't extend unless are same standard value
3883 && $r->[$i-1]->standard_form eq $standard_form
3884 # can't extend unless share type
3885 && $r->[$i-1]->type == $type);
3886 $extends_above = ($j+1 < $range_list_size
3887 && $r->[$j+1]->start == $end +1
3888 && $r->[$j+1]->standard_form eq $standard_form
23822bda 3889 && $r->[$j+1]->type == $type);
99870f4d
KW
3890 }
3891 if ($extends_below && $extends_above) { # Adds to both
3892 $splice_start--; # start replace at element below
3893 $length += 2; # will replace on both sides
3894 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3895
3896 # The result will fill in any gap, replacing both sides, and
3897 # create one large range.
3898 @replacement = Range->new($r->[$i-1]->start,
3899 $r->[$j+1]->end,
3900 Value => $value,
3901 Type => $type);
3902 }
3903 else {
3904
3905 # Here we know that the result won't just be the conglomeration of
3906 # a new range with both its adjacent neighbors. But it could
3907 # extend one of them.
3908
3909 if ($extends_below) {
3910
3911 # Here the new element adds to the one below, but not to the
3912 # one above. If inserting, and only to that one range, can
3913 # just change its ending to include the new one.
969a34cc 3914 if ($length == 0 && $clean_insert) {
99870f4d
KW
3915 $r->[$i-1]->set_end($end);
3916 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3917 return;
3918 }
3919 else {
3920 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3921 $splice_start--; # start replace at element below
3922 $length++; # will replace the element below
3923 $start = $r->[$i-1]->start;
3924 }
3925 }
3926 elsif ($extends_above) {
3927
3928 # Here the new element adds to the one above, but not below.
3929 # Mirror the code above
969a34cc 3930 if ($length == 0 && $clean_insert) {
99870f4d
KW
3931 $r->[$j+1]->set_start($start);
3932 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3933 return;
3934 }
3935 else {
3936 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3937 $length++; # will replace the element above
3938 $end = $r->[$j+1]->end;
3939 }
3940 }
3941
3942 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3943
3944 # Finally, here we know there will have to be a splice.
3945 # If the change or delete affects only the highest portion of the
3946 # first affected range, the range will have to be split. The
3947 # splice will remove the whole range, but will replace it by a new
3948 # range containing just the unaffected part. So, in this case,
3949 # add to the replacement list just this unaffected portion.
3950 if (! $extends_below
3951 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3952 {
3953 push @replacement,
3954 Range->new($r->[$i]->start,
3955 $start - 1,
3956 Value => $r->[$i]->value,
3957 Type => $r->[$i]->type);
3958 }
3959
3960 # In the case of an insert or change, but not a delete, we have to
3961 # put in the new stuff; this comes next.
3962 if ($operation eq '+') {
3963 push @replacement, Range->new($start,
3964 $end,
3965 Value => $value,
3966 Type => $type);
3967 }
3968
3969 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3970 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3971
3972 # And finally, if we're changing or deleting only a portion of the
3973 # highest affected range, it must be split, as the lowest one was.
3974 if (! $extends_above
3975 && $j >= 0 # Remember that j can be -1 if before first
3976 # current element
3977 && $end >= $r->[$j]->start
3978 && $end < $r->[$j]->end)
3979 {
3980 push @replacement,
3981 Range->new($end + 1,
3982 $r->[$j]->end,
3983 Value => $r->[$j]->value,
3984 Type => $r->[$j]->type);
3985 }
3986 }
3987
3988 # And do the splice, as calculated above
3989 if (main::DEBUG && $to_trace) {
3990 trace "replacing $length element(s) at $i with ";
3991 foreach my $replacement (@replacement) {
3992 trace " $replacement";
3993 }
3994 trace "Before splice:";
3995 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3996 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3997 trace "i =[", $i, "]", $r->[$i];
3998 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3999 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4000 }
4001
4002 my @return = splice @$r, $splice_start, $length, @replacement;
4003
4004 if (main::DEBUG && $to_trace) {
4005 trace "After splice:";
4006 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4007 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4008 trace "i =[", $i, "]", $r->[$i];
4009 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4010 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
e6451557 4011 trace "removed ", @return if @return;
99870f4d
KW
4012 }
4013
4014 # An actual deletion could have changed the maximum in the list.
4015 # There was no deletion if the splice didn't return something, but
4016 # otherwise recalculate it. This is done too rarely to worry about
4017 # performance.
4018 if ($operation eq '-' && @return) {
4019 $max{$addr} = $r->[-1]->end;
4020 }
4021 return @return;
4022 }
4023
4024 sub reset_each_range { # reset the iterator for each_range();
4025 my $self = shift;
4026 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4027
f998e60c 4028 no overloading;
051df77b 4029 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
4030 return;
4031 }
4032
4033 sub each_range {
4034 # Iterate over each range in a range list. Results are undefined if
4035 # the range list is changed during the iteration.
4036
4037 my $self = shift;
4038 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4039
ffe43484 4040 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4041
4042 return if $self->is_empty;
4043
4044 $each_range_iterator{$addr} = -1
4045 if ! defined $each_range_iterator{$addr};
4046 $each_range_iterator{$addr}++;
4047 return $ranges{$addr}->[$each_range_iterator{$addr}]
4048 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4049 undef $each_range_iterator{$addr};
4050 return;
4051 }
4052
4053 sub count { # Returns count of code points in range list
4054 my $self = shift;
4055 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4056
ffe43484 4057 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4058
4059 my $count = 0;
4060 foreach my $range (@{$ranges{$addr}}) {
4061 $count += $range->end - $range->start + 1;
4062 }
4063 return $count;
4064 }
4065
4066 sub delete_range { # Delete a range
4067 my $self = shift;
4068 my $start = shift;
4069 my $end = shift;
4070
4071 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4072
4073 return $self->_add_delete('-', $start, $end, "");
4074 }
4075
4076 sub is_empty { # Returns boolean as to if a range list is empty
4077 my $self = shift;
4078 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4079
f998e60c 4080 no overloading;
051df77b 4081 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
4082 }
4083
4084 sub hash {
4085 # Quickly returns a scalar suitable for separating tables into
4086 # buckets, i.e. it is a hash function of the contents of a table, so
4087 # there are relatively few conflicts.
4088
4089 my $self = shift;
4090 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4091
ffe43484 4092 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4093
4094 # These are quickly computable. Return looks like 'min..max;count'
4095 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4096 }
4097} # End closure for _Range_List_Base
4098
4099package Range_List;
4100use base '_Range_List_Base';
4101
4102# A Range_List is a range list for match tables; i.e. the range values are
4103# not significant. Thus a number of operations can be safely added to it,
4104# such as inversion, intersection. Note that union is also an unsafe
4105# operation when range values are cared about, and that method is in the base
4106# class, not here. But things are set up so that that method is callable only
4107# during initialization. Only in this derived class, is there an operation
4108# that combines two tables. A Range_Map can thus be used to initialize a
4109# Range_List, and its mappings will be in the list, but are not significant to
4110# this class.
4111
4112sub trace { return main::trace(@_); }
4113
4114{ # Closure
4115
4116 use overload
4117 fallback => 0,
4118 '+' => sub { my $self = shift;
4119 my $other = shift;
4120
4121 return $self->_union($other)
4122 },
4123 '&' => sub { my $self = shift;
4124 my $other = shift;
4125
4126 return $self->_intersect($other, 0);
4127 },
4128 '~' => "_invert",
4129 '-' => "_subtract",
4130 ;
4131
4132 sub _invert {
4133 # Returns a new Range_List that gives all code points not in $self.
4134
4135 my $self = shift;
4136
4137 my $new = Range_List->new;
4138
4139 # Go through each range in the table, finding the gaps between them
4140 my $max = -1; # Set so no gap before range beginning at 0
4141 for my $range ($self->ranges) {
4142 my $start = $range->start;
4143 my $end = $range->end;
4144
4145 # If there is a gap before this range, the inverse will contain
4146 # that gap.
4147 if ($start > $max + 1) {
4148 $new->add_range($max + 1, $start - 1);
4149 }
4150 $max = $end;
4151 }
4152
4153 # And finally, add the gap from the end of the table to the max
4154 # possible code point
6189eadc
KW
4155 if ($max < $MAX_UNICODE_CODEPOINT) {
4156 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
99870f4d
KW
4157 }
4158 return $new;
4159 }
4160
4161 sub _subtract {
4162 # Returns a new Range_List with the argument deleted from it. The
4163 # argument can be a single code point, a range, or something that has
4164 # a range, with the _range_list() method on it returning them
4165
4166 my $self = shift;
4167 my $other = shift;
4168 my $reversed = shift;
4169 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4170
4171 if ($reversed) {
4172 Carp::my_carp_bug("Can't cope with a "
4173 . __PACKAGE__
4174 . " being the second parameter in a '-'. Subtraction ignored.");
4175 return $self;
4176 }
4177
4178 my $new = Range_List->new(Initialize => $self);
4179
4180 if (! ref $other) { # Single code point
4181 $new->delete_range($other, $other);
4182 }
4183 elsif ($other->isa('Range')) {
4184 $new->delete_range($other->start, $other->end);
4185 }
4186 elsif ($other->can('_range_list')) {
4187 foreach my $range ($other->_range_list->ranges) {
4188 $new->delete_range($range->start, $range->end);
4189 }
4190 }
4191 else {
4192 Carp::my_carp_bug("Can't cope with a "
4193 . ref($other)
4194 . " argument to '-'. Subtraction ignored."
4195 );
4196 return $self;
4197 }
4198
4199 return $new;
4200 }
4201
4202 sub _intersect {
4203 # Returns either a boolean giving whether the two inputs' range lists
4204 # intersect (overlap), or a new Range_List containing the intersection
4205 # of the two lists. The optional final parameter being true indicates
4206 # to do the check instead of the intersection.
4207
4208 my $a_object = shift;
4209 my $b_object = shift;
4210 my $check_if_overlapping = shift;
4211 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4212 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4213
4214 if (! defined $b_object) {
4215 my $message = "";
4216 $message .= $a_object->_owner_name_of if defined $a_object;
4217 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4218 return;
4219 }
4220
4221 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4222 # Thus the intersection could be much more simply be written:
4223 # return ~(~$a_object + ~$b_object);
4224 # But, this is slower, and when taking the inverse of a large
4225 # range_size_1 table, back when such tables were always stored that
4226 # way, it became prohibitively slow, hence the code was changed to the
4227 # below
4228
4229 if ($b_object->isa('Range')) {
4230 $b_object = Range_List->new(Initialize => $b_object,
4231 Owner => $a_object->_owner_name_of);
4232 }
4233 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4234
4235 my @a_ranges = $a_object->ranges;
4236 my @b_ranges = $b_object->ranges;
4237
4238 #local $to_trace = 1 if main::DEBUG;
4239 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4240
4241 # Start with the first range in each list
4242 my $a_i = 0;
4243 my $range_a = $a_ranges[$a_i];
4244 my $b_i = 0;
4245 my $range_b = $b_ranges[$b_i];
4246
4247 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4248 if ! $check_if_overlapping;
4249
4250 # If either list is empty, there is no intersection and no overlap
4251 if (! defined $range_a || ! defined $range_b) {
4252 return $check_if_overlapping ? 0 : $new;
4253 }
4254 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4255
4256 # Otherwise, must calculate the intersection/overlap. Start with the
4257 # very first code point in each list
4258 my $a = $range_a->start;
4259 my $b = $range_b->start;
4260
4261 # Loop through all the ranges of each list; in each iteration, $a and
4262 # $b are the current code points in their respective lists
4263 while (1) {
4264
4265 # If $a and $b are the same code point, ...
4266 if ($a == $b) {
4267
4268 # it means the lists overlap. If just checking for overlap
4269 # know the answer now,
4270 return 1 if $check_if_overlapping;
4271
4272 # The intersection includes this code point plus anything else
4273 # common to both current ranges.
4274 my $start = $a;
4275 my $end = main::min($range_a->end, $range_b->end);
4276 if (! $check_if_overlapping) {
4277 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4278 $new->add_range($start, $end);
4279 }
4280
4281 # Skip ahead to the end of the current intersect
4282 $a = $b = $end;
4283
4284 # If the current intersect ends at the end of either range (as
4285 # it must for at least one of them), the next possible one
4286 # will be the beginning code point in it's list's next range.
4287 if ($a == $range_a->end) {
4288 $range_a = $a_ranges[++$a_i];
4289 last unless defined $range_a;
4290 $a = $range_a->start;
4291 }
4292 if ($b == $range_b->end) {
4293 $range_b = $b_ranges[++$b_i];
4294 last unless defined $range_b;
4295 $b = $range_b->start;
4296 }
4297
4298 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4299 }
4300 elsif ($a < $b) {
4301
4302 # Not equal, but if the range containing $a encompasses $b,
4303 # change $a to be the middle of the range where it does equal
4304 # $b, so the next iteration will get the intersection
4305 if ($range_a->end >= $b) {
4306 $a = $b;
4307 }
4308 else {
4309
4310 # Here, the current range containing $a is entirely below
4311 # $b. Go try to find a range that could contain $b.
4312 $a_i = $a_object->_search_ranges($b);
4313
4314 # If no range found, quit.
4315 last unless defined $a_i;
4316
4317 # The search returns $a_i, such that
4318 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4319 # Set $a to the beginning of this new range, and repeat.
4320 $range_a = $a_ranges[$a_i];
4321 $a = $range_a->start;
4322 }
4323 }
4324 else { # Here, $b < $a.
4325
4326 # Mirror image code to the leg just above
4327 if ($range_b->end >= $a) {
4328 $b = $a;
4329 }
4330 else {
4331 $b_i = $b_object->_search_ranges($a);
4332 last unless defined $b_i;
4333 $range_b = $b_ranges[$b_i];
4334 $b = $range_b->start;
4335 }
4336 }
4337 } # End of looping through ranges.
4338
4339 # Intersection fully computed, or now know that there is no overlap
4340 return $check_if_overlapping ? 0 : $new;
4341 }
4342
4343 sub overlaps {
4344 # Returns boolean giving whether the two arguments overlap somewhere
4345
4346 my $self = shift;
4347 my $other = shift;
4348 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4349
4350 return $self->_intersect($other, 1);
4351 }
4352
4353 sub add_range {
4354 # Add a range to the list.
4355
4356 my $self = shift;
4357 my $start = shift;
4358 my $end = shift;
4359 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4360
4361 return $self->_add_delete('+', $start, $end, "");
4362 }
4363
09aba7e4
KW
4364 sub matches_identically_to {
4365 # Return a boolean as to whether or not two Range_Lists match identical
4366 # sets of code points.
4367
4368 my $self = shift;
4369 my $other = shift;
4370 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4371
4372 # These are ordered in increasing real time to figure out (at least
4373 # until a patch changes that and doesn't change this)
4374 return 0 if $self->max != $other->max;
4375 return 0 if $self->min != $other->min;
4376 return 0 if $self->range_count != $other->range_count;
4377 return 0 if $self->count != $other->count;
4378
4379 # Here they could be identical because all the tests above passed.
4380 # The loop below is somewhat simpler since we know they have the same
4381 # number of elements. Compare range by range, until reach the end or
4382 # find something that differs.
4383 my @a_ranges = $self->ranges;
4384 my @b_ranges = $other->ranges;
4385 for my $i (0 .. @a_ranges - 1) {
4386 my $a = $a_ranges[$i];
4387 my $b = $b_ranges[$i];
4388 trace "self $a; other $b" if main::DEBUG && $to_trace;
c1c2d9e8
KW
4389 return 0 if ! defined $b
4390 || $a->start != $b->start
4391 || $a->end != $b->end;
09aba7e4
KW
4392 }
4393 return 1;
4394 }
4395
99870f4d
KW
4396 sub is_code_point_usable {
4397 # This used only for making the test script. See if the input
4398 # proposed trial code point is one that Perl will handle. If second
4399 # parameter is 0, it won't select some code points for various
4400 # reasons, noted below.
4401
4402 my $code = shift;
4403 my $try_hard = shift;
4404 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4405
4406 return 0 if $code < 0; # Never use a negative
4407
99870f4d
KW
4408 # shun null. I'm (khw) not sure why this was done, but NULL would be
4409 # the character very frequently used.
4410 return $try_hard if $code == 0x0000;
4411
99870f4d
KW
4412 # shun non-character code points.
4413 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4414 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4415
6189eadc 4416 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
99870f4d
KW
4417 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4418
4419 return 1;
4420 }
4421
4422 sub get_valid_code_point {
4423 # Return a code point that's part of the range list. Returns nothing
4424 # if the table is empty or we can't find a suitable code point. This
4425 # used only for making the test script.
4426
4427 my $self = shift;
4428 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4429
ffe43484 4430 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4431
4432 # On first pass, don't choose less desirable code points; if no good
4433 # one is found, repeat, allowing a less desirable one to be selected.
4434 for my $try_hard (0, 1) {
4435
4436 # Look through all the ranges for a usable code point.
4437 for my $set ($self->ranges) {
4438
4439 # Try the edge cases first, starting with the end point of the
4440 # range.
4441 my $end = $set->end;
4442 return $end if is_code_point_usable($end, $try_hard);
4443
4444 # End point didn't, work. Start at the beginning and try
4445 # every one until find one that does work.
4446 for my $trial ($set->start .. $end - 1) {
4447 return $trial if is_code_point_usable($trial, $try_hard);
4448 }
4449 }
4450 }
4451 return (); # If none found, give up.
4452 }
4453
4454 sub get_invalid_code_point {
4455 # Return a code point that's not part of the table. Returns nothing
4456 # if the table covers all code points or a suitable code point can't
4457 # be found. This used only for making the test script.
4458
4459 my $self = shift;
4460 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4461
4462 # Just find a valid code point of the inverse, if any.
4463 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4464 }
4465} # end closure for Range_List
4466
4467package Range_Map;
4468use base '_Range_List_Base';
4469
4470# A Range_Map is a range list in which the range values (called maps) are
4471# significant, and hence shouldn't be manipulated by our other code, which
4472# could be ambiguous or lose things. For example, in taking the union of two
4473# lists, which share code points, but which have differing values, which one
4474# has precedence in the union?
4475# It turns out that these operations aren't really necessary for map tables,
4476# and so this class was created to make sure they aren't accidentally
4477# applied to them.
4478
4479{ # Closure
4480
4481 sub add_map {
4482 # Add a range containing a mapping value to the list
4483
4484 my $self = shift;
4485 # Rest of parameters passed on
4486
4487 return $self->_add_delete('+', @_);
4488 }
4489
4490 sub add_duplicate {
4491 # Adds entry to a range list which can duplicate an existing entry
4492
4493 my $self = shift;
4494 my $code_point = shift;
4495 my $value = shift;
7f4b1e25
KW
4496 my %args = @_;
4497 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4498 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
99870f4d
KW
4499
4500 return $self->add_map($code_point, $code_point,
7f4b1e25 4501 $value, Replace => $replace);
99870f4d
KW
4502 }
4503} # End of closure for package Range_Map
4504
4505package _Base_Table;
4506
4507# A table is the basic data structure that gets written out into a file for
4508# use by the Perl core. This is the abstract base class implementing the
4509# common elements from the derived ones. A list of the methods to be
4510# furnished by an implementing class is just after the constructor.
4511
4512sub standardize { return main::standardize($_[0]); }
4513sub trace { return main::trace(@_); }
4514
4515{ # Closure
4516
4517 main::setup_package();
4518
4519 my %range_list;
4520 # Object containing the ranges of the table.
4521 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4522
4523 my %full_name;
4524 # The full table name.
4525 main::set_access('full_name', \%full_name, 'r');
4526
4527 my %name;
4528 # The table name, almost always shorter
4529 main::set_access('name', \%name, 'r');
4530
4531 my %short_name;
4532 # The shortest of all the aliases for this table, with underscores removed
4533 main::set_access('short_name', \%short_name);
4534
4535 my %nominal_short_name_length;
4536 # The length of short_name before removing underscores
4537 main::set_access('nominal_short_name_length',
4538 \%nominal_short_name_length);
4539
23e33b60
KW
4540 my %complete_name;
4541 # The complete name, including property.
4542 main::set_access('complete_name', \%complete_name, 'r');
4543
99870f4d
KW
4544 my %property;
4545 # Parent property this table is attached to.
4546 main::set_access('property', \%property, 'r');
4547
4548 my %aliases;
c12f2655
KW
4549 # Ordered list of alias objects of the table's name. The first ones in
4550 # the list are output first in comments
99870f4d
KW
4551 main::set_access('aliases', \%aliases, 'readable_array');
4552
4553 my %comment;
4554 # A comment associated with the table for human readers of the files
4555 main::set_access('comment', \%comment, 's');
4556
4557 my %description;
4558 # A comment giving a short description of the table's meaning for human
4559 # readers of the files.
4560 main::set_access('description', \%description, 'readable_array');
4561
4562 my %note;
4563 # A comment giving a short note about the table for human readers of the
4564 # files.
4565 main::set_access('note', \%note, 'readable_array');
4566
301ba948
KW
4567 my %fate;
4568 # Enum; there are a number of possibilities for what happens to this
4569 # table: it could be normal, or suppressed, or not for external use. See
4570 # values at definition for $SUPPRESSED.
4571 main::set_access('fate', \%fate, 'r');
99870f4d
KW
4572
4573 my %find_table_from_alias;
4574 # The parent property passes this pointer to a hash which this class adds
4575 # all its aliases to, so that the parent can quickly take an alias and
4576 # find this table.
4577 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4578
4579 my %locked;
4580 # After this table is made equivalent to another one; we shouldn't go
4581 # changing the contents because that could mean it's no longer equivalent
4582 main::set_access('locked', \%locked, 'r');
4583
4584 my %file_path;
4585 # This gives the final path to the file containing the table. Each
4586 # directory in the path is an element in the array
4587 main::set_access('file_path', \%file_path, 'readable_array');
4588
4589 my %status;
4590 # What is the table's status, normal, $OBSOLETE, etc. Enum
4591 main::set_access('status', \%status, 'r');
4592
4593 my %status_info;
4594 # A comment about its being obsolete, or whatever non normal status it has
4595 main::set_access('status_info', \%status_info, 'r');
4596
d867ccfb
KW
4597 my %caseless_equivalent;
4598 # The table this is equivalent to under /i matching, if any.
4599 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4600
99870f4d
KW
4601 my %range_size_1;
4602 # Is the table to be output with each range only a single code point?
4603 # This is done to avoid breaking existing code that may have come to rely
4604 # on this behavior in previous versions of this program.)
4605 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4606
4607 my %perl_extension;
4608 # A boolean set iff this table is a Perl extension to the Unicode
4609 # standard.
4610 main::set_access('perl_extension', \%perl_extension, 'r');
4611
0c07e538
KW
4612 my %output_range_counts;
4613 # A boolean set iff this table is to have comments written in the
4614 # output file that contain the number of code points in the range.
4615 # The constructor can override the global flag of the same name.
4616 main::set_access('output_range_counts', \%output_range_counts, 'r');
4617
f5817e0a
KW
4618 my %format;
4619 # The format of the entries of the table. This is calculated from the
4620 # data in the table (or passed in the constructor). This is an enum e.g.,
26561784
KW
4621 # $STRING_FORMAT. It is marked protected as it should not be generally
4622 # used to override calculations.
f5817e0a
KW
4623 main::set_access('format', \%format, 'r', 'p_s');
4624
99870f4d
KW
4625 sub new {
4626 # All arguments are key => value pairs, which you can see below, most
33e96e72 4627 # of which match fields documented above. Otherwise: Re_Pod_Entry,
0eac1e20 4628 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
99870f4d
KW
4629 # documented in the Alias package
4630
4631 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4632
4633 my $class = shift;
4634
4635 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4636 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4637
4638 my %args = @_;
4639
4640 $name{$addr} = delete $args{'Name'};
4641 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4642 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4643 my $complete_name = $complete_name{$addr}
4644 = delete $args{'Complete_Name'};
f5817e0a 4645 $format{$addr} = delete $args{'Format'};
0c07e538 4646 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4647 $property{$addr} = delete $args{'_Property'};
4648 $range_list{$addr} = delete $args{'_Range_List'};
4649 $status{$addr} = delete $args{'Status'} || $NORMAL;
4650 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4651 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
d867ccfb 4652 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
301ba948 4653 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
fd1e3e84 4654 my $ucd = delete $args{'UCD'};
99870f4d
KW
4655
4656 my $description = delete $args{'Description'};
0eac1e20 4657 my $ok_as_filename = delete $args{'OK_as_Filename'};
99870f4d
KW
4658 my $loose_match = delete $args{'Fuzzy'};
4659 my $note = delete $args{'Note'};
33e96e72 4660 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
37e2e78e 4661 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4662
4663 # Shouldn't have any left over
4664 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4665
4666 # Can't use || above because conceivably the name could be 0, and
4667 # can't use // operator in case this program gets used in Perl 5.8
4668 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4669 $output_range_counts{$addr} = $output_range_counts if
4670 ! defined $output_range_counts{$addr};
99870f4d
KW
4671
4672 $aliases{$addr} = [ ];
4673 $comment{$addr} = [ ];
4674 $description{$addr} = [ ];
4675 $note{$addr} = [ ];
4676 $file_path{$addr} = [ ];
4677 $locked{$addr} = "";
4678
4679 push @{$description{$addr}}, $description if $description;
4680 push @{$note{$addr}}, $note if $note;
4681
301ba948 4682 if ($fate{$addr} == $PLACEHOLDER) {
37e2e78e
KW
4683
4684 # A placeholder table doesn't get documented, is a perl extension,
4685 # and quite likely will be empty
33e96e72 4686 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
37e2e78e 4687 $perl_extension = 1 if ! defined $perl_extension;
fd1e3e84 4688 $ucd = 0 if ! defined $ucd;
37e2e78e 4689 push @tables_that_may_be_empty, $complete_name{$addr};
301ba948
KW
4690 $self->add_comment(<<END);
4691This is a placeholder because it is not in Version $string_version of Unicode,
4692but is needed by the Perl core to work gracefully. Because it is not in this
4693version of Unicode, it will not be listed in $pod_file.pod
4694END
37e2e78e 4695 }
301ba948 4696 elsif (exists $why_suppressed{$complete_name}
98dc9551 4697 # Don't suppress if overridden
ec11e5f4
KW
4698 && ! grep { $_ eq $complete_name{$addr} }
4699 @output_mapped_properties)
301ba948
KW
4700 {
4701 $fate{$addr} = $SUPPRESSED;
4702 }
4703 elsif ($fate{$addr} == $SUPPRESSED
4704 && ! exists $why_suppressed{$property{$addr}->complete_name})
4705 {
4706 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4707 # perhaps Fate => [ $SUPPRESSED, "reason" ]
4708 }
4709
4710 # If hasn't set its status already, see if it is on one of the
4711 # lists of properties or tables that have particular statuses; if
4712 # not, is normal. The lists are prioritized so the most serious
4713 # ones are checked first
4714 if (! $status{$addr}) {
4715 if (exists $why_deprecated{$complete_name}) {
99870f4d
KW
4716 $status{$addr} = $DEPRECATED;
4717 }
4718 elsif (exists $why_stabilized{$complete_name}) {
4719 $status{$addr} = $STABILIZED;
4720 }
4721 elsif (exists $why_obsolete{$complete_name}) {
4722 $status{$addr} = $OBSOLETE;
4723 }
4724
4725 # Existence above doesn't necessarily mean there is a message
4726 # associated with it. Use the most serious message.
4727 if ($status{$addr}) {
301ba948 4728 if ($why_deprecated{$complete_name}) {
99870f4d
KW
4729 $status_info{$addr}
4730 = $why_deprecated{$complete_name};
4731 }
4732 elsif ($why_stabilized{$complete_name}) {
4733 $status_info{$addr}
4734 = $why_stabilized{$complete_name};
4735 }
4736 elsif ($why_obsolete{$complete_name}) {
4737 $status_info{$addr}
4738 = $why_obsolete{$complete_name};
4739 }
4740 }
4741 }
4742
37e2e78e
KW
4743 $perl_extension{$addr} = $perl_extension || 0;
4744
8050d00f 4745 # Don't list a property by default that is internal only
395dfc19 4746 if ($fate{$addr} > $MAP_PROXIED) {
301ba948 4747 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
fd1e3e84
KW
4748 $ucd = 0 if ! defined $ucd;
4749 }
4750 else {
4751 $ucd = 1 if ! defined $ucd;
301ba948 4752 }
8050d00f 4753
99870f4d
KW
4754 # By convention what typically gets printed only or first is what's
4755 # first in the list, so put the full name there for good output
4756 # clarity. Other routines rely on the full name being first on the
4757 # list
4758 $self->add_alias($full_name{$addr},
0eac1e20 4759 OK_as_Filename => $ok_as_filename,
99870f4d 4760 Fuzzy => $loose_match,
33e96e72 4761 Re_Pod_Entry => $make_re_pod_entry,
99870f4d 4762 Status => $status{$addr},
fd1e3e84 4763 UCD => $ucd,
99870f4d
KW
4764 );
4765
4766 # Then comes the other name, if meaningfully different.
4767 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4768 $self->add_alias($name{$addr},
0eac1e20 4769 OK_as_Filename => $ok_as_filename,
99870f4d 4770 Fuzzy => $loose_match,
33e96e72 4771 Re_Pod_Entry => $make_re_pod_entry,
99870f4d 4772 Status => $status{$addr},
fd1e3e84 4773 UCD => $ucd,
99870f4d
KW
4774 );
4775 }
4776
4777 return $self;
4778 }
4779
4780 # Here are the methods that are required to be defined by any derived
4781 # class
ea25a9b2 4782 for my $sub (qw(
668b3bfc 4783 handle_special_range
99870f4d 4784 append_to_body
99870f4d 4785 pre_body
ea25a9b2 4786 ))
668b3bfc
KW
4787 # write() knows how to write out normal ranges, but it calls
4788 # handle_special_range() when it encounters a non-normal one.
4789 # append_to_body() is called by it after it has handled all
4790 # ranges to add anything after the main portion of the table.
4791 # And finally, pre_body() is called after all this to build up
4792 # anything that should appear before the main portion of the
4793 # table. Doing it this way allows things in the middle to
4794 # affect what should appear before the main portion of the
99870f4d 4795 # table.
99870f4d
KW
4796 {
4797 no strict "refs";
4798 *$sub = sub {
4799 Carp::my_carp_bug( __LINE__
4800 . ": Must create method '$sub()' for "
4801 . ref shift);
4802 return;
4803 }
4804 }
4805
4806 use overload
4807 fallback => 0,
4808 "." => \&main::_operator_dot,
4809 '!=' => \&main::_operator_not_equal,
4810 '==' => \&main::_operator_equal,
4811 ;
4812
4813 sub ranges {
4814 # Returns the array of ranges associated with this table.
4815
f998e60c 4816 no overloading;
051df77b 4817 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4818 }
4819
4820 sub add_alias {
4821 # Add a synonym for this table.
4822
4823 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4824
4825 my $self = shift;
4826 my $name = shift; # The name to add.
4827 my $pointer = shift; # What the alias hash should point to. For
4828 # map tables, this is the parent property;
4829 # for match tables, it is the table itself.
4830
4831 my %args = @_;
4832 my $loose_match = delete $args{'Fuzzy'};
4833
33e96e72
KW
4834 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4835 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
99870f4d 4836
0eac1e20
KW
4837 my $ok_as_filename = delete $args{'OK_as_Filename'};
4838 $ok_as_filename = 1 unless defined $ok_as_filename;
99870f4d
KW
4839
4840 my $status = delete $args{'Status'};
4841 $status = $NORMAL unless defined $status;
4842
fd1e3e84
KW
4843 my $ucd = delete $args{'UCD'} // 1;
4844
99870f4d
KW
4845 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4846
4847 # Capitalize the first letter of the alias unless it is one of the CJK
4848 # ones which specifically begins with a lower 'k'. Do this because
4849 # Unicode has varied whether they capitalize first letters or not, and
4850 # have later changed their minds and capitalized them, but not the
4851 # other way around. So do it always and avoid changes from release to
4852 # release
4853 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4854
ffe43484 4855 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4856
4857 # Figure out if should be loosely matched if not already specified.
4858 if (! defined $loose_match) {
4859
4860 # Is a loose_match if isn't null, and doesn't begin with an
4861 # underscore and isn't just a number
4862 if ($name ne ""
4863 && substr($name, 0, 1) ne '_'
4864 && $name !~ qr{^[0-9_.+-/]+$})
4865 {
4866 $loose_match = 1;
4867 }
4868 else {
4869 $loose_match = 0;
4870 }
4871 }
4872
4873 # If this alias has already been defined, do nothing.
4874 return if defined $find_table_from_alias{$addr}->{$name};
4875
4876 # That includes if it is standardly equivalent to an existing alias,
4877 # in which case, add this name to the list, so won't have to search
4878 # for it again.
4879 my $standard_name = main::standardize($name);
4880 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4881 $find_table_from_alias{$addr}->{$name}
4882 = $find_table_from_alias{$addr}->{$standard_name};
4883 return;
4884 }
4885
4886 # Set the index hash for this alias for future quick reference.
4887 $find_table_from_alias{$addr}->{$name} = $pointer;
4888 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4889 local $to_trace = 0 if main::DEBUG;
4890 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4891 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4892
4893
4894 # Put the new alias at the end of the list of aliases unless the final
4895 # element begins with an underscore (meaning it is for internal perl
4896 # use) or is all numeric, in which case, put the new one before that
4897 # one. This floats any all-numeric or underscore-beginning aliases to
4898 # the end. This is done so that they are listed last in output lists,
4899 # to encourage the user to use a better name (either more descriptive
4900 # or not an internal-only one) instead. This ordering is relied on
4901 # implicitly elsewhere in this program, like in short_name()
4902 my $list = $aliases{$addr};
4903 my $insert_position = (@$list == 0
4904 || (substr($list->[-1]->name, 0, 1) ne '_'
4905 && $list->[-1]->name =~ /\D/))
4906 ? @$list
4907 : @$list - 1;
4908 splice @$list,
4909 $insert_position,
4910 0,
33e96e72 4911 Alias->new($name, $loose_match, $make_re_pod_entry,
0eac1e20 4912 $ok_as_filename, $status, $ucd);
99870f4d
KW
4913
4914 # This name may be shorter than any existing ones, so clear the cache
4915 # of the shortest, so will have to be recalculated.
f998e60c 4916 no overloading;
051df77b 4917 undef $short_name{pack 'J', $self};
99870f4d
KW
4918 return;
4919 }
4920
4921 sub short_name {
4922 # Returns a name suitable for use as the base part of a file name.
4923 # That is, shorter wins. It can return undef if there is no suitable
4924 # name. The name has all non-essential underscores removed.
4925
4926 # The optional second parameter is a reference to a scalar in which
4927 # this routine will store the length the returned name had before the
4928 # underscores were removed, or undef if the return is undef.
4929
4930 # The shortest name can change if new aliases are added. So using
4931 # this should be deferred until after all these are added. The code
4932 # that does that should clear this one's cache.
4933 # Any name with alphabetics is preferred over an all numeric one, even
4934 # if longer.
4935
4936 my $self = shift;
4937 my $nominal_length_ptr = shift;
4938 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4939
ffe43484 4940 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4941
4942 # For efficiency, don't recalculate, but this means that adding new
4943 # aliases could change what the shortest is, so the code that does
4944 # that needs to undef this.
4945 if (defined $short_name{$addr}) {
4946 if ($nominal_length_ptr) {
4947 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4948 }
4949 return $short_name{$addr};
4950 }
4951
4952 # Look at each alias
4953 foreach my $alias ($self->aliases()) {
4954
4955 # Don't use an alias that isn't ok to use for an external name.
0eac1e20 4956 next if ! $alias->ok_as_filename;
99870f4d
KW
4957
4958 my $name = main::Standardize($alias->name);
4959 trace $self, $name if main::DEBUG && $to_trace;
4960
4961 # Take the first one, or a shorter one that isn't numeric. This
4962 # relies on numeric aliases always being last in the array
4963 # returned by aliases(). Any alpha one will have precedence.
4964 if (! defined $short_name{$addr}
4965 || ($name =~ /\D/
4966 && length($name) < length($short_name{$addr})))
4967 {
4968 # Remove interior underscores.
4969 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4970
4971 $nominal_short_name_length{$addr} = length $name;
4972 }
4973 }
4974
ff485b9e
KW
4975 # If the short name isn't a nice one, perhaps an equivalent table has
4976 # a better one.
4977 if (! defined $short_name{$addr}
4978 || $short_name{$addr} eq ""
4979 || $short_name{$addr} eq "_")
4980 {
4981 my $return;
4982 foreach my $follower ($self->children) { # All equivalents
4983 my $follower_name = $follower->short_name;
4984 next unless defined $follower_name;
4985
4986 # Anything (except undefined) is better than underscore or
4987 # empty
4988 if (! defined $return || $return eq "_") {
4989 $return = $follower_name;
4990 next;
4991 }
4992
4993 # If the new follower name isn't "_" and is shorter than the
4994 # current best one, prefer the new one.
4995 next if $follower_name eq "_";
4996 next if length $follower_name > length $return;
4997 $return = $follower_name;
4998 }
4999 $short_name{$addr} = $return if defined $return;
5000 }
5001
99870f4d
KW
5002 # If no suitable external name return undef
5003 if (! defined $short_name{$addr}) {
5004 $$nominal_length_ptr = undef if $nominal_length_ptr;
5005 return;
5006 }
5007
c12f2655 5008 # Don't allow a null short name.
99870f4d
KW
5009 if ($short_name{$addr} eq "") {
5010 $short_name{$addr} = '_';
5011 $nominal_short_name_length{$addr} = 1;
5012 }
5013
5014 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5015
5016 if ($nominal_length_ptr) {
5017 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5018 }
5019 return $short_name{$addr};
5020 }
5021
5022 sub external_name {
5023 # Returns the external name that this table should be known by. This
c12f2655
KW
5024 # is usually the short_name, but not if the short_name is undefined,
5025 # in which case the external_name is arbitrarily set to the
5026 # underscore.
99870f4d
KW
5027
5028 my $self = shift;
5029 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5030
5031 my $short = $self->short_name;
5032 return $short if defined $short;
5033
5034 return '_';
5035 }
5036
5037 sub add_description { # Adds the parameter as a short description.
5038
5039 my $self = shift;
5040 my $description = shift;
5041 chomp $description;
5042 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5043
f998e60c 5044 no overloading;
051df77b 5045 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
5046
5047 return;
5048 }
5049
5050 sub add_note { # Adds the parameter as a short note.
5051
5052 my $self = shift;
5053 my $note = shift;
5054 chomp $note;
5055 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5056
f998e60c 5057 no overloading;
051df77b 5058 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
5059
5060 return;
5061 }
5062
5063 sub add_comment { # Adds the parameter as a comment.
5064
bd9ebcfd
KW
5065 return unless $debugging_build;
5066
99870f4d
KW
5067 my $self = shift;
5068 my $comment = shift;
5069 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5070
5071 chomp $comment;
f998e60c
KW
5072
5073 no overloading;
051df77b 5074 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
5075
5076 return;
5077 }
5078
5079 sub comment {
5080 # Return the current comment for this table. If called in list
5081 # context, returns the array of comments. In scalar, returns a string
5082 # of each element joined together with a period ending each.
5083
5084 my $self = shift;
5085 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5086
ffe43484 5087 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 5088 my @list = @{$comment{$addr}};
99870f4d
KW
5089 return @list if wantarray;
5090 my $return = "";
5091 foreach my $sentence (@list) {
5092 $return .= '. ' if $return;
5093 $return .= $sentence;
5094 $return =~ s/\.$//;
5095 }
5096 $return .= '.' if $return;
5097 return $return;
5098 }
5099
5100 sub initialize {
5101 # Initialize the table with the argument which is any valid
5102 # initialization for range lists.
5103
5104 my $self = shift;
ffe43484 5105 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5106 my $initialization = shift;
5107 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5108
5109 # Replace the current range list with a new one of the same exact
5110 # type.
f998e60c
KW
5111 my $class = ref $range_list{$addr};
5112 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
5113 Initialize => $initialization);
5114 return;
5115
5116 }
5117
5118 sub header {
5119 # The header that is output for the table in the file it is written
5120 # in.
5121
5122 my $self = shift;
5123 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5124
5125 my $return = "";
5126 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5127 $return .= $HEADER;
99870f4d
KW
5128 return $return;
5129 }
5130
5131 sub write {
668b3bfc
KW
5132 # Write a representation of the table to its file. It calls several
5133 # functions furnished by sub-classes of this abstract base class to
5134 # handle non-normal ranges, to add stuff before the table, and at its
5135 # end.
99870f4d
KW
5136
5137 my $self = shift;
5138 my $tab_stops = shift; # The number of tab stops over to put any
5139 # comment.
5140 my $suppress_value = shift; # Optional, if the value associated with
5141 # a range equals this one, don't write
5142 # the range
5143 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5144
ffe43484 5145 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5146
5147 # Start with the header
668b3bfc 5148 my @HEADER = $self->header;
99870f4d
KW
5149
5150 # Then the comments
668b3bfc 5151 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
99870f4d
KW
5152 if $comment{$addr};
5153
668b3bfc
KW
5154 # Things discovered processing the main body of the document may
5155 # affect what gets output before it, therefore pre_body() isn't called
5156 # until after all other processing of the table is done.
99870f4d 5157
c4019d52
KW
5158 # The main body looks like a 'here' document. If annotating, get rid
5159 # of the comments before passing to the caller, as some callers, such
5160 # as charnames.pm, can't cope with them. (Outputting range counts
5161 # also introduces comments, but these don't show up in the tables that
5162 # can't cope with comments, and there aren't that many of them that
5163 # it's worth the extra real time to get rid of them).
668b3bfc 5164 my @OUT;
558712cf 5165 if ($annotate) {
c4019d52
KW
5166 # Use the line below in Perls that don't have /r
5167 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5168 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5169 } else {
5170 push @OUT, "return <<'END';\n";
5171 }
99870f4d
KW
5172
5173 if ($range_list{$addr}->is_empty) {
5174
5175 # This is a kludge for empty tables to silence a warning in
5176 # utf8.c, which can't really deal with empty tables, but it can
5177 # deal with a table that matches nothing, as the inverse of 'Any'
5178 # does.
67a53d68 5179 push @OUT, "!utf8::Any\n";
99870f4d 5180 }
c69a9c68
KW
5181 elsif ($self->name eq 'N'
5182
5183 # To save disk space and table cache space, avoid putting out
5184 # binary N tables, but instead create a file which just inverts
5185 # the Y table. Since the file will still exist and occupy a
5186 # certain number of blocks, might as well output the whole
5187 # thing if it all will fit in one block. The number of
5188 # ranges below is an approximate number for that.
06f26c45
KW
5189 && ($self->property->type == $BINARY
5190 || $self->property->type == $FORCED_BINARY)
c69a9c68
KW
5191 # && $self->property->tables == 2 Can't do this because the
5192 # non-binary properties, like NFDQC aren't specifiable
5193 # by the notation
5194 && $range_list{$addr}->ranges > 15
5195 && ! $annotate) # Under --annotate, want to see everything
5196 {
5197 push @OUT, "!utf8::" . $self->property->name . "\n";
5198 }
99870f4d
KW
5199 else {
5200 my $range_size_1 = $range_size_1{$addr};
558712cf
KW
5201 my $format; # Used only in $annotate option
5202 my $include_name; # Used only in $annotate option
c4019d52 5203
558712cf 5204 if ($annotate) {
c4019d52
KW
5205
5206 # if annotating each code point, must print 1 per line.
5207 # The variable could point to a subroutine, and we don't want
5208 # to lose that fact, so only set if not set already
5209 $range_size_1 = 1 if ! $range_size_1;
5210
5211 $format = $self->format;
5212
5213 # The name of the character is output only for tables that
5214 # don't already include the name in the output.
5215 my $property = $self->property;
5216 $include_name =
5217 ! ($property == $perl_charname
5218 || $property == main::property_ref('Unicode_1_Name')
5219 || $property == main::property_ref('Name')
5220 || $property == main::property_ref('Name_Alias')
5221 );
5222 }
99870f4d
KW
5223
5224 # Output each range as part of the here document.
5a2b5ddb 5225 RANGE:
99870f4d 5226 for my $set ($range_list{$addr}->ranges) {
5a2b5ddb
KW
5227 if ($set->type != 0) {
5228 $self->handle_special_range($set);
5229 next RANGE;
5230 }
99870f4d
KW
5231 my $start = $set->start;
5232 my $end = $set->end;
5233 my $value = $set->value;
5234
5235 # Don't output ranges whose value is the one to suppress
c4019d52
KW
5236 next RANGE if defined $suppress_value
5237 && $value eq $suppress_value;
99870f4d 5238
c4019d52
KW
5239 # If there is a range and doesn't need a single point range
5240 # output
5241 if ($start != $end && ! $range_size_1) {
bd9ebcfd
KW
5242 push @OUT, sprintf "%04X\t%04X", $start, $end;
5243 $OUT[-1] .= "\t$value" if $value ne "";
99870f4d
KW
5244
5245 # Add a comment with the size of the range, if requested.
5246 # Expand Tabs to make sure they all start in the same
5247 # column, and then unexpand to use mostly tabs.
0c07e538 5248 if (! $output_range_counts{$addr}) {
99870f4d
KW
5249 $OUT[-1] .= "\n";
5250 }
5251 else {
5252 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5253 my $count = main::clarify_number($end - $start + 1);
5254 use integer;
5255
5256 my $width = $tab_stops * 8 - 1;
5257 $OUT[-1] = sprintf("%-*s # [%s]\n",
5258 $width,
5259 $OUT[-1],
5260 $count);
5261 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5262 }
c4019d52
KW
5263 next RANGE;
5264 }
5265
5266 # Here to output a single code point per line
5267
5268 # If not to annotate, use the simple formats
558712cf 5269 if (! $annotate) {
c4019d52
KW
5270
5271 # Use any passed in subroutine to output.
5272 if (ref $range_size_1 eq 'CODE') {
5273 for my $i ($start .. $end) {
5274 push @OUT, &{$range_size_1}($i, $value);
5275 }
5276 }
5277 else {
5278
5279 # Here, caller is ok with default output.
5280 for (my $i = $start; $i <= $end; $i++) {
5281 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5282 }
5283 }
5284 next RANGE;
5285 }
5286
5287 # Here, wants annotation.
5288 for (my $i = $start; $i <= $end; $i++) {
5289
5290 # Get character information if don't have it already
5291 main::populate_char_info($i)
5292 if ! defined $viacode[$i];
5293 my $type = $annotate_char_type[$i];
5294
5295 # Figure out if should output the next code points as part
5296 # of a range or not. If this is not in an annotation
5297 # range, then won't output as a range, so returns $i.
5298 # Otherwise use the end of the annotation range, but no
5299 # further than the maximum possible end point of the loop.
5300 my $range_end = main::min($annotate_ranges->value_of($i)
5301 || $i,
5302 $end);
5303
5304 # Use a range if it is a range, and either is one of the
5305 # special annotation ranges, or the range is at most 3
5306 # long. This last case causes the algorithmically named
5307 # code points to be output individually in spans of at
5308 # most 3, as they are the ones whose $type is > 0.
5309 if ($range_end != $i
5310 && ( $type < 0 || $range_end - $i > 2))
5311 {
5312 # Here is to output a range. We don't allow a
5313 # caller-specified output format--just use the
5314 # standard one.
5315 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5316 $range_end,
5317 $value;
5318 my $range_name = $viacode[$i];
5319
5320 # For the code points which end in their hex value, we
5321 # eliminate that from the output annotation, and
5322 # capitalize only the first letter of each word.
5323 if ($type == $CP_IN_NAME) {
5324 my $hex = sprintf "%04X", $i;
5325 $range_name =~ s/-$hex$//;
5326 my @words = split " ", $range_name;
5327 for my $word (@words) {
5328 $word = ucfirst(lc($word)) if $word ne 'CJK';
5329 }
5330 $range_name = join " ", @words;
5331 }
5332 elsif ($type == $HANGUL_SYLLABLE) {
5333 $range_name = "Hangul Syllable";
5334 }
5335
5336 $OUT[-1] .= " $range_name" if $range_name;
5337
5338 # Include the number of code points in the range
5339 my $count = main::clarify_number($range_end - $i + 1);
5340 $OUT[-1] .= " [$count]\n";
5341
5342 # Skip to the end of the range
5343 $i = $range_end;
5344 }
5345 else { # Not in a range.
5346 my $comment = "";
5347
5348 # When outputting the names of each character, use
5349 # the character itself if printable
5350 $comment .= "'" . chr($i) . "' " if $printable[$i];
5351
5352 # To make it more readable, use a minimum indentation
5353 my $comment_indent;
5354
5355 # Determine the annotation
5356 if ($format eq $DECOMP_STRING_FORMAT) {
5357
5358 # This is very specialized, with the type of
5359 # decomposition beginning the line enclosed in
5360 # <...>, and the code points that the code point
5361 # decomposes to separated by blanks. Create two
5362 # strings, one of the printable characters, and
5363 # one of their official names.
5364 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5365 my $tostr = "";
5366 my $to_name = "";
5367 my $to_chr = "";
5368 foreach my $to (split " ", $map) {
5369 $to = CORE::hex $to;
5370 $to_name .= " + " if $to_name;
5371 $to_chr .= chr($to);
5372 main::populate_char_info($to)
5373 if ! defined $viacode[$to];
5374 $to_name .= $viacode[$to];
5375 }
5376
5377 $comment .=
5378 "=> '$to_chr'; $viacode[$i] => $to_name";
5379 $comment_indent = 25; # Determined by experiment
5380 }
5381 else {
5382
5383 # Assume that any table that has hex format is a
5384 # mapping of one code point to another.
5385 if ($format eq $HEX_FORMAT) {
5386 my $decimal_value = CORE::hex $value;
5387 main::populate_char_info($decimal_value)
5388 if ! defined $viacode[$decimal_value];
5389 $comment .= "=> '"
5390 . chr($decimal_value)
5391 . "'; " if $printable[$decimal_value];
5392 }
5393 $comment .= $viacode[$i] if $include_name
5394 && $viacode[$i];
5395 if ($format eq $HEX_FORMAT) {
5396 my $decimal_value = CORE::hex $value;
5397 $comment .= " => $viacode[$decimal_value]"
5398 if $viacode[$decimal_value];
5399 }
5400
5401 # If including the name, no need to indent, as the
5402 # name will already be way across the line.
5403 $comment_indent = ($include_name) ? 0 : 60;
5404 }
5405
5406 # Use any passed in routine to output the base part of
5407 # the line.
5408 if (ref $range_size_1 eq 'CODE') {
5409 my $base_part = &{$range_size_1}($i, $value);
5410 chomp $base_part;
5411 push @OUT, $base_part;
5412 }
5413 else {
5414 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5415 }
5416
5417 # And add the annotation.
5418 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5419 $OUT[-1],
5420 $comment if $comment;
5421 $OUT[-1] .= "\n";
5422 }
99870f4d
KW
5423 }
5424 } # End of loop through all the table's ranges
5425 }
5426
5427 # Add anything that goes after the main body, but within the here
5428 # document,
5429 my $append_to_body = $self->append_to_body;
5430 push @OUT, $append_to_body if $append_to_body;
5431
5432 # And finish the here document.
5433 push @OUT, "END\n";
5434
668b3bfc
KW
5435 # Done with the main portion of the body. Can now figure out what
5436 # should appear before it in the file.
5437 my $pre_body = $self->pre_body;
5438 push @HEADER, $pre_body, "\n" if $pre_body;
668b3bfc 5439
6b0079b5
KW
5440 # All these files should have a .pl suffix added to them.
5441 my @file_with_pl = @{$file_path{$addr}};
5442 $file_with_pl[-1] .= '.pl';
99870f4d 5443
6b0079b5 5444 main::write(\@file_with_pl,
558712cf 5445 $annotate, # utf8 iff annotating
9218f1cf
KW
5446 \@HEADER,
5447 \@OUT);
99870f4d
KW
5448 return;
5449 }
5450
5451 sub set_status { # Set the table's status
5452 my $self = shift;
5453 my $status = shift; # The status enum value
5454 my $info = shift; # Any message associated with it.
5455 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5456
ffe43484 5457 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5458
5459 $status{$addr} = $status;
5460 $status_info{$addr} = $info;
5461 return;
5462 }
5463
301ba948
KW
5464 sub set_fate { # Set the fate of a table
5465 my $self = shift;
5466 my $fate = shift;
5467 my $reason = shift;
5468 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5469
5470 my $addr = do { no overloading; pack 'J', $self; };
5471
5472 return if $fate{$addr} == $fate; # If no-op
5473
395dfc19
KW
5474 # Can only change the ordinary fate, except if going to $MAP_PROXIED
5475 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
301ba948
KW
5476
5477 $fate{$addr} = $fate;
5478
395dfc19
KW
5479 # Don't document anything to do with a non-normal fated table
5480 if ($fate != $ORDINARY) {
fd1e3e84 5481 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
395dfc19 5482 foreach my $alias ($self->aliases) {
fd1e3e84 5483 $alias->set_ucd($put_in_pod);
395dfc19
KW
5484
5485 # MAP_PROXIED doesn't affect the match tables
5486 next if $fate == $MAP_PROXIED;
fd1e3e84 5487 $alias->set_make_re_pod_entry($put_in_pod);
395dfc19
KW
5488 }
5489 }
5490
301ba948
KW
5491 # Save the reason for suppression for output
5492 if ($fate == $SUPPRESSED && defined $reason) {
5493 $why_suppressed{$complete_name{$addr}} = $reason;
5494 }
5495
5496 return;
5497 }
5498
99870f4d
KW
5499 sub lock {
5500 # Don't allow changes to the table from now on. This stores a stack
5501 # trace of where it was called, so that later attempts to modify it
5502 # can immediately show where it got locked.
5503
5504 my $self = shift;
5505 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5506
ffe43484 5507 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5508
5509 $locked{$addr} = "";
5510
5511 my $line = (caller(0))[2];
5512 my $i = 1;
5513
5514 # Accumulate the stack trace
5515 while (1) {
5516 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5517
5518 last unless defined $caller;
5519
5520 $locked{$addr} .= " called from $caller() at line $line\n";
5521 $line = $caller_line;
5522 }
5523 $locked{$addr} .= " called from main at line $line\n";
5524
5525 return;
5526 }
5527
5528 sub carp_if_locked {
5529 # Return whether a table is locked or not, and, by the way, complain
5530 # if is locked
5531
5532 my $self = shift;
5533 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5534
ffe43484 5535 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5536
5537 return 0 if ! $locked{$addr};
5538 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5539 return 1;
5540 }
5541
5542 sub set_file_path { # Set the final directory path for this table
5543 my $self = shift;
5544 # Rest of parameters passed on
5545
f998e60c 5546 no overloading;
051df77b 5547 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
5548 return
5549 }
5550
5551 # Accessors for the range list stored in this table. First for
5552 # unconditional
ea25a9b2 5553 for my $sub (qw(
2f7a8815 5554 containing_range
99870f4d
KW
5555 contains
5556 count
5557 each_range
5558 hash
5559 is_empty
09aba7e4 5560 matches_identically_to
99870f4d
KW
5561 max
5562 min
5563 range_count
5564 reset_each_range
0a9dbafc 5565 type_of
99870f4d 5566 value_of
ea25a9b2 5567 ))
99870f4d
KW
5568 {
5569 no strict "refs";
5570 *$sub = sub {
5571 use strict "refs";
5572 my $self = shift;
ec40ee88 5573 return $self->_range_list->$sub(@_);
99870f4d
KW
5574 }
5575 }
5576
5577 # Then for ones that should fail if locked
ea25a9b2 5578 for my $sub (qw(
99870f4d 5579 delete_range
ea25a9b2 5580 ))
99870f4d
KW
5581 {
5582 no strict "refs";
5583 *$sub = sub {
5584 use strict "refs";
5585 my $self = shift;
5586
5587 return if $self->carp_if_locked;
f998e60c 5588 no overloading;
ec40ee88 5589 return $self->_range_list->$sub(@_);
99870f4d
KW
5590 }
5591 }
5592
5593} # End closure
5594
5595package Map_Table;
5596use base '_Base_Table';
5597
5598# A Map Table is a table that contains the mappings from code points to
5599# values. There are two weird cases:
5600# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5601# are written in the table's file at the end of the table nonetheless. It
5602# requires specially constructed code to handle these; utf8.c can not read
5603# these in, so they should not go in $map_directory. As of this writing,
5604# the only case that these happen is for named sequences used in
5605# charnames.pm. But this code doesn't enforce any syntax on these, so
5606# something else could come along that uses it.
5607# 2) Specials are anything that doesn't fit syntactically into the body of the
5608# table. The ranges for these have a map type of non-zero. The code below
5609# knows about and handles each possible type. In most cases, these are
5610# written as part of the header.
5611#
5612# A map table deliberately can't be manipulated at will unlike match tables.
5613# This is because of the ambiguities having to do with what to do with
5614# overlapping code points. And there just isn't a need for those things;
5615# what one wants to do is just query, add, replace, or delete mappings, plus
5616# write the final result.
5617# However, there is a method to get the list of possible ranges that aren't in
5618# this table to use for defaulting missing code point mappings. And,
5619# map_add_or_replace_non_nulls() does allow one to add another table to this
5620# one, but it is clearly very specialized, and defined that the other's
5621# non-null values replace this one's if there is any overlap.
5622
5623sub trace { return main::trace(@_); }
5624
5625{ # Closure
5626
5627 main::setup_package();
5628
5629 my %default_map;
5630 # Many input files omit some entries; this gives what the mapping for the
5631 # missing entries should be
5632 main::set_access('default_map', \%default_map, 'r');
5633
5634 my %anomalous_entries;
5635 # Things that go in the body of the table which don't fit the normal
5636 # scheme of things, like having a range. Not much can be done with these
5637 # once there except to output them. This was created to handle named
5638 # sequences.
5639 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5640 main::set_access('anomalous_entries', # Append singular, read plural
5641 \%anomalous_entries,
5642 'readable_array');
5643
99870f4d 5644 my %to_output_map;
8572ace0 5645 # Enum as to whether or not to write out this map table:
c12f2655 5646 # 0 don't output
8572ace0
KW
5647 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5648 # it should not be removed nor its format changed. This
5649 # is done for those files that have traditionally been
5650 # output.
5651 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5652 # with this file
99870f4d
KW
5653 main::set_access('to_output_map', \%to_output_map, 's');
5654
5655
5656 sub new {
5657 my $class = shift;
5658 my $name = shift;
5659
5660 my %args = @_;
5661
5662 # Optional initialization data for the table.
5663 my $initialize = delete $args{'Initialize'};
5664
99870f4d 5665 my $default_map = delete $args{'Default_Map'};
99870f4d 5666 my $property = delete $args{'_Property'};
23e33b60 5667 my $full_name = delete $args{'Full_Name'};
20863809 5668
99870f4d
KW
5669 # Rest of parameters passed on
5670
5671 my $range_list = Range_Map->new(Owner => $property);
5672
5673 my $self = $class->SUPER::new(
5674 Name => $name,
23e33b60
KW
5675 Complete_Name => $full_name,
5676 Full_Name => $full_name,
99870f4d
KW
5677 _Property => $property,
5678 _Range_List => $range_list,
5679 %args);
5680
ffe43484 5681 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5682
5683 $anomalous_entries{$addr} = [];
99870f4d 5684 $default_map{$addr} = $default_map;
99870f4d
KW
5685
5686 $self->initialize($initialize) if defined $initialize;
5687
5688 return $self;
5689 }
5690
5691 use overload
5692 fallback => 0,
5693 qw("") => "_operator_stringify",
5694 ;
5695
5696 sub _operator_stringify {
5697 my $self = shift;
5698
5699 my $name = $self->property->full_name;
5700 $name = '""' if $name eq "";
5701 return "Map table for Property '$name'";
5702 }
5703
99870f4d
KW
5704 sub add_alias {
5705 # Add a synonym for this table (which means the property itself)
5706 my $self = shift;
5707 my $name = shift;
5708 # Rest of parameters passed on.
5709
5710 $self->SUPER::add_alias($name, $self->property, @_);
5711 return;
5712 }
5713
5714 sub add_map {
5715 # Add a range of code points to the list of specially-handled code
5716 # points. $MULTI_CP is assumed if the type of special is not passed
5717 # in.
5718
5719 my $self = shift;
5720 my $lower = shift;
5721 my $upper = shift;
5722 my $string = shift;
5723 my %args = @_;
5724
5725 my $type = delete $args{'Type'} || 0;
5726 # Rest of parameters passed on
5727
5728 # Can't change the table if locked.
5729 return if $self->carp_if_locked;
5730
ffe43484 5731 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 5732
99870f4d
KW
5733 $self->_range_list->add_map($lower, $upper,
5734 $string,
5735 @_,
5736 Type => $type);
5737 return;
5738 }
5739
5740 sub append_to_body {
5741 # Adds to the written HERE document of the table's body any anomalous
5742 # entries in the table..
5743
5744 my $self = shift;
5745 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5746
ffe43484 5747 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5748
5749 return "" unless @{$anomalous_entries{$addr}};
5750 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5751 }
5752
5753 sub map_add_or_replace_non_nulls {
5754 # This adds the mappings in the table $other to $self. Non-null
5755 # mappings from $other override those in $self. It essentially merges
5756 # the two tables, with the second having priority except for null
5757 # mappings.
5758
5759 my $self = shift;
5760 my $other = shift;
5761 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5762
5763 return if $self->carp_if_locked;
5764
5765 if (! $other->isa(__PACKAGE__)) {
5766 Carp::my_carp_bug("$other should be a "
5767 . __PACKAGE__
5768 . ". Not a '"
5769 . ref($other)
5770 . "'. Not added;");
5771 return;
5772 }
5773
ffe43484
NC
5774 my $addr = do { no overloading; pack 'J', $self; };
5775 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5776
5777 local $to_trace = 0 if main::DEBUG;
5778
5779 my $self_range_list = $self->_range_list;
5780 my $other_range_list = $other->_range_list;
5781 foreach my $range ($other_range_list->ranges) {
5782 my $value = $range->value;
5783 next if $value eq "";
5784 $self_range_list->_add_delete('+',
5785 $range->start,
5786 $range->end,
5787 $value,
5788 Type => $range->type,
5789 Replace => $UNCONDITIONALLY);
5790 }
5791
99870f4d
KW
5792 return;
5793 }
5794
5795 sub set_default_map {
5796 # Define what code points that are missing from the input files should
5797 # map to
5798
5799 my $self = shift;
5800 my $map = shift;
5801 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5802
ffe43484 5803 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5804
5805 # Convert the input to the standard equivalent, if any (won't have any
5806 # for $STRING properties)
5807 my $standard = $self->_find_table_from_alias->{$map};
5808 $map = $standard->name if defined $standard;
5809
5810 # Warn if there already is a non-equivalent default map for this
5811 # property. Note that a default map can be a ref, which means that
5812 # what it actually means is delayed until later in the program, and it
5813 # IS permissible to override it here without a message.
5814 my $default_map = $default_map{$addr};
5815 if (defined $default_map
5816 && ! ref($default_map)
5817 && $default_map ne $map
5818 && main::Standardize($map) ne $default_map)
5819 {
5820 my $property = $self->property;
5821 my $map_table = $property->table($map);
5822 my $default_table = $property->table($default_map);
5823 if (defined $map_table
5824 && defined $default_table
5825 && $map_table != $default_table)
5826 {
5827 Carp::my_carp("Changing the default mapping for "
5828 . $property
5829 . " from $default_map to $map'");
5830 }
5831 }
5832
5833 $default_map{$addr} = $map;
5834
5835 # Don't also create any missing table for this map at this point,
5836 # because if we did, it could get done before the main table add is
5837 # done for PropValueAliases.txt; instead the caller will have to make
5838 # sure it exists, if desired.
5839 return;
5840 }
5841
5842 sub to_output_map {
5843 # Returns boolean: should we write this map table?
5844
5845 my $self = shift;
5846 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5847
ffe43484 5848 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5849
5850 # If overridden, use that
5851 return $to_output_map{$addr} if defined $to_output_map{$addr};
5852
5853 my $full_name = $self->full_name;
fcf1973c
KW
5854 return $global_to_output_map{$full_name}
5855 if defined $global_to_output_map{$full_name};
99870f4d 5856
20863809 5857 # If table says to output, do so; if says to suppress it, do so.
301ba948
KW
5858 my $fate = $self->fate;
5859 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
8572ace0 5860 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
395dfc19 5861 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
99870f4d
KW
5862
5863 my $type = $self->property->type;
5864
5865 # Don't want to output binary map tables even for debugging.
5866 return 0 if $type == $BINARY;
5867
5868 # But do want to output string ones.
8572ace0 5869 return $EXTERNAL_MAP if $type == $STRING;
99870f4d 5870
8572ace0
KW
5871 # Otherwise is an $ENUM, do output it, for Perl's purposes
5872 return $INTERNAL_MAP;
99870f4d
KW
5873 }
5874
5875 sub inverse_list {
5876 # Returns a Range_List that is gaps of the current table. That is,
5877 # the inversion
5878
5879 my $self = shift;
5880 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5881
5882 my $current = Range_List->new(Initialize => $self->_range_list,
5883 Owner => $self->property);
5884 return ~ $current;
5885 }
5886
8572ace0
KW
5887 sub header {
5888 my $self = shift;
5889 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5890
5891 my $return = $self->SUPER::header();
5892
ae92a9ae
KW
5893 if ($self->to_output_map == $INTERNAL_MAP) {
5894 $return .= $INTERNAL_ONLY_HEADER;
5895 }
5896 else {
5897 my $property_name = $self->property->full_name;
5898 $return .= <<END;
5899
5900# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
5901
5902# This file is for internal use by core Perl only. It is retained for
5903# backwards compatibility with applications that may have come to rely on it,
5904# but its format and even its name or existence are subject to change without
5905# notice in a future Perl version. Don't use it directly. Instead, its
5906# contents are now retrievable through a stable API in the Unicode::UCD
5907# module: Unicode::UCD::prop_invmap('$property_name').
5908END
5909 }
8572ace0
KW
5910 return $return;
5911 }
5912
99870f4d
KW
5913 sub set_final_comment {
5914 # Just before output, create the comment that heads the file
5915 # containing this table.
5916
bd9ebcfd
KW
5917 return unless $debugging_build;
5918
99870f4d
KW
5919 my $self = shift;
5920 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5921
5922 # No sense generating a comment if aren't going to write it out.
5923 return if ! $self->to_output_map;
5924
ffe43484 5925 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5926
5927 my $property = $self->property;
5928
5929 # Get all the possible names for this property. Don't use any that
5930 # aren't ok for use in a file name, etc. This is perhaps causing that
5931 # flag to do double duty, and may have to be changed in the future to
5932 # have our own flag for just this purpose; but it works now to exclude
5933 # Perl generated synonyms from the lists for properties, where the
5934 # name is always the proper Unicode one.
0eac1e20 5935 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
99870f4d
KW
5936
5937 my $count = $self->count;
5938 my $default_map = $default_map{$addr};
5939
5940 # The ranges that map to the default aren't output, so subtract that
5941 # to get those actually output. A property with matching tables
5942 # already has the information calculated.
5943 if ($property->type != $STRING) {
5944 $count -= $property->table($default_map)->count;
5945 }
5946 elsif (defined $default_map) {
5947
5948 # But for $STRING properties, must calculate now. Subtract the
5949 # count from each range that maps to the default.
5950 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5951 if ($range->value eq $default_map) {
5952 $count -= $range->end +1 - $range->start;
5953 }
5954 }
5955
5956 }
5957
5958 # Get a string version of $count with underscores in large numbers,
5959 # for clarity.
5960 my $string_count = main::clarify_number($count);
5961
5962 my $code_points = ($count == 1)
5963 ? 'single code point'
5964 : "$string_count code points";
5965
5966 my $mapping;
5967 my $these_mappings;
5968 my $are;
5969 if (@property_aliases <= 1) {
5970 $mapping = 'mapping';
5971 $these_mappings = 'this mapping';
5972 $are = 'is'
5973 }
5974 else {
5975 $mapping = 'synonymous mappings';
5976 $these_mappings = 'these mappings';
5977 $are = 'are'
5978 }
5979 my $cp;
5980 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5981 $cp = "any code point in Unicode Version $string_version";
5982 }
5983 else {
5984 my $map_to;
5985 if ($default_map eq "") {
5986 $map_to = 'the null string';
5987 }
5988 elsif ($default_map eq $CODE_POINT) {
5989 $map_to = "itself";
5990 }
5991 else {
5992 $map_to = "'$default_map'";
5993 }
5994 if ($count == 1) {
5995 $cp = "the single code point";
5996 }
5997 else {
5998 $cp = "one of the $code_points";
5999 }
6000 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6001 }
6002
6003 my $comment = "";
6004
6005 my $status = $self->status;
6006 if ($status) {
6007 my $warn = uc $status_past_participles{$status};
6008 $comment .= <<END;
6009
6010!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
6011 All property or property=value combinations contained in this file are $warn.
6012 See $unicode_reference_url for what this means.
6013
6014END
6015 }
6016 $comment .= "This file returns the $mapping:\n";
6017
6018 for my $i (0 .. @property_aliases - 1) {
6019 $comment .= sprintf("%-8s%s\n",
6020 " ",
6021 $property_aliases[$i]->name . '(cp)'
6022 );
6023 }
83b7c87d
KW
6024 my $full_name = $self->property->full_name;
6025 $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
99870f4d
KW
6026
6027 # And append any commentary already set from the actual property.
6028 $comment .= "\n\n" . $self->comment if $self->comment;
6029 if ($self->description) {
6030 $comment .= "\n\n" . join " ", $self->description;
6031 }
6032 if ($self->note) {
6033 $comment .= "\n\n" . join " ", $self->note;
6034 }
6035 $comment .= "\n";
6036
6037 if (! $self->perl_extension) {
6038 $comment .= <<END;
6039
6040For information about what this property really means, see:
6041$unicode_reference_url
6042END
6043 }
6044
6045 if ($count) { # Format differs for empty table
6046 $comment.= "\nThe format of the ";
6047 if ($self->range_size_1) {
6048 $comment.= <<END;
6049main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6050is in hex; MAPPING is what CODE_POINT maps to.
6051END
6052 }
6053 else {
6054
6055 # There are tables which end up only having one element per
6056 # range, but it is not worth keeping track of for making just
6057 # this comment a little better.
6058 $comment.= <<END;
6059non-comment portions of the main body of lines of this file is:
6060START\\tSTOP\\tMAPPING where START is the starting code point of the
6061range, in hex; STOP is the ending point, or if omitted, the range has just one
6062code point; MAPPING is what each code point between START and STOP maps to.
6063END
0c07e538 6064 if ($self->output_range_counts) {
99870f4d
KW
6065 $comment .= <<END;
6066Numbers in comments in [brackets] indicate how many code points are in the
6067range (omitted when the range is a single code point or if the mapping is to
6068the null string).
6069END
6070 }
6071 }
6072 }
6073 $self->set_comment(main::join_lines($comment));
6074 return;
6075 }
6076
6077 my %swash_keys; # Makes sure don't duplicate swash names.
6078
668b3bfc
KW
6079 # The remaining variables are temporaries used while writing each table,
6080 # to output special ranges.
668b3bfc
KW
6081 my @multi_code_point_maps; # Map is to more than one code point.
6082
668b3bfc
KW
6083 sub handle_special_range {
6084 # Called in the middle of write when it finds a range it doesn't know
6085 # how to handle.
6086
6087 my $self = shift;
6088 my $range = shift;
6089 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6090
6091 my $addr = do { no overloading; pack 'J', $self; };
6092
6093 my $type = $range->type;
6094
6095 my $low = $range->start;
6096 my $high = $range->end;
6097 my $map = $range->value;
6098
6099 # No need to output the range if it maps to the default.
6100 return if $map eq $default_map{$addr};
6101
bb1dd3da
KW
6102 my $property = $self->property;
6103
668b3bfc
KW
6104 # Switch based on the map type...
6105 if ($type == $HANGUL_SYLLABLE) {
6106
6107 # These are entirely algorithmically determinable based on
6108 # some constants furnished by Unicode; for now, just set a
6109 # flag to indicate that have them. After everything is figured
bb1dd3da
KW
6110 # out, we will output the code that does the algorithm. (Don't
6111 # output them if not needed because we are suppressing this
6112 # property.)
6113 $has_hangul_syllables = 1 if $property->to_output_map;
668b3bfc
KW
6114 }
6115 elsif ($type == $CP_IN_NAME) {
6116
bb1dd3da 6117 # Code points whose name ends in their code point are also
668b3bfc
KW
6118 # algorithmically determinable, but need information about the map
6119 # to do so. Both the map and its inverse are stored in data
bb1dd3da
KW
6120 # structures output in the file. They are stored in the mean time
6121 # in global lists The lists will be written out later into Name.pm,
6122 # which is created only if needed. In order to prevent duplicates
6123 # in the list, only add to them for one property, should multiple
6124 # ones need them.
6125 if ($needing_code_points_ending_in_code_point == 0) {
6126 $needing_code_points_ending_in_code_point = $property;
6127 }
6128 if ($property == $needing_code_points_ending_in_code_point) {
6c1bafed
KW
6129 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6130 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6131
6132 my $squeezed = $map =~ s/[-\s]+//gr;
6133 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6134 $low;
6135 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6136 $high;
6137
6138 push @code_points_ending_in_code_point, { low => $low,
6139 high => $high,
6140 name => $map
6141 };
bb1dd3da 6142 }
668b3bfc
KW
6143 }
6144 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6145
6146 # Multi-code point maps and null string maps have an entry
6147 # for each code point in the range. They use the same
6148 # output format.
6149 for my $code_point ($low .. $high) {
6150
c12f2655
KW
6151 # The pack() below can't cope with surrogates. XXX This may
6152 # no longer be true
668b3bfc 6153 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
98dc9551 6154 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
668b3bfc
KW
6155 next;
6156 }
6157
6158 # Generate the hash entries for these in the form that
6159 # utf8.c understands.
6160 my $tostr = "";
6161 my $to_name = "";
6162 my $to_chr = "";
6163 foreach my $to (split " ", $map) {
6164 if ($to !~ /^$code_point_re$/) {
6165 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6166 next;
6167 }
6168 $tostr .= sprintf "\\x{%s}", $to;
6169 $to = CORE::hex $to;
558712cf 6170 if ($annotate) {
c4019d52
KW
6171 $to_name .= " + " if $to_name;
6172 $to_chr .= chr($to);
6173 main::populate_char_info($to)
6174 if ! defined $viacode[$to];
6175 $to_name .= $viacode[$to];
6176 }
668b3bfc
KW
6177 }
6178
6179 # I (khw) have never waded through this line to
6180 # understand it well enough to comment it.
6181 my $utf8 = sprintf(qq["%s" => "$tostr",],
6182 join("", map { sprintf "\\x%02X", $_ }
6183 unpack("U0C*", pack("U", $code_point))));
6184
6185 # Add a comment so that a human reader can more easily
6186 # see what's going on.
6187 push @multi_code_point_maps,
6188 sprintf("%-45s # U+%04X", $utf8, $code_point);
558712cf 6189 if (! $annotate) {
c4019d52
KW
6190 $multi_code_point_maps[-1] .= " => $map";
6191 }
6192 else {
6193 main::populate_char_info($code_point)
6194 if ! defined $viacode[$code_point];
6195 $multi_code_point_maps[-1] .= " '"
6196 . chr($code_point)
6197 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6198 }
668b3bfc
KW
6199 }
6200 }
6201 else {
6202 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
6203 }
6204
6205 return;
6206 }
6207
99870f4d
KW
6208 sub pre_body {
6209 # Returns the string that should be output in the file before the main
668b3bfc
KW
6210 # body of this table. It isn't called until the main body is
6211 # calculated, saving a pass. The string includes some hash entries
6212 # identifying the format of the body, and what the single value should
6213 # be for all ranges missing from it. It also includes any code points
6214 # which have map_types that don't go in the main table.
99870f4d
KW
6215
6216 my $self = shift;
6217 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6218
ffe43484 6219 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6220
6221 my $name = $self->property->swash_name;
6222
19f751d2
KW
6223 # Currently there is nothing in the pre_body unless a swash is being
6224 # generated.
6225 return unless defined $name;
6226
99870f4d
KW
6227 if (defined $swash_keys{$name}) {
6228 Carp::my_carp(join_lines(<<END
6229Already created a swash name '$name' for $swash_keys{$name}. This means that
6230the same name desired for $self shouldn't be used. Bad News. This must be
6231fixed before production use, but proceeding anyway
6232END
6233 ));
6234 }
6235 $swash_keys{$name} = "$self";
6236
99870f4d 6237 my $pre_body = "";
99870f4d 6238
668b3bfc
KW
6239 # Here we assume we were called after have gone through the whole
6240 # file. If we actually generated anything for each map type, add its
6241 # respective header and trailer
ec2f0128 6242 my $specials_name = "";
668b3bfc 6243 if (@multi_code_point_maps) {
ec2f0128 6244 $specials_name = "utf8::ToSpec$name";
668b3bfc 6245 $pre_body .= <<END;
99870f4d
KW
6246
6247# Some code points require special handling because their mappings are each to
6248# multiple code points. These do not appear in the main body, but are defined
6249# in the hash below.
6250
76591e2b
KW
6251# Each key is the string of N bytes that together make up the UTF-8 encoding
6252# for the code point. (i.e. the same as looking at the code point's UTF-8
6253# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
ec2f0128 6254\%$specials_name = (
99870f4d 6255END
668b3bfc
KW
6256 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6257 }
99870f4d 6258
668b3bfc
KW
6259 my $format = $self->format;
6260
6261 my $return = <<END;
6262# The name this swash is to be known by, with the format of the mappings in
6263# the main body of the table, and what all code points missing from this file
6264# map to.
6265\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6266END
ec2f0128
KW
6267 if ($specials_name) {
6268 $return .= <<END;
6269\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6270END
6271 }
668b3bfc
KW
6272 my $default_map = $default_map{$addr};
6273 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6274
6275 if ($default_map eq $CODE_POINT) {
6276 $return .= ' # code point maps to itself';
6277 }
6278 elsif ($default_map eq "") {
6279 $return .= ' # code point maps to the null string';
6280 }
6281 $return .= "\n";
6282
6283 $return .= $pre_body;
6284
6285 return $return;
6286 }
6287
6288 sub write {
6289 # Write the table to the file.
6290
6291 my $self = shift;
6292 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6293
6294 my $addr = do { no overloading; pack 'J', $self; };
6295
6296 # Clear the temporaries
668b3bfc 6297 undef @multi_code_point_maps;
99870f4d
KW
6298
6299 # Calculate the format of the table if not already done.
f5817e0a 6300 my $format = $self->format;
668b3bfc
KW
6301 my $type = $self->property->type;
6302 my $default_map = $self->default_map;
99870f4d
KW
6303 if (! defined $format) {
6304 if ($type == $BINARY) {
6305
6306 # Don't bother checking the values, because we elsewhere
6307 # verify that a binary table has only 2 values.
6308 $format = $BINARY_FORMAT;
6309 }
6310 else {
6311 my @ranges = $self->_range_list->ranges;
6312
6313 # default an empty table based on its type and default map
6314 if (! @ranges) {
6315
6316 # But it turns out that the only one we can say is a
6317 # non-string (besides binary, handled above) is when the
6318 # table is a string and the default map is to a code point
6319 if ($type == $STRING && $default_map eq $CODE_POINT) {
6320 $format = $HEX_FORMAT;
6321 }
6322 else {
6323 $format = $STRING_FORMAT;
6324 }
6325 }
6326 else {
6327
6328 # Start with the most restrictive format, and as we find
6329 # something that doesn't fit with that, change to the next
6330 # most restrictive, and so on.
6331 $format = $DECIMAL_FORMAT;
6332 foreach my $range (@ranges) {
668b3bfc
KW
6333 next if $range->type != 0; # Non-normal ranges don't
6334 # affect the main body
99870f4d
KW
6335 my $map = $range->value;
6336 if ($map ne $default_map) {
6337 last if $format eq $STRING_FORMAT; # already at
6338 # least
6339 # restrictive
6340 $format = $INTEGER_FORMAT
6341 if $format eq $DECIMAL_FORMAT
6342 && $map !~ / ^ [0-9] $ /x;
6343 $format = $FLOAT_FORMAT
6344 if $format eq $INTEGER_FORMAT
6345 && $map !~ / ^ -? [0-9]+ $ /x;
6346 $format = $RATIONAL_FORMAT
6347 if $format eq $FLOAT_FORMAT
6348 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6349 $format = $HEX_FORMAT
6350 if $format eq $RATIONAL_FORMAT
6351 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6352 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6353 && $map =~ /[^0-9A-F]/;
6354 }
6355 }
6356 }
6357 }
6358 } # end of calculating format
6359
668b3bfc 6360 if ($default_map eq $CODE_POINT
99870f4d 6361 && $format ne $HEX_FORMAT
668b3bfc
KW
6362 && ! defined $self->format) # manual settings are always
6363 # considered ok
99870f4d
KW
6364 {
6365 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6366 }
99870f4d 6367
668b3bfc 6368 $self->_set_format($format);
99870f4d 6369
0911a63d
KW
6370 # Core Perl has a different definition of mapping ranges than we do,
6371 # that is applicable mainly to mapping code points, so for tables
6372 # where it is possible that core Perl could be used to read it,
6373 # make it range size 1 to prevent possible confusion
6374 $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6375
99870f4d
KW
6376 return $self->SUPER::write(
6377 ($self->property == $block)
6378 ? 7 # block file needs more tab stops
6379 : 3,
668b3bfc 6380 $default_map); # don't write defaulteds
99870f4d
KW
6381 }
6382
6383 # Accessors for the underlying list that should fail if locked.
ea25a9b2 6384 for my $sub (qw(
99870f4d 6385 add_duplicate
ea25a9b2 6386 ))
99870f4d
KW
6387 {
6388 no strict "refs";
6389 *$sub = sub {
6390 use strict "refs";
6391 my $self = shift;
6392
6393 return if $self->carp_if_locked;
6394 return $self->_range_list->$sub(@_);
6395 }
6396 }
6397} # End closure for Map_Table
6398
6399package Match_Table;
6400use base '_Base_Table';
6401
6402# A Match table is one which is a list of all the code points that have
6403# the same property and property value, for use in \p{property=value}
6404# constructs in regular expressions. It adds very little data to the base
6405# structure, but many methods, as these lists can be combined in many ways to
6406# form new ones.
6407# There are only a few concepts added:
6408# 1) Equivalents and Relatedness.
6409# Two tables can match the identical code points, but have different names.
6410# This always happens when there is a perl single form extension
6411# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6412# tables are set to be related, with the Perl extension being a child, and
6413# the Unicode property being the parent.
6414#
6415# It may be that two tables match the identical code points and we don't
6416# know if they are related or not. This happens most frequently when the
6417# Block and Script properties have the exact range. But note that a
6418# revision to Unicode could add new code points to the script, which would
6419# now have to be in a different block (as the block was filled, or there
6420# would have been 'Unknown' script code points in it and they wouldn't have
6421# been identical). So we can't rely on any two properties from Unicode
6422# always matching the same code points from release to release, and thus
6423# these tables are considered coincidentally equivalent--not related. When
6424# two tables are unrelated but equivalent, one is arbitrarily chosen as the
6425# 'leader', and the others are 'equivalents'. This concept is useful
6426# to minimize the number of tables written out. Only one file is used for
6427# any identical set of code points, with entries in Heavy.pl mapping all
6428# the involved tables to it.
6429#
6430# Related tables will always be identical; we set them up to be so. Thus
6431# if the Unicode one is deprecated, the Perl one will be too. Not so for
6432# unrelated tables. Relatedness makes generating the documentation easier.
6433#
c12f2655
KW
6434# 2) Complement.
6435# Like equivalents, two tables may be the inverses of each other, the
6436# intersection between them is null, and the union is every Unicode code
6437# point. The two tables that occupy a binary property are necessarily like
6438# this. By specifying one table as the complement of another, we can avoid
6439# storing it on disk (using the other table and performing a fast
6440# transform), and some memory and calculations.
6441#
6442# 3) Conflicting. It may be that there will eventually be name clashes, with
99870f4d
KW
6443# the same name meaning different things. For a while, there actually were
6444# conflicts, but they have so far been resolved by changing Perl's or
6445# Unicode's definitions to match the other, but when this code was written,
6446# it wasn't clear that that was what was going to happen. (Unicode changed
6447# because of protests during their beta period.) Name clashes are warned
6448# about during compilation, and the documentation. The generated tables
6449# are sane, free of name clashes, because the code suppresses the Perl
6450# version. But manual intervention to decide what the actual behavior
6451# should be may be required should this happen. The introductory comments
6452# have more to say about this.
6453
6454sub standardize { return main::standardize($_[0]); }
6455sub trace { return main::trace(@_); }
6456
6457
6458{ # Closure
6459
6460 main::setup_package();
6461
6462 my %leader;
6463 # The leader table of this one; initially $self.
6464 main::set_access('leader', \%leader, 'r');
6465
6466 my %equivalents;
6467 # An array of any tables that have this one as their leader
6468 main::set_access('equivalents', \%equivalents, 'readable_array');
6469
6470 my %parent;
6471 # The parent table to this one, initially $self. This allows us to
c12f2655
KW
6472 # distinguish between equivalent tables that are related (for which this
6473 # is set to), and those which may not be, but share the same output file
6474 # because they match the exact same set of code points in the current
6475 # Unicode release.
99870f4d
KW
6476 main::set_access('parent', \%parent, 'r');
6477
6478 my %children;
6479 # An array of any tables that have this one as their parent
6480 main::set_access('children', \%children, 'readable_array');
6481
6482 my %conflicting;
6483 # Array of any tables that would have the same name as this one with
6484 # a different meaning. This is used for the generated documentation.
6485 main::set_access('conflicting', \%conflicting, 'readable_array');
6486
6487 my %matches_all;
6488 # Set in the constructor for tables that are expected to match all code
6489 # points.
6490 main::set_access('matches_all', \%matches_all, 'r');
6491
a92d5c2e
KW
6492 my %complement;
6493 # Points to the complement that this table is expressed in terms of; 0 if
6494 # none.
8ae00c8a 6495 main::set_access('complement', \%complement, 'r');
a92d5c2e 6496
99870f4d
KW
6497 sub new {
6498 my $class = shift;
6499
6500 my %args = @_;
6501
6502 # The property for which this table is a listing of property values.
6503 my $property = delete $args{'_Property'};
6504
23e33b60
KW
6505 my $name = delete $args{'Name'};
6506 my $full_name = delete $args{'Full_Name'};
6507 $full_name = $name if ! defined $full_name;
6508
99870f4d
KW
6509 # Optional
6510 my $initialize = delete $args{'Initialize'};
6511 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 6512 my $format = delete $args{'Format'};
99870f4d
KW
6513 # Rest of parameters passed on.
6514
6515 my $range_list = Range_List->new(Initialize => $initialize,
6516 Owner => $property);
6517
23e33b60
KW
6518 my $complete = $full_name;
6519 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6520 # but this helps debug if it
6521 # does
6522 # The complete name for a match table includes it's property in a
6523 # compound form 'property=table', except if the property is the
6524 # pseudo-property, perl, in which case it is just the single form,
6525 # 'table' (If you change the '=' must also change the ':' in lots of
6526 # places in this program that assume an equal sign)
6527 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 6528
99870f4d 6529 my $self = $class->SUPER::new(%args,
23e33b60
KW
6530 Name => $name,
6531 Complete_Name => $complete,
6532 Full_Name => $full_name,
99870f4d
KW
6533 _Property => $property,
6534 _Range_List => $range_list,
f5817e0a 6535 Format => $EMPTY_FORMAT,
99870f4d 6536 );
ffe43484 6537 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6538
6539 $conflicting{$addr} = [ ];
6540 $equivalents{$addr} = [ ];
6541 $children{$addr} = [ ];
6542 $matches_all{$addr} = $matches_all;
6543 $leader{$addr} = $self;
6544 $parent{$addr} = $self;
a92d5c2e 6545 $complement{$addr} = 0;
99870f4d 6546
f5817e0a
KW
6547 if (defined $format && $format ne $EMPTY_FORMAT) {
6548 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6549 }
6550
99870f4d
KW
6551 return $self;
6552 }
6553
6554 # See this program's beginning comment block about overloading these.
6555 use overload
6556 fallback => 0,
6557 qw("") => "_operator_stringify",
6558 '=' => sub {
6559 my $self = shift;
6560
6561 return if $self->carp_if_locked;
6562 return $self;
6563 },
6564
6565 '+' => sub {
6566 my $self = shift;
6567 my $other = shift;
6568
6569 return $self->_range_list + $other;
6570 },
6571 '&' => sub {
6572 my $self = shift;
6573 my $other = shift;
6574
6575 return $self->_range_list & $other;
6576 },
6577 '+=' => sub {
6578 my $self = shift;
6579 my $other = shift;
6580
6581 return if $self->carp_if_locked;
6582
ffe43484 6583 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6584
6585 if (ref $other) {
6586
6587 # Change the range list of this table to be the
6588 # union of the two.
6589 $self->_set_range_list($self->_range_list
6590 + $other);
6591 }
6592 else { # $other is just a simple value
6593 $self->add_range($other, $other);
6594 }
6595 return $self;
6596 },
6597 '-' => sub { my $self = shift;
6598 my $other = shift;
6599 my $reversed = shift;
6600
6601 if ($reversed) {
6602 Carp::my_carp_bug("Can't cope with a "
6603 . __PACKAGE__
6604 . " being the first parameter in a '-'. Subtraction ignored.");
6605 return;
6606 }
6607
6608 return $self->_range_list - $other;
6609 },
6610 '~' => sub { my $self = shift;
6611 return ~ $self->_range_list;
6612 },
6613 ;
6614
6615 sub _operator_stringify {
6616 my $self = shift;
6617
23e33b60 6618 my $name = $self->complete_name;
99870f4d
KW
6619 return "Table '$name'";
6620 }
6621
ec40ee88
KW
6622 sub _range_list {
6623 # Returns the range list associated with this table, which will be the
6624 # complement's if it has one.
6625
6626 my $self = shift;
6627 my $complement;
6628 if (($complement = $self->complement) != 0) {
6629 return ~ $complement->_range_list;
6630 }
6631 else {
6632 return $self->SUPER::_range_list;
6633 }
6634 }
6635
99870f4d
KW
6636 sub add_alias {
6637 # Add a synonym for this table. See the comments in the base class
6638
6639 my $self = shift;
6640 my $name = shift;
6641 # Rest of parameters passed on.
6642
6643 $self->SUPER::add_alias($name, $self, @_);
6644 return;
6645 }
6646
6647 sub add_conflicting {
6648 # Add the name of some other object to the list of ones that name
6649 # clash with this match table.
6650
6651 my $self = shift;
6652 my $conflicting_name = shift; # The name of the conflicting object
6653 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6654 my $conflicting_object = shift; # Optional, the conflicting object
6655 # itself. This is used to
6656 # disambiguate the text if the input
6657 # name is identical to any of the
6658 # aliases $self is known by.
6659 # Sometimes the conflicting object is
6660 # merely hypothetical, so this has to
6661 # be an optional parameter.
6662 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6663
ffe43484 6664 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6665
6666 # Check if the conflicting name is exactly the same as any existing
6667 # alias in this table (as long as there is a real object there to
6668 # disambiguate with).
6669 if (defined $conflicting_object) {
6670 foreach my $alias ($self->aliases) {
6671 if ($alias->name eq $conflicting_name) {
6672
6673 # Here, there is an exact match. This results in
6674 # ambiguous comments, so disambiguate by changing the
6675 # conflicting name to its object's complete equivalent.
6676 $conflicting_name = $conflicting_object->complete_name;
6677 last;
6678 }
6679 }
6680 }
6681
6682 # Convert to the \p{...} final name
6683 $conflicting_name = "\\$p" . "{$conflicting_name}";
6684
6685 # Only add once
6686 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6687
6688 push @{$conflicting{$addr}}, $conflicting_name;
6689
6690 return;
6691 }
6692
6505c6e2 6693 sub is_set_equivalent_to {
99870f4d
KW
6694 # Return boolean of whether or not the other object is a table of this
6695 # type and has been marked equivalent to this one.
6696
6697 my $self = shift;
6698 my $other = shift;
6699 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6700
6701 return 0 if ! defined $other; # Can happen for incomplete early
6702 # releases
6703 unless ($other->isa(__PACKAGE__)) {
6704 my $ref_other = ref $other;
6705 my $ref_self = ref $self;
6505c6e2 6706 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
99870f4d
KW
6707 return 0;
6708 }
6709
6710 # Two tables are equivalent if they have the same leader.
f998e60c 6711 no overloading;
051df77b 6712 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6713 return;
6714 }
6715
99870f4d
KW
6716 sub set_equivalent_to {
6717 # Set $self equivalent to the parameter table.
6718 # The required Related => 'x' parameter is a boolean indicating
6719 # whether these tables are related or not. If related, $other becomes
6720 # the 'parent' of $self; if unrelated it becomes the 'leader'
6721 #
6722 # Related tables share all characteristics except names; equivalents
6723 # not quite so many.
6724 # If they are related, one must be a perl extension. This is because
6725 # we can't guarantee that Unicode won't change one or the other in a
98dc9551 6726 # later release even if they are identical now.
99870f4d
KW
6727
6728 my $self = shift;
6729 my $other = shift;
6730
6731 my %args = @_;
6732 my $related = delete $args{'Related'};
6733
6734 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6735
6736 return if ! defined $other; # Keep on going; happens in some early
6737 # Unicode releases.
6738
6739 if (! defined $related) {
6740 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6741 $related = 0;
6742 }
6743
6744 # If already are equivalent, no need to re-do it; if subroutine
6745 # returns null, it found an error, also do nothing
6505c6e2 6746 my $are_equivalent = $self->is_set_equivalent_to($other);
99870f4d
KW
6747 return if ! defined $are_equivalent || $are_equivalent;
6748
ffe43484 6749 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6750 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d 6751
45e32b91
KW
6752 if ($related) {
6753 if ($current_leader->perl_extension) {
6754 if ($other->perl_extension) {
6755 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6756 return;
6757 }
7610e9e2
KW
6758 } elsif ($self->property != $other->property # Depending on
6759 # situation, might
6760 # be better to use
6761 # add_alias()
6762 # instead for same
6763 # property
6764 && ! $other->perl_extension)
6765 {
45e32b91
KW
6766 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6767 $related = 0;
6768 }
6769 }
6770
6771 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6772 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
6773 return;
99870f4d
KW
6774 }
6775
ffe43484
NC
6776 my $leader = do { no overloading; pack 'J', $current_leader; };
6777 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6778
6779 # Any tables that are equivalent to or children of this table must now
6780 # instead be equivalent to or (children) to the new leader (parent),
6781 # still equivalent. The equivalency includes their matches_all info,
301ba948 6782 # and for related tables, their fate and status.
99870f4d
KW
6783 # All related tables are of necessity equivalent, but the converse
6784 # isn't necessarily true
6785 my $status = $other->status;
6786 my $status_info = $other->status_info;
301ba948 6787 my $fate = $other->fate;
99870f4d 6788 my $matches_all = $matches_all{other_addr};
d867ccfb 6789 my $caseless_equivalent = $other->caseless_equivalent;
99870f4d
KW
6790 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6791 next if $table == $other;
6792 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6793
ffe43484 6794 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6795 $leader{$table_addr} = $other;
6796 $matches_all{$table_addr} = $matches_all;
6797 $self->_set_range_list($other->_range_list);
6798 push @{$equivalents{$other_addr}}, $table;
6799 if ($related) {
6800 $parent{$table_addr} = $other;
6801 push @{$children{$other_addr}}, $table;
6802 $table->set_status($status, $status_info);
301ba948
KW
6803
6804 # This reason currently doesn't get exposed outside; otherwise
6805 # would have to look up the parent's reason and use it instead.
6806 $table->set_fate($fate, "Parent's fate");
6807
d867ccfb 6808 $self->set_caseless_equivalent($caseless_equivalent);
99870f4d
KW
6809 }
6810 }
6811
6812 # Now that we've declared these to be equivalent, any changes to one
6813 # of the tables would invalidate that equivalency.
6814 $self->lock;
6815 $other->lock;
6816 return;
6817 }
6818
8ae00c8a
KW
6819 sub set_complement {
6820 # Set $self to be the complement of the parameter table. $self is
6821 # locked, as what it contains should all come from the other table.
6822
6823 my $self = shift;
6824 my $other = shift;
6825
6826 my %args = @_;
6827 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6828
6829 if ($other->complement != 0) {
6830 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
6831 return;
6832 }
6833 my $addr = do { no overloading; pack 'J', $self; };
6834 $complement{$addr} = $other;
6835 $self->lock;
6836 return;
6837 }
6838
99870f4d
KW
6839 sub add_range { # Add a range to the list for this table.
6840 my $self = shift;
6841 # Rest of parameters passed on
6842
6843 return if $self->carp_if_locked;
6844 return $self->_range_list->add_range(@_);
6845 }
6846
88c22f80
KW
6847 sub header {
6848 my $self = shift;
6849 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6850
6851 # All match tables are to be used only by the Perl core.
126c3d4e 6852 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
88c22f80
KW
6853 }
6854
99870f4d
KW
6855 sub pre_body { # Does nothing for match tables.
6856 return
6857 }
6858
6859 sub append_to_body { # Does nothing for match tables.
6860 return
6861 }
6862
301ba948
KW
6863 sub set_fate {
6864 my $self = shift;
6865 my $fate = shift;
6866 my $reason = shift;
6867 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6868
6869 $self->SUPER::set_fate($fate, $reason);
6870
6871 # All children share this fate
6872 foreach my $child ($self->children) {
6873 $child->set_fate($fate, $reason);
6874 }
6875 return;
6876 }
6877
99870f4d
KW
6878 sub write {
6879 my $self = shift;
6880 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6881
6882 return $self->SUPER::write(2); # 2 tab stops
6883 }
6884
6885 sub set_final_comment {
6886 # This creates a comment for the file that is to hold the match table
6887 # $self. It is somewhat convoluted to make the English read nicely,
6888 # but, heh, it's just a comment.
6889 # This should be called only with the leader match table of all the
6890 # ones that share the same file. It lists all such tables, ordered so
6891 # that related ones are together.
6892
bd9ebcfd
KW
6893 return unless $debugging_build;
6894
99870f4d
KW
6895 my $leader = shift; # Should only be called on the leader table of
6896 # an equivalent group
6897 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6898
ffe43484 6899 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6900
6901 if ($leader{$addr} != $leader) {
6902 Carp::my_carp_bug(<<END
6903set_final_comment() must be called on a leader table, which $leader is not.
6904It is equivalent to $leader{$addr}. No comment created
6905END
6906 );
6907 return;
6908 }
6909
6910 # Get the number of code points matched by each of the tables in this
6911 # file, and add underscores for clarity.
6912 my $count = $leader->count;
6913 my $string_count = main::clarify_number($count);
6914
6915 my $loose_count = 0; # how many aliases loosely matched
6916 my $compound_name = ""; # ? Are any names compound?, and if so, an
6917 # example
6918 my $properties_with_compound_names = 0; # count of these
6919
6920
6921 my %flags; # The status flags used in the file
6922 my $total_entries = 0; # number of entries written in the comment
6923 my $matches_comment = ""; # The portion of the comment about the
6924 # \p{}'s
6925 my @global_comments; # List of all the tables' comments that are
6926 # there before this routine was called.
6927
6928 # Get list of all the parent tables that are equivalent to this one
6929 # (including itself).
6930 my @parents = grep { $parent{main::objaddr $_} == $_ }
6931 main::uniques($leader, @{$equivalents{$addr}});
6932 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6933 # tables
6934
6935 for my $parent (@parents) {
6936
6937 my $property = $parent->property;
6938
6939 # Special case 'N' tables in properties with two match tables when
6940 # the other is a 'Y' one. These are likely to be binary tables,
6941 # but not necessarily. In either case, \P{} will match the
6942 # complement of \p{}, and so if something is a synonym of \p, the
6943 # complement of that something will be the synonym of \P. This
6944 # would be true of any property with just two match tables, not
6945 # just those whose values are Y and N; but that would require a
6946 # little extra work, and there are none such so far in Unicode.
6947 my $perl_p = 'p'; # which is it? \p{} or \P{}
6948 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6949
6950 if (scalar $property->tables == 2
6951 && $parent == $property->table('N')
6952 && defined (my $yes = $property->table('Y')))
6953 {
ffe43484 6954 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6955 @yes_perl_synonyms
6956 = grep { $_->property == $perl }
6957 main::uniques($yes,
6958 $parent{$yes_addr},
6959 $parent{$yes_addr}->children);
6960
6961 # But these synonyms are \P{} ,not \p{}
6962 $perl_p = 'P';
6963 }
6964
6965 my @description; # Will hold the table description
6966 my @note; # Will hold the table notes.
6967 my @conflicting; # Will hold the table conflicts.
6968
6969 # Look at the parent, any yes synonyms, and all the children
ffe43484 6970 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6971 for my $table ($parent,
6972 @yes_perl_synonyms,
f998e60c 6973 @{$children{$parent_addr}})
99870f4d 6974 {
ffe43484 6975 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6976 my $table_property = $table->property;
6977
6978 # Tables are separated by a blank line to create a grouping.
6979 $matches_comment .= "\n" if $matches_comment;
6980
6981 # The table is named based on the property and value
6982 # combination it is for, like script=greek. But there may be
6983 # a number of synonyms for each side, like 'sc' for 'script',
6984 # and 'grek' for 'greek'. Any combination of these is a valid
6985 # name for this table. In this case, there are three more,
6986 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6987 # listing all possible combinations in the comment, we make
6988 # sure that each synonym occurs at least once, and add
6989 # commentary that the other combinations are possible.
da912e1e
KW
6990 # Because regular expressions don't recognize things like
6991 # \p{jsn=}, only look at non-null right-hand-sides
99870f4d 6992 my @property_aliases = $table_property->aliases;
da912e1e 6993 my @table_aliases = grep { $_->name ne "" } $table->aliases;
99870f4d
KW
6994
6995 # The alias lists above are already ordered in the order we
6996 # want to output them. To ensure that each synonym is listed,
da912e1e
KW
6997 # we must use the max of the two numbers. But if there are no
6998 # legal synonyms (nothing in @table_aliases), then we don't
6999 # list anything.
7000 my $listed_combos = (@table_aliases)
7001 ? main::max(scalar @table_aliases,
7002 scalar @property_aliases)
7003 : 0;
99870f4d
KW
7004 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7005
da912e1e 7006
99870f4d
KW
7007 my $property_had_compound_name = 0;
7008
7009 for my $i (0 .. $listed_combos - 1) {
7010 $total_entries++;
7011
7012 # The current alias for the property is the next one on
7013 # the list, or if beyond the end, start over. Similarly
7014 # for the table (\p{prop=table})
7015 my $property_alias = $property_aliases
7016 [$i % @property_aliases]->name;
7017 my $table_alias_object = $table_aliases
7018 [$i % @table_aliases];
7019 my $table_alias = $table_alias_object->name;
7020 my $loose_match = $table_alias_object->loose_match;
7021
7022 if ($table_alias !~ /\D/) { # Clarify large numbers.
7023 $table_alias = main::clarify_number($table_alias)
7024 }
7025
7026 # Add a comment for this alias combination
7027 my $current_match_comment;
7028 if ($table_property == $perl) {
7029 $current_match_comment = "\\$perl_p"
7030 . "{$table_alias}";
7031 }
7032 else {
7033 $current_match_comment
7034 = "\\p{$property_alias=$table_alias}";
7035 $property_had_compound_name = 1;
7036 }
7037
7038 # Flag any abnormal status for this table.
7039 my $flag = $property->status
7040 || $table->status
7041 || $table_alias_object->status;
301ba948 7042 $flags{$flag} = $status_past_participles{$flag} if $flag;
99870f4d
KW
7043
7044 $loose_count++;
7045
7046 # Pretty up the comment. Note the \b; it says don't make
7047 # this line a continuation.
7048 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7049 $flag,
7050 " " x 7,
7051 $current_match_comment);
7052 } # End of generating the entries for this table.
7053
7054 # Save these for output after this group of related tables.
7055 push @description, $table->description;
7056 push @note, $table->note;
7057 push @conflicting, $table->conflicting;
7058
37e2e78e
KW
7059 # And this for output after all the tables.
7060 push @global_comments, $table->comment;
7061
99870f4d
KW
7062 # Compute an alternate compound name using the final property
7063 # synonym and the first table synonym with a colon instead of
7064 # the equal sign used elsewhere.
7065 if ($property_had_compound_name) {
7066 $properties_with_compound_names ++;
7067 if (! $compound_name || @property_aliases > 1) {
7068 $compound_name = $property_aliases[-1]->name
7069 . ': '
7070 . $table_aliases[0]->name;
7071 }
7072 }
7073 } # End of looping through all children of this table
7074
7075 # Here have assembled in $matches_comment all the related tables
7076 # to the current parent (preceded by the same info for all the
7077 # previous parents). Put out information that applies to all of
7078 # the current family.
7079 if (@conflicting) {
7080
7081 # But output the conflicting information now, as it applies to
7082 # just this table.
7083 my $conflicting = join ", ", @conflicting;
7084 if ($conflicting) {
7085 $matches_comment .= <<END;
7086
7087 Note that contrary to what you might expect, the above is NOT the same as
7088END
7089 $matches_comment .= "any of: " if @conflicting > 1;
7090 $matches_comment .= "$conflicting\n";
7091 }
7092 }
7093 if (@description) {
7094 $matches_comment .= "\n Meaning: "
7095 . join('; ', @description)
7096 . "\n";
7097 }
7098 if (@note) {
7099 $matches_comment .= "\n Note: "
7100 . join("\n ", @note)
7101 . "\n";
7102 }
7103 } # End of looping through all tables
7104
7105
7106 my $code_points;
7107 my $match;
7108 my $any_of_these;
7109 if ($count == 1) {
7110 $match = 'matches';
7111 $code_points = 'single code point';
7112 }
7113 else {
7114 $match = 'match';
7115 $code_points = "$string_count code points";
7116 }
7117
7118 my $synonyms;
7119 my $entries;
da912e1e 7120 if ($total_entries == 1) {
99870f4d
KW
7121 $synonyms = "";
7122 $entries = 'entry';
7123 $any_of_these = 'this'
7124 }
7125 else {
7126 $synonyms = " any of the following regular expression constructs";
7127 $entries = 'entries';
7128 $any_of_these = 'any of these'
7129 }
7130
6efd5c72 7131 my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
99870f4d
KW
7132 if ($has_unrelated) {
7133 $comment .= <<END;
7134This file is for tables that are not necessarily related: To conserve
7135resources, every table that matches the identical set of code points in this
7136version of Unicode uses this file. Each one is listed in a separate group
7137below. It could be that the tables will match the same set of code points in
7138other Unicode releases, or it could be purely coincidence that they happen to
7139be the same in Unicode $string_version, and hence may not in other versions.
7140
7141END
7142 }
7143
7144 if (%flags) {
7145 foreach my $flag (sort keys %flags) {
7146 $comment .= <<END;
37e2e78e 7147'$flag' below means that this form is $flags{$flag}.
301ba948 7148Consult $pod_file.pod
99870f4d
KW
7149END
7150 }
7151 $comment .= "\n";
7152 }
7153
da912e1e
KW
7154 if ($total_entries == 0) {
7155 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7156 $comment .= <<END;
7157This file returns the $code_points in Unicode Version $string_version for
7158$leader, but it is inaccessible through Perl regular expressions, as
7159"\\p{prop=}" is not recognized.
7160END
7161
7162 } else {
7163 $comment .= <<END;
99870f4d
KW
7164This file returns the $code_points in Unicode Version $string_version that
7165$match$synonyms:
7166
7167$matches_comment
37e2e78e 7168$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
7169including if adding or subtracting white space, underscore, and hyphen
7170characters matters or doesn't matter, and other permissible syntactic
7171variants. Upper/lower case distinctions never matter.
7172END
7173
da912e1e 7174 }
99870f4d
KW
7175 if ($compound_name) {
7176 $comment .= <<END;
7177
7178A colon can be substituted for the equals sign, and
7179END
7180 if ($properties_with_compound_names > 1) {
7181 $comment .= <<END;
7182within each group above,
7183END
7184 }
7185 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7186
7187 # Note the \b below, it says don't make that line a continuation.
7188 $comment .= <<END;
7189anything to the left of the equals (or colon) can be combined with anything to
7190the right. Thus, for example,
7191$compound_name
7192\bis also valid.
7193END
7194 }
7195
7196 # And append any comment(s) from the actual tables. They are all
7197 # gathered here, so may not read all that well.
37e2e78e
KW
7198 if (@global_comments) {
7199 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7200 }
99870f4d
KW
7201
7202 if ($count) { # The format differs if no code points, and needs no
7203 # explanation in that case
7204 $comment.= <<END;
7205
7206The format of the lines of this file is:
7207END
7208 $comment.= <<END;
7209START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7210STOP is the ending point, or if omitted, the range has just one code point.
7211END
0c07e538 7212 if ($leader->output_range_counts) {
99870f4d
KW
7213 $comment .= <<END;
7214Numbers in comments in [brackets] indicate how many code points are in the
7215range.
7216END
7217 }
7218 }
7219
7220 $leader->set_comment(main::join_lines($comment));
7221 return;
7222 }
7223
7224 # Accessors for the underlying list
ea25a9b2 7225 for my $sub (qw(
99870f4d
KW
7226 get_valid_code_point
7227 get_invalid_code_point
ea25a9b2 7228 ))
99870f4d
KW
7229 {
7230 no strict "refs";
7231 *$sub = sub {
7232 use strict "refs";
7233 my $self = shift;
7234
7235 return $self->_range_list->$sub(@_);
7236 }
7237 }
7238} # End closure for Match_Table
7239
7240package Property;
7241
7242# The Property class represents a Unicode property, or the $perl
7243# pseudo-property. It contains a map table initialized empty at construction
7244# time, and for properties accessible through regular expressions, various
7245# match tables, created through the add_match_table() method, and referenced
7246# by the table('NAME') or tables() methods, the latter returning a list of all
7247# of the match tables. Otherwise table operations implicitly are for the map
7248# table.
7249#
7250# Most of the data in the property is actually about its map table, so it
7251# mostly just uses that table's accessors for most methods. The two could
7252# have been combined into one object, but for clarity because of their
7253# differing semantics, they have been kept separate. It could be argued that
7254# the 'file' and 'directory' fields should be kept with the map table.
7255#
7256# Each property has a type. This can be set in the constructor, or in the
7257# set_type accessor, but mostly it is figured out by the data. Every property
7258# starts with unknown type, overridden by a parameter to the constructor, or
7259# as match tables are added, or ranges added to the map table, the data is
7260# inspected, and the type changed. After the table is mostly or entirely
7261# filled, compute_type() should be called to finalize they analysis.
7262#
7263# There are very few operations defined. One can safely remove a range from
7264# the map table, and property_add_or_replace_non_nulls() adds the maps from another
7265# table to this one, replacing any in the intersection of the two.
7266
7267sub standardize { return main::standardize($_[0]); }
7268sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7269
7270{ # Closure
7271
7272 # This hash will contain as keys, all the aliases of all properties, and
7273 # as values, pointers to their respective property objects. This allows
7274 # quick look-up of a property from any of its names.
7275 my %alias_to_property_of;
7276
7277 sub dump_alias_to_property_of {
7278 # For debugging
7279
7280 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7281 return;
7282 }
7283
7284 sub property_ref {
7285 # This is a package subroutine, not called as a method.
7286 # If the single parameter is a literal '*' it returns a list of all
7287 # defined properties.
7288 # Otherwise, the single parameter is a name, and it returns a pointer
7289 # to the corresponding property object, or undef if none.
7290 #
7291 # Properties can have several different names. The 'standard' form of
7292 # each of them is stored in %alias_to_property_of as they are defined.
7293 # But it's possible that this subroutine will be called with some
7294 # variant, so if the initial lookup fails, it is repeated with the
98dc9551 7295 # standardized form of the input name. If found, besides returning the
99870f4d
KW
7296 # result, the input name is added to the list so future calls won't
7297 # have to do the conversion again.
7298
7299 my $name = shift;
7300
7301 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7302
7303 if (! defined $name) {
7304 Carp::my_carp_bug("Undefined input property. No action taken.");
7305 return;
7306 }
7307
7308 return main::uniques(values %alias_to_property_of) if $name eq '*';
7309
7310 # Return cached result if have it.
7311 my $result = $alias_to_property_of{$name};
7312 return $result if defined $result;
7313
7314 # Convert the input to standard form.
7315 my $standard_name = standardize($name);
7316
7317 $result = $alias_to_property_of{$standard_name};
7318 return unless defined $result; # Don't cache undefs
7319
7320 # Cache the result before returning it.
7321 $alias_to_property_of{$name} = $result;
7322 return $result;
7323 }
7324
7325
7326 main::setup_package();
7327
7328 my %map;
7329 # A pointer to the map table object for this property
7330 main::set_access('map', \%map);
7331
7332 my %full_name;
7333 # The property's full name. This is a duplicate of the copy kept in the
7334 # map table, but is needed because stringify needs it during
7335 # construction of the map table, and then would have a chicken before egg
7336 # problem.
7337 main::set_access('full_name', \%full_name, 'r');
7338
7339 my %table_ref;
7340 # This hash will contain as keys, all the aliases of any match tables
7341 # attached to this property, and as values, the pointers to their
7342 # respective tables. This allows quick look-up of a table from any of its
7343 # names.
7344 main::set_access('table_ref', \%table_ref);
7345
7346 my %type;
7347 # The type of the property, $ENUM, $BINARY, etc
7348 main::set_access('type', \%type, 'r');
7349
7350 my %file;
7351 # The filename where the map table will go (if actually written).
7352 # Normally defaulted, but can be overridden.
7353 main::set_access('file', \%file, 'r', 's');
7354
7355 my %directory;
7356 # The directory where the map table will go (if actually written).
7357 # Normally defaulted, but can be overridden.
7358 main::set_access('directory', \%directory, 's');
7359
7360 my %pseudo_map_type;
7361 # This is used to affect the calculation of the map types for all the
7362 # ranges in the table. It should be set to one of the values that signify
7363 # to alter the calculation.
7364 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7365
7366 my %has_only_code_point_maps;
7367 # A boolean used to help in computing the type of data in the map table.
7368 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7369
7370 my %unique_maps;
7371 # A list of the first few distinct mappings this property has. This is
7372 # used to disambiguate between binary and enum property types, so don't
7373 # have to keep more than three.
7374 main::set_access('unique_maps', \%unique_maps);
7375
56557540
KW
7376 my %pre_declared_maps;
7377 # A boolean that gives whether the input data should declare all the
7378 # tables used, or not. If the former, unknown ones raise a warning.
7379 main::set_access('pre_declared_maps',
047274f2 7380 \%pre_declared_maps, 'r', 's');
56557540 7381
99870f4d
KW
7382 sub new {
7383 # The only required parameter is the positionally first, name. All
7384 # other parameters are key => value pairs. See the documentation just
7385 # above for the meanings of the ones not passed directly on to the map
7386 # table constructor.
7387
7388 my $class = shift;
7389 my $name = shift || "";
7390
7391 my $self = property_ref($name);
7392 if (defined $self) {
7393 my $options_string = join ", ", @_;
7394 $options_string = ". Ignoring options $options_string" if $options_string;
7395 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7396 return $self;
7397 }
7398
7399 my %args = @_;
7400
7401 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 7402 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7403
7404 $directory{$addr} = delete $args{'Directory'};
7405 $file{$addr} = delete $args{'File'};
7406 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7407 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7408 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
56557540
KW
7409 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7410 # Starting in this release, property
7411 # values should be defined for all
7412 # properties, except those overriding this
7413 // $v_version ge v5.1.0;
c12f2655 7414
99870f4d
KW
7415 # Rest of parameters passed on.
7416
7417 $has_only_code_point_maps{$addr} = 1;
7418 $table_ref{$addr} = { };
7419 $unique_maps{$addr} = { };
7420
7421 $map{$addr} = Map_Table->new($name,
7422 Full_Name => $full_name{$addr},
7423 _Alias_Hash => \%alias_to_property_of,
7424 _Property => $self,
7425 %args);
7426 return $self;
7427 }
7428
7429 # See this program's beginning comment block about overloading the copy
7430 # constructor. Few operations are defined on properties, but a couple are
7431 # useful. It is safe to take the inverse of a property, and to remove a
7432 # single code point from it.
7433 use overload
7434 fallback => 0,
7435 qw("") => "_operator_stringify",
7436 "." => \&main::_operator_dot,
7437 '==' => \&main::_operator_equal,
7438 '!=' => \&main::_operator_not_equal,
7439 '=' => sub { return shift },
7440 '-=' => "_minus_and_equal",
7441 ;
7442
7443 sub _operator_stringify {
7444 return "Property '" . shift->full_name . "'";
7445 }
7446
7447 sub _minus_and_equal {
7448 # Remove a single code point from the map table of a property.
7449
7450 my $self = shift;
7451 my $other = shift;
7452 my $reversed = shift;
7453 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7454
7455 if (ref $other) {
7456 Carp::my_carp_bug("Can't cope with a "
7457 . ref($other)
7458 . " argument to '-='. Subtraction ignored.");
7459 return $self;
7460 }
98dc9551 7461 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
99870f4d
KW
7462 Carp::my_carp_bug("Can't cope with a "
7463 . __PACKAGE__
7464 . " being the first parameter in a '-='. Subtraction ignored.");
7465 return $self;
7466 }
7467 else {
f998e60c 7468 no overloading;
051df77b 7469 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
7470 }
7471 return $self;
7472 }
7473
7474 sub add_match_table {
7475 # Add a new match table for this property, with name given by the
7476 # parameter. It returns a pointer to the table.
7477
7478 my $self = shift;
7479 my $name = shift;
7480 my %args = @_;
7481
ffe43484 7482 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7483
7484 my $table = $table_ref{$addr}{$name};
7485 my $standard_name = main::standardize($name);
7486 if (defined $table
7487 || (defined ($table = $table_ref{$addr}{$standard_name})))
7488 {
7489 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7490 $table_ref{$addr}{$name} = $table;
7491 return $table;
7492 }
7493 else {
7494
7495 # See if this is a perl extension, if not passed in.
7496 my $perl_extension = delete $args{'Perl_Extension'};
7497 $perl_extension
7498 = $self->perl_extension if ! defined $perl_extension;
7499
7500 $table = Match_Table->new(
7501 Name => $name,
7502 Perl_Extension => $perl_extension,
7503 _Alias_Hash => $table_ref{$addr},
7504 _Property => $self,
7505
301ba948
KW
7506 # gets property's fate and status by default
7507 Fate => $self->fate,
99870f4d
KW
7508 Status => $self->status,
7509 _Status_Info => $self->status_info,
88c22f80 7510 %args);
99870f4d
KW
7511 return unless defined $table;
7512 }
7513
7514 # Save the names for quick look up
7515 $table_ref{$addr}{$standard_name} = $table;
7516 $table_ref{$addr}{$name} = $table;
7517
7518 # Perhaps we can figure out the type of this property based on the
7519 # fact of adding this match table. First, string properties don't
7520 # have match tables; second, a binary property can't have 3 match
7521 # tables
7522 if ($type{$addr} == $UNKNOWN) {
7523 $type{$addr} = $NON_STRING;
7524 }
7525 elsif ($type{$addr} == $STRING) {
7526 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7527 $type{$addr} = $NON_STRING;
7528 }
06f26c45 7529 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
99870f4d
KW
7530 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7531 && $type{$addr} == $BINARY)
7532 {
7533 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7534 $type{$addr} = $ENUM;
7535 }
7536 }
7537
7538 return $table;
7539 }
7540
4b9b0bc5
KW
7541 sub delete_match_table {
7542 # Delete the table referred to by $2 from the property $1.
7543
7544 my $self = shift;
7545 my $table_to_remove = shift;
7546 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7547
7548 my $addr = do { no overloading; pack 'J', $self; };
7549
7550 # Remove all names that refer to it.
7551 foreach my $key (keys %{$table_ref{$addr}}) {
7552 delete $table_ref{$addr}{$key}
7553 if $table_ref{$addr}{$key} == $table_to_remove;
7554 }
7555
7556 $table_to_remove->DESTROY;
7557 return;
7558 }
7559
99870f4d
KW
7560 sub table {
7561 # Return a pointer to the match table (with name given by the
7562 # parameter) associated with this property; undef if none.
7563
7564 my $self = shift;
7565 my $name = shift;
7566 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7567
ffe43484 7568 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7569
7570 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7571
7572 # If quick look-up failed, try again using the standard form of the
7573 # input name. If that succeeds, cache the result before returning so
7574 # won't have to standardize this input name again.
7575 my $standard_name = main::standardize($name);
7576 return unless defined $table_ref{$addr}{$standard_name};
7577
7578 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7579 return $table_ref{$addr}{$name};
7580 }
7581
7582 sub tables {
7583 # Return a list of pointers to all the match tables attached to this
7584 # property
7585
f998e60c 7586 no overloading;
051df77b 7587 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
7588 }
7589
7590 sub directory {
7591 # Returns the directory the map table for this property should be
7592 # output in. If a specific directory has been specified, that has
7593 # priority; 'undef' is returned if the type isn't defined;
7594 # or $map_directory for everything else.
7595
ffe43484 7596 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
7597
7598 return $directory{$addr} if defined $directory{$addr};
7599 return undef if $type{$addr} == $UNKNOWN;
7600 return $map_directory;
7601 }
7602
7603 sub swash_name {
7604 # Return the name that is used to both:
7605 # 1) Name the file that the map table is written to.
7606 # 2) The name of swash related stuff inside that file.
7607 # The reason for this is that the Perl core historically has used
7608 # certain names that aren't the same as the Unicode property names.
7609 # To continue using these, $file is hard-coded in this file for those,
7610 # but otherwise the standard name is used. This is different from the
7611 # external_name, so that the rest of the files, like in lib can use
7612 # the standard name always, without regard to historical precedent.
7613
7614 my $self = shift;
7615 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7616
ffe43484 7617 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 7618
19f751d2
KW
7619 # Swash names are used only on regular map tables; otherwise there
7620 # should be no access to the property map table from other parts of
7621 # Perl.
7622 return if $map{$addr}->fate != $ORDINARY;
7623
99870f4d
KW
7624 return $file{$addr} if defined $file{$addr};
7625 return $map{$addr}->external_name;
7626 }
7627
7628 sub to_create_match_tables {
7629 # Returns a boolean as to whether or not match tables should be
7630 # created for this property.
7631
7632 my $self = shift;
7633 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7634
7635 # The whole point of this pseudo property is match tables.
7636 return 1 if $self == $perl;
7637
ffe43484 7638 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7639
7640 # Don't generate tables of code points that match the property values
7641 # of a string property. Such a list would most likely have many
7642 # property values, each with just one or very few code points mapping
7643 # to it.
7644 return 0 if $type{$addr} == $STRING;
7645
7646 # Don't generate anything for unimplemented properties.
7647 return 0 if grep { $self->complete_name eq $_ }
7648 @unimplemented_properties;
7649 # Otherwise, do.
7650 return 1;
7651 }
7652
7653 sub property_add_or_replace_non_nulls {
7654 # This adds the mappings in the property $other to $self. Non-null
7655 # mappings from $other override those in $self. It essentially merges
7656 # the two properties, with the second having priority except for null
7657 # mappings.
7658
7659 my $self = shift;
7660 my $other = shift;
7661 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7662
7663 if (! $other->isa(__PACKAGE__)) {
7664 Carp::my_carp_bug("$other should be a "
7665 . __PACKAGE__
7666 . ". Not a '"
7667 . ref($other)
7668 . "'. Not added;");
7669 return;
7670 }
7671
f998e60c 7672 no overloading;
051df77b 7673 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
7674 }
7675
5be997b0
KW
7676 sub set_proxy_for {
7677 # Certain tables are not generally written out to files, but
7678 # Unicode::UCD has the intelligence to know that the file for $self
7679 # can be used to reconstruct those tables. This routine just changes
7680 # things so that UCD pod entries for those suppressed tables are
7681 # generated, so the fact that a proxy is used is invisible to the
7682 # user.
7683
7684 my $self = shift;
7685
7686 foreach my $property_name (@_) {
7687 my $ref = property_ref($property_name);
7688 next if $ref->to_output_map;
7689 $ref->set_fate($MAP_PROXIED);
7690 }
7691 }
7692
99870f4d
KW
7693 sub set_type {
7694 # Set the type of the property. Mostly this is figured out by the
7695 # data in the table. But this is used to set it explicitly. The
7696 # reason it is not a standard accessor is that when setting a binary
7697 # property, we need to make sure that all the true/false aliases are
7698 # present, as they were omitted in early Unicode releases.
7699
7700 my $self = shift;
7701 my $type = shift;
7702 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7703
06f26c45
KW
7704 if ($type != $ENUM
7705 && $type != $BINARY
7706 && $type != $FORCED_BINARY
7707 && $type != $STRING)
7708 {
99870f4d
KW
7709 Carp::my_carp("Unrecognized type '$type'. Type not set");
7710 return;
7711 }
7712
051df77b 7713 { no overloading; $type{pack 'J', $self} = $type; }
06f26c45 7714 return if $type != $BINARY && $type != $FORCED_BINARY;
99870f4d
KW
7715
7716 my $yes = $self->table('Y');
7717 $yes = $self->table('Yes') if ! defined $yes;
01adf4be
KW
7718 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7719 if ! defined $yes;
7720
3c6bf941
KW
7721 # Add aliases in order wanted, duplicates will be ignored. We use a
7722 # binary property present in all releases for its ordered lists of
7723 # true/false aliases. Note, that could run into problems in
7724 # outputting things in that we don't distinguish between the name and
7725 # full name of these. Hopefully, if the table was already created
7726 # before this code is executed, it was done with these set properly.
7727 my $bm = property_ref("Bidi_Mirrored");
7728 foreach my $alias ($bm->table("Y")->aliases) {
7729 $yes->add_alias($alias->name);
7730 }
99870f4d
KW
7731 my $no = $self->table('N');
7732 $no = $self->table('No') if ! defined $no;
01adf4be 7733 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
3c6bf941
KW
7734 foreach my $alias ($bm->table("N")->aliases) {
7735 $no->add_alias($alias->name);
7736 }
c12f2655 7737
99870f4d
KW
7738 return;
7739 }
7740
7741 sub add_map {
7742 # Add a map to the property's map table. This also keeps
7743 # track of the maps so that the property type can be determined from
7744 # its data.
7745
7746 my $self = shift;
7747 my $start = shift; # First code point in range
7748 my $end = shift; # Final code point in range
7749 my $map = shift; # What the range maps to.
7750 # Rest of parameters passed on.
7751
ffe43484 7752 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7753
7754 # If haven't the type of the property, gather information to figure it
7755 # out.
7756 if ($type{$addr} == $UNKNOWN) {
7757
7758 # If the map contains an interior blank or dash, or most other
7759 # nonword characters, it will be a string property. This
7760 # heuristic may actually miss some string properties. If so, they
7761 # may need to have explicit set_types called for them. This
7762 # happens in the Unihan properties.
7763 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7764 || $map =~ / [^\w.\/\ -] /x)
7765 {
7766 $self->set_type($STRING);
7767
7768 # $unique_maps is used for disambiguating between ENUM and
7769 # BINARY later; since we know the property is not going to be
7770 # one of those, no point in keeping the data around
7771 undef $unique_maps{$addr};
7772 }
7773 else {
7774
7775 # Not necessarily a string. The final decision has to be
7776 # deferred until all the data are in. We keep track of if all
7777 # the values are code points for that eventual decision.
7778 $has_only_code_point_maps{$addr} &=
7779 $map =~ / ^ $code_point_re $/x;
7780
7781 # For the purposes of disambiguating between binary and other
7782 # enumerations at the end, we keep track of the first three
7783 # distinct property values. Once we get to three, we know
7784 # it's not going to be binary, so no need to track more.
7785 if (scalar keys %{$unique_maps{$addr}} < 3) {
7786 $unique_maps{$addr}{main::standardize($map)} = 1;
7787 }
7788 }
7789 }
7790
7791 # Add the mapping by calling our map table's method
7792 return $map{$addr}->add_map($start, $end, $map, @_);
7793 }
7794
7795 sub compute_type {
7796 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7797 # should be called after the property is mostly filled with its maps.
7798 # We have been keeping track of what the property values have been,
7799 # and now have the necessary information to figure out the type.
7800
7801 my $self = shift;
7802 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7803
ffe43484 7804 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7805
7806 my $type = $type{$addr};
7807
7808 # If already have figured these out, no need to do so again, but we do
7809 # a double check on ENUMS to make sure that a string property hasn't
7810 # improperly been classified as an ENUM, so continue on with those.
06f26c45
KW
7811 return if $type == $STRING
7812 || $type == $BINARY
7813 || $type == $FORCED_BINARY;
99870f4d
KW
7814
7815 # If every map is to a code point, is a string property.
7816 if ($type == $UNKNOWN
7817 && ($has_only_code_point_maps{$addr}
7818 || (defined $map{$addr}->default_map
7819 && $map{$addr}->default_map eq "")))
7820 {
7821 $self->set_type($STRING);
7822 }
7823 else {
7824
7825 # Otherwise, it is to some sort of enumeration. (The case where
7826 # it is a Unicode miscellaneous property, and treated like a
7827 # string in this program is handled in add_map()). Distinguish
7828 # between binary and some other enumeration type. Of course, if
7829 # there are more than two values, it's not binary. But more
7830 # subtle is the test that the default mapping is defined means it
7831 # isn't binary. This in fact may change in the future if Unicode
7832 # changes the way its data is structured. But so far, no binary
7833 # properties ever have @missing lines for them, so the default map
7834 # isn't defined for them. The few properties that are two-valued
7835 # and aren't considered binary have the default map defined
7836 # starting in Unicode 5.0, when the @missing lines appeared; and
7837 # this program has special code to put in a default map for them
7838 # for earlier than 5.0 releases.
7839 if ($type == $ENUM
7840 || scalar keys %{$unique_maps{$addr}} > 2
7841 || defined $self->default_map)
7842 {
7843 my $tables = $self->tables;
7844 my $count = $self->count;
7845 if ($verbosity && $count > 500 && $tables/$count > .1) {
7846 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7847 }
7848 $self->set_type($ENUM);
7849 }
7850 else {
7851 $self->set_type($BINARY);
7852 }
7853 }
7854 undef $unique_maps{$addr}; # Garbage collect
7855 return;
7856 }
7857
301ba948
KW
7858 sub set_fate {
7859 my $self = shift;
7860 my $fate = shift;
7861 my $reason = shift; # Ignored unless suppressing
7862 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7863
7864 my $addr = do { no overloading; pack 'J', $self; };
7865 if ($fate == $SUPPRESSED) {
7866 $why_suppressed{$self->complete_name} = $reason;
7867 }
7868
395dfc19
KW
7869 # Each table shares the property's fate, except that MAP_PROXIED
7870 # doesn't affect match tables
7871 $map{$addr}->set_fate($fate, $reason);
7872 if ($fate != $MAP_PROXIED) {
13a49092
KW
7873 foreach my $table ($map{$addr}, $self->tables) {
7874 $table->set_fate($fate, $reason);
7875 }
395dfc19 7876 }
301ba948
KW
7877 return;
7878 }
7879
7880
99870f4d
KW
7881 # Most of the accessors for a property actually apply to its map table.
7882 # Setup up accessor functions for those, referring to %map
ea25a9b2 7883 for my $sub (qw(
99870f4d
KW
7884 add_alias
7885 add_anomalous_entry
7886 add_comment
7887 add_conflicting
7888 add_description
7889 add_duplicate
7890 add_note
7891 aliases
7892 comment
7893 complete_name
2f7a8815 7894 containing_range
99870f4d
KW
7895 count
7896 default_map
7897 delete_range
7898 description
7899 each_range
7900 external_name
301ba948 7901 fate
99870f4d
KW
7902 file_path
7903 format
7904 initialize
7905 inverse_list
7906 is_empty
7907 name
7908 note
7909 perl_extension
7910 property
7911 range_count
7912 ranges
7913 range_size_1
7914 reset_each_range
7915 set_comment
99870f4d
KW
7916 set_default_map
7917 set_file_path
7918 set_final_comment
26561784 7919 _set_format
99870f4d
KW
7920 set_range_size_1
7921 set_status
7922 set_to_output_map
7923 short_name
7924 status
7925 status_info
7926 to_output_map
0a9dbafc 7927 type_of
99870f4d
KW
7928 value_of
7929 write
ea25a9b2 7930 ))
99870f4d
KW
7931 # 'property' above is for symmetry, so that one can take
7932 # the property of a property and get itself, and so don't
7933 # have to distinguish between properties and tables in
7934 # calling code
7935 {
7936 no strict "refs";
7937 *$sub = sub {
7938 use strict "refs";
7939 my $self = shift;
f998e60c 7940 no overloading;
051df77b 7941 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7942 }
7943 }
7944
7945
7946} # End closure
7947
7948package main;
7949
7950sub join_lines($) {
7951 # Returns lines of the input joined together, so that they can be folded
7952 # properly.
7953 # This causes continuation lines to be joined together into one long line
7954 # for folding. A continuation line is any line that doesn't begin with a
7955 # space or "\b" (the latter is stripped from the output). This is so
7956 # lines can be be in a HERE document so as to fit nicely in the terminal
7957 # width, but be joined together in one long line, and then folded with
7958 # indents, '#' prefixes, etc, properly handled.
7959 # A blank separates the joined lines except if there is a break; an extra
7960 # blank is inserted after a period ending a line.
7961
98dc9551 7962 # Initialize the return with the first line.
99870f4d
KW
7963 my ($return, @lines) = split "\n", shift;
7964
7965 # If the first line is null, it was an empty line, add the \n back in
7966 $return = "\n" if $return eq "";
7967
7968 # Now join the remainder of the physical lines.
7969 for my $line (@lines) {
7970
7971 # An empty line means wanted a blank line, so add two \n's to get that
7972 # effect, and go to the next line.
7973 if (length $line == 0) {
7974 $return .= "\n\n";
7975 next;
7976 }
7977
7978 # Look at the last character of what we have so far.
7979 my $previous_char = substr($return, -1, 1);
7980
7981 # And at the next char to be output.
7982 my $next_char = substr($line, 0, 1);
7983
7984 if ($previous_char ne "\n") {
7985
7986 # Here didn't end wth a nl. If the next char a blank or \b, it
7987 # means that here there is a break anyway. So add a nl to the
7988 # output.
7989 if ($next_char eq " " || $next_char eq "\b") {
7990 $previous_char = "\n";
7991 $return .= $previous_char;
7992 }
7993
7994 # Add an extra space after periods.
7995 $return .= " " if $previous_char eq '.';
7996 }
7997
7998 # Here $previous_char is still the latest character to be output. If
7999 # it isn't a nl, it means that the next line is to be a continuation
8000 # line, with a blank inserted between them.
8001 $return .= " " if $previous_char ne "\n";
8002
8003 # Get rid of any \b
8004 substr($line, 0, 1) = "" if $next_char eq "\b";
8005
8006 # And append this next line.
8007 $return .= $line;
8008 }
8009
8010 return $return;
8011}
8012
8013sub simple_fold($;$$$) {
8014 # Returns a string of the input (string or an array of strings) folded
8015 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8016 # a \n
8017 # This is tailored for the kind of text written by this program,
8018 # especially the pod file, which can have very long names with
8019 # underscores in the middle, or words like AbcDefgHij.... We allow
8020 # breaking in the middle of such constructs if the line won't fit
8021 # otherwise. The break in such cases will come either just after an
8022 # underscore, or just before one of the Capital letters.
8023
8024 local $to_trace = 0 if main::DEBUG;
8025
8026 my $line = shift;
8027 my $prefix = shift; # Optional string to prepend to each output
8028 # line
8029 $prefix = "" unless defined $prefix;
8030
8031 my $hanging_indent = shift; # Optional number of spaces to indent
8032 # continuation lines
8033 $hanging_indent = 0 unless $hanging_indent;
8034
8035 my $right_margin = shift; # Optional number of spaces to narrow the
8036 # total width by.
8037 $right_margin = 0 unless defined $right_margin;
8038
8039 # Call carp with the 'nofold' option to avoid it from trying to call us
8040 # recursively
8041 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8042
8043 # The space available doesn't include what's automatically prepended
8044 # to each line, or what's reserved on the right.
8045 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8046 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8047
8048 if (DEBUG && $hanging_indent >= $max) {
8049 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8050 $hanging_indent = 0;
8051 }
8052
8053 # First, split into the current physical lines.
8054 my @line;
8055 if (ref $line) { # Better be an array, because not bothering to
8056 # test
8057 foreach my $line (@{$line}) {
8058 push @line, split /\n/, $line;
8059 }
8060 }
8061 else {
8062 @line = split /\n/, $line;
8063 }
8064
8065 #local $to_trace = 1 if main::DEBUG;
8066 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8067
8068 # Look at each current physical line.
8069 for (my $i = 0; $i < @line; $i++) {
8070 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8071 #local $to_trace = 1 if main::DEBUG;
8072 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8073
8074 # Remove prefix, because will be added back anyway, don't want
8075 # doubled prefix
8076 $line[$i] =~ s/^$prefix//;
8077
8078 # Remove trailing space
8079 $line[$i] =~ s/\s+\Z//;
8080
8081 # If the line is too long, fold it.
8082 if (length $line[$i] > $max) {
8083 my $remainder;
8084
8085 # Here needs to fold. Save the leading space in the line for
8086 # later.
8087 $line[$i] =~ /^ ( \s* )/x;
8088 my $leading_space = $1;
8089 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8090
8091 # If character at final permissible position is white space,
8092 # fold there, which will delete that white space
8093 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8094 $remainder = substr($line[$i], $max);
8095 $line[$i] = substr($line[$i], 0, $max - 1);
8096 }
8097 else {
8098
8099 # Otherwise fold at an acceptable break char closest to
8100 # the max length. Look at just the maximal initial
8101 # segment of the line
8102 my $segment = substr($line[$i], 0, $max - 1);
8103 if ($segment =~
8104 /^ ( .{$hanging_indent} # Don't look before the
8105 # indent.
8106 \ * # Don't look in leading
8107 # blanks past the indent
8108 [^ ] .* # Find the right-most
8109 (?: # acceptable break:
8110 [ \s = ] # space or equal
8111 | - (?! [.0-9] ) # or non-unary minus.
8112 ) # $1 includes the character
8113 )/x)
8114 {
8115 # Split into the initial part that fits, and remaining
8116 # part of the input
8117 $remainder = substr($line[$i], length $1);
8118 $line[$i] = $1;
8119 trace $line[$i] if DEBUG && $to_trace;
8120 trace $remainder if DEBUG && $to_trace;
8121 }
8122
8123 # If didn't find a good breaking spot, see if there is a
8124 # not-so-good breaking spot. These are just after
8125 # underscores or where the case changes from lower to
8126 # upper. Use \a as a soft hyphen, but give up
8127 # and don't break the line if there is actually a \a
8128 # already in the input. We use an ascii character for the
8129 # soft-hyphen to avoid any attempt by miniperl to try to
8130 # access the files that this program is creating.
8131 elsif ($segment !~ /\a/
8132 && ($segment =~ s/_/_\a/g
8133 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8134 {
8135 # Here were able to find at least one place to insert
8136 # our substitute soft hyphen. Find the right-most one
8137 # and replace it by a real hyphen.
8138 trace $segment if DEBUG && $to_trace;
8139 substr($segment,
8140 rindex($segment, "\a"),
8141 1) = '-';
8142
8143 # Then remove the soft hyphen substitutes.
8144 $segment =~ s/\a//g;
8145 trace $segment if DEBUG && $to_trace;
8146
8147 # And split into the initial part that fits, and
8148 # remainder of the line
8149 my $pos = rindex($segment, '-');
8150 $remainder = substr($line[$i], $pos);
8151 trace $remainder if DEBUG && $to_trace;
8152 $line[$i] = substr($segment, 0, $pos + 1);
8153 }
8154 }
8155
8156 # Here we know if we can fold or not. If we can, $remainder
8157 # is what remains to be processed in the next iteration.
8158 if (defined $remainder) {
8159 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8160
8161 # Insert the folded remainder of the line as a new element
8162 # of the array. (It may still be too long, but we will
8163 # deal with that next time through the loop.) Omit any
8164 # leading space in the remainder.
8165 $remainder =~ s/^\s+//;
8166 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8167
8168 # But then indent by whichever is larger of:
8169 # 1) the leading space on the input line;
8170 # 2) the hanging indent.
8171 # This preserves indentation in the original line.
8172 my $lead = ($leading_space)
8173 ? length $leading_space
8174 : $hanging_indent;
8175 $lead = max($lead, $hanging_indent);
8176 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8177 }
8178 }
8179
8180 # Ready to output the line. Get rid of any trailing space
8181 # And prefix by the required $prefix passed in.
8182 $line[$i] =~ s/\s+$//;
8183 $line[$i] = "$prefix$line[$i]\n";
8184 } # End of looping through all the lines.
8185
8186 return join "", @line;
8187}
8188
8189sub property_ref { # Returns a reference to a property object.
8190 return Property::property_ref(@_);
8191}
8192
8193sub force_unlink ($) {
8194 my $filename = shift;
8195 return unless file_exists($filename);
8196 return if CORE::unlink($filename);
8197
8198 # We might need write permission
8199 chmod 0777, $filename;
8200 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8201 return;
8202}
8203
9218f1cf 8204sub write ($$@) {
9abe8df8
KW
8205 # Given a filename and references to arrays of lines, write the lines of
8206 # each array to the file
99870f4d
KW
8207 # Filename can be given as an arrayref of directory names
8208
9218f1cf 8209 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
99870f4d 8210
9abe8df8 8211 my $file = shift;
9218f1cf 8212 my $use_utf8 = shift;
99870f4d
KW
8213
8214 # Get into a single string if an array, and get rid of, in Unix terms, any
8215 # leading '.'
8216 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8217 $file = File::Spec->canonpath($file);
8218
8219 # If has directories, make sure that they all exist
8220 (undef, my $directories, undef) = File::Spec->splitpath($file);
8221 File::Path::mkpath($directories) if $directories && ! -d $directories;
8222
8223 push @files_actually_output, $file;
8224
99870f4d
KW
8225 force_unlink ($file);
8226
8227 my $OUT;
8228 if (not open $OUT, ">", $file) {
8229 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8230 return;
8231 }
430ada4c 8232
9218f1cf
KW
8233 binmode $OUT, ":utf8" if $use_utf8;
8234
9abe8df8
KW
8235 while (defined (my $lines_ref = shift)) {
8236 unless (@$lines_ref) {
8237 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8238 }
8239
8240 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8241 }
430ada4c
NC
8242 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8243
99870f4d
KW
8244 print "$file written.\n" if $verbosity >= $VERBOSE;
8245
99870f4d
KW
8246 return;
8247}
8248
8249
8250sub Standardize($) {
8251 # This converts the input name string into a standardized equivalent to
8252 # use internally.
8253
8254 my $name = shift;
8255 unless (defined $name) {
8256 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8257 return;
8258 }
8259
8260 # Remove any leading or trailing white space
8261 $name =~ s/^\s+//g;
8262 $name =~ s/\s+$//g;
8263
98dc9551 8264 # Convert interior white space and hyphens into underscores.
99870f4d
KW
8265 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8266
8267 # Capitalize the letter following an underscore, and convert a sequence of
8268 # multiple underscores to a single one
8269 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8270
8271 # And capitalize the first letter, but not for the special cjk ones.
8272 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8273 return $name;
8274}
8275
8276sub standardize ($) {
8277 # Returns a lower-cased standardized name, without underscores. This form
8278 # is chosen so that it can distinguish between any real versus superficial
8279 # Unicode name differences. It relies on the fact that Unicode doesn't
8280 # have interior underscores, white space, nor dashes in any
8281 # stricter-matched name. It should not be used on Unicode code point
8282 # names (the Name property), as they mostly, but not always follow these
8283 # rules.
8284
8285 my $name = Standardize(shift);
8286 return if !defined $name;
8287
8288 $name =~ s/ (?<= .) _ (?= . ) //xg;
8289 return lc $name;
8290}
8291
c85f591a
KW
8292sub utf8_heavy_name ($$) {
8293 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8294 # perhaps this function should be placed somewhere, like Heavy.pl so that
8295 # utf8_heavy can use it directly without duplicating code that can get
8296 # out-of sync.
8297
8298 my $table = shift;
8299 my $alias = shift;
8300 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8301
8302 my $property = $table->property;
8303 $property = ($property == $perl)
8304 ? "" # 'perl' is never explicitly stated
8305 : standardize($property->name) . '=';
8306 if ($alias->loose_match) {
8307 return $property . standardize($alias->name);
8308 }
8309 else {
8310 return lc ($property . $alias->name);
8311 }
8312
8313 return;
8314}
8315
99870f4d
KW
8316{ # Closure
8317
7e3121cc 8318 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
99870f4d
KW
8319 my %already_output;
8320
8321 $main::simple_dumper_nesting = 0;
8322
8323 sub simple_dumper {
8324 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8325 # the real thing as we have to run under miniperl.
8326
8327 # It is designed so that on input it is at the beginning of a line,
8328 # and the final thing output in any call is a trailing ",\n".
8329
8330 my $item = shift;
8331 my $indent = shift;
7e3121cc 8332 $indent = "" if ! $debugging_build || ! defined $indent;
99870f4d
KW
8333
8334 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8335
8336 # nesting level is localized, so that as the call stack pops, it goes
8337 # back to the prior value.
8338 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8339 undef %already_output if $main::simple_dumper_nesting == 0;
8340 $main::simple_dumper_nesting++;
8341 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8342
8343 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8344
8345 # Determine the indent for recursive calls.
8346 my $next_indent = $indent . $indent_increment;
8347
8348 my $output;
8349 if (! ref $item) {
8350
8351 # Dump of scalar: just output it in quotes if not a number. To do
8352 # so we must escape certain characters, and therefore need to
8353 # operate on a copy to avoid changing the original
8354 my $copy = $item;
8355 $copy = $UNDEF unless defined $copy;
8356
02cc6656
KW
8357 # Quote non-integers (integers also have optional leading '-')
8358 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
99870f4d
KW
8359
8360 # Escape apostrophe and backslash
8361 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8362 $copy = "'$copy'";
8363 }
8364 $output = "$indent$copy,\n";
8365 }
8366 else {
8367
8368 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 8369 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 8370 if (defined $already_output{$addr}) {
99870f4d
KW
8371 return "${indent}ALREADY OUTPUT: $item\n";
8372 }
f998e60c 8373 $already_output{$addr} = $item;
99870f4d
KW
8374
8375 if (ref $item eq 'ARRAY') {
8376 my $using_brackets;
8377 $output = $indent;
8378 if ($main::simple_dumper_nesting > 1) {
8379 $output .= '[';
8380 $using_brackets = 1;
8381 }
8382 else {
8383 $using_brackets = 0;
8384 }
8385
8386 # If the array is empty, put the closing bracket on the same
8387 # line. Otherwise, recursively add each array element
8388 if (@$item == 0) {
8389 $output .= " ";
8390 }
8391 else {
8392 $output .= "\n";
8393 for (my $i = 0; $i < @$item; $i++) {
8394
8395 # Indent array elements one level
8396 $output .= &simple_dumper($item->[$i], $next_indent);
7e3121cc 8397 next if ! $debugging_build;
c12f2655
KW
8398 $output =~ s/\n$//; # Remove any trailing nl so
8399 $output .= " # [$i]\n"; # as to add a comment giving
8400 # the array index
99870f4d
KW
8401 }
8402 $output .= $indent; # Indent closing ']' to orig level
8403 }
8404 $output .= ']' if $using_brackets;
8405 $output .= ",\n";
8406 }
8407 elsif (ref $item eq 'HASH') {
8408 my $is_first_line;
8409 my $using_braces;
8410 my $body_indent;
8411
8412 # No surrounding braces at top level
8413 $output .= $indent;
8414 if ($main::simple_dumper_nesting > 1) {
8415 $output .= "{\n";
8416 $is_first_line = 0;
8417 $body_indent = $next_indent;
8418 $next_indent .= $indent_increment;
8419 $using_braces = 1;
8420 }
8421 else {
8422 $is_first_line = 1;
8423 $body_indent = $indent;
8424 $using_braces = 0;
8425 }
8426
8427 # Output hashes sorted alphabetically instead of apparently
8428 # random. Use caseless alphabetic sort
8429 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8430 {
8431 if ($is_first_line) {
8432 $is_first_line = 0;
8433 }
8434 else {
8435 $output .= "$body_indent";
8436 }
8437
8438 # The key must be a scalar, but this recursive call quotes
8439 # it
8440 $output .= &simple_dumper($key);
8441
8442 # And change the trailing comma and nl to the hash fat
8443 # comma for clarity, and so the value can be on the same
8444 # line
8445 $output =~ s/,\n$/ => /;
8446
8447 # Recursively call to get the value's dump.
8448 my $next = &simple_dumper($item->{$key}, $next_indent);
8449
8450 # If the value is all on one line, remove its indent, so
8451 # will follow the => immediately. If it takes more than
8452 # one line, start it on a new line.
8453 if ($next !~ /\n.*\n/) {
8454 $next =~ s/^ *//;
8455 }
8456 else {
8457 $output .= "\n";
8458 }
8459 $output .= $next;
8460 }
8461
8462 $output .= "$indent},\n" if $using_braces;
8463 }
8464 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8465 $output = $indent . ref($item) . "\n";
8466 # XXX see if blessed
8467 }
8468 elsif ($item->can('dump')) {
8469
8470 # By convention in this program, objects furnish a 'dump'
8471 # method. Since not doing any output at this level, just pass
8472 # on the input indent
8473 $output = $item->dump($indent);
8474 }
8475 else {
8476 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8477 }
8478 }
8479 return $output;
8480 }
8481}
8482
8483sub dump_inside_out {
8484 # Dump inside-out hashes in an object's state by converting them to a
8485 # regular hash and then calling simple_dumper on that.
8486
8487 my $object = shift;
8488 my $fields_ref = shift;
8489 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8490
ffe43484 8491 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
8492
8493 my %hash;
8494 foreach my $key (keys %$fields_ref) {
8495 $hash{$key} = $fields_ref->{$key}{$addr};
8496 }
8497
8498 return simple_dumper(\%hash, @_);
8499}
8500
8501sub _operator_dot {
8502 # Overloaded '.' method that is common to all packages. It uses the
8503 # package's stringify method.
8504
8505 my $self = shift;
8506 my $other = shift;
8507 my $reversed = shift;
8508 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8509
8510 $other = "" unless defined $other;
8511
8512 foreach my $which (\$self, \$other) {
8513 next unless ref $$which;
8514 if ($$which->can('_operator_stringify')) {
8515 $$which = $$which->_operator_stringify;
8516 }
8517 else {
8518 my $ref = ref $$which;
ffe43484 8519 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
8520 $$which = "$ref ($addr)";
8521 }
8522 }
8523 return ($reversed)
8524 ? "$other$self"
8525 : "$self$other";
8526}
8527
8528sub _operator_equal {
8529 # Generic overloaded '==' routine. To be equal, they must be the exact
8530 # same object
8531
8532 my $self = shift;
8533 my $other = shift;
8534
8535 return 0 unless defined $other;
8536 return 0 unless ref $other;
f998e60c 8537 no overloading;
2100aa98 8538 return $self == $other;
99870f4d
KW
8539}
8540
8541sub _operator_not_equal {
8542 my $self = shift;
8543 my $other = shift;
8544
8545 return ! _operator_equal($self, $other);
8546}
8547
8548sub process_PropertyAliases($) {
8549 # This reads in the PropertyAliases.txt file, which contains almost all
8550 # the character properties in Unicode and their equivalent aliases:
8551 # scf ; Simple_Case_Folding ; sfc
8552 #
8553 # Field 0 is the preferred short name for the property.
8554 # Field 1 is the full name.
8555 # Any succeeding ones are other accepted names.
8556
8557 my $file= shift;
8558 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8559
8560 # This whole file was non-existent in early releases, so use our own
8561 # internal one.
8562 $file->insert_lines(get_old_property_aliases())
8563 if ! -e 'PropertyAliases.txt';
8564
8565 # Add any cjk properties that may have been defined.
8566 $file->insert_lines(@cjk_properties);
8567
8568 while ($file->next_line) {
8569
8570 my @data = split /\s*;\s*/;
8571
8572 my $full = $data[1];
8573
8574 my $this = Property->new($data[0], Full_Name => $full);
8575
8576 # Start looking for more aliases after these two.
8577 for my $i (2 .. @data - 1) {
8578 $this->add_alias($data[$i]);
8579 }
8580
8581 }
8582 return;
8583}
8584
8585sub finish_property_setup {
8586 # Finishes setting up after PropertyAliases.
8587
8588 my $file = shift;
8589 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8590
8591 # This entry was missing from this file in earlier Unicode versions
8592 if (-e 'Jamo.txt') {
8593 my $jsn = property_ref('JSN');
8594 if (! defined $jsn) {
8595 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8596 }
8597 }
8598
5f7264c7 8599 # This entry is still missing as of 6.0, perhaps because no short name for
99870f4d
KW
8600 # it.
8601 if (-e 'NameAliases.txt') {
8602 my $aliases = property_ref('Name_Alias');
8603 if (! defined $aliases) {
8604 $aliases = Property->new('Name_Alias');
8605 }
8606 }
8607
8608 # These are used so much, that we set globals for them.
8609 $gc = property_ref('General_Category');
8610 $block = property_ref('Block');
359523e2 8611 $script = property_ref('Script');
99870f4d
KW
8612
8613 # Perl adds this alias.
8614 $gc->add_alias('Category');
8615
8616 # For backwards compatibility, these property files have particular names.
83b7c87d
KW
8617 property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
8618 # utf8.c calls it
8619 property_ref('Lowercase_Mapping')->set_file('Lower');
8620 property_ref('Titlecase_Mapping')->set_file('Title');
99870f4d
KW
8621
8622 my $fold = property_ref('Case_Folding');
8623 $fold->set_file('Fold') if defined $fold;
8624
d3cbe105
KW
8625 # Unicode::Normalize expects this file with this name and directory.
8626 my $ccc = property_ref('Canonical_Combining_Class');
8627 if (defined $ccc) {
8628 $ccc->set_file('CombiningClass');
8629 $ccc->set_directory(File::Spec->curdir());
8630 }
8631
2cd56239
KW
8632 # utf8.c has a different meaning for non range-size-1 for map properties
8633 # that this program doesn't currently handle; and even if it were changed
8634 # to do so, some other code may be using them expecting range size 1.
99870f4d
KW
8635 foreach my $property (qw {
8636 Case_Folding
8637 Lowercase_Mapping
8638 Titlecase_Mapping
8639 Uppercase_Mapping
8640 })
8641 {
8642 property_ref($property)->set_range_size_1(1);
8643 }
8644
8645 # These two properties aren't actually used in the core, but unfortunately
8646 # the names just above that are in the core interfere with these, so
8647 # choose different names. These aren't a problem unless the map tables
8648 # for these files get written out.
8649 my $lowercase = property_ref('Lowercase');
8650 $lowercase->set_file('IsLower') if defined $lowercase;
8651 my $uppercase = property_ref('Uppercase');
8652 $uppercase->set_file('IsUpper') if defined $uppercase;
8653
8654 # Set up the hard-coded default mappings, but only on properties defined
8655 # for this release
8656 foreach my $property (keys %default_mapping) {
8657 my $property_object = property_ref($property);
8658 next if ! defined $property_object;
8659 my $default_map = $default_mapping{$property};
8660 $property_object->set_default_map($default_map);
8661
8662 # A map of <code point> implies the property is string.
8663 if ($property_object->type == $UNKNOWN
8664 && $default_map eq $CODE_POINT)
8665 {
8666 $property_object->set_type($STRING);
8667 }
8668 }
8669
8670 # The following use the Multi_Default class to create objects for
8671 # defaults.
8672
8673 # Bidi class has a complicated default, but the derived file takes care of
8674 # the complications, leaving just 'L'.
8675 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8676 property_ref('Bidi_Class')->set_default_map('L');
8677 }
8678 else {
8679 my $default;
8680
8681 # The derived file was introduced in 3.1.1. The values below are
8682 # taken from table 3-8, TUS 3.0
8683 my $default_R =
8684 'my $default = Range_List->new;
8685 $default->add_range(0x0590, 0x05FF);
8686 $default->add_range(0xFB1D, 0xFB4F);'
8687 ;
8688
8689 # The defaults apply only to unassigned characters
a67f160a 8690 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
8691
8692 if ($v_version lt v3.0.0) {
8693 $default = Multi_Default->new(R => $default_R, 'L');
8694 }
8695 else {
8696
8697 # AL apparently not introduced until 3.0: TUS 2.x references are
8698 # not on-line to check it out
8699 my $default_AL =
8700 'my $default = Range_List->new;
8701 $default->add_range(0x0600, 0x07BF);
8702 $default->add_range(0xFB50, 0xFDFF);
8703 $default->add_range(0xFE70, 0xFEFF);'
8704 ;
8705
8706 # Non-character code points introduced in this release; aren't AL
8707 if ($v_version ge 3.1.0) {
8708 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8709 }
a67f160a 8710 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
8711 $default = Multi_Default->new(AL => $default_AL,
8712 R => $default_R,
8713 'L');
8714 }
8715 property_ref('Bidi_Class')->set_default_map($default);
8716 }
8717
8718 # Joining type has a complicated default, but the derived file takes care
8719 # of the complications, leaving just 'U' (or Non_Joining), except the file
8720 # is bad in 3.1.0
8721 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8722 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8723 property_ref('Joining_Type')->set_default_map('Non_Joining');
8724 }
8725 else {
8726
8727 # Otherwise, there are not one, but two possibilities for the
8728 # missing defaults: T and U.
8729 # The missing defaults that evaluate to T are given by:
8730 # T = Mn + Cf - ZWNJ - ZWJ
8731 # where Mn and Cf are the general category values. In other words,
8732 # any non-spacing mark or any format control character, except
8733 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8734 # WIDTH JOINER (joining type C).
8735 my $default = Multi_Default->new(
8736 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8737 'Non_Joining');
8738 property_ref('Joining_Type')->set_default_map($default);
8739 }
8740 }
8741
8742 # Line break has a complicated default in early releases. It is 'Unknown'
8743 # for non-assigned code points; 'AL' for assigned.
8744 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8745 my $lb = property_ref('Line_Break');
8746 if ($v_version gt 3.2.0) {
8747 $lb->set_default_map('Unknown');
8748 }
8749 else {
8750 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8751 'AL');
8752 $lb->set_default_map($default);
8753 }
8754
8755 # If has the URS property, make sure that the standard aliases are in
8756 # it, since not in the input tables in some versions.
8757 my $urs = property_ref('Unicode_Radical_Stroke');
8758 if (defined $urs) {
8759 $urs->add_alias('cjkRSUnicode');
8760 $urs->add_alias('kRSUnicode');
8761 }
8762 }
8763 return;
8764}
8765
8766sub get_old_property_aliases() {
8767 # Returns what would be in PropertyAliases.txt if it existed in very old
8768 # versions of Unicode. It was derived from the one in 3.2, and pared
8769 # down based on the data that was actually in the older releases.
8770 # An attempt was made to use the existence of files to mean inclusion or
8771 # not of various aliases, but if this was not sufficient, using version
8772 # numbers was resorted to.
8773
8774 my @return;
8775
8776 # These are to be used in all versions (though some are constructed by
8777 # this program if missing)
8778 push @return, split /\n/, <<'END';
8779bc ; Bidi_Class
8780Bidi_M ; Bidi_Mirrored
8781cf ; Case_Folding
8782ccc ; Canonical_Combining_Class
8783dm ; Decomposition_Mapping
8784dt ; Decomposition_Type
8785gc ; General_Category
8786isc ; ISO_Comment
8787lc ; Lowercase_Mapping
8788na ; Name
8789na1 ; Unicode_1_Name
8790nt ; Numeric_Type
8791nv ; Numeric_Value
8792sfc ; Simple_Case_Folding
8793slc ; Simple_Lowercase_Mapping
8794stc ; Simple_Titlecase_Mapping
8795suc ; Simple_Uppercase_Mapping
8796tc ; Titlecase_Mapping
8797uc ; Uppercase_Mapping
8798END
8799
8800 if (-e 'Blocks.txt') {
8801 push @return, "blk ; Block\n";
8802 }
8803 if (-e 'ArabicShaping.txt') {
8804 push @return, split /\n/, <<'END';
8805jg ; Joining_Group
8806jt ; Joining_Type
8807END
8808 }
8809 if (-e 'PropList.txt') {
8810
8811 # This first set is in the original old-style proplist.
8812 push @return, split /\n/, <<'END';
8813Alpha ; Alphabetic
8814Bidi_C ; Bidi_Control
8815Dash ; Dash
8816Dia ; Diacritic
8817Ext ; Extender
8818Hex ; Hex_Digit
8819Hyphen ; Hyphen
8820IDC ; ID_Continue
8821Ideo ; Ideographic
8822Join_C ; Join_Control
8823Math ; Math
8824QMark ; Quotation_Mark
8825Term ; Terminal_Punctuation
8826WSpace ; White_Space
8827END
8828 # The next sets were added later
8829 if ($v_version ge v3.0.0) {
8830 push @return, split /\n/, <<'END';
8831Upper ; Uppercase
8832Lower ; Lowercase
8833END
8834 }
8835 if ($v_version ge v3.0.1) {
8836 push @return, split /\n/, <<'END';
8837NChar ; Noncharacter_Code_Point
8838END
8839 }
8840 # The next sets were added in the new-style
8841 if ($v_version ge v3.1.0) {
8842 push @return, split /\n/, <<'END';
8843OAlpha ; Other_Alphabetic
8844OLower ; Other_Lowercase
8845OMath ; Other_Math
8846OUpper ; Other_Uppercase
8847END
8848 }
8849 if ($v_version ge v3.1.1) {
8850 push @return, "AHex ; ASCII_Hex_Digit\n";
8851 }
8852 }
8853 if (-e 'EastAsianWidth.txt') {
8854 push @return, "ea ; East_Asian_Width\n";
8855 }
8856 if (-e 'CompositionExclusions.txt') {
8857 push @return, "CE ; Composition_Exclusion\n";
8858 }
8859 if (-e 'LineBreak.txt') {
8860 push @return, "lb ; Line_Break\n";
8861 }
8862 if (-e 'BidiMirroring.txt') {
8863 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8864 }
8865 if (-e 'Scripts.txt') {
8866 push @return, "sc ; Script\n";
8867 }
8868 if (-e 'DNormalizationProps.txt') {
8869 push @return, split /\n/, <<'END';
8870Comp_Ex ; Full_Composition_Exclusion
8871FC_NFKC ; FC_NFKC_Closure
8872NFC_QC ; NFC_Quick_Check
8873NFD_QC ; NFD_Quick_Check
8874NFKC_QC ; NFKC_Quick_Check
8875NFKD_QC ; NFKD_Quick_Check
8876XO_NFC ; Expands_On_NFC
8877XO_NFD ; Expands_On_NFD
8878XO_NFKC ; Expands_On_NFKC
8879XO_NFKD ; Expands_On_NFKD
8880END
8881 }
8882 if (-e 'DCoreProperties.txt') {
8883 push @return, split /\n/, <<'END';
8884IDS ; ID_Start
8885XIDC ; XID_Continue
8886XIDS ; XID_Start
8887END
8888 # These can also appear in some versions of PropList.txt
8889 push @return, "Lower ; Lowercase\n"
8890 unless grep { $_ =~ /^Lower\b/} @return;
8891 push @return, "Upper ; Uppercase\n"
8892 unless grep { $_ =~ /^Upper\b/} @return;
8893 }
8894
8895 # This flag requires the DAge.txt file to be copied into the directory.
8896 if (DEBUG && $compare_versions) {
8897 push @return, 'age ; Age';
8898 }
8899
8900 return @return;
8901}
8902
8903sub process_PropValueAliases {
8904 # This file contains values that properties look like:
8905 # bc ; AL ; Arabic_Letter
8906 # blk; n/a ; Greek_And_Coptic ; Greek
8907 #
8908 # Field 0 is the property.
8909 # Field 1 is the short name of a property value or 'n/a' if no
8910 # short name exists;
8911 # Field 2 is the full property value name;
8912 # Any other fields are more synonyms for the property value.
8913 # Purely numeric property values are omitted from the file; as are some
8914 # others, fewer and fewer in later releases
8915
8916 # Entries for the ccc property have an extra field before the
8917 # abbreviation:
8918 # ccc; 0; NR ; Not_Reordered
8919 # It is the numeric value that the names are synonyms for.
8920
8921 # There are comment entries for values missing from this file:
8922 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8923 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8924
8925 my $file= shift;
8926 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8927
8928 # This whole file was non-existent in early releases, so use our own
8929 # internal one if necessary.
8930 if (! -e 'PropValueAliases.txt') {
8931 $file->insert_lines(get_old_property_value_aliases());
8932 }
8933
8934 # Add any explicit cjk values
8935 $file->insert_lines(@cjk_property_values);
8936
8937 # This line is used only for testing the code that checks for name
8938 # conflicts. There is a script Inherited, and when this line is executed
8939 # it causes there to be a name conflict with the 'Inherited' that this
8940 # program generates for this block property value
8941 #$file->insert_lines('blk; n/a; Herited');
8942
8943
8944 # Process each line of the file ...
8945 while ($file->next_line) {
8946
8947 my ($property, @data) = split /\s*;\s*/;
8948
66b4eb0a
KW
8949 # The ccc property has an extra field at the beginning, which is the
8950 # numeric value. Move it to be after the other two, mnemonic, fields,
8951 # so that those will be used as the property value's names, and the
8952 # number will be an extra alias. (Rightmost splice removes field 1-2,
8953 # returning them in a slice; left splice inserts that before anything,
8954 # thus shifting the former field 0 to after them.)
8955 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8956
8957 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
8958 # there is no short name, use the full one in element 1
027866c1
KW
8959 if ($data[0] eq "n/a") {
8960 $data[0] = $data[1];
8961 }
8962 elsif ($data[0] ne $data[1]
8963 && standardize($data[0]) eq standardize($data[1])
8964 && $data[1] !~ /[[:upper:]]/)
8965 {
8966 # Also, there is a bug in the file in which "n/a" is omitted, and
8967 # the two fields are identical except for case, and the full name
8968 # is all lower case. Copy the "short" name unto the full one to
8969 # give it some upper case.
8970
8971 $data[1] = $data[0];
8972 }
99870f4d
KW
8973
8974 # Earlier releases had the pseudo property 'qc' that should expand to
8975 # the ones that replace it below.
8976 if ($property eq 'qc') {
8977 if (lc $data[0] eq 'y') {
8978 $file->insert_lines('NFC_QC; Y ; Yes',
8979 'NFD_QC; Y ; Yes',
8980 'NFKC_QC; Y ; Yes',
8981 'NFKD_QC; Y ; Yes',
8982 );
8983 }
8984 elsif (lc $data[0] eq 'n') {
8985 $file->insert_lines('NFC_QC; N ; No',
8986 'NFD_QC; N ; No',
8987 'NFKC_QC; N ; No',
8988 'NFKD_QC; N ; No',
8989 );
8990 }
8991 elsif (lc $data[0] eq 'm') {
8992 $file->insert_lines('NFC_QC; M ; Maybe',
8993 'NFKC_QC; M ; Maybe',
8994 );
8995 }
8996 else {
8997 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8998 }
8999 next;
9000 }
9001
9002 # The first field is the short name, 2nd is the full one.
9003 my $property_object = property_ref($property);
9004 my $table = $property_object->add_match_table($data[0],
9005 Full_Name => $data[1]);
9006
9007 # Start looking for more aliases after these two.
9008 for my $i (2 .. @data - 1) {
9009 $table->add_alias($data[$i]);
9010 }
9011 } # End of looping through the file
9012
9013 # As noted in the comments early in the program, it generates tables for
9014 # the default values for all releases, even those for which the concept
9015 # didn't exist at the time. Here we add those if missing.
9016 my $age = property_ref('age');
9017 if (defined $age && ! defined $age->table('Unassigned')) {
9018 $age->add_match_table('Unassigned');
9019 }
9020 $block->add_match_table('No_Block') if -e 'Blocks.txt'
9021 && ! defined $block->table('No_Block');
9022
9023
9024 # Now set the default mappings of the properties from the file. This is
9025 # done after the loop because a number of properties have only @missings
9026 # entries in the file, and may not show up until the end.
9027 my @defaults = $file->get_missings;
9028 foreach my $default_ref (@defaults) {
9029 my $default = $default_ref->[0];
9030 my $property = property_ref($default_ref->[1]);
9031 $property->set_default_map($default);
9032 }
9033 return;
9034}
9035
9036sub get_old_property_value_aliases () {
9037 # Returns what would be in PropValueAliases.txt if it existed in very old
9038 # versions of Unicode. It was derived from the one in 3.2, and pared
9039 # down. An attempt was made to use the existence of files to mean
9040 # inclusion or not of various aliases, but if this was not sufficient,
9041 # using version numbers was resorted to.
9042
9043 my @return = split /\n/, <<'END';
9044bc ; AN ; Arabic_Number
9045bc ; B ; Paragraph_Separator
9046bc ; CS ; Common_Separator
9047bc ; EN ; European_Number
9048bc ; ES ; European_Separator
9049bc ; ET ; European_Terminator
9050bc ; L ; Left_To_Right
9051bc ; ON ; Other_Neutral
9052bc ; R ; Right_To_Left
9053bc ; WS ; White_Space
9054
9055# The standard combining classes are very much different in v1, so only use
9056# ones that look right (not checked thoroughly)
9057ccc; 0; NR ; Not_Reordered
9058ccc; 1; OV ; Overlay
9059ccc; 7; NK ; Nukta
9060ccc; 8; KV ; Kana_Voicing
9061ccc; 9; VR ; Virama
9062ccc; 202; ATBL ; Attached_Below_Left
9063ccc; 216; ATAR ; Attached_Above_Right
9064ccc; 218; BL ; Below_Left
9065ccc; 220; B ; Below
9066ccc; 222; BR ; Below_Right
9067ccc; 224; L ; Left
9068ccc; 228; AL ; Above_Left
9069ccc; 230; A ; Above
9070ccc; 232; AR ; Above_Right
9071ccc; 234; DA ; Double_Above
9072
9073dt ; can ; canonical
9074dt ; enc ; circle
9075dt ; fin ; final
9076dt ; font ; font
9077dt ; fra ; fraction
9078dt ; init ; initial
9079dt ; iso ; isolated
9080dt ; med ; medial
9081dt ; n/a ; none
9082dt ; nb ; noBreak
9083dt ; sqr ; square
9084dt ; sub ; sub
9085dt ; sup ; super
9086
9087gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9088gc ; Cc ; Control
9089gc ; Cn ; Unassigned
9090gc ; Co ; Private_Use
9091gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9092gc ; LC ; Cased_Letter # Ll | Lt | Lu
9093gc ; Ll ; Lowercase_Letter
9094gc ; Lm ; Modifier_Letter
9095gc ; Lo ; Other_Letter
9096gc ; Lu ; Uppercase_Letter
9097gc ; M ; Mark # Mc | Me | Mn
9098gc ; Mc ; Spacing_Mark
9099gc ; Mn ; Nonspacing_Mark
9100gc ; N ; Number # Nd | Nl | No
9101gc ; Nd ; Decimal_Number
9102gc ; No ; Other_Number
9103gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9104gc ; Pd ; Dash_Punctuation
9105gc ; Pe ; Close_Punctuation
9106gc ; Po ; Other_Punctuation
9107gc ; Ps ; Open_Punctuation
9108gc ; S ; Symbol # Sc | Sk | Sm | So
9109gc ; Sc ; Currency_Symbol
9110gc ; Sm ; Math_Symbol
9111gc ; So ; Other_Symbol
9112gc ; Z ; Separator # Zl | Zp | Zs
9113gc ; Zl ; Line_Separator
9114gc ; Zp ; Paragraph_Separator
9115gc ; Zs ; Space_Separator
9116
9117nt ; de ; Decimal
9118nt ; di ; Digit
9119nt ; n/a ; None
9120nt ; nu ; Numeric
9121END
9122
9123 if (-e 'ArabicShaping.txt') {
9124 push @return, split /\n/, <<'END';
9125jg ; n/a ; AIN
9126jg ; n/a ; ALEF
9127jg ; n/a ; DAL
9128jg ; n/a ; GAF
9129jg ; n/a ; LAM
9130jg ; n/a ; MEEM
9131jg ; n/a ; NO_JOINING_GROUP
9132jg ; n/a ; NOON
9133jg ; n/a ; QAF
9134jg ; n/a ; SAD
9135jg ; n/a ; SEEN
9136jg ; n/a ; TAH
9137jg ; n/a ; WAW
9138
9139jt ; C ; Join_Causing
9140jt ; D ; Dual_Joining
9141jt ; L ; Left_Joining
9142jt ; R ; Right_Joining
9143jt ; U ; Non_Joining
9144jt ; T ; Transparent
9145END
9146 if ($v_version ge v3.0.0) {
9147 push @return, split /\n/, <<'END';
9148jg ; n/a ; ALAPH
9149jg ; n/a ; BEH
9150jg ; n/a ; BETH
9151jg ; n/a ; DALATH_RISH
9152jg ; n/a ; E
9153jg ; n/a ; FEH
9154jg ; n/a ; FINAL_SEMKATH
9155jg ; n/a ; GAMAL
9156jg ; n/a ; HAH
9157jg ; n/a ; HAMZA_ON_HEH_GOAL
9158jg ; n/a ; HE
9159jg ; n/a ; HEH
9160jg ; n/a ; HEH_GOAL
9161jg ; n/a ; HETH
9162jg ; n/a ; KAF
9163jg ; n/a ; KAPH
9164jg ; n/a ; KNOTTED_HEH
9165jg ; n/a ; LAMADH
9166jg ; n/a ; MIM
9167jg ; n/a ; NUN
9168jg ; n/a ; PE
9169jg ; n/a ; QAPH
9170jg ; n/a ; REH
9171jg ; n/a ; REVERSED_PE
9172jg ; n/a ; SADHE
9173jg ; n/a ; SEMKATH
9174jg ; n/a ; SHIN
9175jg ; n/a ; SWASH_KAF
9176jg ; n/a ; TAW
9177jg ; n/a ; TEH_MARBUTA
9178jg ; n/a ; TETH
9179jg ; n/a ; YEH
9180jg ; n/a ; YEH_BARREE
9181jg ; n/a ; YEH_WITH_TAIL
9182jg ; n/a ; YUDH
9183jg ; n/a ; YUDH_HE
9184jg ; n/a ; ZAIN
9185END
9186 }
9187 }
9188
9189
9190 if (-e 'EastAsianWidth.txt') {
9191 push @return, split /\n/, <<'END';
9192ea ; A ; Ambiguous
9193ea ; F ; Fullwidth
9194ea ; H ; Halfwidth
9195ea ; N ; Neutral
9196ea ; Na ; Narrow
9197ea ; W ; Wide
9198END
9199 }
9200
9201 if (-e 'LineBreak.txt') {
9202 push @return, split /\n/, <<'END';
9203lb ; AI ; Ambiguous
9204lb ; AL ; Alphabetic
9205lb ; B2 ; Break_Both
9206lb ; BA ; Break_After
9207lb ; BB ; Break_Before
9208lb ; BK ; Mandatory_Break
9209lb ; CB ; Contingent_Break
9210lb ; CL ; Close_Punctuation
9211lb ; CM ; Combining_Mark
9212lb ; CR ; Carriage_Return
9213lb ; EX ; Exclamation
9214lb ; GL ; Glue
9215lb ; HY ; Hyphen
9216lb ; ID ; Ideographic
9217lb ; IN ; Inseperable
9218lb ; IS ; Infix_Numeric
9219lb ; LF ; Line_Feed
9220lb ; NS ; Nonstarter
9221lb ; NU ; Numeric
9222lb ; OP ; Open_Punctuation
9223lb ; PO ; Postfix_Numeric
9224lb ; PR ; Prefix_Numeric
9225lb ; QU ; Quotation
9226lb ; SA ; Complex_Context
9227lb ; SG ; Surrogate
9228lb ; SP ; Space
9229lb ; SY ; Break_Symbols
9230lb ; XX ; Unknown
9231lb ; ZW ; ZWSpace
9232END
9233 }
9234
9235 if (-e 'DNormalizationProps.txt') {
9236 push @return, split /\n/, <<'END';
9237qc ; M ; Maybe
9238qc ; N ; No
9239qc ; Y ; Yes
9240END
9241 }
9242
9243 if (-e 'Scripts.txt') {
9244 push @return, split /\n/, <<'END';
9245sc ; Arab ; Arabic
9246sc ; Armn ; Armenian
9247sc ; Beng ; Bengali
9248sc ; Bopo ; Bopomofo
9249sc ; Cans ; Canadian_Aboriginal
9250sc ; Cher ; Cherokee
9251sc ; Cyrl ; Cyrillic
9252sc ; Deva ; Devanagari
9253sc ; Dsrt ; Deseret
9254sc ; Ethi ; Ethiopic
9255sc ; Geor ; Georgian
9256sc ; Goth ; Gothic
9257sc ; Grek ; Greek
9258sc ; Gujr ; Gujarati
9259sc ; Guru ; Gurmukhi
9260sc ; Hang ; Hangul
9261sc ; Hani ; Han
9262sc ; Hebr ; Hebrew
9263sc ; Hira ; Hiragana
9264sc ; Ital ; Old_Italic
9265sc ; Kana ; Katakana
9266sc ; Khmr ; Khmer
9267sc ; Knda ; Kannada
9268sc ; Laoo ; Lao
9269sc ; Latn ; Latin
9270sc ; Mlym ; Malayalam
9271sc ; Mong ; Mongolian
9272sc ; Mymr ; Myanmar
9273sc ; Ogam ; Ogham
9274sc ; Orya ; Oriya
9275sc ; Qaai ; Inherited
9276sc ; Runr ; Runic
9277sc ; Sinh ; Sinhala
9278sc ; Syrc ; Syriac
9279sc ; Taml ; Tamil
9280sc ; Telu ; Telugu
9281sc ; Thaa ; Thaana
9282sc ; Thai ; Thai
9283sc ; Tibt ; Tibetan
9284sc ; Yiii ; Yi
9285sc ; Zyyy ; Common
9286END
9287 }
9288
9289 if ($v_version ge v2.0.0) {
9290 push @return, split /\n/, <<'END';
9291dt ; com ; compat
9292dt ; nar ; narrow
9293dt ; sml ; small
9294dt ; vert ; vertical
9295dt ; wide ; wide
9296
9297gc ; Cf ; Format
9298gc ; Cs ; Surrogate
9299gc ; Lt ; Titlecase_Letter
9300gc ; Me ; Enclosing_Mark
9301gc ; Nl ; Letter_Number
9302gc ; Pc ; Connector_Punctuation
9303gc ; Sk ; Modifier_Symbol
9304END
9305 }
9306 if ($v_version ge v2.1.2) {
9307 push @return, "bc ; S ; Segment_Separator\n";
9308 }
9309 if ($v_version ge v2.1.5) {
9310 push @return, split /\n/, <<'END';
9311gc ; Pf ; Final_Punctuation
9312gc ; Pi ; Initial_Punctuation
9313END
9314 }
9315 if ($v_version ge v2.1.8) {
9316 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9317 }
9318
9319 if ($v_version ge v3.0.0) {
9320 push @return, split /\n/, <<'END';
9321bc ; AL ; Arabic_Letter
9322bc ; BN ; Boundary_Neutral
9323bc ; LRE ; Left_To_Right_Embedding
9324bc ; LRO ; Left_To_Right_Override
9325bc ; NSM ; Nonspacing_Mark
9326bc ; PDF ; Pop_Directional_Format
9327bc ; RLE ; Right_To_Left_Embedding
9328bc ; RLO ; Right_To_Left_Override
9329
9330ccc; 233; DB ; Double_Below
9331END
9332 }
9333
9334 if ($v_version ge v3.1.0) {
9335 push @return, "ccc; 226; R ; Right\n";
9336 }
9337
9338 return @return;
9339}
9340
b1c167a3
KW
9341sub output_perl_charnames_line ($$) {
9342
9343 # Output the entries in Perl_charnames specially, using 5 digits instead
9344 # of four. This makes the entries a constant length, and simplifies
9345 # charnames.pm which this table is for. Unicode can have 6 digit
9346 # ordinals, but they are all private use or noncharacters which do not
9347 # have names, so won't be in this table.
9348
73d9566f 9349 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
9350}
9351
99870f4d
KW
9352{ # Closure
9353 # This is used to store the range list of all the code points usable when
9354 # the little used $compare_versions feature is enabled.
9355 my $compare_versions_range_list;
9356
96cfc54a
KW
9357 # These are constants to the $property_info hash in this subroutine, to
9358 # avoid using a quoted-string which might have a typo.
9359 my $TYPE = 'type';
9360 my $DEFAULT_MAP = 'default_map';
9361 my $DEFAULT_TABLE = 'default_table';
9362 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9363 my $MISSINGS = 'missings';
9364
99870f4d
KW
9365 sub process_generic_property_file {
9366 # This processes a file containing property mappings and puts them
9367 # into internal map tables. It should be used to handle any property
9368 # files that have mappings from a code point or range thereof to
9369 # something else. This means almost all the UCD .txt files.
9370 # each_line_handlers() should be set to adjust the lines of these
9371 # files, if necessary, to what this routine understands:
9372 #
9373 # 0374 ; NFD_QC; N
9374 # 003C..003E ; Math
9375 #
92f9d56c 9376 # the fields are: "codepoint-range ; property; map"
99870f4d
KW
9377 #
9378 # meaning the codepoints in the range all have the value 'map' under
9379 # 'property'.
98dc9551 9380 # Beginning and trailing white space in each field are not significant.
99870f4d
KW
9381 # Note there is not a trailing semi-colon in the above. A trailing
9382 # semi-colon means the map is a null-string. An omitted map, as
9383 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9384 # table syntax. (This could have been hidden from this routine by
9385 # doing it in the $file object, but that would require parsing of the
9386 # line there, so would have to parse it twice, or change the interface
9387 # to pass this an array. So not done.)
9388 #
9389 # The map field may begin with a sequence of commands that apply to
9390 # this range. Each such command begins and ends with $CMD_DELIM.
9391 # These are used to indicate, for example, that the mapping for a
9392 # range has a non-default type.
9393 #
9394 # This loops through the file, calling it's next_line() method, and
9395 # then taking the map and adding it to the property's table.
9396 # Complications arise because any number of properties can be in the
9397 # file, in any order, interspersed in any way. The first time a
9398 # property is seen, it gets information about that property and
f86864ac 9399 # caches it for quick retrieval later. It also normalizes the maps
5d7f7709
KW
9400 # so that only one of many synonyms is stored. The Unicode input
9401 # files do use some multiple synonyms.
99870f4d
KW
9402
9403 my $file = shift;
9404 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9405
9406 my %property_info; # To keep track of what properties
9407 # have already had entries in the
9408 # current file, and info about each,
9409 # so don't have to recompute.
9410 my $property_name; # property currently being worked on
9411 my $property_type; # and its type
9412 my $previous_property_name = ""; # name from last time through loop
9413 my $property_object; # pointer to the current property's
9414 # object
9415 my $property_addr; # the address of that object
9416 my $default_map; # the string that code points missing
9417 # from the file map to
9418 my $default_table; # For non-string properties, a
9419 # reference to the match table that
9420 # will contain the list of code
9421 # points that map to $default_map.
9422
9423 # Get the next real non-comment line
9424 LINE:
9425 while ($file->next_line) {
9426
9427 # Default replacement type; means that if parts of the range have
9428 # already been stored in our tables, the new map overrides them if
9429 # they differ more than cosmetically
9430 my $replace = $IF_NOT_EQUIVALENT;
9431 my $map_type; # Default type for the map of this range
9432
9433 #local $to_trace = 1 if main::DEBUG;
9434 trace $_ if main::DEBUG && $to_trace;
9435
9436 # Split the line into components
9437 my ($range, $property_name, $map, @remainder)
9438 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9439
9440 # If more or less on the line than we are expecting, warn and skip
9441 # the line
9442 if (@remainder) {
9443 $file->carp_bad_line('Extra fields');
9444 next LINE;
9445 }
9446 elsif ( ! defined $property_name) {
9447 $file->carp_bad_line('Missing property');
9448 next LINE;
9449 }
9450
9451 # Examine the range.
9452 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9453 {
9454 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9455 next LINE;
9456 }
9457 my $low = hex $1;
9458 my $high = (defined $2) ? hex $2 : $low;
9459
9460 # For the very specialized case of comparing two Unicode
9461 # versions...
9462 if (DEBUG && $compare_versions) {
9463 if ($property_name eq 'Age') {
9464
9465 # Only allow code points at least as old as the version
9466 # specified.
9467 my $age = pack "C*", split(/\./, $map); # v string
9468 next LINE if $age gt $compare_versions;
9469 }
9470 else {
9471
9472 # Again, we throw out code points younger than those of
9473 # the specified version. By now, the Age property is
9474 # populated. We use the intersection of each input range
9475 # with this property to find what code points in it are
9476 # valid. To do the intersection, we have to convert the
9477 # Age property map to a Range_list. We only have to do
9478 # this once.
9479 if (! defined $compare_versions_range_list) {
9480 my $age = property_ref('Age');
9481 if (! -e 'DAge.txt') {
9482 croak "Need to have 'DAge.txt' file to do version comparison";
9483 }
9484 elsif ($age->count == 0) {
9485 croak "The 'Age' table is empty, but its file exists";
9486 }
9487 $compare_versions_range_list
9488 = Range_List->new(Initialize => $age);
9489 }
9490
9491 # An undefined map is always 'Y'
9492 $map = 'Y' if ! defined $map;
9493
9494 # Calculate the intersection of the input range with the
9495 # code points that are known in the specified version
9496 my @ranges = ($compare_versions_range_list
9497 & Range->new($low, $high))->ranges;
9498
9499 # If the intersection is empty, throw away this range
9500 next LINE unless @ranges;
9501
9502 # Only examine the first range this time through the loop.
9503 my $this_range = shift @ranges;
9504
9505 # Put any remaining ranges in the queue to be processed
9506 # later. Note that there is unnecessary work here, as we
9507 # will do the intersection again for each of these ranges
9508 # during some future iteration of the LINE loop, but this
9509 # code is not used in production. The later intersections
9510 # are guaranteed to not splinter, so this will not become
9511 # an infinite loop.
9512 my $line = join ';', $property_name, $map;
9513 foreach my $range (@ranges) {
9514 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9515 $range->start,
9516 $range->end,
9517 $line));
9518 }
9519
9520 # And process the first range, like any other.
9521 $low = $this_range->start;
9522 $high = $this_range->end;
9523 }
9524 } # End of $compare_versions
9525
9526 # If changing to a new property, get the things constant per
9527 # property
9528 if ($previous_property_name ne $property_name) {
9529
9530 $property_object = property_ref($property_name);
9531 if (! defined $property_object) {
9532 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9533 next LINE;
9534 }
051df77b 9535 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
9536
9537 # Defer changing names until have a line that is acceptable
9538 # (the 'next' statement above means is unacceptable)
9539 $previous_property_name = $property_name;
9540
9541 # If not the first time for this property, retrieve info about
9542 # it from the cache
96cfc54a
KW
9543 if (defined ($property_info{$property_addr}{$TYPE})) {
9544 $property_type = $property_info{$property_addr}{$TYPE};
9545 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
99870f4d 9546 $map_type
96cfc54a 9547 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
99870f4d 9548 $default_table
96cfc54a 9549 = $property_info{$property_addr}{$DEFAULT_TABLE};
99870f4d
KW
9550 }
9551 else {
9552
9553 # Here, is the first time for this property. Set up the
9554 # cache.
96cfc54a 9555 $property_type = $property_info{$property_addr}{$TYPE}
99870f4d
KW
9556 = $property_object->type;
9557 $map_type
96cfc54a 9558 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
99870f4d
KW
9559 = $property_object->pseudo_map_type;
9560
9561 # The Unicode files are set up so that if the map is not
9562 # defined, it is a binary property
9563 if (! defined $map && $property_type != $BINARY) {
9564 if ($property_type != $UNKNOWN
9565 && $property_type != $NON_STRING)
9566 {
9567 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9568 }
9569 else {
9570 $property_object->set_type($BINARY);
9571 $property_type
96cfc54a 9572 = $property_info{$property_addr}{$TYPE}
99870f4d
KW
9573 = $BINARY;
9574 }
9575 }
9576
9577 # Get any @missings default for this property. This
9578 # should precede the first entry for the property in the
9579 # input file, and is located in a comment that has been
9580 # stored by the Input_file class until we access it here.
9581 # It's possible that there is more than one such line
9582 # waiting for us; collect them all, and parse
9583 my @missings_list = $file->get_missings
9584 if $file->has_missings_defaults;
9585 foreach my $default_ref (@missings_list) {
9586 my $default = $default_ref->[0];
ffe43484 9587 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
9588
9589 # For string properties, the default is just what the
9590 # file says, but non-string properties should already
9591 # have set up a table for the default property value;
9592 # use the table for these, so can resolve synonyms
9593 # later to a single standard one.
9594 if ($property_type == $STRING
9595 || $property_type == $UNKNOWN)
9596 {
96cfc54a 9597 $property_info{$addr}{$MISSINGS} = $default;
99870f4d
KW
9598 }
9599 else {
96cfc54a 9600 $property_info{$addr}{$MISSINGS}
99870f4d
KW
9601 = $property_object->table($default);
9602 }
9603 }
9604
9605 # Finished storing all the @missings defaults in the input
9606 # file so far. Get the one for the current property.
96cfc54a 9607 my $missings = $property_info{$property_addr}{$MISSINGS};
99870f4d
KW
9608
9609 # But we likely have separately stored what the default
9610 # should be. (This is to accommodate versions of the
9611 # standard where the @missings lines are absent or
9612 # incomplete.) Hopefully the two will match. But check
9613 # it out.
9614 $default_map = $property_object->default_map;
9615
9616 # If the map is a ref, it means that the default won't be
9617 # processed until later, so undef it, so next few lines
9618 # will redefine it to something that nothing will match
9619 undef $default_map if ref $default_map;
9620
9621 # Create a $default_map if don't have one; maybe a dummy
9622 # that won't match anything.
9623 if (! defined $default_map) {
9624
9625 # Use any @missings line in the file.
9626 if (defined $missings) {
9627 if (ref $missings) {
9628 $default_map = $missings->full_name;
9629 $default_table = $missings;
9630 }
9631 else {
9632 $default_map = $missings;
9633 }
678f13d5 9634
99870f4d
KW
9635 # And store it with the property for outside use.
9636 $property_object->set_default_map($default_map);
9637 }
9638 else {
9639
9640 # Neither an @missings nor a default map. Create
9641 # a dummy one, so won't have to test definedness
9642 # in the main loop.
9643 $default_map = '_Perl This will never be in a file
9644 from Unicode';
9645 }
9646 }
9647
9648 # Here, we have $default_map defined, possibly in terms of
9649 # $missings, but maybe not, and possibly is a dummy one.
9650 if (defined $missings) {
9651
9652 # Make sure there is no conflict between the two.
9653 # $missings has priority.
9654 if (ref $missings) {
23e33b60
KW
9655 $default_table
9656 = $property_object->table($default_map);
99870f4d
KW
9657 if (! defined $default_table
9658 || $default_table != $missings)
9659 {
9660 if (! defined $default_table) {
9661 $default_table = $UNDEF;
9662 }
9663 $file->carp_bad_line(<<END
9664The \@missings line for $property_name in $file says that missings default to
9665$missings, but we expect it to be $default_table. $missings used.
9666END
9667 );
9668 $default_table = $missings;
9669 $default_map = $missings->full_name;
9670 }
96cfc54a 9671 $property_info{$property_addr}{$DEFAULT_TABLE}
99870f4d
KW
9672 = $default_table;
9673 }
9674 elsif ($default_map ne $missings) {
9675 $file->carp_bad_line(<<END
9676The \@missings line for $property_name in $file says that missings default to
9677$missings, but we expect it to be $default_map. $missings used.
9678END
9679 );
9680 $default_map = $missings;
9681 }
9682 }
9683
96cfc54a 9684 $property_info{$property_addr}{$DEFAULT_MAP}
99870f4d
KW
9685 = $default_map;
9686
9687 # If haven't done so already, find the table corresponding
9688 # to this map for non-string properties.
9689 if (! defined $default_table
9690 && $property_type != $STRING
9691 && $property_type != $UNKNOWN)
9692 {
9693 $default_table = $property_info{$property_addr}
96cfc54a 9694 {$DEFAULT_TABLE}
99870f4d
KW
9695 = $property_object->table($default_map);
9696 }
9697 } # End of is first time for this property
9698 } # End of switching properties.
9699
9700 # Ready to process the line.
9701 # The Unicode files are set up so that if the map is not defined,
9702 # it is a binary property with value 'Y'
9703 if (! defined $map) {
9704 $map = 'Y';
9705 }
9706 else {
9707
9708 # If the map begins with a special command to us (enclosed in
9709 # delimiters), extract the command(s).
a35d7f90
KW
9710 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9711 my $command = $1;
9712 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9713 $replace = $1;
99870f4d 9714 }
a35d7f90
KW
9715 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9716 $map_type = $1;
9717 }
9718 else {
9719 $file->carp_bad_line("Unknown command line: '$1'");
9720 next LINE;
9721 }
9722 }
99870f4d
KW
9723 }
9724
9725 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9726 {
9727
9728 # Here, we have a map to a particular code point, and the
9729 # default map is to a code point itself. If the range
9730 # includes the particular code point, change that portion of
9731 # the range to the default. This makes sure that in the final
9732 # table only the non-defaults are listed.
9733 my $decimal_map = hex $map;
9734 if ($low <= $decimal_map && $decimal_map <= $high) {
9735
9736 # If the range includes stuff before or after the map
9737 # we're changing, split it and process the split-off parts
9738 # later.
9739 if ($low < $decimal_map) {
9740 $file->insert_adjusted_lines(
9741 sprintf("%04X..%04X; %s; %s",
9742 $low,
9743 $decimal_map - 1,
9744 $property_name,
9745 $map));
9746 }
9747 if ($high > $decimal_map) {
9748 $file->insert_adjusted_lines(
9749 sprintf("%04X..%04X; %s; %s",
9750 $decimal_map + 1,
9751 $high,
9752 $property_name,
9753 $map));
9754 }
9755 $low = $high = $decimal_map;
9756 $map = $CODE_POINT;
9757 }
9758 }
9759
9760 # If we can tell that this is a synonym for the default map, use
9761 # the default one instead.
9762 if ($property_type != $STRING
9763 && $property_type != $UNKNOWN)
9764 {
9765 my $table = $property_object->table($map);
9766 if (defined $table && $table == $default_table) {
9767 $map = $default_map;
9768 }
9769 }
9770
9771 # And figure out the map type if not known.
9772 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9773 if ($map eq "") { # Nulls are always $NULL map type
9774 $map_type = $NULL;
9775 } # Otherwise, non-strings, and those that don't allow
9776 # $MULTI_CP, and those that aren't multiple code points are
9777 # 0
9778 elsif
9779 (($property_type != $STRING && $property_type != $UNKNOWN)
9780 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9781 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9782 {
9783 $map_type = 0;
9784 }
9785 else {
9786 $map_type = $MULTI_CP;
9787 }
9788 }
9789
9790 $property_object->add_map($low, $high,
9791 $map,
9792 Type => $map_type,
9793 Replace => $replace);
9794 } # End of loop through file's lines
9795
9796 return;
9797 }
9798}
9799
99870f4d
KW
9800{ # Closure for UnicodeData.txt handling
9801
9802 # This file was the first one in the UCD; its design leads to some
9803 # awkwardness in processing. Here is a sample line:
9804 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9805 # The fields in order are:
9806 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9807 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9808 my $CATEGORY = $i++; # category (e.g. "Lu")
9809 my $CCC = $i++; # Canonical combining class (e.g. "230")
9810 my $BIDI = $i++; # directional class (e.g. "L")
9811 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9812 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9813 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9814 # Dual-use in this program; see below
9815 my $NUMERIC = $i++; # numeric value
9816 my $MIRRORED = $i++; # ? mirrored
9817 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9818 my $COMMENT = $i++; # iso comment
9819 my $UPPER = $i++; # simple uppercase mapping
9820 my $LOWER = $i++; # simple lowercase mapping
9821 my $TITLE = $i++; # simple titlecase mapping
9822 my $input_field_count = $i;
9823
9824 # This routine in addition outputs these extra fields:
9825 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9826
9827 # These fields are modifications of ones above, and are usually
9828 # suppressed; they must come last, as for speed, the loop upper bound is
9829 # normally set to ignore them
9830 my $NAME = $i++; # This is the strict name field, not the one that
9831 # charnames uses.
9832 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9833 # by Unicode::Normalize
99870f4d
KW
9834 my $last_field = $i - 1;
9835
9836 # All these are read into an array for each line, with the indices defined
9837 # above. The empty fields in the example line above indicate that the
9838 # value is defaulted. The handler called for each line of the input
9839 # changes these to their defaults.
9840
9841 # Here are the official names of the properties, in a parallel array:
9842 my @field_names;
9843 $field_names[$BIDI] = 'Bidi_Class';
9844 $field_names[$CATEGORY] = 'General_Category';
9845 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9846 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9847 $field_names[$COMMENT] = 'ISO_Comment';
9848 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9849 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9850 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9851 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9852 $field_names[$NAME] = 'Name';
9853 $field_names[$NUMERIC] = 'Numeric_Value';
9854 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9855 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9856 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9857 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9858 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9859 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9860
28093d0e
KW
9861 # Some of these need a little more explanation:
9862 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9863 # property, but is used in calculating the Numeric_Type. Perl however,
9864 # creates a file from this field, so a Perl property is created from it.
9865 # Similarly, the Other_Digit field is used only for calculating the
9866 # Numeric_Type, and so it can be safely re-used as the place to store
9867 # the value for Numeric_Type; hence it is referred to as
9868 # $NUMERIC_TYPE_OTHER_DIGIT.
9869 # The input field named $PERL_DECOMPOSITION is a combination of both the
9870 # decomposition mapping and its type. Perl creates a file containing
9871 # exactly this field, so it is used for that. The two properties are
9872 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9873 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9874 # output it), as Perl doesn't use it directly.
9875 # The input field named here $CHARNAME is used to construct the
9876 # Perl_Charnames property, which is a combination of the Name property
9877 # (which the input field contains), and the Unicode_1_Name property, and
9878 # others from other files. Since, the strict Name property is not used
9879 # by Perl, this field is used for the table that Perl does use. The
9880 # strict Name property table is usually suppressed (unless the lists are
9881 # changed to output it), so it is accumulated in a separate field,
9882 # $NAME, which to save time is discarded unless the table is actually to
9883 # be output
99870f4d
KW
9884
9885 # This file is processed like most in this program. Control is passed to
9886 # process_generic_property_file() which calls filter_UnicodeData_line()
9887 # for each input line. This filter converts the input into line(s) that
9888 # process_generic_property_file() understands. There is also a setup
9889 # routine called before any of the file is processed, and a handler for
9890 # EOF processing, all in this closure.
9891
9892 # A huge speed-up occurred at the cost of some added complexity when these
9893 # routines were altered to buffer the outputs into ranges. Almost all the
9894 # lines of the input file apply to just one code point, and for most
9895 # properties, the map for the next code point up is the same as the
9896 # current one. So instead of creating a line for each property for each
9897 # input line, filter_UnicodeData_line() remembers what the previous map
9898 # of a property was, and doesn't generate a line to pass on until it has
9899 # to, as when the map changes; and that passed-on line encompasses the
9900 # whole contiguous range of code points that have the same map for that
9901 # property. This means a slight amount of extra setup, and having to
9902 # flush these buffers on EOF, testing if the maps have changed, plus
9903 # remembering state information in the closure. But it means a lot less
9904 # real time in not having to change the data base for each property on
9905 # each line.
9906
9907 # Another complication is that there are already a few ranges designated
9908 # in the input. There are two lines for each, with the same maps except
9909 # the code point and name on each line. This was actually the hardest
9910 # thing to design around. The code points in those ranges may actually
9911 # have real maps not given by these two lines. These maps will either
56339b2c 9912 # be algorithmically determinable, or be in the extracted files furnished
99870f4d
KW
9913 # with the UCD. In the event of conflicts between these extracted files,
9914 # and this one, Unicode says that this one prevails. But it shouldn't
9915 # prevail for conflicts that occur in these ranges. The data from the
9916 # extracted files prevails in those cases. So, this program is structured
9917 # so that those files are processed first, storing maps. Then the other
9918 # files are processed, generally overwriting what the extracted files
9919 # stored. But just the range lines in this input file are processed
9920 # without overwriting. This is accomplished by adding a special string to
9921 # the lines output to tell process_generic_property_file() to turn off the
9922 # overwriting for just this one line.
9923 # A similar mechanism is used to tell it that the map is of a non-default
9924 # type.
9925
9926 sub setup_UnicodeData { # Called before any lines of the input are read
9927 my $file = shift;
9928 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9929
28093d0e
KW
9930 # Create a new property specially located that is a combination of the
9931 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9932 # Name_Alias properties. (The final duplicates elements of the
9933 # first.) A comment for it will later be constructed based on the
9934 # actual properties present and used
3e20195b 9935 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9936 Default_Map => "",
9937 Directory => File::Spec->curdir(),
9938 File => 'Name',
301ba948 9939 Fate => $INTERNAL_ONLY,
28093d0e 9940 Perl_Extension => 1,
b1c167a3 9941 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9942 Type => $STRING,
9943 );
5be997b0 9944 $perl_charname->set_proxy_for('Name', 'Name_Alias');
28093d0e 9945
99870f4d 9946 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9947 Directory => File::Spec->curdir(),
99870f4d 9948 File => 'Decomposition',
a14f3cb1 9949 Format => $DECOMP_STRING_FORMAT,
301ba948 9950 Fate => $INTERNAL_ONLY,
99870f4d
KW
9951 Perl_Extension => 1,
9952 Default_Map => $CODE_POINT,
9953
0c07e538
KW
9954 # normalize.pm can't cope with these
9955 Output_Range_Counts => 0,
9956
99870f4d
KW
9957 # This is a specially formatted table
9958 # explicitly for normalize.pm, which
9959 # is expecting a particular format,
9960 # which means that mappings containing
9961 # multiple code points are in the main
9962 # body of the table
9963 Map_Type => $COMPUTE_NO_MULTI_CP,
9964 Type => $STRING,
9965 );
5be997b0 9966 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
99870f4d
KW
9967 $Perl_decomp->add_comment(join_lines(<<END
9968This mapping is a combination of the Unicode 'Decomposition_Type' and
9969'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
8d6427a5 9970identical to the official Unicode 'Decomposition_Mapping' property except for
99870f4d
KW
9971two things:
9972 1) It omits the algorithmically determinable Hangul syllable decompositions,
9973which normalize.pm handles algorithmically.
9974 2) It contains the decomposition type as well. Non-canonical decompositions
9975begin with a word in angle brackets, like <super>, which denotes the
9976compatible decomposition type. If the map does not begin with the <angle
9977brackets>, the decomposition is canonical.
9978END
9979 ));
9980
9981 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9982 Default_Map => "",
9983 Perl_Extension => 1,
9984 File => 'Digit', # Trad. location
9985 Directory => $map_directory,
9986 Type => $STRING,
9987 Range_Size_1 => 1,
9988 );
9989 $Decimal_Digit->add_comment(join_lines(<<END
9990This file gives the mapping of all code points which represent a single
9991decimal digit [0-9] to their respective digits. For example, the code point
9992U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9993that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9994numerals.
9995END
9996 ));
9997
28093d0e
KW
9998 # These properties are not used for generating anything else, and are
9999 # usually not output. By making them last in the list, we can just
99870f4d 10000 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
10001 # generating a table(s) that is/are just going to get thrown away.
10002 if (! property_ref('Decomposition_Mapping')->to_output_map
10003 && ! property_ref('Name')->to_output_map)
10004 {
10005 $last_field = min($NAME, $DECOMP_MAP) - 1;
10006 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10007 $last_field = $DECOMP_MAP;
10008 } elsif (property_ref('Name')->to_output_map) {
10009 $last_field = $NAME;
99870f4d
KW
10010 }
10011 return;
10012 }
10013
10014 my $first_time = 1; # ? Is this the first line of the file
10015 my $in_range = 0; # ? Are we in one of the file's ranges
10016 my $previous_cp; # hex code point of previous line
10017 my $decimal_previous_cp = -1; # And its decimal equivalent
10018 my @start; # For each field, the current starting
10019 # code point in hex for the range
10020 # being accumulated.
10021 my @fields; # The input fields;
10022 my @previous_fields; # And those from the previous call
10023
10024 sub filter_UnicodeData_line {
10025 # Handle a single input line from UnicodeData.txt; see comments above
10026 # Conceptually this takes a single line from the file containing N
10027 # properties, and converts it into N lines with one property per line,
10028 # which is what the final handler expects. But there are
10029 # complications due to the quirkiness of the input file, and to save
10030 # time, it accumulates ranges where the property values don't change
10031 # and only emits lines when necessary. This is about an order of
10032 # magnitude fewer lines emitted.
10033
10034 my $file = shift;
10035 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10036
10037 # $_ contains the input line.
10038 # -1 in split means retain trailing null fields
10039 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10040
10041 #local $to_trace = 1 if main::DEBUG;
10042 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10043 if (@fields > $input_field_count) {
10044 $file->carp_bad_line('Extra fields');
10045 $_ = "";
10046 return;
10047 }
10048
10049 my $decimal_cp = hex $cp;
10050
10051 # We have to output all the buffered ranges when the next code point
10052 # is not exactly one after the previous one, which means there is a
10053 # gap in the ranges.
10054 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10055
10056 # The decomposition mapping field requires special handling. It looks
10057 # like either:
10058 #
10059 # <compat> 0032 0020
10060 # 0041 0300
10061 #
10062 # The decomposition type is enclosed in <brackets>; if missing, it
10063 # means the type is canonical. There are two decomposition mapping
10064 # tables: the one for use by Perl's normalize.pm has a special format
10065 # which is this field intact; the other, for general use is of
10066 # standard format. In either case we have to find the decomposition
10067 # type. Empty fields have None as their type, and map to the code
10068 # point itself
10069 if ($fields[$PERL_DECOMPOSITION] eq "") {
10070 $fields[$DECOMP_TYPE] = 'None';
10071 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10072 }
10073 else {
10074 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10075 =~ / < ( .+? ) > \s* ( .+ ) /x;
10076 if (! defined $fields[$DECOMP_TYPE]) {
10077 $fields[$DECOMP_TYPE] = 'Canonical';
10078 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10079 }
10080 else {
10081 $fields[$DECOMP_MAP] = $map;
10082 }
10083 }
10084
10085 # The 3 numeric fields also require special handling. The 2 digit
10086 # fields must be either empty or match the number field. This means
10087 # that if it is empty, they must be as well, and the numeric type is
10088 # None, and the numeric value is 'Nan'.
10089 # The decimal digit field must be empty or match the other digit
10090 # field. If the decimal digit field is non-empty, the code point is
10091 # a decimal digit, and the other two fields will have the same value.
10092 # If it is empty, but the other digit field is non-empty, the code
10093 # point is an 'other digit', and the number field will have the same
10094 # value as the other digit field. If the other digit field is empty,
10095 # but the number field is non-empty, the code point is a generic
10096 # numeric type.
10097 if ($fields[$NUMERIC] eq "") {
10098 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10099 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10100 ) {
10101 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10102 }
10103 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10104 $fields[$NUMERIC] = 'NaN';
10105 }
10106 else {
10107 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10108 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10109 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10110 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10111 }
10112 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10113 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10114 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10115 }
10116 else {
10117 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10118
10119 # Rationals require extra effort.
10120 register_fraction($fields[$NUMERIC])
10121 if $fields[$NUMERIC] =~ qr{/};
10122 }
10123 }
10124
10125 # For the properties that have empty fields in the file, and which
10126 # mean something different from empty, change them to that default.
10127 # Certain fields just haven't been empty so far in any Unicode
10128 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10129 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 10130 # the defaults; which are very unlikely to ever change.
99870f4d
KW
10131 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10132 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10133
10134 # UAX44 says that if title is empty, it is the same as whatever upper
10135 # is,
10136 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10137
10138 # There are a few pairs of lines like:
10139 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10140 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10141 # that define ranges. These should be processed after the fields are
10142 # adjusted above, as they may override some of them; but mostly what
28093d0e 10143 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
10144 # paired lines start with a '<', but this is also true of '<control>,
10145 # which isn't one of these special ones.
28093d0e 10146 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
10147
10148 # Some code points in this file have the pseudo-name
10149 # '<control>', but the official name for such ones is the null
28093d0e 10150 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 10151 $fields[$NAME] = "";
28093d0e 10152 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
10153
10154 # We had better not be in between range lines.
10155 if ($in_range) {
28093d0e 10156 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10157 $in_range = 0;
10158 }
10159 }
28093d0e 10160 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
10161
10162 # Here is a non-range line. We had better not be in between range
10163 # lines.
10164 if ($in_range) {
28093d0e 10165 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10166 $in_range = 0;
10167 }
edb80b88
KW
10168 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10169
10170 # These are code points whose names end in their code points,
10171 # which means the names are algorithmically derivable from the
10172 # code points. To shorten the output Name file, the algorithm
10173 # for deriving these is placed in the file instead of each
10174 # code point, so they have map type $CP_IN_NAME
10175 $fields[$CHARNAME] = $CMD_DELIM
10176 . $MAP_TYPE_CMD
10177 . '='
10178 . $CP_IN_NAME
10179 . $CMD_DELIM
10180 . $fields[$CHARNAME];
10181 }
28093d0e 10182 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 10183 }
28093d0e
KW
10184 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10185 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
10186
10187 # Here we are at the beginning of a range pair.
10188 if ($in_range) {
28093d0e 10189 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10190 }
10191 $in_range = 1;
10192
10193 # Because the properties in the range do not overwrite any already
10194 # in the db, we must flush the buffers of what's already there, so
10195 # they get handled in the normal scheme.
10196 $force_output = 1;
10197
10198 }
28093d0e
KW
10199 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10200 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
10201 $_ = "";
10202 return;
10203 }
10204 else { # Here, we are at the last line of a range pair.
10205
10206 if (! $in_range) {
28093d0e 10207 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
10208 $_ = "";
10209 return;
10210 }
10211 $in_range = 0;
10212
28093d0e
KW
10213 $fields[$NAME] = $fields[$CHARNAME];
10214
99870f4d
KW
10215 # Check that the input is valid: that the closing of the range is
10216 # the same as the beginning.
10217 foreach my $i (0 .. $last_field) {
10218 next if $fields[$i] eq $previous_fields[$i];
10219 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10220 }
10221
10222 # The processing differs depending on the type of range,
28093d0e
KW
10223 # determined by its $CHARNAME
10224 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
10225
10226 # Check that the data looks right.
10227 if ($decimal_previous_cp != $SBase) {
10228 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10229 }
10230 if ($decimal_cp != $SBase + $SCount - 1) {
10231 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10232 }
10233
10234 # The Hangul syllable range has a somewhat complicated name
10235 # generation algorithm. Each code point in it has a canonical
10236 # decomposition also computable by an algorithm. The
10237 # perl decomposition map table built from these is used only
10238 # by normalize.pm, which has the algorithm built in it, so the
10239 # decomposition maps are not needed, and are large, so are
10240 # omitted from it. If the full decomposition map table is to
10241 # be output, the decompositions are generated for it, in the
10242 # EOF handling code for this input file.
10243
10244 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10245
10246 # This range is stored in our internal structure with its
10247 # own map type, different from all others.
28093d0e
KW
10248 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10249 = $CMD_DELIM
99870f4d
KW
10250 . $MAP_TYPE_CMD
10251 . '='
10252 . $HANGUL_SYLLABLE
10253 . $CMD_DELIM
28093d0e 10254 . $fields[$CHARNAME];
99870f4d 10255 }
28093d0e 10256 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
10257
10258 # The name for these contains the code point itself, and all
10259 # are defined to have the same base name, regardless of what
10260 # is in the file. They are stored in our internal structure
10261 # with a map type of $CP_IN_NAME
28093d0e
KW
10262 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10263 = $CMD_DELIM
99870f4d
KW
10264 . $MAP_TYPE_CMD
10265 . '='
10266 . $CP_IN_NAME
10267 . $CMD_DELIM
10268 . 'CJK UNIFIED IDEOGRAPH';
10269
10270 }
10271 elsif ($fields[$CATEGORY] eq 'Co'
10272 || $fields[$CATEGORY] eq 'Cs')
10273 {
10274 # The names of all the code points in these ranges are set to
10275 # null, as there are no names for the private use and
10276 # surrogate code points.
10277
28093d0e 10278 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
10279 }
10280 else {
28093d0e 10281 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
10282 }
10283
10284 # The first line of the range caused everything else to be output,
10285 # and then its values were stored as the beginning values for the
10286 # next set of ranges, which this one ends. Now, for each value,
10287 # add a command to tell the handler that these values should not
10288 # replace any existing ones in our database.
10289 foreach my $i (0 .. $last_field) {
10290 $previous_fields[$i] = $CMD_DELIM
10291 . $REPLACE_CMD
10292 . '='
10293 . $NO
10294 . $CMD_DELIM
10295 . $previous_fields[$i];
10296 }
10297
10298 # And change things so it looks like the entire range has been
10299 # gone through with this being the final part of it. Adding the
10300 # command above to each field will cause this range to be flushed
10301 # during the next iteration, as it guaranteed that the stored
10302 # field won't match whatever value the next one has.
10303 $previous_cp = $cp;
10304 $decimal_previous_cp = $decimal_cp;
10305
10306 # We are now set up for the next iteration; so skip the remaining
10307 # code in this subroutine that does the same thing, but doesn't
10308 # know about these ranges.
10309 $_ = "";
c1739a4a 10310
99870f4d
KW
10311 return;
10312 }
10313
10314 # On the very first line, we fake it so the code below thinks there is
10315 # nothing to output, and initialize so that when it does get output it
10316 # uses the first line's values for the lowest part of the range.
10317 # (One could avoid this by using peek(), but then one would need to
10318 # know the adjustments done above and do the same ones in the setup
10319 # routine; not worth it)
10320 if ($first_time) {
10321 $first_time = 0;
10322 @previous_fields = @fields;
10323 @start = ($cp) x scalar @fields;
10324 $decimal_previous_cp = $decimal_cp - 1;
10325 }
10326
10327 # For each field, output the stored up ranges that this code point
10328 # doesn't fit in. Earlier we figured out if all ranges should be
10329 # terminated because of changing the replace or map type styles, or if
10330 # there is a gap between this new code point and the previous one, and
10331 # that is stored in $force_output. But even if those aren't true, we
10332 # need to output the range if this new code point's value for the
10333 # given property doesn't match the stored range's.
10334 #local $to_trace = 1 if main::DEBUG;
10335 foreach my $i (0 .. $last_field) {
10336 my $field = $fields[$i];
10337 if ($force_output || $field ne $previous_fields[$i]) {
10338
10339 # Flush the buffer of stored values.
10340 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10341
10342 # Start a new range with this code point and its value
10343 $start[$i] = $cp;
10344 $previous_fields[$i] = $field;
10345 }
10346 }
10347
10348 # Set the values for the next time.
10349 $previous_cp = $cp;
10350 $decimal_previous_cp = $decimal_cp;
10351
10352 # The input line has generated whatever adjusted lines are needed, and
10353 # should not be looked at further.
10354 $_ = "";
10355 return;
10356 }
10357
10358 sub EOF_UnicodeData {
10359 # Called upon EOF to flush the buffers, and create the Hangul
10360 # decomposition mappings if needed.
10361
10362 my $file = shift;
10363 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10364
10365 # Flush the buffers.
10366 foreach my $i (1 .. $last_field) {
10367 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10368 }
10369
10370 if (-e 'Jamo.txt') {
10371
10372 # The algorithm is published by Unicode, based on values in
10373 # Jamo.txt, (which should have been processed before this
10374 # subroutine), and the results left in %Jamo
10375 unless (%Jamo) {
10376 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10377 return;
10378 }
10379
10380 # If the full decomposition map table is being output, insert
10381 # into it the Hangul syllable mappings. This is to avoid having
10382 # to publish a subroutine in it to compute them. (which would
10383 # essentially be this code.) This uses the algorithm published by
10384 # Unicode.
10385 if (property_ref('Decomposition_Mapping')->to_output_map) {
10386 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10387 use integer;
10388 my $SIndex = $S - $SBase;
10389 my $L = $LBase + $SIndex / $NCount;
10390 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10391 my $T = $TBase + $SIndex % $TCount;
10392
10393 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10394 my $decomposition = sprintf("%04X %04X", $L, $V);
10395 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10396 $file->insert_adjusted_lines(
10397 sprintf("%04X; Decomposition_Mapping; %s",
10398 $S,
10399 $decomposition));
10400 }
10401 }
10402 }
10403
10404 return;
10405 }
10406
10407 sub filter_v1_ucd {
10408 # Fix UCD lines in version 1. This is probably overkill, but this
10409 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10410 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10411 # removed. This program retains them
10412 # 2) didn't include ranges, which it should have, and which are now
10413 # added in @corrected_lines below. It was hand populated by
10414 # taking the data from Version 2, verified by analyzing
10415 # DAge.txt.
10416 # 3) There is a syntax error in the entry for U+09F8 which could
10417 # cause problems for utf8_heavy, and so is changed. It's
10418 # numeric value was simply a minus sign, without any number.
10419 # (Eventually Unicode changed the code point to non-numeric.)
10420 # 4) The decomposition types often don't match later versions
10421 # exactly, and the whole syntax of that field is different; so
10422 # the syntax is changed as well as the types to their later
10423 # terminology. Otherwise normalize.pm would be very unhappy
10424 # 5) Many ccc classes are different. These are left intact.
10425 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10426 # fields. These are unchanged because it doesn't really cause
10427 # problems for Perl.
10428 # 7) A number of code points, such as controls, don't have their
10429 # Unicode Version 1 Names in this file. These are unchanged.
10430
10431 my @corrected_lines = split /\n/, <<'END';
104324E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
104339FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10434E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10435F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10436F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10437FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10438END
10439
10440 my $file = shift;
10441 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10442
10443 #local $to_trace = 1 if main::DEBUG;
10444 trace $_ if main::DEBUG && $to_trace;
10445
10446 # -1 => retain trailing null fields
10447 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10448
10449 # At the first place that is wrong in the input, insert all the
10450 # corrections, replacing the wrong line.
10451 if ($code_point eq '4E00') {
10452 my @copy = @corrected_lines;
10453 $_ = shift @copy;
10454 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10455
10456 $file->insert_lines(@copy);
10457 }
10458
10459
10460 if ($fields[$NUMERIC] eq '-') {
10461 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10462 }
10463
10464 if ($fields[$PERL_DECOMPOSITION] ne "") {
10465
10466 # Several entries have this change to superscript 2 or 3 in the
10467 # middle. Convert these to the modern version, which is to use
10468 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10469 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10470 # 'HHHH HHHH 00B3 HHHH'.
10471 # It turns out that all of these that don't have another
10472 # decomposition defined at the beginning of the line have the
10473 # <square> decomposition in later releases.
10474 if ($code_point ne '00B2' && $code_point ne '00B3') {
10475 if ($fields[$PERL_DECOMPOSITION]
10476 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10477 {
10478 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10479 $fields[$PERL_DECOMPOSITION] = '<square> '
10480 . $fields[$PERL_DECOMPOSITION];
10481 }
10482 }
10483 }
10484
10485 # If is like '<+circled> 0052 <-circled>', convert to
10486 # '<circled> 0052'
10487 $fields[$PERL_DECOMPOSITION] =~
10488 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10489
10490 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10491 $fields[$PERL_DECOMPOSITION] =~
10492 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10493 or $fields[$PERL_DECOMPOSITION] =~
10494 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10495 or $fields[$PERL_DECOMPOSITION] =~
10496 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10497 or $fields[$PERL_DECOMPOSITION] =~
10498 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10499
10500 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10501 $fields[$PERL_DECOMPOSITION] =~
10502 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10503
10504 # Change names to modern form.
10505 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10506 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10507 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10508 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10509
10510 # One entry has weird braces
10511 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10512 }
10513
10514 $_ = join ';', $code_point, @fields;
10515 trace $_ if main::DEBUG && $to_trace;
10516 return;
10517 }
10518
10519 sub filter_v2_1_5_ucd {
10520 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10521 # columns swapped; These all had mirrored be 'N'. So if the numeric
10522 # column appears to be N, swap it back.
10523
10524 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10525 if ($fields[$NUMERIC] eq 'N') {
10526 $fields[$NUMERIC] = $fields[$MIRRORED];
10527 $fields[$MIRRORED] = 'N';
10528 $_ = join ';', $code_point, @fields;
10529 }
10530 return;
10531 }
3ffed8c2
KW
10532
10533 sub filter_v6_ucd {
10534
c12f2655
KW
10535 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10536 # accepted that yet to allow for some deprecation cycles.
3ffed8c2 10537
484741e1 10538 return if $_ !~ /^(?:0007|1F514|070F);/;
3ffed8c2
KW
10539
10540 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10541 if ($code_point eq '0007') {
dcd72625 10542 $fields[$CHARNAME] = "";
3ffed8c2 10543 }
484741e1
KW
10544 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10545 # http://www.unicode.org/versions/corrigendum8.html
10546 $fields[$BIDI] = "AL";
10547 }
10914c78 10548 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
3ffed8c2
KW
10549 $fields[$CHARNAME] = "";
10550 }
10551
10552 $_ = join ';', $code_point, @fields;
10553
10554 return;
10555 }
99870f4d
KW
10556} # End closure for UnicodeData
10557
37e2e78e
KW
10558sub process_GCB_test {
10559
10560 my $file = shift;
10561 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10562
10563 while ($file->next_line) {
10564 push @backslash_X_tests, $_;
10565 }
678f13d5 10566
37e2e78e
KW
10567 return;
10568}
10569
99870f4d
KW
10570sub process_NamedSequences {
10571 # NamedSequences.txt entries are just added to an array. Because these
10572 # don't look like the other tables, they have their own handler.
10573 # An example:
10574 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10575 #
10576 # This just adds the sequence to an array for later handling
10577
99870f4d
KW
10578 my $file = shift;
10579 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10580
10581 while ($file->next_line) {
10582 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10583 if (@remainder) {
10584 $file->carp_bad_line(
10585 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10586 next;
10587 }
fb121860
KW
10588
10589 # Note single \t in keeping with special output format of
10590 # Perl_charnames. But it turns out that the code points don't have to
10591 # be 5 digits long, like the rest, based on the internal workings of
10592 # charnames.pm. This could be easily changed for consistency.
10593 push @named_sequences, "$sequence\t$name";
99870f4d
KW
10594 }
10595 return;
10596}
10597
10598{ # Closure
10599
10600 my $first_range;
10601
10602 sub filter_early_ea_lb {
10603 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10604 # third field be the name of the code point, which can be ignored in
10605 # most cases. But it can be meaningful if it marks a range:
10606 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10607 # 3400;W;<CJK Ideograph Extension A, First>
10608 #
10609 # We need to see the First in the example above to know it's a range.
10610 # They did not use the later range syntaxes. This routine changes it
10611 # to use the modern syntax.
10612 # $1 is the Input_file object.
10613
10614 my @fields = split /\s*;\s*/;
10615 if ($fields[2] =~ /^<.*, First>/) {
10616 $first_range = $fields[0];
10617 $_ = "";
10618 }
10619 elsif ($fields[2] =~ /^<.*, Last>/) {
10620 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10621 }
10622 else {
10623 undef $first_range;
10624 $_ = "$fields[0]; $fields[1]";
10625 }
10626
10627 return;
10628 }
10629}
10630
10631sub filter_old_style_arabic_shaping {
10632 # Early versions used a different term for the later one.
10633
10634 my @fields = split /\s*;\s*/;
10635 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10636 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10637 $_ = join ';', @fields;
10638 return;
10639}
10640
10641sub filter_arabic_shaping_line {
10642 # ArabicShaping.txt has entries that look like:
10643 # 062A; TEH; D; BEH
10644 # The field containing 'TEH' is not used. The next field is Joining_Type
10645 # and the last is Joining_Group
10646 # This generates two lines to pass on, one for each property on the input
10647 # line.
10648
10649 my $file = shift;
10650 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10651
10652 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10653
10654 if (@fields > 4) {
10655 $file->carp_bad_line('Extra fields');
10656 $_ = "";
10657 return;
10658 }
10659
10660 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10661 $_ = "$fields[0]; Joining_Type; $fields[2]";
10662
10663 return;
10664}
10665
d3fed3dd
KW
10666{ # Closure
10667 my $lc; # Table for lowercase mapping
10668 my $tc;
10669 my $uc;
10670
6c0259ad
KW
10671 sub setup_special_casing {
10672 # SpecialCasing.txt contains the non-simple case change mappings. The
10673 # simple ones are in UnicodeData.txt, which should already have been
10674 # read in to the full property data structures, so as to initialize
10675 # these with the simple ones. Then the SpecialCasing.txt entries
10676 # overwrite the ones which have different full mappings.
10677
10678 # This routine sees if the simple mappings are to be output, and if
10679 # so, copies what has already been put into the full mapping tables,
10680 # while they still contain only the simple mappings.
10681
10682 # The reason it is done this way is that the simple mappings are
10683 # probably not going to be output, so it saves work to initialize the
10684 # full tables with the simple mappings, and then overwrite those
10685 # relatively few entries in them that have different full mappings,
10686 # and thus skip the simple mapping tables altogether.
10687
c12f2655
KW
10688 # New tables with just the simple mappings that are overridden by the
10689 # full ones are constructed. These are for Unicode::UCD, which
10690 # requires the simple mappings. The Case_Folding table is a combined
10691 # table of both the simple and full mappings, with the full ones being
10692 # in the hash, and the simple ones, even those overridden by the hash,
10693 # being in the base table. That same mechanism could have been
10694 # employed here, except that the docs have said that the generated
10695 # files are usuable directly by programs, so we dare not change the
10696 # format in any way.
10697
6c0259ad
KW
10698 my $file= shift;
10699 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10700
6c0259ad
KW
10701 $lc = property_ref('lc');
10702 $tc = property_ref('tc');
10703 $uc = property_ref('uc');
10704
10705 # For each of the case change mappings...
10706 foreach my $case_table ($lc, $tc, $uc) {
10707 my $case = $case_table->name;
10708 my $full = property_ref($case);
10709 unless (defined $full && ! $full->is_empty) {
10710 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10711 }
10712
10713 # The simple version's name in each mapping merely has an 's' in
10714 # front of the full one's
301ba948
KW
10715 my $simple_name = 's' . $case;
10716 my $simple = property_ref($simple_name);
6c0259ad
KW
10717 $simple->initialize($full) if $simple->to_output_map();
10718
10719 my $simple_only = Property->new("_s$case",
10720 Type => $STRING,
10721 Default_Map => $CODE_POINT,
10722 Perl_Extension => 1,
301ba948 10723 Fate => $INTERNAL_ONLY,
9c27f500 10724 Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
6c0259ad
KW
10725 $simple_only->set_to_output_map($INTERNAL_MAP);
10726 $simple_only->add_comment(join_lines( <<END
d3fed3dd
KW
10727This file is for UCD.pm so that it can construct simple mappings that would
10728otherwise be lost because they are overridden by full mappings.
10729END
6c0259ad 10730 ));
5be997b0
KW
10731
10732 unless ($simple->to_output_map()) {
10733 $simple_only->set_proxy_for($simple_name);
10734 }
6c0259ad 10735 }
99870f4d 10736
6c0259ad
KW
10737 return;
10738 }
99870f4d 10739
6c0259ad
KW
10740 sub filter_special_casing_line {
10741 # Change the format of $_ from SpecialCasing.txt into something that
10742 # the generic handler understands. Each input line contains three
10743 # case mappings. This will generate three lines to pass to the
10744 # generic handler for each of those.
99870f4d 10745
6c0259ad
KW
10746 # The input syntax (after stripping comments and trailing white space
10747 # is like one of the following (with the final two being entries that
10748 # we ignore):
10749 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10750 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10751 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10752 # Note the trailing semi-colon, unlike many of the input files. That
10753 # means that there will be an extra null field generated by the split
99870f4d 10754
6c0259ad
KW
10755 my $file = shift;
10756 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10757
6c0259ad
KW
10758 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10759 # fields
10760
10761 # field #4 is when this mapping is conditional. If any of these get
10762 # implemented, it would be by hard-coding in the casing functions in
10763 # the Perl core, not through tables. But if there is a new condition
10764 # we don't know about, output a warning. We know about all the
10765 # conditions through 6.0
10766 if ($fields[4] ne "") {
10767 my @conditions = split ' ', $fields[4];
10768 if ($conditions[0] ne 'tr' # We know that these languages have
10769 # conditions, and some are multiple
10770 && $conditions[0] ne 'az'
10771 && $conditions[0] ne 'lt'
10772
10773 # And, we know about a single condition Final_Sigma, but
10774 # nothing else.
10775 && ($v_version gt v5.2.0
10776 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10777 {
10778 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10779 }
10780 elsif ($conditions[0] ne 'Final_Sigma') {
99870f4d 10781
6c0259ad
KW
10782 # Don't print out a message for Final_Sigma, because we
10783 # have hard-coded handling for it. (But the standard
10784 # could change what the rule should be, but it wouldn't
10785 # show up here anyway.
99870f4d 10786
6c0259ad 10787 print "# SKIPPING Special Casing: $_\n"
99870f4d 10788 if $verbosity >= $VERBOSE;
6c0259ad
KW
10789 }
10790 $_ = "";
10791 return;
10792 }
10793 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10794 $file->carp_bad_line('Extra fields');
10795 $_ = "";
10796 return;
99870f4d 10797 }
99870f4d 10798
6c0259ad
KW
10799 $_ = "$fields[0]; lc; $fields[1]";
10800 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10801 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
99870f4d 10802
6c0259ad
KW
10803 # Copy any simple case change to the special tables constructed if
10804 # being overridden by a multi-character case change.
10805 if ($fields[1] ne $fields[0]
10806 && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10807 {
10808 $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10809 }
10810 if ($fields[2] ne $fields[0]
10811 && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10812 {
10813 $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10814 }
10815 if ($fields[3] ne $fields[0]
10816 && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10817 {
10818 $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10819 }
d3fed3dd 10820
6c0259ad
KW
10821 return;
10822 }
d3fed3dd 10823}
99870f4d
KW
10824
10825sub filter_old_style_case_folding {
10826 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 10827 # and later style. Different letters were used in the earlier.
99870f4d
KW
10828
10829 my $file = shift;
10830 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10831
10832 my @fields = split /\s*;\s*/;
10833 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10834 $fields[1] = 'I';
10835 }
10836 elsif ($fields[1] eq 'L') {
10837 $fields[1] = 'C'; # L => C always
10838 }
10839 elsif ($fields[1] eq 'E') {
10840 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10841 $fields[1] = 'F'
10842 }
10843 else {
10844 $fields[1] = 'C'
10845 }
10846 }
10847 else {
10848 $file->carp_bad_line("Expecting L or E in second field");
10849 $_ = "";
10850 return;
10851 }
10852 $_ = join("; ", @fields) . ';';
10853 return;
10854}
10855
10856{ # Closure for case folding
10857
10858 # Create the map for simple only if are going to output it, for otherwise
10859 # it takes no part in anything we do.
10860 my $to_output_simple;
10861
99870f4d
KW
10862 sub setup_case_folding($) {
10863 # Read in the case foldings in CaseFolding.txt. This handles both
10864 # simple and full case folding.
10865
10866 $to_output_simple
10867 = property_ref('Simple_Case_Folding')->to_output_map;
10868
5be997b0
KW
10869 if (! $to_output_simple) {
10870 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
10871 }
10872
6f2a3287
KW
10873 # If we ever wanted to show that these tables were combined, a new
10874 # property method could be created, like set_combined_props()
10875 property_ref('Case_Folding')->add_comment(join_lines( <<END
10876This file includes both the simple and full case folding maps. The simple
10877ones are in the main body of the table below, and the full ones adding to or
10878overriding them are in the hash.
10879END
10880 ));
99870f4d
KW
10881 return;
10882 }
10883
10884 sub filter_case_folding_line {
10885 # Called for each line in CaseFolding.txt
10886 # Input lines look like:
10887 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10888 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10889 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10890 #
10891 # 'C' means that folding is the same for both simple and full
10892 # 'F' that it is only for full folding
10893 # 'S' that it is only for simple folding
10894 # 'T' is locale-dependent, and ignored
10895 # 'I' is a type of 'F' used in some early releases.
10896 # Note the trailing semi-colon, unlike many of the input files. That
10897 # means that there will be an extra null field generated by the split
10898 # below, which we ignore and hence is not an error.
10899
10900 my $file = shift;
10901 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10902
10903 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10904 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10905 $file->carp_bad_line('Extra fields');
10906 $_ = "";
10907 return;
10908 }
10909
10910 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10911 $_ = "";
10912 return;
10913 }
10914
10915 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
3c099872
KW
10916 # I are all full foldings; S is single-char. For S, there is always
10917 # an F entry, so we must allow multiple values for the same code
10918 # point. Fortunately this table doesn't need further manipulation
10919 # which would preclude using multiple-values. The S is now included
10920 # so that _swash_inversion_hash() is able to construct closures
10921 # without having to worry about F mappings.
10922 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
9470941f
KW
10923 $_ = "$range; Case_Folding; "
10924 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
99870f4d
KW
10925 }
10926 else {
10927 $_ = "";
3c099872 10928 $file->carp_bad_line('Expecting C F I S or T in second field');
99870f4d
KW
10929 }
10930
10931 # C and S are simple foldings, but simple case folding is not needed
10932 # unless we explicitly want its map table output.
10933 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10934 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10935 }
10936
99870f4d
KW
10937 return;
10938 }
10939
99870f4d
KW
10940} # End case fold closure
10941
10942sub filter_jamo_line {
10943 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10944 # from this file that is used in generating the Name property for Jamo
10945 # code points. But, it also is used to convert early versions' syntax
10946 # into the modern form. Here are two examples:
10947 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10948 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10949 #
10950 # The input is $_, the output is $_ filtered.
10951
10952 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10953
10954 # Let the caller handle unexpected input. In earlier versions, there was
10955 # a third field which is supposed to be a comment, but did not have a '#'
10956 # before it.
10957 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10958
10959 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10960 # beginning.
10961
10962 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10963 $fields[1] = 'R' if $fields[0] eq '1105';
10964
10965 # Add to structure so can generate Names from it.
10966 my $cp = hex $fields[0];
10967 my $short_name = $fields[1];
10968 $Jamo{$cp} = $short_name;
10969 if ($cp <= $LBase + $LCount) {
10970 $Jamo_L{$short_name} = $cp - $LBase;
10971 }
10972 elsif ($cp <= $VBase + $VCount) {
10973 $Jamo_V{$short_name} = $cp - $VBase;
10974 }
10975 elsif ($cp <= $TBase + $TCount) {
10976 $Jamo_T{$short_name} = $cp - $TBase;
10977 }
10978 else {
10979 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10980 }
10981
10982
10983 # Reassemble using just the first two fields to look like a typical
10984 # property file line
10985 $_ = "$fields[0]; $fields[1]";
10986
10987 return;
10988}
10989
99870f4d
KW
10990sub register_fraction($) {
10991 # This registers the input rational number so that it can be passed on to
10992 # utf8_heavy.pl, both in rational and floating forms.
10993
10994 my $rational = shift;
10995
10996 my $float = eval $rational;
10997 $nv_floating_to_rational{$float} = $rational;
10998 return;
10999}
11000
11001sub filter_numeric_value_line {
11002 # DNumValues contains lines of a different syntax than the typical
11003 # property file:
11004 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
11005 #
11006 # This routine transforms $_ containing the anomalous syntax to the
11007 # typical, by filtering out the extra columns, and convert early version
11008 # decimal numbers to strings that look like rational numbers.
11009
11010 my $file = shift;
11011 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11012
11013 # Starting in 5.1, there is a rational field. Just use that, omitting the
11014 # extra columns. Otherwise convert the decimal number in the second field
11015 # to a rational, and omit extraneous columns.
11016 my @fields = split /\s*;\s*/, $_, -1;
11017 my $rational;
11018
11019 if ($v_version ge v5.1.0) {
11020 if (@fields != 4) {
11021 $file->carp_bad_line('Not 4 semi-colon separated fields');
11022 $_ = "";
11023 return;
11024 }
11025 $rational = $fields[3];
11026 $_ = join '; ', @fields[ 0, 3 ];
11027 }
11028 else {
11029
11030 # Here, is an older Unicode file, which has decimal numbers instead of
11031 # rationals in it. Use the fraction to calculate the denominator and
11032 # convert to rational.
11033
11034 if (@fields != 2 && @fields != 3) {
11035 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11036 $_ = "";
11037 return;
11038 }
11039
11040 my $codepoints = $fields[0];
11041 my $decimal = $fields[1];
11042 if ($decimal =~ s/\.0+$//) {
11043
11044 # Anything ending with a decimal followed by nothing but 0's is an
11045 # integer
11046 $_ = "$codepoints; $decimal";
11047 $rational = $decimal;
11048 }
11049 else {
11050
11051 my $denominator;
11052 if ($decimal =~ /\.50*$/) {
11053 $denominator = 2;
11054 }
11055
11056 # Here have the hardcoded repeating decimals in the fraction, and
11057 # the denominator they imply. There were only a few denominators
11058 # in the older Unicode versions of this file which this code
11059 # handles, so it is easy to convert them.
11060
11061 # The 4 is because of a round-off error in the Unicode 3.2 files
11062 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11063 $denominator = 3;
11064 }
11065 elsif ($decimal =~ /\.[27]50*$/) {
11066 $denominator = 4;
11067 }
11068 elsif ($decimal =~ /\.[2468]0*$/) {
11069 $denominator = 5;
11070 }
11071 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11072 $denominator = 6;
11073 }
11074 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11075 $denominator = 8;
11076 }
11077 if ($denominator) {
11078 my $sign = ($decimal < 0) ? "-" : "";
11079 my $numerator = int((abs($decimal) * $denominator) + .5);
11080 $rational = "$sign$numerator/$denominator";
11081 $_ = "$codepoints; $rational";
11082 }
11083 else {
11084 $file->carp_bad_line("Can't cope with number '$decimal'.");
11085 $_ = "";
11086 return;
11087 }
11088 }
11089 }
11090
11091 register_fraction($rational) if $rational =~ qr{/};
11092 return;
11093}
11094
11095{ # Closure
11096 my %unihan_properties;
99870f4d
KW
11097
11098 sub setup_unihan {
11099 # Do any special setup for Unihan properties.
11100
11101 # This property gives the wrong computed type, so override.
11102 my $usource = property_ref('kIRG_USource');
11103 $usource->set_type($STRING) if defined $usource;
11104
b2abbe5b
KW
11105 # This property is to be considered binary (it says so in
11106 # http://www.unicode.org/reports/tr38/)
46b2142f 11107 my $iicore = property_ref('kIICore');
99870f4d 11108 if (defined $iicore) {
46b2142f
KW
11109 $iicore->set_type($FORCED_BINARY);
11110 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11111
11112 # Unicode doesn't include the maps for this property, so don't
11113 # warn that they are missing.
11114 $iicore->set_pre_declared_maps(0);
11115 $iicore->add_comment(join_lines( <<END
11116This property contains enum values, but Unicode UAX #38 says it should be
11117interpreted as binary, so Perl creates tables for both 1) its enum values,
11118plus 2) true/false tables in which it is considered true for all code points
11119that have a non-null value
11120END
11121 ));
99870f4d
KW
11122 }
11123
11124 return;
11125 }
11126
11127 sub filter_unihan_line {
11128 # Change unihan db lines to look like the others in the db. Here is
11129 # an input sample:
11130 # U+341C kCangjie IEKN
11131
11132 # Tabs are used instead of semi-colons to separate fields; therefore
11133 # they may have semi-colons embedded in them. Change these to periods
11134 # so won't screw up the rest of the code.
11135 s/;/./g;
11136
11137 # Remove lines that don't look like ones we accept.
11138 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11139 $_ = "";
11140 return;
11141 }
11142
11143 # Extract the property, and save a reference to its object.
11144 my $property = $1;
11145 if (! exists $unihan_properties{$property}) {
11146 $unihan_properties{$property} = property_ref($property);
11147 }
11148
11149 # Don't do anything unless the property is one we're handling, which
11150 # we determine by seeing if there is an object defined for it or not
11151 if (! defined $unihan_properties{$property}) {
11152 $_ = "";
11153 return;
11154 }
11155
99870f4d
KW
11156 # Convert the tab separators to our standard semi-colons, and convert
11157 # the U+HHHH notation to the rest of the standard's HHHH
11158 s/\t/;/g;
11159 s/\b U \+ (?= $code_point_re )//xg;
11160
11161 #local $to_trace = 1 if main::DEBUG;
11162 trace $_ if main::DEBUG && $to_trace;
11163
11164 return;
11165 }
11166}
11167
11168sub filter_blocks_lines {
11169 # In the Blocks.txt file, the names of the blocks don't quite match the
11170 # names given in PropertyValueAliases.txt, so this changes them so they
11171 # do match: Blanks and hyphens are changed into underscores. Also makes
11172 # early release versions look like later ones
11173 #
11174 # $_ is transformed to the correct value.
11175
11176 my $file = shift;
11177 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11178
11179 if ($v_version lt v3.2.0) {
11180 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11181 $_ = "";
11182 return;
11183 }
11184
11185 # Old versions used a different syntax to mark the range.
11186 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11187 }
11188
11189 my @fields = split /\s*;\s*/, $_, -1;
11190 if (@fields != 2) {
11191 $file->carp_bad_line("Expecting exactly two fields");
11192 $_ = "";
11193 return;
11194 }
11195
11196 # Change hyphens and blanks in the block name field only
11197 $fields[1] =~ s/[ -]/_/g;
11198 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11199
11200 $_ = join("; ", @fields);
11201 return;
11202}
11203
11204{ # Closure
11205 my $current_property;
11206
11207 sub filter_old_style_proplist {
11208 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11209 # was in a completely different syntax. Ken Whistler of Unicode says
11210 # that it was something he used as an aid for his own purposes, but
11211 # was never an official part of the standard. However, comments in
11212 # DAge.txt indicate that non-character code points were available in
11213 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11214 # there except through this file (but on the other hand, they first
11215 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11216 # not. But the claim is that it was published as an aid to others who
11217 # might want some more information than was given in the official UCD
11218 # of the time. Many of the properties in it were incorporated into
11219 # the later PropList.txt, but some were not. This program uses this
11220 # early file to generate property tables that are otherwise not
11221 # accessible in the early UCD's, and most were probably not really
11222 # official at that time, so one could argue that it should be ignored,
11223 # and you can easily modify things to skip this. And there are bugs
11224 # in this file in various versions. (For example, the 2.1.9 version
11225 # removes from Alphabetic the CJK range starting at 4E00, and they
11226 # weren't added back in until 3.1.0.) Many of this file's properties
11227 # were later sanctioned, so this code generates tables for those
11228 # properties that aren't otherwise in the UCD of the time but
11229 # eventually did become official, and throws away the rest. Here is a
11230 # list of all the ones that are thrown away:
11231 # Bidi=* duplicates UnicodeData.txt
11232 # Combining never made into official property;
11233 # is \P{ccc=0}
11234 # Composite never made into official property.
11235 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11236 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11237 # Delimiter never made into official property;
11238 # removed in 3.0.1
11239 # Format Control never made into official property;
11240 # similar to gc=cf
11241 # High Surrogate duplicates Blocks.txt
11242 # Ignorable Control never made into official property;
11243 # similar to di=y
11244 # ISO Control duplicates UnicodeData.txt: gc=cc
11245 # Left of Pair never made into official property;
11246 # Line Separator duplicates UnicodeData.txt: gc=zl
11247 # Low Surrogate duplicates Blocks.txt
11248 # Non-break was actually listed as a property
11249 # in 3.2, but without any code
11250 # points. Unicode denies that this
11251 # was ever an official property
11252 # Non-spacing duplicate UnicodeData.txt: gc=mn
11253 # Numeric duplicates UnicodeData.txt: gc=cc
11254 # Paired Punctuation never made into official property;
11255 # appears to be gc=ps + gc=pe
11256 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11257 # Private Use duplicates UnicodeData.txt: gc=co
11258 # Private Use High Surrogate duplicates Blocks.txt
11259 # Punctuation duplicates UnicodeData.txt: gc=p
11260 # Space different definition than eventual
11261 # one.
11262 # Titlecase duplicates UnicodeData.txt: gc=lt
11263 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
98dc9551 11264 # Zero-width never made into official property;
99870f4d
KW
11265 # subset of gc=cf
11266 # Most of the properties have the same names in this file as in later
11267 # versions, but a couple do not.
11268 #
11269 # This subroutine filters $_, converting it from the old style into
11270 # the new style. Here's a sample of the old-style
11271 #
11272 # *******************************************
11273 #
11274 # Property dump for: 0x100000A0 (Join Control)
11275 #
11276 # 200C..200D (2 chars)
11277 #
11278 # In the example, the property is "Join Control". It is kept in this
11279 # closure between calls to the subroutine. The numbers beginning with
11280 # 0x were internal to Ken's program that generated this file.
11281
11282 # If this line contains the property name, extract it.
11283 if (/^Property dump for: [^(]*\((.*)\)/) {
11284 $_ = $1;
11285
11286 # Convert white space to underscores.
11287 s/ /_/g;
11288
11289 # Convert the few properties that don't have the same name as
11290 # their modern counterparts
11291 s/Identifier_Part/ID_Continue/
11292 or s/Not_a_Character/NChar/;
11293
11294 # If the name matches an existing property, use it.
11295 if (defined property_ref($_)) {
11296 trace "new property=", $_ if main::DEBUG && $to_trace;
11297 $current_property = $_;
11298 }
11299 else { # Otherwise discard it
11300 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11301 undef $current_property;
11302 }
11303 $_ = ""; # The property is saved for the next lines of the
11304 # file, but this defining line is of no further use,
11305 # so clear it so that the caller won't process it
11306 # further.
11307 }
11308 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11309
11310 # Here, the input line isn't a header defining a property for the
11311 # following section, and either we aren't in such a section, or
11312 # the line doesn't look like one that defines the code points in
11313 # such a section. Ignore this line.
11314 $_ = "";
11315 }
11316 else {
11317
11318 # Here, we have a line defining the code points for the current
11319 # stashed property. Anything starting with the first blank is
11320 # extraneous. Otherwise, it should look like a normal range to
11321 # the caller. Append the property name so that it looks just like
11322 # a modern PropList entry.
11323
11324 $_ =~ s/\s.*//;
11325 $_ .= "; $current_property";
11326 }
11327 trace $_ if main::DEBUG && $to_trace;
11328 return;
11329 }
11330} # End closure for old style proplist
11331
11332sub filter_old_style_normalization_lines {
11333 # For early releases of Unicode, the lines were like:
11334 # 74..2A76 ; NFKD_NO
11335 # For later releases this became:
11336 # 74..2A76 ; NFKD_QC; N
11337 # Filter $_ to look like those in later releases.
11338 # Similarly for MAYBEs
11339
11340 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11341
11342 # Also, the property FC_NFKC was abbreviated to FNC
11343 s/FNC/FC_NFKC/;
11344 return;
11345}
11346
82aed44a
KW
11347sub setup_script_extensions {
11348 # The Script_Extensions property starts out with a clone of the Script
11349 # property.
11350
4fec90df
KW
11351 my $scx = property_ref("Script_Extensions");
11352 $scx = Property->new("scx", Full_Name => "Script_Extensions")
11353 if ! defined $scx;
11354 $scx->_set_format($STRING_WHITE_SPACE_LIST);
11355 $scx->initialize($script);
11356 $scx->set_default_map($script->default_map);
11357 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
82aed44a
KW
11358 $scx->add_comment(join_lines( <<END
11359The values for code points that appear in one script are just the same as for
11360the 'Script' property. Likewise the values for those that appear in many
11361scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11362values of code points that appear in a few scripts are a space separated list
11363of those scripts.
11364END
11365 ));
11366
8d35804a 11367 # Initialize scx's tables and the aliases for them to be the same as sc's
4fec90df 11368 foreach my $table ($script->tables) {
82aed44a
KW
11369 my $scx_table = $scx->add_match_table($table->name,
11370 Full_Name => $table->full_name);
11371 foreach my $alias ($table->aliases) {
11372 $scx_table->add_alias($alias->name);
11373 }
11374 }
11375}
11376
fbe1e607
KW
11377sub filter_script_extensions_line {
11378 # The Scripts file comes with the full name for the scripts; the
11379 # ScriptExtensions, with the short name. The final mapping file is a
11380 # combination of these, and without adjustment, would have inconsistent
11381 # entries. This filters the latter file to convert to full names.
11382 # Entries look like this:
11383 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11384
11385 my @fields = split /\s*;\s*/;
11386 my @full_names;
11387 foreach my $short_name (split " ", $fields[1]) {
11388 push @full_names, $script->table($short_name)->full_name;
11389 }
11390 $fields[1] = join " ", @full_names;
11391 $_ = join "; ", @fields;
11392
11393 return;
11394}
11395
ce432655 11396sub setup_early_name_alias {
58b75e36
KW
11397 property_ref('Name_Alias')->add_map(7, 7, "ALERT: control");
11398}
11399
11400sub filter_early_version_name_alias_line {
11401 $_ .= ": correction";
11402 return;
dcd72625
KW
11403}
11404
99870f4d
KW
11405sub finish_Unicode() {
11406 # This routine should be called after all the Unicode files have been read
11407 # in. It:
11408 # 1) Adds the mappings for code points missing from the files which have
11409 # defaults specified for them.
11410 # 2) At this this point all mappings are known, so it computes the type of
11411 # each property whose type hasn't been determined yet.
11412 # 3) Calculates all the regular expression match tables based on the
11413 # mappings.
11414 # 3) Calculates and adds the tables which are defined by Unicode, but
11415 # which aren't derived by them
11416
11417 # For each property, fill in any missing mappings, and calculate the re
11418 # match tables. If a property has more than one missing mapping, the
11419 # default is a reference to a data structure, and requires data from other
11420 # properties to resolve. The sort is used to cause these to be processed
11421 # last, after all the other properties have been calculated.
11422 # (Fortunately, the missing properties so far don't depend on each other.)
11423 foreach my $property
11424 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11425 property_ref('*'))
11426 {
11427 # $perl has been defined, but isn't one of the Unicode properties that
11428 # need to be finished up.
11429 next if $property == $perl;
11430
9f877a47
KW
11431 # Nor do we need to do anything with properties that aren't going to
11432 # be output.
11433 next if $property->fate == $SUPPRESSED;
11434
99870f4d
KW
11435 # Handle the properties that have more than one possible default
11436 if (ref $property->default_map) {
11437 my $default_map = $property->default_map;
11438
11439 # These properties have stored in the default_map:
11440 # One or more of:
11441 # 1) A default map which applies to all code points in a
11442 # certain class
11443 # 2) an expression which will evaluate to the list of code
11444 # points in that class
11445 # And
11446 # 3) the default map which applies to every other missing code
11447 # point.
11448 #
11449 # Go through each list.
11450 while (my ($default, $eval) = $default_map->get_next_defaults) {
11451
11452 # Get the class list, and intersect it with all the so-far
11453 # unspecified code points yielding all the code points
11454 # in the class that haven't been specified.
11455 my $list = eval $eval;
11456 if ($@) {
11457 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11458 last;
11459 }
11460
11461 # Narrow down the list to just those code points we don't have
11462 # maps for yet.
11463 $list = $list & $property->inverse_list;
11464
11465 # Add mappings to the property for each code point in the list
11466 foreach my $range ($list->ranges) {
56343c78
KW
11467 $property->add_map($range->start, $range->end, $default,
11468 Replace => $CROAK);
99870f4d
KW
11469 }
11470 }
11471
11472 # All remaining code points have the other mapping. Set that up
11473 # so the normal single-default mapping code will work on them
11474 $property->set_default_map($default_map->other_default);
11475
11476 # And fall through to do that
11477 }
11478
11479 # We should have enough data now to compute the type of the property.
11480 $property->compute_type;
11481 my $property_type = $property->type;
11482
11483 next if ! $property->to_create_match_tables;
11484
11485 # Here want to create match tables for this property
11486
11487 # The Unicode db always (so far, and they claim into the future) have
11488 # the default for missing entries in binary properties be 'N' (unless
11489 # there is a '@missing' line that specifies otherwise)
11490 if ($property_type == $BINARY && ! defined $property->default_map) {
11491 $property->set_default_map('N');
11492 }
11493
11494 # Add any remaining code points to the mapping, using the default for
5d7f7709 11495 # missing code points.
d8fb1cc3 11496 my $default_table;
99870f4d 11497 if (defined (my $default_map = $property->default_map)) {
1520492f 11498
f4c2a127 11499 # Make sure there is a match table for the default
f4c2a127
KW
11500 if (! defined ($default_table = $property->table($default_map))) {
11501 $default_table = $property->add_match_table($default_map);
11502 }
11503
a92d5c2e
KW
11504 # And, if the property is binary, the default table will just
11505 # be the complement of the other table.
11506 if ($property_type == $BINARY) {
11507 my $non_default_table;
11508
11509 # Find the non-default table.
11510 for my $table ($property->tables) {
11511 next if $table == $default_table;
11512 $non_default_table = $table;
11513 }
11514 $default_table->set_complement($non_default_table);
11515 }
862fd107 11516 else {
a92d5c2e 11517
3981d009
KW
11518 # This fills in any missing values with the default. It's not
11519 # necessary to do this with binary properties, as the default
11520 # is defined completely in terms of the Y table.
6189eadc 11521 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
3981d009 11522 $default_map, Replace => $NO);
862fd107 11523 }
99870f4d
KW
11524 }
11525
11526 # Have all we need to populate the match tables.
11527 my $property_name = $property->name;
56557540 11528 my $maps_should_be_defined = $property->pre_declared_maps;
99870f4d
KW
11529 foreach my $range ($property->ranges) {
11530 my $map = $range->value;
f5e9a6ca 11531 my $table = $property->table($map);
99870f4d
KW
11532 if (! defined $table) {
11533
11534 # Integral and rational property values are not necessarily
56557540
KW
11535 # defined in PropValueAliases, but whether all the other ones
11536 # should be depends on the property.
11537 if ($maps_should_be_defined
99870f4d
KW
11538 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11539 {
11540 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
11541 }
f5e9a6ca 11542 $table = $property->add_match_table($map);
99870f4d
KW
11543 }
11544
862fd107 11545 next if $table->complement != 0; # Don't need to populate these
99870f4d
KW
11546 $table->add_range($range->start, $range->end);
11547 }
11548
06f26c45
KW
11549 # A forced binary property has additional true/false tables which
11550 # should have been set up when it was forced into binary. The false
11551 # table matches exactly the same set as the property's default table.
11552 # The true table matches the complement of that. The false table is
11553 # not the same as an additional set of aliases on top of the default
11554 # table, so use 'set_equivalent_to'. If it were implemented as
11555 # additional aliases, various things would have to be adjusted, but
11556 # especially, if the user wants to get a list of names for the table
11557 # using Unicode::UCD::prop_value_aliases(), s/he should get a
11558 # different set depending on whether they want the default table or
11559 # the false table.
11560 if ($property_type == $FORCED_BINARY) {
11561 $property->table('N')->set_equivalent_to($default_table,
11562 Related => 1);
11563 $property->table('Y')->set_complement($default_table);
11564 }
11565
807807b7
KW
11566 # For Perl 5.6 compatibility, all properties matchable in regexes can
11567 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
11568 # But warn if this creates a conflict with a (new) Unicode property
11569 # name, although it appears that Unicode has made a decision never to
11570 # begin a property name with 'Is_', so this shouldn't happen.
99870f4d
KW
11571 foreach my $alias ($property->aliases) {
11572 my $Is_name = 'Is_' . $alias->name;
807807b7 11573 if (defined (my $pre_existing = property_ref($Is_name))) {
99870f4d 11574 Carp::my_carp(<<END
807807b7
KW
11575There is already an alias named $Is_name (from " . $pre_existing . "), so
11576creating one for $property won't work. This is bad news. If it is not too
11577late, get Unicode to back off. Otherwise go back to the old scheme (findable
11578from the git blame log for this area of the code that suppressed individual
11579aliases that conflict with the new Unicode names. Proceeding anyway.
99870f4d
KW
11580END
11581 );
99870f4d
KW
11582 }
11583 } # End of loop through aliases for this property
11584 } # End of loop through all Unicode properties.
11585
11586 # Fill in the mappings that Unicode doesn't completely furnish. First the
11587 # single letter major general categories. If Unicode were to start
11588 # delivering the values, this would be redundant, but better that than to
11589 # try to figure out if should skip and not get it right. Ths could happen
11590 # if a new major category were to be introduced, and the hard-coded test
11591 # wouldn't know about it.
11592 # This routine depends on the standard names for the general categories
11593 # being what it thinks they are, like 'Cn'. The major categories are the
11594 # union of all the general category tables which have the same first
11595 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11596 foreach my $minor_table ($gc->tables) {
11597 my $minor_name = $minor_table->name;
11598 next if length $minor_name == 1;
11599 if (length $minor_name != 2) {
11600 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11601 next;
11602 }
11603
11604 my $major_name = uc(substr($minor_name, 0, 1));
11605 my $major_table = $gc->table($major_name);
11606 $major_table += $minor_table;
11607 }
11608
11609 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11610 # defines it as LC)
11611 my $LC = $gc->table('LC');
11612 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11613 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11614
11615
11616 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11617 # deliver the correct values in it
11618 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11619
11620 # Lt not in release 1.
a5c376b7
KW
11621 if (defined $gc->table('Lt')) {
11622 $LC += $gc->table('Lt');
11623 $gc->table('Lt')->set_caseless_equivalent($LC);
11624 }
99870f4d
KW
11625 }
11626 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11627
a5c376b7
KW
11628 $gc->table('Ll')->set_caseless_equivalent($LC);
11629 $gc->table('Lu')->set_caseless_equivalent($LC);
11630
99870f4d 11631 my $Cs = $gc->table('Cs');
99870f4d
KW
11632
11633
11634 # Folding information was introduced later into Unicode data. To get
11635 # Perl's case ignore (/i) to work at all in releases that don't have
11636 # folding, use the best available alternative, which is lower casing.
11637 my $fold = property_ref('Simple_Case_Folding');
11638 if ($fold->is_empty) {
11639 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11640 $fold->add_note(join_lines(<<END
11641WARNING: This table uses lower case as a substitute for missing fold
11642information
11643END
11644 ));
11645 }
11646
11647 # Multiple-character mapping was introduced later into Unicode data. If
11648 # missing, use the single-characters maps as best available alternative
11649 foreach my $map (qw { Uppercase_Mapping
11650 Lowercase_Mapping
11651 Titlecase_Mapping
11652 Case_Folding
11653 } ) {
11654 my $full = property_ref($map);
11655 if ($full->is_empty) {
11656 my $simple = property_ref('Simple_' . $map);
11657 $full->initialize($simple);
11658 $full->add_comment($simple->comment) if ($simple->comment);
11659 $full->add_note(join_lines(<<END
11660WARNING: This table uses simple mapping (single-character only) as a
11661substitute for missing multiple-character information
11662END
11663 ));
11664 }
11665 }
82aed44a
KW
11666
11667 # The Script_Extensions property started out as a clone of the Script
11668 # property. But processing its data file caused some elements to be
11669 # replaced with different data. (These elements were for the Common and
11670 # Inherited properties.) This data is a qw() list of all the scripts that
11671 # the code points in the given range are in. An example line is:
11672 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
11673 #
11674 # The code above has created a new match table named "Arab Syrc Thaa"
11675 # which contains 060C. (The cloned table started out with this code point
11676 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
11677 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
11678 # match table. This is repeated for all these tables and ranges. The map
11679 # data is retained in the map table for reference, but the spurious match
11680 # tables are deleted.
11681
11682 my $scx = property_ref("Script_Extensions");
d53a7e7d 11683 if (defined $scx) {
c3a37f64
KW
11684 foreach my $table ($scx->tables) {
11685 next unless $table->name =~ /\s/; # All the new and only the new
11686 # tables have a space in their
11687 # names
11688 my @scripts = split /\s+/, $table->name;
11689 foreach my $script (@scripts) {
11690 my $script_table = $scx->table($script);
11691 $script_table += $table;
11692 }
11693 $scx->delete_match_table($table);
82aed44a 11694 }
d53a7e7d 11695 }
82aed44a
KW
11696
11697 return;
99870f4d
KW
11698}
11699
11700sub compile_perl() {
11701 # Create perl-defined tables. Almost all are part of the pseudo-property
11702 # named 'perl' internally to this program. Many of these are recommended
11703 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11704 # on those found there.
11705 # Almost all of these are equivalent to some Unicode property.
11706 # A number of these properties have equivalents restricted to the ASCII
11707 # range, with their names prefaced by 'Posix', to signify that these match
11708 # what the Posix standard says they should match. A couple are
11709 # effectively this, but the name doesn't have 'Posix' in it because there
cbc24f92
KW
11710 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
11711 # to the full Unicode range, by our guesses as to what is appropriate.
99870f4d
KW
11712
11713 # 'Any' is all code points. As an error check, instead of just setting it
11714 # to be that, construct it to be the union of all the major categories
7fc6cb55 11715 $Any = $perl->add_match_table('Any',
6189eadc 11716 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
99870f4d
KW
11717 Matches_All => 1);
11718
11719 foreach my $major_table ($gc->tables) {
11720
11721 # Major categories are the ones with single letter names.
11722 next if length($major_table->name) != 1;
11723
11724 $Any += $major_table;
11725 }
11726
6189eadc 11727 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
99870f4d
KW
11728 Carp::my_carp_bug("Generated highest code point ("
11729 . sprintf("%X", $Any->max)
6189eadc 11730 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
99870f4d
KW
11731 }
11732 if ($Any->range_count != 1 || $Any->min != 0) {
11733 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11734 }
11735
11736 $Any->add_alias('All');
11737
11738 # Assigned is the opposite of gc=unassigned
11739 my $Assigned = $perl->add_match_table('Assigned',
11740 Description => "All assigned code points",
11741 Initialize => ~ $gc->table('Unassigned'),
11742 );
11743
11744 # Our internal-only property should be treated as more than just a
8050d00f 11745 # synonym; grandfather it in to the pod.
b15a0a3b
KW
11746 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
11747 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
99870f4d
KW
11748 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11749 Related => 1);
11750
11751 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11752 if (defined $block) { # This is equivalent to the block if have it.
11753 my $Unicode_ASCII = $block->table('Basic_Latin');
11754 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11755 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11756 }
11757 }
11758
11759 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11760 # necessary
11761 if ($ASCII->is_empty) {
11762 $ASCII->initialize([ 0..127 ]);
11763 }
11764
99870f4d
KW
11765 # Get the best available case definitions. Early Unicode versions didn't
11766 # have Uppercase and Lowercase defined, so use the general category
11767 # instead for them.
11768 my $Lower = $perl->add_match_table('Lower');
11769 my $Unicode_Lower = property_ref('Lowercase');
11770 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11771 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
a5c376b7
KW
11772 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11773 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11774 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11775
99870f4d
KW
11776 }
11777 else {
11778 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11779 Related => 1);
11780 }
cbc24f92 11781 $Lower->add_alias('XPosixLower');
a5c376b7 11782 my $Posix_Lower = $perl->add_match_table("PosixLower",
ad5e8af1
KW
11783 Description => "[a-z]",
11784 Initialize => $Lower & $ASCII,
11785 );
99870f4d
KW
11786
11787 my $Upper = $perl->add_match_table('Upper');
11788 my $Unicode_Upper = property_ref('Uppercase');
11789 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11790 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
a5c376b7
KW
11791 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11792 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11793 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
99870f4d
KW
11794 }
11795 else {
11796 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11797 Related => 1);
11798 }
cbc24f92 11799 $Upper->add_alias('XPosixUpper');
a5c376b7 11800 my $Posix_Upper = $perl->add_match_table("PosixUpper",
ad5e8af1
KW
11801 Description => "[A-Z]",
11802 Initialize => $Upper & $ASCII,
11803 );
99870f4d
KW
11804
11805 # Earliest releases didn't have title case. Initialize it to empty if not
11806 # otherwise present
4364919a
KW
11807 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11808 Description => '(= \p{Gc=Lt})');
99870f4d 11809 my $lt = $gc->table('Lt');
a5c376b7
KW
11810
11811 # Earlier versions of mktables had this related to $lt since they have
c12f2655
KW
11812 # identical code points, but their caseless equivalents are not the same,
11813 # one being 'Cased' and the other being 'LC', and so now must be kept as
11814 # separate entities.
a5c376b7 11815 $Title += $lt if defined $lt;
99870f4d
KW
11816
11817 # If this Unicode version doesn't have Cased, set up our own. From
11818 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11819 # and only if C has the Lowercase or Uppercase property or has a
11820 # General_Category value of Titlecase_Letter.
a5c376b7
KW
11821 my $Unicode_Cased = property_ref('Cased');
11822 unless (defined $Unicode_Cased) {
99870f4d
KW
11823 my $cased = $perl->add_match_table('Cased',
11824 Initialize => $Lower + $Upper + $Title,
11825 Description => 'Uppercase or Lowercase or Titlecase',
11826 );
a5c376b7 11827 $Unicode_Cased = $cased;
99870f4d 11828 }
a5c376b7 11829 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
99870f4d
KW
11830
11831 # Similarly, set up our own Case_Ignorable property if this Unicode
11832 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11833 # C is defined to be case-ignorable if C has the value MidLetter or the
11834 # value MidNumLet for the Word_Break property or its General_Category is
11835 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11836 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11837
8050d00f
KW
11838 # Perl has long had an internal-only alias for this property; grandfather
11839 # it in to the pod, but discourage its use.
11840 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
b15a0a3b
KW
11841 Re_Pod_Entry => 1,
11842 Fate => $INTERNAL_ONLY,
11843 Status => $DISCOURAGED);
99870f4d
KW
11844 my $case_ignorable = property_ref('Case_Ignorable');
11845 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11846 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11847 Related => 1);
11848 }
11849 else {
11850
11851 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11852
11853 # The following three properties are not in early releases
11854 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11855 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11856 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11857
11858 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11859 # correspondingly the case-ignorable definition lacks that one. For
11860 # 4.0, it appears that it was meant to be the same definition, but was
11861 # inadvertently omitted from the standard's text, so add it if the
11862 # property actually is there
11863 my $wb = property_ref('Word_Break');
11864 if (defined $wb) {
11865 my $midlet = $wb->table('MidLetter');
11866 $perl_case_ignorable += $midlet if defined $midlet;
11867 my $midnumlet = $wb->table('MidNumLet');
11868 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11869 }
11870 else {
11871
11872 # In earlier versions of the standard, instead of the above two
11873 # properties , just the following characters were used:
11874 $perl_case_ignorable += 0x0027 # APOSTROPHE
11875 + 0x00AD # SOFT HYPHEN (SHY)
11876 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11877 }
11878 }
11879
11880 # The remaining perl defined tables are mostly based on Unicode TR 18,
11881 # "Annex C: Compatibility Properties". All of these have two versions,
11882 # one whose name generally begins with Posix that is posix-compliant, and
11883 # one that matches Unicode characters beyond the Posix, ASCII range
11884
ad5e8af1 11885 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
11886
11887 # Alphabetic was not present in early releases
11888 my $Alphabetic = property_ref('Alphabetic');
11889 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11890 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11891 }
11892 else {
11893
11894 # For early releases, we don't get it exactly right. The below
11895 # includes more than it should, which in 5.2 terms is: L + Nl +
11896 # Other_Alphabetic. Other_Alphabetic contains many characters from
11897 # Mn and Mc. It's better to match more than we should, than less than
11898 # we should.
11899 $Alpha->initialize($gc->table('Letter')
11900 + $gc->table('Mn')
11901 + $gc->table('Mc'));
11902 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 11903 $Alpha->add_description('Alphabetic');
99870f4d 11904 }
cbc24f92 11905 $Alpha->add_alias('XPosixAlpha');
a5c376b7 11906 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
ad5e8af1
KW
11907 Description => "[A-Za-z]",
11908 Initialize => $Alpha & $ASCII,
11909 );
a5c376b7
KW
11910 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11911 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
99870f4d
KW
11912
11913 my $Alnum = $perl->add_match_table('Alnum',
56339b2c 11914 Description => 'Alphabetic and (decimal) Numeric',
99870f4d
KW
11915 Initialize => $Alpha + $gc->table('Decimal_Number'),
11916 );
cbc24f92 11917 $Alnum->add_alias('XPosixAlnum');
ad5e8af1
KW
11918 $perl->add_match_table("PosixAlnum",
11919 Description => "[A-Za-z0-9]",
11920 Initialize => $Alnum & $ASCII,
11921 );
99870f4d
KW
11922
11923 my $Word = $perl->add_match_table('Word',
d35dd6c6
KW
11924 Description => '\w, including beyond ASCII;'
11925 . ' = \p{Alnum} + \pM + \p{Pc}',
99870f4d
KW
11926 Initialize => $Alnum + $gc->table('Mark'),
11927 );
cbc24f92 11928 $Word->add_alias('XPosixWord');
99870f4d
KW
11929 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11930 $Word += $Pc if defined $Pc;
11931
f38f76ae 11932 # This is a Perl extension, so the name doesn't begin with Posix.
cbc24f92 11933 my $PerlWord = $perl->add_match_table('PerlWord',
99870f4d
KW
11934 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11935 Initialize => $Word & $ASCII,
11936 );
cbc24f92 11937 $PerlWord->add_alias('PosixWord');
99870f4d
KW
11938
11939 my $Blank = $perl->add_match_table('Blank',
11940 Description => '\h, Horizontal white space',
11941
11942 # 200B is Zero Width Space which is for line
11943 # break control, and was listed as
11944 # Space_Separator in early releases
11945 Initialize => $gc->table('Space_Separator')
11946 + 0x0009 # TAB
11947 - 0x200B, # ZWSP
11948 );
11949 $Blank->add_alias('HorizSpace'); # Another name for it.
cbc24f92 11950 $Blank->add_alias('XPosixBlank');
ad5e8af1
KW
11951 $perl->add_match_table("PosixBlank",
11952 Description => "\\t and ' '",
11953 Initialize => $Blank & $ASCII,
11954 );
99870f4d
KW
11955
11956 my $VertSpace = $perl->add_match_table('VertSpace',
11957 Description => '\v',
11958 Initialize => $gc->table('Line_Separator')
11959 + $gc->table('Paragraph_Separator')
11960 + 0x000A # LINE FEED
11961 + 0x000B # VERTICAL TAB
11962 + 0x000C # FORM FEED
11963 + 0x000D # CARRIAGE RETURN
11964 + 0x0085, # NEL
11965 );
11966 # No Posix equivalent for vertical space
11967
11968 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
11969 Description => '\s including beyond ASCII plus vertical tab',
11970 Initialize => $Blank + $VertSpace,
99870f4d 11971 );
cbc24f92 11972 $Space->add_alias('XPosixSpace');
ad5e8af1 11973 $perl->add_match_table("PosixSpace",
f38f76ae 11974 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
11975 Initialize => $Space & $ASCII,
11976 );
99870f4d
KW
11977
11978 # Perl's traditional space doesn't include Vertical Tab
cbc24f92 11979 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
99870f4d
KW
11980 Description => '\s, including beyond ASCII',
11981 Initialize => $Space - 0x000B,
11982 );
cbc24f92
KW
11983 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
11984 my $PerlSpace = $perl->add_match_table('PerlSpace',
de25ec47
KW
11985 Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11986 Initialize => $XPerlSpace & $ASCII,
99870f4d
KW
11987 );
11988
cbc24f92 11989
99870f4d 11990 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 11991 Description => 'Control characters');
99870f4d 11992 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
cbc24f92 11993 $Cntrl->add_alias('XPosixCntrl');
ad5e8af1 11994 $perl->add_match_table("PosixCntrl",
f38f76ae 11995 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
11996 Initialize => $Cntrl & $ASCII,
11997 );
99870f4d
KW
11998
11999 # $controls is a temporary used to construct Graph.
12000 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
12001 + $gc->table('Control'));
12002 # Cs not in release 1
12003 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
12004
12005 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
12006 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 12007 Description => 'Characters that are graphical',
99870f4d
KW
12008 Initialize => ~ ($Space + $controls),
12009 );
cbc24f92 12010 $Graph->add_alias('XPosixGraph');
ad5e8af1 12011 $perl->add_match_table("PosixGraph",
f38f76ae
KW
12012 Description =>
12013 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
12014 Initialize => $Graph & $ASCII,
12015 );
99870f4d 12016
3e20195b 12017 $print = $perl->add_match_table('Print',
ad5e8af1 12018 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 12019 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 12020 );
cbc24f92 12021 $print->add_alias('XPosixPrint');
ad5e8af1 12022 $perl->add_match_table("PosixPrint",
66fd7fd0 12023 Description =>
f38f76ae 12024 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 12025 Initialize => $print & $ASCII,
ad5e8af1 12026 );
99870f4d
KW
12027
12028 my $Punct = $perl->add_match_table('Punct');
12029 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
12030
12031 # \p{punct} doesn't include the symbols, which posix does
cbc24f92
KW
12032 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
12033 Description => '\p{Punct} + ASCII-range \p{Symbol}',
12034 Initialize => $gc->table('Punctuation')
12035 + ($ASCII & $gc->table('Symbol')),
bb080638 12036 Perl_Extension => 1
cbc24f92 12037 );
bb080638 12038 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
f38f76ae 12039 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
cbc24f92 12040 Initialize => $ASCII & $XPosixPunct,
ad5e8af1 12041 );
99870f4d
KW
12042
12043 my $Digit = $perl->add_match_table('Digit',
f3a73f6e 12044 Description => '[0-9] + all other decimal digits');
99870f4d 12045 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
cbc24f92 12046 $Digit->add_alias('XPosixDigit');
ad5e8af1
KW
12047 my $PosixDigit = $perl->add_match_table("PosixDigit",
12048 Description => '[0-9]',
12049 Initialize => $Digit & $ASCII,
12050 );
99870f4d 12051
eadadd41
KW
12052 # Hex_Digit was not present in first release
12053 my $Xdigit = $perl->add_match_table('XDigit');
cbc24f92 12054 $Xdigit->add_alias('XPosixXDigit');
eadadd41
KW
12055 my $Hex = property_ref('Hex_Digit');
12056 if (defined $Hex && ! $Hex->is_empty) {
12057 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
12058 }
12059 else {
eadadd41
KW
12060 # (Have to use hex instead of e.g. '0', because could be running on an
12061 # non-ASCII machine, and we want the Unicode (ASCII) values)
12062 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
12063 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
12064 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d 12065 }
4efcc33b
KW
12066
12067 # AHex was not present in early releases
12068 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
12069 my $AHex = property_ref('ASCII_Hex_Digit');
12070 if (defined $AHex && ! $AHex->is_empty) {
12071 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
12072 }
12073 else {
12074 $PosixXDigit->initialize($Xdigit & $ASCII);
12075 }
12076 $PosixXDigit->add_description('[0-9A-Fa-f]');
99870f4d 12077
99870f4d
KW
12078 my $dt = property_ref('Decomposition_Type');
12079 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
12080 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
12081 Perl_Extension => 1,
d57ccc9a 12082 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
12083 );
12084
12085 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
12086 # than SD appeared, construct it ourselves, based on the first release SD
8050d00f 12087 # was in. A pod entry is grandfathered in for it
33e96e72 12088 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
301ba948
KW
12089 Perl_Extension => 1,
12090 Fate => $INTERNAL_ONLY,
12091 Status => $DISCOURAGED);
99870f4d
KW
12092 my $soft_dotted = property_ref('Soft_Dotted');
12093 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
12094 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
12095 }
12096 else {
12097
12098 # This list came from 3.2 Soft_Dotted.
12099 $CanonDCIJ->initialize([ 0x0069,
12100 0x006A,
12101 0x012F,
12102 0x0268,
12103 0x0456,
12104 0x0458,
12105 0x1E2D,
12106 0x1ECB,
12107 ]);
12108 $CanonDCIJ = $CanonDCIJ & $Assigned;
12109 }
12110
f86864ac 12111 # These are used in Unicode's definition of \X
6ba2d696 12112 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
301ba948 12113 Fate => $INTERNAL_ONLY);
6ba2d696 12114 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
301ba948 12115 Fate => $INTERNAL_ONLY);
37e2e78e 12116
ee24a51c
KW
12117 # For backward compatibility, Perl has its own definition for IDStart
12118 # First, we include the underscore, and then the regular XID_Start also
12119 # have to be Words
12120 $perl->add_match_table('_Perl_IDStart',
12121 Perl_Extension => 1,
301ba948 12122 Fate => $INTERNAL_ONLY,
ee24a51c
KW
12123 Initialize =>
12124 ord('_')
12125 + (property_ref('XID_Start')->table('Y') & $Word)
12126 );
12127
99870f4d 12128 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 12129
678f13d5 12130 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
12131 # definition differs too much from the traditional Perl one to use.
12132 if (defined $gcb && defined $gcb->table('SpacingMark')) {
12133
12134 # Note that assumes HST is defined; it came in an earlier release than
12135 # GCB. In the line below, two negatives means: yes hangul
12136 $begin += ~ property_ref('Hangul_Syllable_Type')
12137 ->table('Not_Applicable')
12138 + ~ ($gcb->table('Control')
12139 + $gcb->table('CR')
12140 + $gcb->table('LF'));
12141 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12142
12143 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12144 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
12145 }
12146 else { # Old definition, used on early releases.
f86864ac 12147 $extend += $gc->table('Mark')
37e2e78e
KW
12148 + 0x200C # ZWNJ
12149 + 0x200D; # ZWJ
12150 $begin += ~ $extend;
12151
12152 # Here we may have a release that has the regular grapheme cluster
12153 # defined, or a release that doesn't have anything defined.
12154 # We set things up so the Perl core degrades gracefully, possibly with
12155 # placeholders that match nothing.
12156
12157 if (! defined $gcb) {
12158 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
12159 }
12160 my $hst = property_ref('HST');
12161 if (!defined $hst) {
12162 $hst = Property->new('HST', Status => $PLACEHOLDER);
12163 $hst->add_match_table('Not_Applicable',
12164 Initialize => $Any,
12165 Matches_All => 1);
12166 }
12167
12168 # On some releases, here we may not have the needed tables for the
12169 # perl core, in some releases we may.
12170 foreach my $name (qw{ L LV LVT T V prepend }) {
12171 my $table = $gcb->table($name);
12172 if (! defined $table) {
12173 $table = $gcb->add_match_table($name);
12174 push @tables_that_may_be_empty, $table->complete_name;
12175 }
12176
12177 # The HST property predates the GCB one, and has identical tables
12178 # for some of them, so use it if we can.
12179 if ($table->is_empty
12180 && defined $hst
12181 && defined $hst->table($name))
12182 {
12183 $table += $hst->table($name);
12184 }
12185 }
12186 }
12187
12188 # More GCB. If we found some hangul syllables, populate a combined
12189 # table.
301ba948
KW
12190 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
12191 Perl_Extension => 1,
12192 Fate => $INTERNAL_ONLY);
37e2e78e
KW
12193 my $LV = $gcb->table('LV');
12194 if ($LV->is_empty) {
12195 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12196 } else {
12197 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12198 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
12199 }
12200
28093d0e 12201 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
12202 my @composition = ('Name', 'Unicode_1_Name');
12203
12204 if (@named_sequences) {
12205 push @composition, 'Named_Sequence';
12206 foreach my $sequence (@named_sequences) {
12207 $perl_charname->add_anomalous_entry($sequence);
12208 }
12209 }
12210
12211 my $alias_sentence = "";
12212 my $alias = property_ref('Name_Alias');
12213 if (defined $alias) {
12214 push @composition, 'Name_Alias';
12215 $alias->reset_each_range;
12216 while (my ($range) = $alias->each_range) {
12217 next if $range->value eq "";
12218 if ($range->start != $range->end) {
12219 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
12220 }
58b75e36
KW
12221 $perl_charname->add_duplicate($range->start,
12222 $range->value =~ s/:.*//r);
99870f4d
KW
12223 }
12224 $alias_sentence = <<END;
12225The Name_Alias property adds duplicate code point entries with a corrected
12226name. The original (less correct, but still valid) name will be physically
53d84487 12227last.
99870f4d
KW
12228END
12229 }
12230 my $comment;
12231 if (@composition <= 2) { # Always at least 2
12232 $comment = join " and ", @composition;
12233 }
12234 else {
12235 $comment = join ", ", @composition[0 .. scalar @composition - 2];
12236 $comment .= ", and $composition[-1]";
12237 }
12238
99870f4d
KW
12239 $perl_charname->add_comment(join_lines( <<END
12240This file is for charnames.pm. It is the union of the $comment properties.
12241Unicode_1_Name entries are used only for otherwise nameless code
12242points.
12243$alias_sentence
a03f0b9f
KW
12244This file doesn't include the algorithmically determinable names. For those,
12245use 'unicore/Name.pm'
12246END
12247 ));
12248 property_ref('Name')->add_comment(join_lines( <<END
12249This file doesn't include the algorithmically determinable names. For those,
12250use 'unicore/Name.pm'
99870f4d
KW
12251END
12252 ));
12253
99870f4d
KW
12254 # Construct the Present_In property from the Age property.
12255 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12256 my $default_map = $age->default_map;
12257 my $in = Property->new('In',
12258 Default_Map => $default_map,
12259 Full_Name => "Present_In",
99870f4d
KW
12260 Perl_Extension => 1,
12261 Type => $ENUM,
12262 Initialize => $age,
12263 );
12264 $in->add_comment(join_lines(<<END
c12f2655 12265THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
99870f4d
KW
12266same as for $age, and not for what $in really means. This is because anything
12267defined in a given release should have multiple values: that release and all
12268higher ones. But only one value per code point can be represented in a table
12269like this.
12270END
12271 ));
12272
12273 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
12274 # lowest numbered (earliest) come first, with the non-numeric one
12275 # last.
12276 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12277 ? 1
12278 : ($b->name !~ /^[\d.]*$/)
12279 ? -1
12280 : $a->name <=> $b->name
12281 } $age->tables;
12282
12283 # The Present_In property is the cumulative age properties. The first
12284 # one hence is identical to the first age one.
12285 my $previous_in = $in->add_match_table($first_age->name);
12286 $previous_in->set_equivalent_to($first_age, Related => 1);
12287
12288 my $description_start = "Code point's usage introduced in version ";
12289 $first_age->add_description($description_start . $first_age->name);
12290
98dc9551 12291 # To construct the accumulated values, for each of the age tables
99870f4d
KW
12292 # starting with the 2nd earliest, merge the earliest with it, to get
12293 # all those code points existing in the 2nd earliest. Repeat merging
12294 # the new 2nd earliest with the 3rd earliest to get all those existing
12295 # in the 3rd earliest, and so on.
12296 foreach my $current_age (@rest_ages) {
12297 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
12298
12299 my $current_in = $in->add_match_table(
12300 $current_age->name,
12301 Initialize => $current_age + $previous_in,
12302 Description => $description_start
12303 . $current_age->name
12304 . ' or earlier',
12305 );
12306 $previous_in = $current_in;
12307
12308 # Add clarifying material for the corresponding age file. This is
12309 # in part because of the confusing and contradictory information
12310 # given in the Standard's documentation itself, as of 5.2.
12311 $current_age->add_description(
12312 "Code point's usage was introduced in version "
12313 . $current_age->name);
12314 $current_age->add_note("See also $in");
12315
12316 }
12317
12318 # And finally the code points whose usages have yet to be decided are
12319 # the same in both properties. Note that permanently unassigned code
12320 # points actually have their usage assigned (as being permanently
12321 # unassigned), so that these tables are not the same as gc=cn.
12322 my $unassigned = $in->add_match_table($default_map);
12323 my $age_default = $age->table($default_map);
12324 $age_default->add_description(<<END
12325Code point's usage has not been assigned in any Unicode release thus far.
12326END
12327 );
12328 $unassigned->set_equivalent_to($age_default, Related => 1);
12329 }
12330
12331
12332 # Finished creating all the perl properties. All non-internal non-string
12333 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
12334 # an underscore.) These do not get a separate entry in the pod file
12335 foreach my $table ($perl->tables) {
12336 foreach my $alias ($table->aliases) {
12337 next if $alias->name =~ /^_/;
12338 $table->add_alias('Is_' . $alias->name,
33e96e72 12339 Re_Pod_Entry => 0,
fd1e3e84 12340 UCD => 0,
99870f4d 12341 Status => $alias->status,
0eac1e20 12342 OK_as_Filename => 0);
99870f4d
KW
12343 }
12344 }
12345
c4019d52
KW
12346 # Here done with all the basic stuff. Ready to populate the information
12347 # about each character if annotating them.
558712cf 12348 if ($annotate) {
c4019d52
KW
12349
12350 # See comments at its declaration
12351 $annotate_ranges = Range_Map->new;
12352
12353 # This separates out the non-characters from the other unassigneds, so
12354 # can give different annotations for each.
12355 $unassigned_sans_noncharacters = Range_List->new(
12356 Initialize => $gc->table('Unassigned')
12357 & property_ref('Noncharacter_Code_Point')->table('N'));
12358
6189eadc 12359 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
c4019d52
KW
12360 $i = populate_char_info($i); # Note sets $i so may cause skips
12361 }
12362 }
12363
99870f4d
KW
12364 return;
12365}
12366
12367sub add_perl_synonyms() {
12368 # A number of Unicode tables have Perl synonyms that are expressed in
12369 # the single-form, \p{name}. These are:
12370 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12371 # \p{Is_Name} as synonyms
12372 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12373 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12374 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12375 # conflict, \p{Value} and \p{Is_Value} as well
12376 #
12377 # This routine generates these synonyms, warning of any unexpected
12378 # conflicts.
12379
12380 # Construct the list of tables to get synonyms for. Start with all the
12381 # binary and the General_Category ones.
06f26c45
KW
12382 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
12383 property_ref('*');
99870f4d
KW
12384 push @tables, $gc->tables;
12385
12386 # If the version of Unicode includes the Script property, add its tables
359523e2 12387 push @tables, $script->tables if defined $script;
99870f4d
KW
12388
12389 # The Block tables are kept separate because they are treated differently.
12390 # And the earliest versions of Unicode didn't include them, so add only if
12391 # there are some.
12392 my @blocks;
12393 push @blocks, $block->tables if defined $block;
12394
12395 # Here, have the lists of tables constructed. Process blocks last so that
12396 # if there are name collisions with them, blocks have lowest priority.
12397 # Should there ever be other collisions, manual intervention would be
12398 # required. See the comments at the beginning of the program for a
12399 # possible way to handle those semi-automatically.
12400 foreach my $table (@tables, @blocks) {
12401
12402 # For non-binary properties, the synonym is just the name of the
12403 # table, like Greek, but for binary properties the synonym is the name
12404 # of the property, and means the code points in its 'Y' table.
12405 my $nominal = $table;
12406 my $nominal_property = $nominal->property;
12407 my $actual;
12408 if (! $nominal->isa('Property')) {
12409 $actual = $table;
12410 }
12411 else {
12412
12413 # Here is a binary property. Use the 'Y' table. Verify that is
12414 # there
12415 my $yes = $nominal->table('Y');
12416 unless (defined $yes) { # Must be defined, but is permissible to
12417 # be empty.
12418 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
12419 next;
12420 }
12421 $actual = $yes;
12422 }
12423
12424 foreach my $alias ($nominal->aliases) {
12425
12426 # Attempt to create a table in the perl directory for the
12427 # candidate table, using whatever aliases in it that don't
12428 # conflict. Also add non-conflicting aliases for all these
12429 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12430 PREFIX:
12431 foreach my $prefix ("", 'Is_', 'In_') {
12432
12433 # Only Block properties can have added 'In_' aliases.
12434 next if $prefix eq 'In_' and $nominal_property != $block;
12435
12436 my $proposed_name = $prefix . $alias->name;
12437
12438 # No Is_Is, In_In, nor combinations thereof
12439 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12440 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12441
12442 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12443
12444 # Get a reference to any existing table in the perl
12445 # directory with the desired name.
12446 my $pre_existing = $perl->table($proposed_name);
12447
12448 if (! defined $pre_existing) {
12449
12450 # No name collision, so ok to add the perl synonym.
12451
33e96e72 12452 my $make_re_pod_entry;
0eac1e20 12453 my $ok_as_filename;
4cd1260a 12454 my $status = $alias->status;
99870f4d
KW
12455 if ($nominal_property == $block) {
12456
12457 # For block properties, the 'In' form is preferred for
12458 # external use; the pod file contains wild cards for
12459 # this and the 'Is' form so no entries for those; and
12460 # we don't want people using the name without the
12461 # 'In', so discourage that.
12462 if ($prefix eq "") {
33e96e72 12463 $make_re_pod_entry = 1;
99870f4d 12464 $status = $status || $DISCOURAGED;
0eac1e20 12465 $ok_as_filename = 0;
99870f4d
KW
12466 }
12467 elsif ($prefix eq 'In_') {
33e96e72 12468 $make_re_pod_entry = 0;
99870f4d 12469 $status = $status || $NORMAL;
0eac1e20 12470 $ok_as_filename = 1;
99870f4d
KW
12471 }
12472 else {
33e96e72 12473 $make_re_pod_entry = 0;
99870f4d 12474 $status = $status || $DISCOURAGED;
0eac1e20 12475 $ok_as_filename = 0;
99870f4d
KW
12476 }
12477 }
12478 elsif ($prefix ne "") {
12479
12480 # The 'Is' prefix is handled in the pod by a wild
12481 # card, and we won't use it for an external name
33e96e72 12482 $make_re_pod_entry = 0;
99870f4d 12483 $status = $status || $NORMAL;
0eac1e20 12484 $ok_as_filename = 0;
99870f4d
KW
12485 }
12486 else {
12487
12488 # Here, is an empty prefix, non block. This gets its
12489 # own pod entry and can be used for an external name.
33e96e72 12490 $make_re_pod_entry = 1;
99870f4d 12491 $status = $status || $NORMAL;
0eac1e20 12492 $ok_as_filename = 1;
99870f4d
KW
12493 }
12494
12495 # Here, there isn't a perl pre-existing table with the
12496 # name. Look through the list of equivalents of this
12497 # table to see if one is a perl table.
12498 foreach my $equivalent ($actual->leader->equivalents) {
12499 next if $equivalent->property != $perl;
12500
12501 # Here, have found a table for $perl. Add this alias
12502 # to it, and are done with this prefix.
12503 $equivalent->add_alias($proposed_name,
33e96e72 12504 Re_Pod_Entry => $make_re_pod_entry,
fd1e3e84
KW
12505
12506 # Currently don't output these in the
12507 # ucd pod, as are strongly discouraged
12508 # from being used
12509 UCD => 0,
12510
99870f4d 12511 Status => $status,
0eac1e20 12512 OK_as_Filename => $ok_as_filename);
99870f4d
KW
12513 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12514 next PREFIX;
12515 }
12516
12517 # Here, $perl doesn't already have a table that is a
12518 # synonym for this property, add one.
12519 my $added_table = $perl->add_match_table($proposed_name,
33e96e72 12520 Re_Pod_Entry => $make_re_pod_entry,
fd1e3e84
KW
12521
12522 # See UCD comment just above
12523 UCD => 0,
12524
99870f4d 12525 Status => $status,
0eac1e20 12526 OK_as_Filename => $ok_as_filename);
99870f4d
KW
12527 # And it will be related to the actual table, since it is
12528 # based on it.
12529 $added_table->set_equivalent_to($actual, Related => 1);
12530 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12531 next;
12532 } # End of no pre-existing.
12533
12534 # Here, there is a pre-existing table that has the proposed
12535 # name. We could be in trouble, but not if this is just a
12536 # synonym for another table that we have already made a child
12537 # of the pre-existing one.
6505c6e2 12538 if ($pre_existing->is_set_equivalent_to($actual)) {
99870f4d
KW
12539 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12540 $pre_existing->add_alias($proposed_name);
12541 next;
12542 }
12543
12544 # Here, there is a name collision, but it still could be ok if
12545 # the tables match the identical set of code points, in which
12546 # case, we can combine the names. Compare each table's code
12547 # point list to see if they are identical.
12548 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12549 if ($pre_existing->matches_identically_to($actual)) {
12550
12551 # Here, they do match identically. Not a real conflict.
12552 # Make the perl version a child of the Unicode one, except
12553 # in the non-obvious case of where the perl name is
12554 # already a synonym of another Unicode property. (This is
12555 # excluded by the test for it being its own parent.) The
12556 # reason for this exclusion is that then the two Unicode
12557 # properties become related; and we don't really know if
12558 # they are or not. We generate documentation based on
12559 # relatedness, and this would be misleading. Code
12560 # later executed in the process will cause the tables to
12561 # be represented by a single file anyway, without making
12562 # it look in the pod like they are necessarily related.
12563 if ($pre_existing->parent == $pre_existing
12564 && ($pre_existing->property == $perl
12565 || $actual->property == $perl))
12566 {
12567 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12568 $pre_existing->set_equivalent_to($actual, Related => 1);
12569 }
12570 elsif (main::DEBUG && $to_trace) {
12571 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12572 trace $pre_existing->parent;
12573 }
12574 next PREFIX;
12575 }
12576
12577 # Here they didn't match identically, there is a real conflict
12578 # between our new name and a pre-existing property.
12579 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12580 $pre_existing->add_conflicting($nominal->full_name,
12581 'p',
12582 $actual);
12583
12584 # Don't output a warning for aliases for the block
12585 # properties (unless they start with 'In_') as it is
12586 # expected that there will be conflicts and the block
12587 # form loses.
12588 if ($verbosity >= $NORMAL_VERBOSITY
12589 && ($actual->property != $block || $prefix eq 'In_'))
12590 {
12591 print simple_fold(join_lines(<<END
12592There is already an alias named $proposed_name (from " . $pre_existing . "),
12593so not creating this alias for " . $actual
12594END
12595 ), "", 4);
12596 }
12597
12598 # Keep track for documentation purposes.
12599 $has_In_conflicts++ if $prefix eq 'In_';
12600 $has_Is_conflicts++ if $prefix eq 'Is_';
12601 }
12602 }
12603 }
12604
12605 # There are some properties which have No and Yes (and N and Y) as
12606 # property values, but aren't binary, and could possibly be confused with
12607 # binary ones. So create caveats for them. There are tables that are
12608 # named 'No', and tables that are named 'N', but confusion is not likely
12609 # unless they are the same table. For example, N meaning Number or
12610 # Neutral is not likely to cause confusion, so don't add caveats to things
12611 # like them.
06f26c45
KW
12612 foreach my $property (grep { $_->type != $BINARY
12613 && $_->type != $FORCED_BINARY }
12614 property_ref('*'))
12615 {
99870f4d
KW
12616 my $yes = $property->table('Yes');
12617 if (defined $yes) {
12618 my $y = $property->table('Y');
12619 if (defined $y && $yes == $y) {
12620 foreach my $alias ($property->aliases) {
12621 $yes->add_conflicting($alias->name);
12622 }
12623 }
12624 }
12625 my $no = $property->table('No');
12626 if (defined $no) {
12627 my $n = $property->table('N');
12628 if (defined $n && $no == $n) {
12629 foreach my $alias ($property->aliases) {
12630 $no->add_conflicting($alias->name, 'P');
12631 }
12632 }
12633 }
12634 }
12635
12636 return;
12637}
12638
12639sub register_file_for_name($$$) {
12640 # Given info about a table and a datafile that it should be associated
98dc9551 12641 # with, register that association
99870f4d
KW
12642
12643 my $table = shift;
12644 my $directory_ref = shift; # Array of the directory path for the file
e6ebc4c0 12645 my $file = shift; # The file name in the final directory.
99870f4d
KW
12646 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12647
12648 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12649
12650 if ($table->isa('Property')) {
12651 $table->set_file_path(@$directory_ref, $file);
48cf9da9 12652 push @map_properties, $table;
315bfd4e
KW
12653
12654 # No swash means don't do the rest of this.
12655 return if $table->fate != $ORDINARY;
12656
12657 # Get the path to the file
12658 my @path = $table->file_path;
12659
12660 # Use just the file name if no subdirectory.
12661 shift @path if $path[0] eq File::Spec->curdir();
12662
12663 my $file = join '/', @path;
12664
12665 # Create a hash entry for utf8_heavy to get the file that stores this
12666 # property's map table
12667 foreach my $alias ($table->aliases) {
12668 my $name = $alias->name;
12669 $loose_property_to_file_of{standardize($name)} = $file;
12670 }
12671
89cf10cc
KW
12672 # And a way for utf8_heavy to find the proper key in the SwashInfo
12673 # hash for this property.
12674 $file_to_swash_name{$file} = "To" . $table->swash_name;
99870f4d
KW
12675 return;
12676 }
12677
12678 # Do all of the work for all equivalent tables when called with the leader
12679 # table, so skip if isn't the leader.
12680 return if $table->leader != $table;
12681
a92d5c2e
KW
12682 # If this is a complement of another file, use that other file instead,
12683 # with a ! prepended to it.
12684 my $complement;
12685 if (($complement = $table->complement) != 0) {
12686 my @directories = $complement->file_path;
12687
12688 # This assumes that the 0th element is something like 'lib',
12689 # the 1th element the property name (in its own directory), like
12690 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12691 # appended to it later.
12692 $directories[1] =~ s/^/!/;
12693 $file = pop @directories;
12694 $directory_ref =\@directories;
12695 }
12696
99870f4d
KW
12697 # Join all the file path components together, using slashes.
12698 my $full_filename = join('/', @$directory_ref, $file);
12699
12700 # All go in the same subdirectory of unicore
12701 if ($directory_ref->[0] ne $matches_directory) {
12702 Carp::my_carp("Unexpected directory in "
12703 . join('/', @{$directory_ref}, $file));
12704 }
12705
12706 # For this table and all its equivalents ...
12707 foreach my $table ($table, $table->equivalents) {
12708
12709 # Associate it with its file internally. Don't include the
12710 # $matches_directory first component
12711 $table->set_file_path(@$directory_ref, $file);
c15fda25
KW
12712
12713 # No swash means don't do the rest of this.
12714 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
12715
99870f4d
KW
12716 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12717
12718 my $property = $table->property;
ae51efca
KW
12719 my $property_name = ($property == $perl)
12720 ? "" # 'perl' is never explicitly stated
12721 : standardize($property->name) . '=';
99870f4d 12722
c15fda25
KW
12723 my $is_default = 0; # Is this table the default one for the property?
12724
12725 # To calculate $is_default, we find if this table is the same as the
12726 # default one for the property. But this is complicated by the
12727 # possibility that there is a master table for this one, and the
12728 # information is stored there instead of here.
9e4a1e86
KW
12729 my $parent = $table->parent;
12730 my $leader_prop = $parent->property;
c15fda25
KW
12731 my $default_map = $leader_prop->default_map;
12732 if (defined $default_map) {
12733 my $default_table = $leader_prop->table($default_map);
12734 $is_default = 1 if defined $default_table && $parent == $default_table;
12735 }
9e4a1e86
KW
12736
12737 # Calculate the loose name for this table. Mostly it's just its name,
12738 # standardized. But in the case of Perl tables that are single-form
12739 # equivalents to Unicode properties, it is the latter's name.
12740 my $loose_table_name =
12741 ($property != $perl || $leader_prop == $perl)
12742 ? standardize($table->name)
12743 : standardize($parent->name);
12744
99870f4d
KW
12745 my $deprecated = ($table->status eq $DEPRECATED)
12746 ? $table->status_info
12747 : "";
d867ccfb 12748 my $caseless_equivalent = $table->caseless_equivalent;
99870f4d
KW
12749
12750 # And for each of the table's aliases... This inner loop eventually
12751 # goes through all aliases in the UCD that we generate regex match
12752 # files for
12753 foreach my $alias ($table->aliases) {
c85f591a 12754 my $standard = utf8_heavy_name($table, $alias);
99870f4d
KW
12755
12756 # Generate an entry in either the loose or strict hashes, which
12757 # will translate the property and alias names combination into the
12758 # file where the table for them is stored.
99870f4d 12759 if ($alias->loose_match) {
99870f4d
KW
12760 if (exists $loose_to_file_of{$standard}) {
12761 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12762 }
12763 else {
12764 $loose_to_file_of{$standard} = $sub_filename;
12765 }
12766 }
12767 else {
99870f4d
KW
12768 if (exists $stricter_to_file_of{$standard}) {
12769 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12770 }
12771 else {
12772 $stricter_to_file_of{$standard} = $sub_filename;
12773
12774 # Tightly coupled with how utf8_heavy.pl works, for a
12775 # floating point number that is a whole number, get rid of
12776 # the trailing decimal point and 0's, so that utf8_heavy
12777 # will work. Also note that this assumes that such a
12778 # number is matched strictly; so if that were to change,
12779 # this would be wrong.
c85f591a 12780 if ((my $integer_name = $alias->name)
99870f4d
KW
12781 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12782 {
ae51efca 12783 $stricter_to_file_of{$property_name . $integer_name}
c12f2655 12784 = $sub_filename;
99870f4d
KW
12785 }
12786 }
12787 }
12788
9e4a1e86
KW
12789 # For Unicode::UCD, create a mapping of the prop=value to the
12790 # canonical =value for that property.
12791 if ($standard =~ /=/) {
12792
12793 # This could happen if a strict name mapped into an existing
12794 # loose name. In that event, the strict names would have to
12795 # be moved to a new hash.
12796 if (exists($loose_to_standard_value{$standard})) {
12797 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
12798 }
12799 $loose_to_standard_value{$standard} = $loose_table_name;
12800 }
12801
99870f4d 12802 # Keep a list of the deprecated properties and their filenames
a92d5c2e 12803 if ($deprecated && $complement == 0) {
99870f4d
KW
12804 $utf8::why_deprecated{$sub_filename} = $deprecated;
12805 }
d867ccfb
KW
12806
12807 # And a substitute table, if any, for case-insensitive matching
12808 if ($caseless_equivalent != 0) {
12809 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12810 }
c15fda25
KW
12811
12812 # Add to defaults list if the table this alias belongs to is the
12813 # default one
12814 $loose_defaults{$standard} = 1 if $is_default;
99870f4d
KW
12815 }
12816 }
12817
12818 return;
12819}
12820
12821{ # Closure
12822 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12823 # conflicts
12824 my %full_dir_name_of; # Full length names of directories used.
12825
12826 sub construct_filename($$$) {
12827 # Return a file name for a table, based on the table name, but perhaps
12828 # changed to get rid of non-portable characters in it, and to make
12829 # sure that it is unique on a file system that allows the names before
12830 # any period to be at most 8 characters (DOS). While we're at it
12831 # check and complain if there are any directory conflicts.
12832
12833 my $name = shift; # The name to start with
12834 my $mutable = shift; # Boolean: can it be changed? If no, but
12835 # yet it must be to work properly, a warning
12836 # is given
12837 my $directories_ref = shift; # A reference to an array containing the
12838 # path to the file, with each element one path
12839 # component. This is used because the same
12840 # name can be used in different directories.
12841 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12842
12843 my $warn = ! defined wantarray; # If true, then if the name is
12844 # changed, a warning is issued as well.
12845
12846 if (! defined $name) {
12847 Carp::my_carp("Undefined name in directory "
12848 . File::Spec->join(@$directories_ref)
12849 . ". '_' used");
12850 return '_';
12851 }
12852
12853 # Make sure that no directory names conflict with each other. Look at
12854 # each directory in the input file's path. If it is already in use,
12855 # assume it is correct, and is merely being re-used, but if we
12856 # truncate it to 8 characters, and find that there are two directories
12857 # that are the same for the first 8 characters, but differ after that,
12858 # then that is a problem.
12859 foreach my $directory (@$directories_ref) {
12860 my $short_dir = substr($directory, 0, 8);
12861 if (defined $full_dir_name_of{$short_dir}) {
12862 next if $full_dir_name_of{$short_dir} eq $directory;
12863 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12864 }
12865 else {
12866 $full_dir_name_of{$short_dir} = $directory;
12867 }
12868 }
12869
12870 my $path = join '/', @$directories_ref;
12871 $path .= '/' if $path;
12872
12873 # Remove interior underscores.
12874 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12875
12876 # Change any non-word character into an underscore, and truncate to 8.
12877 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12878 substr($filename, 8) = "" if length($filename) > 8;
12879
12880 # Make sure the basename doesn't conflict with something we
12881 # might have already written. If we have, say,
12882 # InGreekExtended1
12883 # InGreekExtended2
12884 # they become
12885 # InGreekE
12886 # InGreek2
12887 my $warned = 0;
12888 while (my $num = $base_names{$path}{lc $filename}++) {
12889 $num++; # so basenames with numbers start with '2', which
12890 # just looks more natural.
12891
12892 # Want to append $num, but if it'll make the basename longer
12893 # than 8 characters, pre-truncate $filename so that the result
12894 # is acceptable.
12895 my $delta = length($filename) + length($num) - 8;
12896 if ($delta > 0) {
12897 substr($filename, -$delta) = $num;
12898 }
12899 else {
12900 $filename .= $num;
12901 }
12902 if ($warn && ! $warned) {
12903 $warned = 1;
12904 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12905 }
12906 }
12907
12908 return $filename if $mutable;
12909
12910 # If not changeable, must return the input name, but warn if needed to
12911 # change it beyond shortening it.
12912 if ($name ne $filename
12913 && substr($name, 0, length($filename)) ne $filename) {
12914 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12915 }
12916 return $name;
12917 }
12918}
12919
12920# The pod file contains a very large table. Many of the lines in that table
12921# would exceed a typical output window's size, and so need to be wrapped with
12922# a hanging indent to make them look good. The pod language is really
12923# insufficient here. There is no general construct to do that in pod, so it
12924# is done here by beginning each such line with a space to cause the result to
12925# be output without formatting, and doing all the formatting here. This leads
12926# to the result that if the eventual display window is too narrow it won't
12927# look good, and if the window is too wide, no advantage is taken of that
12928# extra width. A further complication is that the output may be indented by
12929# the formatter so that there is less space than expected. What I (khw) have
12930# done is to assume that that indent is a particular number of spaces based on
12931# what it is in my Linux system; people can always resize their windows if
12932# necessary, but this is obviously less than desirable, but the best that can
12933# be expected.
12934my $automatic_pod_indent = 8;
12935
12936# Try to format so that uses fewest lines, but few long left column entries
12937# slide into the right column. An experiment on 5.1 data yielded the
12938# following percentages that didn't cut into the other side along with the
12939# associated first-column widths
12940# 69% = 24
12941# 80% not too bad except for a few blocks
12942# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12943# 95% = 37;
12944my $indent_info_column = 27; # 75% of lines didn't have overlap
12945
12946my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12947 # The 3 is because of:
12948 # 1 for the leading space to tell the pod formatter to
12949 # output as-is
12950 # 1 for the flag
12951 # 1 for the space between the flag and the main data
12952
12953sub format_pod_line ($$$;$$) {
12954 # Take a pod line and return it, formatted properly
12955
12956 my $first_column_width = shift;
12957 my $entry = shift; # Contents of left column
12958 my $info = shift; # Contents of right column
12959
12960 my $status = shift || ""; # Any flag
12961
12962 my $loose_match = shift; # Boolean.
12963 $loose_match = 1 unless defined $loose_match;
12964
12965 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12966
12967 my $flags = "";
12968 $flags .= $STRICTER if ! $loose_match;
12969
12970 $flags .= $status if $status;
12971
12972 # There is a blank in the left column to cause the pod formatter to
12973 # output the line as-is.
12974 return sprintf " %-*s%-*s %s\n",
12975 # The first * in the format is replaced by this, the -1 is
12976 # to account for the leading blank. There isn't a
12977 # hard-coded blank after this to separate the flags from
12978 # the rest of the line, so that in the unlikely event that
12979 # multiple flags are shown on the same line, they both
12980 # will get displayed at the expense of that separation,
12981 # but since they are left justified, a blank will be
12982 # inserted in the normal case.
12983 $FILLER - 1,
12984 $flags,
12985
12986 # The other * in the format is replaced by this number to
12987 # cause the first main column to right fill with blanks.
12988 # The -1 is for the guaranteed blank following it.
12989 $first_column_width - $FILLER - 1,
12990 $entry,
12991 $info;
12992}
12993
12994my @zero_match_tables; # List of tables that have no matches in this release
12995
d1476e4d 12996sub make_re_pod_entries($) {
99870f4d
KW
12997 # This generates the entries for the pod file for a given table.
12998 # Also done at this time are any children tables. The output looks like:
12999 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
13000
13001 my $input_table = shift; # Table the entry is for
13002 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13003
13004 # Generate parent and all its children at the same time.
13005 return if $input_table->parent != $input_table;
13006
13007 my $property = $input_table->property;
13008 my $type = $property->type;
13009 my $full_name = $property->full_name;
13010
13011 my $count = $input_table->count;
13012 my $string_count = clarify_number($count);
13013 my $status = $input_table->status;
13014 my $status_info = $input_table->status_info;
56ca34ca 13015 my $caseless_equivalent = $input_table->caseless_equivalent;
99870f4d
KW
13016
13017 my $entry_for_first_table; # The entry for the first table output.
13018 # Almost certainly, it is the parent.
13019
13020 # For each related table (including itself), we will generate a pod entry
13021 # for each name each table goes by
13022 foreach my $table ($input_table, $input_table->children) {
13023
d4da3f74
KW
13024 # utf8_heavy.pl cannot deal with null string property values, so skip
13025 # any tables that have no non-null names.
13026 next if ! grep { $_->name ne "" } $table->aliases;
99870f4d
KW
13027
13028 # First, gather all the info that applies to this table as a whole.
13029
13030 push @zero_match_tables, $table if $count == 0;
13031
13032 my $table_property = $table->property;
13033
13034 # The short name has all the underscores removed, while the full name
13035 # retains them. Later, we decide whether to output a short synonym
13036 # for the full one, we need to compare apples to apples, so we use the
13037 # short name's length including underscores.
13038 my $table_property_short_name_length;
13039 my $table_property_short_name
13040 = $table_property->short_name(\$table_property_short_name_length);
13041 my $table_property_full_name = $table_property->full_name;
13042
13043 # Get how much savings there is in the short name over the full one
13044 # (delta will always be <= 0)
13045 my $table_property_short_delta = $table_property_short_name_length
13046 - length($table_property_full_name);
13047 my @table_description = $table->description;
13048 my @table_note = $table->note;
13049
13050 # Generate an entry for each alias in this table.
13051 my $entry_for_first_alias; # saves the first one encountered.
13052 foreach my $alias ($table->aliases) {
13053
13054 # Skip if not to go in pod.
33e96e72 13055 next unless $alias->make_re_pod_entry;
99870f4d
KW
13056
13057 # Start gathering all the components for the entry
13058 my $name = $alias->name;
13059
d4da3f74
KW
13060 # Skip if name is empty, as can't be accessed by regexes.
13061 next if $name eq "";
13062
99870f4d
KW
13063 my $entry; # Holds the left column, may include extras
13064 my $entry_ref; # To refer to the left column's contents from
13065 # another entry; has no extras
13066
13067 # First the left column of the pod entry. Tables for the $perl
13068 # property always use the single form.
13069 if ($table_property == $perl) {
13070 $entry = "\\p{$name}";
13071 $entry_ref = "\\p{$name}";
13072 }
13073 else { # Compound form.
13074
13075 # Only generate one entry for all the aliases that mean true
13076 # or false in binary properties. Append a '*' to indicate
13077 # some are missing. (The heading comment notes this.)
60e471b3 13078 my $rhs;
99870f4d
KW
13079 if ($type == $BINARY) {
13080 next if $name ne 'N' && $name ne 'Y';
60e471b3 13081 $rhs = "$name*";
99870f4d 13082 }
06f26c45 13083 elsif ($type != $FORCED_BINARY) {
60e471b3 13084 $rhs = $name;
99870f4d 13085 }
06f26c45
KW
13086 else {
13087
13088 # Forced binary properties require special handling. It
13089 # has two sets of tables, one set is true/false; and the
13090 # other set is everything else. Entries are generated for
13091 # each set. Use the Bidi_Mirrored property (which appears
13092 # in all Unicode versions) to get a list of the aliases
13093 # for the true/false tables. Of these, only output the N
13094 # and Y ones, the same as, a regular binary property. And
13095 # output all the rest, same as a non-binary property.
13096 my $bm = property_ref("Bidi_Mirrored");
13097 if ($name eq 'N' || $name eq 'Y') {
13098 $rhs = "$name*";
13099 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
13100 $bm->table("N")->aliases)
13101 {
13102 next;
13103 }
13104 else {
13105 $rhs = $name;
13106 }
13107 }
99870f4d
KW
13108
13109 # Colon-space is used to give a little more space to be easier
13110 # to read;
13111 $entry = "\\p{"
13112 . $table_property_full_name
60e471b3 13113 . ": $rhs}";
99870f4d
KW
13114
13115 # But for the reference to this entry, which will go in the
13116 # right column, where space is at a premium, use equals
13117 # without a space
13118 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
13119 }
13120
13121 # Then the right (info) column. This is stored as components of
13122 # an array for the moment, then joined into a string later. For
13123 # non-internal only properties, begin the info with the entry for
13124 # the first table we encountered (if any), as things are ordered
13125 # so that that one is the most descriptive. This leads to the
13126 # info column of an entry being a more descriptive version of the
13127 # name column
13128 my @info;
13129 if ($name =~ /^_/) {
13130 push @info,
13131 '(For internal use by Perl, not necessarily stable)';
13132 }
13133 elsif ($entry_for_first_alias) {
13134 push @info, $entry_for_first_alias;
13135 }
13136
13137 # If this entry is equivalent to another, add that to the info,
13138 # using the first such table we encountered
13139 if ($entry_for_first_table) {
13140 if (@info) {
13141 push @info, "(= $entry_for_first_table)";
13142 }
13143 else {
13144 push @info, $entry_for_first_table;
13145 }
13146 }
13147
13148 # If the name is a large integer, add an equivalent with an
13149 # exponent for better readability
13150 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
13151 push @info, sprintf "(= %.1e)", $name
13152 }
13153
13154 my $parenthesized = "";
13155 if (! $entry_for_first_alias) {
13156
13157 # This is the first alias for the current table. The alias
13158 # array is ordered so that this is the fullest, most
13159 # descriptive alias, so it gets the fullest info. The other
13160 # aliases are mostly merely pointers to this one, using the
13161 # information already added above.
13162
13163 # Display any status message, but only on the parent table
13164 if ($status && ! $entry_for_first_table) {
13165 push @info, $status_info;
13166 }
13167
13168 # Put out any descriptive info
13169 if (@table_description || @table_note) {
13170 push @info, join "; ", @table_description, @table_note;
13171 }
13172
13173 # Look to see if there is a shorter name we can point people
13174 # at
13175 my $standard_name = standardize($name);
13176 my $short_name;
13177 my $proposed_short = $table->short_name;
13178 if (defined $proposed_short) {
13179 my $standard_short = standardize($proposed_short);
13180
13181 # If the short name is shorter than the standard one, or
13182 # even it it's not, but the combination of it and its
13183 # short property name (as in \p{prop=short} ($perl doesn't
13184 # have this form)) saves at least two characters, then,
13185 # cause it to be listed as a shorter synonym.
13186 if (length $standard_short < length $standard_name
13187 || ($table_property != $perl
13188 && (length($standard_short)
13189 - length($standard_name)
13190 + $table_property_short_delta) # (<= 0)
13191 < -2))
13192 {
13193 $short_name = $proposed_short;
13194 if ($table_property != $perl) {
13195 $short_name = $table_property_short_name
13196 . "=$short_name";
13197 }
13198 $short_name = "\\p{$short_name}";
13199 }
13200 }
13201
13202 # And if this is a compound form name, see if there is a
13203 # single form equivalent
13204 my $single_form;
13205 if ($table_property != $perl) {
13206
13207 # Special case the binary N tables, so that will print
13208 # \P{single}, but use the Y table values to populate
c12f2655 13209 # 'single', as we haven't likewise populated the N table.
06f26c45
KW
13210 # For forced binary tables, we can't just look at the N
13211 # table, but must see if this table is equivalent to the N
13212 # one, as there are two equivalent beasts in these
13213 # properties.
99870f4d
KW
13214 my $test_table;
13215 my $p;
06f26c45
KW
13216 if ( ($type == $BINARY
13217 && $input_table == $property->table('No'))
13218 || ($type == $FORCED_BINARY
13219 && $property->table('No')->
13220 is_set_equivalent_to($input_table)))
99870f4d
KW
13221 {
13222 $test_table = $property->table('Yes');
13223 $p = 'P';
13224 }
13225 else {
13226 $test_table = $input_table;
13227 $p = 'p';
13228 }
13229
13230 # Look for a single form amongst all the children.
13231 foreach my $table ($test_table->children) {
13232 next if $table->property != $perl;
13233 my $proposed_name = $table->short_name;
13234 next if ! defined $proposed_name;
13235
13236 # Don't mention internal-only properties as a possible
13237 # single form synonym
13238 next if substr($proposed_name, 0, 1) eq '_';
13239
13240 $proposed_name = "\\$p\{$proposed_name}";
13241 if (! defined $single_form
13242 || length($proposed_name) < length $single_form)
13243 {
13244 $single_form = $proposed_name;
13245
13246 # The goal here is to find a single form; not the
13247 # shortest possible one. We've already found a
13248 # short name. So, stop at the first single form
13249 # found, which is likely to be closer to the
13250 # original.
13251 last;
13252 }
13253 }
13254 }
13255
13256 # Ouput both short and single in the same parenthesized
13257 # expression, but with only one of 'Single', 'Short' if there
13258 # are both items.
13259 if ($short_name || $single_form || $table->conflicting) {
99870f4d
KW
13260 $parenthesized .= "Short: $short_name" if $short_name;
13261 if ($short_name && $single_form) {
13262 $parenthesized .= ', ';
13263 }
13264 elsif ($single_form) {
13265 $parenthesized .= 'Single: ';
13266 }
13267 $parenthesized .= $single_form if $single_form;
13268 }
13269 }
13270
56ca34ca
KW
13271 if ($caseless_equivalent != 0) {
13272 $parenthesized .= '; ' if $parenthesized ne "";
13273 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
13274 }
13275
99870f4d
KW
13276
13277 # Warn if this property isn't the same as one that a
13278 # semi-casual user might expect. The other components of this
13279 # parenthesized structure are calculated only for the first entry
13280 # for this table, but the conflicting is deemed important enough
13281 # to go on every entry.
13282 my $conflicting = join " NOR ", $table->conflicting;
13283 if ($conflicting) {
e5228720 13284 $parenthesized .= '; ' if $parenthesized ne "";
99870f4d
KW
13285 $parenthesized .= "NOT $conflicting";
13286 }
99870f4d 13287
e5228720 13288 push @info, "($parenthesized)" if $parenthesized;
d57ccc9a 13289
0f88d393
KW
13290 if ($name =~ /_$/ && $alias->loose_match) {
13291 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13292 }
13293
d57ccc9a
KW
13294 if ($table_property != $perl && $table->perl_extension) {
13295 push @info, '(Perl extension)';
13296 }
2cf724d4 13297 push @info, "($string_count)";
99870f4d
KW
13298
13299 # Now, we have both the entry and info so add them to the
13300 # list of all the properties.
13301 push @match_properties,
13302 format_pod_line($indent_info_column,
13303 $entry,
13304 join( " ", @info),
13305 $alias->status,
13306 $alias->loose_match);
13307
13308 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13309 } # End of looping through the aliases for this table.
13310
13311 if (! $entry_for_first_table) {
13312 $entry_for_first_table = $entry_for_first_alias;
13313 }
13314 } # End of looping through all the related tables
13315 return;
13316}
13317
2df7880f
KW
13318sub make_ucd_table_pod_entries {
13319 my $table = shift;
13320
ee94c7d1
KW
13321 # Generate the entries for the UCD section of the pod for $table. This
13322 # also calculates if names are ambiguous, so has to be called even if the
13323 # pod is not being output
13324
13325 my $short_name = $table->name;
13326 my $standard_short_name = standardize($short_name);
13327 my $full_name = $table->full_name;
13328 my $standard_full_name = standardize($full_name);
13329
13330 my $full_info = ""; # Text of info column for full-name entries
13331 my $other_info = ""; # Text of info column for short-name entries
13332 my $short_info = ""; # Text of info column for other entries
13333 my $meaning = ""; # Synonym of this table
2df7880f
KW
13334
13335 my $property = ($table->isa('Property'))
13336 ? $table
13337 : $table->parent->property;
13338
ee94c7d1
KW
13339 my $perl_extension = $table->perl_extension;
13340
13341 # Get the more official name for for perl extensions that aren't
13342 # stand-alone properties
13343 if ($perl_extension && $property != $table) {
13344 if ($property == $perl ||$property->type == $BINARY) {
13345 $meaning = $table->complete_name;
13346 }
13347 else {
13348 $meaning = $property->full_name . "=$full_name";
13349 }
13350 }
13351
13352 # There are three types of info column. One for the short name, one for
13353 # the full name, and one for everything else. They mostly are the same,
13354 # so initialize in the same loop.
13355 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
13356 if ($perl_extension && $property != $table) {
13357
13358 # Add the synonymous name for the non-full name entries; and to
13359 # the full-name entry if it adds extra information
13360 if ($info_ref == \$other_info
13361 || ($info_ref == \$short_info
13362 && $standard_short_name ne $standard_full_name)
13363 || standardize($meaning) ne $standard_full_name
13364 ) {
13365 $$info_ref .= "$meaning.";
13366 }
13367 }
13368 elsif ($info_ref != \$full_info) {
13369
13370 # Otherwise, the non-full name columns include the full name
13371 $$info_ref .= $full_name;
13372 }
13373
13374 # And the full-name entry includes the short name, if different
13375 if ($info_ref == \$full_info
13376 && $standard_short_name ne $standard_full_name)
13377 {
13378 $full_info =~ s/\.\Z//;
13379 $full_info .= " " if $full_info;
13380 $full_info .= "(Short: $short_name)";
13381 }
13382
13383 if ($table->perl_extension) {
13384 $$info_ref =~ s/\.\Z//;
13385 $$info_ref .= ". " if $$info_ref;
13386 $$info_ref .= "(Perl extension)";
13387 }
13388 }
13389
13390 # Add any extra annotations to the full name entry
13391 foreach my $more_info ($table->description,
13392 $table->note,
13393 $table->status_info)
13394 {
13395 next unless $more_info;
13396 $full_info =~ s/\.\Z//;
13397 $full_info .= ". " if $full_info;
13398 $full_info .= $more_info;
13399 }
13400
13401 # These keep track if have created full and short name pod entries for the
13402 # property
13403 my $done_full = 0;
13404 my $done_short = 0;
13405
2df7880f
KW
13406 # Every possible name is kept track of, even those that aren't going to be
13407 # output. This way we can be sure to find the ambiguities.
13408 foreach my $alias ($table->aliases) {
13409 my $name = $alias->name;
13410 my $standard = standardize($name);
ee94c7d1
KW
13411 my $info;
13412 my $output_this = $alias->ucd;
13413
13414 # If the full and short names are the same, we want to output the full
13415 # one's entry, so it has priority.
13416 if ($standard eq $standard_full_name) {
13417 next if $done_full;
13418 $done_full = 1;
13419 $info = $full_info;
13420 }
13421 elsif ($standard eq $standard_short_name) {
13422 next if $done_short;
13423 $done_short = 1;
13424 next if $standard_short_name eq $standard_full_name;
13425 $info = $short_info;
13426 }
13427 else {
13428 $info = $other_info;
13429 }
2df7880f 13430
ee94c7d1
KW
13431 # Here, we have set up the two columns for this entry. But if an
13432 # entry already exists for this name, we have to decide which one
13433 # we're going to later output.
2df7880f
KW
13434 if (exists $ucd_pod{$standard}) {
13435
13436 # If the two entries refer to the same property, it's not going to
ee94c7d1
KW
13437 # be ambiguous. (Likely it's because the names when standardized
13438 # are the same.) But that means if they are different properties,
13439 # there is ambiguity.
2df7880f
KW
13440 if ($ucd_pod{$standard}->{'property'} != $property) {
13441
ee94c7d1
KW
13442 # Here, we have an ambiguity. This code assumes that one is
13443 # scheduled to be output and one not and that one is a perl
13444 # extension (which is not to be output) and the other isn't.
13445 # If those assumptions are wrong, things have to be rethought.
13446 if ($ucd_pod{$standard}{'output_this'} == $output_this
13447 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
13448 || $output_this == $perl_extension)
13449 {
13450 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
13451 }
13452
13453 # We modifiy the info column of the one being output to
13454 # indicate the ambiguity. Set $which to point to that one's
13455 # info.
13456 my $which;
13457 if ($ucd_pod{$standard}{'output_this'}) {
13458 $which = \$ucd_pod{$standard}->{'info'};
13459 }
13460 else {
13461 $which = \$info;
13462 $meaning = $ucd_pod{$standard}{'meaning'};
13463 }
13464
13465 chomp $$which;
13466 $$which =~ s/\.\Z//;
13467 $$which .= "; NOT '$standard' meaning '$meaning'";
13468
2df7880f
KW
13469 $ambiguous_names{$standard} = 1;
13470 }
13471
ee94c7d1
KW
13472 # Use the non-perl-extension variant
13473 next unless $ucd_pod{$standard}{'perl_extension'};
2df7880f
KW
13474 }
13475
ee94c7d1
KW
13476 # Store enough information about this entry that we can later look for
13477 # ambiguities, and output it properly.
13478 $ucd_pod{$standard} = { 'name' => $name,
13479 'info' => $info,
13480 'meaning' => $meaning,
13481 'output_this' => $output_this,
13482 'perl_extension' => $perl_extension,
2df7880f 13483 'property' => $property,
ee94c7d1 13484 'status' => $alias->status,
2df7880f
KW
13485 };
13486 } # End of looping through all this table's aliases
13487
13488 return;
13489}
13490
99870f4d
KW
13491sub pod_alphanumeric_sort {
13492 # Sort pod entries alphanumerically.
13493
99f78760
KW
13494 # The first few character columns are filler, plus the '\p{'; and get rid
13495 # of all the trailing stuff, starting with the trailing '}', so as to sort
13496 # on just 'Name=Value'
13497 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 13498 $a =~ s/}.*//;
99f78760 13499 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
13500 $b =~ s/}.*//;
13501
99f78760
KW
13502 # Determine if the two operands are both internal only or both not.
13503 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13504 # should be the underscore that begins internal only
13505 my $a_is_internal = (substr($a, 0, 1) eq '_');
13506 my $b_is_internal = (substr($b, 0, 1) eq '_');
13507
13508 # Sort so the internals come last in the table instead of first (which the
13509 # leading underscore would otherwise indicate).
13510 if ($a_is_internal != $b_is_internal) {
13511 return 1 if $a_is_internal;
13512 return -1
13513 }
13514
99870f4d 13515 # Determine if the two operands are numeric property values or not.
99f78760 13516 # A numeric property will look like xyz: 3. But the number
99870f4d 13517 # can begin with an optional minus sign, and may have a
99f78760 13518 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
13519 # isn't numeric, use alphabetic sort.
13520 my ($a_initial, $a_number) =
99f78760 13521 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13522 return $a cmp $b unless defined $a_number;
13523 my ($b_initial, $b_number) =
99f78760 13524 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13525 return $a cmp $b unless defined $b_number;
13526
13527 # Here they are both numeric, but use alphabetic sort if the
13528 # initial parts don't match
13529 return $a cmp $b if $a_initial ne $b_initial;
13530
13531 # Convert rationals to floating for the comparison.
13532 $a_number = eval $a_number if $a_number =~ qr{/};
13533 $b_number = eval $b_number if $b_number =~ qr{/};
13534
13535 return $a_number <=> $b_number;
13536}
13537
13538sub make_pod () {
13539 # Create the .pod file. This generates the various subsections and then
13540 # combines them in one big HERE document.
13541
07c070a8
KW
13542 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
13543
99870f4d
KW
13544 return unless defined $pod_directory;
13545 print "Making pod file\n" if $verbosity >= $PROGRESS;
13546
13547 my $exception_message =
13548 '(Any exceptions are individually noted beginning with the word NOT.)';
13549 my @block_warning;
13550 if (-e 'Blocks.txt') {
13551
13552 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
13553 # if the global $has_In_conflicts indicates we have them.
13554 push @match_properties, format_pod_line($indent_info_column,
13555 '\p{In_*}',
13556 '\p{Block: *}'
13557 . (($has_In_conflicts)
13558 ? " $exception_message"
13559 : ""));
13560 @block_warning = << "END";
13561
77173124
KW
13562Matches in the Block property have shortcuts that begin with "In_". For
13563example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
13564backward compatibility, if there is no conflict with another shortcut, these
13565may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
13566are numerous such conflicting shortcuts. Use of these forms for Block is
13567discouraged, and are flagged as such, not only because of the potential
13568confusion as to what is meant, but also because a later release of Unicode may
13569preempt the shortcut, and your program would no longer be correct. Use the
13570"In_" form instead to avoid this, or even more clearly, use the compound form,
13571e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
13572about this.
99870f4d
KW
13573END
13574 }
07c070a8 13575 my $text = $Is_flags_text;
99870f4d
KW
13576 $text = "$exception_message $text" if $has_Is_conflicts;
13577
13578 # And the 'Is_ line';
13579 push @match_properties, format_pod_line($indent_info_column,
13580 '\p{Is_*}',
13581 "\\p{*} $text");
13582
13583 # Sort the properties array for output. It is sorted alphabetically
13584 # except numerically for numeric properties, and only output unique lines.
13585 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13586
13587 my $formatted_properties = simple_fold(\@match_properties,
13588 "",
13589 # indent succeeding lines by two extra
13590 # which looks better
13591 $indent_info_column + 2,
13592
13593 # shorten the line length by how much
13594 # the formatter indents, so the folded
13595 # line will fit in the space
13596 # presumably available
13597 $automatic_pod_indent);
13598 # Add column headings, indented to be a little more centered, but not
13599 # exactly
13600 $formatted_properties = format_pod_line($indent_info_column,
13601 ' NAME',
13602 ' INFO')
13603 . "\n"
13604 . $formatted_properties;
13605
13606 # Generate pod documentation lines for the tables that match nothing
0090c5d1 13607 my $zero_matches = "";
99870f4d
KW
13608 if (@zero_match_tables) {
13609 @zero_match_tables = uniques(@zero_match_tables);
13610 $zero_matches = join "\n\n",
13611 map { $_ = '=item \p{' . $_->complete_name . "}" }
13612 sort { $a->complete_name cmp $b->complete_name }
c0de960f 13613 @zero_match_tables;
99870f4d
KW
13614
13615 $zero_matches = <<END;
13616
77173124 13617=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
99870f4d
KW
13618
13619Unicode has some property-value pairs that currently don't match anything.
c12f2655
KW
13620This happens generally either because they are obsolete, or they exist for
13621symmetry with other forms, but no language has yet been encoded that uses
13622them. In this version of Unicode, the following match zero code points:
99870f4d
KW
13623
13624=over 4
13625
13626$zero_matches
13627
13628=back
13629
13630END
13631 }
13632
13633 # Generate list of properties that we don't accept, grouped by the reasons
13634 # why. This is so only put out the 'why' once, and then list all the
13635 # properties that have that reason under it.
13636
13637 my %why_list; # The keys are the reasons; the values are lists of
13638 # properties that have the key as their reason
13639
13640 # For each property, add it to the list that are suppressed for its reason
13641 # The sort will cause the alphabetically first properties to be added to
13642 # each list first, so each list will be sorted.
13643 foreach my $property (sort keys %why_suppressed) {
13644 push @{$why_list{$why_suppressed{$property}}}, $property;
13645 }
13646
13647 # For each reason (sorted by the first property that has that reason)...
13648 my @bad_re_properties;
13649 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13650 keys %why_list)
13651 {
54ce19c9 13652 # Add to the output, all the properties that have that reason.
99870f4d
KW
13653 my $has_item = 0; # Flag if actually output anything.
13654 foreach my $name (@{$why_list{$why}}) {
13655
13656 # Split compound names into $property and $table components
13657 my $property = $name;
13658 my $table;
13659 if ($property =~ / (.*) = (.*) /x) {
13660 $property = $1;
13661 $table = $2;
13662 }
13663
13664 # This release of Unicode may not have a property that is
13665 # suppressed, so don't reference a non-existent one.
13666 $property = property_ref($property);
13667 next if ! defined $property;
13668
13669 # And since this list is only for match tables, don't list the
13670 # ones that don't have match tables.
13671 next if ! $property->to_create_match_tables;
13672
13673 # Find any abbreviation, and turn it into a compound name if this
13674 # is a property=value pair.
13675 my $short_name = $property->name;
13676 $short_name .= '=' . $property->table($table)->name if $table;
13677
54ce19c9
KW
13678 # Start with an empty line.
13679 push @bad_re_properties, "\n\n" unless $has_item;
13680
99870f4d
KW
13681 # And add the property as an item for the reason.
13682 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13683 $has_item = 1;
13684 }
13685
13686 # And add the reason under the list of properties, if such a list
13687 # actually got generated. Note that the header got added
13688 # unconditionally before. But pod ignores extra blank lines, so no
13689 # harm.
13690 push @bad_re_properties, "\n$why\n" if $has_item;
13691
13692 } # End of looping through each reason.
13693
54ce19c9
KW
13694 if (! @bad_re_properties) {
13695 push @bad_re_properties,
13696 "*** This installation accepts ALL non-Unihan properties ***";
13697 }
13698 else {
13699 # Add =over only if non-empty to avoid an empty =over/=back section,
13700 # which is considered bad form.
13701 unshift @bad_re_properties, "\n=over 4\n";
13702 push @bad_re_properties, "\n=back\n";
13703 }
13704
8d099389
KW
13705 # Similiarly, generate a list of files that we don't use, grouped by the
13706 # reasons why. First, create a hash whose keys are the reasons, and whose
13707 # values are anonymous arrays of all the files that share that reason.
13708 my %grouped_by_reason;
13709 foreach my $file (keys %ignored_files) {
13710 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
13711 }
1fec9f60
KW
13712 foreach my $file (keys %skipped_files) {
13713 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
13714 }
8d099389
KW
13715
13716 # Then, sort each group.
13717 foreach my $group (keys %grouped_by_reason) {
13718 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
13719 @{$grouped_by_reason{$group}} ;
13720 }
13721
13722 # Finally, create the output text. For each reason (sorted by the
13723 # alphabetically first file that has that reason)...
13724 my @unused_files;
13725 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
13726 cmp lc $grouped_by_reason{$b}->[0]
13727 }
13728 keys %grouped_by_reason)
13729 {
13730 # Add all the files that have that reason to the output. Start
13731 # with an empty line.
13732 push @unused_files, "\n\n";
13733 push @unused_files, map { "\n=item F<$_> \n" }
13734 @{$grouped_by_reason{$reason}};
13735 # And add the reason under the list of files
13736 push @unused_files, "\n$reason\n";
13737 }
13738
ee94c7d1
KW
13739 # Similarly, create the output text for the UCD section of the pod
13740 my @ucd_pod;
13741 foreach my $key (keys %ucd_pod) {
13742 next unless $ucd_pod{$key}->{'output_this'};
13743 push @ucd_pod, format_pod_line($indent_info_column,
13744 $ucd_pod{$key}->{'name'},
13745 $ucd_pod{$key}->{'info'},
13746 $ucd_pod{$key}->{'status'},
13747 );
13748 }
13749
13750 # Sort alphabetically, and fold for output
13751 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
13752 my $ucd_pod = simple_fold(\@ucd_pod,
13753 ' ',
13754 $indent_info_column,
13755 $automatic_pod_indent);
13756 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
13757 . "\n"
13758 . $ucd_pod;
12916dad
MS
13759 local $" = "";
13760
99870f4d
KW
13761 # Everything is ready to assemble.
13762 my @OUT = << "END";
13763=begin comment
13764
13765$HEADER
13766
13767To change this file, edit $0 instead.
13768
13769=end comment
13770
13771=head1 NAME
13772
8d099389 13773$pod_file - Index of Unicode Version $string_version character properties in Perl
99870f4d
KW
13774
13775=head1 DESCRIPTION
13776
8d099389
KW
13777This document provides information about the portion of the Unicode database
13778that deals with character properties, that is the portion that is defined on
13779single code points. (L</Other information in the Unicode data base>
13780below briefly mentions other data that Unicode provides.)
99870f4d 13781
8d099389
KW
13782Perl can provide access to all non-provisional Unicode character properties,
13783though not all are enabled by default. The omitted ones are the Unihan
13784properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
13785deprecated or Unicode-internal properties. (An installation may choose to
ea5acc0f 13786recompile Perl's tables to change this. See L<Unicode character
8d099389
KW
13787properties that are NOT accepted by Perl>.)
13788
ee94c7d1
KW
13789For most purposes, access to Unicode properties from the Perl core is through
13790regular expression matches, as described in the next section.
13791For some special purposes, and to access the properties that are not suitable
13792for regular expression matching, all the Unicode character properties that
13793Perl handles are accessible via the standard L<Unicode::UCD> module, as
13794described in the section L</Properties accessible through Unicode::UCD>.
13795
8d099389
KW
13796Perl also provides some additional extensions and short-cut synonyms
13797for Unicode properties.
99870f4d
KW
13798
13799This document merely lists all available properties and does not attempt to
13800explain what each property really means. There is a brief description of each
043f3b3f
KW
13801Perl extension; see L<perlunicode/Other Properties> for more information on
13802these. There is some detail about Blocks, Scripts, General_Category,
99870f4d 13803and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
043f3b3f
KW
13804official Unicode properties, refer to the Unicode standard. A good starting
13805place is L<$unicode_reference_url>.
99870f4d
KW
13806
13807Note that you can define your own properties; see
13808L<perlunicode/"User-Defined Character Properties">.
13809
77173124 13810=head1 Properties accessible through C<\\p{}> and C<\\P{}>
99870f4d 13811
77173124
KW
13812The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13813most of the Unicode character properties. The table below shows all these
13814constructs, both single and compound forms.
99870f4d
KW
13815
13816B<Compound forms> consist of two components, separated by an equals sign or a
13817colon. The first component is the property name, and the second component is
13818the particular value of the property to match against, for example,
77173124 13819C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
99870f4d
KW
13820whose Script property is Greek.
13821
77173124 13822B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
99870f4d 13823their equivalent compound forms. The table shows these equivalences. (In our
77173124 13824example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
99870f4d 13825There are also a few Perl-defined single forms that are not shortcuts for a
77173124 13826compound form. One such is C<\\p{Word}>. These are also listed in the table.
99870f4d
KW
13827
13828In parsing these constructs, Perl always ignores Upper/lower case differences
77173124
KW
13829everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
13830C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
13831the left brace completely changes the meaning of the construct, from "match"
13832(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
13833for improved legibility.
99870f4d
KW
13834
13835Also, white space, hyphens, and underscores are also normally ignored
13836everywhere between the {braces}, and hence can be freely added or removed
13837even if the C</x> modifier hasn't been specified on the regular expression.
13838But $a_bold_stricter at the beginning of an entry in the table below
13839means that tighter (stricter) rules are used for that entry:
13840
13841=over 4
13842
77173124 13843=item Single form (C<\\p{name}>) tighter rules:
99870f4d
KW
13844
13845White space, hyphens, and underscores ARE significant
13846except for:
13847
13848=over 4
13849
13850=item * white space adjacent to a non-word character
13851
13852=item * underscores separating digits in numbers
13853
13854=back
13855
13856That means, for example, that you can freely add or remove white space
13857adjacent to (but within) the braces without affecting the meaning.
13858
77173124 13859=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
99870f4d
KW
13860
13861The tighter rules given above for the single form apply to everything to the
13862right of the colon or equals; the looser rules still apply to everything to
13863the left.
13864
13865That means, for example, that you can freely add or remove white space
13866adjacent to (but within) the braces and the colon or equal sign.
13867
13868=back
13869
78bb419c
KW
13870Some properties are considered obsolete by Unicode, but still available.
13871There are several varieties of obsolescence:
99870f4d
KW
13872
13873=over 4
13874
99870f4d
KW
13875=item Stabilized
13876
f8c38b14 13877A property may be stabilized. Such a determination does not indicate
5f7264c7
KW
13878that the property should or should not be used; instead it is a declaration
13879that the property will not be maintained nor extended for newly encoded
13880characters. Such properties are marked with $a_bold_stabilized in the
13881table.
99870f4d
KW
13882
13883=item Deprecated
13884
f8c38b14 13885A property may be deprecated, perhaps because its original intent
78bb419c
KW
13886has been replaced by another property, or because its specification was
13887somehow defective. This means that its use is strongly
99870f4d
KW
13888discouraged, so much so that a warning will be issued if used, unless the
13889regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13890statement. $A_bold_deprecated flags each such entry in the table, and
13891the entry there for the longest, most descriptive version of the property will
13892give the reason it is deprecated, and perhaps advice. Perl may issue such a
13893warning, even for properties that aren't officially deprecated by Unicode,
13894when there used to be characters or code points that were matched by them, but
13895no longer. This is to warn you that your program may not work like it did on
13896earlier Unicode releases.
13897
13898A deprecated property may be made unavailable in a future Perl version, so it
13899is best to move away from them.
13900
c12f2655
KW
13901A deprecated property may also be stabilized, but this fact is not shown.
13902
13903=item Obsolete
13904
13905Properties marked with $a_bold_obsolete in the table are considered (plain)
13906obsolete. Generally this designation is given to properties that Unicode once
13907used for internal purposes (but not any longer).
13908
99870f4d
KW
13909=back
13910
13911Some Perl extensions are present for backwards compatibility and are
c12f2655
KW
13912discouraged from being used, but are not obsolete. $A_bold_discouraged
13913flags each such entry in the table. Future Unicode versions may force
13914some of these extensions to be removed without warning, replaced by another
13915property with the same name that means something different. Use the
13916equivalent shown instead.
99870f4d
KW
13917
13918@block_warning
13919
77173124 13920The table below has two columns. The left column contains the C<\\p{}>
98dc9551 13921constructs to look up, possibly preceded by the flags mentioned above; and
99870f4d
KW
13922the right column contains information about them, like a description, or
13923synonyms. It shows both the single and compound forms for each property that
13924has them. If the left column is a short name for a property, the right column
13925will give its longer, more descriptive name; and if the left column is the
13926longest name, the right column will show any equivalent shortest name, in both
13927single and compound forms if applicable.
13928
13929The right column will also caution you if a property means something different
13930than what might normally be expected.
13931
d57ccc9a
KW
13932All single forms are Perl extensions; a few compound forms are as well, and
13933are noted as such.
13934
99870f4d
KW
13935Numbers in (parentheses) indicate the total number of code points matched by
13936the property. For emphasis, those properties that match no code points at all
13937are listed as well in a separate section following the table.
13938
56ca34ca
KW
13939Most properties match the same code points regardless of whether C<"/i">
13940case-insensitive matching is specified or not. But a few properties are
13941affected. These are shown with the notation
13942
13943 (/i= other_property)
13944
13945in the second column. Under case-insensitive matching they match the
13946same code pode points as the property "other_property".
13947
99870f4d 13948There is no description given for most non-Perl defined properties (See
77173124 13949L<$unicode_reference_url> for that).
d73e5302 13950
99870f4d
KW
13951For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13952combinations. For example, entries like:
d73e5302 13953
99870f4d 13954 \\p{Gc: *} \\p{General_Category: *}
5beb625e 13955
99870f4d
KW
13956mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13957for the latter is also valid for the former. Similarly,
5beb625e 13958
99870f4d 13959 \\p{Is_*} \\p{*}
5beb625e 13960
77173124
KW
13961means that if and only if, for example, C<\\p{Foo}> exists, then
13962C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13963And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13964C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
13965underscore.
5beb625e 13966
99870f4d
KW
13967Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13968And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
13969'N*' to indicate this, and doesn't have separate entries for the other
13970possibilities. Note that not all properties which have values 'Yes' and 'No'
13971are binary, and they have all their values spelled out without using this wild
13972card, and a C<NOT> clause in their description that highlights their not being
13973binary. These also require the compound form to match them, whereas true
13974binary properties have both single and compound forms available.
5beb625e 13975
99870f4d
KW
13976Note that all non-essential underscores are removed in the display of the
13977short names below.
5beb625e 13978
c12f2655 13979B<Legend summary:>
5beb625e 13980
99870f4d 13981=over 4
cf25bb62 13982
21405004 13983=item Z<>B<*> is a wild-card
cf25bb62 13984
99870f4d
KW
13985=item B<(\\d+)> in the info column gives the number of code points matched by
13986this property.
cf25bb62 13987
99870f4d 13988=item B<$DEPRECATED> means this is deprecated.
cf25bb62 13989
99870f4d 13990=item B<$OBSOLETE> means this is obsolete.
cf25bb62 13991
99870f4d 13992=item B<$STABILIZED> means this is stabilized.
cf25bb62 13993
99870f4d 13994=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 13995
c12f2655
KW
13996=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13997stable.
5beb625e 13998
99870f4d 13999=back
da7fcca4 14000
99870f4d 14001$formatted_properties
cf25bb62 14002
99870f4d 14003$zero_matches
cf25bb62 14004
ee94c7d1
KW
14005=head1 Properties accessible through Unicode::UCD
14006
14007All the Unicode character properties mentioned above (except for those marked
14008as for internal use by Perl) are also accessible by
14009L<Unicode::UCD/prop_invlist()>.
14010
14011Due to their nature, not all Unicode character properties are suitable for
14012regular expression matches, nor C<prop_invlist()>. The remaining
14013non-provisional, non-internal ones are accessible via
14014L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
14015hasn't included; see L<below for which those are|/Unicode character properties
14016that are NOT accepted by Perl>).
14017
14018For compatibility with other parts of Perl, all the single forms given in the
14019table in the L<section above|/Properties accessible through \\p{} and \\P{}>
14020are recognized. BUT, there are some ambiguities between some Perl extensions
14021and the Unicode properties, all of which are silently resolved in favor of the
14022official Unicode property. To avoid surprises, you should only use
14023C<prop_invmap()> for forms listed in the table below, which omits the
14024non-recommended ones. The affected forms are the Perl single form equivalents
14025of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
14026C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
14027whose short name is C<sc>. The table indicates the current ambiguities in the
14028INFO column, beginning with the word C<"NOT">.
14029
14030The standard Unicode properties listed below are documented in
14031L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
14032L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
14033L<perlunicode/Other Properties>;
14034
14035The first column in the table is a name for the property; the second column is
14036an alternative name, if any, plus possibly some annotations. The alternative
14037name is the property's full name, unless that would simply repeat the first
14038column, in which case the second column indicates the property's short name
14039(if different). The annotations are given only in the entry for the full
14040name. If a property is obsolete, etc, the entry will be flagged with the same
14041characters used in the table in the L<section above|/Properties accessible
14042through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
14043
14044$ucd_pod
14045
14046=head1 Properties accessible through other means
14047
14048Certain properties are accessible also via core function calls. These are:
78bb419c 14049
99870f4d
KW
14050 Lowercase_Mapping lc() and lcfirst()
14051 Titlecase_Mapping ucfirst()
14052 Uppercase_Mapping uc()
12ac2576 14053
043f3b3f
KW
14054Also, Case_Folding is accessible through the C</i> modifier in regular
14055expressions.
cf25bb62 14056
043f3b3f 14057And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
fbb93542
KW
14058interpolation in double-quoted strings and regular expressions; and functions
14059C<charnames::viacode()>, C<charnames::vianame()>, and
14060C<charnames::string_vianame()> (which require a C<use charnames ();> to be
14061specified.
cf25bb62 14062
ee94c7d1
KW
14063Finally, most properties related to decomposition are accessible via
14064L<Unicode::Normalize>.
14065
ea5acc0f 14066=head1 Unicode character properties that are NOT accepted by Perl
d2d499f5 14067
99870f4d
KW
14068Perl will generate an error for a few character properties in Unicode when
14069used in a regular expression. The non-Unihan ones are listed below, with the
14070reasons they are not accepted, perhaps with work-arounds. The short names for
14071the properties are listed enclosed in (parentheses).
c12f2655
KW
14072As described after the list, an installation can change the defaults and choose
14073to accept any of these. The list is machine generated based on the
14074choices made for the installation that generated this document.
ae6979a8 14075
99870f4d 14076@bad_re_properties
a3a8c5f0 14077
b7986f4f
KW
14078An installation can choose to allow any of these to be matched by downloading
14079the Unicode database from L<http://www.unicode.org/Public/> to
f3514a2f
KW
14080C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
14081controlling lists contained in the program
14082C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
14083(C<\%Config> is available from the Config module).
d73e5302 14084
8d099389
KW
14085=head1 Other information in the Unicode data base
14086
14087The Unicode data base is delivered in two different formats. The XML version
14088is valid for more modern Unicode releases. The other version is a collection
14089of files. The two are intended to give equivalent information. Perl uses the
14090older form; this allows you to recompile Perl to use early Unicode releases.
14091
14092The only non-character property that Perl currently supports is Named
14093Sequences, in which a sequence of code points
14094is given a name and generally treated as a single entity. (Perl supports
14095these via the C<\\N{...}> double-quotish construct,
14096L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
14097
14098Below is a list of the files in the Unicode data base that Perl doesn't
14099currently use, along with very brief descriptions of their purposes.
14100Some of the names of the files have been shortened from those that Unicode
14101uses, in order to allow them to be distinguishable from similarly named files
14102on file systems for which only the first 8 characters of a name are
14103significant.
14104
14105=over 4
14106
14107@unused_files
14108
14109=back
14110
99870f4d 14111=head1 SEE ALSO
d73e5302 14112
99870f4d 14113L<$unicode_reference_url>
12ac2576 14114
99870f4d 14115L<perlrecharclass>
12ac2576 14116
99870f4d 14117L<perlunicode>
d73e5302 14118
99870f4d 14119END
d73e5302 14120
9218f1cf
KW
14121 # And write it. The 0 means no utf8.
14122 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
99870f4d
KW
14123 return;
14124}
d73e5302 14125
99870f4d
KW
14126sub make_Heavy () {
14127 # Create and write Heavy.pl, which passes info about the tables to
14128 # utf8_heavy.pl
12ac2576 14129
143b2c48
KW
14130 # Stringify structures for output
14131 my $loose_property_name_of
14132 = simple_dumper(\%loose_property_name_of, ' ' x 4);
14133 chomp $loose_property_name_of;
14134
14135 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
14136 chomp $stricter_to_file_of;
14137
14138 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
14139 chomp $loose_to_file_of;
14140
14141 my $nv_floating_to_rational
14142 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
14143 chomp $nv_floating_to_rational;
14144
14145 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
14146 chomp $why_deprecated;
14147
14148 # We set the key to the file when we associated files with tables, but we
14149 # couldn't do the same for the value then, as we might not have the file
14150 # for the alternate table figured out at that time.
14151 foreach my $cased (keys %caseless_equivalent_to) {
14152 my @path = $caseless_equivalent_to{$cased}->file_path;
14153 my $path = join '/', @path[1, -1];
14154 $caseless_equivalent_to{$cased} = $path;
14155 }
14156 my $caseless_equivalent_to
14157 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
14158 chomp $caseless_equivalent_to;
14159
315bfd4e
KW
14160 my $loose_property_to_file_of
14161 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
14162 chomp $loose_property_to_file_of;
14163
89cf10cc
KW
14164 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
14165 chomp $file_to_swash_name;
14166
99870f4d
KW
14167 my @heavy = <<END;
14168$HEADER
126c3d4e 14169$INTERNAL_ONLY_HEADER
d73e5302 14170
01da8b85 14171# This file is for the use of utf8_heavy.pl and Unicode::UCD
12ac2576 14172
c12f2655
KW
14173# Maps Unicode (not Perl single-form extensions) property names in loose
14174# standard form to their corresponding standard names
99870f4d 14175\%utf8::loose_property_name_of = (
143b2c48 14176$loose_property_name_of
99870f4d 14177);
12ac2576 14178
99870f4d
KW
14179# Maps property, table to file for those using stricter matching
14180\%utf8::stricter_to_file_of = (
143b2c48 14181$stricter_to_file_of
99870f4d 14182);
12ac2576 14183
99870f4d
KW
14184# Maps property, table to file for those using loose matching
14185\%utf8::loose_to_file_of = (
143b2c48 14186$loose_to_file_of
99870f4d 14187);
12ac2576 14188
99870f4d
KW
14189# Maps floating point to fractional form
14190\%utf8::nv_floating_to_rational = (
143b2c48 14191$nv_floating_to_rational
99870f4d 14192);
12ac2576 14193
99870f4d
KW
14194# If a floating point number doesn't have enough digits in it to get this
14195# close to a fraction, it isn't considered to be that fraction even if all the
14196# digits it does have match.
14197\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 14198
99870f4d
KW
14199# Deprecated tables to generate a warning for. The key is the file containing
14200# the table, so as to avoid duplication, as many property names can map to the
14201# file, but we only need one entry for all of them.
14202\%utf8::why_deprecated = (
143b2c48 14203$why_deprecated
99870f4d 14204);
12ac2576 14205
143b2c48 14206# A few properties have different behavior under /i matching. This maps
d867ccfb
KW
14207# those to substitute files to use under /i.
14208\%utf8::caseless_equivalent = (
143b2c48 14209$caseless_equivalent_to
d867ccfb
KW
14210);
14211
315bfd4e
KW
14212# Property names to mapping files
14213\%utf8::loose_property_to_file_of = (
14214$loose_property_to_file_of
14215);
14216
89cf10cc
KW
14217# Files to the swash names within them.
14218\%utf8::file_to_swash_name = (
14219$file_to_swash_name
14220);
14221
99870f4d
KW
142221;
14223END
12ac2576 14224
9218f1cf 14225 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
99870f4d 14226 return;
12ac2576
JP
14227}
14228
52dc8b5d 14229sub make_Name_pm () {
6f424f62 14230 # Create and write Name.pm, which contains subroutines and data to use in
52dc8b5d
KW
14231 # conjunction with Name.pl
14232
bb1dd3da
KW
14233 # Maybe there's nothing to do.
14234 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
14235
52dc8b5d
KW
14236 my @name = <<END;
14237$HEADER
126c3d4e 14238$INTERNAL_ONLY_HEADER
52dc8b5d 14239END
0f6f7bc2 14240
fb848dce
KW
14241 # Convert these structures to output format.
14242 my $code_points_ending_in_code_point =
14243 main::simple_dumper(\@code_points_ending_in_code_point,
14244 ' ' x 8);
14245 my $names = main::simple_dumper(\%names_ending_in_code_point,
14246 ' ' x 8);
14247 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
0f6f7bc2 14248 ' ' x 8);
0f6f7bc2 14249
fb848dce
KW
14250 # Do the same with the Hangul names,
14251 my $jamo;
14252 my $jamo_l;
14253 my $jamo_v;
14254 my $jamo_t;
14255 my $jamo_re;
14256 if ($has_hangul_syllables) {
0f6f7bc2 14257
fb848dce
KW
14258 # Construct a regular expression of all the possible
14259 # combinations of the Hangul syllables.
14260 my @L_re; # Leading consonants
14261 for my $i ($LBase .. $LBase + $LCount - 1) {
14262 push @L_re, $Jamo{$i}
14263 }
14264 my @V_re; # Middle vowels
14265 for my $i ($VBase .. $VBase + $VCount - 1) {
14266 push @V_re, $Jamo{$i}
14267 }
14268 my @T_re; # Trailing consonants
14269 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
14270 push @T_re, $Jamo{$i}
14271 }
0f6f7bc2 14272
fb848dce
KW
14273 # The whole re is made up of the L V T combination.
14274 $jamo_re = '('
14275 . join ('|', sort @L_re)
14276 . ')('
14277 . join ('|', sort @V_re)
14278 . ')('
14279 . join ('|', sort @T_re)
14280 . ')?';
0f6f7bc2 14281
fb848dce
KW
14282 # These hashes needed by the algorithm were generated
14283 # during reading of the Jamo.txt file
14284 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
14285 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
14286 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
14287 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
14288 }
0f6f7bc2 14289
6f424f62 14290 push @name, <<END;
0f6f7bc2 14291
e7a078a0
KW
14292package charnames;
14293
6f424f62
KW
14294# This module contains machine-generated tables and code for the
14295# algorithmically-determinable Unicode character names. The following
14296# routines can be used to translate between name and code point and vice versa
0f6f7bc2
KW
14297
14298{ # Closure
14299
92199589
KW
14300 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
14301 # two must be 10; if there are 5, the first must not be a 0. Written this
14302 # way to decrease backtracking. The first regex allows the code point to
14303 # be at the end of a word, but to work properly, the word shouldn't end
14304 # with a valid hex character. The second one won't match a code point at
14305 # the end of a word, and doesn't have the run-on issue
0f6f7bc2
KW
14306 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
14307 my \$code_point_re = qr/$code_point_re/;
14308
14309 # In the following hash, the keys are the bases of names which includes
14310 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
14311 # of each key is another hash which is used to get the low and high ends
14312 # for each range of code points that apply to the name.
14313 my %names_ending_in_code_point = (
14314$names
14315 );
14316
14317 # The following hash is a copy of the previous one, except is for loose
14318 # matching, so each name has blanks and dashes squeezed out
14319 my %loose_names_ending_in_code_point = (
14320$loose_names
14321 );
14322
14323 # And the following array gives the inverse mapping from code points to
14324 # names. Lowest code points are first
14325 my \@code_points_ending_in_code_point = (
14326$code_points_ending_in_code_point
14327 );
14328END
fb848dce
KW
14329 # Earlier releases didn't have Jamos. No sense outputting
14330 # them unless will be used.
14331 if ($has_hangul_syllables) {
6f424f62 14332 push @name, <<END;
0f6f7bc2
KW
14333
14334 # Convert from code point to Jamo short name for use in composing Hangul
14335 # syllable names
14336 my %Jamo = (
14337$jamo
14338 );
14339
14340 # Leading consonant (can be null)
14341 my %Jamo_L = (
14342$jamo_l
14343 );
14344
14345 # Vowel
14346 my %Jamo_V = (
14347$jamo_v
14348 );
14349
14350 # Optional trailing consonant
14351 my %Jamo_T = (
14352$jamo_t
14353 );
14354
14355 # Computed re that splits up a Hangul name into LVT or LV syllables
14356 my \$syllable_re = qr/$jamo_re/;
14357
14358 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
14359 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
14360
14361 # These constants names and values were taken from the Unicode standard,
14362 # version 5.1, section 3.12. They are used in conjunction with Hangul
14363 # syllables
14364 my \$SBase = $SBase_string;
14365 my \$LBase = $LBase_string;
14366 my \$VBase = $VBase_string;
14367 my \$TBase = $TBase_string;
14368 my \$SCount = $SCount;
14369 my \$LCount = $LCount;
14370 my \$VCount = $VCount;
14371 my \$TCount = $TCount;
14372 my \$NCount = \$VCount * \$TCount;
14373END
fb848dce 14374 } # End of has Jamos
0f6f7bc2 14375
6f424f62 14376 push @name, << 'END';
0f6f7bc2
KW
14377
14378 sub name_to_code_point_special {
14379 my ($name, $loose) = @_;
14380
14381 # Returns undef if not one of the specially handled names; otherwise
14382 # returns the code point equivalent to the input name
14383 # $loose is non-zero if to use loose matching, 'name' in that case
14384 # must be input as upper case with all blanks and dashes squeezed out.
14385END
fb848dce 14386 if ($has_hangul_syllables) {
6f424f62 14387 push @name, << 'END';
0f6f7bc2
KW
14388
14389 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
14390 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
14391 {
14392 return if $name !~ qr/^$syllable_re$/;
14393 my $L = $Jamo_L{$1};
14394 my $V = $Jamo_V{$2};
14395 my $T = (defined $3) ? $Jamo_T{$3} : 0;
14396 return ($L * $VCount + $V) * $TCount + $T + $SBase;
14397 }
14398END
fb848dce 14399 }
6f424f62 14400 push @name, << 'END';
0f6f7bc2
KW
14401
14402 # Name must end in 'code_point' for this to handle.
14403 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
14404 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
14405
14406 my $base = $1;
14407 my $code_point = CORE::hex $2;
14408 my $names_ref;
14409
14410 if ($loose) {
14411 $names_ref = \%loose_names_ending_in_code_point;
14412 }
14413 else {
14414 return if $base !~ s/-$//;
14415 $names_ref = \%names_ending_in_code_point;
14416 }
14417
14418 # Name must be one of the ones which has the code point in it.
14419 return if ! $names_ref->{$base};
14420
14421 # Look through the list of ranges that apply to this name to see if
14422 # the code point is in one of them.
14423 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
14424 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
14425 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
14426
14427 # Here, the code point is in the range.
14428 return $code_point;
14429 }
14430
14431 # Here, looked like the name had a code point number in it, but
14432 # did not match one of the valid ones.
14433 return;
14434 }
14435
14436 sub code_point_to_name_special {
14437 my $code_point = shift;
14438
14439 # Returns the name of a code point if algorithmically determinable;
14440 # undef if not
14441END
fb848dce 14442 if ($has_hangul_syllables) {
6f424f62 14443 push @name, << 'END';
0f6f7bc2
KW
14444
14445 # If in the Hangul range, calculate the name based on Unicode's
14446 # algorithm
14447 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
14448 use integer;
14449 my $SIndex = $code_point - $SBase;
14450 my $L = $LBase + $SIndex / $NCount;
14451 my $V = $VBase + ($SIndex % $NCount) / $TCount;
14452 my $T = $TBase + $SIndex % $TCount;
14453 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
14454 $name .= $Jamo{$T} if $T != $TBase;
14455 return $name;
14456 }
14457END
fb848dce 14458 }
6f424f62 14459 push @name, << 'END';
0f6f7bc2
KW
14460
14461 # Look through list of these code points for one in range.
14462 foreach my $hash (@code_points_ending_in_code_point) {
14463 return if $code_point < $hash->{'low'};
14464 if ($code_point <= $hash->{'high'}) {
14465 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
14466 }
14467 }
14468 return; # None found
14469 }
14470} # End closure
14471
6f424f62 144721;
0f6f7bc2 14473END
52dc8b5d
KW
14474
14475 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
14476 return;
14477}
14478
9f077a68
KW
14479sub make_UCD () {
14480 # Create and write UCD.pl, which passes info about the tables to
14481 # Unicode::UCD
14482
f7be2375
KW
14483 # Create a mapping from each alias of Perl single-form extensions to all
14484 # its equivalent aliases, for quick look-up.
14485 my %perlprop_to_aliases;
14486 foreach my $table ($perl->tables) {
14487
14488 # First create the list of the aliases of each extension
14489 my @aliases_list; # List of legal aliases for this extension
14490
14491 my $table_name = $table->name;
14492 my $standard_table_name = standardize($table_name);
14493 my $table_full_name = $table->full_name;
14494 my $standard_table_full_name = standardize($table_full_name);
14495
14496 # Make sure that the list has both the short and full names
14497 push @aliases_list, $table_name, $table_full_name;
14498
14499 my $found_ucd = 0; # ? Did we actually get an alias that should be
14500 # output for this table
14501
14502 # Go through all the aliases (including the two just added), and add
14503 # any new unique ones to the list
14504 foreach my $alias ($table->aliases) {
14505
14506 # Skip non-legal names
0eac1e20 14507 next unless $alias->ok_as_filename;
f7be2375
KW
14508 next unless $alias->ucd;
14509
14510 $found_ucd = 1; # have at least one legal name
14511
14512 my $name = $alias->name;
14513 my $standard = standardize($name);
14514
14515 # Don't repeat a name that is equivalent to one already on the
14516 # list
14517 next if $standard eq $standard_table_name;
14518 next if $standard eq $standard_table_full_name;
14519
14520 push @aliases_list, $name;
14521 }
14522
14523 # If there were no legal names, don't output anything.
14524 next unless $found_ucd;
14525
14526 # To conserve memory in the program reading these in, omit full names
14527 # that are identical to the short name, when those are the only two
14528 # aliases for the property.
14529 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
14530 pop @aliases_list;
14531 }
14532
14533 # Here, @aliases_list is the list of all the aliases that this
14534 # extension legally has. Now can create a map to it from each legal
14535 # standardized alias
14536 foreach my $alias ($table->aliases) {
14537 next unless $alias->ucd;
0eac1e20 14538 next unless $alias->ok_as_filename;
f7be2375
KW
14539 push @{$perlprop_to_aliases{standardize($alias->name)}},
14540 @aliases_list;
14541 }
14542 }
14543
55a40252
KW
14544 # Make a list of all combinations of properties/values that are suppressed.
14545 my @suppressed;
14546 foreach my $property_name (keys %why_suppressed) {
14547
14548 # Just the value
14549 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
14550
14551 # The hash may contain properties not in this release of Unicode
14552 next unless defined (my $property = property_ref($property_name));
14553
14554 # Find all combinations
14555 foreach my $prop_alias ($property->aliases) {
14556 my $prop_alias_name = standardize($prop_alias->name);
14557
14558 # If no =value, there's just one combination possibe for this
14559 if (! $value_name) {
14560
14561 # The property may be suppressed, but there may be a proxy for
14562 # it, so it shouldn't be listed as suppressed
14563 next if $prop_alias->ucd;
14564 push @suppressed, $prop_alias_name;
14565 }
14566 else { # Otherwise
14567 foreach my $value_alias ($property->table($value_name)->aliases)
14568 {
14569 next if $value_alias->ucd;
14570
14571 push @suppressed, "$prop_alias_name="
14572 . standardize($value_alias->name);
14573 }
14574 }
14575 }
14576 }
14577
6a40599f
KW
14578 # Convert the structure below (designed for Name.pm) to a form that UCD
14579 # wants, so it doesn't have to modify it at all; i.e. so that it includes
14580 # an element for the Hangul syllables in the appropriate place, and
14581 # otherwise changes the name to include the "-<code point>" suffix.
14582 my @algorithm_names;
14583 my $done_hangul = 0;
14584
14585 # Copy it linearly.
14586 for my $i (0 .. @code_points_ending_in_code_point - 1) {
14587
14588 # Insert the hanguls in the correct place.
14589 if (! $done_hangul
14590 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
14591 {
14592 $done_hangul = 1;
14593 push @algorithm_names, { low => $SBase,
14594 high => $SBase + $SCount - 1,
14595 name => '<hangul syllable>',
14596 };
14597 }
14598
14599 # Copy the current entry, modified.
14600 push @algorithm_names, {
14601 low => $code_points_ending_in_code_point[$i]->{'low'},
14602 high => $code_points_ending_in_code_point[$i]->{'high'},
14603 name =>
14604 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
14605 };
14606 }
14607
9e4a1e86
KW
14608 # Serialize these structures for output.
14609 my $loose_to_standard_value
14610 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
14611 chomp $loose_to_standard_value;
14612
86a52d1e
KW
14613 my $string_property_loose_to_name
14614 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
14615 chomp $string_property_loose_to_name;
14616
f7be2375
KW
14617 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
14618 chomp $perlprop_to_aliases;
14619
5d1df013
KW
14620 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
14621 chomp $prop_aliases;
14622
1e863613
KW
14623 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
14624 chomp $prop_value_aliases;
14625
55a40252
KW
14626 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
14627 chomp $suppressed;
14628
6a40599f
KW
14629 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
14630 chomp $algorithm_names;
14631
2df7880f
KW
14632 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
14633 chomp $ambiguous_names;
14634
c15fda25
KW
14635 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
14636 chomp $loose_defaults;
14637
9f077a68
KW
14638 my @ucd = <<END;
14639$HEADER
14640$INTERNAL_ONLY_HEADER
14641
14642# This file is for the use of Unicode::UCD
14643
14644# Highest legal Unicode code point
14645\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
14646
14647# Hangul syllables
14648\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
14649\$Unicode::UCD::HANGUL_COUNT = $SCount;
14650
9e4a1e86
KW
14651# Keys are all the possible "prop=value" combinations, in loose form; values
14652# are the standard loose name for the 'value' part of the key
14653\%Unicode::UCD::loose_to_standard_value = (
14654$loose_to_standard_value
14655);
14656
86a52d1e
KW
14657# String property loose names to standard loose name
14658\%Unicode::UCD::string_property_loose_to_name = (
14659$string_property_loose_to_name
14660);
14661
f7be2375
KW
14662# Keys are Perl extensions in loose form; values are each one's list of
14663# aliases
14664\%Unicode::UCD::loose_perlprop_to_name = (
14665$perlprop_to_aliases
14666);
14667
5d1df013
KW
14668# Keys are standard property name; values are each one's aliases
14669\%Unicode::UCD::prop_aliases = (
14670$prop_aliases
14671);
14672
1e863613
KW
14673# Keys of top level are standard property name; values are keys to another
14674# hash, Each one is one of the property's values, in standard form. The
14675# values are that prop-val's aliases. If only one specified, the short and
14676# long alias are identical.
14677\%Unicode::UCD::prop_value_aliases = (
14678$prop_value_aliases
14679);
14680
6a40599f
KW
14681# Ordered (by code point ordinal) list of the ranges of code points whose
14682# names are algorithmically determined. Each range entry is an anonymous hash
14683# of the start and end points and a template for the names within it.
14684\@Unicode::UCD::algorithmic_named_code_points = (
14685$algorithm_names
14686);
14687
2df7880f
KW
14688# The properties that as-is have two meanings, and which must be disambiguated
14689\%Unicode::UCD::ambiguous_names = (
14690$ambiguous_names
14691);
14692
c15fda25
KW
14693# Keys are the prop-val combinations which are the default values for the
14694# given property, expressed in standard loose form
14695\%Unicode::UCD::loose_defaults = (
14696$loose_defaults
14697);
14698
55a40252
KW
14699# All combinations of names that are suppressed.
14700# This is actually for UCD.t, so it knows which properties shouldn't have
14701# entries. If it got any bigger, would probably want to put it in its own
14702# file to use memory only when it was needed, in testing.
14703\@Unicode::UCD::suppressed_properties = (
14704$suppressed
14705);
14706
9f077a68
KW
147071;
14708END
14709
14710 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
14711 return;
14712}
52dc8b5d 14713
99870f4d
KW
14714sub write_all_tables() {
14715 # Write out all the tables generated by this program to files, as well as
14716 # the supporting data structures, pod file, and .t file.
14717
14718 my @writables; # List of tables that actually get written
14719 my %match_tables_to_write; # Used to collapse identical match tables
14720 # into one file. Each key is a hash function
14721 # result to partition tables into buckets.
14722 # Each value is an array of the tables that
14723 # fit in the bucket.
14724
14725 # For each property ...
14726 # (sort so that if there is an immutable file name, it has precedence, so
14727 # some other property can't come in and take over its file name. If b's
14728 # file name is defined, will return 1, meaning to take it first; don't
7fc6cb55
KW
14729 # care if both defined, as they had better be different anyway. And the
14730 # property named 'Perl' needs to be first (it doesn't have any immutable
14731 # file name) because empty properties are defined in terms of it's table
14732 # named 'Any'.)
99870f4d 14733 PROPERTY:
7fc6cb55
KW
14734 foreach my $property (sort { return -1 if $a == $perl;
14735 return 1 if $b == $perl;
14736 return defined $b->file
14737 } property_ref('*'))
14738 {
99870f4d
KW
14739 my $type = $property->type;
14740
14741 # And for each table for that property, starting with the mapping
14742 # table for it ...
14743 TABLE:
14744 foreach my $table($property,
14745
14746 # and all the match tables for it (if any), sorted so
14747 # the ones with the shortest associated file name come
14748 # first. The length sorting prevents problems of a
14749 # longer file taking a name that might have to be used
14750 # by a shorter one. The alphabetic sorting prevents
14751 # differences between releases
14752 sort { my $ext_a = $a->external_name;
14753 return 1 if ! defined $ext_a;
14754 my $ext_b = $b->external_name;
14755 return -1 if ! defined $ext_b;
a92d5c2e
KW
14756
14757 # But return the non-complement table before
14758 # the complement one, as the latter is defined
14759 # in terms of the former, and needs to have
14760 # the information for the former available.
14761 return 1 if $a->complement != 0;
14762 return -1 if $b->complement != 0;
14763
0a695432
KW
14764 # Similarly, return a subservient table after
14765 # a leader
14766 return 1 if $a->leader != $a;
14767 return -1 if $b->leader != $b;
14768
99870f4d
KW
14769 my $cmp = length $ext_a <=> length $ext_b;
14770
14771 # Return result if lengths not equal
14772 return $cmp if $cmp;
14773
14774 # Alphabetic if lengths equal
14775 return $ext_a cmp $ext_b
14776 } $property->tables
14777 )
14778 {
12ac2576 14779
99870f4d
KW
14780 # Here we have a table associated with a property. It could be
14781 # the map table (done first for each property), or one of the
14782 # other tables. Determine which type.
14783 my $is_property = $table->isa('Property');
14784
14785 my $name = $table->name;
14786 my $complete_name = $table->complete_name;
14787
14788 # See if should suppress the table if is empty, but warn if it
14789 # contains something.
0332277c
KW
14790 my $suppress_if_empty_warn_if_not
14791 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
99870f4d
KW
14792
14793 # Calculate if this table should have any code points associated
14794 # with it or not.
14795 my $expected_empty =
14796
14797 # $perl should be empty, as well as properties that we just
14798 # don't do anything with
14799 ($is_property
14800 && ($table == $perl
14801 || grep { $complete_name eq $_ }
14802 @unimplemented_properties
14803 )
14804 )
14805
14806 # Match tables in properties we skipped populating should be
14807 # empty
14808 || (! $is_property && ! $property->to_create_match_tables)
14809
14810 # Tables and properties that are expected to have no code
14811 # points should be empty
14812 || $suppress_if_empty_warn_if_not
14813 ;
14814
14815 # Set a boolean if this table is the complement of an empty binary
14816 # table
14817 my $is_complement_of_empty_binary =
14818 $type == $BINARY &&
14819 (($table == $property->table('Y')
14820 && $property->table('N')->is_empty)
14821 || ($table == $property->table('N')
14822 && $property->table('Y')->is_empty));
14823
99870f4d
KW
14824 if ($table->is_empty) {
14825
99870f4d 14826 if ($suppress_if_empty_warn_if_not) {
301ba948
KW
14827 $table->set_fate($SUPPRESSED,
14828 $suppress_if_empty_warn_if_not);
99870f4d 14829 }
12ac2576 14830
c12f2655 14831 # Suppress (by skipping them) expected empty tables.
99870f4d
KW
14832 next TABLE if $expected_empty;
14833
14834 # And setup to later output a warning for those that aren't
14835 # known to be allowed to be empty. Don't do the warning if
14836 # this table is a child of another one to avoid duplicating
14837 # the warning that should come from the parent one.
14838 if (($table == $property || $table->parent == $table)
301ba948 14839 && $table->fate != $SUPPRESSED
395dfc19 14840 && $table->fate != $MAP_PROXIED
99870f4d
KW
14841 && ! grep { $complete_name =~ /^$_$/ }
14842 @tables_that_may_be_empty)
14843 {
14844 push @unhandled_properties, "$table";
14845 }
7fc6cb55
KW
14846
14847 # An empty table is just the complement of everything.
14848 $table->set_complement($Any) if $table != $property;
99870f4d
KW
14849 }
14850 elsif ($expected_empty) {
14851 my $because = "";
14852 if ($suppress_if_empty_warn_if_not) {
0332277c 14853 $because = " because $suppress_if_empty_warn_if_not";
99870f4d 14854 }
12ac2576 14855
99870f4d
KW
14856 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
14857 }
12ac2576 14858
14479722
KW
14859 # Some tables should match everything
14860 my $expected_full =
1583a95b
KW
14861 ($table->fate == $SUPPRESSED)
14862 ? 0
e75669bd
KW
14863 : ($is_property)
14864 ? # All these types of map tables will be full because
14865 # they will have been populated with defaults
14866 ($type == $ENUM || $type == $FORCED_BINARY)
14867
14868 : # A match table should match everything if its method
14869 # shows it should
14870 ($table->matches_all
14871
14872 # The complement of an empty binary table will match
14873 # everything
14874 || $is_complement_of_empty_binary
14875 )
14479722
KW
14876 ;
14877
99870f4d
KW
14878 my $count = $table->count;
14879 if ($expected_full) {
14880 if ($count != $MAX_UNICODE_CODEPOINTS) {
14881 Carp::my_carp("$table matches only "
14882 . clarify_number($count)
14883 . " Unicode code points but should match "
14884 . clarify_number($MAX_UNICODE_CODEPOINTS)
14885 . " (off by "
14886 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
14887 . "). Proceeding anyway.");
14888 }
12ac2576 14889
99870f4d
KW
14890 # Here is expected to be full. If it is because it is the
14891 # complement of an (empty) binary table that is to be
14892 # suppressed, then suppress this one as well.
14893 if ($is_complement_of_empty_binary) {
14894 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
14895 my $opposing = $property->table($opposing_name);
14896 my $opposing_status = $opposing->status;
14897 if ($opposing_status) {
14898 $table->set_status($opposing_status,
14899 $opposing->status_info);
14900 }
14901 }
14902 }
14903 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
14904 if ($table == $property || $table->leader == $table) {
14905 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
14906 }
14907 }
d73e5302 14908
301ba948 14909 if ($table->fate == $SUPPRESSED) {
99870f4d
KW
14910 if (! $is_property) {
14911 my @children = $table->children;
14912 foreach my $child (@children) {
301ba948 14913 if ($child->fate != $SUPPRESSED) {
99870f4d
KW
14914 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
14915 }
14916 }
14917 }
14918 next TABLE;
d73e5302 14919
99870f4d 14920 }
2df7880f 14921
99870f4d
KW
14922 if (! $is_property) {
14923
2df7880f
KW
14924 make_ucd_table_pod_entries($table) if $table->property == $perl;
14925
99870f4d
KW
14926 # Several things need to be done just once for each related
14927 # group of match tables. Do them on the parent.
14928 if ($table->parent == $table) {
14929
14930 # Add an entry in the pod file for the table; it also does
14931 # the children.
d1476e4d 14932 make_re_pod_entries($table) if defined $pod_directory;
99870f4d
KW
14933
14934 # See if the the table matches identical code points with
14935 # something that has already been output. In that case,
14936 # no need to have two files with the same code points in
14937 # them. We use the table's hash() method to store these
14938 # in buckets, so that it is quite likely that if two
14939 # tables are in the same bucket they will be identical, so
14940 # don't have to compare tables frequently. The tables
14941 # have to have the same status to share a file, so add
14942 # this to the bucket hash. (The reason for this latter is
14943 # that Heavy.pl associates a status with a file.)
06671cbc
KW
14944 # We don't check tables that are inverses of others, as it
14945 # would lead to some coding complications, and checking
14946 # all the regular ones should find everything.
14947 if ($table->complement == 0) {
21be712a 14948 my $hash = $table->hash . ';' . $table->status;
99870f4d 14949
21be712a
KW
14950 # Look at each table that is in the same bucket as
14951 # this one would be.
14952 foreach my $comparison
14953 (@{$match_tables_to_write{$hash}})
14954 {
14955 if ($table->matches_identically_to($comparison)) {
14956 $table->set_equivalent_to($comparison,
99870f4d 14957 Related => 0);
21be712a
KW
14958 next TABLE;
14959 }
99870f4d 14960 }
d73e5302 14961
21be712a
KW
14962 # Here, not equivalent, add this table to the bucket.
14963 push @{$match_tables_to_write{$hash}}, $table;
06671cbc 14964 }
99870f4d
KW
14965 }
14966 }
14967 else {
14968
14969 # Here is the property itself.
14970 # Don't write out or make references to the $perl property
14971 next if $table == $perl;
14972
2df7880f
KW
14973 make_ucd_table_pod_entries($table);
14974
382cadab
KW
14975 # There is a mapping stored of the various synonyms to the
14976 # standardized name of the property for utf8_heavy.pl.
14977 # Also, the pod file contains entries of the form:
14978 # \p{alias: *} \p{full: *}
14979 # rather than show every possible combination of things.
99870f4d 14980
382cadab 14981 my @property_aliases = $property->aliases;
99870f4d 14982
382cadab
KW
14983 my $full_property_name = $property->full_name;
14984 my $property_name = $property->name;
14985 my $standard_property_name = standardize($property_name);
5d1df013
KW
14986 my $standard_property_full_name
14987 = standardize($full_property_name);
14988
14989 # We also create for Unicode::UCD a list of aliases for
14990 # the property. The list starts with the property name;
14991 # then its full name.
14992 my @property_list;
14993 my @standard_list;
14994 if ( $property->fate <= $MAP_PROXIED) {
14995 @property_list = ($property_name, $full_property_name);
14996 @standard_list = ($standard_property_name,
14997 $standard_property_full_name);
14998 }
99870f4d 14999
382cadab
KW
15000 # For each synonym ...
15001 for my $i (0 .. @property_aliases - 1) {
15002 my $alias = $property_aliases[$i];
15003 my $alias_name = $alias->name;
15004 my $alias_standard = standardize($alias_name);
99870f4d 15005
382cadab 15006
5d1df013
KW
15007 # Add other aliases to the list of property aliases
15008 if ($property->fate <= $MAP_PROXIED
15009 && ! grep { $alias_standard eq $_ } @standard_list)
15010 {
15011 push @property_list, $alias_name;
15012 push @standard_list, $alias_standard;
15013 }
382cadab
KW
15014
15015 # For utf8_heavy, set the mapping of the alias to the
15016 # property
86a52d1e
KW
15017 if ($type == $STRING) {
15018 if ($property->fate <= $MAP_PROXIED) {
15019 $string_property_loose_to_name{$alias_standard}
15020 = $standard_property_name;
15021 }
15022 }
15023 else {
99870f4d
KW
15024 if (exists ($loose_property_name_of{$alias_standard}))
15025 {
15026 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
15027 }
15028 else {
15029 $loose_property_name_of{$alias_standard}
15030 = $standard_property_name;
15031 }
15032
d1476e4d 15033 # Now for the re pod entry for this alias. Skip if not
23e33b60
KW
15034 # outputting a pod; skip the first one, which is the
15035 # full name so won't have an entry like: '\p{full: *}
15036 # \p{full: *}', and skip if don't want an entry for
15037 # this one.
15038 next if $i == 0
15039 || ! defined $pod_directory
33e96e72 15040 || ! $alias->make_re_pod_entry;
99870f4d 15041
01d970b5 15042 my $rhs = "\\p{$full_property_name: *}";
d57ccc9a
KW
15043 if ($property != $perl && $table->perl_extension) {
15044 $rhs .= ' (Perl extension)';
15045 }
99870f4d
KW
15046 push @match_properties,
15047 format_pod_line($indent_info_column,
15048 '\p{' . $alias->name . ': *}',
d57ccc9a 15049 $rhs,
99870f4d
KW
15050 $alias->status);
15051 }
382cadab 15052 }
d73e5302 15053
5d1df013
KW
15054 # The list of all possible names is attached to each alias, so
15055 # lookup is easy
15056 if (@property_list) {
15057 push @{$prop_aliases{$standard_list[0]}}, @property_list;
15058 }
15059
1e863613
KW
15060 if ($property->fate <= $MAP_PROXIED) {
15061
15062 # Similarly, we create for Unicode::UCD a list of
15063 # property-value aliases.
15064
15065 my $property_full_name = $property->full_name;
15066
15067 # Look at each table in the property...
15068 foreach my $table ($property->tables) {
15069 my @values_list;
15070 my $table_full_name = $table->full_name;
15071 my $standard_table_full_name
15072 = standardize($table_full_name);
15073 my $table_name = $table->name;
15074 my $standard_table_name = standardize($table_name);
15075
15076 # The list starts with the table name and its full
15077 # name.
15078 push @values_list, $table_name, $table_full_name;
15079
15080 # We add to the table each unique alias that isn't
15081 # discouraged from use.
15082 foreach my $alias ($table->aliases) {
15083 next if $alias->status
15084 && $alias->status eq $DISCOURAGED;
15085 my $name = $alias->name;
15086 my $standard = standardize($name);
15087 next if $standard eq $standard_table_name;
15088 next if $standard eq $standard_table_full_name;
15089 push @values_list, $name;
15090 }
5d1df013 15091
1e863613
KW
15092 # Here @values_list is a list of all the aliases for
15093 # the table. That is, all the property-values given
15094 # by this table. By agreement with Unicode::UCD,
15095 # if the name and full name are identical, and there
15096 # are no other names, drop the duplcate entry to save
15097 # memory.
15098 if (@values_list == 2
15099 && $values_list[0] eq $values_list[1])
15100 {
15101 pop @values_list
15102 }
15103
15104 # To save memory, unlike the similar list for property
15105 # aliases above, only the standard forms hve the list.
15106 # This forces an extra step of converting from input
15107 # name to standard name, but the savings are
15108 # considerable. (There is only marginal savings if we
15109 # did this with the property aliases.)
15110 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
15111 }
15112 }
d73e5302 15113
c12f2655 15114 # Don't write out a mapping file if not desired.
99870f4d
KW
15115 next if ! $property->to_output_map;
15116 }
d73e5302 15117
99870f4d
KW
15118 # Here, we know we want to write out the table, but don't do it
15119 # yet because there may be other tables that come along and will
15120 # want to share the file, and the file's comments will change to
15121 # mention them. So save for later.
15122 push @writables, $table;
15123
15124 } # End of looping through the property and all its tables.
15125 } # End of looping through all properties.
15126
15127 # Now have all the tables that will have files written for them. Do it.
15128 foreach my $table (@writables) {
15129 my @directory;
15130 my $filename;
15131 my $property = $table->property;
15132 my $is_property = ($table == $property);
15133 if (! $is_property) {
15134
15135 # Match tables for the property go in lib/$subdirectory, which is
15136 # the property's name. Don't use the standard file name for this,
15137 # as may get an unfamiliar alias
15138 @directory = ($matches_directory, $property->external_name);
15139 }
15140 else {
d73e5302 15141
99870f4d
KW
15142 @directory = $table->directory;
15143 $filename = $table->file;
15144 }
d73e5302 15145
98dc9551 15146 # Use specified filename if available, or default to property's
99870f4d
KW
15147 # shortest name. We need an 8.3 safe filename (which means "an 8
15148 # safe" filename, since after the dot is only 'pl', which is < 3)
15149 # The 2nd parameter is if the filename shouldn't be changed, and
15150 # it shouldn't iff there is a hard-coded name for this table.
15151 $filename = construct_filename(
15152 $filename || $table->external_name,
15153 ! $filename, # mutable if no filename
15154 \@directory);
d73e5302 15155
99870f4d 15156 register_file_for_name($table, \@directory, $filename);
d73e5302 15157
99870f4d
KW
15158 # Only need to write one file when shared by more than one
15159 # property
a92d5c2e
KW
15160 next if ! $is_property
15161 && ($table->leader != $table || $table->complement != 0);
d73e5302 15162
99870f4d
KW
15163 # Construct a nice comment to add to the file
15164 $table->set_final_comment;
15165
15166 $table->write;
cf25bb62 15167 }
d73e5302 15168
d73e5302 15169
99870f4d
KW
15170 # Write out the pod file
15171 make_pod;
15172
9f077a68 15173 # And Heavy.pl, Name.pm, UCD.pl
99870f4d 15174 make_Heavy;
52dc8b5d 15175 make_Name_pm;
9f077a68 15176 make_UCD;
d73e5302 15177
99870f4d
KW
15178 make_property_test_script() if $make_test_script;
15179 return;
cf25bb62 15180}
d73e5302 15181
99870f4d
KW
15182my @white_space_separators = ( # This used only for making the test script.
15183 "",
15184 ' ',
15185 "\t",
15186 ' '
15187 );
d73e5302 15188
99870f4d
KW
15189sub generate_separator($) {
15190 # This used only for making the test script. It generates the colon or
15191 # equal separator between the property and property value, with random
15192 # white space surrounding the separator
d73e5302 15193
99870f4d 15194 my $lhs = shift;
d73e5302 15195
99870f4d 15196 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 15197
99870f4d
KW
15198 # Choose space before and after randomly
15199 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
15200 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 15201
99870f4d
KW
15202 # And return the whole complex, half the time using a colon, half the
15203 # equals
15204 return $spaces_before
15205 . (rand() < 0.5) ? '=' : ':'
15206 . $spaces_after;
15207}
76ccdbe2 15208
430ada4c 15209sub generate_tests($$$$$) {
99870f4d
KW
15210 # This used only for making the test script. It generates test cases that
15211 # are expected to compile successfully in perl. Note that the lhs and
15212 # rhs are assumed to already be as randomized as the caller wants.
15213
99870f4d
KW
15214 my $lhs = shift; # The property: what's to the left of the colon
15215 # or equals separator
15216 my $rhs = shift; # The property value; what's to the right
15217 my $valid_code = shift; # A code point that's known to be in the
15218 # table given by lhs=rhs; undef if table is
15219 # empty
15220 my $invalid_code = shift; # A code point known to not be in the table;
15221 # undef if the table is all code points
15222 my $warning = shift;
15223
15224 # Get the colon or equal
15225 my $separator = generate_separator($lhs);
15226
15227 # The whole 'property=value'
15228 my $name = "$lhs$separator$rhs";
15229
430ada4c 15230 my @output;
99870f4d
KW
15231 # Create a complete set of tests, with complements.
15232 if (defined $valid_code) {
430ada4c
NC
15233 push @output, <<"EOC"
15234Expect(1, $valid_code, '\\p{$name}', $warning);
15235Expect(0, $valid_code, '\\p{^$name}', $warning);
15236Expect(0, $valid_code, '\\P{$name}', $warning);
15237Expect(1, $valid_code, '\\P{^$name}', $warning);
15238EOC
99870f4d
KW
15239 }
15240 if (defined $invalid_code) {
430ada4c
NC
15241 push @output, <<"EOC"
15242Expect(0, $invalid_code, '\\p{$name}', $warning);
15243Expect(1, $invalid_code, '\\p{^$name}', $warning);
15244Expect(1, $invalid_code, '\\P{$name}', $warning);
15245Expect(0, $invalid_code, '\\P{^$name}', $warning);
15246EOC
15247 }
15248 return @output;
99870f4d 15249}
cf25bb62 15250
430ada4c 15251sub generate_error($$$) {
99870f4d
KW
15252 # This used only for making the test script. It generates test cases that
15253 # are expected to not only not match, but to be syntax or similar errors
15254
99870f4d
KW
15255 my $lhs = shift; # The property: what's to the left of the
15256 # colon or equals separator
15257 my $rhs = shift; # The property value; what's to the right
15258 my $already_in_error = shift; # Boolean; if true it's known that the
15259 # unmodified lhs and rhs will cause an error.
15260 # This routine should not force another one
15261 # Get the colon or equal
15262 my $separator = generate_separator($lhs);
15263
15264 # Since this is an error only, don't bother to randomly decide whether to
15265 # put the error on the left or right side; and assume that the rhs is
15266 # loosely matched, again for convenience rather than rigor.
15267 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
15268
15269 my $property = $lhs . $separator . $rhs;
15270
430ada4c
NC
15271 return <<"EOC";
15272Error('\\p{$property}');
15273Error('\\P{$property}');
15274EOC
d73e5302
JH
15275}
15276
99870f4d
KW
15277# These are used only for making the test script
15278# XXX Maybe should also have a bad strict seps, which includes underscore.
15279
15280my @good_loose_seps = (
15281 " ",
15282 "-",
15283 "\t",
15284 "",
15285 "_",
15286 );
15287my @bad_loose_seps = (
15288 "/a/",
15289 ':=',
15290 );
15291
15292sub randomize_stricter_name {
15293 # This used only for making the test script. Take the input name and
15294 # return a randomized, but valid version of it under the stricter matching
15295 # rules.
15296
15297 my $name = shift;
15298 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15299
15300 # If the name looks like a number (integer, floating, or rational), do
15301 # some extra work
15302 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
15303 my $sign = $1;
15304 my $number = $2;
15305 my $separator = $3;
15306
15307 # If there isn't a sign, part of the time add a plus
15308 # Note: Not testing having any denominator having a minus sign
15309 if (! $sign) {
15310 $sign = '+' if rand() <= .3;
15311 }
15312
15313 # And add 0 or more leading zeros.
15314 $name = $sign . ('0' x int rand(10)) . $number;
15315
15316 if (defined $separator) {
15317 my $extra_zeros = '0' x int rand(10);
cf25bb62 15318
99870f4d
KW
15319 if ($separator eq '.') {
15320
15321 # Similarly, add 0 or more trailing zeros after a decimal
15322 # point
15323 $name .= $extra_zeros;
15324 }
15325 else {
15326
15327 # Or, leading zeros before the denominator
15328 $name =~ s,/,/$extra_zeros,;
15329 }
15330 }
cf25bb62 15331 }
d73e5302 15332
99870f4d
KW
15333 # For legibility of the test, only change the case of whole sections at a
15334 # time. To do this, first split into sections. The split returns the
15335 # delimiters
15336 my @sections;
15337 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
15338 trace $section if main::DEBUG && $to_trace;
15339
15340 if (length $section > 1 && $section !~ /\D/) {
15341
15342 # If the section is a sequence of digits, about half the time
15343 # randomly add underscores between some of them.
15344 if (rand() > .5) {
15345
15346 # Figure out how many underscores to add. max is 1 less than
15347 # the number of digits. (But add 1 at the end to make sure
15348 # result isn't 0, and compensate earlier by subtracting 2
15349 # instead of 1)
15350 my $num_underscores = int rand(length($section) - 2) + 1;
15351
15352 # And add them evenly throughout, for convenience, not rigor
15353 use integer;
15354 my $spacing = (length($section) - 1)/ $num_underscores;
15355 my $temp = $section;
15356 $section = "";
15357 for my $i (1 .. $num_underscores) {
15358 $section .= substr($temp, 0, $spacing, "") . '_';
15359 }
15360 $section .= $temp;
15361 }
15362 push @sections, $section;
15363 }
15364 else {
d73e5302 15365
99870f4d
KW
15366 # Here not a sequence of digits. Change the case of the section
15367 # randomly
15368 my $switch = int rand(4);
15369 if ($switch == 0) {
15370 push @sections, uc $section;
15371 }
15372 elsif ($switch == 1) {
15373 push @sections, lc $section;
15374 }
15375 elsif ($switch == 2) {
15376 push @sections, ucfirst $section;
15377 }
15378 else {
15379 push @sections, $section;
15380 }
15381 }
cf25bb62 15382 }
99870f4d
KW
15383 trace "returning", join "", @sections if main::DEBUG && $to_trace;
15384 return join "", @sections;
15385}
71d929cb 15386
99870f4d
KW
15387sub randomize_loose_name($;$) {
15388 # This used only for making the test script
71d929cb 15389
99870f4d
KW
15390 my $name = shift;
15391 my $want_error = shift; # if true, make an error
15392 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15393
15394 $name = randomize_stricter_name($name);
5beb625e
JH
15395
15396 my @parts;
99870f4d 15397 push @parts, $good_loose_seps[rand(@good_loose_seps)];
45376db6
KW
15398
15399 # Preserve trailing ones for the sake of not stripping the underscore from
15400 # 'L_'
15401 for my $part (split /[-\s_]+ (?= . )/, $name) {
5beb625e 15402 if (@parts) {
99870f4d
KW
15403 if ($want_error and rand() < 0.3) {
15404 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
15405 $want_error = 0;
15406 }
15407 else {
15408 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
15409 }
15410 }
99870f4d 15411 push @parts, $part;
5beb625e 15412 }
99870f4d
KW
15413 my $new = join("", @parts);
15414 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 15415
99870f4d 15416 if ($want_error) {
5beb625e 15417 if (rand() >= 0.5) {
99870f4d
KW
15418 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
15419 }
15420 else {
15421 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
15422 }
15423 }
15424 return $new;
15425}
15426
99870f4d
KW
15427# Used to make sure don't generate duplicate test cases.
15428my %test_generated;
5beb625e 15429
99870f4d
KW
15430sub make_property_test_script() {
15431 # This used only for making the test script
15432 # this written directly -- it's huge.
5beb625e 15433
99870f4d 15434 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 15435
99870f4d
KW
15436 # This uses randomness to test different possibilities without testing all
15437 # possibilities. To ensure repeatability, set the seed to 0. But if
15438 # tests are added, it will perturb all later ones in the .t file
15439 srand 0;
5beb625e 15440
3df51b85
KW
15441 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
15442
99870f4d
KW
15443 # Keep going down an order of magnitude
15444 # until find that adding this quantity to
15445 # 1 remains 1; but put an upper limit on
15446 # this so in case this algorithm doesn't
15447 # work properly on some platform, that we
15448 # won't loop forever.
15449 my $digits = 0;
15450 my $min_floating_slop = 1;
15451 while (1+ $min_floating_slop != 1
15452 && $digits++ < 50)
5beb625e 15453 {
99870f4d
KW
15454 my $next = $min_floating_slop / 10;
15455 last if $next == 0; # If underflows,
15456 # use previous one
15457 $min_floating_slop = $next;
5beb625e 15458 }
430ada4c
NC
15459
15460 # It doesn't matter whether the elements of this array contain single lines
15461 # or multiple lines. main::write doesn't count the lines.
15462 my @output;
99870f4d
KW
15463
15464 foreach my $property (property_ref('*')) {
15465 foreach my $table ($property->tables) {
15466
15467 # Find code points that match, and don't match this table.
15468 my $valid = $table->get_valid_code_point;
15469 my $invalid = $table->get_invalid_code_point;
15470 my $warning = ($table->status eq $DEPRECATED)
15471 ? "'deprecated'"
15472 : '""';
15473
15474 # Test each possible combination of the property's aliases with
15475 # the table's. If this gets to be too many, could do what is done
15476 # in the set_final_comment() for Tables
15477 my @table_aliases = $table->aliases;
15478 my @property_aliases = $table->property->aliases;
807807b7
KW
15479
15480 # Every property can be optionally be prefixed by 'Is_', so test
15481 # that those work, by creating such a new alias for each
15482 # pre-existing one.
15483 push @property_aliases, map { Alias->new("Is_" . $_->name,
15484 $_->loose_match,
33e96e72 15485 $_->make_re_pod_entry,
0eac1e20 15486 $_->ok_as_filename,
fd1e3e84
KW
15487 $_->status,
15488 $_->ucd,
15489 )
807807b7 15490 } @property_aliases;
99870f4d
KW
15491 my $max = max(scalar @table_aliases, scalar @property_aliases);
15492 for my $j (0 .. $max - 1) {
15493
15494 # The current alias for property is the next one on the list,
15495 # or if beyond the end, start over. Similarly for table
15496 my $property_name
15497 = $property_aliases[$j % @property_aliases]->name;
15498
15499 $property_name = "" if $table->property == $perl;
15500 my $table_alias = $table_aliases[$j % @table_aliases];
15501 my $table_name = $table_alias->name;
15502 my $loose_match = $table_alias->loose_match;
15503
15504 # If the table doesn't have a file, any test for it is
15505 # already guaranteed to be in error
15506 my $already_error = ! $table->file_path;
15507
15508 # Generate error cases for this alias.
430ada4c
NC
15509 push @output, generate_error($property_name,
15510 $table_name,
15511 $already_error);
99870f4d
KW
15512
15513 # If the table is guaranteed to always generate an error,
15514 # quit now without generating success cases.
15515 next if $already_error;
15516
15517 # Now for the success cases.
15518 my $random;
15519 if ($loose_match) {
15520
15521 # For loose matching, create an extra test case for the
15522 # standard name.
15523 my $standard = standardize($table_name);
15524
15525 # $test_name should be a unique combination for each test
15526 # case; used just to avoid duplicate tests
15527 my $test_name = "$property_name=$standard";
15528
15529 # Don't output duplicate test cases.
15530 if (! exists $test_generated{$test_name}) {
15531 $test_generated{$test_name} = 1;
430ada4c
NC
15532 push @output, generate_tests($property_name,
15533 $standard,
15534 $valid,
15535 $invalid,
15536 $warning,
15537 );
5beb625e 15538 }
99870f4d
KW
15539 $random = randomize_loose_name($table_name)
15540 }
15541 else { # Stricter match
15542 $random = randomize_stricter_name($table_name);
99598c8c 15543 }
99598c8c 15544
99870f4d
KW
15545 # Now for the main test case for this alias.
15546 my $test_name = "$property_name=$random";
15547 if (! exists $test_generated{$test_name}) {
15548 $test_generated{$test_name} = 1;
430ada4c
NC
15549 push @output, generate_tests($property_name,
15550 $random,
15551 $valid,
15552 $invalid,
15553 $warning,
15554 );
99870f4d
KW
15555
15556 # If the name is a rational number, add tests for the
15557 # floating point equivalent.
15558 if ($table_name =~ qr{/}) {
15559
15560 # Calculate the float, and find just the fraction.
15561 my $float = eval $table_name;
15562 my ($whole, $fraction)
15563 = $float =~ / (.*) \. (.*) /x;
15564
15565 # Starting with one digit after the decimal point,
15566 # create a test for each possible precision (number of
15567 # digits past the decimal point) until well beyond the
15568 # native number found on this machine. (If we started
15569 # with 0 digits, it would be an integer, which could
15570 # well match an unrelated table)
15571 PLACE:
15572 for my $i (1 .. $min_floating_slop + 3) {
15573 my $table_name = sprintf("%.*f", $i, $float);
15574 if ($i < $MIN_FRACTION_LENGTH) {
15575
15576 # If the test case has fewer digits than the
15577 # minimum acceptable precision, it shouldn't
15578 # succeed, so we expect an error for it.
15579 # E.g., 2/3 = .7 at one decimal point, and we
15580 # shouldn't say it matches .7. We should make
15581 # it be .667 at least before agreeing that the
15582 # intent was to match 2/3. But at the
15583 # less-than- acceptable level of precision, it
15584 # might actually match an unrelated number.
15585 # So don't generate a test case if this
15586 # conflating is possible. In our example, we
15587 # don't want 2/3 matching 7/10, if there is
15588 # a 7/10 code point.
15589 for my $existing
15590 (keys %nv_floating_to_rational)
15591 {
15592 next PLACE
15593 if abs($table_name - $existing)
15594 < $MAX_FLOATING_SLOP;
15595 }
430ada4c
NC
15596 push @output, generate_error($property_name,
15597 $table_name,
15598 1 # 1 => already an error
15599 );
99870f4d
KW
15600 }
15601 else {
15602
15603 # Here the number of digits exceeds the
15604 # minimum we think is needed. So generate a
15605 # success test case for it.
430ada4c
NC
15606 push @output, generate_tests($property_name,
15607 $table_name,
15608 $valid,
15609 $invalid,
15610 $warning,
15611 );
99870f4d
KW
15612 }
15613 }
99598c8c
JH
15614 }
15615 }
99870f4d
KW
15616 }
15617 }
15618 }
37e2e78e 15619
9218f1cf
KW
15620 &write($t_path,
15621 0, # Not utf8;
15622 [<DATA>,
15623 @output,
15624 (map {"Test_X('$_');\n"} @backslash_X_tests),
15625 "Finished();\n"]);
99870f4d
KW
15626 return;
15627}
99598c8c 15628
99870f4d
KW
15629# This is a list of the input files and how to handle them. The files are
15630# processed in their order in this list. Some reordering is possible if
15631# desired, but the v0 files should be first, and the extracted before the
15632# others except DAge.txt (as data in an extracted file can be over-ridden by
15633# the non-extracted. Some other files depend on data derived from an earlier
15634# file, like UnicodeData requires data from Jamo, and the case changing and
15635# folding requires data from Unicode. Mostly, it safest to order by first
15636# version releases in (except the Jamo). DAge.txt is read before the
15637# extracted ones because of the rarely used feature $compare_versions. In the
15638# unlikely event that there were ever an extracted file that contained the Age
15639# property information, it would have to go in front of DAge.
15640#
15641# The version strings allow the program to know whether to expect a file or
15642# not, but if a file exists in the directory, it will be processed, even if it
15643# is in a version earlier than expected, so you can copy files from a later
15644# release into an earlier release's directory.
15645my @input_file_objects = (
15646 Input_file->new('PropertyAliases.txt', v0,
15647 Handler => \&process_PropertyAliases,
15648 ),
15649 Input_file->new(undef, v0, # No file associated with this
3df51b85 15650 Progress_Message => 'Finishing property setup',
99870f4d
KW
15651 Handler => \&finish_property_setup,
15652 ),
15653 Input_file->new('PropValueAliases.txt', v0,
15654 Handler => \&process_PropValueAliases,
15655 Has_Missings_Defaults => $NOT_IGNORED,
15656 ),
15657 Input_file->new('DAge.txt', v3.2.0,
15658 Has_Missings_Defaults => $NOT_IGNORED,
15659 Property => 'Age'
15660 ),
15661 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
15662 Property => 'General_Category',
15663 ),
15664 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
15665 Property => 'Canonical_Combining_Class',
15666 Has_Missings_Defaults => $NOT_IGNORED,
15667 ),
15668 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
15669 Property => 'Numeric_Type',
15670 Has_Missings_Defaults => $NOT_IGNORED,
15671 ),
15672 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
15673 Property => 'East_Asian_Width',
15674 Has_Missings_Defaults => $NOT_IGNORED,
15675 ),
15676 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
15677 Property => 'Line_Break',
15678 Has_Missings_Defaults => $NOT_IGNORED,
15679 ),
15680 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
15681 Property => 'Bidi_Class',
15682 Has_Missings_Defaults => $NOT_IGNORED,
15683 ),
15684 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
15685 Property => 'Decomposition_Type',
15686 Has_Missings_Defaults => $NOT_IGNORED,
15687 ),
15688 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
15689 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
15690 Property => 'Numeric_Value',
15691 Each_Line_Handler => \&filter_numeric_value_line,
15692 Has_Missings_Defaults => $NOT_IGNORED,
15693 ),
15694 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
15695 Property => 'Joining_Group',
15696 Has_Missings_Defaults => $NOT_IGNORED,
15697 ),
15698
15699 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
15700 Property => 'Joining_Type',
15701 Has_Missings_Defaults => $NOT_IGNORED,
15702 ),
15703 Input_file->new('Jamo.txt', v2.0.0,
15704 Property => 'Jamo_Short_Name',
15705 Each_Line_Handler => \&filter_jamo_line,
15706 ),
15707 Input_file->new('UnicodeData.txt', v1.1.5,
15708 Pre_Handler => \&setup_UnicodeData,
15709
15710 # We clean up this file for some early versions.
15711 Each_Line_Handler => [ (($v_version lt v2.0.0 )
15712 ? \&filter_v1_ucd
15713 : ($v_version eq v2.1.5)
15714 ? \&filter_v2_1_5_ucd
3ffed8c2
KW
15715
15716 # And for 5.14 Perls with 6.0,
15717 # have to also make changes
15718 : ($v_version ge v6.0.0)
15719 ? \&filter_v6_ucd
15720 : undef),
99870f4d
KW
15721
15722 # And the main filter
15723 \&filter_UnicodeData_line,
15724 ],
15725 EOF_Handler => \&EOF_UnicodeData,
15726 ),
15727 Input_file->new('ArabicShaping.txt', v2.0.0,
15728 Each_Line_Handler =>
15729 [ ($v_version lt 4.1.0)
15730 ? \&filter_old_style_arabic_shaping
15731 : undef,
15732 \&filter_arabic_shaping_line,
15733 ],
15734 Has_Missings_Defaults => $NOT_IGNORED,
15735 ),
15736 Input_file->new('Blocks.txt', v2.0.0,
15737 Property => 'Block',
15738 Has_Missings_Defaults => $NOT_IGNORED,
15739 Each_Line_Handler => \&filter_blocks_lines
15740 ),
15741 Input_file->new('PropList.txt', v2.0.0,
15742 Each_Line_Handler => (($v_version lt v3.1.0)
15743 ? \&filter_old_style_proplist
15744 : undef),
15745 ),
15746 Input_file->new('Unihan.txt', v2.0.0,
15747 Pre_Handler => \&setup_unihan,
15748 Optional => 1,
15749 Each_Line_Handler => \&filter_unihan_line,
15750 ),
15751 Input_file->new('SpecialCasing.txt', v2.1.8,
15752 Each_Line_Handler => \&filter_special_casing_line,
15753 Pre_Handler => \&setup_special_casing,
dbf17f82 15754 Has_Missings_Defaults => $IGNORED,
99870f4d
KW
15755 ),
15756 Input_file->new(
15757 'LineBreak.txt', v3.0.0,
15758 Has_Missings_Defaults => $NOT_IGNORED,
15759 Property => 'Line_Break',
15760 # Early versions had problematic syntax
15761 Each_Line_Handler => (($v_version lt v3.1.0)
15762 ? \&filter_early_ea_lb
15763 : undef),
15764 ),
15765 Input_file->new('EastAsianWidth.txt', v3.0.0,
15766 Property => 'East_Asian_Width',
15767 Has_Missings_Defaults => $NOT_IGNORED,
15768 # Early versions had problematic syntax
15769 Each_Line_Handler => (($v_version lt v3.1.0)
15770 ? \&filter_early_ea_lb
15771 : undef),
15772 ),
15773 Input_file->new('CompositionExclusions.txt', v3.0.0,
15774 Property => 'Composition_Exclusion',
15775 ),
15776 Input_file->new('BidiMirroring.txt', v3.0.1,
15777 Property => 'Bidi_Mirroring_Glyph',
15778 ),
37e2e78e 15779 Input_file->new("NormalizationTest.txt", v3.0.1,
09ca89ce 15780 Skip => 'Validation Tests',
37e2e78e 15781 ),
99870f4d
KW
15782 Input_file->new('CaseFolding.txt', v3.0.1,
15783 Pre_Handler => \&setup_case_folding,
15784 Each_Line_Handler =>
15785 [ ($v_version lt v3.1.0)
15786 ? \&filter_old_style_case_folding
15787 : undef,
15788 \&filter_case_folding_line
15789 ],
dbf17f82 15790 Has_Missings_Defaults => $IGNORED,
99870f4d
KW
15791 ),
15792 Input_file->new('DCoreProperties.txt', v3.1.0,
15793 # 5.2 changed this file
15794 Has_Missings_Defaults => (($v_version ge v5.2.0)
15795 ? $NOT_IGNORED
15796 : $NO_DEFAULTS),
15797 ),
15798 Input_file->new('Scripts.txt', v3.1.0,
15799 Property => 'Script',
15800 Has_Missings_Defaults => $NOT_IGNORED,
15801 ),
15802 Input_file->new('DNormalizationProps.txt', v3.1.0,
15803 Has_Missings_Defaults => $NOT_IGNORED,
15804 Each_Line_Handler => (($v_version lt v4.0.1)
15805 ? \&filter_old_style_normalization_lines
15806 : undef),
15807 ),
15808 Input_file->new('HangulSyllableType.txt', v4.0.0,
15809 Has_Missings_Defaults => $NOT_IGNORED,
15810 Property => 'Hangul_Syllable_Type'),
15811 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
15812 Property => 'Word_Break',
15813 Has_Missings_Defaults => $NOT_IGNORED,
15814 ),
15815 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
15816 Property => 'Grapheme_Cluster_Break',
15817 Has_Missings_Defaults => $NOT_IGNORED,
15818 ),
37e2e78e
KW
15819 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
15820 Handler => \&process_GCB_test,
15821 ),
15822 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
09ca89ce 15823 Skip => 'Validation Tests',
37e2e78e
KW
15824 ),
15825 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
09ca89ce 15826 Skip => 'Validation Tests',
37e2e78e
KW
15827 ),
15828 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
09ca89ce 15829 Skip => 'Validation Tests',
37e2e78e 15830 ),
99870f4d
KW
15831 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
15832 Property => 'Sentence_Break',
15833 Has_Missings_Defaults => $NOT_IGNORED,
15834 ),
15835 Input_file->new('NamedSequences.txt', v4.1.0,
15836 Handler => \&process_NamedSequences
15837 ),
15838 Input_file->new('NameAliases.txt', v5.0.0,
15839 Property => 'Name_Alias',
dcd72625 15840 Pre_Handler => ($v_version ge v6.0.0)
ce432655 15841 ? \&setup_early_name_alias
dcd72625 15842 : undef,
58b75e36
KW
15843 Each_Line_Handler =>
15844 \&filter_early_version_name_alias_line,
99870f4d 15845 ),
37e2e78e 15846 Input_file->new("BidiTest.txt", v5.2.0,
09ca89ce 15847 Skip => 'Validation Tests',
37e2e78e 15848 ),
99870f4d
KW
15849 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
15850 Optional => 1,
15851 Each_Line_Handler => \&filter_unihan_line,
15852 ),
15853 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
15854 Optional => 1,
15855 Each_Line_Handler => \&filter_unihan_line,
15856 ),
15857 Input_file->new('UnihanIRGSources.txt', v5.2.0,
15858 Optional => 1,
15859 Pre_Handler => \&setup_unihan,
15860 Each_Line_Handler => \&filter_unihan_line,
15861 ),
15862 Input_file->new('UnihanNumericValues.txt', v5.2.0,
15863 Optional => 1,
15864 Each_Line_Handler => \&filter_unihan_line,
15865 ),
15866 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
15867 Optional => 1,
15868 Each_Line_Handler => \&filter_unihan_line,
15869 ),
15870 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
15871 Optional => 1,
15872 Each_Line_Handler => \&filter_unihan_line,
15873 ),
15874 Input_file->new('UnihanReadings.txt', v5.2.0,
15875 Optional => 1,
15876 Each_Line_Handler => \&filter_unihan_line,
15877 ),
15878 Input_file->new('UnihanVariants.txt', v5.2.0,
15879 Optional => 1,
15880 Each_Line_Handler => \&filter_unihan_line,
15881 ),
82aed44a
KW
15882 Input_file->new('ScriptExtensions.txt', v6.0.0,
15883 Property => 'Script_Extensions',
15884 Pre_Handler => \&setup_script_extensions,
fbe1e607 15885 Each_Line_Handler => \&filter_script_extensions_line,
4fec90df
KW
15886 Has_Missings_Defaults => (($v_version le v6.0.0)
15887 ? $NO_DEFAULTS
15888 : $IGNORED),
82aed44a 15889 ),
3111abc0
KW
15890 # The two Indic files are actually available starting in v6.0.0, but their
15891 # property values are missing from PropValueAliases.txt in that release,
15892 # so that further work would have to be done to get them to work properly
15893 # for that release.
15894 Input_file->new('IndicMatraCategory.txt', v6.1.0,
15895 Property => 'Indic_Matra_Category',
15896 Has_Missings_Defaults => $NOT_IGNORED,
15897 Skip => "Provisional; for the analysis and processing of Indic scripts",
15898 ),
15899 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
15900 Property => 'Indic_Syllabic_Category',
15901 Has_Missings_Defaults => $NOT_IGNORED,
15902 Skip => "Provisional; for the analysis and processing of Indic scripts",
15903 ),
99870f4d 15904);
99598c8c 15905
99870f4d
KW
15906# End of all the preliminaries.
15907# Do it...
99598c8c 15908
99870f4d
KW
15909if ($compare_versions) {
15910 Carp::my_carp(<<END
15911Warning. \$compare_versions is set. Output is not suitable for production
15912END
15913 );
15914}
99598c8c 15915
99870f4d
KW
15916# Put into %potential_files a list of all the files in the directory structure
15917# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 15918# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
15919my @ignored_files_full_names = map { File::Spec->rel2abs(
15920 internal_file_to_platform($_))
15921 } keys %ignored_files;
15922File::Find::find({
15923 wanted=>sub {
37e2e78e 15924 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 15925 my $full = lc(File::Spec->rel2abs($_));
99870f4d 15926 $potential_files{$full} = 1
37e2e78e 15927 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
15928 return;
15929 }
15930}, File::Spec->curdir());
99598c8c 15931
99870f4d 15932my @mktables_list_output_files;
cdcef19a 15933my $old_start_time = 0;
cf25bb62 15934
3644ba60
KW
15935if (! -e $file_list) {
15936 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
15937 $write_unchanged_files = 1;
15938} elsif ($write_unchanged_files) {
99870f4d
KW
15939 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
15940}
15941else {
15942 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
15943 my $file_handle;
23e33b60 15944 if (! open $file_handle, "<", $file_list) {
3644ba60 15945 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
15946 $glob_list = 1;
15947 }
15948 else {
15949 my @input;
15950
15951 # Read and parse mktables.lst, placing the results from the first part
15952 # into @input, and the second part into @mktables_list_output_files
15953 for my $list ( \@input, \@mktables_list_output_files ) {
15954 while (<$file_handle>) {
15955 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
15956 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
15957 $old_start_time = $1;
15958 }
99870f4d
KW
15959 next if /^ \s* (?: \# .* )? $/x;
15960 last if /^ =+ $/x;
15961 my ( $file ) = split /\t/;
15962 push @$list, $file;
cf25bb62 15963 }
99870f4d
KW
15964 @$list = uniques(@$list);
15965 next;
cf25bb62
JH
15966 }
15967
99870f4d
KW
15968 # Look through all the input files
15969 foreach my $input (@input) {
15970 next if $input eq 'version'; # Already have checked this.
cf25bb62 15971
99870f4d
KW
15972 # Ignore if doesn't exist. The checking about whether we care or
15973 # not is done via the Input_file object.
15974 next if ! file_exists($input);
5beb625e 15975
99870f4d
KW
15976 # The paths are stored with relative names, and with '/' as the
15977 # delimiter; convert to absolute on this machine
517956bf 15978 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
faf3cf6b
KW
15979 $potential_files{lc $full} = 1
15980 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 15981 }
5beb625e 15982 }
cf25bb62 15983
99870f4d
KW
15984 close $file_handle;
15985}
15986
15987if ($glob_list) {
15988
15989 # Here wants to process all .txt files in the directory structure.
15990 # Convert them to full path names. They are stored in the platform's
15991 # relative style
f86864ac
KW
15992 my @known_files;
15993 foreach my $object (@input_file_objects) {
15994 my $file = $object->file;
15995 next unless defined $file;
15996 push @known_files, File::Spec->rel2abs($file);
15997 }
99870f4d
KW
15998
15999 my @unknown_input_files;
faf3cf6b
KW
16000 foreach my $file (keys %potential_files) { # The keys are stored in lc
16001 next if grep { $file eq lc($_) } @known_files;
99870f4d
KW
16002
16003 # Here, the file is unknown to us. Get relative path name
16004 $file = File::Spec->abs2rel($file);
16005 push @unknown_input_files, $file;
16006
16007 # What will happen is we create a data structure for it, and add it to
16008 # the list of input files to process. First get the subdirectories
16009 # into an array
16010 my (undef, $directories, undef) = File::Spec->splitpath($file);
16011 $directories =~ s;/$;;; # Can have extraneous trailing '/'
16012 my @directories = File::Spec->splitdir($directories);
16013
16014 # If the file isn't extracted (meaning none of the directories is the
16015 # extracted one), just add it to the end of the list of inputs.
16016 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 16017 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
16018 }
16019 else {
16020
16021 # Here, the file is extracted. It needs to go ahead of most other
16022 # processing. Search for the first input file that isn't a
16023 # special required property (that is, find one whose first_release
16024 # is non-0), and isn't extracted. Also, the Age property file is
16025 # processed before the extracted ones, just in case
16026 # $compare_versions is set.
16027 for (my $i = 0; $i < @input_file_objects; $i++) {
16028 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
16029 && lc($input_file_objects[$i]->file) ne 'dage.txt'
16030 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 16031 {
99f78760 16032 splice @input_file_objects, $i, 0,
37e2e78e 16033 Input_file->new($file, v0);
99870f4d
KW
16034 last;
16035 }
cf25bb62 16036 }
99870f4d 16037
cf25bb62 16038 }
d2d499f5 16039 }
99870f4d 16040 if (@unknown_input_files) {
23e33b60 16041 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
16042
16043The following files are unknown as to how to handle. Assuming they are
16044typical property files. You'll know by later error messages if it worked or
16045not:
16046END
99f78760 16047 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
16048 }
16049} # End of looking through directory structure for more .txt files.
5beb625e 16050
99870f4d
KW
16051# Create the list of input files from the objects we have defined, plus
16052# version
16053my @input_files = 'version';
16054foreach my $object (@input_file_objects) {
16055 my $file = $object->file;
16056 next if ! defined $file; # Not all objects have files
16057 next if $object->optional && ! -e $file;
16058 push @input_files, $file;
16059}
5beb625e 16060
99870f4d
KW
16061if ( $verbosity >= $VERBOSE ) {
16062 print "Expecting ".scalar( @input_files )." input files. ",
16063 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
16064}
cf25bb62 16065
aeab6150
KW
16066# We set $most_recent to be the most recently changed input file, including
16067# this program itself (done much earlier in this file)
99870f4d 16068foreach my $in (@input_files) {
cdcef19a
KW
16069 next unless -e $in; # Keep going even if missing a file
16070 my $mod_time = (stat $in)[9];
aeab6150 16071 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
16072
16073 # See that the input files have distinct names, to warn someone if they
16074 # are adding a new one
16075 if ($make_list) {
16076 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
16077 $directories =~ s;/$;;; # Can have extraneous trailing '/'
16078 my @directories = File::Spec->splitdir($directories);
16079 my $base = $file =~ s/\.txt$//;
16080 construct_filename($file, 'mutable', \@directories);
cf25bb62 16081 }
99870f4d 16082}
cf25bb62 16083
dff6c046 16084my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 16085 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 16086 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 16087
99870f4d
KW
16088# Now we check to see if any output files are older than youngest, if
16089# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 16090if (! $rebuild) {
99870f4d
KW
16091 foreach my $out (@mktables_list_output_files) {
16092 if ( ! file_exists($out)) {
16093 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 16094 $rebuild = 1;
99870f4d
KW
16095 last;
16096 }
16097 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
16098 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
16099 if ( (stat $out)[9] <= $most_recent ) {
16100 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 16101 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 16102 $rebuild = 1;
99870f4d 16103 last;
cf25bb62 16104 }
cf25bb62 16105 }
99870f4d 16106}
d1d1cd7a 16107if (! $rebuild) {
1265e11f 16108 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
16109 exit(0);
16110}
16111print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 16112
99870f4d
KW
16113# Ready to do the major processing. First create the perl pseudo-property.
16114$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 16115
99870f4d
KW
16116# Process each input file
16117foreach my $file (@input_file_objects) {
16118 $file->run;
d2d499f5
JH
16119}
16120
99870f4d 16121# Finish the table generation.
c4051cc5 16122
99870f4d
KW
16123print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
16124finish_Unicode();
c4051cc5 16125
99870f4d
KW
16126print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
16127compile_perl();
c4051cc5 16128
99870f4d
KW
16129print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
16130add_perl_synonyms();
c4051cc5 16131
99870f4d
KW
16132print "Writing tables\n" if $verbosity >= $PROGRESS;
16133write_all_tables();
c4051cc5 16134
99870f4d
KW
16135# Write mktables.lst
16136if ( $file_list and $make_list ) {
c4051cc5 16137
99870f4d
KW
16138 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
16139 foreach my $file (@input_files, @files_actually_output) {
16140 my (undef, $directories, $file) = File::Spec->splitpath($file);
16141 my @directories = File::Spec->splitdir($directories);
16142 $file = join '/', @directories, $file;
16143 }
16144
16145 my $ofh;
16146 if (! open $ofh,">",$file_list) {
16147 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
16148 return
16149 }
16150 else {
cdcef19a 16151 my $localtime = localtime $start_time;
99870f4d
KW
16152 print $ofh <<"END";
16153#
16154# $file_list -- File list for $0.
97050450 16155#
cdcef19a 16156# Autogenerated starting on $start_time ($localtime)
97050450
YO
16157#
16158# - First section is input files
99870f4d 16159# ($0 itself is not listed but is automatically considered an input)
98dc9551 16160# - Section separator is /^=+\$/
97050450
YO
16161# - Second section is a list of output files.
16162# - Lines matching /^\\s*#/ are treated as comments
16163# which along with blank lines are ignored.
16164#
16165
16166# Input files:
16167
99870f4d
KW
16168END
16169 print $ofh "$_\n" for sort(@input_files);
16170 print $ofh "\n=================================\n# Output files:\n\n";
16171 print $ofh "$_\n" for sort @files_actually_output;
16172 print $ofh "\n# ",scalar(@input_files)," input files\n",
16173 "# ",scalar(@files_actually_output)+1," output files\n\n",
16174 "# End list\n";
16175 close $ofh
16176 or Carp::my_carp("Failed to close $ofh: $!");
16177
16178 print "Filelist has ",scalar(@input_files)," input files and ",
16179 scalar(@files_actually_output)+1," output files\n"
16180 if $verbosity >= $VERBOSE;
16181 }
16182}
16183
16184# Output these warnings unless -q explicitly specified.
c83dffeb 16185if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
99870f4d
KW
16186 if (@unhandled_properties) {
16187 print "\nProperties and tables that unexpectedly have no code points\n";
16188 foreach my $property (sort @unhandled_properties) {
16189 print $property, "\n";
16190 }
16191 }
16192
16193 if (%potential_files) {
16194 print "\nInput files that are not considered:\n";
16195 foreach my $file (sort keys %potential_files) {
16196 print File::Spec->abs2rel($file), "\n";
16197 }
16198 }
16199 print "\nAll done\n" if $verbosity >= $VERBOSE;
16200}
5beb625e 16201exit(0);
cf25bb62 16202
99870f4d 16203# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 16204__DATA__
99870f4d 16205
5beb625e
JH
16206use strict;
16207use warnings;
16208
66fd7fd0
KW
16209# If run outside the normal test suite on an ASCII platform, you can
16210# just create a latin1_to_native() function that just returns its
16211# inputs, because that's the only function used from test.pl
16212require "test.pl";
16213
37e2e78e
KW
16214# Test qr/\X/ and the \p{} regular expression constructs. This file is
16215# constructed by mktables from the tables it generates, so if mktables is
16216# buggy, this won't necessarily catch those bugs. Tests are generated for all
16217# feasible properties; a few aren't currently feasible; see
16218# is_code_point_usable() in mktables for details.
99870f4d
KW
16219
16220# Standard test packages are not used because this manipulates SIG_WARN. It
16221# exits 0 if every non-skipped test succeeded; -1 if any failed.
16222
5beb625e
JH
16223my $Tests = 0;
16224my $Fails = 0;
99870f4d 16225
99870f4d
KW
16226sub Expect($$$$) {
16227 my $expected = shift;
16228 my $ord = shift;
16229 my $regex = shift;
16230 my $warning_type = shift; # Type of warning message, like 'deprecated'
16231 # or empty if none
16232 my $line = (caller)[2];
66fd7fd0 16233 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 16234
99870f4d 16235 # Convert the code point to hex form
23e33b60 16236 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 16237
99870f4d 16238 my @tests = "";
5beb625e 16239
37e2e78e
KW
16240 # The first time through, use all warnings. If the input should generate
16241 # a warning, add another time through with them turned off
99870f4d
KW
16242 push @tests, "no warnings '$warning_type';" if $warning_type;
16243
16244 foreach my $no_warnings (@tests) {
16245
16246 # Store any warning messages instead of outputting them
16247 local $SIG{__WARN__} = $SIG{__WARN__};
16248 my $warning_message;
16249 $SIG{__WARN__} = sub { $warning_message = $_[0] };
16250
16251 $Tests++;
16252
16253 # A string eval is needed because of the 'no warnings'.
16254 # Assumes no parens in the regular expression
16255 my $result = eval "$no_warnings
16256 my \$RegObj = qr($regex);
16257 $string =~ \$RegObj ? 1 : 0";
16258 if (not defined $result) {
16259 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
16260 $Fails++;
16261 }
16262 elsif ($result ^ $expected) {
16263 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
16264 $Fails++;
16265 }
16266 elsif ($warning_message) {
16267 if (! $warning_type || ($warning_type && $no_warnings)) {
16268 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
16269 $Fails++;
16270 }
16271 else {
16272 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
16273 }
16274 }
16275 elsif ($warning_type && ! $no_warnings) {
16276 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
16277 $Fails++;
16278 }
16279 else {
16280 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
16281 }
5beb625e 16282 }
99870f4d 16283 return;
5beb625e 16284}
d73e5302 16285
99870f4d
KW
16286sub Error($) {
16287 my $regex = shift;
5beb625e 16288 $Tests++;
99870f4d 16289 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 16290 $Fails++;
99870f4d
KW
16291 my $line = (caller)[2];
16292 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 16293 }
99870f4d
KW
16294 else {
16295 my $line = (caller)[2];
16296 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
16297 }
16298 return;
5beb625e
JH
16299}
16300
37e2e78e
KW
16301# GCBTest.txt character that separates grapheme clusters
16302my $breakable_utf8 = my $breakable = chr(0xF7);
16303utf8::upgrade($breakable_utf8);
16304
16305# GCBTest.txt character that indicates that the adjoining code points are part
16306# of the same grapheme cluster
16307my $nobreak_utf8 = my $nobreak = chr(0xD7);
16308utf8::upgrade($nobreak_utf8);
16309
16310sub Test_X($) {
16311 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
16312 # Each such line is a sequence of code points given by their hex numbers,
16313 # separated by the two characters defined just before this subroutine that
16314 # indicate that either there can or cannot be a break between the adjacent
16315 # code points. If there isn't a break, that means the sequence forms an
16316 # extended grapheme cluster, which means that \X should match the whole
16317 # thing. If there is a break, \X should stop there. This is all
16318 # converted by this routine into a match:
16319 # $string =~ /(\X)/,
16320 # Each \X should match the next cluster; and that is what is checked.
16321
16322 my $template = shift;
16323
16324 my $line = (caller)[2];
16325
16326 # The line contains characters above the ASCII range, but in Latin1. It
16327 # may or may not be in utf8, and if it is, it may or may not know it. So,
16328 # convert these characters to 8 bits. If knows is in utf8, simply
16329 # downgrade.
16330 if (utf8::is_utf8($template)) {
16331 utf8::downgrade($template);
16332 } else {
16333
16334 # Otherwise, if it is in utf8, but doesn't know it, the next lines
16335 # convert the two problematic characters to their 8-bit equivalents.
16336 # If it isn't in utf8, they don't harm anything.
16337 use bytes;
16338 $template =~ s/$nobreak_utf8/$nobreak/g;
16339 $template =~ s/$breakable_utf8/$breakable/g;
16340 }
16341
16342 # Get rid of the leading and trailing breakables
16343 $template =~ s/^ \s* $breakable \s* //x;
16344 $template =~ s/ \s* $breakable \s* $ //x;
16345
16346 # And no-breaks become just a space.
16347 $template =~ s/ \s* $nobreak \s* / /xg;
16348
16349 # Split the input into segments that are breakable between them.
16350 my @segments = split /\s*$breakable\s*/, $template;
16351
16352 my $string = "";
16353 my $display_string = "";
16354 my @should_match;
16355 my @should_display;
16356
16357 # Convert the code point sequence in each segment into a Perl string of
16358 # characters
16359 foreach my $segment (@segments) {
16360 my @code_points = split /\s+/, $segment;
16361 my $this_string = "";
16362 my $this_display = "";
16363 foreach my $code_point (@code_points) {
66fd7fd0 16364 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
16365 $this_display .= "\\x{$code_point}";
16366 }
16367
16368 # The next cluster should match the string in this segment.
16369 push @should_match, $this_string;
16370 push @should_display, $this_display;
16371 $string .= $this_string;
16372 $display_string .= $this_display;
16373 }
16374
16375 # If a string can be represented in both non-ut8 and utf8, test both cases
16376 UPGRADE:
16377 for my $to_upgrade (0 .. 1) {
678f13d5 16378
37e2e78e
KW
16379 if ($to_upgrade) {
16380
16381 # If already in utf8, would just be a repeat
16382 next UPGRADE if utf8::is_utf8($string);
16383
16384 utf8::upgrade($string);
16385 }
16386
16387 # Finally, do the \X match.
16388 my @matches = $string =~ /(\X)/g;
16389
16390 # Look through each matched cluster to verify that it matches what we
16391 # expect.
16392 my $min = (@matches < @should_match) ? @matches : @should_match;
16393 for my $i (0 .. $min - 1) {
16394 $Tests++;
16395 if ($matches[$i] eq $should_match[$i]) {
16396 print "ok $Tests - ";
16397 if ($i == 0) {
16398 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
16399 } else {
16400 print "And \\X #", $i + 1,
16401 }
16402 print " correctly matched $should_display[$i]; line $line\n";
16403 } else {
16404 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
16405 unpack("U*", $matches[$i]));
16406 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
16407 $i + 1,
16408 " should have matched $should_display[$i]",
16409 " but instead matched $matches[$i]",
16410 ". Abandoning rest of line $line\n";
16411 next UPGRADE;
16412 }
16413 }
16414
16415 # And the number of matches should equal the number of expected matches.
16416 $Tests++;
16417 if (@matches == @should_match) {
16418 print "ok $Tests - Nothing was left over; line $line\n";
16419 } else {
16420 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
16421 }
16422 }
16423
16424 return;
16425}
16426
99870f4d 16427sub Finished() {
f86864ac 16428 print "1..$Tests\n";
99870f4d 16429 exit($Fails ? -1 : 0);
5beb625e 16430}
99870f4d
KW
16431
16432Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 16433Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
16434Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
16435Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 16436Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726