This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Refactor \X regex handling to avoid a typical case table lookup
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a 19my $start_time;
98dc9551 20BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
21 # get it as close as possible
22 $start_time= time;
23}
24
23e33b60 25require 5.010_001;
d73e5302 26use strict;
99870f4d 27use warnings;
cf25bb62 28use Carp;
bd9ebcfd 29use Config;
99870f4d
KW
30use File::Find;
31use File::Path;
d07a55ed 32use File::Spec;
99870f4d 33use Text::Tabs;
6b64c11c 34use re "/aa";
99870f4d
KW
35
36sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 37my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d
KW
38
39##########################################################################
40#
41# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42# from the Unicode database files (lib/unicore/.../*.txt), It also generates
232ed87f 43# a pod file and .t files, depending on option parameters.
99870f4d
KW
44#
45# The structure of this file is:
46# First these introductory comments; then
47# code needed for everywhere, such as debugging stuff; then
48# code to handle input parameters; then
49# data structures likely to be of external interest (some of which depend on
50# the input parameters, so follows them; then
51# more data structures and subroutine and package (class) definitions; then
52# the small actual loop to process the input files and finish up; then
53# a __DATA__ section, for the .t tests
54#
232ed87f
KW
55# This program works on all releases of Unicode so far. The outputs have been
56# scrutinized most intently for release 5.1. The others have been checked for
57# somewhat more than just sanity. It can handle all non-provisional Unicode
58# character properties in those releases.
99870f4d 59#
99870f4d
KW
60# This program is mostly about Unicode character (or code point) properties.
61# A property describes some attribute or quality of a code point, like if it
62# is lowercase or not, its name, what version of Unicode it was first defined
63# in, or what its uppercase equivalent is. Unicode deals with these disparate
64# possibilities by making all properties into mappings from each code point
65# into some corresponding value. In the case of it being lowercase or not,
66# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67# property maps each Unicode code point to a single value, called a "property
232ed87f
KW
68# value". (Some more recently defined properties, map a code point to a set
69# of values.)
99870f4d
KW
70#
71# When using a property in a regular expression, what is desired isn't the
72# mapping of the code point to its property's value, but the reverse (or the
73# mathematical "inverse relation"): starting with the property value, "Does a
74# code point map to it?" These are written in a "compound" form:
75# \p{property=value}, e.g., \p{category=punctuation}. This program generates
76# files containing the lists of code points that map to each such regular
77# expression property value, one file per list
78#
79# There is also a single form shortcut that Perl adds for many of the commonly
80# used properties. This happens for all binary properties, plus script,
81# general_category, and block properties.
82#
83# Thus the outputs of this program are files. There are map files, mostly in
84# the 'To' directory; and there are list files for use in regular expression
85# matching, all in subdirectories of the 'lib' directory, with each
86# subdirectory being named for the property that the lists in it are for.
87# Bookkeeping, test, and documentation files are also generated.
88
89my $matches_directory = 'lib'; # Where match (\p{}) files go.
90my $map_directory = 'To'; # Where map files go.
91
92# DATA STRUCTURES
93#
94# The major data structures of this program are Property, of course, but also
95# Table. There are two kinds of tables, very similar to each other.
96# "Match_Table" is the data structure giving the list of code points that have
97# a particular property value, mentioned above. There is also a "Map_Table"
98# data structure which gives the property's mapping from code point to value.
99# There are two structures because the match tables need to be combined in
100# various ways, such as constructing unions, intersections, complements, etc.,
101# and the map ones don't. And there would be problems, perhaps subtle, if
102# a map table were inadvertently operated on in some of those ways.
103# The use of separate classes with operations defined on one but not the other
104# prevents accidentally confusing the two.
105#
106# At the heart of each table's data structure is a "Range_List", which is just
107# an ordered list of "Ranges", plus ancillary information, and methods to
108# operate on them. A Range is a compact way to store property information.
109# Each range has a starting code point, an ending code point, and a value that
110# is meant to apply to all the code points between the two end points,
111# inclusive. For a map table, this value is the property value for those
112# code points. Two such ranges could be written like this:
113# 0x41 .. 0x5A, 'Upper',
114# 0x61 .. 0x7A, 'Lower'
115#
116# Each range also has a type used as a convenience to classify the values.
117# Most ranges in this program will be Type 0, or normal, but there are some
118# ranges that have a non-zero type. These are used only in map tables, and
119# are for mappings that don't fit into the normal scheme of things. Mappings
120# that require a hash entry to communicate with utf8.c are one example;
121# another example is mappings for charnames.pm to use which indicate a name
232ed87f 122# that is algorithmically determinable from its code point (and the reverse).
99870f4d
KW
123# These are used to significantly compact these tables, instead of listing
124# each one of the tens of thousands individually.
125#
126# In a match table, the value of a range is irrelevant (and hence the type as
127# well, which will always be 0), and arbitrarily set to the null string.
128# Using the example above, there would be two match tables for those two
129# entries, one named Upper would contain the 0x41..0x5A range, and the other
130# named Lower would contain 0x61..0x7A.
131#
132# Actually, there are two types of range lists, "Range_Map" is the one
133# associated with map tables, and "Range_List" with match tables.
232ed87f
KW
134# Again, this is so that methods can be defined on one and not the others so
135# as to prevent operating on them in incorrect ways.
99870f4d
KW
136#
137# Eventually, most tables are written out to files to be read by utf8_heavy.pl
138# in the perl core. All tables could in theory be written, but some are
139# suppressed because there is no current practical use for them. It is easy
140# to change which get written by changing various lists that are near the top
141# of the actual code in this file. The table data structures contain enough
142# ancillary information to allow them to be treated as separate entities for
143# writing, such as the path to each one's file. There is a heading in each
144# map table that gives the format of its entries, and what the map is for all
145# the code points missing from it. (This allows tables to be more compact.)
678f13d5 146#
99870f4d
KW
147# The Property data structure contains one or more tables. All properties
148# contain a map table (except the $perl property which is a
149# pseudo-property containing only match tables), and any properties that
150# are usable in regular expression matches also contain various matching
151# tables, one for each value the property can have. A binary property can
152# have two values, True and False (or Y and N, which are preferred by Unicode
153# terminology). Thus each of these properties will have a map table that
154# takes every code point and maps it to Y or N (but having ranges cuts the
155# number of entries in that table way down), and two match tables, one
156# which has a list of all the code points that map to Y, and one for all the
232ed87f 157# code points that map to N. (For each binary property, a third table is also
99870f4d 158# generated for the pseudo Perl property. It contains the identical code
232ed87f
KW
159# points as the Y table, but can be written in regular expressions, not in the
160# compound form, but in a "single" form like \p{IsUppercase}.) Many
161# properties are binary, but some properties have several possible values,
162# some have many, and properties like Name have a different value for every
163# named code point. Those will not, unless the controlling lists are changed,
164# have their match tables written out. But all the ones which can be used in
165# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
166# a property would have either its map table or its match tables written but
167# not both. Again, what gets written is controlled by lists which can easily
168# be changed. Starting in 5.14, advantage was taken of this, and all the map
169# tables needed to reconstruct the Unicode db are now written out, while
170# suppressing the Unicode .txt files that contain the data. Our tables are
171# much more compact than the .txt files, so a significant space savings was
172# achieved. Also, tables are not written out that are trivially derivable
173# from tables that do get written. So, there typically is no file containing
174# the code points not matched by a binary property (the table for \P{} versus
175# lowercase \p{}), since you just need to invert the True table to get the
176# False table.
177
178# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
179# how many match tables there are and the content of the maps. This 'Type' is
c12f2655
KW
180# different than a range 'Type', so don't get confused by the two concepts
181# having the same name.
678f13d5 182#
99870f4d
KW
183# For information about the Unicode properties, see Unicode's UAX44 document:
184
185my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
186
187# As stated earlier, this program will work on any release of Unicode so far.
188# Most obvious problems in earlier data have NOT been corrected except when
be864b6c 189# necessary to make Perl or this program work reasonably, and to keep out
232ed87f
KW
190# potential security issues. For example, no folding information was given in
191# early releases, so this program substitutes lower case instead, just so that
192# a regular expression with the /i option will do something that actually
193# gives the right results in many cases. There are also a couple other
194# corrections for version 1.1.5, commented at the point they are made. As an
195# example of corrections that weren't made (but could be) is this statement
196# from DerivedAge.txt: "The supplementary private use code points and the
197# non-character code points were assigned in version 2.0, but not specifically
198# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
199# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
200# further down in these introductory comments.
99870f4d 201#
232ed87f
KW
202# This program works on all non-provisional properties as of the current
203# Unicode release, though the files for some are suppressed for various
204# reasons. You can change which are output by changing lists in this program.
678f13d5 205#
dc85bd38 206# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
207# loose matchings rules (from Unicode TR18):
208#
209# The recommended names for UCD properties and property values are in
210# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
211# [PropValue]. There are both abbreviated names and longer, more
212# descriptive names. It is strongly recommended that both names be
213# recognized, and that loose matching of property names be used,
214# whereby the case distinctions, whitespace, hyphens, and underbar
215# are ignored.
232ed87f 216#
99870f4d
KW
217# The program still allows Fuzzy to override its determination of if loose
218# matching should be used, but it isn't currently used, as it is no longer
219# needed; the calculations it makes are good enough.
678f13d5 220#
99870f4d
KW
221# SUMMARY OF HOW IT WORKS:
222#
223# Process arguments
224#
225# A list is constructed containing each input file that is to be processed
226#
227# Each file on the list is processed in a loop, using the associated handler
228# code for each:
229# The PropertyAliases.txt and PropValueAliases.txt files are processed
230# first. These files name the properties and property values.
231# Objects are created of all the property and property value names
232# that the rest of the input should expect, including all synonyms.
233# The other input files give mappings from properties to property
234# values. That is, they list code points and say what the mapping
235# is under the given property. Some files give the mappings for
236# just one property; and some for many. This program goes through
232ed87f
KW
237# each file and populates the properties and their map tables from
238# them. Some properties are listed in more than one file, and
239# Unicode has set up a precedence as to which has priority if there
240# is a conflict. Thus the order of processing matters, and this
241# program handles the conflict possibility by processing the
242# overriding input files last, so that if necessary they replace
243# earlier values.
99870f4d
KW
244# After this is all done, the program creates the property mappings not
245# furnished by Unicode, but derivable from what it does give.
246# The tables of code points that match each property value in each
247# property that is accessible by regular expressions are created.
248# The Perl-defined properties are created and populated. Many of these
249# require data determined from the earlier steps
250# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 251# and Unicode are reconciled and warned about.
99870f4d
KW
252# All the properties are written to files
253# Any other files are written, and final warnings issued.
678f13d5 254#
99870f4d
KW
255# For clarity, a number of operators have been overloaded to work on tables:
256# ~ means invert (take all characters not in the set). The more
257# conventional '!' is not used because of the possibility of confusing
258# it with the actual boolean operation.
259# + means union
260# - means subtraction
261# & means intersection
262# The precedence of these is the order listed. Parentheses should be
263# copiously used. These are not a general scheme. The operations aren't
264# defined for a number of things, deliberately, to avoid getting into trouble.
265# Operations are done on references and affect the underlying structures, so
266# that the copy constructors for them have been overloaded to not return a new
267# clone, but the input object itself.
678f13d5 268#
99870f4d
KW
269# The bool operator is deliberately not overloaded to avoid confusion with
270# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
271#
272# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
273#
274# This program needs to be able to run under miniperl. Therefore, it uses a
275# minimum of other modules, and hence implements some things itself that could
276# be gotten from CPAN
277#
278# This program uses inputs published by the Unicode Consortium. These can
279# change incompatibly between releases without the Perl maintainers realizing
280# it. Therefore this program is now designed to try to flag these. It looks
281# at the directories where the inputs are, and flags any unrecognized files.
282# It keeps track of all the properties in the files it handles, and flags any
283# that it doesn't know how to handle. It also flags any input lines that
284# don't match the expected syntax, among other checks.
285#
286# It is also designed so if a new input file matches one of the known
287# templates, one hopefully just needs to add it to a list to have it
288# processed.
289#
290# As mentioned earlier, some properties are given in more than one file. In
291# particular, the files in the extracted directory are supposedly just
292# reformattings of the others. But they contain information not easily
293# derivable from the other files, including results for Unihan, which this
294# program doesn't ordinarily look at, and for unassigned code points. They
295# also have historically had errors or been incomplete. In an attempt to
296# create the best possible data, this program thus processes them first to
297# glean information missing from the other files; then processes those other
298# files to override any errors in the extracted ones. Much of the design was
299# driven by this need to store things and then possibly override them.
300#
301# It tries to keep fatal errors to a minimum, to generate something usable for
302# testing purposes. It always looks for files that could be inputs, and will
303# warn about any that it doesn't know how to handle (the -q option suppresses
304# the warning).
99870f4d 305#
678f13d5
KW
306# Why is there more than one type of range?
307# This simplified things. There are some very specialized code points that
308# have to be handled specially for output, such as Hangul syllable names.
309# By creating a range type (done late in the development process), it
310# allowed this to be stored with the range, and overridden by other input.
311# Originally these were stored in another data structure, and it became a
312# mess trying to decide if a second file that was for the same property was
313# overriding the earlier one or not.
314#
315# Why are there two kinds of tables, match and map?
316# (And there is a base class shared by the two as well.) As stated above,
317# they actually are for different things. Development proceeded much more
318# smoothly when I (khw) realized the distinction. Map tables are used to
319# give the property value for every code point (actually every code point
320# that doesn't map to a default value). Match tables are used for regular
321# expression matches, and are essentially the inverse mapping. Separating
322# the two allows more specialized methods, and error checks so that one
323# can't just take the intersection of two map tables, for example, as that
324# is nonsensical.
99870f4d 325#
232ed87f
KW
326# What about 'fate' and 'status'. The concept of a table's fate was created
327# late when it became clear that something more was needed. The difference
328# between this and 'status' is unclean, and could be improved if someone
329# wanted to spend the effort.
330#
23e33b60
KW
331# DEBUGGING
332#
678f13d5
KW
333# This program is written so it will run under miniperl. Occasionally changes
334# will cause an error where the backtrace doesn't work well under miniperl.
335# To diagnose the problem, you can instead run it under regular perl, if you
336# have one compiled.
337#
338# There is a good trace facility. To enable it, first sub DEBUG must be set
339# to return true. Then a line like
340#
341# local $to_trace = 1 if main::DEBUG;
342#
232ed87f
KW
343# can be added to enable tracing in its lexical scope (plus dynamic) or until
344# you insert another line:
678f13d5
KW
345#
346# local $to_trace = 0 if main::DEBUG;
347#
232ed87f 348# To actually trace, use a line like "trace $a, @b, %c, ...;
678f13d5
KW
349#
350# Some of the more complex subroutines already have trace statements in them.
351# Permanent trace statements should be like:
352#
353# trace ... if main::DEBUG && $to_trace;
354#
355# If there is just one or a few files that you're debugging, you can easily
356# cause most everything else to be skipped. Change the line
357#
358# my $debug_skip = 0;
359#
360# to 1, and every file whose object is in @input_file_objects and doesn't have
232ed87f
KW
361# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
362# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
678f13d5 363#
b4a0206c 364# To compare the output tables, it may be useful to specify the -annotate
c4019d52
KW
365# flag. This causes the tables to expand so there is one entry for each
366# non-algorithmically named code point giving, currently its name, and its
367# graphic representation if printable (and you have a font that knows about
368# it). This makes it easier to see what the particular code points are in
369# each output table. The tables are usable, but because they don't have
370# ranges (for the most part), a Perl using them will run slower. Non-named
371# code points are annotated with a description of their status, and contiguous
372# ones with the same description will be output as a range rather than
373# individually. Algorithmically named characters are also output as ranges,
374# except when there are just a few contiguous ones.
375#
99870f4d
KW
376# FUTURE ISSUES
377#
378# The program would break if Unicode were to change its names so that
379# interior white space, underscores, or dashes differences were significant
380# within property and property value names.
381#
382# It might be easier to use the xml versions of the UCD if this program ever
383# would need heavy revision, and the ability to handle old versions was not
384# required.
385#
386# There is the potential for name collisions, in that Perl has chosen names
387# that Unicode could decide it also likes. There have been such collisions in
388# the past, with mostly Perl deciding to adopt the Unicode definition of the
389# name. However in the 5.2 Unicode beta testing, there were a number of such
390# collisions, which were withdrawn before the final release, because of Perl's
391# and other's protests. These all involved new properties which began with
392# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
393# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
394# Unicode document, so they are unlikely to be used by Unicode for another
395# purpose. However, they might try something beginning with 'In', or use any
396# of the other Perl-defined properties. This program will warn you of name
397# collisions, and refuse to generate tables with them, but manual intervention
398# will be required in this event. One scheme that could be implemented, if
399# necessary, would be to have this program generate another file, or add a
400# field to mktables.lst that gives the date of first definition of a property.
401# Each new release of Unicode would use that file as a basis for the next
402# iteration. And the Perl synonym addition code could sort based on the age
403# of the property, so older properties get priority, and newer ones that clash
404# would be refused; hence existing code would not be impacted, and some other
405# synonym would have to be used for the new property. This is ugly, and
406# manual intervention would certainly be easier to do in the short run; lets
407# hope it never comes to this.
678f13d5 408#
99870f4d
KW
409# A NOTE ON UNIHAN
410#
411# This program can generate tables from the Unihan database. But it doesn't
412# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
413# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
414# database was split into 8 different files, all beginning with the letters
415# 'Unihan'. This program will read those file(s) if present, but it needs to
416# know which of the many properties in the file(s) should have tables created
417# for them. It will create tables for any properties listed in
418# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
419# @cjk_properties array and the @cjk_property_values array. Thus, if a
420# property you want is not in those files of the release you are building
421# against, you must add it to those two arrays. Starting in 4.0, the
422# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
423# is present in the directory, a table will be generated for that property.
424# In 5.2, several more properties were added. For your convenience, the two
5f7264c7 425# arrays are initialized with all the 6.0 listed properties that are also in
99870f4d
KW
426# earlier releases. But these are commented out. You can just uncomment the
427# ones you want, or use them as a template for adding entries for other
428# properties.
429#
430# You may need to adjust the entries to suit your purposes. setup_unihan(),
431# and filter_unihan_line() are the functions where this is done. This program
432# already does some adjusting to make the lines look more like the rest of the
433# Unicode DB; You can see what that is in filter_unihan_line()
434#
435# There is a bug in the 3.2 data file in which some values for the
436# kPrimaryNumeric property have commas and an unexpected comment. A filter
437# could be added for these; or for a particular installation, the Unihan.txt
438# file could be edited to fix them.
99870f4d 439#
678f13d5
KW
440# HOW TO ADD A FILE TO BE PROCESSED
441#
442# A new file from Unicode needs to have an object constructed for it in
443# @input_file_objects, probably at the end or at the end of the extracted
444# ones. The program should warn you if its name will clash with others on
445# restrictive file systems, like DOS. If so, figure out a better name, and
446# add lines to the README.perl file giving that. If the file is a character
232ed87f 447# property, it should be in the format that Unicode has implicitly
678f13d5
KW
448# standardized for such files for the more recently introduced ones.
449# If so, the Input_file constructor for @input_file_objects can just be the
450# file name and release it first appeared in. If not, then it should be
451# possible to construct an each_line_handler() to massage the line into the
452# standardized form.
453#
454# For non-character properties, more code will be needed. You can look at
455# the existing entries for clues.
456#
457# UNICODE VERSIONS NOTES
458#
459# The Unicode UCD has had a number of errors in it over the versions. And
460# these remain, by policy, in the standard for that version. Therefore it is
461# risky to correct them, because code may be expecting the error. So this
462# program doesn't generally make changes, unless the error breaks the Perl
463# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
464# for U+1105, which causes real problems for the algorithms for Jamo
465# calculations, so it is changed here.
466#
467# But it isn't so clear cut as to what to do about concepts that are
468# introduced in a later release; should they extend back to earlier releases
469# where the concept just didn't exist? It was easier to do this than to not,
470# so that's what was done. For example, the default value for code points not
471# in the files for various properties was probably undefined until changed by
472# some version. No_Block for blocks is such an example. This program will
473# assign No_Block even in Unicode versions that didn't have it. This has the
474# benefit that code being written doesn't have to special case earlier
475# versions; and the detriment that it doesn't match the Standard precisely for
476# the affected versions.
477#
478# Here are some observations about some of the issues in early versions:
479#
232ed87f
KW
480# Prior to version 3.0, there were 3 character decompositions. These are not
481# handled by Unicode::Normalize, nor will it compile when presented a version
482# that has them. However, you can trivially get it to compile by simply
483# ignoring those decompositions, by changing the croak to a carp. At the time
484# of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
485#
486# croak("Weird Canonical Decomposition of U+$h");
487#
488# Simply change to a carp. It will compile, but will not know about any three
489# character decomposition.
490
491# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
492# that the reason is that the CJK block starting at 4E00 was removed from
493# PropList, and was not put back in until 3.1.0. The Perl extension (the
494# single property name \p{alpha}) has the correct values. But the compound
495# form is simply not generated until 3.1, as it can be argued that prior to
496# this release, this was not an official property. The comments for
497# filter_old_style_proplist() give more details.
678f13d5
KW
498#
499# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
500# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
501# reason is that 3.2 introduced U+205F=medium math space, which was not
502# classed as white space, but Perl figured out that it should have been. 4.0
503# reclassified it correctly.
504#
505# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
232ed87f
KW
506# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
507# became 202, and ATBL was left with no code points, as all the ones that
508# mapped to 202 stayed mapped to 202. Thus if your program used the numeric
509# name for the class, it would not have been affected, but if it used the
510# mnemonic, it would have been.
678f13d5
KW
511#
512# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
513# points which eventually came to have this script property value, instead
514# mapped to "Unknown". But in the next release all these code points were
515# moved to \p{sc=common} instead.
99870f4d
KW
516#
517# The default for missing code points for BidiClass is complicated. Starting
518# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
519# tries to do the best it can for earlier releases. It is done in
520# process_PropertyAliases()
521#
232ed87f
KW
522# In version 2.1.2, the entry in UnicodeData.txt:
523# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
524# should instead be
525# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
526# Without this change, there are casing problems for this character.
527#
99870f4d
KW
528##############################################################################
529
530my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
531 # and errors
532my $MAX_LINE_WIDTH = 78;
533
534# Debugging aid to skip most files so as to not be distracted by them when
535# concentrating on the ones being debugged. Add
536# non_skip => 1,
537# to the constructor for those files you want processed when you set this.
538# Files with a first version number of 0 are special: they are always
c12f2655
KW
539# processed regardless of the state of this flag. Generally, Jamo.txt and
540# UnicodeData.txt must not be skipped if you want this program to not die
541# before normal completion.
99870f4d
KW
542my $debug_skip = 0;
543
e9c4b4f8
KW
544
545# Normally these are suppressed.
546my $write_Unicode_deprecated_tables = 0;
547
99870f4d
KW
548# Set to 1 to enable tracing.
549our $to_trace = 0;
550
551{ # Closure for trace: debugging aid
552 my $print_caller = 1; # ? Include calling subroutine name
553 my $main_with_colon = 'main::';
554 my $main_colon_length = length($main_with_colon);
555
556 sub trace {
557 return unless $to_trace; # Do nothing if global flag not set
558
559 my @input = @_;
560
561 local $DB::trace = 0;
562 $DB::trace = 0; # Quiet 'used only once' message
563
564 my $line_number;
565
566 # Loop looking up the stack to get the first non-trace caller
567 my $caller_line;
568 my $caller_name;
569 my $i = 0;
570 do {
571 $line_number = $caller_line;
572 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
573 $caller = $main_with_colon unless defined $caller;
574
575 $caller_name = $caller;
576
577 # get rid of pkg
578 $caller_name =~ s/.*:://;
579 if (substr($caller_name, 0, $main_colon_length)
580 eq $main_with_colon)
581 {
582 $caller_name = substr($caller_name, $main_colon_length);
583 }
584
585 } until ($caller_name ne 'trace');
586
587 # If the stack was empty, we were called from the top level
588 $caller_name = 'main' if ($caller_name eq ""
589 || $caller_name eq 'trace');
590
591 my $output = "";
592 foreach my $string (@input) {
593 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
594 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
595 $output .= simple_dumper($string);
596 }
597 else {
598 $string = "$string" if ref $string;
599 $string = $UNDEF unless defined $string;
600 chomp $string;
601 $string = '""' if $string eq "";
602 $output .= " " if $output ne ""
603 && $string ne ""
604 && substr($output, -1, 1) ne " "
605 && substr($string, 0, 1) ne " ";
606 $output .= $string;
607 }
608 }
609
99f78760
KW
610 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
611 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
612 print STDERR $output, "\n";
613 return;
614 }
615}
616
617# This is for a rarely used development feature that allows you to compare two
618# versions of the Unicode standard without having to deal with changes caused
c12f2655
KW
619# by the code points introduced in the later version. Change the 0 to a
620# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
621# code points introduced in that release and earlier will be used; later ones
622# are thrown away. You use the version number of the earliest one you want to
623# compare; then run this program on directory structures containing each
624# release, and compare the outputs. These outputs will therefore include only
625# the code points common to both releases, and you can see the changes caused
626# just by the underlying release semantic changes. For versions earlier than
627# 3.2, you must copy a version of DAge.txt into the directory.
628my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
99870f4d
KW
629my $compare_versions = DEBUG
630 && $string_compare_versions
631 && pack "C*", split /\./, $string_compare_versions;
632
633sub uniques {
634 # Returns non-duplicated input values. From "Perl Best Practices:
635 # Encapsulated Cleverness". p. 455 in first edition.
636
637 my %seen;
0e407844
NC
638 # Arguably this breaks encapsulation, if the goal is to permit multiple
639 # distinct objects to stringify to the same value, and be interchangeable.
640 # However, for this program, no two objects stringify identically, and all
641 # lists passed to this function are either objects or strings. So this
642 # doesn't affect correctness, but it does give a couple of percent speedup.
643 no overloading;
99870f4d
KW
644 return grep { ! $seen{$_}++ } @_;
645}
646
647$0 = File::Spec->canonpath($0);
648
649my $make_test_script = 0; # ? Should we output a test script
6b5ab373 650my $make_norm_test_script = 0; # ? Should we output a normalization test script
99870f4d
KW
651my $write_unchanged_files = 0; # ? Should we update the output files even if
652 # we don't think they have changed
653my $use_directory = ""; # ? Should we chdir somewhere.
654my $pod_directory; # input directory to store the pod file.
655my $pod_file = 'perluniprops';
656my $t_path; # Path to the .t test file
657my $file_list = 'mktables.lst'; # File to store input and output file names.
658 # This is used to speed up the build, by not
659 # executing the main body of the program if
660 # nothing on the list has changed since the
661 # previous build
662my $make_list = 1; # ? Should we write $file_list. Set to always
663 # make a list so that when the pumpking is
664 # preparing a release, s/he won't have to do
665 # special things
666my $glob_list = 0; # ? Should we try to include unknown .txt files
667 # in the input.
bd9ebcfd
KW
668my $output_range_counts = $debugging_build; # ? Should we include the number
669 # of code points in ranges in
670 # the output
558712cf 671my $annotate = 0; # ? Should character names be in the output
9ef2b94f 672
99870f4d
KW
673# Verbosity levels; 0 is quiet
674my $NORMAL_VERBOSITY = 1;
675my $PROGRESS = 2;
676my $VERBOSE = 3;
677
678my $verbosity = $NORMAL_VERBOSITY;
679
680# Process arguments
681while (@ARGV) {
cf25bb62
JH
682 my $arg = shift @ARGV;
683 if ($arg eq '-v') {
99870f4d
KW
684 $verbosity = $VERBOSE;
685 }
686 elsif ($arg eq '-p') {
687 $verbosity = $PROGRESS;
688 $| = 1; # Flush buffers as we go.
689 }
690 elsif ($arg eq '-q') {
691 $verbosity = 0;
692 }
693 elsif ($arg eq '-w') {
694 $write_unchanged_files = 1; # update the files even if havent changed
695 }
696 elsif ($arg eq '-check') {
6ae7e459
YO
697 my $this = shift @ARGV;
698 my $ok = shift @ARGV;
699 if ($this ne $ok) {
700 print "Skipping as check params are not the same.\n";
701 exit(0);
702 }
00a8df5c 703 }
99870f4d
KW
704 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
705 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
706 }
3df51b85
KW
707 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
708 {
99870f4d 709 $make_test_script = 1;
99870f4d 710 }
6b5ab373
KW
711 elsif ($arg eq '-makenormtest')
712 {
713 $make_norm_test_script = 1;
714 }
99870f4d
KW
715 elsif ($arg eq '-makelist') {
716 $make_list = 1;
717 }
718 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
719 -d $use_directory or croak "Unknown directory '$use_directory'";
720 }
721 elsif ($arg eq '-L') {
722
723 # Existence not tested until have chdir'd
724 $file_list = shift;
725 }
726 elsif ($arg eq '-globlist') {
727 $glob_list = 1;
728 }
729 elsif ($arg eq '-c') {
730 $output_range_counts = ! $output_range_counts
731 }
b4a0206c 732 elsif ($arg eq '-annotate') {
558712cf 733 $annotate = 1;
bd9ebcfd
KW
734 $debugging_build = 1;
735 $output_range_counts = 1;
9ef2b94f 736 }
99870f4d
KW
737 else {
738 my $with_c = 'with';
739 $with_c .= 'out' if $output_range_counts; # Complements the state
740 croak <<END;
741usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
742 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
743 [-check A B ]
744 -c : Output comments $with_c number of code points in ranges
745 -q : Quiet Mode: Only output serious warnings.
746 -p : Set verbosity level to normal plus show progress.
747 -v : Set Verbosity level high: Show progress and non-serious
748 warnings
749 -w : Write files regardless
750 -C dir : Change to this directory before proceeding. All relative paths
751 except those specified by the -P and -T options will be done
752 with respect to this directory.
753 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 754 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
755 -L filelist : Use alternate 'filelist' instead of standard one
756 -globlist : Take as input all non-Test *.txt files in current and sub
757 directories
3df51b85
KW
758 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
759 overrides -T
99870f4d 760 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 761 -annotate : Output an annotation for each character in the table files;
c4019d52 762 useful for debugging mktables, looking at diffs; but is slow,
b318e5e5
KW
763 memory intensive; resulting tables are usable but are slow and
764 very large (and currently fail the Unicode::UCD.t tests).
99870f4d
KW
765 -check A B : Executes $0 only if A and B are the same
766END
767 }
768}
769
770# Stores the most-recently changed file. If none have changed, can skip the
771# build
aeab6150 772my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
773
774# Change directories now, because need to read 'version' early.
775if ($use_directory) {
3df51b85 776 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
777 $pod_directory = File::Spec->rel2abs($pod_directory);
778 }
3df51b85 779 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 780 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 781 }
99870f4d 782 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 783 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 784 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 785 }
3df51b85 786 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 787 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 788 }
00a8df5c
YO
789}
790
99870f4d
KW
791# Get Unicode version into regular and v-string. This is done now because
792# various tables below get populated based on it. These tables are populated
793# here to be near the top of the file, and so easily seeable by those needing
794# to modify things.
795open my $VERSION, "<", "version"
796 or croak "$0: can't open required file 'version': $!\n";
797my $string_version = <$VERSION>;
798close $VERSION;
799chomp $string_version;
800my $v_version = pack "C*", split /\./, $string_version; # v string
801
802# The following are the complete names of properties with property values that
803# are known to not match any code points in some versions of Unicode, but that
804# may change in the future so they should be matchable, hence an empty file is
805# generated for them.
806my @tables_that_may_be_empty = (
807 'Joining_Type=Left_Joining',
808 );
809push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
810push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
811push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
812 if $v_version ge v4.1.0;
82aed44a
KW
813push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
814 if $v_version ge v6.0.0;
f583b44c
KW
815push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
816 if $v_version ge v6.1.0;
1e958ea9
KW
817push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
818 if $v_version ge v6.2.0;
99870f4d
KW
819
820# The lists below are hashes, so the key is the item in the list, and the
821# value is the reason why it is in the list. This makes generation of
822# documentation easier.
823
824my %why_suppressed; # No file generated for these.
825
826# Files aren't generated for empty extraneous properties. This is arguable.
827# Extraneous properties generally come about because a property is no longer
828# used in a newer version of Unicode. If we generated a file without code
829# points, programs that used to work on that property will still execute
830# without errors. It just won't ever match (or will always match, with \P{}).
831# This means that the logic is now likely wrong. I (khw) think its better to
832# find this out by getting an error message. Just move them to the table
833# above to change this behavior
834my %why_suppress_if_empty_warn_if_not = (
835
836 # It is the only property that has ever officially been removed from the
837 # Standard. The database never contained any code points for it.
838 'Special_Case_Condition' => 'Obsolete',
839
840 # Apparently never official, but there were code points in some versions of
841 # old-style PropList.txt
842 'Non_Break' => 'Obsolete',
843);
844
845# These would normally go in the warn table just above, but they were changed
846# a long time before this program was written, so warnings about them are
847# moot.
848if ($v_version gt v3.2.0) {
849 push @tables_that_may_be_empty,
850 'Canonical_Combining_Class=Attached_Below_Left'
851}
852
5f7264c7 853# These are listed in the Property aliases file in 6.0, but Unihan is ignored
99870f4d
KW
854# unless explicitly added.
855if ($v_version ge v5.2.0) {
856 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 857 foreach my $table (qw (
99870f4d
KW
858 kAccountingNumeric
859 kOtherNumeric
860 kPrimaryNumeric
861 kCompatibilityVariant
862 kIICore
863 kIRG_GSource
864 kIRG_HSource
865 kIRG_JSource
866 kIRG_KPSource
867 kIRG_MSource
868 kIRG_KSource
869 kIRG_TSource
870 kIRG_USource
871 kIRG_VSource
872 kRSUnicode
ea25a9b2 873 ))
99870f4d
KW
874 {
875 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
876 }
ca12659b
NC
877}
878
272501f6
KW
879# Enum values for to_output_map() method in the Map_Table package.
880my $EXTERNAL_MAP = 1;
881my $INTERNAL_MAP = 2;
ce712c88 882my $OUTPUT_ADJUSTED = 3;
272501f6 883
fcf1973c
KW
884# To override computed values for writing the map tables for these properties.
885# The default for enum map tables is to write them out, so that the Unicode
886# .txt files can be removed, but all the data to compute any property value
887# for any code point is available in a more compact form.
888my %global_to_output_map = (
889 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
c12f2655
KW
890 # get stuck supporting it if things change. Since it is a STRING
891 # property, it normally would be listed in the pod, but INTERNAL_MAP
892 # suppresses that.
fcf1973c
KW
893 Unicode_1_Name => $INTERNAL_MAP,
894
895 Present_In => 0, # Suppress, as easily computed from Age
fcf1973c 896 Block => 0, # Suppress, as Blocks.txt is retained.
53d34b6c
KW
897
898 # Suppress, as mapping can be found instead from the
899 # Perl_Decomposition_Mapping file
900 Decomposition_Type => 0,
fcf1973c
KW
901);
902
99870f4d 903# Properties that this program ignores.
230e0c16
KW
904my @unimplemented_properties;
905
906# With this release, it is automatically handled if the Unihan db is
907# downloaded
908push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
d73e5302 909
99870f4d
KW
910# There are several types of obsolete properties defined by Unicode. These
911# must be hand-edited for every new Unicode release.
912my %why_deprecated; # Generates a deprecated warning message if used.
913my %why_stabilized; # Documentation only
914my %why_obsolete; # Documentation only
915
916{ # Closure
917 my $simple = 'Perl uses the more complete version of this property';
918 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
919
920 my $other_properties = 'other properties';
921 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 922 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
923
924 %why_deprecated = (
5f7264c7 925 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
926 'Jamo_Short_Name' => $contributory,
927 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
928 'Other_Alphabetic' => $contributory,
929 'Other_Default_Ignorable_Code_Point' => $contributory,
930 'Other_Grapheme_Extend' => $contributory,
931 'Other_ID_Continue' => $contributory,
932 'Other_ID_Start' => $contributory,
933 'Other_Lowercase' => $contributory,
934 'Other_Math' => $contributory,
935 'Other_Uppercase' => $contributory,
e22aaf5c
KW
936 'Expands_On_NFC' => $why_no_expand,
937 'Expands_On_NFD' => $why_no_expand,
938 'Expands_On_NFKC' => $why_no_expand,
939 'Expands_On_NFKD' => $why_no_expand,
99870f4d
KW
940 );
941
942 %why_suppressed = (
5f7264c7 943 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
944 # contains the same information, but without the algorithmically
945 # determinable Hangul syllables'. This file is not published, so it's
946 # existence is not noted in the comment.
e0b29447 947 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
99870f4d 948
3111abc0
KW
949 'Indic_Matra_Category' => "Provisional",
950 'Indic_Syllabic_Category' => "Provisional",
951
5f8d1a89
KW
952 # Don't suppress ISO_Comment, as otherwise special handling is needed
953 # to differentiate between it and gc=c, which can be written as 'isc',
954 # which is the same characters as ISO_Comment's short name.
99870f4d 955
fbb93542 956 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
e0b29447
KW
957
958 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
959 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
960 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
961 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
99870f4d 962
5f7264c7 963 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
964 );
965
1704a0ea
KW
966 foreach my $property (
967
968 # The following are suppressed because they were made contributory
969 # or deprecated by Unicode before Perl ever thought about
970 # supporting them.
971 'Jamo_Short_Name',
972 'Grapheme_Link',
973 'Expands_On_NFC',
974 'Expands_On_NFD',
975 'Expands_On_NFKC',
976 'Expands_On_NFKD',
977
978 # The following are suppressed because they have been marked
979 # as deprecated for a sufficient amount of time
980 'Other_Alphabetic',
981 'Other_Default_Ignorable_Code_Point',
982 'Other_Grapheme_Extend',
983 'Other_ID_Continue',
984 'Other_ID_Start',
985 'Other_Lowercase',
986 'Other_Math',
987 'Other_Uppercase',
e22aaf5c 988 ) {
99870f4d
KW
989 $why_suppressed{$property} = $why_deprecated{$property};
990 }
cf25bb62 991
99870f4d
KW
992 # Customize the message for all the 'Other_' properties
993 foreach my $property (keys %why_deprecated) {
994 next if (my $main_property = $property) !~ s/^Other_//;
995 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
996 }
997}
998
e9c4b4f8
KW
999if ($write_Unicode_deprecated_tables) {
1000 foreach my $property (keys %why_suppressed) {
1001 delete $why_suppressed{$property} if $property =~
1002 / ^ Other | Grapheme /x;
1003 }
1004}
1005
99870f4d
KW
1006if ($v_version ge 4.0.0) {
1007 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
1008 if ($v_version ge 6.0.0) {
1009 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1010 }
99870f4d 1011}
5f7264c7 1012if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 1013 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7 1014 if ($v_version ge 6.0.0) {
63f74647 1015 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
5f7264c7 1016 }
99870f4d
KW
1017}
1018
1019# Probably obsolete forever
1020if ($v_version ge v4.1.0) {
82aed44a
KW
1021 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1022}
1023if ($v_version ge v6.0.0) {
2b352efd
KW
1024 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
1025 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
99870f4d
KW
1026}
1027
1028# This program can create files for enumerated-like properties, such as
1029# 'Numeric_Type'. This file would be the same format as for a string
1030# property, with a mapping from code point to its value, so you could look up,
1031# for example, the script a code point is in. But no one so far wants this
1032# mapping, or they have found another way to get it since this is a new
1033# feature. So no file is generated except if it is in this list.
1034my @output_mapped_properties = split "\n", <<END;
1035END
1036
c12f2655
KW
1037# If you are using the Unihan database in a Unicode version before 5.2, you
1038# need to add the properties that you want to extract from it to this table.
1039# For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1040# listed, commented out
99870f4d
KW
1041my @cjk_properties = split "\n", <<'END';
1042#cjkAccountingNumeric; kAccountingNumeric
1043#cjkOtherNumeric; kOtherNumeric
1044#cjkPrimaryNumeric; kPrimaryNumeric
1045#cjkCompatibilityVariant; kCompatibilityVariant
1046#cjkIICore ; kIICore
1047#cjkIRG_GSource; kIRG_GSource
1048#cjkIRG_HSource; kIRG_HSource
1049#cjkIRG_JSource; kIRG_JSource
1050#cjkIRG_KPSource; kIRG_KPSource
1051#cjkIRG_KSource; kIRG_KSource
1052#cjkIRG_TSource; kIRG_TSource
1053#cjkIRG_USource; kIRG_USource
1054#cjkIRG_VSource; kIRG_VSource
1055#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1056END
1057
1058# Similarly for the property values. For your convenience, the lines in the
5f7264c7 1059# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
c12f2655 1060# '#' marks (for Unicode versions before 5.2)
99870f4d
KW
1061my @cjk_property_values = split "\n", <<'END';
1062## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1063## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1064## @missing: 0000..10FFFF; cjkIICore; <none>
1065## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1066## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1067## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1068## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1069## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1070## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1071## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1072## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1073## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1074## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1075## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1076END
1077
1078# The input files don't list every code point. Those not listed are to be
1079# defaulted to some value. Below are hard-coded what those values are for
1080# non-binary properties as of 5.1. Starting in 5.0, there are
1081# machine-parsable comment lines in the files the give the defaults; so this
1082# list shouldn't have to be extended. The claim is that all missing entries
1083# for binary properties will default to 'N'. Unicode tried to change that in
1084# 5.2, but the beta period produced enough protest that they backed off.
1085#
1086# The defaults for the fields that appear in UnicodeData.txt in this hash must
1087# be in the form that it expects. The others may be synonyms.
1088my $CODE_POINT = '<code point>';
1089my %default_mapping = (
1090 Age => "Unassigned",
1091 # Bidi_Class => Complicated; set in code
1092 Bidi_Mirroring_Glyph => "",
1093 Block => 'No_Block',
1094 Canonical_Combining_Class => 0,
1095 Case_Folding => $CODE_POINT,
1096 Decomposition_Mapping => $CODE_POINT,
1097 Decomposition_Type => 'None',
1098 East_Asian_Width => "Neutral",
1099 FC_NFKC_Closure => $CODE_POINT,
1100 General_Category => 'Cn',
1101 Grapheme_Cluster_Break => 'Other',
1102 Hangul_Syllable_Type => 'NA',
1103 ISO_Comment => "",
1104 Jamo_Short_Name => "",
1105 Joining_Group => "No_Joining_Group",
1106 # Joining_Type => Complicated; set in code
1107 kIICore => 'N', # Is converted to binary
1108 #Line_Break => Complicated; set in code
1109 Lowercase_Mapping => $CODE_POINT,
1110 Name => "",
1111 Name_Alias => "",
1112 NFC_QC => 'Yes',
1113 NFD_QC => 'Yes',
1114 NFKC_QC => 'Yes',
1115 NFKD_QC => 'Yes',
1116 Numeric_Type => 'None',
1117 Numeric_Value => 'NaN',
1118 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1119 Sentence_Break => 'Other',
1120 Simple_Case_Folding => $CODE_POINT,
1121 Simple_Lowercase_Mapping => $CODE_POINT,
1122 Simple_Titlecase_Mapping => $CODE_POINT,
1123 Simple_Uppercase_Mapping => $CODE_POINT,
1124 Titlecase_Mapping => $CODE_POINT,
1125 Unicode_1_Name => "",
1126 Unicode_Radical_Stroke => "",
1127 Uppercase_Mapping => $CODE_POINT,
1128 Word_Break => 'Other',
1129);
1130
232ed87f
KW
1131# Below are files that Unicode furnishes, but this program ignores, and why.
1132# NormalizationCorrections.txt requires some more explanation. It documents
1133# the cumulative fixes to erroneous normalizations in earlier Unicode
1134# versions. Its main purpose is so that someone running on an earlier version
1135# can use this file to override what got published in that earlier release.
1136# It would be easy for mktables to read and handle this file. But all the
1137# corrections in it should already be in the other files for the release it
1138# is. To get it to actually mean something useful, someone would have to be
1139# using an earlier Unicode release, and copy it to the files for that release
1140# and recomplile. So far there has been no demand to do that, so this hasn't
1141# been implemented.
99870f4d 1142my %ignored_files = (
73ba1144
KW
1143 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1144 'Index.txt' => 'Alphabetical index of Unicode characters',
1145 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1146 'NamesList.txt' => 'Annotated list of characters',
1147 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1148 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1149 'ReadMe.txt' => 'Documentation',
1150 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1151 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
73ba1144
KW
1152 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1153 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1154 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1155 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
99870f4d
KW
1156);
1157
1fec9f60
KW
1158my %skipped_files; # List of files that we skip
1159
678f13d5 1160### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1161
1162my $HEADER=<<"EOF";
1163# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1164# This file is machine-generated by $0 from the Unicode
1165# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1166EOF
1167
126c3d4e 1168my $INTERNAL_ONLY_HEADER = <<"EOF";
99870f4d
KW
1169
1170# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
fac53429
KW
1171# This file is for internal use by core Perl only. The format and even the
1172# name or existence of this file are subject to change without notice. Don't
1173# use it directly.
99870f4d
KW
1174EOF
1175
1176my $DEVELOPMENT_ONLY=<<"EOF";
1177# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1178# This file contains information artificially constrained to code points
1179# present in Unicode release $string_compare_versions.
1180# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1181# not be used for production.
b6922eda
KW
1182
1183EOF
1184
6189eadc
KW
1185my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1186my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1187my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
99870f4d
KW
1188
1189# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1190# two must be 10; if there are 5, the first must not be a 0. Written this way
92199589
KW
1191# to decrease backtracking. The first regex allows the code point to be at
1192# the end of a word, but to work properly, the word shouldn't end with a valid
1193# hex character. The second one won't match a code point at the end of a
1194# word, and doesn't have the run-on issue
8c32d378
KW
1195my $run_on_code_point_re =
1196 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1197my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1198
1199# This matches the beginning of the line in the Unicode db files that give the
1200# defaults for code points not listed (i.e., missing) in the file. The code
1201# depends on this ending with a semi-colon, so it can assume it is a valid
1202# field when the line is split() by semi-colons
1203my $missing_defaults_prefix =
6189eadc 1204 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
99870f4d
KW
1205
1206# Property types. Unicode has more types, but these are sufficient for our
1207# purposes.
1208my $UNKNOWN = -1; # initialized to illegal value
1209my $NON_STRING = 1; # Either binary or enum
1210my $BINARY = 2;
06f26c45
KW
1211my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1212 # tables, additional true and false tables are
1213 # generated so that false is anything matching the
1214 # default value, and true is everything else.
1215my $ENUM = 4; # Include catalog
1216my $STRING = 5; # Anything else: string or misc
99870f4d
KW
1217
1218# Some input files have lines that give default values for code points not
1219# contained in the file. Sometimes these should be ignored.
1220my $NO_DEFAULTS = 0; # Must evaluate to false
1221my $NOT_IGNORED = 1;
1222my $IGNORED = 2;
1223
1224# Range types. Each range has a type. Most ranges are type 0, for normal,
1225# and will appear in the main body of the tables in the output files, but
1226# there are other types of ranges as well, listed below, that are specially
1227# handled. There are pseudo-types as well that will never be stored as a
1228# type, but will affect the calculation of the type.
1229
1230# 0 is for normal, non-specials
1231my $MULTI_CP = 1; # Sequence of more than code point
1232my $HANGUL_SYLLABLE = 2;
1233my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1234my $NULL = 4; # The map is to the null string; utf8.c can't
1235 # handle these, nor is there an accepted syntax
1236 # for them in \p{} constructs
f86864ac 1237my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1238 # otherwise be $MULTI_CP type are instead type 0
1239
1240# process_generic_property_file() can accept certain overrides in its input.
1241# Each of these must begin AND end with $CMD_DELIM.
1242my $CMD_DELIM = "\a";
1243my $REPLACE_CMD = 'replace'; # Override the Replace
1244my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1245
1246my $NO = 0;
1247my $YES = 1;
1248
1249# Values for the Replace argument to add_range.
1250# $NO # Don't replace; add only the code points not
1251 # already present.
1252my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1253 # the comments at the subroutine definition.
1254my $UNCONDITIONALLY = 2; # Replace without conditions.
9470941f 1255my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
99870f4d 1256 # already there
7f4b1e25
KW
1257my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1258 # already there
1259my $CROAK = 6; # Die with an error if is already there
99870f4d
KW
1260
1261# Flags to give property statuses. The phrases are to remind maintainers that
1262# if the flag is changed, the indefinite article referring to it in the
1263# documentation may need to be as well.
1264my $NORMAL = "";
99870f4d
KW
1265my $DEPRECATED = 'D';
1266my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1267my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1268my $DISCOURAGED = 'X';
1269my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1270my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1271my $STRICTER = 'T';
1272my $a_bold_stricter = "a 'B<$STRICTER>'";
1273my $A_bold_stricter = "A 'B<$STRICTER>'";
1274my $STABILIZED = 'S';
1275my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1276my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1277my $OBSOLETE = 'O';
1278my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1279my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1280
1281my %status_past_participles = (
1282 $DISCOURAGED => 'discouraged',
99870f4d
KW
1283 $STABILIZED => 'stabilized',
1284 $OBSOLETE => 'obsolete',
37e2e78e 1285 $DEPRECATED => 'deprecated',
99870f4d
KW
1286);
1287
395dfc19
KW
1288# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1289# externally documented.
301ba948 1290my $ORDINARY = 0; # The normal fate.
395dfc19
KW
1291my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1292 # but there is a file written that can be used to
1293 # reconstruct this table
3cdaf629 1294my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
301ba948 1295 # for Perl's internal use only
3cdaf629
KW
1296my $SUPPRESSED = 3; # The file for this table is not written out, and as a
1297 # result, we don't bother to do many computations on
1298 # it.
1299my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the
1300 # computations anyway, as the values are needed for
1301 # things to work. This happens when we have Perl
1302 # extensions that depend on Unicode tables that
1303 # wouldn't normally be in a given Unicode version.
301ba948 1304
f5817e0a
KW
1305# The format of the values of the tables:
1306my $EMPTY_FORMAT = "";
99870f4d
KW
1307my $BINARY_FORMAT = 'b';
1308my $DECIMAL_FORMAT = 'd';
1309my $FLOAT_FORMAT = 'f';
1310my $INTEGER_FORMAT = 'i';
1311my $HEX_FORMAT = 'x';
1312my $RATIONAL_FORMAT = 'r';
1313my $STRING_FORMAT = 's';
d11155ec 1314my $ADJUST_FORMAT = 'a';
a14f3cb1 1315my $DECOMP_STRING_FORMAT = 'c';
c3ff2976 1316my $STRING_WHITE_SPACE_LIST = 'sw';
99870f4d
KW
1317
1318my %map_table_formats = (
1319 $BINARY_FORMAT => 'binary',
1320 $DECIMAL_FORMAT => 'single decimal digit',
1321 $FLOAT_FORMAT => 'floating point number',
1322 $INTEGER_FORMAT => 'integer',
add63c13 1323 $HEX_FORMAT => 'non-negative hex whole number; a code point',
99870f4d 1324 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1325 $STRING_FORMAT => 'string',
d11155ec 1326 $ADJUST_FORMAT => 'some entries need adjustment',
92f9d56c 1327 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
c3ff2976 1328 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
99870f4d
KW
1329);
1330
1331# Unicode didn't put such derived files in a separate directory at first.
1332my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1333my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1334my $AUXILIARY = 'auxiliary';
1335
1336# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
9e4a1e86 1337# and into UCD.pl for the use of UCD.pm
99870f4d
KW
1338my %loose_to_file_of; # loosely maps table names to their respective
1339 # files
1340my %stricter_to_file_of; # same; but for stricter mapping.
315bfd4e 1341my %loose_property_to_file_of; # Maps a loose property name to its map file
89cf10cc
KW
1342my %file_to_swash_name; # Maps the file name to its corresponding key name
1343 # in the hash %utf8::SwashInfo
99870f4d
KW
1344my %nv_floating_to_rational; # maps numeric values floating point numbers to
1345 # their rational equivalent
c12f2655
KW
1346my %loose_property_name_of; # Loosely maps (non_string) property names to
1347 # standard form
86a52d1e 1348my %string_property_loose_to_name; # Same, for string properties.
c15fda25
KW
1349my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1350 # the property name in standard loose form, and
1351 # 'value' is the default value for that property,
1352 # also in standard loose form.
9e4a1e86
KW
1353my %loose_to_standard_value; # loosely maps table names to the canonical
1354 # alias for them
2df7880f
KW
1355my %ambiguous_names; # keys are alias names (in standard form) that
1356 # have more than one possible meaning.
5d1df013
KW
1357my %prop_aliases; # Keys are standard property name; values are each
1358 # one's aliases
1e863613
KW
1359my %prop_value_aliases; # Keys of top level are standard property name;
1360 # values are keys to another hash, Each one is
1361 # one of the property's values, in standard form.
1362 # The values are that prop-val's aliases.
2df7880f 1363my %ucd_pod; # Holds entries that will go into the UCD section of the pod
99870f4d 1364
d867ccfb
KW
1365# Most properties are immune to caseless matching, otherwise you would get
1366# nonsensical results, as properties are a function of a code point, not
1367# everything that is caselessly equivalent to that code point. For example,
1368# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1369# be true because 's' and 'S' are equivalent caselessly. However,
1370# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1371# extend that concept to those very few properties that are like this. Each
1372# such property will match the full range caselessly. They are hard-coded in
1373# the program; it's not worth trying to make it general as it's extremely
1374# unlikely that they will ever change.
1375my %caseless_equivalent_to;
1376
99870f4d
KW
1377# These constants names and values were taken from the Unicode standard,
1378# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1379# syllables. The '_string' versions are so generated tables can retain the
1380# hex format, which is the more familiar value
1381my $SBase_string = "0xAC00";
1382my $SBase = CORE::hex $SBase_string;
1383my $LBase_string = "0x1100";
1384my $LBase = CORE::hex $LBase_string;
1385my $VBase_string = "0x1161";
1386my $VBase = CORE::hex $VBase_string;
1387my $TBase_string = "0x11A7";
1388my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1389my $SCount = 11172;
1390my $LCount = 19;
1391my $VCount = 21;
1392my $TCount = 28;
1393my $NCount = $VCount * $TCount;
1394
1395# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1396# with the above published constants.
1397my %Jamo;
1398my %Jamo_L; # Leading consonants
1399my %Jamo_V; # Vowels
1400my %Jamo_T; # Trailing consonants
1401
bb1dd3da
KW
1402# For code points whose name contains its ordinal as a '-ABCD' suffix.
1403# The key is the base name of the code point, and the value is an
1404# array giving all the ranges that use this base name. Each range
1405# is actually a hash giving the 'low' and 'high' values of it.
1406my %names_ending_in_code_point;
1407my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1408 # removed from the names
1409# Inverse mapping. The list of ranges that have these kinds of
1410# names. Each element contains the low, high, and base names in an
1411# anonymous hash.
1412my @code_points_ending_in_code_point;
1413
6b5ab373
KW
1414# To hold Unicode's normalization test suite
1415my @normalization_tests;
1416
bb1dd3da
KW
1417# Boolean: does this Unicode version have the hangul syllables, and are we
1418# writing out a table for them?
1419my $has_hangul_syllables = 0;
1420
1421# Does this Unicode version have code points whose names end in their
1422# respective code points, and are we writing out a table for them? 0 for no;
1423# otherwise points to first property that a table is needed for them, so that
1424# if multiple tables are needed, we don't create duplicates
1425my $needing_code_points_ending_in_code_point = 0;
1426
37e2e78e 1427my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1428my @unhandled_properties; # Will contain a list of properties found in
1429 # the input that we didn't process.
f86864ac 1430my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1431 # listed in the pod
1432my @map_properties; # Properties that get map files written
1433my @named_sequences; # NamedSequences.txt contents.
1434my %potential_files; # Generated list of all .txt files in the directory
1435 # structure so we can warn if something is being
1436 # ignored.
1437my @files_actually_output; # List of files we generated.
1438my @more_Names; # Some code point names are compound; this is used
1439 # to store the extra components of them.
1440my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1441 # the minimum before we consider it equivalent to a
1442 # candidate rational
1443my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1444
1445# These store references to certain commonly used property objects
1446my $gc;
1447my $perl;
1448my $block;
3e20195b
KW
1449my $perl_charname;
1450my $print;
7fc6cb55 1451my $Any;
359523e2 1452my $script;
99870f4d
KW
1453
1454# Are there conflicting names because of beginning with 'In_', or 'Is_'
1455my $has_In_conflicts = 0;
1456my $has_Is_conflicts = 0;
1457
1458sub internal_file_to_platform ($) {
1459 # Convert our file paths which have '/' separators to those of the
1460 # platform.
1461
1462 my $file = shift;
1463 return undef unless defined $file;
1464
1465 return File::Spec->join(split '/', $file);
d07a55ed 1466}
5beb625e 1467
99870f4d
KW
1468sub file_exists ($) { # platform independent '-e'. This program internally
1469 # uses slash as a path separator.
1470 my $file = shift;
1471 return 0 if ! defined $file;
1472 return -e internal_file_to_platform($file);
1473}
5beb625e 1474
99870f4d 1475sub objaddr($) {
23e33b60
KW
1476 # Returns the address of the blessed input object.
1477 # It doesn't check for blessedness because that would do a string eval
1478 # every call, and the program is structured so that this is never called
1479 # for a non-blessed object.
99870f4d 1480
23e33b60 1481 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1482
1483 # Numifying a ref gives its address.
051df77b 1484 return pack 'J', $_[0];
99870f4d
KW
1485}
1486
558712cf 1487# These are used only if $annotate is true.
c4019d52
KW
1488# The entire range of Unicode characters is examined to populate these
1489# after all the input has been processed. But most can be skipped, as they
1490# have the same descriptive phrases, such as being unassigned
1491my @viacode; # Contains the 1 million character names
1492my @printable; # boolean: And are those characters printable?
1493my @annotate_char_type; # Contains a type of those characters, specifically
1494 # for the purposes of annotation.
1495my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1496 # name for the purposes of annotation. They map to the
c4019d52
KW
1497 # upper edge of the range, so that the end point can
1498 # be immediately found. This is used to skip ahead to
1499 # the end of a range, and avoid processing each
1500 # individual code point in it.
1501my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1502 # characters, but excluding those which are
1503 # also noncharacter code points
1504
1505# The annotation types are an extension of the regular range types, though
1506# some of the latter are folded into one. Make the new types negative to
1507# avoid conflicting with the regular types
1508my $SURROGATE_TYPE = -1;
1509my $UNASSIGNED_TYPE = -2;
1510my $PRIVATE_USE_TYPE = -3;
1511my $NONCHARACTER_TYPE = -4;
1512my $CONTROL_TYPE = -5;
1513my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1514
1515sub populate_char_info ($) {
558712cf 1516 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1517 # input code point's info that are needed for outputting more detailed
1518 # comments. If calling context wants a return, it is the end point of
1519 # any contiguous range of characters that share essentially the same info
1520
1521 my $i = shift;
1522 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1523
1524 $viacode[$i] = $perl_charname->value_of($i) || "";
1525
1526 # A character is generally printable if Unicode says it is,
1527 # but below we make sure that most Unicode general category 'C' types
1528 # aren't.
1529 $printable[$i] = $print->contains($i);
1530
1531 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1532
1533 # Only these two regular types are treated specially for annotations
1534 # purposes
1535 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1536 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1537
1538 # Give a generic name to all code points that don't have a real name.
1539 # We output ranges, if applicable, for these. Also calculate the end
1540 # point of the range.
1541 my $end;
1542 if (! $viacode[$i]) {
1d025d66
KW
1543 my $nonchar;
1544 if ($gc-> table('Private_use')->contains($i)) {
c4019d52
KW
1545 $viacode[$i] = 'Private Use';
1546 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1547 $printable[$i] = 0;
1548 $end = $gc->table('Private_Use')->containing_range($i)->end;
1549 }
1d025d66
KW
1550 elsif ((defined ($nonchar =
1551 Property::property_ref('Noncharacter_Code_Point'))
1552 && $nonchar->table('Y')->contains($i)))
c4019d52
KW
1553 {
1554 $viacode[$i] = 'Noncharacter';
1555 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1556 $printable[$i] = 0;
1557 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1558 containing_range($i)->end;
1559 }
1560 elsif ($gc-> table('Control')->contains($i)) {
c71dea7f 1561 $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
c4019d52
KW
1562 $annotate_char_type[$i] = $CONTROL_TYPE;
1563 $printable[$i] = 0;
c4019d52
KW
1564 }
1565 elsif ($gc-> table('Unassigned')->contains($i)) {
c4019d52
KW
1566 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1567 $printable[$i] = 0;
1d025d66
KW
1568 if ($v_version lt v2.0.0) { # No blocks in earliest releases
1569 $viacode[$i] = 'Unassigned';
1570 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1571 }
1572 else {
1573 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
c4019d52 1574
bf06c733
KW
1575 # Because we name the unassigned by the blocks they are in, it
1576 # can't go past the end of that block, and it also can't go
1577 # past the unassigned range it is in. The special table makes
1578 # sure that the non-characters, which are unassigned, are
1579 # separated out.
1580 $end = min($block->containing_range($i)->end,
1581 $unassigned_sans_noncharacters->
1582 containing_range($i)->end);
1d025d66
KW
1583 }
1584 }
1585 elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
1586 $viacode[$i] = $gc->value_of($i);
1587 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1588 $printable[$i] = 0;
1589 }
1590 elsif ($gc-> table('Surrogate')->contains($i)) {
1591 $viacode[$i] = 'Surrogate';
1592 $annotate_char_type[$i] = $SURROGATE_TYPE;
1593 $printable[$i] = 0;
1594 $end = $gc->table('Surrogate')->containing_range($i)->end;
13ca76ff
KW
1595 }
1596 else {
1597 Carp::my_carp_bug("Can't figure out how to annotate "
1598 . sprintf("U+%04X", $i)
1599 . ". Proceeding anyway.");
c4019d52
KW
1600 $viacode[$i] = 'UNKNOWN';
1601 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1602 $printable[$i] = 0;
1603 }
1604 }
1605
1606 # Here, has a name, but if it's one in which the code point number is
1607 # appended to the name, do that.
1608 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1609 $viacode[$i] .= sprintf("-%04X", $i);
1610 $end = $perl_charname->containing_range($i)->end;
1611 }
1612
1613 # And here, has a name, but if it's a hangul syllable one, replace it with
1614 # the correct name from the Unicode algorithm
1615 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1616 use integer;
1617 my $SIndex = $i - $SBase;
1618 my $L = $LBase + $SIndex / $NCount;
1619 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1620 my $T = $TBase + $SIndex % $TCount;
1621 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1622 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1623 $end = $perl_charname->containing_range($i)->end;
1624 }
1625
1626 return if ! defined wantarray;
1627 return $i if ! defined $end; # If not a range, return the input
1628
1629 # Save this whole range so can find the end point quickly
1630 $annotate_ranges->add_map($i, $end, $end);
1631
1632 return $end;
1633}
1634
23e33b60
KW
1635# Commented code below should work on Perl 5.8.
1636## This 'require' doesn't necessarily work in miniperl, and even if it does,
1637## the native perl version of it (which is what would operate under miniperl)
1638## is extremely slow, as it does a string eval every call.
1639#my $has_fast_scalar_util = $\18 !~ /miniperl/
1640# && defined eval "require Scalar::Util";
1641#
1642#sub objaddr($) {
1643# # Returns the address of the blessed input object. Uses the XS version if
1644# # available. It doesn't check for blessedness because that would do a
1645# # string eval every call, and the program is structured so that this is
1646# # never called for a non-blessed object.
1647#
1648# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1649#
1650# # Check at least that is a ref.
1651# my $pkg = ref($_[0]) or return undef;
1652#
1653# # Change to a fake package to defeat any overloaded stringify
1654# bless $_[0], 'main::Fake';
1655#
1656# # Numifying a ref gives its address.
051df77b 1657# my $addr = pack 'J', $_[0];
23e33b60
KW
1658#
1659# # Return to original class
1660# bless $_[0], $pkg;
1661# return $addr;
1662#}
1663
99870f4d
KW
1664sub max ($$) {
1665 my $a = shift;
1666 my $b = shift;
1667 return $a if $a >= $b;
1668 return $b;
1669}
1670
1671sub min ($$) {
1672 my $a = shift;
1673 my $b = shift;
1674 return $a if $a <= $b;
1675 return $b;
1676}
1677
1678sub clarify_number ($) {
1679 # This returns the input number with underscores inserted every 3 digits
1680 # in large (5 digits or more) numbers. Input must be entirely digits, not
1681 # checked.
1682
1683 my $number = shift;
1684 my $pos = length($number) - 3;
1685 return $number if $pos <= 1;
1686 while ($pos > 0) {
1687 substr($number, $pos, 0) = '_';
1688 $pos -= 3;
5beb625e 1689 }
99870f4d 1690 return $number;
99598c8c
JH
1691}
1692
12ac2576 1693
99870f4d 1694package Carp;
7ebf06b3 1695
99870f4d
KW
1696# These routines give a uniform treatment of messages in this program. They
1697# are placed in the Carp package to cause the stack trace to not include them,
1698# although an alternative would be to use another package and set @CARP_NOT
1699# for it.
12ac2576 1700
99870f4d 1701our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1702
99f78760
KW
1703# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1704# and overload trying to load Scalar:Util under miniperl. See
1705# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1706undef $overload::VERSION;
1707
99870f4d
KW
1708sub my_carp {
1709 my $message = shift || "";
1710 my $nofold = shift || 0;
7ebf06b3 1711
99870f4d
KW
1712 if ($message) {
1713 $message = main::join_lines($message);
1714 $message =~ s/^$0: *//; # Remove initial program name
1715 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1716 $message = "\n$0: $message;";
12ac2576 1717
99870f4d
KW
1718 # Fold the message with program name, semi-colon end punctuation
1719 # (which looks good with the message that carp appends to it), and a
1720 # hanging indent for continuation lines.
1721 $message = main::simple_fold($message, "", 4) unless $nofold;
1722 $message =~ s/\n$//; # Remove the trailing nl so what carp
1723 # appends is to the same line
1724 }
12ac2576 1725
99870f4d 1726 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1727
99870f4d
KW
1728 carp $message;
1729 return;
1730}
7ebf06b3 1731
99870f4d
KW
1732sub my_carp_bug {
1733 # This is called when it is clear that the problem is caused by a bug in
1734 # this program.
7ebf06b3 1735
99870f4d
KW
1736 my $message = shift;
1737 $message =~ s/^$0: *//;
1738 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1739 carp $message;
1740 return;
1741}
7ebf06b3 1742
99870f4d
KW
1743sub carp_too_few_args {
1744 if (@_ != 2) {
1745 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1746 return;
12ac2576 1747 }
7ebf06b3 1748
99870f4d
KW
1749 my $args_ref = shift;
1750 my $count = shift;
7ebf06b3 1751
99870f4d
KW
1752 my_carp_bug("Need at least $count arguments to "
1753 . (caller 1)[3]
1754 . ". Instead got: '"
1755 . join ', ', @$args_ref
1756 . "'. No action taken.");
1757 return;
12ac2576
JP
1758}
1759
99870f4d
KW
1760sub carp_extra_args {
1761 my $args_ref = shift;
1762 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1763
99870f4d
KW
1764 unless (ref $args_ref) {
1765 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1766 return;
1767 }
1768 my ($package, $file, $line) = caller;
1769 my $subroutine = (caller 1)[3];
cf25bb62 1770
99870f4d
KW
1771 my $list;
1772 if (ref $args_ref eq 'HASH') {
1773 foreach my $key (keys %$args_ref) {
1774 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1775 }
99870f4d 1776 $list = join ', ', each %{$args_ref};
cf25bb62 1777 }
99870f4d
KW
1778 elsif (ref $args_ref eq 'ARRAY') {
1779 foreach my $arg (@$args_ref) {
1780 $arg = $UNDEF unless defined $arg;
1781 }
1782 $list = join ', ', @$args_ref;
1783 }
1784 else {
1785 my_carp_bug("Can't cope with ref "
1786 . ref($args_ref)
1787 . " . argument to 'carp_extra_args'. Not checking arguments.");
1788 return;
1789 }
1790
1791 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1792 return;
d73e5302
JH
1793}
1794
99870f4d
KW
1795package main;
1796
1797{ # Closure
1798
1799 # This program uses the inside-out method for objects, as recommended in
1800 # "Perl Best Practices". This closure aids in generating those. There
1801 # are two routines. setup_package() is called once per package to set
1802 # things up, and then set_access() is called for each hash representing a
1803 # field in the object. These routines arrange for the object to be
1804 # properly destroyed when no longer used, and for standard accessor
1805 # functions to be generated. If you need more complex accessors, just
1806 # write your own and leave those accesses out of the call to set_access().
1807 # More details below.
1808
1809 my %constructor_fields; # fields that are to be used in constructors; see
1810 # below
1811
1812 # The values of this hash will be the package names as keys to other
1813 # hashes containing the name of each field in the package as keys, and
1814 # references to their respective hashes as values.
1815 my %package_fields;
1816
1817 sub setup_package {
1818 # Sets up the package, creating standard DESTROY and dump methods
1819 # (unless already defined). The dump method is used in debugging by
1820 # simple_dumper().
1821 # The optional parameters are:
1822 # a) a reference to a hash, that gets populated by later
1823 # set_access() calls with one of the accesses being
1824 # 'constructor'. The caller can then refer to this, but it is
1825 # not otherwise used by these two routines.
1826 # b) a reference to a callback routine to call during destruction
1827 # of the object, before any fields are actually destroyed
1828
1829 my %args = @_;
1830 my $constructor_ref = delete $args{'Constructor_Fields'};
1831 my $destroy_callback = delete $args{'Destroy_Callback'};
1832 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1833
1834 my %fields;
1835 my $package = (caller)[0];
1836
1837 $package_fields{$package} = \%fields;
1838 $constructor_fields{$package} = $constructor_ref;
1839
1840 unless ($package->can('DESTROY')) {
1841 my $destroy_name = "${package}::DESTROY";
1842 no strict "refs";
1843
1844 # Use typeglob to give the anonymous subroutine the name we want
1845 *$destroy_name = sub {
1846 my $self = shift;
ffe43484 1847 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1848
1849 $self->$destroy_callback if $destroy_callback;
1850 foreach my $field (keys %{$package_fields{$package}}) {
1851 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1852 delete $package_fields{$package}{$field}{$addr};
1853 }
1854 return;
1855 }
1856 }
1857
1858 unless ($package->can('dump')) {
1859 my $dump_name = "${package}::dump";
1860 no strict "refs";
1861 *$dump_name = sub {
1862 my $self = shift;
1863 return dump_inside_out($self, $package_fields{$package}, @_);
1864 }
1865 }
1866 return;
1867 }
1868
1869 sub set_access {
1870 # Arrange for the input field to be garbage collected when no longer
1871 # needed. Also, creates standard accessor functions for the field
1872 # based on the optional parameters-- none if none of these parameters:
1873 # 'addable' creates an 'add_NAME()' accessor function.
1874 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1875 # function.
1876 # 'settable' creates a 'set_NAME()' accessor function.
1877 # 'constructor' doesn't create an accessor function, but adds the
1878 # field to the hash that was previously passed to
1879 # setup_package();
1880 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1881 # 'add' etc. all mean 'addable'.
1882 # The read accessor function will work on both array and scalar
1883 # values. If another accessor in the parameter list is 'a', the read
1884 # access assumes an array. You can also force it to be array access
1885 # by specifying 'readable_array' instead of 'readable'
1886 #
1887 # A sort-of 'protected' access can be set-up by preceding the addable,
1888 # readable or settable with some initial portion of 'protected_' (but,
1889 # the underscore is required), like 'p_a', 'pro_set', etc. The
1890 # "protection" is only by convention. All that happens is that the
1891 # accessor functions' names begin with an underscore. So instead of
1892 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1893 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1894 # end of each package, and then storing the __LINE__ ranges and
1895 # checking them on every accessor. But that is way overkill.)
1896
1897 # We create anonymous subroutines as the accessors and then use
1898 # typeglobs to assign them to the proper package and name
1899
1900 my $name = shift; # Name of the field
1901 my $field = shift; # Reference to the inside-out hash containing the
1902 # field
1903
1904 my $package = (caller)[0];
1905
1906 if (! exists $package_fields{$package}) {
1907 croak "$0: Must call 'setup_package' before 'set_access'";
1908 }
d73e5302 1909
99870f4d
KW
1910 # Stash the field so DESTROY can get it.
1911 $package_fields{$package}{$name} = $field;
cf25bb62 1912
99870f4d
KW
1913 # Remaining arguments are the accessors. For each...
1914 foreach my $access (@_) {
1915 my $access = lc $access;
cf25bb62 1916
99870f4d 1917 my $protected = "";
cf25bb62 1918
99870f4d
KW
1919 # Match the input as far as it goes.
1920 if ($access =~ /^(p[^_]*)_/) {
1921 $protected = $1;
1922 if (substr('protected_', 0, length $protected)
1923 eq $protected)
1924 {
1925
1926 # Add 1 for the underscore not included in $protected
1927 $access = substr($access, length($protected) + 1);
1928 $protected = '_';
1929 }
1930 else {
1931 $protected = "";
1932 }
1933 }
1934
1935 if (substr('addable', 0, length $access) eq $access) {
1936 my $subname = "${package}::${protected}add_$name";
1937 no strict "refs";
1938
1939 # add_ accessor. Don't add if already there, which we
1940 # determine using 'eq' for scalars and '==' otherwise.
1941 *$subname = sub {
1942 use strict "refs";
1943 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1944 my $self = shift;
1945 my $value = shift;
ffe43484 1946 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1947 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1948 if (ref $value) {
f998e60c 1949 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1950 }
1951 else {
f998e60c 1952 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1953 }
f998e60c 1954 push @{$field->{$addr}}, $value;
99870f4d
KW
1955 return;
1956 }
1957 }
1958 elsif (substr('constructor', 0, length $access) eq $access) {
1959 if ($protected) {
1960 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1961 }
1962 else {
1963 $constructor_fields{$package}{$name} = $field;
1964 }
1965 }
1966 elsif (substr('readable_array', 0, length $access) eq $access) {
1967
1968 # Here has read access. If one of the other parameters for
1969 # access is array, or this one specifies array (by being more
1970 # than just 'readable_'), then create a subroutine that
1971 # assumes the data is an array. Otherwise just a scalar
1972 my $subname = "${package}::${protected}$name";
1973 if (grep { /^a/i } @_
1974 or length($access) > length('readable_'))
1975 {
1976 no strict "refs";
1977 *$subname = sub {
1978 use strict "refs";
23e33b60 1979 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1980 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1981 if (ref $field->{$addr} ne 'ARRAY') {
1982 my $type = ref $field->{$addr};
1983 $type = 'scalar' unless $type;
1984 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1985 return;
1986 }
1987 return scalar @{$field->{$addr}} unless wantarray;
1988
1989 # Make a copy; had problems with caller modifying the
1990 # original otherwise
1991 my @return = @{$field->{$addr}};
1992 return @return;
1993 }
1994 }
1995 else {
1996
1997 # Here not an array value, a simpler function.
1998 no strict "refs";
1999 *$subname = sub {
2000 use strict "refs";
23e33b60 2001 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 2002 no overloading;
051df77b 2003 return $field->{pack 'J', $_[0]};
99870f4d
KW
2004 }
2005 }
2006 }
2007 elsif (substr('settable', 0, length $access) eq $access) {
2008 my $subname = "${package}::${protected}set_$name";
2009 no strict "refs";
2010 *$subname = sub {
2011 use strict "refs";
23e33b60
KW
2012 if (main::DEBUG) {
2013 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2014 Carp::carp_extra_args(\@_) if @_ > 2;
2015 }
2016 # $self is $_[0]; $value is $_[1]
f998e60c 2017 no overloading;
051df77b 2018 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
2019 return;
2020 }
2021 }
2022 else {
2023 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2024 }
cf25bb62 2025 }
99870f4d 2026 return;
cf25bb62 2027 }
99870f4d
KW
2028}
2029
2030package Input_file;
2031
2032# All input files use this object, which stores various attributes about them,
2033# and provides for convenient, uniform handling. The run method wraps the
2034# processing. It handles all the bookkeeping of opening, reading, and closing
2035# the file, returning only significant input lines.
2036#
2037# Each object gets a handler which processes the body of the file, and is
2038# called by run(). Most should use the generic, default handler, which has
2039# code scrubbed to handle things you might not expect. A handler should
2040# basically be a while(next_line()) {...} loop.
2041#
2042# You can also set up handlers to
2043# 1) call before the first line is read for pre processing
2044# 2) call to adjust each line of the input before the main handler gets them
2045# 3) call upon EOF before the main handler exits its loop
2046# 4) call at the end for post processing
2047#
2048# $_ is used to store the input line, and is to be filtered by the
2049# each_line_handler()s. So, if the format of the line is not in the desired
2050# format for the main handler, these are used to do that adjusting. They can
2051# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2052# so the $_ output of one is used as the input to the next. None of the other
2053# handlers are stackable, but could easily be changed to be so.
2054#
2055# Most of the handlers can call insert_lines() or insert_adjusted_lines()
2056# which insert the parameters as lines to be processed before the next input
2057# file line is read. This allows the EOF handler to flush buffers, for
2058# example. The difference between the two routines is that the lines inserted
2059# by insert_lines() are subjected to the each_line_handler()s. (So if you
2060# called it from such a handler, you would get infinite recursion.) Lines
2061# inserted by insert_adjusted_lines() go directly to the main handler without
2062# any adjustments. If the post-processing handler calls any of these, there
2063# will be no effect. Some error checking for these conditions could be added,
2064# but it hasn't been done.
2065#
2066# carp_bad_line() should be called to warn of bad input lines, which clears $_
2067# to prevent further processing of the line. This routine will output the
2068# message as a warning once, and then keep a count of the lines that have the
2069# same message, and output that count at the end of the file's processing.
2070# This keeps the number of messages down to a manageable amount.
2071#
2072# get_missings() should be called to retrieve any @missing input lines.
2073# Messages will be raised if this isn't done if the options aren't to ignore
2074# missings.
2075
2076sub trace { return main::trace(@_); }
2077
99870f4d
KW
2078{ # Closure
2079 # Keep track of fields that are to be put into the constructor.
2080 my %constructor_fields;
2081
2082 main::setup_package(Constructor_Fields => \%constructor_fields);
2083
2084 my %file; # Input file name, required
2085 main::set_access('file', \%file, qw{ c r });
2086
2087 my %first_released; # Unicode version file was first released in, required
2088 main::set_access('first_released', \%first_released, qw{ c r });
2089
2090 my %handler; # Subroutine to process the input file, defaults to
2091 # 'process_generic_property_file'
2092 main::set_access('handler', \%handler, qw{ c });
2093
2094 my %property;
2095 # name of property this file is for. defaults to none, meaning not
2096 # applicable, or is otherwise determinable, for example, from each line.
696609bf 2097 main::set_access('property', \%property, qw{ c r });
99870f4d
KW
2098
2099 my %optional;
2100 # If this is true, the file is optional. If not present, no warning is
2101 # output. If it is present, the string given by this parameter is
2102 # evaluated, and if false the file is not processed.
2103 main::set_access('optional', \%optional, 'c', 'r');
2104
2105 my %non_skip;
2106 # This is used for debugging, to skip processing of all but a few input
2107 # files. Add 'non_skip => 1' to the constructor for those files you want
2108 # processed when you set the $debug_skip global.
2109 main::set_access('non_skip', \%non_skip, 'c');
2110
37e2e78e 2111 my %skip;
09ca89ce
KW
2112 # This is used to skip processing of this input file semi-permanently,
2113 # when it evaluates to true. The value should be the reason the file is
2114 # being skipped. It is used for files that we aren't planning to process
2115 # anytime soon, but want to allow to be in the directory and not raise a
2116 # message that we are not handling. Mostly for test files. This is in
2117 # contrast to the non_skip element, which is supposed to be used very
2118 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2119 # pretty much will never look at can be placed in the global
1fec9f60 2120 # %ignored_files instead. Ones used here will be added to %skipped files
37e2e78e
KW
2121 main::set_access('skip', \%skip, 'c');
2122
99870f4d
KW
2123 my %each_line_handler;
2124 # list of subroutines to look at and filter each non-comment line in the
2125 # file. defaults to none. The subroutines are called in order, each is
2126 # to adjust $_ for the next one, and the final one adjusts it for
2127 # 'handler'
2128 main::set_access('each_line_handler', \%each_line_handler, 'c');
2129
2130 my %has_missings_defaults;
2131 # ? Are there lines in the file giving default values for code points
2132 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2133 # the norm, but IGNORED means it has such lines, but the handler doesn't
2134 # use them. Having these three states allows us to catch changes to the
2135 # UCD that this program should track
2136 main::set_access('has_missings_defaults',
2137 \%has_missings_defaults, qw{ c r });
2138
2139 my %pre_handler;
2140 # Subroutine to call before doing anything else in the file. If undef, no
2141 # such handler is called.
2142 main::set_access('pre_handler', \%pre_handler, qw{ c });
2143
2144 my %eof_handler;
2145 # Subroutine to call upon getting an EOF on the input file, but before
2146 # that is returned to the main handler. This is to allow buffers to be
2147 # flushed. The handler is expected to call insert_lines() or
2148 # insert_adjusted() with the buffered material
2149 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2150
2151 my %post_handler;
2152 # Subroutine to call after all the lines of the file are read in and
2153 # processed. If undef, no such handler is called.
2154 main::set_access('post_handler', \%post_handler, qw{ c });
2155
2156 my %progress_message;
2157 # Message to print to display progress in lieu of the standard one
2158 main::set_access('progress_message', \%progress_message, qw{ c });
2159
2160 my %handle;
2161 # cache open file handle, internal. Is undef if file hasn't been
2162 # processed at all, empty if has;
2163 main::set_access('handle', \%handle);
2164
2165 my %added_lines;
2166 # cache of lines added virtually to the file, internal
2167 main::set_access('added_lines', \%added_lines);
2168
2169 my %errors;
2170 # cache of errors found, internal
2171 main::set_access('errors', \%errors);
2172
2173 my %missings;
2174 # storage of '@missing' defaults lines
2175 main::set_access('missings', \%missings);
2176
2177 sub new {
2178 my $class = shift;
2179
2180 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2181 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2182
2183 # Set defaults
2184 $handler{$addr} = \&main::process_generic_property_file;
2185 $non_skip{$addr} = 0;
37e2e78e 2186 $skip{$addr} = 0;
99870f4d
KW
2187 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2188 $handle{$addr} = undef;
2189 $added_lines{$addr} = [ ];
2190 $each_line_handler{$addr} = [ ];
2191 $errors{$addr} = { };
2192 $missings{$addr} = [ ];
2193
2194 # Two positional parameters.
99f78760 2195 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2196 $file{$addr} = main::internal_file_to_platform(shift);
2197 $first_released{$addr} = shift;
2198
2199 # The rest of the arguments are key => value pairs
2200 # %constructor_fields has been set up earlier to list all possible
2201 # ones. Either set or push, depending on how the default has been set
2202 # up just above.
2203 my %args = @_;
2204 foreach my $key (keys %args) {
2205 my $argument = $args{$key};
2206
2207 # Note that the fields are the lower case of the constructor keys
2208 my $hash = $constructor_fields{lc $key};
2209 if (! defined $hash) {
2210 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2211 next;
2212 }
2213 if (ref $hash->{$addr} eq 'ARRAY') {
2214 if (ref $argument eq 'ARRAY') {
2215 foreach my $argument (@{$argument}) {
2216 next if ! defined $argument;
2217 push @{$hash->{$addr}}, $argument;
2218 }
2219 }
2220 else {
2221 push @{$hash->{$addr}}, $argument if defined $argument;
2222 }
2223 }
2224 else {
2225 $hash->{$addr} = $argument;
2226 }
2227 delete $args{$key};
2228 };
2229
2230 # If the file has a property for it, it means that the property is not
2231 # listed in the file's entries. So add a handler to the list of line
2232 # handlers to insert the property name into the lines, to provide a
2233 # uniform interface to the final processing subroutine.
2234 # the final code doesn't have to worry about that.
2235 if ($property{$addr}) {
2236 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2237 }
2238
2239 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2240 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 2241 }
99870f4d 2242
09ca89ce
KW
2243 # If skipping, set to optional, and add to list of ignored files,
2244 # including its reason
2245 if ($skip{$addr}) {
2246 $optional{$addr} = 1;
1fec9f60 2247 $skipped_files{$file{$addr}} = $skip{$addr}
09ca89ce 2248 }
37e2e78e 2249
99870f4d 2250 return $self;
d73e5302
JH
2251 }
2252
cf25bb62 2253
99870f4d
KW
2254 use overload
2255 fallback => 0,
2256 qw("") => "_operator_stringify",
2257 "." => \&main::_operator_dot,
1285127e 2258 ".=" => \&main::_operator_dot_equal,
99870f4d 2259 ;
cf25bb62 2260
99870f4d
KW
2261 sub _operator_stringify {
2262 my $self = shift;
cf25bb62 2263
99870f4d 2264 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2265 }
d73e5302 2266
99870f4d
KW
2267 # flag to make sure extracted files are processed early
2268 my $seen_non_extracted_non_age = 0;
d73e5302 2269
99870f4d
KW
2270 sub run {
2271 # Process the input object $self. This opens and closes the file and
2272 # calls all the handlers for it. Currently, this can only be called
2273 # once per file, as it destroy's the EOF handler
d73e5302 2274
99870f4d
KW
2275 my $self = shift;
2276 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2277
ffe43484 2278 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2279
99870f4d 2280 my $file = $file{$addr};
d73e5302 2281
99870f4d
KW
2282 # Don't process if not expecting this file (because released later
2283 # than this Unicode version), and isn't there. This means if someone
2284 # copies it into an earlier version's directory, we will go ahead and
2285 # process it.
2286 return if $first_released{$addr} gt $v_version && ! -e $file;
2287
2288 # If in debugging mode and this file doesn't have the non-skip
2289 # flag set, and isn't one of the critical files, skip it.
2290 if ($debug_skip
2291 && $first_released{$addr} ne v0
2292 && ! $non_skip{$addr})
2293 {
2294 print "Skipping $file in debugging\n" if $verbosity;
2295 return;
2296 }
2297
2298 # File could be optional
37e2e78e 2299 if ($optional{$addr}) {
99870f4d
KW
2300 return unless -e $file;
2301 my $result = eval $optional{$addr};
2302 if (! defined $result) {
2303 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2304 return;
2305 }
2306 if (! $result) {
2307 if ($verbosity) {
2308 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2309 }
2310 return;
2311 }
2312 }
2313
2314 if (! defined $file || ! -e $file) {
2315
2316 # If the file doesn't exist, see if have internal data for it
2317 # (based on first_released being 0).
2318 if ($first_released{$addr} eq v0) {
2319 $handle{$addr} = 'pretend_is_open';
2320 }
2321 else {
2322 if (! $optional{$addr} # File could be optional
2323 && $v_version ge $first_released{$addr})
2324 {
2325 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2326 }
2327 return;
2328 }
2329 }
2330 else {
2331
37e2e78e
KW
2332 # Here, the file exists. Some platforms may change the case of
2333 # its name
99870f4d 2334 if ($seen_non_extracted_non_age) {
517956bf 2335 if ($file =~ /$EXTRACTED/i) {
1675ea0d 2336 Carp::my_carp_bug(main::join_lines(<<END
99f78760 2337$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2338anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2339have subtle problems
2340END
2341 ));
2342 }
2343 }
2344 elsif ($EXTRACTED_DIR
2345 && $first_released{$addr} ne v0
517956bf
CB
2346 && $file !~ /$EXTRACTED/i
2347 && lc($file) ne 'dage.txt')
99870f4d
KW
2348 {
2349 # We don't set this (by the 'if' above) if we have no
2350 # extracted directory, so if running on an early version,
2351 # this test won't work. Not worth worrying about.
2352 $seen_non_extracted_non_age = 1;
2353 }
2354
2355 # And mark the file as having being processed, and warn if it
2356 # isn't a file we are expecting. As we process the files,
2357 # they are deleted from the hash, so any that remain at the
2358 # end of the program are files that we didn't process.
517956bf 2359 my $fkey = File::Spec->rel2abs($file);
faf3cf6b
KW
2360 my $expecting = delete $potential_files{lc($fkey)};
2361
678f13d5
KW
2362 Carp::my_carp("Was not expecting '$file'.") if
2363 ! $expecting
99870f4d
KW
2364 && ! defined $handle{$addr};
2365
37e2e78e
KW
2366 # Having deleted from expected files, we can quit if not to do
2367 # anything. Don't print progress unless really want verbosity
2368 if ($skip{$addr}) {
2369 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2370 return;
2371 }
2372
99870f4d
KW
2373 # Open the file, converting the slashes used in this program
2374 # into the proper form for the OS
2375 my $file_handle;
2376 if (not open $file_handle, "<", $file) {
2377 Carp::my_carp("Can't open $file. Skipping: $!");
2378 return 0;
2379 }
2380 $handle{$addr} = $file_handle; # Cache the open file handle
2381 }
2382
2383 if ($verbosity >= $PROGRESS) {
2384 if ($progress_message{$addr}) {
2385 print "$progress_message{$addr}\n";
2386 }
2387 else {
2388 # If using a virtual file, say so.
2389 print "Processing ", (-e $file)
2390 ? $file
2391 : "substitute $file",
2392 "\n";
2393 }
2394 }
2395
2396
2397 # Call any special handler for before the file.
2398 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2399
2400 # Then the main handler
2401 &{$handler{$addr}}($self);
2402
2403 # Then any special post-file handler.
2404 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2405
2406 # If any errors have been accumulated, output the counts (as the first
2407 # error message in each class was output when it was encountered).
2408 if ($errors{$addr}) {
2409 my $total = 0;
2410 my $types = 0;
2411 foreach my $error (keys %{$errors{$addr}}) {
2412 $total += $errors{$addr}->{$error};
2413 delete $errors{$addr}->{$error};
2414 $types++;
2415 }
2416 if ($total > 1) {
2417 my $message
2418 = "A total of $total lines had errors in $file. ";
2419
2420 $message .= ($types == 1)
2421 ? '(Only the first one was displayed.)'
2422 : '(Only the first of each type was displayed.)';
2423 Carp::my_carp($message);
2424 }
2425 }
2426
2427 if (@{$missings{$addr}}) {
2428 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2429 }
2430
2431 # If a real file handle, close it.
2432 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2433 ref $handle{$addr};
2434 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2435 # the file, as opposed to undef
2436 return;
2437 }
2438
2439 sub next_line {
2440 # Sets $_ to be the next logical input line, if any. Returns non-zero
2441 # if such a line exists. 'logical' means that any lines that have
2442 # been added via insert_lines() will be returned in $_ before the file
2443 # is read again.
2444
2445 my $self = shift;
2446 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2447
ffe43484 2448 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2449
2450 # Here the file is open (or if the handle is not a ref, is an open
2451 # 'virtual' file). Get the next line; any inserted lines get priority
2452 # over the file itself.
2453 my $adjusted;
2454
2455 LINE:
2456 while (1) { # Loop until find non-comment, non-empty line
2457 #local $to_trace = 1 if main::DEBUG;
2458 my $inserted_ref = shift @{$added_lines{$addr}};
2459 if (defined $inserted_ref) {
2460 ($adjusted, $_) = @{$inserted_ref};
2461 trace $adjusted, $_ if main::DEBUG && $to_trace;
2462 return 1 if $adjusted;
2463 }
2464 else {
2465 last if ! ref $handle{$addr}; # Don't read unless is real file
2466 last if ! defined ($_ = readline $handle{$addr});
2467 }
2468 chomp;
2469 trace $_ if main::DEBUG && $to_trace;
2470
2471 # See if this line is the comment line that defines what property
2472 # value that code points that are not listed in the file should
2473 # have. The format or existence of these lines is not guaranteed
2474 # by Unicode since they are comments, but the documentation says
2475 # that this was added for machine-readability, so probably won't
2476 # change. This works starting in Unicode Version 5.0. They look
2477 # like:
2478 #
2479 # @missing: 0000..10FFFF; Not_Reordered
2480 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2481 # @missing: 0000..10FFFF; ; NaN
2482 #
2483 # Save the line for a later get_missings() call.
2484 if (/$missing_defaults_prefix/) {
2485 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2486 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2487 }
2488 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2489 my @defaults = split /\s* ; \s*/x, $_;
2490
2491 # The first field is the @missing, which ends in a
2492 # semi-colon, so can safely shift.
2493 shift @defaults;
2494
2495 # Some of these lines may have empty field placeholders
2496 # which get in the way. An example is:
2497 # @missing: 0000..10FFFF; ; NaN
2498 # Remove them. Process starting from the top so the
2499 # splice doesn't affect things still to be looked at.
2500 for (my $i = @defaults - 1; $i >= 0; $i--) {
2501 next if $defaults[$i] ne "";
2502 splice @defaults, $i, 1;
2503 }
2504
2505 # What's left should be just the property (maybe) and the
2506 # default. Having only one element means it doesn't have
2507 # the property.
2508 my $default;
2509 my $property;
2510 if (@defaults >= 1) {
2511 if (@defaults == 1) {
2512 $default = $defaults[0];
2513 }
2514 else {
2515 $property = $defaults[0];
2516 $default = $defaults[1];
2517 }
2518 }
2519
2520 if (@defaults < 1
2521 || @defaults > 2
2522 || ($default =~ /^</
2523 && $default !~ /^<code *point>$/i
09f8d0ac
KW
2524 && $default !~ /^<none>$/i
2525 && $default !~ /^<script>$/i))
99870f4d
KW
2526 {
2527 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2528 }
2529 else {
2530
2531 # If the property is missing from the line, it should
2532 # be the one for the whole file
2533 $property = $property{$addr} if ! defined $property;
2534
2535 # Change <none> to the null string, which is what it
2536 # really means. If the default is the code point
2537 # itself, set it to <code point>, which is what
2538 # Unicode uses (but sometimes they've forgotten the
2539 # space)
2540 if ($default =~ /^<none>$/i) {
2541 $default = "";
2542 }
2543 elsif ($default =~ /^<code *point>$/i) {
2544 $default = $CODE_POINT;
2545 }
09f8d0ac
KW
2546 elsif ($default =~ /^<script>$/i) {
2547
2548 # Special case this one. Currently is from
2549 # ScriptExtensions.txt, and means for all unlisted
2550 # code points, use their Script property values.
2551 # For the code points not listed in that file, the
2552 # default value is 'Unknown'.
2553 $default = "Unknown";
2554 }
99870f4d
KW
2555
2556 # Store them as a sub-arrays with both components.
2557 push @{$missings{$addr}}, [ $default, $property ];
2558 }
2559 }
2560
2561 # There is nothing for the caller to process on this comment
2562 # line.
2563 next;
2564 }
2565
2566 # Remove comments and trailing space, and skip this line if the
2567 # result is empty
2568 s/#.*//;
2569 s/\s+$//;
2570 next if /^$/;
2571
2572 # Call any handlers for this line, and skip further processing of
2573 # the line if the handler sets the line to null.
2574 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2575 &{$sub_ref}($self);
2576 next LINE if /^$/;
2577 }
2578
2579 # Here the line is ok. return success.
2580 return 1;
2581 } # End of looping through lines.
2582
2583 # If there is an EOF handler, call it (only once) and if it generates
2584 # more lines to process go back in the loop to handle them.
2585 if ($eof_handler{$addr}) {
2586 &{$eof_handler{$addr}}($self);
2587 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2588 goto LINE if $added_lines{$addr};
2589 }
2590
2591 # Return failure -- no more lines.
2592 return 0;
2593
2594 }
2595
2596# Not currently used, not fully tested.
2597# sub peek {
2598# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2599# # record. Not callable from an each_line_handler(), nor does it call
2600# # an each_line_handler() on the line.
2601#
2602# my $self = shift;
ffe43484 2603# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2604#
2605# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2606# my ($adjusted, $line) = @{$inserted_ref};
2607# next if $adjusted;
2608#
2609# # Remove comments and trailing space, and return a non-empty
2610# # resulting line
2611# $line =~ s/#.*//;
2612# $line =~ s/\s+$//;
2613# return $line if $line ne "";
2614# }
2615#
2616# return if ! ref $handle{$addr}; # Don't read unless is real file
2617# while (1) { # Loop until find non-comment, non-empty line
2618# local $to_trace = 1 if main::DEBUG;
2619# trace $_ if main::DEBUG && $to_trace;
2620# return if ! defined (my $line = readline $handle{$addr});
2621# chomp $line;
2622# push @{$added_lines{$addr}}, [ 0, $line ];
2623#
2624# $line =~ s/#.*//;
2625# $line =~ s/\s+$//;
2626# return $line if $line ne "";
2627# }
2628#
2629# return;
2630# }
2631
2632
2633 sub insert_lines {
2634 # Lines can be inserted so that it looks like they were in the input
2635 # file at the place it was when this routine is called. See also
2636 # insert_adjusted_lines(). Lines inserted via this routine go through
2637 # any each_line_handler()
2638
2639 my $self = shift;
2640
2641 # Each inserted line is an array, with the first element being 0 to
2642 # indicate that this line hasn't been adjusted, and needs to be
2643 # processed.
f998e60c 2644 no overloading;
051df77b 2645 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2646 return;
2647 }
2648
2649 sub insert_adjusted_lines {
2650 # Lines can be inserted so that it looks like they were in the input
2651 # file at the place it was when this routine is called. See also
2652 # insert_lines(). Lines inserted via this routine are already fully
2653 # adjusted, ready to be processed; each_line_handler()s handlers will
2654 # not be called. This means this is not a completely general
2655 # facility, as only the last each_line_handler on the stack should
2656 # call this. It could be made more general, by passing to each of the
2657 # line_handlers their position on the stack, which they would pass on
2658 # to this routine, and that would replace the boolean first element in
2659 # the anonymous array pushed here, so that the next_line routine could
2660 # use that to call only those handlers whose index is after it on the
2661 # stack. But this is overkill for what is needed now.
2662
2663 my $self = shift;
2664 trace $_[0] if main::DEBUG && $to_trace;
2665
2666 # Each inserted line is an array, with the first element being 1 to
2667 # indicate that this line has been adjusted
f998e60c 2668 no overloading;
051df77b 2669 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2670 return;
2671 }
2672
2673 sub get_missings {
2674 # Returns the stored up @missings lines' values, and clears the list.
2675 # The values are in an array, consisting of the default in the first
2676 # element, and the property in the 2nd. However, since these lines
2677 # can be stacked up, the return is an array of all these arrays.
2678
2679 my $self = shift;
2680 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2681
ffe43484 2682 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2683
2684 # If not accepting a list return, just return the first one.
2685 return shift @{$missings{$addr}} unless wantarray;
2686
2687 my @return = @{$missings{$addr}};
2688 undef @{$missings{$addr}};
2689 return @return;
2690 }
2691
2692 sub _insert_property_into_line {
2693 # Add a property field to $_, if this file requires it.
2694
f998e60c 2695 my $self = shift;
ffe43484 2696 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2697 my $property = $property{$addr};
99870f4d
KW
2698 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2699
2700 $_ =~ s/(;|$)/; $property$1/;
2701 return;
2702 }
2703
2704 sub carp_bad_line {
2705 # Output consistent error messages, using either a generic one, or the
2706 # one given by the optional parameter. To avoid gazillions of the
2707 # same message in case the syntax of a file is way off, this routine
2708 # only outputs the first instance of each message, incrementing a
2709 # count so the totals can be output at the end of the file.
2710
2711 my $self = shift;
2712 my $message = shift;
2713 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2714
ffe43484 2715 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2716
2717 $message = 'Unexpected line' unless $message;
2718
2719 # No trailing punctuation so as to fit with our addenda.
2720 $message =~ s/[.:;,]$//;
2721
2722 # If haven't seen this exact message before, output it now. Otherwise
2723 # increment the count of how many times it has occurred
2724 unless ($errors{$addr}->{$message}) {
2725 Carp::my_carp("$message in '$_' in "
f998e60c 2726 . $file{$addr}
99870f4d
KW
2727 . " at line $.. Skipping this line;");
2728 $errors{$addr}->{$message} = 1;
2729 }
2730 else {
2731 $errors{$addr}->{$message}++;
2732 }
2733
2734 # Clear the line to prevent any further (meaningful) processing of it.
2735 $_ = "";
2736
2737 return;
2738 }
2739} # End closure
2740
2741package Multi_Default;
2742
2743# Certain properties in early versions of Unicode had more than one possible
2744# default for code points missing from the files. In these cases, one
2745# default applies to everything left over after all the others are applied,
2746# and for each of the others, there is a description of which class of code
2747# points applies to it. This object helps implement this by storing the
2748# defaults, and for all but that final default, an eval string that generates
2749# the class that it applies to.
2750
2751
2752{ # Closure
2753
2754 main::setup_package();
2755
2756 my %class_defaults;
2757 # The defaults structure for the classes
2758 main::set_access('class_defaults', \%class_defaults);
2759
2760 my %other_default;
2761 # The default that applies to everything left over.
2762 main::set_access('other_default', \%other_default, 'r');
2763
2764
2765 sub new {
2766 # The constructor is called with default => eval pairs, terminated by
2767 # the left-over default. e.g.
2768 # Multi_Default->new(
2769 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2770 # - 0x200D',
2771 # 'R' => 'some other expression that evaluates to code points',
2772 # .
2773 # .
2774 # .
2775 # 'U'));
2776
2777 my $class = shift;
2778
2779 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2780 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2781
2782 while (@_ > 1) {
2783 my $default = shift;
2784 my $eval = shift;
2785 $class_defaults{$addr}->{$default} = $eval;
2786 }
2787
2788 $other_default{$addr} = shift;
2789
2790 return $self;
2791 }
2792
2793 sub get_next_defaults {
2794 # Iterates and returns the next class of defaults.
2795 my $self = shift;
2796 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2797
ffe43484 2798 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2799
2800 return each %{$class_defaults{$addr}};
2801 }
2802}
2803
2804package Alias;
2805
2806# An alias is one of the names that a table goes by. This class defines them
2807# including some attributes. Everything is currently setup in the
2808# constructor.
2809
2810
2811{ # Closure
2812
2813 main::setup_package();
2814
2815 my %name;
2816 main::set_access('name', \%name, 'r');
2817
2818 my %loose_match;
c12f2655 2819 # Should this name match loosely or not.
99870f4d
KW
2820 main::set_access('loose_match', \%loose_match, 'r');
2821
33e96e72
KW
2822 my %make_re_pod_entry;
2823 # Some aliases should not get their own entries in the re section of the
2824 # pod, because they are covered by a wild-card, and some we want to
2825 # discourage use of. Binary
f82fe4ba 2826 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
99870f4d 2827
fd1e3e84
KW
2828 my %ucd;
2829 # Is this documented to be accessible via Unicode::UCD
2830 main::set_access('ucd', \%ucd, 'r', 's');
2831
99870f4d
KW
2832 my %status;
2833 # Aliases have a status, like deprecated, or even suppressed (which means
2834 # they don't appear in documentation). Enum
2835 main::set_access('status', \%status, 'r');
2836
0eac1e20 2837 my %ok_as_filename;
99870f4d
KW
2838 # Similarly, some aliases should not be considered as usable ones for
2839 # external use, such as file names, or we don't want documentation to
2840 # recommend them. Boolean
0eac1e20 2841 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
99870f4d
KW
2842
2843 sub new {
2844 my $class = shift;
2845
2846 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2847 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2848
2849 $name{$addr} = shift;
2850 $loose_match{$addr} = shift;
33e96e72 2851 $make_re_pod_entry{$addr} = shift;
0eac1e20 2852 $ok_as_filename{$addr} = shift;
99870f4d 2853 $status{$addr} = shift;
fd1e3e84 2854 $ucd{$addr} = shift;
99870f4d
KW
2855
2856 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2857
2858 # Null names are never ok externally
0eac1e20 2859 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
99870f4d
KW
2860
2861 return $self;
2862 }
2863}
2864
2865package Range;
2866
2867# A range is the basic unit for storing code points, and is described in the
2868# comments at the beginning of the program. Each range has a starting code
2869# point; an ending code point (not less than the starting one); a value
2870# that applies to every code point in between the two end-points, inclusive;
2871# and an enum type that applies to the value. The type is for the user's
2872# convenience, and has no meaning here, except that a non-zero type is
2873# considered to not obey the normal Unicode rules for having standard forms.
2874#
2875# The same structure is used for both map and match tables, even though in the
2876# latter, the value (and hence type) is irrelevant and could be used as a
2877# comment. In map tables, the value is what all the code points in the range
2878# map to. Type 0 values have the standardized version of the value stored as
2879# well, so as to not have to recalculate it a lot.
2880
2881sub trace { return main::trace(@_); }
2882
2883{ # Closure
2884
2885 main::setup_package();
2886
2887 my %start;
2888 main::set_access('start', \%start, 'r', 's');
2889
2890 my %end;
2891 main::set_access('end', \%end, 'r', 's');
2892
2893 my %value;
2894 main::set_access('value', \%value, 'r');
2895
2896 my %type;
2897 main::set_access('type', \%type, 'r');
2898
2899 my %standard_form;
2900 # The value in internal standard form. Defined only if the type is 0.
2901 main::set_access('standard_form', \%standard_form);
2902
2903 # Note that if these fields change, the dump() method should as well
2904
2905 sub new {
2906 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2907 my $class = shift;
2908
2909 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2910 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2911
2912 $start{$addr} = shift;
2913 $end{$addr} = shift;
2914
2915 my %args = @_;
2916
2917 my $value = delete $args{'Value'}; # Can be 0
2918 $value = "" unless defined $value;
2919 $value{$addr} = $value;
2920
2921 $type{$addr} = delete $args{'Type'} || 0;
2922
2923 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2924
99870f4d
KW
2925 return $self;
2926 }
2927
2928 use overload
2929 fallback => 0,
2930 qw("") => "_operator_stringify",
2931 "." => \&main::_operator_dot,
1285127e 2932 ".=" => \&main::_operator_dot_equal,
99870f4d
KW
2933 ;
2934
2935 sub _operator_stringify {
2936 my $self = shift;
ffe43484 2937 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2938
2939 # Output it like '0041..0065 (value)'
2940 my $return = sprintf("%04X", $start{$addr})
2941 . '..'
2942 . sprintf("%04X", $end{$addr});
2943 my $value = $value{$addr};
2944 my $type = $type{$addr};
2945 $return .= ' (';
2946 $return .= "$value";
2947 $return .= ", Type=$type" if $type != 0;
2948 $return .= ')';
2949
2950 return $return;
2951 }
2952
2953 sub standard_form {
c292d35a
NC
2954 # Calculate the standard form only if needed, and cache the result.
2955 # The standard form is the value itself if the type is special.
2956 # This represents a considerable CPU and memory saving - at the time
2957 # of writing there are 368676 non-special objects, but the standard
2958 # form is only requested for 22047 of them - ie about 6%.
99870f4d
KW
2959
2960 my $self = shift;
2961 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2962
ffe43484 2963 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2964
2965 return $standard_form{$addr} if defined $standard_form{$addr};
c292d35a
NC
2966
2967 my $value = $value{$addr};
2968 return $value if $type{$addr};
2969 return $standard_form{$addr} = main::standardize($value);
99870f4d
KW
2970 }
2971
2972 sub dump {
2973 # Human, not machine readable. For machine readable, comment out this
2974 # entire routine and let the standard one take effect.
2975 my $self = shift;
2976 my $indent = shift;
2977 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2978
ffe43484 2979 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2980
2981 my $return = $indent
2982 . sprintf("%04X", $start{$addr})
2983 . '..'
2984 . sprintf("%04X", $end{$addr})
2985 . " '$value{$addr}';";
2986 if (! defined $standard_form{$addr}) {
2987 $return .= "(type=$type{$addr})";
2988 }
2989 elsif ($standard_form{$addr} ne $value{$addr}) {
2990 $return .= "(standard '$standard_form{$addr}')";
2991 }
2992 return $return;
2993 }
2994} # End closure
2995
2996package _Range_List_Base;
2997
2998# Base class for range lists. A range list is simply an ordered list of
2999# ranges, so that the ranges with the lowest starting numbers are first in it.
3000#
3001# When a new range is added that is adjacent to an existing range that has the
3002# same value and type, it merges with it to form a larger range.
3003#
3004# Ranges generally do not overlap, except that there can be multiple entries
3005# of single code point ranges. This is because of NameAliases.txt.
3006#
3007# In this program, there is a standard value such that if two different
3008# values, have the same standard value, they are considered equivalent. This
3009# value was chosen so that it gives correct results on Unicode data
3010
3011# There are a number of methods to manipulate range lists, and some operators
3012# are overloaded to handle them.
3013
99870f4d
KW
3014sub trace { return main::trace(@_); }
3015
3016{ # Closure
3017
3018 our $addr;
3019
5b348b71
KW
3020 # Max is initialized to a negative value that isn't adjacent to 0, for
3021 # simpler tests
3022 my $max_init = -2;
3023
99870f4d
KW
3024 main::setup_package();
3025
3026 my %ranges;
3027 # The list of ranges
3028 main::set_access('ranges', \%ranges, 'readable_array');
3029
3030 my %max;
3031 # The highest code point in the list. This was originally a method, but
3032 # actual measurements said it was used a lot.
3033 main::set_access('max', \%max, 'r');
3034
3035 my %each_range_iterator;
3036 # Iterator position for each_range()
3037 main::set_access('each_range_iterator', \%each_range_iterator);
3038
3039 my %owner_name_of;
3040 # Name of parent this is attached to, if any. Solely for better error
3041 # messages.
3042 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3043
3044 my %_search_ranges_cache;
3045 # A cache of the previous result from _search_ranges(), for better
3046 # performance
3047 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3048
3049 sub new {
3050 my $class = shift;
3051 my %args = @_;
3052
3053 # Optional initialization data for the range list.
3054 my $initialize = delete $args{'Initialize'};
3055
3056 my $self;
3057
3058 # Use _union() to initialize. _union() returns an object of this
3059 # class, which means that it will call this constructor recursively.
3060 # But it won't have this $initialize parameter so that it won't
3061 # infinitely loop on this.
3062 return _union($class, $initialize, %args) if defined $initialize;
3063
3064 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 3065 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3066
3067 # Optional parent object, only for debug info.
3068 $owner_name_of{$addr} = delete $args{'Owner'};
3069 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3070
3071 # Stringify, in case it is an object.
3072 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3073
3074 # This is used only for error messages, and so a colon is added
3075 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3076
3077 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3078
5b348b71 3079 $max{$addr} = $max_init;
99870f4d
KW
3080
3081 $_search_ranges_cache{$addr} = 0;
3082 $ranges{$addr} = [];
3083
3084 return $self;
3085 }
3086
3087 use overload
3088 fallback => 0,
3089 qw("") => "_operator_stringify",
3090 "." => \&main::_operator_dot,
1285127e 3091 ".=" => \&main::_operator_dot_equal,
99870f4d
KW
3092 ;
3093
3094 sub _operator_stringify {
3095 my $self = shift;
ffe43484 3096 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3097
3098 return "Range_List attached to '$owner_name_of{$addr}'"
3099 if $owner_name_of{$addr};
3100 return "anonymous Range_List " . \$self;
3101 }
3102
3103 sub _union {
3104 # Returns the union of the input code points. It can be called as
3105 # either a constructor or a method. If called as a method, the result
3106 # will be a new() instance of the calling object, containing the union
3107 # of that object with the other parameter's code points; if called as
d59563d0 3108 # a constructor, the first parameter gives the class that the new object
99870f4d
KW
3109 # should be, and the second parameter gives the code points to go into
3110 # it.
3111 # In either case, there are two parameters looked at by this routine;
3112 # any additional parameters are passed to the new() constructor.
3113 #
3114 # The code points can come in the form of some object that contains
3115 # ranges, and has a conventionally named method to access them; or
3116 # they can be an array of individual code points (as integers); or
3117 # just a single code point.
3118 #
3119 # If they are ranges, this routine doesn't make any effort to preserve
3198cc57
KW
3120 # the range values and types of one input over the other. Therefore
3121 # this base class should not allow _union to be called from other than
99870f4d
KW
3122 # initialization code, so as to prevent two tables from being added
3123 # together where the range values matter. The general form of this
3124 # routine therefore belongs in a derived class, but it was moved here
3125 # to avoid duplication of code. The failure to overload this in this
3126 # class keeps it safe.
3198cc57
KW
3127 #
3128 # It does make the effort during initialization to accept tables with
3129 # multiple values for the same code point, and to preserve the order
3130 # of these. If there is only one input range or range set, it doesn't
3131 # sort (as it should already be sorted to the desired order), and will
3132 # accept multiple values per code point. Otherwise it will merge
3133 # multiple values into a single one.
99870f4d
KW
3134
3135 my $self;
3136 my @args; # Arguments to pass to the constructor
3137
3138 my $class = shift;
3139
3140 # If a method call, will start the union with the object itself, and
3141 # the class of the new object will be the same as self.
3142 if (ref $class) {
3143 $self = $class;
3144 $class = ref $self;
3145 push @args, $self;
3146 }
3147
3148 # Add the other required parameter.
3149 push @args, shift;
3150 # Rest of parameters are passed on to the constructor
3151
3152 # Accumulate all records from both lists.
3153 my @records;
3198cc57 3154 my $input_count = 0;
99870f4d
KW
3155 for my $arg (@args) {
3156 #local $to_trace = 0 if main::DEBUG;
3157 trace "argument = $arg" if main::DEBUG && $to_trace;
3158 if (! defined $arg) {
3159 my $message = "";
3160 if (defined $self) {
f998e60c 3161 no overloading;
051df77b 3162 $message .= $owner_name_of{pack 'J', $self};
99870f4d 3163 }
ada6088e 3164 Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
99870f4d
KW
3165 return;
3166 }
3198cc57 3167
99870f4d
KW
3168 $arg = [ $arg ] if ! ref $arg;
3169 my $type = ref $arg;
3170 if ($type eq 'ARRAY') {
3171 foreach my $element (@$arg) {
3172 push @records, Range->new($element, $element);
3198cc57 3173 $input_count++;
99870f4d
KW
3174 }
3175 }
3176 elsif ($arg->isa('Range')) {
3177 push @records, $arg;
3198cc57 3178 $input_count++;
99870f4d
KW
3179 }
3180 elsif ($arg->can('ranges')) {
3181 push @records, $arg->ranges;
3198cc57 3182 $input_count++;
99870f4d
KW
3183 }
3184 else {
3185 my $message = "";
3186 if (defined $self) {
f998e60c 3187 no overloading;
051df77b 3188 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
3189 }
3190 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3191 return;
3192 }
3193 }
3194
3195 # Sort with the range containing the lowest ordinal first, but if
3196 # two ranges start at the same code point, sort with the bigger range
3197 # of the two first, because it takes fewer cycles.
3198cc57
KW
3198 if ($input_count > 1) {
3199 @records = sort { ($a->start <=> $b->start)
99870f4d
KW
3200 or
3201 # if b is shorter than a, b->end will be
3202 # less than a->end, and we want to select
3203 # a, so want to return -1
3204 ($b->end <=> $a->end)
3205 } @records;
3198cc57 3206 }
99870f4d
KW
3207
3208 my $new = $class->new(@_);
3209
3210 # Fold in records so long as they add new information.
3211 for my $set (@records) {
3212 my $start = $set->start;
3213 my $end = $set->end;
d59563d0 3214 my $value = $set->value;
3198cc57 3215 my $type = $set->type;
99870f4d 3216 if ($start > $new->max) {
3198cc57 3217 $new->_add_delete('+', $start, $end, $value, Type => $type);
99870f4d
KW
3218 }
3219 elsif ($end > $new->max) {
3198cc57
KW
3220 $new->_add_delete('+', $new->max +1, $end, $value,
3221 Type => $type);
3222 }
3223 elsif ($input_count == 1) {
3224 # Here, overlaps existing range, but is from a single input,
3225 # so preserve the multiple values from that input.
3226 $new->_add_delete('+', $start, $end, $value, Type => $type,
3227 Replace => $MULTIPLE_AFTER);
99870f4d
KW
3228 }
3229 }
3230
3231 return $new;
3232 }
3233
3234 sub range_count { # Return the number of ranges in the range list
3235 my $self = shift;
3236 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3237
f998e60c 3238 no overloading;
051df77b 3239 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
3240 }
3241
3242 sub min {
3243 # Returns the minimum code point currently in the range list, or if
3244 # the range list is empty, 2 beyond the max possible. This is a
3245 # method because used so rarely, that not worth saving between calls,
3246 # and having to worry about changing it as ranges are added and
3247 # deleted.
3248
3249 my $self = shift;
3250 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3251
ffe43484 3252 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3253
3254 # If the range list is empty, return a large value that isn't adjacent
3255 # to any that could be in the range list, for simpler tests
6189eadc 3256 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
99870f4d
KW
3257 return $ranges{$addr}->[0]->start;
3258 }
3259
3260 sub contains {
3261 # Boolean: Is argument in the range list? If so returns $i such that:
3262 # range[$i]->end < $codepoint <= range[$i+1]->end
3263 # which is one beyond what you want; this is so that the 0th range
3264 # doesn't return false
3265 my $self = shift;
3266 my $codepoint = shift;
3267 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3268
99870f4d
KW
3269 my $i = $self->_search_ranges($codepoint);
3270 return 0 unless defined $i;
3271
3272 # The search returns $i, such that
3273 # range[$i-1]->end < $codepoint <= range[$i]->end
3274 # So is in the table if and only iff it is at least the start position
3275 # of range $i.
f998e60c 3276 no overloading;
051df77b 3277 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
3278 return $i + 1;
3279 }
3280
2f7a8815
KW
3281 sub containing_range {
3282 # Returns the range object that contains the code point, undef if none
3283
3284 my $self = shift;
3285 my $codepoint = shift;
3286 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3287
3288 my $i = $self->contains($codepoint);
3289 return unless $i;
3290
3291 # contains() returns 1 beyond where we should look
3292 no overloading;
3293 return $ranges{pack 'J', $self}->[$i-1];
3294 }
3295
99870f4d
KW
3296 sub value_of {
3297 # Returns the value associated with the code point, undef if none
3298
3299 my $self = shift;
3300 my $codepoint = shift;
3301 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3302
d69c231b
KW
3303 my $range = $self->containing_range($codepoint);
3304 return unless defined $range;
99870f4d 3305
d69c231b 3306 return $range->value;
99870f4d
KW
3307 }
3308
0a9dbafc
KW
3309 sub type_of {
3310 # Returns the type of the range containing the code point, undef if
3311 # the code point is not in the table
3312
3313 my $self = shift;
3314 my $codepoint = shift;
3315 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3316
3317 my $range = $self->containing_range($codepoint);
3318 return unless defined $range;
3319
3320 return $range->type;
3321 }
3322
99870f4d
KW
3323 sub _search_ranges {
3324 # Find the range in the list which contains a code point, or where it
3325 # should go if were to add it. That is, it returns $i, such that:
3326 # range[$i-1]->end < $codepoint <= range[$i]->end
3327 # Returns undef if no such $i is possible (e.g. at end of table), or
3328 # if there is an error.
3329
3330 my $self = shift;
3331 my $code_point = shift;
3332 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3333
ffe43484 3334 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3335
3336 return if $code_point > $max{$addr};
3337 my $r = $ranges{$addr}; # The current list of ranges
3338 my $range_list_size = scalar @$r;
3339 my $i;
3340
3341 use integer; # want integer division
3342
3343 # Use the cached result as the starting guess for this one, because,
3344 # an experiment on 5.1 showed that 90% of the time the cache was the
3345 # same as the result on the next call (and 7% it was one less).
3346 $i = $_search_ranges_cache{$addr};
3347 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3348 # from an intervening deletion
3349 #local $to_trace = 1 if main::DEBUG;
3350 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3351 return $i if $code_point <= $r->[$i]->end
3352 && ($i == 0 || $r->[$i-1]->end < $code_point);
3353
3354 # Here the cache doesn't yield the correct $i. Try adding 1.
3355 if ($i < $range_list_size - 1
3356 && $r->[$i]->end < $code_point &&
3357 $code_point <= $r->[$i+1]->end)
3358 {
3359 $i++;
3360 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3361 $_search_ranges_cache{$addr} = $i;
3362 return $i;
3363 }
3364
3365 # Here, adding 1 also didn't work. We do a binary search to
3366 # find the correct position, starting with current $i
3367 my $lower = 0;
3368 my $upper = $range_list_size - 1;
3369 while (1) {
3370 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3371
3372 if ($code_point <= $r->[$i]->end) {
3373
3374 # Here we have met the upper constraint. We can quit if we
3375 # also meet the lower one.
3376 last if $i == 0 || $r->[$i-1]->end < $code_point;
3377
3378 $upper = $i; # Still too high.
3379
3380 }
3381 else {
3382
3383 # Here, $r[$i]->end < $code_point, so look higher up.
3384 $lower = $i;
3385 }
3386
3387 # Split search domain in half to try again.
3388 my $temp = ($upper + $lower) / 2;
3389
3390 # No point in continuing unless $i changes for next time
3391 # in the loop.
3392 if ($temp == $i) {
3393
3394 # We can't reach the highest element because of the averaging.
3395 # So if one below the upper edge, force it there and try one
3396 # more time.
3397 if ($i == $range_list_size - 2) {
3398
3399 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3400 $i = $range_list_size - 1;
3401
3402 # Change $lower as well so if fails next time through,
3403 # taking the average will yield the same $i, and we will
3404 # quit with the error message just below.
3405 $lower = $i;
3406 next;
3407 }
3408 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3409 return;
3410 }
3411 $i = $temp;
3412 } # End of while loop
3413
3414 if (main::DEBUG && $to_trace) {
3415 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3416 trace "i= [ $i ]", $r->[$i];
3417 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3418 }
3419
3420 # Here we have found the offset. Cache it as a starting point for the
3421 # next call.
3422 $_search_ranges_cache{$addr} = $i;
3423 return $i;
3424 }
3425
3426 sub _add_delete {
3427 # Add, replace or delete ranges to or from a list. The $type
3428 # parameter gives which:
3429 # '+' => insert or replace a range, returning a list of any changed
3430 # ranges.
3431 # '-' => delete a range, returning a list of any deleted ranges.
3432 #
3433 # The next three parameters give respectively the start, end, and
3434 # value associated with the range. 'value' should be null unless the
3435 # operation is '+';
3436 #
3437 # The range list is kept sorted so that the range with the lowest
3438 # starting position is first in the list, and generally, adjacent
c1739a4a 3439 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3440 # exceptions below).
3441 #
c1739a4a 3442 # There are more parameters; all are key => value pairs:
99870f4d
KW
3443 # Type gives the type of the value. It is only valid for '+'.
3444 # All ranges have types; if this parameter is omitted, 0 is
3445 # assumed. Ranges with type 0 are assumed to obey the
3446 # Unicode rules for casing, etc; ranges with other types are
3447 # not. Otherwise, the type is arbitrary, for the caller's
3448 # convenience, and looked at only by this routine to keep
3449 # adjacent ranges of different types from being merged into
3450 # a single larger range, and when Replace =>
3451 # $IF_NOT_EQUIVALENT is specified (see just below).
3452 # Replace determines what to do if the range list already contains
3453 # ranges which coincide with all or portions of the input
3454 # range. It is only valid for '+':
3455 # => $NO means that the new value is not to replace
3456 # any existing ones, but any empty gaps of the
3457 # range list coinciding with the input range
3458 # will be filled in with the new value.
3459 # => $UNCONDITIONALLY means to replace the existing values with
3460 # this one unconditionally. However, if the
3461 # new and old values are identical, the
3462 # replacement is skipped to save cycles
3463 # => $IF_NOT_EQUIVALENT means to replace the existing values
d59563d0 3464 # (the default) with this one if they are not equivalent.
99870f4d 3465 # Ranges are equivalent if their types are the
c1739a4a 3466 # same, and they are the same string; or if
99870f4d
KW
3467 # both are type 0 ranges, if their Unicode
3468 # standard forms are identical. In this last
3469 # case, the routine chooses the more "modern"
3470 # one to use. This is because some of the
3471 # older files are formatted with values that
3472 # are, for example, ALL CAPs, whereas the
3473 # derived files have a more modern style,
3474 # which looks better. By looking for this
3475 # style when the pre-existing and replacement
3476 # standard forms are the same, we can move to
3477 # the modern style
9470941f 3478 # => $MULTIPLE_BEFORE means that if this range duplicates an
99870f4d
KW
3479 # existing one, but has a different value,
3480 # don't replace the existing one, but insert
3481 # this, one so that the same range can occur
53d84487
KW
3482 # multiple times. They are stored LIFO, so
3483 # that the final one inserted is the first one
3484 # returned in an ordered search of the table.
6901521e
KW
3485 # If this is an exact duplicate, including the
3486 # value, the original will be moved to be
3487 # first, before any other duplicate ranges
3488 # with different values.
7f4b1e25
KW
3489 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3490 # FIFO, so that this one is inserted after all
6901521e
KW
3491 # others that currently exist. If this is an
3492 # exact duplicate, including value, of an
3493 # existing range, this one is discarded
3494 # (leaving the existing one in its original,
3495 # higher priority position
99870f4d
KW
3496 # => anything else is the same as => $IF_NOT_EQUIVALENT
3497 #
c1739a4a
KW
3498 # "same value" means identical for non-type-0 ranges, and it means
3499 # having the same standard forms for type-0 ranges.
99870f4d
KW
3500
3501 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3502
3503 my $self = shift;
3504 my $operation = shift; # '+' for add/replace; '-' for delete;
3505 my $start = shift;
3506 my $end = shift;
3507 my $value = shift;
3508
3509 my %args = @_;
3510
3511 $value = "" if not defined $value; # warning: $value can be "0"
3512
3513 my $replace = delete $args{'Replace'};
3514 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3515
3516 my $type = delete $args{'Type'};
3517 $type = 0 unless defined $type;
3518
3519 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3520
ffe43484 3521 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3522
3523 if ($operation ne '+' && $operation ne '-') {
3524 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3525 return;
3526 }
3527 unless (defined $start && defined $end) {
3528 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3529 return;
3530 }
3531 unless ($end >= $start) {
3532 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3533 return;
3534 }
556ca434
KW
3535 if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
3536 Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
3537 }
99870f4d
KW
3538 #local $to_trace = 1 if main::DEBUG;
3539
3540 if ($operation eq '-') {
3541 if ($replace != $IF_NOT_EQUIVALENT) {
3542 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3543 $replace = $IF_NOT_EQUIVALENT;
3544 }
3545 if ($type) {
3546 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3547 $type = 0;
3548 }
3549 if ($value ne "") {
3550 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3551 $value = "";
3552 }
3553 }
3554
3555 my $r = $ranges{$addr}; # The current list of ranges
3556 my $range_list_size = scalar @$r; # And its size
3557 my $max = $max{$addr}; # The current high code point in
3558 # the list of ranges
3559
3560 # Do a special case requiring fewer machine cycles when the new range
3561 # starts after the current highest point. The Unicode input data is
3562 # structured so this is common.
3563 if ($start > $max) {
3564
52d4d76a 3565 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
99870f4d
KW
3566 return if $operation eq '-'; # Deleting a non-existing range is a
3567 # no-op
3568
3569 # If the new range doesn't logically extend the current final one
3570 # in the range list, create a new range at the end of the range
3571 # list. (max cleverly is initialized to a negative number not
3572 # adjacent to 0 if the range list is empty, so even adding a range
3573 # to an empty range list starting at 0 will have this 'if'
3574 # succeed.)
3575 if ($start > $max + 1 # non-adjacent means can't extend.
3576 || @{$r}[-1]->value ne $value # values differ, can't extend.
3577 || @{$r}[-1]->type != $type # types differ, can't extend.
3578 ) {
3579 push @$r, Range->new($start, $end,
3580 Value => $value,
3581 Type => $type);
3582 }
3583 else {
3584
3585 # Here, the new range starts just after the current highest in
3586 # the range list, and they have the same type and value.
3587 # Extend the current range to incorporate the new one.
3588 @{$r}[-1]->set_end($end);
3589 }
3590
3591 # This becomes the new maximum.
3592 $max{$addr} = $end;
3593
3594 return;
3595 }
3596 #local $to_trace = 0 if main::DEBUG;
3597
3598 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3599
3600 # Here, the input range isn't after the whole rest of the range list.
3601 # Most likely 'splice' will be needed. The rest of the routine finds
3602 # the needed splice parameters, and if necessary, does the splice.
3603 # First, find the offset parameter needed by the splice function for
3604 # the input range. Note that the input range may span multiple
3605 # existing ones, but we'll worry about that later. For now, just find
3606 # the beginning. If the input range is to be inserted starting in a
3607 # position not currently in the range list, it must (obviously) come
3608 # just after the range below it, and just before the range above it.
3609 # Slightly less obviously, it will occupy the position currently
3610 # occupied by the range that is to come after it. More formally, we
3611 # are looking for the position, $i, in the array of ranges, such that:
3612 #
3613 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3614 #
3615 # (The ordered relationships within existing ranges are also shown in
3616 # the equation above). However, if the start of the input range is
3617 # within an existing range, the splice offset should point to that
3618 # existing range's position in the list; that is $i satisfies a
3619 # somewhat different equation, namely:
3620 #
3621 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3622 #
3623 # More briefly, $start can come before or after r[$i]->start, and at
3624 # this point, we don't know which it will be. However, these
3625 # two equations share these constraints:
3626 #
3627 # r[$i-1]->end < $start <= r[$i]->end
3628 #
3629 # And that is good enough to find $i.
3630
3631 my $i = $self->_search_ranges($start);
3632 if (! defined $i) {
3633 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3634 return;
3635 }
3636
3637 # The search function returns $i such that:
3638 #
3639 # r[$i-1]->end < $start <= r[$i]->end
3640 #
3641 # That means that $i points to the first range in the range list
3642 # that could possibly be affected by this operation. We still don't
3643 # know if the start of the input range is within r[$i], or if it
3644 # points to empty space between r[$i-1] and r[$i].
3645 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3646
3647 # Special case the insertion of data that is not to replace any
3648 # existing data.
3649 if ($replace == $NO) { # If $NO, has to be operation '+'
3650 #local $to_trace = 1 if main::DEBUG;
3651 trace "Doesn't replace" if main::DEBUG && $to_trace;
3652
3653 # Here, the new range is to take effect only on those code points
3654 # that aren't already in an existing range. This can be done by
3655 # looking through the existing range list and finding the gaps in
3656 # the ranges that this new range affects, and then calling this
3657 # function recursively on each of those gaps, leaving untouched
3658 # anything already in the list. Gather up a list of the changed
3659 # gaps first so that changes to the internal state as new ranges
3660 # are added won't be a problem.
3661 my @gap_list;
3662
3663 # First, if the starting point of the input range is outside an
3664 # existing one, there is a gap from there to the beginning of the
3665 # existing range -- add a span to fill the part that this new
3666 # range occupies
3667 if ($start < $r->[$i]->start) {
3668 push @gap_list, Range->new($start,
3669 main::min($end,
3670 $r->[$i]->start - 1),
3671 Type => $type);
3672 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3673 }
3674
3675 # Then look through the range list for other gaps until we reach
3676 # the highest range affected by the input one.
3677 my $j;
3678 for ($j = $i+1; $j < $range_list_size; $j++) {
3679 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3680 last if $end < $r->[$j]->start;
3681
3682 # If there is a gap between when this range starts and the
3683 # previous one ends, add a span to fill it. Note that just
3684 # because there are two ranges doesn't mean there is a
3685 # non-zero gap between them. It could be that they have
3686 # different values or types
3687 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3688 push @gap_list,
3689 Range->new($r->[$j-1]->end + 1,
3690 $r->[$j]->start - 1,
3691 Type => $type);
3692 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3693 }
3694 }
3695
3696 # Here, we have either found an existing range in the range list,
3697 # beyond the area affected by the input one, or we fell off the
3698 # end of the loop because the input range affects the whole rest
3699 # of the range list. In either case, $j is 1 higher than the
3700 # highest affected range. If $j == $i, it means that there are no
3701 # affected ranges, that the entire insertion is in the gap between
3702 # r[$i-1], and r[$i], which we already have taken care of before
3703 # the loop.
3704 # On the other hand, if there are affected ranges, it might be
3705 # that there is a gap that needs filling after the final such
3706 # range to the end of the input range
3707 if ($r->[$j-1]->end < $end) {
3708 push @gap_list, Range->new(main::max($start,
3709 $r->[$j-1]->end + 1),
3710 $end,
3711 Type => $type);
3712 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3713 }
3714
3715 # Call recursively to fill in all the gaps.
3716 foreach my $gap (@gap_list) {
3717 $self->_add_delete($operation,
3718 $gap->start,
3719 $gap->end,
3720 $value,
3721 Type => $type);
3722 }
3723
3724 return;
3725 }
3726
53d84487
KW
3727 # Here, we have taken care of the case where $replace is $NO.
3728 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3729 # If inserting a multiple record, this is where it goes, before the
7f4b1e25
KW
3730 # first (if any) existing one if inserting LIFO. (If this is to go
3731 # afterwards, FIFO, we below move the pointer to there.) These imply
3732 # an insertion, and no change to any existing ranges. Note that $i
3733 # can be -1 if this new range doesn't actually duplicate any existing,
3734 # and comes at the beginning of the list.
3735 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
53d84487
KW
3736
3737 if ($start != $end) {
3738 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3739 return;
3740 }
3741
19155fcc 3742 # If the new code point is within a current range ...
53d84487 3743 if ($end >= $r->[$i]->start) {
19155fcc
KW
3744
3745 # Don't add an exact duplicate, as it isn't really a multiple
1f6798c4
KW
3746 my $existing_value = $r->[$i]->value;
3747 my $existing_type = $r->[$i]->type;
3748 return if $value eq $existing_value && $type eq $existing_type;
3749
3750 # If the multiple value is part of an existing range, we want
3751 # to split up that range, so that only the single code point
3752 # is affected. To do this, we first call ourselves
3753 # recursively to delete that code point from the table, having
3754 # preserved its current data above. Then we call ourselves
3755 # recursively again to add the new multiple, which we know by
3756 # the test just above is different than the current code
3757 # point's value, so it will become a range containing a single
3758 # code point: just itself. Finally, we add back in the
3759 # pre-existing code point, which will again be a single code
3760 # point range. Because 'i' likely will have changed as a
3761 # result of these operations, we can't just continue on, but
7f4b1e25
KW
3762 # do this operation recursively as well. If we are inserting
3763 # LIFO, the pre-existing code point needs to go after the new
3764 # one, so use MULTIPLE_AFTER; and vice versa.
53d84487 3765 if ($r->[$i]->start != $r->[$i]->end) {
1f6798c4
KW
3766 $self->_add_delete('-', $start, $end, "");
3767 $self->_add_delete('+', $start, $end, $value, Type => $type);
7f4b1e25
KW
3768 return $self->_add_delete('+',
3769 $start, $end,
3770 $existing_value,
3771 Type => $existing_type,
3772 Replace => ($replace == $MULTIPLE_BEFORE)
3773 ? $MULTIPLE_AFTER
3774 : $MULTIPLE_BEFORE);
3775 }
3776 }
3777
3778 # If to place this new record after, move to beyond all existing
1722e378 3779 # ones; but don't add this one if identical to any of them, as it
6901521e
KW
3780 # isn't really a multiple. This leaves the original order, so
3781 # that the current request is ignored. The reasoning is that the
3782 # previous request that wanted this record to have high priority
3783 # should have precedence.
7f4b1e25
KW
3784 if ($replace == $MULTIPLE_AFTER) {
3785 while ($i < @$r && $r->[$i]->start == $start) {
1722e378
KW
3786 return if $value eq $r->[$i]->value
3787 && $type eq $r->[$i]->type;
7f4b1e25 3788 $i++;
53d84487 3789 }
53d84487 3790 }
6901521e
KW
3791 else {
3792 # If instead we are to place this new record before any
3793 # existing ones, remove any identical ones that come after it.
3794 # This changes the existing order so that the new one is
3795 # first, as is being requested.
3796 for (my $j = $i + 1;
3797 $j < @$r && $r->[$j]->start == $start;
3798 $j++)
3799 {
3800 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
3801 splice @$r, $j, 1;
3802 last; # There should only be one instance, so no
3803 # need to keep looking
3804 }
3805 }
3806 }
53d84487
KW
3807
3808 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3809 my @return = splice @$r,
3810 $i,
3811 0,
3812 Range->new($start,
3813 $end,
3814 Value => $value,
3815 Type => $type);
3816 if (main::DEBUG && $to_trace) {
3817 trace "After splice:";
3818 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3819 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3820 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3821 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3822 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3823 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3824 }
3825 return @return;
3826 }
3827
7f4b1e25
KW
3828 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
3829 # leaves delete, insert, and replace either unconditionally or if not
53d84487
KW
3830 # equivalent. $i still points to the first potential affected range.
3831 # Now find the highest range affected, which will determine the length
3832 # parameter to splice. (The input range can span multiple existing
3833 # ones.) If this isn't a deletion, while we are looking through the
3834 # range list, see also if this is a replacement rather than a clean
3835 # insertion; that is if it will change the values of at least one
3836 # existing range. Start off assuming it is an insert, until find it
3837 # isn't.
3838 my $clean_insert = $operation eq '+';
99870f4d
KW
3839 my $j; # This will point to the highest affected range
3840
3841 # For non-zero types, the standard form is the value itself;
3842 my $standard_form = ($type) ? $value : main::standardize($value);
3843
3844 for ($j = $i; $j < $range_list_size; $j++) {
3845 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3846
3847 # If find a range that it doesn't overlap into, we can stop
3848 # searching
3849 last if $end < $r->[$j]->start;
3850
969a34cc
KW
3851 # Here, overlaps the range at $j. If the values don't match,
3852 # and so far we think this is a clean insertion, it becomes a
3853 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3854 if ($clean_insert) {
99870f4d 3855 if ($r->[$j]->standard_form ne $standard_form) {
969a34cc 3856 $clean_insert = 0;
56343c78
KW
3857 if ($replace == $CROAK) {
3858 main::croak("The range to add "
3859 . sprintf("%04X", $start)
3860 . '-'
3861 . sprintf("%04X", $end)
3862 . " with value '$value' overlaps an existing range $r->[$j]");
3863 }
99870f4d
KW
3864 }
3865 else {
3866
3867 # Here, the two values are essentially the same. If the
3868 # two are actually identical, replacing wouldn't change
3869 # anything so skip it.
3870 my $pre_existing = $r->[$j]->value;
3871 if ($pre_existing ne $value) {
3872
3873 # Here the new and old standardized values are the
3874 # same, but the non-standardized values aren't. If
3875 # replacing unconditionally, then replace
3876 if( $replace == $UNCONDITIONALLY) {
969a34cc 3877 $clean_insert = 0;
99870f4d
KW
3878 }
3879 else {
3880
3881 # Here, are replacing conditionally. Decide to
3882 # replace or not based on which appears to look
3883 # the "nicest". If one is mixed case and the
3884 # other isn't, choose the mixed case one.
3885 my $new_mixed = $value =~ /[A-Z]/
3886 && $value =~ /[a-z]/;
3887 my $old_mixed = $pre_existing =~ /[A-Z]/
3888 && $pre_existing =~ /[a-z]/;
3889
3890 if ($old_mixed != $new_mixed) {
969a34cc 3891 $clean_insert = 0 if $new_mixed;
99870f4d 3892 if (main::DEBUG && $to_trace) {
969a34cc
KW
3893 if ($clean_insert) {
3894 trace "Retaining $pre_existing over $value";
99870f4d
KW
3895 }
3896 else {
969a34cc 3897 trace "Replacing $pre_existing with $value";
99870f4d
KW
3898 }
3899 }
3900 }
3901 else {
3902
3903 # Here casing wasn't different between the two.
3904 # If one has hyphens or underscores and the
3905 # other doesn't, choose the one with the
3906 # punctuation.
3907 my $new_punct = $value =~ /[-_]/;
3908 my $old_punct = $pre_existing =~ /[-_]/;
3909
3910 if ($old_punct != $new_punct) {
969a34cc 3911 $clean_insert = 0 if $new_punct;
99870f4d 3912 if (main::DEBUG && $to_trace) {
969a34cc
KW
3913 if ($clean_insert) {
3914 trace "Retaining $pre_existing over $value";
99870f4d
KW
3915 }
3916 else {
969a34cc 3917 trace "Replacing $pre_existing with $value";
99870f4d
KW
3918 }
3919 }
3920 } # else existing one is just as "good";
3921 # retain it to save cycles.
3922 }
3923 }
3924 }
3925 }
3926 }
3927 } # End of loop looking for highest affected range.
3928
3929 # Here, $j points to one beyond the highest range that this insertion
3930 # affects (hence to beyond the range list if that range is the final
3931 # one in the range list).
3932
3933 # The splice length is all the affected ranges. Get it before
3934 # subtracting, for efficiency, so we don't have to later add 1.
3935 my $length = $j - $i;
3936
3937 $j--; # $j now points to the highest affected range.
3938 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3939
7f4b1e25 3940 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
99870f4d
KW
3941 # $j points to the highest affected range. But it can be < $i or even
3942 # -1. These happen only if the insertion is entirely in the gap
3943 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3944 # above exited first time through with $end < $r->[$i]->start. (And
3945 # then we subtracted one from j) This implies also that $start <
3946 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3947 # $start, so the entire input range is in the gap.
3948 if ($j < $i) {
3949
3950 # Here the entire input range is in the gap before $i.
3951
3952 if (main::DEBUG && $to_trace) {
3953 if ($i) {
3954 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3955 }
3956 else {
3957 trace "Entire range is before $r->[$i]";
3958 }
3959 }
3960 return if $operation ne '+'; # Deletion of a non-existent range is
3961 # a no-op
3962 }
3963 else {
3964
969a34cc
KW
3965 # Here part of the input range is not in the gap before $i. Thus,
3966 # there is at least one affected one, and $j points to the highest
3967 # such one.
99870f4d
KW
3968
3969 # At this point, here is the situation:
3970 # This is not an insertion of a multiple, nor of tentative ($NO)
3971 # data.
3972 # $i points to the first element in the current range list that
3973 # may be affected by this operation. In fact, we know
3974 # that the range at $i is affected because we are in
3975 # the else branch of this 'if'
3976 # $j points to the highest affected range.
3977 # In other words,
3978 # r[$i-1]->end < $start <= r[$i]->end
3979 # And:
3980 # r[$i-1]->end < $start <= $end <= r[$j]->end
3981 #
3982 # Also:
969a34cc
KW
3983 # $clean_insert is a boolean which is set true if and only if
3984 # this is a "clean insertion", i.e., not a change nor a
3985 # deletion (multiple was handled above).
99870f4d
KW
3986
3987 # We now have enough information to decide if this call is a no-op
969a34cc
KW
3988 # or not. It is a no-op if this is an insertion of already
3989 # existing data.
99870f4d 3990
969a34cc 3991 if (main::DEBUG && $to_trace && $clean_insert
99870f4d
KW
3992 && $i == $j
3993 && $start >= $r->[$i]->start)
3994 {
3995 trace "no-op";
3996 }
969a34cc 3997 return if $clean_insert
99870f4d
KW
3998 && $i == $j # more than one affected range => not no-op
3999
4000 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
4001 # Further, $start and/or $end is >= r[$i]->start
4002 # The test below hence guarantees that
4003 # r[$i]->start < $start <= $end <= r[$i]->end
4004 # This means the input range is contained entirely in
4005 # the one at $i, so is a no-op
4006 && $start >= $r->[$i]->start;
4007 }
4008
4009 # Here, we know that some action will have to be taken. We have
4010 # calculated the offset and length (though adjustments may be needed)
4011 # for the splice. Now start constructing the replacement list.
4012 my @replacement;
4013 my $splice_start = $i;
4014
4015 my $extends_below;
4016 my $extends_above;
4017
4018 # See if should extend any adjacent ranges.
4019 if ($operation eq '-') { # Don't extend deletions
4020 $extends_below = $extends_above = 0;
4021 }
4022 else { # Here, should extend any adjacent ranges. See if there are
4023 # any.
4024 $extends_below = ($i > 0
4025 # can't extend unless adjacent
4026 && $r->[$i-1]->end == $start -1
4027 # can't extend unless are same standard value
4028 && $r->[$i-1]->standard_form eq $standard_form
4029 # can't extend unless share type
4030 && $r->[$i-1]->type == $type);
4031 $extends_above = ($j+1 < $range_list_size
4032 && $r->[$j+1]->start == $end +1
4033 && $r->[$j+1]->standard_form eq $standard_form
23822bda 4034 && $r->[$j+1]->type == $type);
99870f4d
KW
4035 }
4036 if ($extends_below && $extends_above) { # Adds to both
4037 $splice_start--; # start replace at element below
4038 $length += 2; # will replace on both sides
4039 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4040
4041 # The result will fill in any gap, replacing both sides, and
4042 # create one large range.
4043 @replacement = Range->new($r->[$i-1]->start,
4044 $r->[$j+1]->end,
4045 Value => $value,
4046 Type => $type);
4047 }
4048 else {
4049
4050 # Here we know that the result won't just be the conglomeration of
4051 # a new range with both its adjacent neighbors. But it could
4052 # extend one of them.
4053
4054 if ($extends_below) {
4055
4056 # Here the new element adds to the one below, but not to the
4057 # one above. If inserting, and only to that one range, can
4058 # just change its ending to include the new one.
969a34cc 4059 if ($length == 0 && $clean_insert) {
99870f4d
KW
4060 $r->[$i-1]->set_end($end);
4061 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4062 return;
4063 }
4064 else {
4065 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4066 $splice_start--; # start replace at element below
4067 $length++; # will replace the element below
4068 $start = $r->[$i-1]->start;
4069 }
4070 }
4071 elsif ($extends_above) {
4072
4073 # Here the new element adds to the one above, but not below.
4074 # Mirror the code above
969a34cc 4075 if ($length == 0 && $clean_insert) {
99870f4d
KW
4076 $r->[$j+1]->set_start($start);
4077 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4078 return;
4079 }
4080 else {
4081 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4082 $length++; # will replace the element above
4083 $end = $r->[$j+1]->end;
4084 }
4085 }
4086
4087 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4088
4089 # Finally, here we know there will have to be a splice.
4090 # If the change or delete affects only the highest portion of the
4091 # first affected range, the range will have to be split. The
4092 # splice will remove the whole range, but will replace it by a new
4093 # range containing just the unaffected part. So, in this case,
4094 # add to the replacement list just this unaffected portion.
4095 if (! $extends_below
4096 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4097 {
4098 push @replacement,
4099 Range->new($r->[$i]->start,
4100 $start - 1,
4101 Value => $r->[$i]->value,
4102 Type => $r->[$i]->type);
4103 }
4104
4105 # In the case of an insert or change, but not a delete, we have to
4106 # put in the new stuff; this comes next.
4107 if ($operation eq '+') {
4108 push @replacement, Range->new($start,
4109 $end,
4110 Value => $value,
4111 Type => $type);
4112 }
4113
4114 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4115 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4116
4117 # And finally, if we're changing or deleting only a portion of the
4118 # highest affected range, it must be split, as the lowest one was.
4119 if (! $extends_above
4120 && $j >= 0 # Remember that j can be -1 if before first
4121 # current element
4122 && $end >= $r->[$j]->start
4123 && $end < $r->[$j]->end)
4124 {
4125 push @replacement,
4126 Range->new($end + 1,
4127 $r->[$j]->end,
4128 Value => $r->[$j]->value,
4129 Type => $r->[$j]->type);
4130 }
4131 }
4132
4133 # And do the splice, as calculated above
4134 if (main::DEBUG && $to_trace) {
4135 trace "replacing $length element(s) at $i with ";
4136 foreach my $replacement (@replacement) {
4137 trace " $replacement";
4138 }
4139 trace "Before splice:";
4140 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4141 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4142 trace "i =[", $i, "]", $r->[$i];
4143 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4144 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4145 }
4146
4147 my @return = splice @$r, $splice_start, $length, @replacement;
4148
4149 if (main::DEBUG && $to_trace) {
4150 trace "After splice:";
4151 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4152 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4153 trace "i =[", $i, "]", $r->[$i];
4154 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4155 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
e6451557 4156 trace "removed ", @return if @return;
99870f4d
KW
4157 }
4158
4159 # An actual deletion could have changed the maximum in the list.
4160 # There was no deletion if the splice didn't return something, but
4161 # otherwise recalculate it. This is done too rarely to worry about
4162 # performance.
4163 if ($operation eq '-' && @return) {
5b348b71
KW
4164 if (@$r) {
4165 $max{$addr} = $r->[-1]->end;
4166 }
4167 else { # Now empty
4168 $max{$addr} = $max_init;
4169 }
99870f4d
KW
4170 }
4171 return @return;
4172 }
4173
4174 sub reset_each_range { # reset the iterator for each_range();
4175 my $self = shift;
4176 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4177
f998e60c 4178 no overloading;
051df77b 4179 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
4180 return;
4181 }
4182
4183 sub each_range {
4184 # Iterate over each range in a range list. Results are undefined if
4185 # the range list is changed during the iteration.
4186
4187 my $self = shift;
4188 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4189
ffe43484 4190 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4191
4192 return if $self->is_empty;
4193
4194 $each_range_iterator{$addr} = -1
4195 if ! defined $each_range_iterator{$addr};
4196 $each_range_iterator{$addr}++;
4197 return $ranges{$addr}->[$each_range_iterator{$addr}]
4198 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4199 undef $each_range_iterator{$addr};
4200 return;
4201 }
4202
4203 sub count { # Returns count of code points in range list
4204 my $self = shift;
4205 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4206
ffe43484 4207 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4208
4209 my $count = 0;
4210 foreach my $range (@{$ranges{$addr}}) {
4211 $count += $range->end - $range->start + 1;
4212 }
4213 return $count;
4214 }
4215
4216 sub delete_range { # Delete a range
4217 my $self = shift;
4218 my $start = shift;
4219 my $end = shift;
4220
4221 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4222
4223 return $self->_add_delete('-', $start, $end, "");
4224 }
4225
4226 sub is_empty { # Returns boolean as to if a range list is empty
4227 my $self = shift;
4228 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4229
f998e60c 4230 no overloading;
051df77b 4231 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
4232 }
4233
4234 sub hash {
4235 # Quickly returns a scalar suitable for separating tables into
4236 # buckets, i.e. it is a hash function of the contents of a table, so
4237 # there are relatively few conflicts.
4238
4239 my $self = shift;
4240 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4241
ffe43484 4242 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4243
4244 # These are quickly computable. Return looks like 'min..max;count'
4245 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4246 }
4247} # End closure for _Range_List_Base
4248
4249package Range_List;
4250use base '_Range_List_Base';
4251
4252# A Range_List is a range list for match tables; i.e. the range values are
4253# not significant. Thus a number of operations can be safely added to it,
4254# such as inversion, intersection. Note that union is also an unsafe
4255# operation when range values are cared about, and that method is in the base
4256# class, not here. But things are set up so that that method is callable only
4257# during initialization. Only in this derived class, is there an operation
4258# that combines two tables. A Range_Map can thus be used to initialize a
4259# Range_List, and its mappings will be in the list, but are not significant to
4260# this class.
4261
4262sub trace { return main::trace(@_); }
4263
4264{ # Closure
4265
4266 use overload
4267 fallback => 0,
4268 '+' => sub { my $self = shift;
4269 my $other = shift;
4270
4271 return $self->_union($other)
4272 },
e76a8d86
KW
4273 '+=' => sub { my $self = shift;
4274 my $other = shift;
4275 my $reversed = shift;
4276
4277 if ($reversed) {
4278 Carp::my_carp_bug("Bad news. Can't cope with '"
4279 . ref($other)
4280 . ' += '
4281 . ref($self)
4282 . "'. undef returned.");
4283 return;
4284 }
4285
4286 return $self->_union($other)
4287 },
99870f4d
KW
4288 '&' => sub { my $self = shift;
4289 my $other = shift;
4290
4291 return $self->_intersect($other, 0);
4292 },
3d0c6d38
KW
4293 '&=' => sub { my $self = shift;
4294 my $other = shift;
4295 my $reversed = shift;
4296
4297 if ($reversed) {
4298 Carp::my_carp_bug("Bad news. Can't cope with '"
4299 . ref($other)
4300 . ' &= '
4301 . ref($self)
4302 . "'. undef returned.");
4303 return;
4304 }
4305
4306 return $self->_intersect($other, 0);
4307 },
99870f4d
KW
4308 '~' => "_invert",
4309 '-' => "_subtract",
4310 ;
4311
4312 sub _invert {
4313 # Returns a new Range_List that gives all code points not in $self.
4314
4315 my $self = shift;
4316
4317 my $new = Range_List->new;
4318
4319 # Go through each range in the table, finding the gaps between them
4320 my $max = -1; # Set so no gap before range beginning at 0
4321 for my $range ($self->ranges) {
4322 my $start = $range->start;
4323 my $end = $range->end;
4324
4325 # If there is a gap before this range, the inverse will contain
4326 # that gap.
4327 if ($start > $max + 1) {
4328 $new->add_range($max + 1, $start - 1);
4329 }
4330 $max = $end;
4331 }
4332
4333 # And finally, add the gap from the end of the table to the max
4334 # possible code point
6189eadc
KW
4335 if ($max < $MAX_UNICODE_CODEPOINT) {
4336 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
99870f4d
KW
4337 }
4338 return $new;
4339 }
4340
4341 sub _subtract {
4342 # Returns a new Range_List with the argument deleted from it. The
4343 # argument can be a single code point, a range, or something that has
4344 # a range, with the _range_list() method on it returning them
4345
4346 my $self = shift;
4347 my $other = shift;
4348 my $reversed = shift;
4349 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4350
4351 if ($reversed) {
5d9b6ded
KW
4352 Carp::my_carp_bug("Bad news. Can't cope with '"
4353 . ref($other)
4354 . ' - '
4355 . ref($self)
4356 . "'. undef returned.");
4357 return;
99870f4d
KW
4358 }
4359
4360 my $new = Range_List->new(Initialize => $self);
4361
4362 if (! ref $other) { # Single code point
4363 $new->delete_range($other, $other);
4364 }
4365 elsif ($other->isa('Range')) {
4366 $new->delete_range($other->start, $other->end);
4367 }
4368 elsif ($other->can('_range_list')) {
4369 foreach my $range ($other->_range_list->ranges) {
4370 $new->delete_range($range->start, $range->end);
4371 }
4372 }
4373 else {
4374 Carp::my_carp_bug("Can't cope with a "
4375 . ref($other)
4376 . " argument to '-'. Subtraction ignored."
4377 );
4378 return $self;
4379 }
4380
4381 return $new;
4382 }
4383
4384 sub _intersect {
4385 # Returns either a boolean giving whether the two inputs' range lists
4386 # intersect (overlap), or a new Range_List containing the intersection
4387 # of the two lists. The optional final parameter being true indicates
4388 # to do the check instead of the intersection.
4389
4390 my $a_object = shift;
4391 my $b_object = shift;
4392 my $check_if_overlapping = shift;
4393 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4394 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4395
4396 if (! defined $b_object) {
4397 my $message = "";
4398 $message .= $a_object->_owner_name_of if defined $a_object;
4399 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4400 return;
4401 }
4402
4403 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4404 # Thus the intersection could be much more simply be written:
4405 # return ~(~$a_object + ~$b_object);
4406 # But, this is slower, and when taking the inverse of a large
4407 # range_size_1 table, back when such tables were always stored that
4408 # way, it became prohibitively slow, hence the code was changed to the
4409 # below
4410
4411 if ($b_object->isa('Range')) {
4412 $b_object = Range_List->new(Initialize => $b_object,
4413 Owner => $a_object->_owner_name_of);
4414 }
4415 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4416
4417 my @a_ranges = $a_object->ranges;
4418 my @b_ranges = $b_object->ranges;
4419
4420 #local $to_trace = 1 if main::DEBUG;
4421 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4422
4423 # Start with the first range in each list
4424 my $a_i = 0;
4425 my $range_a = $a_ranges[$a_i];
4426 my $b_i = 0;
4427 my $range_b = $b_ranges[$b_i];
4428
4429 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4430 if ! $check_if_overlapping;
4431
4432 # If either list is empty, there is no intersection and no overlap
4433 if (! defined $range_a || ! defined $range_b) {
4434 return $check_if_overlapping ? 0 : $new;
4435 }
4436 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4437
4438 # Otherwise, must calculate the intersection/overlap. Start with the
4439 # very first code point in each list
4440 my $a = $range_a->start;
4441 my $b = $range_b->start;
4442
4443 # Loop through all the ranges of each list; in each iteration, $a and
4444 # $b are the current code points in their respective lists
4445 while (1) {
4446
4447 # If $a and $b are the same code point, ...
4448 if ($a == $b) {
4449
4450 # it means the lists overlap. If just checking for overlap
4451 # know the answer now,
4452 return 1 if $check_if_overlapping;
4453
4454 # The intersection includes this code point plus anything else
4455 # common to both current ranges.
4456 my $start = $a;
4457 my $end = main::min($range_a->end, $range_b->end);
4458 if (! $check_if_overlapping) {
4459 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4460 $new->add_range($start, $end);
4461 }
4462
4463 # Skip ahead to the end of the current intersect
4464 $a = $b = $end;
4465
4466 # If the current intersect ends at the end of either range (as
4467 # it must for at least one of them), the next possible one
4468 # will be the beginning code point in it's list's next range.
4469 if ($a == $range_a->end) {
4470 $range_a = $a_ranges[++$a_i];
4471 last unless defined $range_a;
4472 $a = $range_a->start;
4473 }
4474 if ($b == $range_b->end) {
4475 $range_b = $b_ranges[++$b_i];
4476 last unless defined $range_b;
4477 $b = $range_b->start;
4478 }
4479
4480 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4481 }
4482 elsif ($a < $b) {
4483
4484 # Not equal, but if the range containing $a encompasses $b,
4485 # change $a to be the middle of the range where it does equal
4486 # $b, so the next iteration will get the intersection
4487 if ($range_a->end >= $b) {
4488 $a = $b;
4489 }
4490 else {
4491
4492 # Here, the current range containing $a is entirely below
4493 # $b. Go try to find a range that could contain $b.
4494 $a_i = $a_object->_search_ranges($b);
4495
4496 # If no range found, quit.
4497 last unless defined $a_i;
4498
4499 # The search returns $a_i, such that
4500 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4501 # Set $a to the beginning of this new range, and repeat.
4502 $range_a = $a_ranges[$a_i];
4503 $a = $range_a->start;
4504 }
4505 }
4506 else { # Here, $b < $a.
4507
4508 # Mirror image code to the leg just above
4509 if ($range_b->end >= $a) {
4510 $b = $a;
4511 }
4512 else {
4513 $b_i = $b_object->_search_ranges($a);
4514 last unless defined $b_i;
4515 $range_b = $b_ranges[$b_i];
4516 $b = $range_b->start;
4517 }
4518 }
4519 } # End of looping through ranges.
4520
4521 # Intersection fully computed, or now know that there is no overlap
4522 return $check_if_overlapping ? 0 : $new;
4523 }
4524
4525 sub overlaps {
4526 # Returns boolean giving whether the two arguments overlap somewhere
4527
4528 my $self = shift;
4529 my $other = shift;
4530 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4531
4532 return $self->_intersect($other, 1);
4533 }
4534
4535 sub add_range {
4536 # Add a range to the list.
4537
4538 my $self = shift;
4539 my $start = shift;
4540 my $end = shift;
4541 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4542
4543 return $self->_add_delete('+', $start, $end, "");
4544 }
4545
09aba7e4
KW
4546 sub matches_identically_to {
4547 # Return a boolean as to whether or not two Range_Lists match identical
4548 # sets of code points.
4549
4550 my $self = shift;
4551 my $other = shift;
4552 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4553
4554 # These are ordered in increasing real time to figure out (at least
4555 # until a patch changes that and doesn't change this)
4556 return 0 if $self->max != $other->max;
4557 return 0 if $self->min != $other->min;
4558 return 0 if $self->range_count != $other->range_count;
4559 return 0 if $self->count != $other->count;
4560
4561 # Here they could be identical because all the tests above passed.
4562 # The loop below is somewhat simpler since we know they have the same
4563 # number of elements. Compare range by range, until reach the end or
4564 # find something that differs.
4565 my @a_ranges = $self->ranges;
4566 my @b_ranges = $other->ranges;
4567 for my $i (0 .. @a_ranges - 1) {
4568 my $a = $a_ranges[$i];
4569 my $b = $b_ranges[$i];
4570 trace "self $a; other $b" if main::DEBUG && $to_trace;
c1c2d9e8
KW
4571 return 0 if ! defined $b
4572 || $a->start != $b->start
4573 || $a->end != $b->end;
09aba7e4
KW
4574 }
4575 return 1;
4576 }
4577
99870f4d
KW
4578 sub is_code_point_usable {
4579 # This used only for making the test script. See if the input
4580 # proposed trial code point is one that Perl will handle. If second
4581 # parameter is 0, it won't select some code points for various
4582 # reasons, noted below.
4583
4584 my $code = shift;
4585 my $try_hard = shift;
4586 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4587
4588 return 0 if $code < 0; # Never use a negative
4589
99870f4d
KW
4590 # shun null. I'm (khw) not sure why this was done, but NULL would be
4591 # the character very frequently used.
4592 return $try_hard if $code == 0x0000;
4593
99870f4d
KW
4594 # shun non-character code points.
4595 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4596 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4597
6189eadc 4598 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
99870f4d
KW
4599 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4600
4601 return 1;
4602 }
4603
4604 sub get_valid_code_point {
4605 # Return a code point that's part of the range list. Returns nothing
4606 # if the table is empty or we can't find a suitable code point. This
4607 # used only for making the test script.
4608
4609 my $self = shift;
4610 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4611
ffe43484 4612 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4613
4614 # On first pass, don't choose less desirable code points; if no good
4615 # one is found, repeat, allowing a less desirable one to be selected.
4616 for my $try_hard (0, 1) {
4617
4618 # Look through all the ranges for a usable code point.
06feba9a 4619 for my $set (reverse $self->ranges) {
99870f4d
KW
4620
4621 # Try the edge cases first, starting with the end point of the
4622 # range.
4623 my $end = $set->end;
4624 return $end if is_code_point_usable($end, $try_hard);
4625
4626 # End point didn't, work. Start at the beginning and try
4627 # every one until find one that does work.
4628 for my $trial ($set->start .. $end - 1) {
4629 return $trial if is_code_point_usable($trial, $try_hard);
4630 }
4631 }
4632 }
4633 return (); # If none found, give up.
4634 }
4635
4636 sub get_invalid_code_point {
4637 # Return a code point that's not part of the table. Returns nothing
4638 # if the table covers all code points or a suitable code point can't
4639 # be found. This used only for making the test script.
4640
4641 my $self = shift;
4642 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4643
4644 # Just find a valid code point of the inverse, if any.
4645 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4646 }
4647} # end closure for Range_List
4648
4649package Range_Map;
4650use base '_Range_List_Base';
4651
4652# A Range_Map is a range list in which the range values (called maps) are
4653# significant, and hence shouldn't be manipulated by our other code, which
4654# could be ambiguous or lose things. For example, in taking the union of two
4655# lists, which share code points, but which have differing values, which one
4656# has precedence in the union?
4657# It turns out that these operations aren't really necessary for map tables,
4658# and so this class was created to make sure they aren't accidentally
4659# applied to them.
4660
4661{ # Closure
4662
4663 sub add_map {
4664 # Add a range containing a mapping value to the list
4665
4666 my $self = shift;
4667 # Rest of parameters passed on
4668
4669 return $self->_add_delete('+', @_);
4670 }
4671
4672 sub add_duplicate {
4673 # Adds entry to a range list which can duplicate an existing entry
4674
4675 my $self = shift;
4676 my $code_point = shift;
4677 my $value = shift;
7f4b1e25
KW
4678 my %args = @_;
4679 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4680 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
99870f4d
KW
4681
4682 return $self->add_map($code_point, $code_point,
7f4b1e25 4683 $value, Replace => $replace);
99870f4d
KW
4684 }
4685} # End of closure for package Range_Map
4686
4687package _Base_Table;
4688
4689# A table is the basic data structure that gets written out into a file for
4690# use by the Perl core. This is the abstract base class implementing the
4691# common elements from the derived ones. A list of the methods to be
4692# furnished by an implementing class is just after the constructor.
4693
4694sub standardize { return main::standardize($_[0]); }
4695sub trace { return main::trace(@_); }
4696
4697{ # Closure
4698
4699 main::setup_package();
4700
4701 my %range_list;
4702 # Object containing the ranges of the table.
4703 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4704
4705 my %full_name;
4706 # The full table name.
4707 main::set_access('full_name', \%full_name, 'r');
4708
4709 my %name;
4710 # The table name, almost always shorter
4711 main::set_access('name', \%name, 'r');
4712
4713 my %short_name;
4714 # The shortest of all the aliases for this table, with underscores removed
4715 main::set_access('short_name', \%short_name);
4716
4717 my %nominal_short_name_length;
4718 # The length of short_name before removing underscores
4719 main::set_access('nominal_short_name_length',
4720 \%nominal_short_name_length);
4721
23e33b60
KW
4722 my %complete_name;
4723 # The complete name, including property.
4724 main::set_access('complete_name', \%complete_name, 'r');
4725
99870f4d
KW
4726 my %property;
4727 # Parent property this table is attached to.
4728 main::set_access('property', \%property, 'r');
4729
4730 my %aliases;
c12f2655
KW
4731 # Ordered list of alias objects of the table's name. The first ones in
4732 # the list are output first in comments
99870f4d
KW
4733 main::set_access('aliases', \%aliases, 'readable_array');
4734
4735 my %comment;
4736 # A comment associated with the table for human readers of the files
4737 main::set_access('comment', \%comment, 's');
4738
4739 my %description;
4740 # A comment giving a short description of the table's meaning for human
4741 # readers of the files.
4742 main::set_access('description', \%description, 'readable_array');
4743
4744 my %note;
4745 # A comment giving a short note about the table for human readers of the
4746 # files.
4747 main::set_access('note', \%note, 'readable_array');
4748
301ba948
KW
4749 my %fate;
4750 # Enum; there are a number of possibilities for what happens to this
4751 # table: it could be normal, or suppressed, or not for external use. See
4752 # values at definition for $SUPPRESSED.
4753 main::set_access('fate', \%fate, 'r');
99870f4d
KW
4754
4755 my %find_table_from_alias;
4756 # The parent property passes this pointer to a hash which this class adds
4757 # all its aliases to, so that the parent can quickly take an alias and
4758 # find this table.
4759 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4760
4761 my %locked;
4762 # After this table is made equivalent to another one; we shouldn't go
4763 # changing the contents because that could mean it's no longer equivalent
4764 main::set_access('locked', \%locked, 'r');
4765
4766 my %file_path;
4767 # This gives the final path to the file containing the table. Each
4768 # directory in the path is an element in the array
4769 main::set_access('file_path', \%file_path, 'readable_array');
4770
4771 my %status;
4772 # What is the table's status, normal, $OBSOLETE, etc. Enum
4773 main::set_access('status', \%status, 'r');
4774
4775 my %status_info;
4776 # A comment about its being obsolete, or whatever non normal status it has
4777 main::set_access('status_info', \%status_info, 'r');
4778
d867ccfb
KW
4779 my %caseless_equivalent;
4780 # The table this is equivalent to under /i matching, if any.
4781 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4782
99870f4d
KW
4783 my %range_size_1;
4784 # Is the table to be output with each range only a single code point?
4785 # This is done to avoid breaking existing code that may have come to rely
4786 # on this behavior in previous versions of this program.)
4787 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4788
4789 my %perl_extension;
4790 # A boolean set iff this table is a Perl extension to the Unicode
4791 # standard.
4792 main::set_access('perl_extension', \%perl_extension, 'r');
4793
0c07e538
KW
4794 my %output_range_counts;
4795 # A boolean set iff this table is to have comments written in the
4796 # output file that contain the number of code points in the range.
4797 # The constructor can override the global flag of the same name.
4798 main::set_access('output_range_counts', \%output_range_counts, 'r');
4799
f5817e0a
KW
4800 my %format;
4801 # The format of the entries of the table. This is calculated from the
4802 # data in the table (or passed in the constructor). This is an enum e.g.,
26561784
KW
4803 # $STRING_FORMAT. It is marked protected as it should not be generally
4804 # used to override calculations.
f5817e0a
KW
4805 main::set_access('format', \%format, 'r', 'p_s');
4806
99870f4d
KW
4807 sub new {
4808 # All arguments are key => value pairs, which you can see below, most
33e96e72 4809 # of which match fields documented above. Otherwise: Re_Pod_Entry,
0eac1e20 4810 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
99870f4d
KW
4811 # documented in the Alias package
4812
4813 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4814
4815 my $class = shift;
4816
4817 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4818 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4819
4820 my %args = @_;
4821
4822 $name{$addr} = delete $args{'Name'};
4823 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4824 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4825 my $complete_name = $complete_name{$addr}
4826 = delete $args{'Complete_Name'};
f5817e0a 4827 $format{$addr} = delete $args{'Format'};
0c07e538 4828 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4829 $property{$addr} = delete $args{'_Property'};
4830 $range_list{$addr} = delete $args{'_Range_List'};
4831 $status{$addr} = delete $args{'Status'} || $NORMAL;
4832 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4833 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
d867ccfb 4834 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
301ba948 4835 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
fd1e3e84 4836 my $ucd = delete $args{'UCD'};
99870f4d
KW
4837
4838 my $description = delete $args{'Description'};
0eac1e20 4839 my $ok_as_filename = delete $args{'OK_as_Filename'};
99870f4d
KW
4840 my $loose_match = delete $args{'Fuzzy'};
4841 my $note = delete $args{'Note'};
33e96e72 4842 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
37e2e78e 4843 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4844
4845 # Shouldn't have any left over
4846 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4847
4848 # Can't use || above because conceivably the name could be 0, and
4849 # can't use // operator in case this program gets used in Perl 5.8
4850 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4851 $output_range_counts{$addr} = $output_range_counts if
4852 ! defined $output_range_counts{$addr};
99870f4d
KW
4853
4854 $aliases{$addr} = [ ];
4855 $comment{$addr} = [ ];
4856 $description{$addr} = [ ];
4857 $note{$addr} = [ ];
4858 $file_path{$addr} = [ ];
4859 $locked{$addr} = "";
4860
4861 push @{$description{$addr}}, $description if $description;
4862 push @{$note{$addr}}, $note if $note;
4863
301ba948 4864 if ($fate{$addr} == $PLACEHOLDER) {
37e2e78e
KW
4865
4866 # A placeholder table doesn't get documented, is a perl extension,
4867 # and quite likely will be empty
33e96e72 4868 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
37e2e78e 4869 $perl_extension = 1 if ! defined $perl_extension;
fd1e3e84 4870 $ucd = 0 if ! defined $ucd;
37e2e78e 4871 push @tables_that_may_be_empty, $complete_name{$addr};
301ba948
KW
4872 $self->add_comment(<<END);
4873This is a placeholder because it is not in Version $string_version of Unicode,
4874but is needed by the Perl core to work gracefully. Because it is not in this
4875version of Unicode, it will not be listed in $pod_file.pod
4876END
37e2e78e 4877 }
301ba948 4878 elsif (exists $why_suppressed{$complete_name}
98dc9551 4879 # Don't suppress if overridden
ec11e5f4
KW
4880 && ! grep { $_ eq $complete_name{$addr} }
4881 @output_mapped_properties)
301ba948
KW
4882 {
4883 $fate{$addr} = $SUPPRESSED;
4884 }
4885 elsif ($fate{$addr} == $SUPPRESSED
4886 && ! exists $why_suppressed{$property{$addr}->complete_name})
4887 {
4888 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4889 # perhaps Fate => [ $SUPPRESSED, "reason" ]
4890 }
4891
4892 # If hasn't set its status already, see if it is on one of the
4893 # lists of properties or tables that have particular statuses; if
4894 # not, is normal. The lists are prioritized so the most serious
4895 # ones are checked first
4896 if (! $status{$addr}) {
4897 if (exists $why_deprecated{$complete_name}) {
99870f4d
KW
4898 $status{$addr} = $DEPRECATED;
4899 }
4900 elsif (exists $why_stabilized{$complete_name}) {
4901 $status{$addr} = $STABILIZED;
4902 }
4903 elsif (exists $why_obsolete{$complete_name}) {
4904 $status{$addr} = $OBSOLETE;
4905 }
4906
4907 # Existence above doesn't necessarily mean there is a message
4908 # associated with it. Use the most serious message.
4909 if ($status{$addr}) {
301ba948 4910 if ($why_deprecated{$complete_name}) {
99870f4d
KW
4911 $status_info{$addr}
4912 = $why_deprecated{$complete_name};
4913 }
4914 elsif ($why_stabilized{$complete_name}) {
4915 $status_info{$addr}
4916 = $why_stabilized{$complete_name};
4917 }
4918 elsif ($why_obsolete{$complete_name}) {
4919 $status_info{$addr}
4920 = $why_obsolete{$complete_name};
4921 }
4922 }
4923 }
4924
37e2e78e
KW
4925 $perl_extension{$addr} = $perl_extension || 0;
4926
8050d00f 4927 # Don't list a property by default that is internal only
395dfc19 4928 if ($fate{$addr} > $MAP_PROXIED) {
301ba948 4929 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
fd1e3e84
KW
4930 $ucd = 0 if ! defined $ucd;
4931 }
4932 else {
4933 $ucd = 1 if ! defined $ucd;
301ba948 4934 }
8050d00f 4935
99870f4d
KW
4936 # By convention what typically gets printed only or first is what's
4937 # first in the list, so put the full name there for good output
4938 # clarity. Other routines rely on the full name being first on the
4939 # list
4940 $self->add_alias($full_name{$addr},
0eac1e20 4941 OK_as_Filename => $ok_as_filename,
99870f4d 4942 Fuzzy => $loose_match,
33e96e72 4943 Re_Pod_Entry => $make_re_pod_entry,
99870f4d 4944 Status => $status{$addr},
fd1e3e84 4945 UCD => $ucd,
99870f4d
KW
4946 );
4947
4948 # Then comes the other name, if meaningfully different.
4949 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4950 $self->add_alias($name{$addr},
0eac1e20 4951 OK_as_Filename => $ok_as_filename,
99870f4d 4952 Fuzzy => $loose_match,
33e96e72 4953 Re_Pod_Entry => $make_re_pod_entry,
99870f4d 4954 Status => $status{$addr},
fd1e3e84 4955 UCD => $ucd,
99870f4d
KW
4956 );
4957 }
4958
4959 return $self;
4960 }
4961
4962 # Here are the methods that are required to be defined by any derived
4963 # class
ea25a9b2 4964 for my $sub (qw(
668b3bfc 4965 handle_special_range
99870f4d 4966 append_to_body
99870f4d 4967 pre_body
ea25a9b2 4968 ))
668b3bfc
KW
4969 # write() knows how to write out normal ranges, but it calls
4970 # handle_special_range() when it encounters a non-normal one.
4971 # append_to_body() is called by it after it has handled all
4972 # ranges to add anything after the main portion of the table.
4973 # And finally, pre_body() is called after all this to build up
4974 # anything that should appear before the main portion of the
4975 # table. Doing it this way allows things in the middle to
4976 # affect what should appear before the main portion of the
99870f4d 4977 # table.
99870f4d
KW
4978 {
4979 no strict "refs";
4980 *$sub = sub {
4981 Carp::my_carp_bug( __LINE__
4982 . ": Must create method '$sub()' for "
4983 . ref shift);
4984 return;
4985 }
4986 }
4987
4988 use overload
4989 fallback => 0,
4990 "." => \&main::_operator_dot,
1285127e 4991 ".=" => \&main::_operator_dot_equal,
99870f4d
KW
4992 '!=' => \&main::_operator_not_equal,
4993 '==' => \&main::_operator_equal,
4994 ;
4995
4996 sub ranges {
4997 # Returns the array of ranges associated with this table.
4998
f998e60c 4999 no overloading;
051df77b 5000 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
5001 }
5002
5003 sub add_alias {
5004 # Add a synonym for this table.
5005
5006 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5007
5008 my $self = shift;
5009 my $name = shift; # The name to add.
5010 my $pointer = shift; # What the alias hash should point to. For
5011 # map tables, this is the parent property;
5012 # for match tables, it is the table itself.
5013
5014 my %args = @_;
5015 my $loose_match = delete $args{'Fuzzy'};
5016
33e96e72
KW
5017 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5018 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
99870f4d 5019
0eac1e20
KW
5020 my $ok_as_filename = delete $args{'OK_as_Filename'};
5021 $ok_as_filename = 1 unless defined $ok_as_filename;
99870f4d
KW
5022
5023 my $status = delete $args{'Status'};
5024 $status = $NORMAL unless defined $status;
5025
26cef665
KW
5026 # An internal name does not get documented, unless overridden by the
5027 # input.
5028 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
fd1e3e84 5029
99870f4d
KW
5030 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5031
5032 # Capitalize the first letter of the alias unless it is one of the CJK
5033 # ones which specifically begins with a lower 'k'. Do this because
5034 # Unicode has varied whether they capitalize first letters or not, and
5035 # have later changed their minds and capitalized them, but not the
5036 # other way around. So do it always and avoid changes from release to
5037 # release
5038 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5039
ffe43484 5040 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5041
5042 # Figure out if should be loosely matched if not already specified.
5043 if (! defined $loose_match) {
5044
5045 # Is a loose_match if isn't null, and doesn't begin with an
5046 # underscore and isn't just a number
5047 if ($name ne ""
5048 && substr($name, 0, 1) ne '_'
5049 && $name !~ qr{^[0-9_.+-/]+$})
5050 {
5051 $loose_match = 1;
5052 }
5053 else {
5054 $loose_match = 0;
5055 }
5056 }
5057
5058 # If this alias has already been defined, do nothing.
5059 return if defined $find_table_from_alias{$addr}->{$name};
5060
5061 # That includes if it is standardly equivalent to an existing alias,
5062 # in which case, add this name to the list, so won't have to search
5063 # for it again.
5064 my $standard_name = main::standardize($name);
5065 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5066 $find_table_from_alias{$addr}->{$name}
5067 = $find_table_from_alias{$addr}->{$standard_name};
5068 return;
5069 }
5070
5071 # Set the index hash for this alias for future quick reference.
5072 $find_table_from_alias{$addr}->{$name} = $pointer;
5073 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5074 local $to_trace = 0 if main::DEBUG;
5075 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5076 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5077
5078
5079 # Put the new alias at the end of the list of aliases unless the final
5080 # element begins with an underscore (meaning it is for internal perl
5081 # use) or is all numeric, in which case, put the new one before that
5082 # one. This floats any all-numeric or underscore-beginning aliases to
5083 # the end. This is done so that they are listed last in output lists,
5084 # to encourage the user to use a better name (either more descriptive
5085 # or not an internal-only one) instead. This ordering is relied on
5086 # implicitly elsewhere in this program, like in short_name()
5087 my $list = $aliases{$addr};
5088 my $insert_position = (@$list == 0
5089 || (substr($list->[-1]->name, 0, 1) ne '_'
5090 && $list->[-1]->name =~ /\D/))
5091 ? @$list
5092 : @$list - 1;
5093 splice @$list,
5094 $insert_position,
5095 0,
33e96e72 5096 Alias->new($name, $loose_match, $make_re_pod_entry,
0eac1e20 5097 $ok_as_filename, $status, $ucd);
99870f4d
KW
5098
5099 # This name may be shorter than any existing ones, so clear the cache
5100 # of the shortest, so will have to be recalculated.
f998e60c 5101 no overloading;
051df77b 5102 undef $short_name{pack 'J', $self};
99870f4d
KW
5103 return;
5104 }
5105
5106 sub short_name {
5107 # Returns a name suitable for use as the base part of a file name.
5108 # That is, shorter wins. It can return undef if there is no suitable
5109 # name. The name has all non-essential underscores removed.
5110
5111 # The optional second parameter is a reference to a scalar in which
5112 # this routine will store the length the returned name had before the
5113 # underscores were removed, or undef if the return is undef.
5114
5115 # The shortest name can change if new aliases are added. So using
5116 # this should be deferred until after all these are added. The code
5117 # that does that should clear this one's cache.
5118 # Any name with alphabetics is preferred over an all numeric one, even
5119 # if longer.
5120
5121 my $self = shift;
5122 my $nominal_length_ptr = shift;
5123 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5124
ffe43484 5125 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5126
5127 # For efficiency, don't recalculate, but this means that adding new
5128 # aliases could change what the shortest is, so the code that does
5129 # that needs to undef this.
5130 if (defined $short_name{$addr}) {
5131 if ($nominal_length_ptr) {
5132 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5133 }
5134 return $short_name{$addr};
5135 }
5136
5137 # Look at each alias
5138 foreach my $alias ($self->aliases()) {
5139
5140 # Don't use an alias that isn't ok to use for an external name.
0eac1e20 5141 next if ! $alias->ok_as_filename;
99870f4d
KW
5142
5143 my $name = main::Standardize($alias->name);
5144 trace $self, $name if main::DEBUG && $to_trace;
5145
5146 # Take the first one, or a shorter one that isn't numeric. This
5147 # relies on numeric aliases always being last in the array
5148 # returned by aliases(). Any alpha one will have precedence.
5149 if (! defined $short_name{$addr}
5150 || ($name =~ /\D/
5151 && length($name) < length($short_name{$addr})))
5152 {
5153 # Remove interior underscores.
5154 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5155
5156 $nominal_short_name_length{$addr} = length $name;
5157 }
5158 }
5159
ff485b9e
KW
5160 # If the short name isn't a nice one, perhaps an equivalent table has
5161 # a better one.
5162 if (! defined $short_name{$addr}
5163 || $short_name{$addr} eq ""
5164 || $short_name{$addr} eq "_")
5165 {
5166 my $return;
5167 foreach my $follower ($self->children) { # All equivalents
5168 my $follower_name = $follower->short_name;
5169 next unless defined $follower_name;
5170
5171 # Anything (except undefined) is better than underscore or
5172 # empty
5173 if (! defined $return || $return eq "_") {
5174 $return = $follower_name;
5175 next;
5176 }
5177
5178 # If the new follower name isn't "_" and is shorter than the
5179 # current best one, prefer the new one.
5180 next if $follower_name eq "_";
5181 next if length $follower_name > length $return;
5182 $return = $follower_name;
5183 }
5184 $short_name{$addr} = $return if defined $return;
5185 }
5186
99870f4d
KW
5187 # If no suitable external name return undef
5188 if (! defined $short_name{$addr}) {
5189 $$nominal_length_ptr = undef if $nominal_length_ptr;
5190 return;
5191 }
5192
c12f2655 5193 # Don't allow a null short name.
99870f4d
KW
5194 if ($short_name{$addr} eq "") {
5195 $short_name{$addr} = '_';
5196 $nominal_short_name_length{$addr} = 1;
5197 }
5198
5199 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5200
5201 if ($nominal_length_ptr) {
5202 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5203 }
5204 return $short_name{$addr};
5205 }
5206
5207 sub external_name {
5208 # Returns the external name that this table should be known by. This
c12f2655
KW
5209 # is usually the short_name, but not if the short_name is undefined,
5210 # in which case the external_name is arbitrarily set to the
5211 # underscore.
99870f4d
KW
5212
5213 my $self = shift;
5214 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5215
5216 my $short = $self->short_name;
5217 return $short if defined $short;
5218
5219 return '_';
5220 }
5221
5222 sub add_description { # Adds the parameter as a short description.
5223
5224 my $self = shift;
5225 my $description = shift;
5226 chomp $description;
5227 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5228
f998e60c 5229 no overloading;
051df77b 5230 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
5231
5232 return;
5233 }
5234
5235 sub add_note { # Adds the parameter as a short note.
5236
5237 my $self = shift;
5238 my $note = shift;
5239 chomp $note;
5240 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5241
f998e60c 5242 no overloading;
051df77b 5243 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
5244
5245 return;
5246 }
5247
5248 sub add_comment { # Adds the parameter as a comment.
5249
bd9ebcfd
KW
5250 return unless $debugging_build;
5251
99870f4d
KW
5252 my $self = shift;
5253 my $comment = shift;
5254 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5255
5256 chomp $comment;
f998e60c
KW
5257
5258 no overloading;
051df77b 5259 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
5260
5261 return;
5262 }
5263
5264 sub comment {
5265 # Return the current comment for this table. If called in list
5266 # context, returns the array of comments. In scalar, returns a string
5267 # of each element joined together with a period ending each.
5268
5269 my $self = shift;
5270 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5271
ffe43484 5272 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 5273 my @list = @{$comment{$addr}};
99870f4d
KW
5274 return @list if wantarray;
5275 my $return = "";
5276 foreach my $sentence (@list) {
5277 $return .= '. ' if $return;
5278 $return .= $sentence;
5279 $return =~ s/\.$//;
5280 }
5281 $return .= '.' if $return;
5282 return $return;
5283 }
5284
5285 sub initialize {
5286 # Initialize the table with the argument which is any valid
5287 # initialization for range lists.
5288
5289 my $self = shift;
ffe43484 5290 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5291 my $initialization = shift;
5292 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5293
5294 # Replace the current range list with a new one of the same exact
5295 # type.
f998e60c
KW
5296 my $class = ref $range_list{$addr};
5297 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
5298 Initialize => $initialization);
5299 return;
5300
5301 }
5302
5303 sub header {
5304 # The header that is output for the table in the file it is written
5305 # in.
5306
5307 my $self = shift;
5308 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5309
5310 my $return = "";
5311 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5312 $return .= $HEADER;
99870f4d
KW
5313 return $return;
5314 }
5315
5316 sub write {
668b3bfc
KW
5317 # Write a representation of the table to its file. It calls several
5318 # functions furnished by sub-classes of this abstract base class to
5319 # handle non-normal ranges, to add stuff before the table, and at its
ce712c88
KW
5320 # end. If the table is to be written so that adjustments are
5321 # required, this does that conversion.
99870f4d
KW
5322
5323 my $self = shift;
ce712c88 5324 my $use_adjustments = shift; # ? output in adjusted format or not
99870f4d
KW
5325 my $tab_stops = shift; # The number of tab stops over to put any
5326 # comment.
5327 my $suppress_value = shift; # Optional, if the value associated with
5328 # a range equals this one, don't write
5329 # the range
5330 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5331
ffe43484 5332 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5333
5334 # Start with the header
668b3bfc 5335 my @HEADER = $self->header;
99870f4d
KW
5336
5337 # Then the comments
668b3bfc 5338 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
99870f4d
KW
5339 if $comment{$addr};
5340
668b3bfc
KW
5341 # Things discovered processing the main body of the document may
5342 # affect what gets output before it, therefore pre_body() isn't called
5343 # until after all other processing of the table is done.
99870f4d 5344
c4019d52
KW
5345 # The main body looks like a 'here' document. If annotating, get rid
5346 # of the comments before passing to the caller, as some callers, such
5347 # as charnames.pm, can't cope with them. (Outputting range counts
5348 # also introduces comments, but these don't show up in the tables that
5349 # can't cope with comments, and there aren't that many of them that
5350 # it's worth the extra real time to get rid of them).
668b3bfc 5351 my @OUT;
558712cf 5352 if ($annotate) {
c4019d52
KW
5353 # Use the line below in Perls that don't have /r
5354 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5355 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5356 } else {
5357 push @OUT, "return <<'END';\n";
5358 }
99870f4d
KW
5359
5360 if ($range_list{$addr}->is_empty) {
5361
5362 # This is a kludge for empty tables to silence a warning in
5363 # utf8.c, which can't really deal with empty tables, but it can
5364 # deal with a table that matches nothing, as the inverse of 'Any'
5365 # does.
67a53d68 5366 push @OUT, "!utf8::Any\n";
99870f4d 5367 }
c69a9c68
KW
5368 elsif ($self->name eq 'N'
5369
5370 # To save disk space and table cache space, avoid putting out
5371 # binary N tables, but instead create a file which just inverts
5372 # the Y table. Since the file will still exist and occupy a
5373 # certain number of blocks, might as well output the whole
5374 # thing if it all will fit in one block. The number of
5375 # ranges below is an approximate number for that.
06f26c45
KW
5376 && ($self->property->type == $BINARY
5377 || $self->property->type == $FORCED_BINARY)
c69a9c68
KW
5378 # && $self->property->tables == 2 Can't do this because the
5379 # non-binary properties, like NFDQC aren't specifiable
5380 # by the notation
5381 && $range_list{$addr}->ranges > 15
5382 && ! $annotate) # Under --annotate, want to see everything
5383 {
5384 push @OUT, "!utf8::" . $self->property->name . "\n";
5385 }
99870f4d
KW
5386 else {
5387 my $range_size_1 = $range_size_1{$addr};
558712cf
KW
5388 my $format; # Used only in $annotate option
5389 my $include_name; # Used only in $annotate option
c4019d52 5390
558712cf 5391 if ($annotate) {
c4019d52 5392
d59563d0 5393 # If annotating each code point, must print 1 per line.
c4019d52
KW
5394 # The variable could point to a subroutine, and we don't want
5395 # to lose that fact, so only set if not set already
5396 $range_size_1 = 1 if ! $range_size_1;
5397
5398 $format = $self->format;
5399
5400 # The name of the character is output only for tables that
5401 # don't already include the name in the output.
5402 my $property = $self->property;
5403 $include_name =
5404 ! ($property == $perl_charname
5405 || $property == main::property_ref('Unicode_1_Name')
5406 || $property == main::property_ref('Name')
5407 || $property == main::property_ref('Name_Alias')
5408 );
5409 }
99870f4d 5410
bbed833a
KW
5411 # Values for previous time through the loop. Initialize to
5412 # something that won't be adjacent to the first iteration;
5413 # only $previous_end matters for that.
5414 my $previous_start;
5415 my $previous_end = -2;
5416 my $previous_value;
5417
5418 # Values for next time through the portion of the loop that splits
5419 # the range. 0 in $next_start means there is no remaining portion
5420 # to deal with.
5421 my $next_start = 0;
5422 my $next_end;
5423 my $next_value;
d11155ec 5424 my $offset = 0;
bbed833a 5425
99870f4d 5426 # Output each range as part of the here document.
5a2b5ddb 5427 RANGE:
99870f4d 5428 for my $set ($range_list{$addr}->ranges) {
5a2b5ddb
KW
5429 if ($set->type != 0) {
5430 $self->handle_special_range($set);
5431 next RANGE;
5432 }
99870f4d
KW
5433 my $start = $set->start;
5434 my $end = $set->end;
5435 my $value = $set->value;
5436
5437 # Don't output ranges whose value is the one to suppress
c4019d52
KW
5438 next RANGE if defined $suppress_value
5439 && $value eq $suppress_value;
99870f4d 5440
bbed833a 5441 { # This bare block encloses the scope where we may need to
ce712c88 5442 # split a range (when outputting adjusteds), and each time
bbed833a
KW
5443 # through we handle the next portion of the original by
5444 # ending the block with a 'redo'. The values to use for
5445 # that next time through are set up just below in the
5446 # scalars whose names begin with '$next_'.
5447
ce712c88 5448 if ($use_adjustments) {
bbed833a 5449
d11155ec
KW
5450 # When converting to use adjustments, we can handle
5451 # only single element ranges. Set up so that this
5452 # time through the loop, we look at the first element,
5453 # and the next time through, we start off with the
5454 # remainder. Thus each time through we look at the
5455 # first element of the range
bbed833a
KW
5456 if ($end != $start) {
5457 $next_start = $start + 1;
5458 $next_end = $end;
5459 $next_value = $value;
5460 $end = $start;
5461 }
5462
ce712c88
KW
5463 # The values for some of these tables are stored as
5464 # hex strings. Convert those to decimal
5465 $value = hex($value)
5466 if $self->default_map eq $CODE_POINT
5467 && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
bbed833a
KW
5468
5469 # If this range is adjacent to the previous one, and
d11155ec
KW
5470 # the values in each are integers that are also
5471 # adjacent (differ by 1), then this range really
5472 # extends the previous one that is already in element
5473 # $OUT[-1]. So we pop that element, and pretend that
5474 # the range starts with whatever it started with.
5475 # $offset is incremented by 1 each time so that it
5476 # gives the current offset from the first element in
5477 # the accumulating range, and we keep in $value the
5478 # value of that first element.
bbed833a 5479 if ($start == $previous_end + 1
d11155ec
KW
5480 && $value =~ /^ -? \d+ $/xa
5481 && $previous_value =~ /^ -? \d+ $/xa
5482 && ($value == ($previous_value + ++$offset)))
bbed833a
KW
5483 {
5484 pop @OUT;
5485 $start = $previous_start;
d11155ec
KW
5486 $value = $previous_value;
5487 }
5488 else {
5489 $offset = 0;
bbed833a
KW
5490 }
5491
5492 # Save the current values for the next time through
5493 # the loop.
5494 $previous_start = $start;
5495 $previous_end = $end;
5496 $previous_value = $value;
5497 }
5498
74a3abe0
KW
5499 # If there is a range and doesn't need a single point range
5500 # output
5501 if ($start != $end && ! $range_size_1) {
5502 push @OUT, sprintf "%04X\t%04X", $start, $end;
5503 $OUT[-1] .= "\t$value" if $value ne "";
5504
5505 # Add a comment with the size of the range, if
5506 # requested. Expand Tabs to make sure they all start
5507 # in the same column, and then unexpand to use mostly
5508 # tabs.
5509 if (! $output_range_counts{$addr}) {
5510 $OUT[-1] .= "\n";
5511 }
5512 else {
5513 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5514 my $count = main::clarify_number($end - $start + 1);
5515 use integer;
5516
5517 my $width = $tab_stops * 8 - 1;
5518 $OUT[-1] = sprintf("%-*s # [%s]\n",
5519 $width,
5520 $OUT[-1],
5521 $count);
5522 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5523 }
99870f4d 5524 }
c4019d52 5525
74a3abe0
KW
5526 # Here to output a single code point per line.
5527 # If not to annotate, use the simple formats
5528 elsif (! $annotate) {
5529
5530 # Use any passed in subroutine to output.
5531 if (ref $range_size_1 eq 'CODE') {
5532 for my $i ($start .. $end) {
5533 push @OUT, &{$range_size_1}($i, $value);
5534 }
5535 }
5536 else {
c4019d52 5537
74a3abe0
KW
5538 # Here, caller is ok with default output.
5539 for (my $i = $start; $i <= $end; $i++) {
5540 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5541 }
c4019d52
KW
5542 }
5543 }
5544 else {
5545
74a3abe0 5546 # Here, wants annotation.
c4019d52 5547 for (my $i = $start; $i <= $end; $i++) {
c4019d52 5548
74a3abe0
KW
5549 # Get character information if don't have it already
5550 main::populate_char_info($i)
5551 if ! defined $viacode[$i];
5552 my $type = $annotate_char_type[$i];
5553
5554 # Figure out if should output the next code points
5555 # as part of a range or not. If this is not in an
5556 # annotation range, then won't output as a range,
5557 # so returns $i. Otherwise use the end of the
5558 # annotation range, but no further than the
5559 # maximum possible end point of the loop.
5560 my $range_end = main::min(
5561 $annotate_ranges->value_of($i) || $i,
5562 $end);
5563
5564 # Use a range if it is a range, and either is one
5565 # of the special annotation ranges, or the range
5566 # is at most 3 long. This last case causes the
5567 # algorithmically named code points to be output
5568 # individually in spans of at most 3, as they are
5569 # the ones whose $type is > 0.
5570 if ($range_end != $i
5571 && ( $type < 0 || $range_end - $i > 2))
5572 {
5573 # Here is to output a range. We don't allow a
5574 # caller-specified output format--just use the
5575 # standard one.
5576 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
c4019d52
KW
5577 $range_end,
5578 $value;
74a3abe0
KW
5579 my $range_name = $viacode[$i];
5580
5581 # For the code points which end in their hex
5582 # value, we eliminate that from the output
5583 # annotation, and capitalize only the first
5584 # letter of each word.
5585 if ($type == $CP_IN_NAME) {
5586 my $hex = sprintf "%04X", $i;
5587 $range_name =~ s/-$hex$//;
5588 my @words = split " ", $range_name;
5589 for my $word (@words) {
5590 $word =
5591 ucfirst(lc($word)) if $word ne 'CJK';
5592 }
5593 $range_name = join " ", @words;
5594 }
5595 elsif ($type == $HANGUL_SYLLABLE) {
5596 $range_name = "Hangul Syllable";
5597 }
c4019d52 5598
74a3abe0 5599 $OUT[-1] .= " $range_name" if $range_name;
c4019d52 5600
74a3abe0
KW
5601 # Include the number of code points in the
5602 # range
5603 my $count =
5604 main::clarify_number($range_end - $i + 1);
5605 $OUT[-1] .= " [$count]\n";
c4019d52 5606
74a3abe0
KW
5607 # Skip to the end of the range
5608 $i = $range_end;
c4019d52 5609 }
74a3abe0
KW
5610 else { # Not in a range.
5611 my $comment = "";
5612
5613 # When outputting the names of each character,
5614 # use the character itself if printable
5615 $comment .= "'" . chr($i) . "' "
5616 if $printable[$i];
5617
5618 # To make it more readable, use a minimum
5619 # indentation
5620 my $comment_indent;
5621
5622 # Determine the annotation
5623 if ($format eq $DECOMP_STRING_FORMAT) {
5624
5625 # This is very specialized, with the type
5626 # of decomposition beginning the line
5627 # enclosed in <...>, and the code points
5628 # that the code point decomposes to
5629 # separated by blanks. Create two
5630 # strings, one of the printable
5631 # characters, and one of their official
5632 # names.
5633 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5634 my $tostr = "";
5635 my $to_name = "";
5636 my $to_chr = "";
5637 foreach my $to (split " ", $map) {
5638 $to = CORE::hex $to;
5639 $to_name .= " + " if $to_name;
5640 $to_chr .= chr($to);
5641 main::populate_char_info($to)
5642 if ! defined $viacode[$to];
5643 $to_name .= $viacode[$to];
5644 }
c4019d52 5645
74a3abe0 5646 $comment .=
c4019d52 5647 "=> '$to_chr'; $viacode[$i] => $to_name";
74a3abe0
KW
5648 $comment_indent = 25; # Determined by
5649 # experiment
5650 }
5651 else {
5652
5653 # Assume that any table that has hex
5654 # format is a mapping of one code point to
5655 # another.
5656 if ($format eq $HEX_FORMAT) {
5657 my $decimal_value = CORE::hex $value;
5658 main::populate_char_info($decimal_value)
c4019d52 5659 if ! defined $viacode[$decimal_value];
74a3abe0
KW
5660 $comment .= "=> '"
5661 . chr($decimal_value)
5662 . "'; " if $printable[$decimal_value];
5663 }
5664 $comment .= $viacode[$i] if $include_name
5665 && $viacode[$i];
5666 if ($format eq $HEX_FORMAT) {
5667 my $decimal_value = CORE::hex $value;
5668 $comment .=
5669 " => $viacode[$decimal_value]"
5670 if $viacode[$decimal_value];
5671 }
c4019d52 5672
74a3abe0
KW
5673 # If including the name, no need to
5674 # indent, as the name will already be way
5675 # across the line.
5676 $comment_indent = ($include_name) ? 0 : 60;
5677 }
c4019d52 5678
74a3abe0
KW
5679 # Use any passed in routine to output the base
5680 # part of the line.
5681 if (ref $range_size_1 eq 'CODE') {
5682 my $base_part=&{$range_size_1}($i, $value);
5683 chomp $base_part;
5684 push @OUT, $base_part;
5685 }
5686 else {
5687 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5688 }
c4019d52 5689
74a3abe0
KW
5690 # And add the annotation.
5691 $OUT[-1] = sprintf "%-*s\t# %s",
5692 $comment_indent,
5693 $OUT[-1],
5694 $comment
5695 if $comment;
5696 $OUT[-1] .= "\n";
5697 }
5698 }
c4019d52 5699 }
bbed833a
KW
5700
5701 # If we split the range, set up so the next time through
5702 # we get the remainder, and redo.
5703 if ($next_start) {
5704 $start = $next_start;
5705 $end = $next_end;
5706 $value = $next_value;
5707 $next_start = 0;
5708 redo;
5709 }
99870f4d
KW
5710 }
5711 } # End of loop through all the table's ranges
5712 }
5713
5714 # Add anything that goes after the main body, but within the here
5715 # document,
5716 my $append_to_body = $self->append_to_body;
5717 push @OUT, $append_to_body if $append_to_body;
5718
5719 # And finish the here document.
5720 push @OUT, "END\n";
5721
668b3bfc
KW
5722 # Done with the main portion of the body. Can now figure out what
5723 # should appear before it in the file.
5724 my $pre_body = $self->pre_body;
5725 push @HEADER, $pre_body, "\n" if $pre_body;
668b3bfc 5726
6b0079b5
KW
5727 # All these files should have a .pl suffix added to them.
5728 my @file_with_pl = @{$file_path{$addr}};
5729 $file_with_pl[-1] .= '.pl';
99870f4d 5730
6b0079b5 5731 main::write(\@file_with_pl,
558712cf 5732 $annotate, # utf8 iff annotating
9218f1cf
KW
5733 \@HEADER,
5734 \@OUT);
99870f4d
KW
5735 return;
5736 }
5737
5738 sub set_status { # Set the table's status
5739 my $self = shift;
5740 my $status = shift; # The status enum value
5741 my $info = shift; # Any message associated with it.
5742 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5743
ffe43484 5744 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5745
5746 $status{$addr} = $status;
5747 $status_info{$addr} = $info;
5748 return;
5749 }
5750
301ba948
KW
5751 sub set_fate { # Set the fate of a table
5752 my $self = shift;
5753 my $fate = shift;
5754 my $reason = shift;
5755 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5756
5757 my $addr = do { no overloading; pack 'J', $self; };
5758
5759 return if $fate{$addr} == $fate; # If no-op
5760
395dfc19
KW
5761 # Can only change the ordinary fate, except if going to $MAP_PROXIED
5762 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
301ba948
KW
5763
5764 $fate{$addr} = $fate;
5765
395dfc19
KW
5766 # Don't document anything to do with a non-normal fated table
5767 if ($fate != $ORDINARY) {
fd1e3e84 5768 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
395dfc19 5769 foreach my $alias ($self->aliases) {
fd1e3e84 5770 $alias->set_ucd($put_in_pod);
395dfc19
KW
5771
5772 # MAP_PROXIED doesn't affect the match tables
5773 next if $fate == $MAP_PROXIED;
fd1e3e84 5774 $alias->set_make_re_pod_entry($put_in_pod);
395dfc19
KW
5775 }
5776 }
5777
301ba948
KW
5778 # Save the reason for suppression for output
5779 if ($fate == $SUPPRESSED && defined $reason) {
5780 $why_suppressed{$complete_name{$addr}} = $reason;
5781 }
5782
5783 return;
5784 }
5785
99870f4d
KW
5786 sub lock {
5787 # Don't allow changes to the table from now on. This stores a stack
5788 # trace of where it was called, so that later attempts to modify it
5789 # can immediately show where it got locked.
5790
5791 my $self = shift;
5792 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5793
ffe43484 5794 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5795
5796 $locked{$addr} = "";
5797
5798 my $line = (caller(0))[2];
5799 my $i = 1;
5800
5801 # Accumulate the stack trace
5802 while (1) {
5803 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5804
5805 last unless defined $caller;
5806
5807 $locked{$addr} .= " called from $caller() at line $line\n";
5808 $line = $caller_line;
5809 }
5810 $locked{$addr} .= " called from main at line $line\n";
5811
5812 return;
5813 }
5814
5815 sub carp_if_locked {
5816 # Return whether a table is locked or not, and, by the way, complain
5817 # if is locked
5818
5819 my $self = shift;
5820 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5821
ffe43484 5822 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5823
5824 return 0 if ! $locked{$addr};
5825 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5826 return 1;
5827 }
5828
5829 sub set_file_path { # Set the final directory path for this table
5830 my $self = shift;
5831 # Rest of parameters passed on
5832
f998e60c 5833 no overloading;
051df77b 5834 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
5835 return
5836 }
5837
5838 # Accessors for the range list stored in this table. First for
5839 # unconditional
ea25a9b2 5840 for my $sub (qw(
2f7a8815 5841 containing_range
99870f4d
KW
5842 contains
5843 count
5844 each_range
5845 hash
5846 is_empty
09aba7e4 5847 matches_identically_to
99870f4d
KW
5848 max
5849 min
5850 range_count
5851 reset_each_range
0a9dbafc 5852 type_of
99870f4d 5853 value_of
ea25a9b2 5854 ))
99870f4d
KW
5855 {
5856 no strict "refs";
5857 *$sub = sub {
5858 use strict "refs";
5859 my $self = shift;
ec40ee88 5860 return $self->_range_list->$sub(@_);
99870f4d
KW
5861 }
5862 }
5863
5864 # Then for ones that should fail if locked
ea25a9b2 5865 for my $sub (qw(
99870f4d 5866 delete_range
ea25a9b2 5867 ))
99870f4d
KW
5868 {
5869 no strict "refs";
5870 *$sub = sub {
5871 use strict "refs";
5872 my $self = shift;
5873
5874 return if $self->carp_if_locked;
f998e60c 5875 no overloading;
ec40ee88 5876 return $self->_range_list->$sub(@_);
99870f4d
KW
5877 }
5878 }
5879
5880} # End closure
5881
5882package Map_Table;
5883use base '_Base_Table';
5884
5885# A Map Table is a table that contains the mappings from code points to
5886# values. There are two weird cases:
5887# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5888# are written in the table's file at the end of the table nonetheless. It
5889# requires specially constructed code to handle these; utf8.c can not read
5890# these in, so they should not go in $map_directory. As of this writing,
5891# the only case that these happen is for named sequences used in
5892# charnames.pm. But this code doesn't enforce any syntax on these, so
5893# something else could come along that uses it.
5894# 2) Specials are anything that doesn't fit syntactically into the body of the
5895# table. The ranges for these have a map type of non-zero. The code below
5896# knows about and handles each possible type. In most cases, these are
5897# written as part of the header.
5898#
5899# A map table deliberately can't be manipulated at will unlike match tables.
5900# This is because of the ambiguities having to do with what to do with
5901# overlapping code points. And there just isn't a need for those things;
5902# what one wants to do is just query, add, replace, or delete mappings, plus
5903# write the final result.
5904# However, there is a method to get the list of possible ranges that aren't in
5905# this table to use for defaulting missing code point mappings. And,
5906# map_add_or_replace_non_nulls() does allow one to add another table to this
5907# one, but it is clearly very specialized, and defined that the other's
5908# non-null values replace this one's if there is any overlap.
5909
5910sub trace { return main::trace(@_); }
5911
5912{ # Closure
5913
5914 main::setup_package();
5915
5916 my %default_map;
5917 # Many input files omit some entries; this gives what the mapping for the
5918 # missing entries should be
5919 main::set_access('default_map', \%default_map, 'r');
5920
5921 my %anomalous_entries;
5922 # Things that go in the body of the table which don't fit the normal
5923 # scheme of things, like having a range. Not much can be done with these
5924 # once there except to output them. This was created to handle named
5925 # sequences.
5926 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5927 main::set_access('anomalous_entries', # Append singular, read plural
5928 \%anomalous_entries,
5929 'readable_array');
5930
99870f4d 5931 my %to_output_map;
bbed833a 5932 # Enum as to whether or not to write out this map table, and how:
c12f2655 5933 # 0 don't output
8572ace0
KW
5934 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5935 # it should not be removed nor its format changed. This
5936 # is done for those files that have traditionally been
5937 # output.
5938 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5939 # with this file
ce712c88
KW
5940 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
5941 # outputting the actual mappings as-is, we adjust things
5942 # to create a much more compact table. Only those few
5943 # tables where the mapping is convertible at least to an
5944 # integer and compacting makes a big difference should
5945 # have this. Hence, the default is to not do this
5946 # unless the table's default mapping is to $CODE_POINT,
5947 # and the range size is not 1.
99870f4d
KW
5948 main::set_access('to_output_map', \%to_output_map, 's');
5949
99870f4d
KW
5950 sub new {
5951 my $class = shift;
5952 my $name = shift;
5953
5954 my %args = @_;
5955
5956 # Optional initialization data for the table.
5957 my $initialize = delete $args{'Initialize'};
5958
99870f4d 5959 my $default_map = delete $args{'Default_Map'};
99870f4d 5960 my $property = delete $args{'_Property'};
23e33b60 5961 my $full_name = delete $args{'Full_Name'};
bbed833a 5962 my $to_output_map = delete $args{'To_Output_Map'};
20863809 5963
99870f4d
KW
5964 # Rest of parameters passed on
5965
5966 my $range_list = Range_Map->new(Owner => $property);
5967
5968 my $self = $class->SUPER::new(
5969 Name => $name,
23e33b60
KW
5970 Complete_Name => $full_name,
5971 Full_Name => $full_name,
99870f4d
KW
5972 _Property => $property,
5973 _Range_List => $range_list,
5974 %args);
5975
ffe43484 5976 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5977
5978 $anomalous_entries{$addr} = [];
99870f4d 5979 $default_map{$addr} = $default_map;
bbed833a 5980 $to_output_map{$addr} = $to_output_map;
99870f4d
KW
5981
5982 $self->initialize($initialize) if defined $initialize;
5983
5984 return $self;
5985 }
5986
5987 use overload
5988 fallback => 0,
5989 qw("") => "_operator_stringify",
5990 ;
5991
5992 sub _operator_stringify {
5993 my $self = shift;
5994
5995 my $name = $self->property->full_name;
5996 $name = '""' if $name eq "";
5997 return "Map table for Property '$name'";
5998 }
5999
99870f4d
KW
6000 sub add_alias {
6001 # Add a synonym for this table (which means the property itself)
6002 my $self = shift;
6003 my $name = shift;
6004 # Rest of parameters passed on.
6005
6006 $self->SUPER::add_alias($name, $self->property, @_);
6007 return;
6008 }
6009
6010 sub add_map {
6011 # Add a range of code points to the list of specially-handled code
6012 # points. $MULTI_CP is assumed if the type of special is not passed
6013 # in.
6014
6015 my $self = shift;
6016 my $lower = shift;
6017 my $upper = shift;
6018 my $string = shift;
6019 my %args = @_;
6020
6021 my $type = delete $args{'Type'} || 0;
6022 # Rest of parameters passed on
6023
6024 # Can't change the table if locked.
6025 return if $self->carp_if_locked;
6026
ffe43484 6027 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 6028
99870f4d
KW
6029 $self->_range_list->add_map($lower, $upper,
6030 $string,
6031 @_,
6032 Type => $type);
6033 return;
6034 }
6035
6036 sub append_to_body {
6037 # Adds to the written HERE document of the table's body any anomalous
6038 # entries in the table..
6039
6040 my $self = shift;
6041 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6042
ffe43484 6043 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6044
6045 return "" unless @{$anomalous_entries{$addr}};
6046 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6047 }
6048
6049 sub map_add_or_replace_non_nulls {
6050 # This adds the mappings in the table $other to $self. Non-null
6051 # mappings from $other override those in $self. It essentially merges
6052 # the two tables, with the second having priority except for null
6053 # mappings.
6054
6055 my $self = shift;
6056 my $other = shift;
6057 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6058
6059 return if $self->carp_if_locked;
6060
6061 if (! $other->isa(__PACKAGE__)) {
6062 Carp::my_carp_bug("$other should be a "
6063 . __PACKAGE__
6064 . ". Not a '"
6065 . ref($other)
6066 . "'. Not added;");
6067 return;
6068 }
6069
ffe43484
NC
6070 my $addr = do { no overloading; pack 'J', $self; };
6071 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6072
6073 local $to_trace = 0 if main::DEBUG;
6074
6075 my $self_range_list = $self->_range_list;
6076 my $other_range_list = $other->_range_list;
6077 foreach my $range ($other_range_list->ranges) {
6078 my $value = $range->value;
6079 next if $value eq "";
6080 $self_range_list->_add_delete('+',
6081 $range->start,
6082 $range->end,
6083 $value,
6084 Type => $range->type,
6085 Replace => $UNCONDITIONALLY);
6086 }
6087
99870f4d
KW
6088 return;
6089 }
6090
6091 sub set_default_map {
6092 # Define what code points that are missing from the input files should
6093 # map to
6094
6095 my $self = shift;
6096 my $map = shift;
6097 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6098
ffe43484 6099 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6100
6101 # Convert the input to the standard equivalent, if any (won't have any
6102 # for $STRING properties)
6103 my $standard = $self->_find_table_from_alias->{$map};
6104 $map = $standard->name if defined $standard;
6105
6106 # Warn if there already is a non-equivalent default map for this
6107 # property. Note that a default map can be a ref, which means that
6108 # what it actually means is delayed until later in the program, and it
6109 # IS permissible to override it here without a message.
6110 my $default_map = $default_map{$addr};
6111 if (defined $default_map
6112 && ! ref($default_map)
6113 && $default_map ne $map
6114 && main::Standardize($map) ne $default_map)
6115 {
6116 my $property = $self->property;
6117 my $map_table = $property->table($map);
6118 my $default_table = $property->table($default_map);
6119 if (defined $map_table
6120 && defined $default_table
6121 && $map_table != $default_table)
6122 {
6123 Carp::my_carp("Changing the default mapping for "
6124 . $property
6125 . " from $default_map to $map'");
6126 }
6127 }
6128
6129 $default_map{$addr} = $map;
6130
6131 # Don't also create any missing table for this map at this point,
6132 # because if we did, it could get done before the main table add is
6133 # done for PropValueAliases.txt; instead the caller will have to make
6134 # sure it exists, if desired.
6135 return;
6136 }
6137
6138 sub to_output_map {
6139 # Returns boolean: should we write this map table?
6140
6141 my $self = shift;
6142 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6143
ffe43484 6144 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6145
6146 # If overridden, use that
6147 return $to_output_map{$addr} if defined $to_output_map{$addr};
6148
6149 my $full_name = $self->full_name;
fcf1973c
KW
6150 return $global_to_output_map{$full_name}
6151 if defined $global_to_output_map{$full_name};
99870f4d 6152
20863809 6153 # If table says to output, do so; if says to suppress it, do so.
301ba948
KW
6154 my $fate = $self->fate;
6155 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
8572ace0 6156 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
395dfc19 6157 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
99870f4d
KW
6158
6159 my $type = $self->property->type;
6160
6161 # Don't want to output binary map tables even for debugging.
6162 return 0 if $type == $BINARY;
6163
ed307795
KW
6164 # But do want to output string ones. All the ones that remain to
6165 # be dealt with (i.e. which haven't explicitly been set to external)
bf7fe2df
KW
6166 # are for internal Perl use only. The default for those that map to
6167 # $CODE_POINT and haven't been restricted to a single element range
ce712c88 6168 # is to use the adjusted form.
bf7fe2df
KW
6169 if ($type == $STRING) {
6170 return $INTERNAL_MAP if $self->range_size_1
6171 || $default_map{$addr} ne $CODE_POINT;
ce712c88 6172 return $OUTPUT_ADJUSTED;
bf7fe2df 6173 }
99870f4d 6174
8572ace0
KW
6175 # Otherwise is an $ENUM, do output it, for Perl's purposes
6176 return $INTERNAL_MAP;
99870f4d
KW
6177 }
6178
6179 sub inverse_list {
6180 # Returns a Range_List that is gaps of the current table. That is,
6181 # the inversion
6182
6183 my $self = shift;
6184 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6185
6186 my $current = Range_List->new(Initialize => $self->_range_list,
6187 Owner => $self->property);
6188 return ~ $current;
6189 }
6190
8572ace0
KW
6191 sub header {
6192 my $self = shift;
6193 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6194
6195 my $return = $self->SUPER::header();
6196
bbed833a 6197 if ($self->to_output_map >= $INTERNAL_MAP) {
ae92a9ae
KW
6198 $return .= $INTERNAL_ONLY_HEADER;
6199 }
6200 else {
26cef665 6201 my $property_name = $self->property->full_name =~ s/Legacy_//r;
ae92a9ae
KW
6202 $return .= <<END;
6203
6204# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
6205
6206# This file is for internal use by core Perl only. It is retained for
6207# backwards compatibility with applications that may have come to rely on it,
6208# but its format and even its name or existence are subject to change without
6209# notice in a future Perl version. Don't use it directly. Instead, its
6210# contents are now retrievable through a stable API in the Unicode::UCD
6211# module: Unicode::UCD::prop_invmap('$property_name').
6212END
6213 }
8572ace0
KW
6214 return $return;
6215 }
6216
99870f4d
KW
6217 sub set_final_comment {
6218 # Just before output, create the comment that heads the file
6219 # containing this table.
6220
bd9ebcfd
KW
6221 return unless $debugging_build;
6222
99870f4d
KW
6223 my $self = shift;
6224 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6225
6226 # No sense generating a comment if aren't going to write it out.
6227 return if ! $self->to_output_map;
6228
ffe43484 6229 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6230
6231 my $property = $self->property;
6232
6233 # Get all the possible names for this property. Don't use any that
6234 # aren't ok for use in a file name, etc. This is perhaps causing that
6235 # flag to do double duty, and may have to be changed in the future to
6236 # have our own flag for just this purpose; but it works now to exclude
6237 # Perl generated synonyms from the lists for properties, where the
6238 # name is always the proper Unicode one.
0eac1e20 6239 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
99870f4d
KW
6240
6241 my $count = $self->count;
6242 my $default_map = $default_map{$addr};
6243
6244 # The ranges that map to the default aren't output, so subtract that
6245 # to get those actually output. A property with matching tables
6246 # already has the information calculated.
6247 if ($property->type != $STRING) {
6248 $count -= $property->table($default_map)->count;
6249 }
6250 elsif (defined $default_map) {
6251
6252 # But for $STRING properties, must calculate now. Subtract the
6253 # count from each range that maps to the default.
6254 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
6255 if ($range->value eq $default_map) {
6256 $count -= $range->end +1 - $range->start;
6257 }
6258 }
6259
6260 }
6261
6262 # Get a string version of $count with underscores in large numbers,
6263 # for clarity.
6264 my $string_count = main::clarify_number($count);
6265
6266 my $code_points = ($count == 1)
6267 ? 'single code point'
6268 : "$string_count code points";
6269
6270 my $mapping;
6271 my $these_mappings;
6272 my $are;
6273 if (@property_aliases <= 1) {
6274 $mapping = 'mapping';
6275 $these_mappings = 'this mapping';
6276 $are = 'is'
6277 }
6278 else {
6279 $mapping = 'synonymous mappings';
6280 $these_mappings = 'these mappings';
6281 $are = 'are'
6282 }
6283 my $cp;
6284 if ($count >= $MAX_UNICODE_CODEPOINTS) {
6285 $cp = "any code point in Unicode Version $string_version";
6286 }
6287 else {
6288 my $map_to;
6289 if ($default_map eq "") {
6290 $map_to = 'the null string';
6291 }
6292 elsif ($default_map eq $CODE_POINT) {
6293 $map_to = "itself";
6294 }
6295 else {
6296 $map_to = "'$default_map'";
6297 }
6298 if ($count == 1) {
6299 $cp = "the single code point";
6300 }
6301 else {
6302 $cp = "one of the $code_points";
6303 }
6304 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6305 }
6306
6307 my $comment = "";
6308
6309 my $status = $self->status;
26e2a4d8 6310 if ($status && $status ne $PLACEHOLDER) {
99870f4d
KW
6311 my $warn = uc $status_past_participles{$status};
6312 $comment .= <<END;
6313
6314!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
6315 All property or property=value combinations contained in this file are $warn.
6316 See $unicode_reference_url for what this means.
6317
6318END
6319 }
6320 $comment .= "This file returns the $mapping:\n";
6321
26cef665
KW
6322 my $ucd_accessible_name = "";
6323 my $full_name = $self->property->full_name;
99870f4d 6324 for my $i (0 .. @property_aliases - 1) {
26cef665
KW
6325 my $name = $property_aliases[$i]->name;
6326 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6327 if ($property_aliases[$i]->ucd) {
6328 if ($name eq $full_name) {
6329 $ucd_accessible_name = $full_name;
6330 }
6331 elsif (! $ucd_accessible_name) {
6332 $ucd_accessible_name = $name;
6333 }
6334 }
6335 }
6336 $comment .= "\nwhere 'cp' is $cp.";
6337 if ($ucd_accessible_name) {
6338 $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
99870f4d 6339 }
99870f4d
KW
6340
6341 # And append any commentary already set from the actual property.
6342 $comment .= "\n\n" . $self->comment if $self->comment;
6343 if ($self->description) {
6344 $comment .= "\n\n" . join " ", $self->description;
6345 }
6346 if ($self->note) {
6347 $comment .= "\n\n" . join " ", $self->note;
6348 }
6349 $comment .= "\n";
6350
6351 if (! $self->perl_extension) {
6352 $comment .= <<END;
6353
6354For information about what this property really means, see:
6355$unicode_reference_url
6356END
6357 }
6358
6359 if ($count) { # Format differs for empty table
6360 $comment.= "\nThe format of the ";
6361 if ($self->range_size_1) {
6362 $comment.= <<END;
6363main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6364is in hex; MAPPING is what CODE_POINT maps to.
6365END
6366 }
6367 else {
6368
6369 # There are tables which end up only having one element per
6370 # range, but it is not worth keeping track of for making just
6371 # this comment a little better.
6372 $comment.= <<END;
6373non-comment portions of the main body of lines of this file is:
6374START\\tSTOP\\tMAPPING where START is the starting code point of the
6375range, in hex; STOP is the ending point, or if omitted, the range has just one
6376code point; MAPPING is what each code point between START and STOP maps to.
6377END
0c07e538 6378 if ($self->output_range_counts) {
99870f4d
KW
6379 $comment .= <<END;
6380Numbers in comments in [brackets] indicate how many code points are in the
6381range (omitted when the range is a single code point or if the mapping is to
6382the null string).
6383END
6384 }
6385 }
6386 }
6387 $self->set_comment(main::join_lines($comment));
6388 return;
6389 }
6390
6391 my %swash_keys; # Makes sure don't duplicate swash names.
6392
668b3bfc
KW
6393 # The remaining variables are temporaries used while writing each table,
6394 # to output special ranges.
668b3bfc
KW
6395 my @multi_code_point_maps; # Map is to more than one code point.
6396
668b3bfc
KW
6397 sub handle_special_range {
6398 # Called in the middle of write when it finds a range it doesn't know
6399 # how to handle.
6400
6401 my $self = shift;
6402 my $range = shift;
6403 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6404
6405 my $addr = do { no overloading; pack 'J', $self; };
6406
6407 my $type = $range->type;
6408
6409 my $low = $range->start;
6410 my $high = $range->end;
6411 my $map = $range->value;
6412
6413 # No need to output the range if it maps to the default.
6414 return if $map eq $default_map{$addr};
6415
bb1dd3da
KW
6416 my $property = $self->property;
6417
668b3bfc
KW
6418 # Switch based on the map type...
6419 if ($type == $HANGUL_SYLLABLE) {
6420
6421 # These are entirely algorithmically determinable based on
6422 # some constants furnished by Unicode; for now, just set a
6423 # flag to indicate that have them. After everything is figured
bb1dd3da
KW
6424 # out, we will output the code that does the algorithm. (Don't
6425 # output them if not needed because we are suppressing this
6426 # property.)
6427 $has_hangul_syllables = 1 if $property->to_output_map;
668b3bfc
KW
6428 }
6429 elsif ($type == $CP_IN_NAME) {
6430
bb1dd3da 6431 # Code points whose name ends in their code point are also
668b3bfc
KW
6432 # algorithmically determinable, but need information about the map
6433 # to do so. Both the map and its inverse are stored in data
bb1dd3da
KW
6434 # structures output in the file. They are stored in the mean time
6435 # in global lists The lists will be written out later into Name.pm,
6436 # which is created only if needed. In order to prevent duplicates
6437 # in the list, only add to them for one property, should multiple
6438 # ones need them.
6439 if ($needing_code_points_ending_in_code_point == 0) {
6440 $needing_code_points_ending_in_code_point = $property;
6441 }
6442 if ($property == $needing_code_points_ending_in_code_point) {
6c1bafed
KW
6443 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6444 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6445
6446 my $squeezed = $map =~ s/[-\s]+//gr;
6447 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6448 $low;
6449 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6450 $high;
6451
6452 push @code_points_ending_in_code_point, { low => $low,
6453 high => $high,
6454 name => $map
6455 };
bb1dd3da 6456 }
668b3bfc
KW
6457 }
6458 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6459
6460 # Multi-code point maps and null string maps have an entry
6461 # for each code point in the range. They use the same
6462 # output format.
6463 for my $code_point ($low .. $high) {
6464
c12f2655
KW
6465 # The pack() below can't cope with surrogates. XXX This may
6466 # no longer be true
668b3bfc 6467 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
98dc9551 6468 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
668b3bfc
KW
6469 next;
6470 }
6471
6472 # Generate the hash entries for these in the form that
6473 # utf8.c understands.
6474 my $tostr = "";
6475 my $to_name = "";
6476 my $to_chr = "";
6477 foreach my $to (split " ", $map) {
6478 if ($to !~ /^$code_point_re$/) {
6479 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6480 next;
6481 }
6482 $tostr .= sprintf "\\x{%s}", $to;
6483 $to = CORE::hex $to;
558712cf 6484 if ($annotate) {
c4019d52
KW
6485 $to_name .= " + " if $to_name;
6486 $to_chr .= chr($to);
6487 main::populate_char_info($to)
6488 if ! defined $viacode[$to];
6489 $to_name .= $viacode[$to];
6490 }
668b3bfc
KW
6491 }
6492
6493 # I (khw) have never waded through this line to
6494 # understand it well enough to comment it.
6495 my $utf8 = sprintf(qq["%s" => "$tostr",],
6496 join("", map { sprintf "\\x%02X", $_ }
6497 unpack("U0C*", pack("U", $code_point))));
6498
6499 # Add a comment so that a human reader can more easily
6500 # see what's going on.
6501 push @multi_code_point_maps,
6502 sprintf("%-45s # U+%04X", $utf8, $code_point);
558712cf 6503 if (! $annotate) {
c4019d52
KW
6504 $multi_code_point_maps[-1] .= " => $map";
6505 }
6506 else {
6507 main::populate_char_info($code_point)
6508 if ! defined $viacode[$code_point];
6509 $multi_code_point_maps[-1] .= " '"
6510 . chr($code_point)
6511 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6512 }
668b3bfc
KW
6513 }
6514 }
6515 else {
6516 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
6517 }
6518
6519 return;
6520 }
6521
99870f4d
KW
6522 sub pre_body {
6523 # Returns the string that should be output in the file before the main
668b3bfc
KW
6524 # body of this table. It isn't called until the main body is
6525 # calculated, saving a pass. The string includes some hash entries
6526 # identifying the format of the body, and what the single value should
6527 # be for all ranges missing from it. It also includes any code points
6528 # which have map_types that don't go in the main table.
99870f4d
KW
6529
6530 my $self = shift;
6531 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6532
ffe43484 6533 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6534
6535 my $name = $self->property->swash_name;
6536
19f751d2
KW
6537 # Currently there is nothing in the pre_body unless a swash is being
6538 # generated.
6539 return unless defined $name;
6540
99870f4d 6541 if (defined $swash_keys{$name}) {
1675ea0d 6542 Carp::my_carp(main::join_lines(<<END
99870f4d
KW
6543Already created a swash name '$name' for $swash_keys{$name}. This means that
6544the same name desired for $self shouldn't be used. Bad News. This must be
6545fixed before production use, but proceeding anyway
6546END
6547 ));
6548 }
6549 $swash_keys{$name} = "$self";
6550
99870f4d 6551 my $pre_body = "";
99870f4d 6552
668b3bfc
KW
6553 # Here we assume we were called after have gone through the whole
6554 # file. If we actually generated anything for each map type, add its
6555 # respective header and trailer
ec2f0128 6556 my $specials_name = "";
668b3bfc 6557 if (@multi_code_point_maps) {
ec2f0128 6558 $specials_name = "utf8::ToSpec$name";
668b3bfc 6559 $pre_body .= <<END;
99870f4d
KW
6560
6561# Some code points require special handling because their mappings are each to
6562# multiple code points. These do not appear in the main body, but are defined
6563# in the hash below.
6564
76591e2b
KW
6565# Each key is the string of N bytes that together make up the UTF-8 encoding
6566# for the code point. (i.e. the same as looking at the code point's UTF-8
6567# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
ec2f0128 6568\%$specials_name = (
99870f4d 6569END
668b3bfc
KW
6570 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6571 }
99870f4d 6572
668b3bfc
KW
6573 my $format = $self->format;
6574
bbed833a
KW
6575 my $return = "";
6576
ce712c88
KW
6577 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6578 if ($output_adjusted) {
bbed833a
KW
6579 if ($specials_name) {
6580 $return .= <<END;
6581# The mappings in the non-hash portion of this file must be modified to get the
d11155ec
KW
6582# correct values by adding the code point ordinal number to each one that is
6583# numeric.
bbed833a
KW
6584END
6585 }
6586 else {
6587 $return .= <<END;
6588# The mappings must be modified to get the correct values by adding the code
d11155ec 6589# point ordinal number to each one that is numeric.
bbed833a
KW
6590END
6591 }
6592 }
6593
6594 $return .= <<END;
6595
668b3bfc
KW
6596# The name this swash is to be known by, with the format of the mappings in
6597# the main body of the table, and what all code points missing from this file
6598# map to.
6599\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6600END
ec2f0128 6601 if ($specials_name) {
d59563d0 6602 $return .= <<END;
ec2f0128
KW
6603\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6604END
6605 }
668b3bfc 6606 my $default_map = $default_map{$addr};
bbed833a 6607
ce712c88 6608 # For $CODE_POINT default maps and using adjustments, instead the default
bbed833a
KW
6609 # becomes zero.
6610 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
ce712c88 6611 . (($output_adjusted && $default_map eq $CODE_POINT)
bbed833a
KW
6612 ? "0"
6613 : $default_map)
6614 . "';";
668b3bfc
KW
6615
6616 if ($default_map eq $CODE_POINT) {
6617 $return .= ' # code point maps to itself';
6618 }
6619 elsif ($default_map eq "") {
6620 $return .= ' # code point maps to the null string';
6621 }
6622 $return .= "\n";
6623
6624 $return .= $pre_body;
6625
6626 return $return;
6627 }
6628
6629 sub write {
6630 # Write the table to the file.
6631
6632 my $self = shift;
6633 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6634
6635 my $addr = do { no overloading; pack 'J', $self; };
6636
6637 # Clear the temporaries
668b3bfc 6638 undef @multi_code_point_maps;
99870f4d
KW
6639
6640 # Calculate the format of the table if not already done.
f5817e0a 6641 my $format = $self->format;
668b3bfc
KW
6642 my $type = $self->property->type;
6643 my $default_map = $self->default_map;
99870f4d
KW
6644 if (! defined $format) {
6645 if ($type == $BINARY) {
6646
6647 # Don't bother checking the values, because we elsewhere
6648 # verify that a binary table has only 2 values.
6649 $format = $BINARY_FORMAT;
6650 }
6651 else {
6652 my @ranges = $self->_range_list->ranges;
6653
6654 # default an empty table based on its type and default map
6655 if (! @ranges) {
6656
6657 # But it turns out that the only one we can say is a
6658 # non-string (besides binary, handled above) is when the
6659 # table is a string and the default map is to a code point
6660 if ($type == $STRING && $default_map eq $CODE_POINT) {
6661 $format = $HEX_FORMAT;
6662 }
6663 else {
6664 $format = $STRING_FORMAT;
6665 }
6666 }
6667 else {
6668
6669 # Start with the most restrictive format, and as we find
6670 # something that doesn't fit with that, change to the next
6671 # most restrictive, and so on.
6672 $format = $DECIMAL_FORMAT;
6673 foreach my $range (@ranges) {
668b3bfc
KW
6674 next if $range->type != 0; # Non-normal ranges don't
6675 # affect the main body
99870f4d
KW
6676 my $map = $range->value;
6677 if ($map ne $default_map) {
6678 last if $format eq $STRING_FORMAT; # already at
6679 # least
6680 # restrictive
6681 $format = $INTEGER_FORMAT
6682 if $format eq $DECIMAL_FORMAT
6683 && $map !~ / ^ [0-9] $ /x;
6684 $format = $FLOAT_FORMAT
6685 if $format eq $INTEGER_FORMAT
6686 && $map !~ / ^ -? [0-9]+ $ /x;
6687 $format = $RATIONAL_FORMAT
6688 if $format eq $FLOAT_FORMAT
6689 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6690 $format = $HEX_FORMAT
b91749bc
KW
6691 if ($format eq $RATIONAL_FORMAT
6692 && $map !~
6693 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6694 # Assume a leading zero means hex,
6695 # even if all digits are 0-9
6696 || ($format eq $INTEGER_FORMAT
6342d445 6697 && $map =~ /^0[0-9A-F]/);
99870f4d
KW
6698 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6699 && $map =~ /[^0-9A-F]/;
6700 }
6701 }
6702 }
6703 }
6704 } # end of calculating format
6705
668b3bfc 6706 if ($default_map eq $CODE_POINT
99870f4d 6707 && $format ne $HEX_FORMAT
668b3bfc
KW
6708 && ! defined $self->format) # manual settings are always
6709 # considered ok
99870f4d
KW
6710 {
6711 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6712 }
99870f4d 6713
ce712c88
KW
6714 # If the output is to be adjusted, the format of the table that gets
6715 # output is actually 'a' instead of whatever it is stored internally
6716 # as.
6717 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6718 if ($output_adjusted) {
d11155ec 6719 $format = $ADJUST_FORMAT;
bbed833a
KW
6720 }
6721
668b3bfc 6722 $self->_set_format($format);
99870f4d
KW
6723
6724 return $self->SUPER::write(
ce712c88 6725 $output_adjusted,
99870f4d
KW
6726 ($self->property == $block)
6727 ? 7 # block file needs more tab stops
6728 : 3,
668b3bfc 6729 $default_map); # don't write defaulteds
99870f4d
KW
6730 }
6731
6732 # Accessors for the underlying list that should fail if locked.
ea25a9b2 6733 for my $sub (qw(
99870f4d 6734 add_duplicate
ea25a9b2 6735 ))
99870f4d
KW
6736 {
6737 no strict "refs";
6738 *$sub = sub {
6739 use strict "refs";
6740 my $self = shift;
6741
6742 return if $self->carp_if_locked;
6743 return $self->_range_list->$sub(@_);
6744 }
6745 }
6746} # End closure for Map_Table
6747
6748package Match_Table;
6749use base '_Base_Table';
6750
6751# A Match table is one which is a list of all the code points that have
6752# the same property and property value, for use in \p{property=value}
6753# constructs in regular expressions. It adds very little data to the base
6754# structure, but many methods, as these lists can be combined in many ways to
6755# form new ones.
6756# There are only a few concepts added:
6757# 1) Equivalents and Relatedness.
6758# Two tables can match the identical code points, but have different names.
6759# This always happens when there is a perl single form extension
6760# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6761# tables are set to be related, with the Perl extension being a child, and
6762# the Unicode property being the parent.
6763#
6764# It may be that two tables match the identical code points and we don't
6765# know if they are related or not. This happens most frequently when the
6766# Block and Script properties have the exact range. But note that a
6767# revision to Unicode could add new code points to the script, which would
6768# now have to be in a different block (as the block was filled, or there
6769# would have been 'Unknown' script code points in it and they wouldn't have
6770# been identical). So we can't rely on any two properties from Unicode
6771# always matching the same code points from release to release, and thus
6772# these tables are considered coincidentally equivalent--not related. When
6773# two tables are unrelated but equivalent, one is arbitrarily chosen as the
6774# 'leader', and the others are 'equivalents'. This concept is useful
6775# to minimize the number of tables written out. Only one file is used for
6776# any identical set of code points, with entries in Heavy.pl mapping all
6777# the involved tables to it.
6778#
6779# Related tables will always be identical; we set them up to be so. Thus
6780# if the Unicode one is deprecated, the Perl one will be too. Not so for
6781# unrelated tables. Relatedness makes generating the documentation easier.
6782#
c12f2655
KW
6783# 2) Complement.
6784# Like equivalents, two tables may be the inverses of each other, the
6785# intersection between them is null, and the union is every Unicode code
6786# point. The two tables that occupy a binary property are necessarily like
6787# this. By specifying one table as the complement of another, we can avoid
6788# storing it on disk (using the other table and performing a fast
6789# transform), and some memory and calculations.
6790#
6791# 3) Conflicting. It may be that there will eventually be name clashes, with
99870f4d
KW
6792# the same name meaning different things. For a while, there actually were
6793# conflicts, but they have so far been resolved by changing Perl's or
6794# Unicode's definitions to match the other, but when this code was written,
6795# it wasn't clear that that was what was going to happen. (Unicode changed
6796# because of protests during their beta period.) Name clashes are warned
6797# about during compilation, and the documentation. The generated tables
6798# are sane, free of name clashes, because the code suppresses the Perl
6799# version. But manual intervention to decide what the actual behavior
6800# should be may be required should this happen. The introductory comments
6801# have more to say about this.
6802
6803sub standardize { return main::standardize($_[0]); }
6804sub trace { return main::trace(@_); }
6805
6806
6807{ # Closure
6808
6809 main::setup_package();
6810
6811 my %leader;
6812 # The leader table of this one; initially $self.
6813 main::set_access('leader', \%leader, 'r');
6814
6815 my %equivalents;
6816 # An array of any tables that have this one as their leader
6817 main::set_access('equivalents', \%equivalents, 'readable_array');
6818
6819 my %parent;
6820 # The parent table to this one, initially $self. This allows us to
c12f2655
KW
6821 # distinguish between equivalent tables that are related (for which this
6822 # is set to), and those which may not be, but share the same output file
6823 # because they match the exact same set of code points in the current
6824 # Unicode release.
99870f4d
KW
6825 main::set_access('parent', \%parent, 'r');
6826
6827 my %children;
6828 # An array of any tables that have this one as their parent
6829 main::set_access('children', \%children, 'readable_array');
6830
6831 my %conflicting;
6832 # Array of any tables that would have the same name as this one with
6833 # a different meaning. This is used for the generated documentation.
6834 main::set_access('conflicting', \%conflicting, 'readable_array');
6835
6836 my %matches_all;
6837 # Set in the constructor for tables that are expected to match all code
6838 # points.
6839 main::set_access('matches_all', \%matches_all, 'r');
6840
a92d5c2e
KW
6841 my %complement;
6842 # Points to the complement that this table is expressed in terms of; 0 if
6843 # none.
8ae00c8a 6844 main::set_access('complement', \%complement, 'r');
a92d5c2e 6845
99870f4d
KW
6846 sub new {
6847 my $class = shift;
6848
6849 my %args = @_;
6850
6851 # The property for which this table is a listing of property values.
6852 my $property = delete $args{'_Property'};
6853
23e33b60
KW
6854 my $name = delete $args{'Name'};
6855 my $full_name = delete $args{'Full_Name'};
6856 $full_name = $name if ! defined $full_name;
6857
99870f4d
KW
6858 # Optional
6859 my $initialize = delete $args{'Initialize'};
6860 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 6861 my $format = delete $args{'Format'};
99870f4d
KW
6862 # Rest of parameters passed on.
6863
6864 my $range_list = Range_List->new(Initialize => $initialize,
6865 Owner => $property);
6866
23e33b60
KW
6867 my $complete = $full_name;
6868 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6869 # but this helps debug if it
6870 # does
6871 # The complete name for a match table includes it's property in a
6872 # compound form 'property=table', except if the property is the
6873 # pseudo-property, perl, in which case it is just the single form,
6874 # 'table' (If you change the '=' must also change the ':' in lots of
6875 # places in this program that assume an equal sign)
6876 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 6877
99870f4d 6878 my $self = $class->SUPER::new(%args,
23e33b60
KW
6879 Name => $name,
6880 Complete_Name => $complete,
6881 Full_Name => $full_name,
99870f4d
KW
6882 _Property => $property,
6883 _Range_List => $range_list,
f5817e0a 6884 Format => $EMPTY_FORMAT,
99870f4d 6885 );
ffe43484 6886 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6887
6888 $conflicting{$addr} = [ ];
6889 $equivalents{$addr} = [ ];
6890 $children{$addr} = [ ];
6891 $matches_all{$addr} = $matches_all;
6892 $leader{$addr} = $self;
6893 $parent{$addr} = $self;
a92d5c2e 6894 $complement{$addr} = 0;
99870f4d 6895
f5817e0a
KW
6896 if (defined $format && $format ne $EMPTY_FORMAT) {
6897 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6898 }
6899
99870f4d
KW
6900 return $self;
6901 }
6902
6903 # See this program's beginning comment block about overloading these.
6904 use overload
6905 fallback => 0,
6906 qw("") => "_operator_stringify",
6907 '=' => sub {
6908 my $self = shift;
6909
6910 return if $self->carp_if_locked;
6911 return $self;
6912 },
6913
6914 '+' => sub {
6915 my $self = shift;
6916 my $other = shift;
6917
6918 return $self->_range_list + $other;
6919 },
6920 '&' => sub {
6921 my $self = shift;
6922 my $other = shift;
6923
6924 return $self->_range_list & $other;
6925 },
6926 '+=' => sub {
6927 my $self = shift;
6928 my $other = shift;
5bfb1762
KW
6929 my $reversed = shift;
6930
6931 if ($reversed) {
6932 Carp::my_carp_bug("Bad news. Can't cope with '"
6933 . ref($other)
6934 . ' += '
6935 . ref($self)
6936 . "'. undef returned.");
6937 return;
6938 }
99870f4d
KW
6939
6940 return if $self->carp_if_locked;
6941
ffe43484 6942 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6943
6944 if (ref $other) {
6945
6946 # Change the range list of this table to be the
6947 # union of the two.
6948 $self->_set_range_list($self->_range_list
6949 + $other);
6950 }
6951 else { # $other is just a simple value
6952 $self->add_range($other, $other);
6953 }
6954 return $self;
6955 },
a0a95192
KW
6956 '&=' => sub {
6957 my $self = shift;
6958 my $other = shift;
c4f8b45f
KW
6959 my $reversed = shift;
6960
6961 if ($reversed) {
6962 Carp::my_carp_bug("Bad news. Can't cope with '"
6963 . ref($other)
6964 . ' &= '
6965 . ref($self)
6966 . "'. undef returned.");
6967 return;
6968 }
a0a95192
KW
6969
6970 return if $self->carp_if_locked;
6971 $self->_set_range_list($self->_range_list & $other);
6972 return $self;
6973 },
99870f4d
KW
6974 '-' => sub { my $self = shift;
6975 my $other = shift;
6976 my $reversed = shift;
99870f4d 6977 if ($reversed) {
5d9b6ded
KW
6978 Carp::my_carp_bug("Bad news. Can't cope with '"
6979 . ref($other)
6980 . ' - '
6981 . ref($self)
6982 . "'. undef returned.");
99870f4d
KW
6983 return;
6984 }
6985
6986 return $self->_range_list - $other;
6987 },
6988 '~' => sub { my $self = shift;
6989 return ~ $self->_range_list;
6990 },
6991 ;
6992
6993 sub _operator_stringify {
6994 my $self = shift;
6995
23e33b60 6996 my $name = $self->complete_name;
99870f4d
KW
6997 return "Table '$name'";
6998 }
6999
ec40ee88
KW
7000 sub _range_list {
7001 # Returns the range list associated with this table, which will be the
7002 # complement's if it has one.
7003
7004 my $self = shift;
7005 my $complement;
7006 if (($complement = $self->complement) != 0) {
7007 return ~ $complement->_range_list;
7008 }
7009 else {
7010 return $self->SUPER::_range_list;
7011 }
7012 }
7013
99870f4d
KW
7014 sub add_alias {
7015 # Add a synonym for this table. See the comments in the base class
7016
7017 my $self = shift;
7018 my $name = shift;
7019 # Rest of parameters passed on.
7020
7021 $self->SUPER::add_alias($name, $self, @_);
7022 return;
7023 }
7024
7025 sub add_conflicting {
7026 # Add the name of some other object to the list of ones that name
7027 # clash with this match table.
7028
7029 my $self = shift;
7030 my $conflicting_name = shift; # The name of the conflicting object
7031 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
7032 my $conflicting_object = shift; # Optional, the conflicting object
7033 # itself. This is used to
7034 # disambiguate the text if the input
7035 # name is identical to any of the
7036 # aliases $self is known by.
7037 # Sometimes the conflicting object is
7038 # merely hypothetical, so this has to
7039 # be an optional parameter.
7040 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7041
ffe43484 7042 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7043
7044 # Check if the conflicting name is exactly the same as any existing
7045 # alias in this table (as long as there is a real object there to
7046 # disambiguate with).
7047 if (defined $conflicting_object) {
7048 foreach my $alias ($self->aliases) {
7049 if ($alias->name eq $conflicting_name) {
7050
7051 # Here, there is an exact match. This results in
7052 # ambiguous comments, so disambiguate by changing the
7053 # conflicting name to its object's complete equivalent.
7054 $conflicting_name = $conflicting_object->complete_name;
7055 last;
7056 }
7057 }
7058 }
7059
7060 # Convert to the \p{...} final name
7061 $conflicting_name = "\\$p" . "{$conflicting_name}";
7062
7063 # Only add once
7064 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7065
7066 push @{$conflicting{$addr}}, $conflicting_name;
7067
7068 return;
7069 }
7070
6505c6e2 7071 sub is_set_equivalent_to {
99870f4d
KW
7072 # Return boolean of whether or not the other object is a table of this
7073 # type and has been marked equivalent to this one.
7074
7075 my $self = shift;
7076 my $other = shift;
7077 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7078
7079 return 0 if ! defined $other; # Can happen for incomplete early
7080 # releases
7081 unless ($other->isa(__PACKAGE__)) {
7082 my $ref_other = ref $other;
7083 my $ref_self = ref $self;
6505c6e2 7084 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
99870f4d
KW
7085 return 0;
7086 }
7087
7088 # Two tables are equivalent if they have the same leader.
f998e60c 7089 no overloading;
051df77b 7090 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
7091 return;
7092 }
7093
99870f4d
KW
7094 sub set_equivalent_to {
7095 # Set $self equivalent to the parameter table.
7096 # The required Related => 'x' parameter is a boolean indicating
7097 # whether these tables are related or not. If related, $other becomes
7098 # the 'parent' of $self; if unrelated it becomes the 'leader'
7099 #
7100 # Related tables share all characteristics except names; equivalents
7101 # not quite so many.
7102 # If they are related, one must be a perl extension. This is because
7103 # we can't guarantee that Unicode won't change one or the other in a
98dc9551 7104 # later release even if they are identical now.
99870f4d
KW
7105
7106 my $self = shift;
7107 my $other = shift;
7108
7109 my %args = @_;
7110 my $related = delete $args{'Related'};
7111
7112 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7113
7114 return if ! defined $other; # Keep on going; happens in some early
7115 # Unicode releases.
7116
7117 if (! defined $related) {
7118 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
7119 $related = 0;
7120 }
7121
7122 # If already are equivalent, no need to re-do it; if subroutine
7123 # returns null, it found an error, also do nothing
6505c6e2 7124 my $are_equivalent = $self->is_set_equivalent_to($other);
99870f4d
KW
7125 return if ! defined $are_equivalent || $are_equivalent;
7126
ffe43484 7127 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 7128 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d 7129
45e32b91
KW
7130 if ($related) {
7131 if ($current_leader->perl_extension) {
7132 if ($other->perl_extension) {
7133 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7134 return;
7135 }
7610e9e2
KW
7136 } elsif ($self->property != $other->property # Depending on
7137 # situation, might
7138 # be better to use
7139 # add_alias()
7140 # instead for same
7141 # property
7142 && ! $other->perl_extension)
7143 {
45e32b91
KW
7144 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
7145 $related = 0;
7146 }
7147 }
7148
7149 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7150 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
7151 return;
99870f4d
KW
7152 }
7153
ffe43484
NC
7154 my $leader = do { no overloading; pack 'J', $current_leader; };
7155 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
7156
7157 # Any tables that are equivalent to or children of this table must now
7158 # instead be equivalent to or (children) to the new leader (parent),
7159 # still equivalent. The equivalency includes their matches_all info,
301ba948 7160 # and for related tables, their fate and status.
99870f4d
KW
7161 # All related tables are of necessity equivalent, but the converse
7162 # isn't necessarily true
7163 my $status = $other->status;
7164 my $status_info = $other->status_info;
301ba948 7165 my $fate = $other->fate;
99870f4d 7166 my $matches_all = $matches_all{other_addr};
d867ccfb 7167 my $caseless_equivalent = $other->caseless_equivalent;
99870f4d
KW
7168 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7169 next if $table == $other;
7170 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7171
ffe43484 7172 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
7173 $leader{$table_addr} = $other;
7174 $matches_all{$table_addr} = $matches_all;
7175 $self->_set_range_list($other->_range_list);
7176 push @{$equivalents{$other_addr}}, $table;
7177 if ($related) {
7178 $parent{$table_addr} = $other;
7179 push @{$children{$other_addr}}, $table;
7180 $table->set_status($status, $status_info);
301ba948
KW
7181
7182 # This reason currently doesn't get exposed outside; otherwise
7183 # would have to look up the parent's reason and use it instead.
7184 $table->set_fate($fate, "Parent's fate");
7185
d867ccfb 7186 $self->set_caseless_equivalent($caseless_equivalent);
99870f4d
KW
7187 }
7188 }
7189
7190 # Now that we've declared these to be equivalent, any changes to one
7191 # of the tables would invalidate that equivalency.
7192 $self->lock;
7193 $other->lock;
7194 return;
7195 }
7196
8ae00c8a
KW
7197 sub set_complement {
7198 # Set $self to be the complement of the parameter table. $self is
7199 # locked, as what it contains should all come from the other table.
7200
7201 my $self = shift;
7202 my $other = shift;
7203
7204 my %args = @_;
7205 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7206
7207 if ($other->complement != 0) {
7208 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7209 return;
7210 }
7211 my $addr = do { no overloading; pack 'J', $self; };
7212 $complement{$addr} = $other;
7213 $self->lock;
7214 return;
7215 }
7216
99870f4d
KW
7217 sub add_range { # Add a range to the list for this table.
7218 my $self = shift;
7219 # Rest of parameters passed on
7220
7221 return if $self->carp_if_locked;
7222 return $self->_range_list->add_range(@_);
7223 }
7224
88c22f80
KW
7225 sub header {
7226 my $self = shift;
7227 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7228
7229 # All match tables are to be used only by the Perl core.
126c3d4e 7230 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
88c22f80
KW
7231 }
7232
99870f4d
KW
7233 sub pre_body { # Does nothing for match tables.
7234 return
7235 }
7236
7237 sub append_to_body { # Does nothing for match tables.
7238 return
7239 }
7240
301ba948
KW
7241 sub set_fate {
7242 my $self = shift;
7243 my $fate = shift;
7244 my $reason = shift;
7245 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7246
7247 $self->SUPER::set_fate($fate, $reason);
7248
7249 # All children share this fate
7250 foreach my $child ($self->children) {
7251 $child->set_fate($fate, $reason);
7252 }
7253 return;
7254 }
7255
99870f4d
KW
7256 sub write {
7257 my $self = shift;
7258 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7259
ce712c88 7260 return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
99870f4d
KW
7261 }
7262
7263 sub set_final_comment {
7264 # This creates a comment for the file that is to hold the match table
7265 # $self. It is somewhat convoluted to make the English read nicely,
7266 # but, heh, it's just a comment.
7267 # This should be called only with the leader match table of all the
7268 # ones that share the same file. It lists all such tables, ordered so
7269 # that related ones are together.
7270
bd9ebcfd
KW
7271 return unless $debugging_build;
7272
99870f4d
KW
7273 my $leader = shift; # Should only be called on the leader table of
7274 # an equivalent group
7275 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7276
ffe43484 7277 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
7278
7279 if ($leader{$addr} != $leader) {
7280 Carp::my_carp_bug(<<END
7281set_final_comment() must be called on a leader table, which $leader is not.
7282It is equivalent to $leader{$addr}. No comment created
7283END
7284 );
7285 return;
7286 }
7287
7288 # Get the number of code points matched by each of the tables in this
7289 # file, and add underscores for clarity.
7290 my $count = $leader->count;
7291 my $string_count = main::clarify_number($count);
7292
7293 my $loose_count = 0; # how many aliases loosely matched
7294 my $compound_name = ""; # ? Are any names compound?, and if so, an
7295 # example
7296 my $properties_with_compound_names = 0; # count of these
7297
7298
7299 my %flags; # The status flags used in the file
7300 my $total_entries = 0; # number of entries written in the comment
7301 my $matches_comment = ""; # The portion of the comment about the
7302 # \p{}'s
7303 my @global_comments; # List of all the tables' comments that are
7304 # there before this routine was called.
26cef665
KW
7305 my $has_ucd_alias = 0; # If there is an alias that is accessible via
7306 # Unicode::UCD. If not, then don't say it is
7307 # in the comment
99870f4d
KW
7308
7309 # Get list of all the parent tables that are equivalent to this one
7310 # (including itself).
7311 my @parents = grep { $parent{main::objaddr $_} == $_ }
7312 main::uniques($leader, @{$equivalents{$addr}});
7313 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
7314 # tables
7315
7316 for my $parent (@parents) {
7317
7318 my $property = $parent->property;
7319
7320 # Special case 'N' tables in properties with two match tables when
7321 # the other is a 'Y' one. These are likely to be binary tables,
7322 # but not necessarily. In either case, \P{} will match the
7323 # complement of \p{}, and so if something is a synonym of \p, the
7324 # complement of that something will be the synonym of \P. This
7325 # would be true of any property with just two match tables, not
7326 # just those whose values are Y and N; but that would require a
7327 # little extra work, and there are none such so far in Unicode.
7328 my $perl_p = 'p'; # which is it? \p{} or \P{}
7329 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
7330
7331 if (scalar $property->tables == 2
7332 && $parent == $property->table('N')
7333 && defined (my $yes = $property->table('Y')))
7334 {
ffe43484 7335 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
7336 @yes_perl_synonyms
7337 = grep { $_->property == $perl }
7338 main::uniques($yes,
7339 $parent{$yes_addr},
7340 $parent{$yes_addr}->children);
7341
7342 # But these synonyms are \P{} ,not \p{}
7343 $perl_p = 'P';
7344 }
7345
7346 my @description; # Will hold the table description
7347 my @note; # Will hold the table notes.
7348 my @conflicting; # Will hold the table conflicts.
7349
7350 # Look at the parent, any yes synonyms, and all the children
ffe43484 7351 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
7352 for my $table ($parent,
7353 @yes_perl_synonyms,
f998e60c 7354 @{$children{$parent_addr}})
99870f4d 7355 {
ffe43484 7356 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
7357 my $table_property = $table->property;
7358
7359 # Tables are separated by a blank line to create a grouping.
7360 $matches_comment .= "\n" if $matches_comment;
7361
7362 # The table is named based on the property and value
7363 # combination it is for, like script=greek. But there may be
7364 # a number of synonyms for each side, like 'sc' for 'script',
7365 # and 'grek' for 'greek'. Any combination of these is a valid
7366 # name for this table. In this case, there are three more,
7367 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
7368 # listing all possible combinations in the comment, we make
7369 # sure that each synonym occurs at least once, and add
7370 # commentary that the other combinations are possible.
da912e1e
KW
7371 # Because regular expressions don't recognize things like
7372 # \p{jsn=}, only look at non-null right-hand-sides
99870f4d 7373 my @property_aliases = $table_property->aliases;
da912e1e 7374 my @table_aliases = grep { $_->name ne "" } $table->aliases;
99870f4d
KW
7375
7376 # The alias lists above are already ordered in the order we
7377 # want to output them. To ensure that each synonym is listed,
da912e1e
KW
7378 # we must use the max of the two numbers. But if there are no
7379 # legal synonyms (nothing in @table_aliases), then we don't
7380 # list anything.
7381 my $listed_combos = (@table_aliases)
7382 ? main::max(scalar @table_aliases,
7383 scalar @property_aliases)
7384 : 0;
99870f4d
KW
7385 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7386
da912e1e 7387
99870f4d
KW
7388 my $property_had_compound_name = 0;
7389
7390 for my $i (0 .. $listed_combos - 1) {
7391 $total_entries++;
7392
7393 # The current alias for the property is the next one on
7394 # the list, or if beyond the end, start over. Similarly
7395 # for the table (\p{prop=table})
7396 my $property_alias = $property_aliases
7397 [$i % @property_aliases]->name;
7398 my $table_alias_object = $table_aliases
7399 [$i % @table_aliases];
7400 my $table_alias = $table_alias_object->name;
7401 my $loose_match = $table_alias_object->loose_match;
26cef665 7402 $has_ucd_alias |= $table_alias_object->ucd;
99870f4d
KW
7403
7404 if ($table_alias !~ /\D/) { # Clarify large numbers.
7405 $table_alias = main::clarify_number($table_alias)
7406 }
7407
7408 # Add a comment for this alias combination
7409 my $current_match_comment;
7410 if ($table_property == $perl) {
7411 $current_match_comment = "\\$perl_p"
7412 . "{$table_alias}";
7413 }
7414 else {
7415 $current_match_comment
7416 = "\\p{$property_alias=$table_alias}";
7417 $property_had_compound_name = 1;
7418 }
7419
7420 # Flag any abnormal status for this table.
7421 my $flag = $property->status
7422 || $table->status
7423 || $table_alias_object->status;
26e2a4d8
KW
7424 if ($flag && $flag ne $PLACEHOLDER) {
7425 $flags{$flag} = $status_past_participles{$flag};
7426 }
99870f4d
KW
7427
7428 $loose_count++;
7429
7430 # Pretty up the comment. Note the \b; it says don't make
7431 # this line a continuation.
7432 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7433 $flag,
7434 " " x 7,
7435 $current_match_comment);
7436 } # End of generating the entries for this table.
7437
7438 # Save these for output after this group of related tables.
7439 push @description, $table->description;
7440 push @note, $table->note;
7441 push @conflicting, $table->conflicting;
7442
37e2e78e
KW
7443 # And this for output after all the tables.
7444 push @global_comments, $table->comment;
7445
99870f4d
KW
7446 # Compute an alternate compound name using the final property
7447 # synonym and the first table synonym with a colon instead of
7448 # the equal sign used elsewhere.
7449 if ($property_had_compound_name) {
7450 $properties_with_compound_names ++;
7451 if (! $compound_name || @property_aliases > 1) {
7452 $compound_name = $property_aliases[-1]->name
7453 . ': '
7454 . $table_aliases[0]->name;
7455 }
7456 }
7457 } # End of looping through all children of this table
7458
7459 # Here have assembled in $matches_comment all the related tables
7460 # to the current parent (preceded by the same info for all the
7461 # previous parents). Put out information that applies to all of
7462 # the current family.
7463 if (@conflicting) {
7464
7465 # But output the conflicting information now, as it applies to
7466 # just this table.
7467 my $conflicting = join ", ", @conflicting;
7468 if ($conflicting) {
7469 $matches_comment .= <<END;
7470
7471 Note that contrary to what you might expect, the above is NOT the same as
7472END
7473 $matches_comment .= "any of: " if @conflicting > 1;
7474 $matches_comment .= "$conflicting\n";
7475 }
7476 }
7477 if (@description) {
7478 $matches_comment .= "\n Meaning: "
7479 . join('; ', @description)
7480 . "\n";
7481 }
7482 if (@note) {
7483 $matches_comment .= "\n Note: "
7484 . join("\n ", @note)
7485 . "\n";
7486 }
7487 } # End of looping through all tables
7488
7489
7490 my $code_points;
7491 my $match;
7492 my $any_of_these;
7493 if ($count == 1) {
7494 $match = 'matches';
7495 $code_points = 'single code point';
7496 }
7497 else {
7498 $match = 'match';
7499 $code_points = "$string_count code points";
7500 }
7501
7502 my $synonyms;
7503 my $entries;
da912e1e 7504 if ($total_entries == 1) {
99870f4d
KW
7505 $synonyms = "";
7506 $entries = 'entry';
7507 $any_of_these = 'this'
7508 }
7509 else {
7510 $synonyms = " any of the following regular expression constructs";
7511 $entries = 'entries';
7512 $any_of_these = 'any of these'
7513 }
7514
26cef665
KW
7515 my $comment = "";
7516 if ($has_ucd_alias) {
7517 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7518 }
99870f4d
KW
7519 if ($has_unrelated) {
7520 $comment .= <<END;
7521This file is for tables that are not necessarily related: To conserve
7522resources, every table that matches the identical set of code points in this
7523version of Unicode uses this file. Each one is listed in a separate group
7524below. It could be that the tables will match the same set of code points in
7525other Unicode releases, or it could be purely coincidence that they happen to
7526be the same in Unicode $string_version, and hence may not in other versions.
7527
7528END
7529 }
7530
7531 if (%flags) {
7532 foreach my $flag (sort keys %flags) {
7533 $comment .= <<END;
37e2e78e 7534'$flag' below means that this form is $flags{$flag}.
301ba948 7535Consult $pod_file.pod
99870f4d
KW
7536END
7537 }
7538 $comment .= "\n";
7539 }
7540
da912e1e
KW
7541 if ($total_entries == 0) {
7542 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7543 $comment .= <<END;
7544This file returns the $code_points in Unicode Version $string_version for
7545$leader, but it is inaccessible through Perl regular expressions, as
7546"\\p{prop=}" is not recognized.
7547END
7548
7549 } else {
7550 $comment .= <<END;
99870f4d
KW
7551This file returns the $code_points in Unicode Version $string_version that
7552$match$synonyms:
7553
7554$matches_comment
37e2e78e 7555$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
7556including if adding or subtracting white space, underscore, and hyphen
7557characters matters or doesn't matter, and other permissible syntactic
7558variants. Upper/lower case distinctions never matter.
7559END
7560
da912e1e 7561 }
99870f4d
KW
7562 if ($compound_name) {
7563 $comment .= <<END;
7564
7565A colon can be substituted for the equals sign, and
7566END
7567 if ($properties_with_compound_names > 1) {
7568 $comment .= <<END;
7569within each group above,
7570END
7571 }
7572 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7573
7574 # Note the \b below, it says don't make that line a continuation.
7575 $comment .= <<END;
7576anything to the left of the equals (or colon) can be combined with anything to
7577the right. Thus, for example,
7578$compound_name
7579\bis also valid.
7580END
7581 }
7582
7583 # And append any comment(s) from the actual tables. They are all
7584 # gathered here, so may not read all that well.
37e2e78e
KW
7585 if (@global_comments) {
7586 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7587 }
99870f4d
KW
7588
7589 if ($count) { # The format differs if no code points, and needs no
7590 # explanation in that case
7591 $comment.= <<END;
7592
7593The format of the lines of this file is:
7594END
7595 $comment.= <<END;
7596START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7597STOP is the ending point, or if omitted, the range has just one code point.
7598END
0c07e538 7599 if ($leader->output_range_counts) {
99870f4d
KW
7600 $comment .= <<END;
7601Numbers in comments in [brackets] indicate how many code points are in the
7602range.
7603END
7604 }
7605 }
7606
7607 $leader->set_comment(main::join_lines($comment));
7608 return;
7609 }
7610
7611 # Accessors for the underlying list
ea25a9b2 7612 for my $sub (qw(
99870f4d
KW
7613 get_valid_code_point
7614 get_invalid_code_point
ea25a9b2 7615 ))
99870f4d
KW
7616 {
7617 no strict "refs";
7618 *$sub = sub {
7619 use strict "refs";
7620 my $self = shift;
7621
7622 return $self->_range_list->$sub(@_);
7623 }
7624 }
7625} # End closure for Match_Table
7626
7627package Property;
7628
7629# The Property class represents a Unicode property, or the $perl
7630# pseudo-property. It contains a map table initialized empty at construction
7631# time, and for properties accessible through regular expressions, various
7632# match tables, created through the add_match_table() method, and referenced
7633# by the table('NAME') or tables() methods, the latter returning a list of all
7634# of the match tables. Otherwise table operations implicitly are for the map
7635# table.
7636#
7637# Most of the data in the property is actually about its map table, so it
7638# mostly just uses that table's accessors for most methods. The two could
7639# have been combined into one object, but for clarity because of their
7640# differing semantics, they have been kept separate. It could be argued that
7641# the 'file' and 'directory' fields should be kept with the map table.
7642#
7643# Each property has a type. This can be set in the constructor, or in the
7644# set_type accessor, but mostly it is figured out by the data. Every property
7645# starts with unknown type, overridden by a parameter to the constructor, or
7646# as match tables are added, or ranges added to the map table, the data is
7647# inspected, and the type changed. After the table is mostly or entirely
7648# filled, compute_type() should be called to finalize they analysis.
7649#
7650# There are very few operations defined. One can safely remove a range from
7651# the map table, and property_add_or_replace_non_nulls() adds the maps from another
7652# table to this one, replacing any in the intersection of the two.
7653
7654sub standardize { return main::standardize($_[0]); }
7655sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7656
7657{ # Closure
7658
7659 # This hash will contain as keys, all the aliases of all properties, and
7660 # as values, pointers to their respective property objects. This allows
7661 # quick look-up of a property from any of its names.
7662 my %alias_to_property_of;
7663
7664 sub dump_alias_to_property_of {
7665 # For debugging
7666
7667 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7668 return;
7669 }
7670
7671 sub property_ref {
7672 # This is a package subroutine, not called as a method.
7673 # If the single parameter is a literal '*' it returns a list of all
7674 # defined properties.
7675 # Otherwise, the single parameter is a name, and it returns a pointer
7676 # to the corresponding property object, or undef if none.
7677 #
7678 # Properties can have several different names. The 'standard' form of
7679 # each of them is stored in %alias_to_property_of as they are defined.
7680 # But it's possible that this subroutine will be called with some
7681 # variant, so if the initial lookup fails, it is repeated with the
98dc9551 7682 # standardized form of the input name. If found, besides returning the
99870f4d
KW
7683 # result, the input name is added to the list so future calls won't
7684 # have to do the conversion again.
7685
7686 my $name = shift;
7687
7688 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7689
7690 if (! defined $name) {
7691 Carp::my_carp_bug("Undefined input property. No action taken.");
7692 return;
7693 }
7694
7695 return main::uniques(values %alias_to_property_of) if $name eq '*';
7696
7697 # Return cached result if have it.
7698 my $result = $alias_to_property_of{$name};
7699 return $result if defined $result;
7700
7701 # Convert the input to standard form.
7702 my $standard_name = standardize($name);
7703
7704 $result = $alias_to_property_of{$standard_name};
7705 return unless defined $result; # Don't cache undefs
7706
7707 # Cache the result before returning it.
7708 $alias_to_property_of{$name} = $result;
7709 return $result;
7710 }
7711
7712
7713 main::setup_package();
7714
7715 my %map;
7716 # A pointer to the map table object for this property
7717 main::set_access('map', \%map);
7718
7719 my %full_name;
7720 # The property's full name. This is a duplicate of the copy kept in the
7721 # map table, but is needed because stringify needs it during
7722 # construction of the map table, and then would have a chicken before egg
7723 # problem.
7724 main::set_access('full_name', \%full_name, 'r');
7725
7726 my %table_ref;
7727 # This hash will contain as keys, all the aliases of any match tables
7728 # attached to this property, and as values, the pointers to their
7729 # respective tables. This allows quick look-up of a table from any of its
7730 # names.
7731 main::set_access('table_ref', \%table_ref);
7732
7733 my %type;
7734 # The type of the property, $ENUM, $BINARY, etc
7735 main::set_access('type', \%type, 'r');
7736
7737 my %file;
7738 # The filename where the map table will go (if actually written).
7739 # Normally defaulted, but can be overridden.
7740 main::set_access('file', \%file, 'r', 's');
7741
7742 my %directory;
7743 # The directory where the map table will go (if actually written).
7744 # Normally defaulted, but can be overridden.
7745 main::set_access('directory', \%directory, 's');
7746
7747 my %pseudo_map_type;
7748 # This is used to affect the calculation of the map types for all the
7749 # ranges in the table. It should be set to one of the values that signify
7750 # to alter the calculation.
7751 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7752
7753 my %has_only_code_point_maps;
7754 # A boolean used to help in computing the type of data in the map table.
7755 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7756
7757 my %unique_maps;
7758 # A list of the first few distinct mappings this property has. This is
7759 # used to disambiguate between binary and enum property types, so don't
7760 # have to keep more than three.
7761 main::set_access('unique_maps', \%unique_maps);
7762
56557540
KW
7763 my %pre_declared_maps;
7764 # A boolean that gives whether the input data should declare all the
7765 # tables used, or not. If the former, unknown ones raise a warning.
7766 main::set_access('pre_declared_maps',
047274f2 7767 \%pre_declared_maps, 'r', 's');
56557540 7768
99870f4d
KW
7769 sub new {
7770 # The only required parameter is the positionally first, name. All
7771 # other parameters are key => value pairs. See the documentation just
7772 # above for the meanings of the ones not passed directly on to the map
7773 # table constructor.
7774
7775 my $class = shift;
7776 my $name = shift || "";
7777
7778 my $self = property_ref($name);
7779 if (defined $self) {
7780 my $options_string = join ", ", @_;
7781 $options_string = ". Ignoring options $options_string" if $options_string;
7782 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7783 return $self;
7784 }
7785
7786 my %args = @_;
7787
7788 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 7789 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7790
7791 $directory{$addr} = delete $args{'Directory'};
7792 $file{$addr} = delete $args{'File'};
7793 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7794 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7795 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
56557540
KW
7796 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7797 # Starting in this release, property
7798 # values should be defined for all
7799 # properties, except those overriding this
7800 // $v_version ge v5.1.0;
c12f2655 7801
99870f4d
KW
7802 # Rest of parameters passed on.
7803
7804 $has_only_code_point_maps{$addr} = 1;
7805 $table_ref{$addr} = { };
7806 $unique_maps{$addr} = { };
7807
7808 $map{$addr} = Map_Table->new($name,
7809 Full_Name => $full_name{$addr},
7810 _Alias_Hash => \%alias_to_property_of,
7811 _Property => $self,
7812 %args);
7813 return $self;
7814 }
7815
7816 # See this program's beginning comment block about overloading the copy
7817 # constructor. Few operations are defined on properties, but a couple are
7818 # useful. It is safe to take the inverse of a property, and to remove a
7819 # single code point from it.
7820 use overload
7821 fallback => 0,
7822 qw("") => "_operator_stringify",
7823 "." => \&main::_operator_dot,
1285127e 7824 ".=" => \&main::_operator_dot_equal,
99870f4d
KW
7825 '==' => \&main::_operator_equal,
7826 '!=' => \&main::_operator_not_equal,
7827 '=' => sub { return shift },
7828 '-=' => "_minus_and_equal",
7829 ;
7830
7831 sub _operator_stringify {
7832 return "Property '" . shift->full_name . "'";
7833 }
7834
7835 sub _minus_and_equal {
7836 # Remove a single code point from the map table of a property.
7837
7838 my $self = shift;
7839 my $other = shift;
7840 my $reversed = shift;
7841 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7842
7843 if (ref $other) {
5d9b6ded 7844 Carp::my_carp_bug("Bad news. Can't cope with a "
99870f4d
KW
7845 . ref($other)
7846 . " argument to '-='. Subtraction ignored.");
7847 return $self;
7848 }
98dc9551 7849 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
5d9b6ded
KW
7850 Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
7851 . ref $self
7852 . " from a non-object. undef returned.");
7853 return;
99870f4d
KW
7854 }
7855 else {
f998e60c 7856 no overloading;
051df77b 7857 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
7858 }
7859 return $self;
7860 }
7861
7862 sub add_match_table {
7863 # Add a new match table for this property, with name given by the
7864 # parameter. It returns a pointer to the table.
7865
7866 my $self = shift;
7867 my $name = shift;
7868 my %args = @_;
7869
ffe43484 7870 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7871
7872 my $table = $table_ref{$addr}{$name};
7873 my $standard_name = main::standardize($name);
7874 if (defined $table
7875 || (defined ($table = $table_ref{$addr}{$standard_name})))
7876 {
7877 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7878 $table_ref{$addr}{$name} = $table;
7879 return $table;
7880 }
7881 else {
7882
7883 # See if this is a perl extension, if not passed in.
7884 my $perl_extension = delete $args{'Perl_Extension'};
7885 $perl_extension
7886 = $self->perl_extension if ! defined $perl_extension;
7887
7888 $table = Match_Table->new(
7889 Name => $name,
7890 Perl_Extension => $perl_extension,
7891 _Alias_Hash => $table_ref{$addr},
7892 _Property => $self,
7893
301ba948
KW
7894 # gets property's fate and status by default
7895 Fate => $self->fate,
99870f4d
KW
7896 Status => $self->status,
7897 _Status_Info => $self->status_info,
88c22f80 7898 %args);
99870f4d
KW
7899 return unless defined $table;
7900 }
7901
7902 # Save the names for quick look up
7903 $table_ref{$addr}{$standard_name} = $table;
7904 $table_ref{$addr}{$name} = $table;
7905
7906 # Perhaps we can figure out the type of this property based on the
7907 # fact of adding this match table. First, string properties don't
7908 # have match tables; second, a binary property can't have 3 match
7909 # tables
7910 if ($type{$addr} == $UNKNOWN) {
7911 $type{$addr} = $NON_STRING;
7912 }
7913 elsif ($type{$addr} == $STRING) {
7914 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7915 $type{$addr} = $NON_STRING;
7916 }
06f26c45 7917 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
99870f4d
KW
7918 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7919 && $type{$addr} == $BINARY)
7920 {
7921 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7922 $type{$addr} = $ENUM;
7923 }
7924 }
7925
7926 return $table;
7927 }
7928
4b9b0bc5
KW
7929 sub delete_match_table {
7930 # Delete the table referred to by $2 from the property $1.
7931
7932 my $self = shift;
7933 my $table_to_remove = shift;
7934 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7935
7936 my $addr = do { no overloading; pack 'J', $self; };
7937
7938 # Remove all names that refer to it.
7939 foreach my $key (keys %{$table_ref{$addr}}) {
7940 delete $table_ref{$addr}{$key}
7941 if $table_ref{$addr}{$key} == $table_to_remove;
7942 }
7943
7944 $table_to_remove->DESTROY;
7945 return;
7946 }
7947
99870f4d
KW
7948 sub table {
7949 # Return a pointer to the match table (with name given by the
7950 # parameter) associated with this property; undef if none.
7951
7952 my $self = shift;
7953 my $name = shift;
7954 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7955
ffe43484 7956 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7957
7958 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7959
7960 # If quick look-up failed, try again using the standard form of the
7961 # input name. If that succeeds, cache the result before returning so
7962 # won't have to standardize this input name again.
7963 my $standard_name = main::standardize($name);
7964 return unless defined $table_ref{$addr}{$standard_name};
7965
7966 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7967 return $table_ref{$addr}{$name};
7968 }
7969
7970 sub tables {
7971 # Return a list of pointers to all the match tables attached to this
7972 # property
7973
f998e60c 7974 no overloading;
051df77b 7975 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
7976 }
7977
7978 sub directory {
7979 # Returns the directory the map table for this property should be
7980 # output in. If a specific directory has been specified, that has
7981 # priority; 'undef' is returned if the type isn't defined;
7982 # or $map_directory for everything else.
7983
ffe43484 7984 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
7985
7986 return $directory{$addr} if defined $directory{$addr};
7987 return undef if $type{$addr} == $UNKNOWN;
7988 return $map_directory;
7989 }
7990
7991 sub swash_name {
7992 # Return the name that is used to both:
7993 # 1) Name the file that the map table is written to.
7994 # 2) The name of swash related stuff inside that file.
7995 # The reason for this is that the Perl core historically has used
7996 # certain names that aren't the same as the Unicode property names.
7997 # To continue using these, $file is hard-coded in this file for those,
7998 # but otherwise the standard name is used. This is different from the
7999 # external_name, so that the rest of the files, like in lib can use
8000 # the standard name always, without regard to historical precedent.
8001
8002 my $self = shift;
8003 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8004
ffe43484 8005 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 8006
19f751d2
KW
8007 # Swash names are used only on regular map tables; otherwise there
8008 # should be no access to the property map table from other parts of
8009 # Perl.
8010 return if $map{$addr}->fate != $ORDINARY;
8011
99870f4d
KW
8012 return $file{$addr} if defined $file{$addr};
8013 return $map{$addr}->external_name;
8014 }
8015
8016 sub to_create_match_tables {
8017 # Returns a boolean as to whether or not match tables should be
8018 # created for this property.
8019
8020 my $self = shift;
8021 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8022
8023 # The whole point of this pseudo property is match tables.
8024 return 1 if $self == $perl;
8025
ffe43484 8026 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
8027
8028 # Don't generate tables of code points that match the property values
8029 # of a string property. Such a list would most likely have many
8030 # property values, each with just one or very few code points mapping
8031 # to it.
8032 return 0 if $type{$addr} == $STRING;
8033
8034 # Don't generate anything for unimplemented properties.
8035 return 0 if grep { $self->complete_name eq $_ }
8036 @unimplemented_properties;
8037 # Otherwise, do.
8038 return 1;
8039 }
8040
8041 sub property_add_or_replace_non_nulls {
8042 # This adds the mappings in the property $other to $self. Non-null
8043 # mappings from $other override those in $self. It essentially merges
8044 # the two properties, with the second having priority except for null
8045 # mappings.
8046
8047 my $self = shift;
8048 my $other = shift;
8049 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8050
8051 if (! $other->isa(__PACKAGE__)) {
8052 Carp::my_carp_bug("$other should be a "
8053 . __PACKAGE__
8054 . ". Not a '"
8055 . ref($other)
8056 . "'. Not added;");
8057 return;
8058 }
8059
f998e60c 8060 no overloading;
051df77b 8061 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
8062 }
8063
5be997b0
KW
8064 sub set_proxy_for {
8065 # Certain tables are not generally written out to files, but
8066 # Unicode::UCD has the intelligence to know that the file for $self
8067 # can be used to reconstruct those tables. This routine just changes
8068 # things so that UCD pod entries for those suppressed tables are
8069 # generated, so the fact that a proxy is used is invisible to the
8070 # user.
8071
8072 my $self = shift;
8073
8074 foreach my $property_name (@_) {
8075 my $ref = property_ref($property_name);
8076 next if $ref->to_output_map;
8077 $ref->set_fate($MAP_PROXIED);
8078 }
8079 }
8080
99870f4d
KW
8081 sub set_type {
8082 # Set the type of the property. Mostly this is figured out by the
8083 # data in the table. But this is used to set it explicitly. The
8084 # reason it is not a standard accessor is that when setting a binary
8085 # property, we need to make sure that all the true/false aliases are
8086 # present, as they were omitted in early Unicode releases.
8087
8088 my $self = shift;
8089 my $type = shift;
8090 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8091
06f26c45
KW
8092 if ($type != $ENUM
8093 && $type != $BINARY
8094 && $type != $FORCED_BINARY
8095 && $type != $STRING)
8096 {
99870f4d
KW
8097 Carp::my_carp("Unrecognized type '$type'. Type not set");
8098 return;
8099 }
8100
051df77b 8101 { no overloading; $type{pack 'J', $self} = $type; }
06f26c45 8102 return if $type != $BINARY && $type != $FORCED_BINARY;
99870f4d
KW
8103
8104 my $yes = $self->table('Y');
8105 $yes = $self->table('Yes') if ! defined $yes;
01adf4be
KW
8106 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8107 if ! defined $yes;
8108
3c6bf941
KW
8109 # Add aliases in order wanted, duplicates will be ignored. We use a
8110 # binary property present in all releases for its ordered lists of
8111 # true/false aliases. Note, that could run into problems in
8112 # outputting things in that we don't distinguish between the name and
8113 # full name of these. Hopefully, if the table was already created
8114 # before this code is executed, it was done with these set properly.
8115 my $bm = property_ref("Bidi_Mirrored");
8116 foreach my $alias ($bm->table("Y")->aliases) {
8117 $yes->add_alias($alias->name);
8118 }
99870f4d
KW
8119 my $no = $self->table('N');
8120 $no = $self->table('No') if ! defined $no;
01adf4be 8121 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
3c6bf941
KW
8122 foreach my $alias ($bm->table("N")->aliases) {
8123 $no->add_alias($alias->name);
8124 }
c12f2655 8125
99870f4d
KW
8126 return;
8127 }
8128
8129 sub add_map {
8130 # Add a map to the property's map table. This also keeps
8131 # track of the maps so that the property type can be determined from
8132 # its data.
8133
8134 my $self = shift;
8135 my $start = shift; # First code point in range
8136 my $end = shift; # Final code point in range
8137 my $map = shift; # What the range maps to.
8138 # Rest of parameters passed on.
8139
ffe43484 8140 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
8141
8142 # If haven't the type of the property, gather information to figure it
8143 # out.
8144 if ($type{$addr} == $UNKNOWN) {
8145
8146 # If the map contains an interior blank or dash, or most other
8147 # nonword characters, it will be a string property. This
8148 # heuristic may actually miss some string properties. If so, they
8149 # may need to have explicit set_types called for them. This
8150 # happens in the Unihan properties.
8151 if ($map =~ / (?<= . ) [ -] (?= . ) /x
8152 || $map =~ / [^\w.\/\ -] /x)
8153 {
8154 $self->set_type($STRING);
8155
8156 # $unique_maps is used for disambiguating between ENUM and
8157 # BINARY later; since we know the property is not going to be
8158 # one of those, no point in keeping the data around
8159 undef $unique_maps{$addr};
8160 }
8161 else {
8162
8163 # Not necessarily a string. The final decision has to be
8164 # deferred until all the data are in. We keep track of if all
8165 # the values are code points for that eventual decision.
8166 $has_only_code_point_maps{$addr} &=
8167 $map =~ / ^ $code_point_re $/x;
8168
8169 # For the purposes of disambiguating between binary and other
8170 # enumerations at the end, we keep track of the first three
8171 # distinct property values. Once we get to three, we know
8172 # it's not going to be binary, so no need to track more.
8173 if (scalar keys %{$unique_maps{$addr}} < 3) {
8174 $unique_maps{$addr}{main::standardize($map)} = 1;
8175 }
8176 }
8177 }
8178
8179 # Add the mapping by calling our map table's method
8180 return $map{$addr}->add_map($start, $end, $map, @_);
8181 }
8182
8183 sub compute_type {
8184 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
8185 # should be called after the property is mostly filled with its maps.
8186 # We have been keeping track of what the property values have been,
8187 # and now have the necessary information to figure out the type.
8188
8189 my $self = shift;
8190 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8191
ffe43484 8192 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
8193
8194 my $type = $type{$addr};
8195
8196 # If already have figured these out, no need to do so again, but we do
8197 # a double check on ENUMS to make sure that a string property hasn't
8198 # improperly been classified as an ENUM, so continue on with those.
06f26c45
KW
8199 return if $type == $STRING
8200 || $type == $BINARY
8201 || $type == $FORCED_BINARY;
99870f4d
KW
8202
8203 # If every map is to a code point, is a string property.
8204 if ($type == $UNKNOWN
8205 && ($has_only_code_point_maps{$addr}
8206 || (defined $map{$addr}->default_map
8207 && $map{$addr}->default_map eq "")))
8208 {
8209 $self->set_type($STRING);
8210 }
8211 else {
8212
8213 # Otherwise, it is to some sort of enumeration. (The case where
8214 # it is a Unicode miscellaneous property, and treated like a
8215 # string in this program is handled in add_map()). Distinguish
8216 # between binary and some other enumeration type. Of course, if
8217 # there are more than two values, it's not binary. But more
8218 # subtle is the test that the default mapping is defined means it
8219 # isn't binary. This in fact may change in the future if Unicode
8220 # changes the way its data is structured. But so far, no binary
8221 # properties ever have @missing lines for them, so the default map
8222 # isn't defined for them. The few properties that are two-valued
8223 # and aren't considered binary have the default map defined
8224 # starting in Unicode 5.0, when the @missing lines appeared; and
8225 # this program has special code to put in a default map for them
8226 # for earlier than 5.0 releases.
8227 if ($type == $ENUM
8228 || scalar keys %{$unique_maps{$addr}} > 2
8229 || defined $self->default_map)
8230 {
8231 my $tables = $self->tables;
8232 my $count = $self->count;
8233 if ($verbosity && $count > 500 && $tables/$count > .1) {
8234 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
8235 }
8236 $self->set_type($ENUM);
8237 }
8238 else {
8239 $self->set_type($BINARY);
8240 }
8241 }
8242 undef $unique_maps{$addr}; # Garbage collect
8243 return;
8244 }
8245
301ba948
KW
8246 sub set_fate {
8247 my $self = shift;
8248 my $fate = shift;
8249 my $reason = shift; # Ignored unless suppressing
8250 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8251
8252 my $addr = do { no overloading; pack 'J', $self; };
8253 if ($fate == $SUPPRESSED) {
8254 $why_suppressed{$self->complete_name} = $reason;
8255 }
8256
395dfc19
KW
8257 # Each table shares the property's fate, except that MAP_PROXIED
8258 # doesn't affect match tables
8259 $map{$addr}->set_fate($fate, $reason);
8260 if ($fate != $MAP_PROXIED) {
13a49092
KW
8261 foreach my $table ($map{$addr}, $self->tables) {
8262 $table->set_fate($fate, $reason);
8263 }
395dfc19 8264 }
301ba948
KW
8265 return;
8266 }
8267
8268
99870f4d
KW
8269 # Most of the accessors for a property actually apply to its map table.
8270 # Setup up accessor functions for those, referring to %map
ea25a9b2 8271 for my $sub (qw(
99870f4d
KW
8272 add_alias
8273 add_anomalous_entry
8274 add_comment
8275 add_conflicting
8276 add_description
8277 add_duplicate
8278 add_note
8279 aliases
8280 comment
8281 complete_name
2f7a8815 8282 containing_range
99870f4d
KW
8283 count
8284 default_map
8285 delete_range
8286 description
8287 each_range
8288 external_name
301ba948 8289 fate
99870f4d
KW
8290 file_path
8291 format
8292 initialize
8293 inverse_list
8294 is_empty
8295 name
8296 note
8297 perl_extension
8298 property
8299 range_count
8300 ranges
8301 range_size_1
8302 reset_each_range
8303 set_comment
99870f4d
KW
8304 set_default_map
8305 set_file_path
8306 set_final_comment
26561784 8307 _set_format
99870f4d
KW
8308 set_range_size_1
8309 set_status
8310 set_to_output_map
8311 short_name
8312 status
8313 status_info
8314 to_output_map
0a9dbafc 8315 type_of
99870f4d
KW
8316 value_of
8317 write
ea25a9b2 8318 ))
99870f4d
KW
8319 # 'property' above is for symmetry, so that one can take
8320 # the property of a property and get itself, and so don't
8321 # have to distinguish between properties and tables in
8322 # calling code
8323 {
8324 no strict "refs";
8325 *$sub = sub {
8326 use strict "refs";
8327 my $self = shift;
f998e60c 8328 no overloading;
051df77b 8329 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
8330 }
8331 }
8332
8333
8334} # End closure
8335
8336package main;
8337
8338sub join_lines($) {
8339 # Returns lines of the input joined together, so that they can be folded
8340 # properly.
8341 # This causes continuation lines to be joined together into one long line
8342 # for folding. A continuation line is any line that doesn't begin with a
8343 # space or "\b" (the latter is stripped from the output). This is so
8344 # lines can be be in a HERE document so as to fit nicely in the terminal
8345 # width, but be joined together in one long line, and then folded with
8346 # indents, '#' prefixes, etc, properly handled.
8347 # A blank separates the joined lines except if there is a break; an extra
8348 # blank is inserted after a period ending a line.
8349
98dc9551 8350 # Initialize the return with the first line.
99870f4d
KW
8351 my ($return, @lines) = split "\n", shift;
8352
8353 # If the first line is null, it was an empty line, add the \n back in
8354 $return = "\n" if $return eq "";
8355
8356 # Now join the remainder of the physical lines.
8357 for my $line (@lines) {
8358
8359 # An empty line means wanted a blank line, so add two \n's to get that
8360 # effect, and go to the next line.
8361 if (length $line == 0) {
8362 $return .= "\n\n";
8363 next;
8364 }
8365
8366 # Look at the last character of what we have so far.
8367 my $previous_char = substr($return, -1, 1);
8368
8369 # And at the next char to be output.
8370 my $next_char = substr($line, 0, 1);
8371
8372 if ($previous_char ne "\n") {
8373
8374 # Here didn't end wth a nl. If the next char a blank or \b, it
8375 # means that here there is a break anyway. So add a nl to the
8376 # output.
8377 if ($next_char eq " " || $next_char eq "\b") {
8378 $previous_char = "\n";
8379 $return .= $previous_char;
8380 }
8381
8382 # Add an extra space after periods.
8383 $return .= " " if $previous_char eq '.';
8384 }
8385
8386 # Here $previous_char is still the latest character to be output. If
8387 # it isn't a nl, it means that the next line is to be a continuation
8388 # line, with a blank inserted between them.
8389 $return .= " " if $previous_char ne "\n";
8390
8391 # Get rid of any \b
8392 substr($line, 0, 1) = "" if $next_char eq "\b";
8393
8394 # And append this next line.
8395 $return .= $line;
8396 }
8397
8398 return $return;
8399}
8400
8401sub simple_fold($;$$$) {
8402 # Returns a string of the input (string or an array of strings) folded
8403 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8404 # a \n
8405 # This is tailored for the kind of text written by this program,
8406 # especially the pod file, which can have very long names with
8407 # underscores in the middle, or words like AbcDefgHij.... We allow
8408 # breaking in the middle of such constructs if the line won't fit
8409 # otherwise. The break in such cases will come either just after an
8410 # underscore, or just before one of the Capital letters.
8411
8412 local $to_trace = 0 if main::DEBUG;
8413
8414 my $line = shift;
8415 my $prefix = shift; # Optional string to prepend to each output
8416 # line
8417 $prefix = "" unless defined $prefix;
8418
8419 my $hanging_indent = shift; # Optional number of spaces to indent
8420 # continuation lines
8421 $hanging_indent = 0 unless $hanging_indent;
8422
8423 my $right_margin = shift; # Optional number of spaces to narrow the
8424 # total width by.
8425 $right_margin = 0 unless defined $right_margin;
8426
8427 # Call carp with the 'nofold' option to avoid it from trying to call us
8428 # recursively
8429 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8430
8431 # The space available doesn't include what's automatically prepended
8432 # to each line, or what's reserved on the right.
8433 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8434 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8435
8436 if (DEBUG && $hanging_indent >= $max) {
8437 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8438 $hanging_indent = 0;
8439 }
8440
8441 # First, split into the current physical lines.
8442 my @line;
8443 if (ref $line) { # Better be an array, because not bothering to
8444 # test
8445 foreach my $line (@{$line}) {
8446 push @line, split /\n/, $line;
8447 }
8448 }
8449 else {
8450 @line = split /\n/, $line;
8451 }
8452
8453 #local $to_trace = 1 if main::DEBUG;
8454 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8455
8456 # Look at each current physical line.
8457 for (my $i = 0; $i < @line; $i++) {
8458 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8459 #local $to_trace = 1 if main::DEBUG;
8460 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8461
8462 # Remove prefix, because will be added back anyway, don't want
8463 # doubled prefix
8464 $line[$i] =~ s/^$prefix//;
8465
8466 # Remove trailing space
8467 $line[$i] =~ s/\s+\Z//;
8468
8469 # If the line is too long, fold it.
8470 if (length $line[$i] > $max) {
8471 my $remainder;
8472
8473 # Here needs to fold. Save the leading space in the line for
8474 # later.
8475 $line[$i] =~ /^ ( \s* )/x;
8476 my $leading_space = $1;
8477 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8478
8479 # If character at final permissible position is white space,
8480 # fold there, which will delete that white space
8481 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8482 $remainder = substr($line[$i], $max);
8483 $line[$i] = substr($line[$i], 0, $max - 1);
8484 }
8485 else {
8486
8487 # Otherwise fold at an acceptable break char closest to
8488 # the max length. Look at just the maximal initial
8489 # segment of the line
8490 my $segment = substr($line[$i], 0, $max - 1);
8491 if ($segment =~
8492 /^ ( .{$hanging_indent} # Don't look before the
8493 # indent.
8494 \ * # Don't look in leading
8495 # blanks past the indent
8496 [^ ] .* # Find the right-most
8497 (?: # acceptable break:
8498 [ \s = ] # space or equal
8499 | - (?! [.0-9] ) # or non-unary minus.
8500 ) # $1 includes the character
8501 )/x)
8502 {
8503 # Split into the initial part that fits, and remaining
8504 # part of the input
8505 $remainder = substr($line[$i], length $1);
8506 $line[$i] = $1;
8507 trace $line[$i] if DEBUG && $to_trace;
8508 trace $remainder if DEBUG && $to_trace;
8509 }
8510
8511 # If didn't find a good breaking spot, see if there is a
8512 # not-so-good breaking spot. These are just after
8513 # underscores or where the case changes from lower to
8514 # upper. Use \a as a soft hyphen, but give up
8515 # and don't break the line if there is actually a \a
8516 # already in the input. We use an ascii character for the
8517 # soft-hyphen to avoid any attempt by miniperl to try to
8518 # access the files that this program is creating.
8519 elsif ($segment !~ /\a/
8520 && ($segment =~ s/_/_\a/g
8521 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8522 {
8523 # Here were able to find at least one place to insert
8524 # our substitute soft hyphen. Find the right-most one
8525 # and replace it by a real hyphen.
8526 trace $segment if DEBUG && $to_trace;
8527 substr($segment,
8528 rindex($segment, "\a"),
8529 1) = '-';
8530
8531 # Then remove the soft hyphen substitutes.
8532 $segment =~ s/\a//g;
8533 trace $segment if DEBUG && $to_trace;
8534
8535 # And split into the initial part that fits, and
8536 # remainder of the line
8537 my $pos = rindex($segment, '-');
8538 $remainder = substr($line[$i], $pos);
8539 trace $remainder if DEBUG && $to_trace;
8540 $line[$i] = substr($segment, 0, $pos + 1);
8541 }
8542 }
8543
8544 # Here we know if we can fold or not. If we can, $remainder
8545 # is what remains to be processed in the next iteration.
8546 if (defined $remainder) {
8547 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8548
8549 # Insert the folded remainder of the line as a new element
8550 # of the array. (It may still be too long, but we will
8551 # deal with that next time through the loop.) Omit any
8552 # leading space in the remainder.
8553 $remainder =~ s/^\s+//;
8554 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8555
8556 # But then indent by whichever is larger of:
8557 # 1) the leading space on the input line;
8558 # 2) the hanging indent.
8559 # This preserves indentation in the original line.
8560 my $lead = ($leading_space)
8561 ? length $leading_space
8562 : $hanging_indent;
8563 $lead = max($lead, $hanging_indent);
8564 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8565 }
8566 }
8567
8568 # Ready to output the line. Get rid of any trailing space
8569 # And prefix by the required $prefix passed in.
8570 $line[$i] =~ s/\s+$//;
8571 $line[$i] = "$prefix$line[$i]\n";
8572 } # End of looping through all the lines.
8573
8574 return join "", @line;
8575}
8576
8577sub property_ref { # Returns a reference to a property object.
8578 return Property::property_ref(@_);
8579}
8580
8581sub force_unlink ($) {
8582 my $filename = shift;
8583 return unless file_exists($filename);
8584 return if CORE::unlink($filename);
8585
8586 # We might need write permission
8587 chmod 0777, $filename;
8588 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8589 return;
8590}
8591
9218f1cf 8592sub write ($$@) {
9abe8df8
KW
8593 # Given a filename and references to arrays of lines, write the lines of
8594 # each array to the file
99870f4d
KW
8595 # Filename can be given as an arrayref of directory names
8596
9218f1cf 8597 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
99870f4d 8598
9abe8df8 8599 my $file = shift;
9218f1cf 8600 my $use_utf8 = shift;
99870f4d
KW
8601
8602 # Get into a single string if an array, and get rid of, in Unix terms, any
8603 # leading '.'
8604 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8605 $file = File::Spec->canonpath($file);
8606
8607 # If has directories, make sure that they all exist
8608 (undef, my $directories, undef) = File::Spec->splitpath($file);
8609 File::Path::mkpath($directories) if $directories && ! -d $directories;
8610
8611 push @files_actually_output, $file;
8612
99870f4d
KW
8613 force_unlink ($file);
8614
8615 my $OUT;
8616 if (not open $OUT, ">", $file) {
8617 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8618 return;
8619 }
430ada4c 8620
9218f1cf
KW
8621 binmode $OUT, ":utf8" if $use_utf8;
8622
9abe8df8
KW
8623 while (defined (my $lines_ref = shift)) {
8624 unless (@$lines_ref) {
8625 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8626 }
8627
8628 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8629 }
430ada4c
NC
8630 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8631
99870f4d
KW
8632 print "$file written.\n" if $verbosity >= $VERBOSE;
8633
99870f4d
KW
8634 return;
8635}
8636
8637
8638sub Standardize($) {
8639 # This converts the input name string into a standardized equivalent to
8640 # use internally.
8641
8642 my $name = shift;
8643 unless (defined $name) {
8644 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8645 return;
8646 }
8647
8648 # Remove any leading or trailing white space
8649 $name =~ s/^\s+//g;
8650 $name =~ s/\s+$//g;
8651
98dc9551 8652 # Convert interior white space and hyphens into underscores.
99870f4d
KW
8653 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8654
8655 # Capitalize the letter following an underscore, and convert a sequence of
8656 # multiple underscores to a single one
8657 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8658
8659 # And capitalize the first letter, but not for the special cjk ones.
8660 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8661 return $name;
8662}
8663
8664sub standardize ($) {
8665 # Returns a lower-cased standardized name, without underscores. This form
8666 # is chosen so that it can distinguish between any real versus superficial
8667 # Unicode name differences. It relies on the fact that Unicode doesn't
8668 # have interior underscores, white space, nor dashes in any
8669 # stricter-matched name. It should not be used on Unicode code point
8670 # names (the Name property), as they mostly, but not always follow these
8671 # rules.
8672
8673 my $name = Standardize(shift);
8674 return if !defined $name;
8675
8676 $name =~ s/ (?<= .) _ (?= . ) //xg;
8677 return lc $name;
8678}
8679
c85f591a
KW
8680sub utf8_heavy_name ($$) {
8681 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8682 # perhaps this function should be placed somewhere, like Heavy.pl so that
8683 # utf8_heavy can use it directly without duplicating code that can get
8684 # out-of sync.
8685
8686 my $table = shift;
8687 my $alias = shift;
8688 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8689
8690 my $property = $table->property;
8691 $property = ($property == $perl)
8692 ? "" # 'perl' is never explicitly stated
8693 : standardize($property->name) . '=';
8694 if ($alias->loose_match) {
8695 return $property . standardize($alias->name);
8696 }
8697 else {
8698 return lc ($property . $alias->name);
8699 }
8700
8701 return;
8702}
8703
99870f4d
KW
8704{ # Closure
8705
7e3121cc 8706 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
99870f4d
KW
8707 my %already_output;
8708
8709 $main::simple_dumper_nesting = 0;
8710
8711 sub simple_dumper {
8712 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8713 # the real thing as we have to run under miniperl.
8714
8715 # It is designed so that on input it is at the beginning of a line,
8716 # and the final thing output in any call is a trailing ",\n".
8717
8718 my $item = shift;
8719 my $indent = shift;
7e3121cc 8720 $indent = "" if ! $debugging_build || ! defined $indent;
99870f4d
KW
8721
8722 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8723
8724 # nesting level is localized, so that as the call stack pops, it goes
8725 # back to the prior value.
8726 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8727 undef %already_output if $main::simple_dumper_nesting == 0;
8728 $main::simple_dumper_nesting++;
8729 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8730
8731 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8732
8733 # Determine the indent for recursive calls.
8734 my $next_indent = $indent . $indent_increment;
8735
8736 my $output;
8737 if (! ref $item) {
8738
8739 # Dump of scalar: just output it in quotes if not a number. To do
8740 # so we must escape certain characters, and therefore need to
8741 # operate on a copy to avoid changing the original
8742 my $copy = $item;
8743 $copy = $UNDEF unless defined $copy;
8744
02cc6656
KW
8745 # Quote non-integers (integers also have optional leading '-')
8746 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
99870f4d
KW
8747
8748 # Escape apostrophe and backslash
8749 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8750 $copy = "'$copy'";
8751 }
8752 $output = "$indent$copy,\n";
8753 }
8754 else {
8755
8756 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 8757 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 8758 if (defined $already_output{$addr}) {
99870f4d
KW
8759 return "${indent}ALREADY OUTPUT: $item\n";
8760 }
f998e60c 8761 $already_output{$addr} = $item;
99870f4d
KW
8762
8763 if (ref $item eq 'ARRAY') {
8764 my $using_brackets;
8765 $output = $indent;
8766 if ($main::simple_dumper_nesting > 1) {
8767 $output .= '[';
8768 $using_brackets = 1;
8769 }
8770 else {
8771 $using_brackets = 0;
8772 }
8773
8774 # If the array is empty, put the closing bracket on the same
8775 # line. Otherwise, recursively add each array element
8776 if (@$item == 0) {
8777 $output .= " ";
8778 }
8779 else {
8780 $output .= "\n";
8781 for (my $i = 0; $i < @$item; $i++) {
8782
8783 # Indent array elements one level
8784 $output .= &simple_dumper($item->[$i], $next_indent);
7e3121cc 8785 next if ! $debugging_build;
c12f2655
KW
8786 $output =~ s/\n$//; # Remove any trailing nl so
8787 $output .= " # [$i]\n"; # as to add a comment giving
8788 # the array index
99870f4d
KW
8789 }
8790 $output .= $indent; # Indent closing ']' to orig level
8791 }
8792 $output .= ']' if $using_brackets;
8793 $output .= ",\n";
8794 }
8795 elsif (ref $item eq 'HASH') {
8796 my $is_first_line;
8797 my $using_braces;
8798 my $body_indent;
8799
8800 # No surrounding braces at top level
8801 $output .= $indent;
8802 if ($main::simple_dumper_nesting > 1) {
8803 $output .= "{\n";
8804 $is_first_line = 0;
8805 $body_indent = $next_indent;
8806 $next_indent .= $indent_increment;
8807 $using_braces = 1;
8808 }
8809 else {
8810 $is_first_line = 1;
8811 $body_indent = $indent;
8812 $using_braces = 0;
8813 }
8814
8815 # Output hashes sorted alphabetically instead of apparently
8816 # random. Use caseless alphabetic sort
8817 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8818 {
8819 if ($is_first_line) {
8820 $is_first_line = 0;
8821 }
8822 else {
8823 $output .= "$body_indent";
8824 }
8825
8826 # The key must be a scalar, but this recursive call quotes
8827 # it
8828 $output .= &simple_dumper($key);
8829
8830 # And change the trailing comma and nl to the hash fat
8831 # comma for clarity, and so the value can be on the same
8832 # line
8833 $output =~ s/,\n$/ => /;
8834
8835 # Recursively call to get the value's dump.
8836 my $next = &simple_dumper($item->{$key}, $next_indent);
8837
8838 # If the value is all on one line, remove its indent, so
8839 # will follow the => immediately. If it takes more than
8840 # one line, start it on a new line.
8841 if ($next !~ /\n.*\n/) {
8842 $next =~ s/^ *//;
8843 }
8844 else {
8845 $output .= "\n";
8846 }
8847 $output .= $next;
8848 }
8849
8850 $output .= "$indent},\n" if $using_braces;
8851 }
8852 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8853 $output = $indent . ref($item) . "\n";
8854 # XXX see if blessed
8855 }
8856 elsif ($item->can('dump')) {
8857
8858 # By convention in this program, objects furnish a 'dump'
8859 # method. Since not doing any output at this level, just pass
8860 # on the input indent
8861 $output = $item->dump($indent);
8862 }
8863 else {
8864 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8865 }
8866 }
8867 return $output;
8868 }
8869}
8870
8871sub dump_inside_out {
8872 # Dump inside-out hashes in an object's state by converting them to a
8873 # regular hash and then calling simple_dumper on that.
8874
8875 my $object = shift;
8876 my $fields_ref = shift;
8877 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8878
ffe43484 8879 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
8880
8881 my %hash;
8882 foreach my $key (keys %$fields_ref) {
8883 $hash{$key} = $fields_ref->{$key}{$addr};
8884 }
8885
8886 return simple_dumper(\%hash, @_);
8887}
8888
8889sub _operator_dot {
8890 # Overloaded '.' method that is common to all packages. It uses the
8891 # package's stringify method.
8892
8893 my $self = shift;
8894 my $other = shift;
8895 my $reversed = shift;
8896 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8897
8898 $other = "" unless defined $other;
8899
8900 foreach my $which (\$self, \$other) {
8901 next unless ref $$which;
8902 if ($$which->can('_operator_stringify')) {
8903 $$which = $$which->_operator_stringify;
8904 }
8905 else {
8906 my $ref = ref $$which;
ffe43484 8907 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
8908 $$which = "$ref ($addr)";
8909 }
8910 }
8911 return ($reversed)
8912 ? "$other$self"
8913 : "$self$other";
8914}
8915
1285127e
KW
8916sub _operator_dot_equal {
8917 # Overloaded '.=' method that is common to all packages.
8918
8919 my $self = shift;
8920 my $other = shift;
8921 my $reversed = shift;
8922 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8923
8924 $other = "" unless defined $other;
8925
8926 if ($reversed) {
8927 return $other .= "$self";
8928 }
8929 else {
8930 return "$self" . "$other";
8931 }
8932}
8933
99870f4d
KW
8934sub _operator_equal {
8935 # Generic overloaded '==' routine. To be equal, they must be the exact
8936 # same object
8937
8938 my $self = shift;
8939 my $other = shift;
8940
8941 return 0 unless defined $other;
8942 return 0 unless ref $other;
f998e60c 8943 no overloading;
2100aa98 8944 return $self == $other;
99870f4d
KW
8945}
8946
8947sub _operator_not_equal {
8948 my $self = shift;
8949 my $other = shift;
8950
8951 return ! _operator_equal($self, $other);
8952}
8953
8954sub process_PropertyAliases($) {
8955 # This reads in the PropertyAliases.txt file, which contains almost all
8956 # the character properties in Unicode and their equivalent aliases:
8957 # scf ; Simple_Case_Folding ; sfc
8958 #
8959 # Field 0 is the preferred short name for the property.
8960 # Field 1 is the full name.
8961 # Any succeeding ones are other accepted names.
8962
8963 my $file= shift;
8964 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8965
8966 # This whole file was non-existent in early releases, so use our own
8967 # internal one.
8968 $file->insert_lines(get_old_property_aliases())
8969 if ! -e 'PropertyAliases.txt';
8970
8971 # Add any cjk properties that may have been defined.
8972 $file->insert_lines(@cjk_properties);
8973
8974 while ($file->next_line) {
8975
8976 my @data = split /\s*;\s*/;
8977
8978 my $full = $data[1];
8979
8980 my $this = Property->new($data[0], Full_Name => $full);
8981
8982 # Start looking for more aliases after these two.
8983 for my $i (2 .. @data - 1) {
8984 $this->add_alias($data[$i]);
8985 }
8986
8987 }
90e64982
KW
8988
8989 my $scf = property_ref("Simple_Case_Folding");
8990 $scf->add_alias("scf");
8991 $scf->add_alias("sfc");
8992
99870f4d
KW
8993 return;
8994}
8995
8996sub finish_property_setup {
8997 # Finishes setting up after PropertyAliases.
8998
8999 my $file = shift;
9000 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9001
9002 # This entry was missing from this file in earlier Unicode versions
9f27388f
KW
9003 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9004 Property->new('JSN', Full_Name => 'Jamo_Short_Name');
99870f4d
KW
9005 }
9006
30769324
KW
9007 # These two properties must be defined in all releases so we can generate
9008 # the tables from them to make regex \X work, but suppress their output so
9009 # aren't application visible prior to releases where they should be
9010 if (! defined property_ref('GCB')) {
9011 Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9012 Fate => $PLACEHOLDER);
9013 }
9014 if (! defined property_ref('hst')) {
9015 Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9016 Fate => $PLACEHOLDER);
9017 }
9018
99870f4d
KW
9019 # These are used so much, that we set globals for them.
9020 $gc = property_ref('General_Category');
9021 $block = property_ref('Block');
359523e2 9022 $script = property_ref('Script');
99870f4d
KW
9023
9024 # Perl adds this alias.
9025 $gc->add_alias('Category');
9026
d3cbe105
KW
9027 # Unicode::Normalize expects this file with this name and directory.
9028 my $ccc = property_ref('Canonical_Combining_Class');
9029 if (defined $ccc) {
9030 $ccc->set_file('CombiningClass');
9031 $ccc->set_directory(File::Spec->curdir());
9032 }
9033
99870f4d
KW
9034 # These two properties aren't actually used in the core, but unfortunately
9035 # the names just above that are in the core interfere with these, so
9036 # choose different names. These aren't a problem unless the map tables
9037 # for these files get written out.
9038 my $lowercase = property_ref('Lowercase');
9039 $lowercase->set_file('IsLower') if defined $lowercase;
9040 my $uppercase = property_ref('Uppercase');
9041 $uppercase->set_file('IsUpper') if defined $uppercase;
9042
9043 # Set up the hard-coded default mappings, but only on properties defined
9044 # for this release
9045 foreach my $property (keys %default_mapping) {
9046 my $property_object = property_ref($property);
9047 next if ! defined $property_object;
9048 my $default_map = $default_mapping{$property};
9049 $property_object->set_default_map($default_map);
9050
9051 # A map of <code point> implies the property is string.
9052 if ($property_object->type == $UNKNOWN
9053 && $default_map eq $CODE_POINT)
9054 {
9055 $property_object->set_type($STRING);
9056 }
9057 }
9058
9059 # The following use the Multi_Default class to create objects for
9060 # defaults.
9061
9062 # Bidi class has a complicated default, but the derived file takes care of
9063 # the complications, leaving just 'L'.
9064 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9065 property_ref('Bidi_Class')->set_default_map('L');
9066 }
9067 else {
9068 my $default;
9069
9070 # The derived file was introduced in 3.1.1. The values below are
9071 # taken from table 3-8, TUS 3.0
9072 my $default_R =
9073 'my $default = Range_List->new;
9074 $default->add_range(0x0590, 0x05FF);
9075 $default->add_range(0xFB1D, 0xFB4F);'
9076 ;
9077
9078 # The defaults apply only to unassigned characters
a67f160a 9079 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
9080
9081 if ($v_version lt v3.0.0) {
9082 $default = Multi_Default->new(R => $default_R, 'L');
9083 }
9084 else {
9085
9086 # AL apparently not introduced until 3.0: TUS 2.x references are
9087 # not on-line to check it out
9088 my $default_AL =
9089 'my $default = Range_List->new;
9090 $default->add_range(0x0600, 0x07BF);
9091 $default->add_range(0xFB50, 0xFDFF);
9092 $default->add_range(0xFE70, 0xFEFF);'
9093 ;
9094
9095 # Non-character code points introduced in this release; aren't AL
9096 if ($v_version ge 3.1.0) {
9097 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9098 }
a67f160a 9099 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
9100 $default = Multi_Default->new(AL => $default_AL,
9101 R => $default_R,
9102 'L');
9103 }
9104 property_ref('Bidi_Class')->set_default_map($default);
9105 }
9106
9107 # Joining type has a complicated default, but the derived file takes care
9108 # of the complications, leaving just 'U' (or Non_Joining), except the file
9109 # is bad in 3.1.0
9110 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9111 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9112 property_ref('Joining_Type')->set_default_map('Non_Joining');
9113 }
9114 else {
9115
9116 # Otherwise, there are not one, but two possibilities for the
9117 # missing defaults: T and U.
9118 # The missing defaults that evaluate to T are given by:
9119 # T = Mn + Cf - ZWNJ - ZWJ
9120 # where Mn and Cf are the general category values. In other words,
9121 # any non-spacing mark or any format control character, except
9122 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9123 # WIDTH JOINER (joining type C).
9124 my $default = Multi_Default->new(
9125 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9126 'Non_Joining');
9127 property_ref('Joining_Type')->set_default_map($default);
9128 }
9129 }
9130
9131 # Line break has a complicated default in early releases. It is 'Unknown'
9132 # for non-assigned code points; 'AL' for assigned.
9133 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9134 my $lb = property_ref('Line_Break');
9135 if ($v_version gt 3.2.0) {
9136 $lb->set_default_map('Unknown');
9137 }
9138 else {
9139 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9140 'AL');
9141 $lb->set_default_map($default);
9142 }
9143
9144 # If has the URS property, make sure that the standard aliases are in
9145 # it, since not in the input tables in some versions.
9146 my $urs = property_ref('Unicode_Radical_Stroke');
9147 if (defined $urs) {
9148 $urs->add_alias('cjkRSUnicode');
9149 $urs->add_alias('kRSUnicode');
9150 }
9151 }
f64b46a1
KW
9152
9153 # For backwards compatibility with applications that may read the mapping
9154 # file directly (it was documented in 5.12 and 5.14 as being thusly
ce712c88 9155 # usable), keep it from being adjusted. (range_size_1 is
f64b46a1
KW
9156 # used to force the traditional format.)
9157 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9158 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9159 $nfkc_cf->set_range_size_1(1);
9160 }
9161 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9162 $bmg->set_to_output_map($EXTERNAL_MAP);
9163 $bmg->set_range_size_1(1);
9164 }
9165
ce712c88 9166 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
4f143a72 9167
99870f4d
KW
9168 return;
9169}
9170
9171sub get_old_property_aliases() {
9172 # Returns what would be in PropertyAliases.txt if it existed in very old
9173 # versions of Unicode. It was derived from the one in 3.2, and pared
9174 # down based on the data that was actually in the older releases.
9175 # An attempt was made to use the existence of files to mean inclusion or
9176 # not of various aliases, but if this was not sufficient, using version
9177 # numbers was resorted to.
9178
9179 my @return;
9180
9181 # These are to be used in all versions (though some are constructed by
9182 # this program if missing)
9183 push @return, split /\n/, <<'END';
9184bc ; Bidi_Class
9185Bidi_M ; Bidi_Mirrored
9186cf ; Case_Folding
9187ccc ; Canonical_Combining_Class
9188dm ; Decomposition_Mapping
9189dt ; Decomposition_Type
9190gc ; General_Category
9191isc ; ISO_Comment
9192lc ; Lowercase_Mapping
9193na ; Name
9194na1 ; Unicode_1_Name
9195nt ; Numeric_Type
9196nv ; Numeric_Value
90e64982 9197scf ; Simple_Case_Folding
99870f4d
KW
9198slc ; Simple_Lowercase_Mapping
9199stc ; Simple_Titlecase_Mapping
9200suc ; Simple_Uppercase_Mapping
9201tc ; Titlecase_Mapping
9202uc ; Uppercase_Mapping
9203END
9204
9205 if (-e 'Blocks.txt') {
9206 push @return, "blk ; Block\n";
9207 }
9208 if (-e 'ArabicShaping.txt') {
9209 push @return, split /\n/, <<'END';
9210jg ; Joining_Group
9211jt ; Joining_Type
9212END
9213 }
9214 if (-e 'PropList.txt') {
9215
9216 # This first set is in the original old-style proplist.
9217 push @return, split /\n/, <<'END';
99870f4d
KW
9218Bidi_C ; Bidi_Control
9219Dash ; Dash
9220Dia ; Diacritic
9221Ext ; Extender
9222Hex ; Hex_Digit
9223Hyphen ; Hyphen
9224IDC ; ID_Continue
9225Ideo ; Ideographic
9226Join_C ; Join_Control
9227Math ; Math
9228QMark ; Quotation_Mark
9229Term ; Terminal_Punctuation
9230WSpace ; White_Space
9231END
9232 # The next sets were added later
9233 if ($v_version ge v3.0.0) {
9234 push @return, split /\n/, <<'END';
9235Upper ; Uppercase
9236Lower ; Lowercase
9237END
9238 }
9239 if ($v_version ge v3.0.1) {
9240 push @return, split /\n/, <<'END';
9241NChar ; Noncharacter_Code_Point
9242END
9243 }
9244 # The next sets were added in the new-style
9245 if ($v_version ge v3.1.0) {
9246 push @return, split /\n/, <<'END';
9247OAlpha ; Other_Alphabetic
9248OLower ; Other_Lowercase
9249OMath ; Other_Math
9250OUpper ; Other_Uppercase
9251END
9252 }
9253 if ($v_version ge v3.1.1) {
9254 push @return, "AHex ; ASCII_Hex_Digit\n";
9255 }
9256 }
9257 if (-e 'EastAsianWidth.txt') {
9258 push @return, "ea ; East_Asian_Width\n";
9259 }
9260 if (-e 'CompositionExclusions.txt') {
9261 push @return, "CE ; Composition_Exclusion\n";
9262 }
9263 if (-e 'LineBreak.txt') {
9264 push @return, "lb ; Line_Break\n";
9265 }
9266 if (-e 'BidiMirroring.txt') {
9267 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
9268 }
9269 if (-e 'Scripts.txt') {
9270 push @return, "sc ; Script\n";
9271 }
9272 if (-e 'DNormalizationProps.txt') {
9273 push @return, split /\n/, <<'END';
9274Comp_Ex ; Full_Composition_Exclusion
9275FC_NFKC ; FC_NFKC_Closure
9276NFC_QC ; NFC_Quick_Check
9277NFD_QC ; NFD_Quick_Check
9278NFKC_QC ; NFKC_Quick_Check
9279NFKD_QC ; NFKD_Quick_Check
9280XO_NFC ; Expands_On_NFC
9281XO_NFD ; Expands_On_NFD
9282XO_NFKC ; Expands_On_NFKC
9283XO_NFKD ; Expands_On_NFKD
9284END
9285 }
9286 if (-e 'DCoreProperties.txt') {
9287 push @return, split /\n/, <<'END';
0ff33a84 9288Alpha ; Alphabetic
99870f4d
KW
9289IDS ; ID_Start
9290XIDC ; XID_Continue
9291XIDS ; XID_Start
9292END
9293 # These can also appear in some versions of PropList.txt
9294 push @return, "Lower ; Lowercase\n"
9295 unless grep { $_ =~ /^Lower\b/} @return;
9296 push @return, "Upper ; Uppercase\n"
9297 unless grep { $_ =~ /^Upper\b/} @return;
9298 }
9299
9300 # This flag requires the DAge.txt file to be copied into the directory.
9301 if (DEBUG && $compare_versions) {
9302 push @return, 'age ; Age';
9303 }
9304
9305 return @return;
9306}
9307
9308sub process_PropValueAliases {
9309 # This file contains values that properties look like:
9310 # bc ; AL ; Arabic_Letter
9311 # blk; n/a ; Greek_And_Coptic ; Greek
9312 #
9313 # Field 0 is the property.
9314 # Field 1 is the short name of a property value or 'n/a' if no
9315 # short name exists;
9316 # Field 2 is the full property value name;
9317 # Any other fields are more synonyms for the property value.
9318 # Purely numeric property values are omitted from the file; as are some
9319 # others, fewer and fewer in later releases
9320
9321 # Entries for the ccc property have an extra field before the
9322 # abbreviation:
9323 # ccc; 0; NR ; Not_Reordered
9324 # It is the numeric value that the names are synonyms for.
9325
9326 # There are comment entries for values missing from this file:
9327 # # @missing: 0000..10FFFF; ISO_Comment; <none>
9328 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9329
9330 my $file= shift;
9331 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9332
9333 # This whole file was non-existent in early releases, so use our own
9334 # internal one if necessary.
9335 if (! -e 'PropValueAliases.txt') {
9336 $file->insert_lines(get_old_property_value_aliases());
9337 }
9338
30769324
KW
9339 if ($v_version lt 4.0.0) {
9340 $file->insert_lines(split /\n/, <<'END'
9341hst; L ; Leading_Jamo
9342hst; LV ; LV_Syllable
9343hst; LVT ; LVT_Syllable
9344hst; NA ; Not_Applicable
9345hst; T ; Trailing_Jamo
9346hst; V ; Vowel_Jamo
9347END
9348 );
9349 }
9350 if ($v_version lt 4.1.0) {
9351 $file->insert_lines(split /\n/, <<'END'
9352GCB; CN ; Control
9353GCB; CR ; CR
9354GCB; EX ; Extend
9355GCB; L ; L
9356GCB; LF ; LF
9357GCB; LV ; LV
9358GCB; LVT ; LVT
9359GCB; T ; T
9360GCB; V ; V
9361GCB; XX ; Other
9362END
9363 );
9364 }
9365
9366
99870f4d
KW
9367 # Add any explicit cjk values
9368 $file->insert_lines(@cjk_property_values);
9369
9370 # This line is used only for testing the code that checks for name
9371 # conflicts. There is a script Inherited, and when this line is executed
9372 # it causes there to be a name conflict with the 'Inherited' that this
9373 # program generates for this block property value
9374 #$file->insert_lines('blk; n/a; Herited');
9375
9376
9377 # Process each line of the file ...
9378 while ($file->next_line) {
9379
74090492
KW
9380 # Fix typo in input file
9381 s/CCC133/CCC132/g if $v_version eq v6.1.0;
9382
99870f4d
KW
9383 my ($property, @data) = split /\s*;\s*/;
9384
66b4eb0a
KW
9385 # The ccc property has an extra field at the beginning, which is the
9386 # numeric value. Move it to be after the other two, mnemonic, fields,
9387 # so that those will be used as the property value's names, and the
9388 # number will be an extra alias. (Rightmost splice removes field 1-2,
9389 # returning them in a slice; left splice inserts that before anything,
9390 # thus shifting the former field 0 to after them.)
9391 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9392
9393 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
9394 # there is no short name, use the full one in element 1
027866c1
KW
9395 if ($data[0] eq "n/a") {
9396 $data[0] = $data[1];
9397 }
9398 elsif ($data[0] ne $data[1]
9399 && standardize($data[0]) eq standardize($data[1])
9400 && $data[1] !~ /[[:upper:]]/)
9401 {
9402 # Also, there is a bug in the file in which "n/a" is omitted, and
9403 # the two fields are identical except for case, and the full name
9404 # is all lower case. Copy the "short" name unto the full one to
9405 # give it some upper case.
9406
9407 $data[1] = $data[0];
9408 }
99870f4d
KW
9409
9410 # Earlier releases had the pseudo property 'qc' that should expand to
9411 # the ones that replace it below.
9412 if ($property eq 'qc') {
9413 if (lc $data[0] eq 'y') {
9414 $file->insert_lines('NFC_QC; Y ; Yes',
9415 'NFD_QC; Y ; Yes',
9416 'NFKC_QC; Y ; Yes',
9417 'NFKD_QC; Y ; Yes',
9418 );
9419 }
9420 elsif (lc $data[0] eq 'n') {
9421 $file->insert_lines('NFC_QC; N ; No',
9422 'NFD_QC; N ; No',
9423 'NFKC_QC; N ; No',
9424 'NFKD_QC; N ; No',
9425 );
9426 }
9427 elsif (lc $data[0] eq 'm') {
9428 $file->insert_lines('NFC_QC; M ; Maybe',
9429 'NFKC_QC; M ; Maybe',
9430 );
9431 }
9432 else {
9433 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9434 }
9435 next;
9436 }
9437
9438 # The first field is the short name, 2nd is the full one.
9439 my $property_object = property_ref($property);
9440 my $table = $property_object->add_match_table($data[0],
9441 Full_Name => $data[1]);
9442
9443 # Start looking for more aliases after these two.
9444 for my $i (2 .. @data - 1) {
9445 $table->add_alias($data[$i]);
9446 }
9447 } # End of looping through the file
9448
9449 # As noted in the comments early in the program, it generates tables for
9450 # the default values for all releases, even those for which the concept
9451 # didn't exist at the time. Here we add those if missing.
9452 my $age = property_ref('age');
9453 if (defined $age && ! defined $age->table('Unassigned')) {
9454 $age->add_match_table('Unassigned');
9455 }
9456 $block->add_match_table('No_Block') if -e 'Blocks.txt'
9457 && ! defined $block->table('No_Block');
9458
9459
9460 # Now set the default mappings of the properties from the file. This is
9461 # done after the loop because a number of properties have only @missings
9462 # entries in the file, and may not show up until the end.
9463 my @defaults = $file->get_missings;
9464 foreach my $default_ref (@defaults) {
9465 my $default = $default_ref->[0];
9466 my $property = property_ref($default_ref->[1]);
9467 $property->set_default_map($default);
9468 }
9469 return;
9470}
9471
9472sub get_old_property_value_aliases () {
9473 # Returns what would be in PropValueAliases.txt if it existed in very old
9474 # versions of Unicode. It was derived from the one in 3.2, and pared
9475 # down. An attempt was made to use the existence of files to mean
9476 # inclusion or not of various aliases, but if this was not sufficient,
9477 # using version numbers was resorted to.
9478
9479 my @return = split /\n/, <<'END';
9480bc ; AN ; Arabic_Number
9481bc ; B ; Paragraph_Separator
9482bc ; CS ; Common_Separator
9483bc ; EN ; European_Number
9484bc ; ES ; European_Separator
9485bc ; ET ; European_Terminator
9486bc ; L ; Left_To_Right
9487bc ; ON ; Other_Neutral
9488bc ; R ; Right_To_Left
9489bc ; WS ; White_Space
9490
90e64982
KW
9491Bidi_M; N; No; F; False
9492Bidi_M; Y; Yes; T; True
9493
99870f4d
KW
9494# The standard combining classes are very much different in v1, so only use
9495# ones that look right (not checked thoroughly)
9496ccc; 0; NR ; Not_Reordered
9497ccc; 1; OV ; Overlay
9498ccc; 7; NK ; Nukta
9499ccc; 8; KV ; Kana_Voicing
9500ccc; 9; VR ; Virama
9501ccc; 202; ATBL ; Attached_Below_Left
9502ccc; 216; ATAR ; Attached_Above_Right
9503ccc; 218; BL ; Below_Left
9504ccc; 220; B ; Below
9505ccc; 222; BR ; Below_Right
9506ccc; 224; L ; Left
9507ccc; 228; AL ; Above_Left
9508ccc; 230; A ; Above
9509ccc; 232; AR ; Above_Right
9510ccc; 234; DA ; Double_Above
9511
9512dt ; can ; canonical
9513dt ; enc ; circle
9514dt ; fin ; final
9515dt ; font ; font
9516dt ; fra ; fraction
9517dt ; init ; initial
9518dt ; iso ; isolated
9519dt ; med ; medial
9520dt ; n/a ; none
9521dt ; nb ; noBreak
9522dt ; sqr ; square
9523dt ; sub ; sub
9524dt ; sup ; super
9525
9526gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9527gc ; Cc ; Control
9528gc ; Cn ; Unassigned
9529gc ; Co ; Private_Use
9530gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9531gc ; LC ; Cased_Letter # Ll | Lt | Lu
9532gc ; Ll ; Lowercase_Letter
9533gc ; Lm ; Modifier_Letter
9534gc ; Lo ; Other_Letter
9535gc ; Lu ; Uppercase_Letter
9536gc ; M ; Mark # Mc | Me | Mn
9537gc ; Mc ; Spacing_Mark
9538gc ; Mn ; Nonspacing_Mark
9539gc ; N ; Number # Nd | Nl | No
9540gc ; Nd ; Decimal_Number
9541gc ; No ; Other_Number
9542gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9543gc ; Pd ; Dash_Punctuation
9544gc ; Pe ; Close_Punctuation
9545gc ; Po ; Other_Punctuation
9546gc ; Ps ; Open_Punctuation
9547gc ; S ; Symbol # Sc | Sk | Sm | So
9548gc ; Sc ; Currency_Symbol
9549gc ; Sm ; Math_Symbol
9550gc ; So ; Other_Symbol
9551gc ; Z ; Separator # Zl | Zp | Zs
9552gc ; Zl ; Line_Separator
9553gc ; Zp ; Paragraph_Separator
9554gc ; Zs ; Space_Separator
9555
9556nt ; de ; Decimal
9557nt ; di ; Digit
9558nt ; n/a ; None
9559nt ; nu ; Numeric
9560END
9561
9562 if (-e 'ArabicShaping.txt') {
9563 push @return, split /\n/, <<'END';
9564jg ; n/a ; AIN
9565jg ; n/a ; ALEF
9566jg ; n/a ; DAL
9567jg ; n/a ; GAF
9568jg ; n/a ; LAM
9569jg ; n/a ; MEEM
9570jg ; n/a ; NO_JOINING_GROUP
9571jg ; n/a ; NOON
9572jg ; n/a ; QAF
9573jg ; n/a ; SAD
9574jg ; n/a ; SEEN
9575jg ; n/a ; TAH
9576jg ; n/a ; WAW
9577
9578jt ; C ; Join_Causing
9579jt ; D ; Dual_Joining
9580jt ; L ; Left_Joining
9581jt ; R ; Right_Joining
9582jt ; U ; Non_Joining
9583jt ; T ; Transparent
9584END
9585 if ($v_version ge v3.0.0) {
9586 push @return, split /\n/, <<'END';
9587jg ; n/a ; ALAPH
9588jg ; n/a ; BEH
9589jg ; n/a ; BETH
9590jg ; n/a ; DALATH_RISH
9591jg ; n/a ; E
9592jg ; n/a ; FEH
9593jg ; n/a ; FINAL_SEMKATH
9594jg ; n/a ; GAMAL
9595jg ; n/a ; HAH
9596jg ; n/a ; HAMZA_ON_HEH_GOAL
9597jg ; n/a ; HE
9598jg ; n/a ; HEH
9599jg ; n/a ; HEH_GOAL
9600jg ; n/a ; HETH
9601jg ; n/a ; KAF
9602jg ; n/a ; KAPH
9603jg ; n/a ; KNOTTED_HEH
9604jg ; n/a ; LAMADH
9605jg ; n/a ; MIM
9606jg ; n/a ; NUN
9607jg ; n/a ; PE
9608jg ; n/a ; QAPH
9609jg ; n/a ; REH
9610jg ; n/a ; REVERSED_PE
9611jg ; n/a ; SADHE
9612jg ; n/a ; SEMKATH
9613jg ; n/a ; SHIN
9614jg ; n/a ; SWASH_KAF
9615jg ; n/a ; TAW
9616jg ; n/a ; TEH_MARBUTA
9617jg ; n/a ; TETH
9618jg ; n/a ; YEH
9619jg ; n/a ; YEH_BARREE
9620jg ; n/a ; YEH_WITH_TAIL
9621jg ; n/a ; YUDH
9622jg ; n/a ; YUDH_HE
9623jg ; n/a ; ZAIN
9624END
9625 }
9626 }
9627
9628
9629 if (-e 'EastAsianWidth.txt') {
9630 push @return, split /\n/, <<'END';
9631ea ; A ; Ambiguous
9632ea ; F ; Fullwidth
9633ea ; H ; Halfwidth
9634ea ; N ; Neutral
9635ea ; Na ; Narrow
9636ea ; W ; Wide
9637END
9638 }
9639
9640 if (-e 'LineBreak.txt') {
9641 push @return, split /\n/, <<'END';
9642lb ; AI ; Ambiguous
9643lb ; AL ; Alphabetic
9644lb ; B2 ; Break_Both
9645lb ; BA ; Break_After
9646lb ; BB ; Break_Before
9647lb ; BK ; Mandatory_Break
9648lb ; CB ; Contingent_Break
9649lb ; CL ; Close_Punctuation
9650lb ; CM ; Combining_Mark
9651lb ; CR ; Carriage_Return
9652lb ; EX ; Exclamation
9653lb ; GL ; Glue
9654lb ; HY ; Hyphen
9655lb ; ID ; Ideographic
9656lb ; IN ; Inseperable
9657lb ; IS ; Infix_Numeric
9658lb ; LF ; Line_Feed
9659lb ; NS ; Nonstarter
9660lb ; NU ; Numeric
9661lb ; OP ; Open_Punctuation
9662lb ; PO ; Postfix_Numeric
9663lb ; PR ; Prefix_Numeric
9664lb ; QU ; Quotation
9665lb ; SA ; Complex_Context
9666lb ; SG ; Surrogate
9667lb ; SP ; Space
9668lb ; SY ; Break_Symbols
9669lb ; XX ; Unknown
9670lb ; ZW ; ZWSpace
9671END
9672 }
9673
9674 if (-e 'DNormalizationProps.txt') {
9675 push @return, split /\n/, <<'END';
9676qc ; M ; Maybe
9677qc ; N ; No
9678qc ; Y ; Yes
9679END
9680 }
9681
9682 if (-e 'Scripts.txt') {
9683 push @return, split /\n/, <<'END';
9684sc ; Arab ; Arabic
9685sc ; Armn ; Armenian
9686sc ; Beng ; Bengali
9687sc ; Bopo ; Bopomofo
9688sc ; Cans ; Canadian_Aboriginal
9689sc ; Cher ; Cherokee
9690sc ; Cyrl ; Cyrillic
9691sc ; Deva ; Devanagari
9692sc ; Dsrt ; Deseret
9693sc ; Ethi ; Ethiopic
9694sc ; Geor ; Georgian
9695sc ; Goth ; Gothic
9696sc ; Grek ; Greek
9697sc ; Gujr ; Gujarati
9698sc ; Guru ; Gurmukhi
9699sc ; Hang ; Hangul
9700sc ; Hani ; Han
9701sc ; Hebr ; Hebrew
9702sc ; Hira ; Hiragana
9703sc ; Ital ; Old_Italic
9704sc ; Kana ; Katakana
9705sc ; Khmr ; Khmer
9706sc ; Knda ; Kannada
9707sc ; Laoo ; Lao
9708sc ; Latn ; Latin
9709sc ; Mlym ; Malayalam
9710sc ; Mong ; Mongolian
9711sc ; Mymr ; Myanmar
9712sc ; Ogam ; Ogham
9713sc ; Orya ; Oriya
9714sc ; Qaai ; Inherited
9715sc ; Runr ; Runic
9716sc ; Sinh ; Sinhala
9717sc ; Syrc ; Syriac
9718sc ; Taml ; Tamil
9719sc ; Telu ; Telugu
9720sc ; Thaa ; Thaana
9721sc ; Thai ; Thai
9722sc ; Tibt ; Tibetan
9723sc ; Yiii ; Yi
9724sc ; Zyyy ; Common
9725END
9726 }
9727
9728 if ($v_version ge v2.0.0) {
9729 push @return, split /\n/, <<'END';
9730dt ; com ; compat
9731dt ; nar ; narrow
9732dt ; sml ; small
9733dt ; vert ; vertical
9734dt ; wide ; wide
9735
9736gc ; Cf ; Format
9737gc ; Cs ; Surrogate
9738gc ; Lt ; Titlecase_Letter
9739gc ; Me ; Enclosing_Mark
9740gc ; Nl ; Letter_Number
9741gc ; Pc ; Connector_Punctuation
9742gc ; Sk ; Modifier_Symbol
9743END
9744 }
9745 if ($v_version ge v2.1.2) {
9746 push @return, "bc ; S ; Segment_Separator\n";
9747 }
9748 if ($v_version ge v2.1.5) {
9749 push @return, split /\n/, <<'END';
9750gc ; Pf ; Final_Punctuation
9751gc ; Pi ; Initial_Punctuation
9752END
9753 }
9754 if ($v_version ge v2.1.8) {
9755 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9756 }
9757
9758 if ($v_version ge v3.0.0) {
9759 push @return, split /\n/, <<'END';
9760bc ; AL ; Arabic_Letter
9761bc ; BN ; Boundary_Neutral
9762bc ; LRE ; Left_To_Right_Embedding
9763bc ; LRO ; Left_To_Right_Override
9764bc ; NSM ; Nonspacing_Mark
9765bc ; PDF ; Pop_Directional_Format
9766bc ; RLE ; Right_To_Left_Embedding
9767bc ; RLO ; Right_To_Left_Override
9768
9769ccc; 233; DB ; Double_Below
9770END
9771 }
9772
9773 if ($v_version ge v3.1.0) {
9774 push @return, "ccc; 226; R ; Right\n";
9775 }
9776
9777 return @return;
9778}
9779
6b5ab373
KW
9780sub process_NormalizationsTest {
9781
9782 # Each line looks like:
9783 # source code point; NFC; NFD; NFKC; NFKD
9784 # e.g.
9785 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
9786
9787 my $file= shift;
9788 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9789
9790 # Process each line of the file ...
9791 while ($file->next_line) {
9792
9793 next if /^@/;
9794
9795 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
9796
9797 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
9798 $$var = pack "U0U*", map { hex } split " ", $$var;
9799 $$var =~ s/(\\)/$1$1/g;
9800 }
9801
9802 push @normalization_tests,
9803 "Test_N(q\a$c1\a, q\a$c2\a, q\a$c3\a, q\a$c4\a, q\a$c5\a);\n";
9804 } # End of looping through the file
9805}
9806
b1c167a3
KW
9807sub output_perl_charnames_line ($$) {
9808
9809 # Output the entries in Perl_charnames specially, using 5 digits instead
9810 # of four. This makes the entries a constant length, and simplifies
9811 # charnames.pm which this table is for. Unicode can have 6 digit
9812 # ordinals, but they are all private use or noncharacters which do not
9813 # have names, so won't be in this table.
9814
73d9566f 9815 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
9816}
9817
99870f4d
KW
9818{ # Closure
9819 # This is used to store the range list of all the code points usable when
9820 # the little used $compare_versions feature is enabled.
9821 my $compare_versions_range_list;
9822
96cfc54a
KW
9823 # These are constants to the $property_info hash in this subroutine, to
9824 # avoid using a quoted-string which might have a typo.
9825 my $TYPE = 'type';
9826 my $DEFAULT_MAP = 'default_map';
9827 my $DEFAULT_TABLE = 'default_table';
9828 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9829 my $MISSINGS = 'missings';
9830
99870f4d
KW
9831 sub process_generic_property_file {
9832 # This processes a file containing property mappings and puts them
9833 # into internal map tables. It should be used to handle any property
9834 # files that have mappings from a code point or range thereof to
9835 # something else. This means almost all the UCD .txt files.
9836 # each_line_handlers() should be set to adjust the lines of these
9837 # files, if necessary, to what this routine understands:
9838 #
9839 # 0374 ; NFD_QC; N
9840 # 003C..003E ; Math
9841 #
92f9d56c 9842 # the fields are: "codepoint-range ; property; map"
99870f4d
KW
9843 #
9844 # meaning the codepoints in the range all have the value 'map' under
9845 # 'property'.
98dc9551 9846 # Beginning and trailing white space in each field are not significant.
99870f4d
KW
9847 # Note there is not a trailing semi-colon in the above. A trailing
9848 # semi-colon means the map is a null-string. An omitted map, as
9849 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9850 # table syntax. (This could have been hidden from this routine by
9851 # doing it in the $file object, but that would require parsing of the
9852 # line there, so would have to parse it twice, or change the interface
9853 # to pass this an array. So not done.)
9854 #
9855 # The map field may begin with a sequence of commands that apply to
9856 # this range. Each such command begins and ends with $CMD_DELIM.
9857 # These are used to indicate, for example, that the mapping for a
9858 # range has a non-default type.
9859 #
9860 # This loops through the file, calling it's next_line() method, and
9861 # then taking the map and adding it to the property's table.
9862 # Complications arise because any number of properties can be in the
9863 # file, in any order, interspersed in any way. The first time a
9864 # property is seen, it gets information about that property and
f86864ac 9865 # caches it for quick retrieval later. It also normalizes the maps
5d7f7709
KW
9866 # so that only one of many synonyms is stored. The Unicode input
9867 # files do use some multiple synonyms.
99870f4d
KW
9868
9869 my $file = shift;
9870 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9871
9872 my %property_info; # To keep track of what properties
9873 # have already had entries in the
9874 # current file, and info about each,
9875 # so don't have to recompute.
9876 my $property_name; # property currently being worked on
9877 my $property_type; # and its type
9878 my $previous_property_name = ""; # name from last time through loop
9879 my $property_object; # pointer to the current property's
9880 # object
9881 my $property_addr; # the address of that object
9882 my $default_map; # the string that code points missing
9883 # from the file map to
9884 my $default_table; # For non-string properties, a
9885 # reference to the match table that
9886 # will contain the list of code
9887 # points that map to $default_map.
9888
9889 # Get the next real non-comment line
9890 LINE:
9891 while ($file->next_line) {
9892
9893 # Default replacement type; means that if parts of the range have
9894 # already been stored in our tables, the new map overrides them if
9895 # they differ more than cosmetically
9896 my $replace = $IF_NOT_EQUIVALENT;
9897 my $map_type; # Default type for the map of this range
9898
9899 #local $to_trace = 1 if main::DEBUG;
9900 trace $_ if main::DEBUG && $to_trace;
9901
9902 # Split the line into components
9903 my ($range, $property_name, $map, @remainder)
9904 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9905
9906 # If more or less on the line than we are expecting, warn and skip
9907 # the line
9908 if (@remainder) {
9909 $file->carp_bad_line('Extra fields');
9910 next LINE;
9911 }
9912 elsif ( ! defined $property_name) {
9913 $file->carp_bad_line('Missing property');
9914 next LINE;
9915 }
9916
9917 # Examine the range.
9918 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9919 {
9920 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9921 next LINE;
9922 }
9923 my $low = hex $1;
9924 my $high = (defined $2) ? hex $2 : $low;
9925
9926 # For the very specialized case of comparing two Unicode
9927 # versions...
9928 if (DEBUG && $compare_versions) {
9929 if ($property_name eq 'Age') {
9930
9931 # Only allow code points at least as old as the version
9932 # specified.
9933 my $age = pack "C*", split(/\./, $map); # v string
9934 next LINE if $age gt $compare_versions;
9935 }
9936 else {
9937
9938 # Again, we throw out code points younger than those of
9939 # the specified version. By now, the Age property is
9940 # populated. We use the intersection of each input range
9941 # with this property to find what code points in it are
9942 # valid. To do the intersection, we have to convert the
9943 # Age property map to a Range_list. We only have to do
9944 # this once.
9945 if (! defined $compare_versions_range_list) {
9946 my $age = property_ref('Age');
9947 if (! -e 'DAge.txt') {
9948 croak "Need to have 'DAge.txt' file to do version comparison";
9949 }
9950 elsif ($age->count == 0) {
9951 croak "The 'Age' table is empty, but its file exists";
9952 }
9953 $compare_versions_range_list
9954 = Range_List->new(Initialize => $age);
9955 }
9956
9957 # An undefined map is always 'Y'
9958 $map = 'Y' if ! defined $map;
9959
9960 # Calculate the intersection of the input range with the
9961 # code points that are known in the specified version
9962 my @ranges = ($compare_versions_range_list
9963 & Range->new($low, $high))->ranges;
9964
9965 # If the intersection is empty, throw away this range
9966 next LINE unless @ranges;
9967
9968 # Only examine the first range this time through the loop.
9969 my $this_range = shift @ranges;
9970
9971 # Put any remaining ranges in the queue to be processed
9972 # later. Note that there is unnecessary work here, as we
9973 # will do the intersection again for each of these ranges
9974 # during some future iteration of the LINE loop, but this
9975 # code is not used in production. The later intersections
9976 # are guaranteed to not splinter, so this will not become
9977 # an infinite loop.
9978 my $line = join ';', $property_name, $map;
9979 foreach my $range (@ranges) {
9980 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9981 $range->start,
9982 $range->end,
9983 $line));
9984 }
9985
9986 # And process the first range, like any other.
9987 $low = $this_range->start;
9988 $high = $this_range->end;
9989 }
9990 } # End of $compare_versions
9991
9992 # If changing to a new property, get the things constant per
9993 # property
9994 if ($previous_property_name ne $property_name) {
9995
9996 $property_object = property_ref($property_name);
9997 if (! defined $property_object) {
9998 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9999 next LINE;
10000 }
051df77b 10001 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
10002
10003 # Defer changing names until have a line that is acceptable
10004 # (the 'next' statement above means is unacceptable)
10005 $previous_property_name = $property_name;
10006
10007 # If not the first time for this property, retrieve info about
10008 # it from the cache
96cfc54a
KW
10009 if (defined ($property_info{$property_addr}{$TYPE})) {
10010 $property_type = $property_info{$property_addr}{$TYPE};
10011 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
99870f4d 10012 $map_type
96cfc54a 10013 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
99870f4d 10014 $default_table
96cfc54a 10015 = $property_info{$property_addr}{$DEFAULT_TABLE};
99870f4d
KW
10016 }
10017 else {
10018
10019 # Here, is the first time for this property. Set up the
10020 # cache.
96cfc54a 10021 $property_type = $property_info{$property_addr}{$TYPE}
99870f4d
KW
10022 = $property_object->type;
10023 $map_type
96cfc54a 10024 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
99870f4d
KW
10025 = $property_object->pseudo_map_type;
10026
10027 # The Unicode files are set up so that if the map is not
10028 # defined, it is a binary property
10029 if (! defined $map && $property_type != $BINARY) {
10030 if ($property_type != $UNKNOWN
10031 && $property_type != $NON_STRING)
10032 {
10033 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
10034 }
10035 else {
10036 $property_object->set_type($BINARY);
10037 $property_type
96cfc54a 10038 = $property_info{$property_addr}{$TYPE}
99870f4d
KW
10039 = $BINARY;
10040 }
10041 }
10042
10043 # Get any @missings default for this property. This
10044 # should precede the first entry for the property in the
10045 # input file, and is located in a comment that has been
10046 # stored by the Input_file class until we access it here.
10047 # It's possible that there is more than one such line
10048 # waiting for us; collect them all, and parse
10049 my @missings_list = $file->get_missings
10050 if $file->has_missings_defaults;
10051 foreach my $default_ref (@missings_list) {
10052 my $default = $default_ref->[0];
ffe43484 10053 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
10054
10055 # For string properties, the default is just what the
10056 # file says, but non-string properties should already
10057 # have set up a table for the default property value;
10058 # use the table for these, so can resolve synonyms
10059 # later to a single standard one.
10060 if ($property_type == $STRING
10061 || $property_type == $UNKNOWN)
10062 {
96cfc54a 10063 $property_info{$addr}{$MISSINGS} = $default;
99870f4d
KW
10064 }
10065 else {
96cfc54a 10066 $property_info{$addr}{$MISSINGS}
99870f4d
KW
10067 = $property_object->table($default);
10068 }
10069 }
10070
10071 # Finished storing all the @missings defaults in the input
10072 # file so far. Get the one for the current property.
96cfc54a 10073 my $missings = $property_info{$property_addr}{$MISSINGS};
99870f4d
KW
10074
10075 # But we likely have separately stored what the default
10076 # should be. (This is to accommodate versions of the
10077 # standard where the @missings lines are absent or
10078 # incomplete.) Hopefully the two will match. But check
10079 # it out.
10080 $default_map = $property_object->default_map;
10081
10082 # If the map is a ref, it means that the default won't be
10083 # processed until later, so undef it, so next few lines
10084 # will redefine it to something that nothing will match
10085 undef $default_map if ref $default_map;
10086
10087 # Create a $default_map if don't have one; maybe a dummy
10088 # that won't match anything.
10089 if (! defined $default_map) {
10090
10091 # Use any @missings line in the file.
10092 if (defined $missings) {
10093 if (ref $missings) {
10094 $default_map = $missings->full_name;
10095 $default_table = $missings;
10096 }
10097 else {
10098 $default_map = $missings;
10099 }
678f13d5 10100
99870f4d
KW
10101 # And store it with the property for outside use.
10102 $property_object->set_default_map($default_map);
10103 }
10104 else {
10105
10106 # Neither an @missings nor a default map. Create
10107 # a dummy one, so won't have to test definedness
10108 # in the main loop.
10109 $default_map = '_Perl This will never be in a file
10110 from Unicode';
10111 }
10112 }
10113
10114 # Here, we have $default_map defined, possibly in terms of
10115 # $missings, but maybe not, and possibly is a dummy one.
10116 if (defined $missings) {
10117
10118 # Make sure there is no conflict between the two.
10119 # $missings has priority.
10120 if (ref $missings) {
23e33b60
KW
10121 $default_table
10122 = $property_object->table($default_map);
99870f4d
KW
10123 if (! defined $default_table
10124 || $default_table != $missings)
10125 {
10126 if (! defined $default_table) {
10127 $default_table = $UNDEF;
10128 }
10129 $file->carp_bad_line(<<END
10130The \@missings line for $property_name in $file says that missings default to
10131$missings, but we expect it to be $default_table. $missings used.
10132END
10133 );
10134 $default_table = $missings;
10135 $default_map = $missings->full_name;
10136 }
96cfc54a 10137 $property_info{$property_addr}{$DEFAULT_TABLE}
99870f4d
KW
10138 = $default_table;
10139 }
10140 elsif ($default_map ne $missings) {
10141 $file->carp_bad_line(<<END
10142The \@missings line for $property_name in $file says that missings default to
10143$missings, but we expect it to be $default_map. $missings used.
10144END
10145 );
10146 $default_map = $missings;
10147 }
10148 }
10149
96cfc54a 10150 $property_info{$property_addr}{$DEFAULT_MAP}
99870f4d
KW
10151 = $default_map;
10152
10153 # If haven't done so already, find the table corresponding
10154 # to this map for non-string properties.
10155 if (! defined $default_table
10156 && $property_type != $STRING
10157 && $property_type != $UNKNOWN)
10158 {
10159 $default_table = $property_info{$property_addr}
96cfc54a 10160 {$DEFAULT_TABLE}
99870f4d
KW
10161 = $property_object->table($default_map);
10162 }
10163 } # End of is first time for this property
10164 } # End of switching properties.
10165
10166 # Ready to process the line.
10167 # The Unicode files are set up so that if the map is not defined,
10168 # it is a binary property with value 'Y'
10169 if (! defined $map) {
10170 $map = 'Y';
10171 }
10172 else {
10173
10174 # If the map begins with a special command to us (enclosed in
10175 # delimiters), extract the command(s).
a35d7f90
KW
10176 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10177 my $command = $1;
10178 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
10179 $replace = $1;
99870f4d 10180 }
a35d7f90
KW
10181 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
10182 $map_type = $1;
10183 }
10184 else {
10185 $file->carp_bad_line("Unknown command line: '$1'");
10186 next LINE;
10187 }
10188 }
99870f4d
KW
10189 }
10190
10191 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10192 {
10193
10194 # Here, we have a map to a particular code point, and the
10195 # default map is to a code point itself. If the range
10196 # includes the particular code point, change that portion of
10197 # the range to the default. This makes sure that in the final
10198 # table only the non-defaults are listed.
10199 my $decimal_map = hex $map;
10200 if ($low <= $decimal_map && $decimal_map <= $high) {
10201
10202 # If the range includes stuff before or after the map
10203 # we're changing, split it and process the split-off parts
10204 # later.
10205 if ($low < $decimal_map) {
10206 $file->insert_adjusted_lines(
10207 sprintf("%04X..%04X; %s; %s",
10208 $low,
10209 $decimal_map - 1,
10210 $property_name,
10211 $map));
10212 }
10213 if ($high > $decimal_map) {
10214 $file->insert_adjusted_lines(
10215 sprintf("%04X..%04X; %s; %s",
10216 $decimal_map + 1,
10217 $high,
10218 $property_name,
10219 $map));
10220 }
10221 $low = $high = $decimal_map;
10222 $map = $CODE_POINT;
10223 }
10224 }
10225
10226 # If we can tell that this is a synonym for the default map, use
10227 # the default one instead.
10228 if ($property_type != $STRING
10229 && $property_type != $UNKNOWN)
10230 {
10231 my $table = $property_object->table($map);
10232 if (defined $table && $table == $default_table) {
10233 $map = $default_map;
10234 }
10235 }
10236
10237 # And figure out the map type if not known.
10238 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10239 if ($map eq "") { # Nulls are always $NULL map type
10240 $map_type = $NULL;
10241 } # Otherwise, non-strings, and those that don't allow
10242 # $MULTI_CP, and those that aren't multiple code points are
10243 # 0
10244 elsif
10245 (($property_type != $STRING && $property_type != $UNKNOWN)
10246 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10247 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
10248 {
10249 $map_type = 0;
10250 }
10251 else {
10252 $map_type = $MULTI_CP;
10253 }
10254 }
10255
10256 $property_object->add_map($low, $high,
10257 $map,
10258 Type => $map_type,
10259 Replace => $replace);
10260 } # End of loop through file's lines
10261
10262 return;
10263 }
10264}
10265
99870f4d
KW
10266{ # Closure for UnicodeData.txt handling
10267
10268 # This file was the first one in the UCD; its design leads to some
10269 # awkwardness in processing. Here is a sample line:
10270 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10271 # The fields in order are:
10272 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 10273 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
10274 my $CATEGORY = $i++; # category (e.g. "Lu")
10275 my $CCC = $i++; # Canonical combining class (e.g. "230")
10276 my $BIDI = $i++; # directional class (e.g. "L")
10277 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
10278 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
10279 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10280 # Dual-use in this program; see below
10281 my $NUMERIC = $i++; # numeric value
10282 my $MIRRORED = $i++; # ? mirrored
10283 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10284 my $COMMENT = $i++; # iso comment
10285 my $UPPER = $i++; # simple uppercase mapping
10286 my $LOWER = $i++; # simple lowercase mapping
10287 my $TITLE = $i++; # simple titlecase mapping
10288 my $input_field_count = $i;
10289
10290 # This routine in addition outputs these extra fields:
d59563d0 10291
99870f4d 10292 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
10293
10294 # These fields are modifications of ones above, and are usually
10295 # suppressed; they must come last, as for speed, the loop upper bound is
10296 # normally set to ignore them
10297 my $NAME = $i++; # This is the strict name field, not the one that
10298 # charnames uses.
10299 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
10300 # by Unicode::Normalize
99870f4d
KW
10301 my $last_field = $i - 1;
10302
10303 # All these are read into an array for each line, with the indices defined
10304 # above. The empty fields in the example line above indicate that the
10305 # value is defaulted. The handler called for each line of the input
10306 # changes these to their defaults.
10307
10308 # Here are the official names of the properties, in a parallel array:
10309 my @field_names;
10310 $field_names[$BIDI] = 'Bidi_Class';
10311 $field_names[$CATEGORY] = 'General_Category';
10312 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 10313 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
10314 $field_names[$COMMENT] = 'ISO_Comment';
10315 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10316 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 10317 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
10318 $field_names[$MIRRORED] = 'Bidi_Mirrored';
10319 $field_names[$NAME] = 'Name';
10320 $field_names[$NUMERIC] = 'Numeric_Value';
10321 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10322 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10323 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 10324 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 10325 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 10326 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 10327
28093d0e
KW
10328 # Some of these need a little more explanation:
10329 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10330 # property, but is used in calculating the Numeric_Type. Perl however,
10331 # creates a file from this field, so a Perl property is created from it.
10332 # Similarly, the Other_Digit field is used only for calculating the
10333 # Numeric_Type, and so it can be safely re-used as the place to store
10334 # the value for Numeric_Type; hence it is referred to as
10335 # $NUMERIC_TYPE_OTHER_DIGIT.
10336 # The input field named $PERL_DECOMPOSITION is a combination of both the
10337 # decomposition mapping and its type. Perl creates a file containing
10338 # exactly this field, so it is used for that. The two properties are
10339 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10340 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
10341 # output it), as Perl doesn't use it directly.
10342 # The input field named here $CHARNAME is used to construct the
10343 # Perl_Charnames property, which is a combination of the Name property
10344 # (which the input field contains), and the Unicode_1_Name property, and
10345 # others from other files. Since, the strict Name property is not used
10346 # by Perl, this field is used for the table that Perl does use. The
10347 # strict Name property table is usually suppressed (unless the lists are
10348 # changed to output it), so it is accumulated in a separate field,
10349 # $NAME, which to save time is discarded unless the table is actually to
10350 # be output
99870f4d
KW
10351
10352 # This file is processed like most in this program. Control is passed to
10353 # process_generic_property_file() which calls filter_UnicodeData_line()
10354 # for each input line. This filter converts the input into line(s) that
10355 # process_generic_property_file() understands. There is also a setup
10356 # routine called before any of the file is processed, and a handler for
10357 # EOF processing, all in this closure.
10358
10359 # A huge speed-up occurred at the cost of some added complexity when these
10360 # routines were altered to buffer the outputs into ranges. Almost all the
10361 # lines of the input file apply to just one code point, and for most
10362 # properties, the map for the next code point up is the same as the
10363 # current one. So instead of creating a line for each property for each
10364 # input line, filter_UnicodeData_line() remembers what the previous map
10365 # of a property was, and doesn't generate a line to pass on until it has
10366 # to, as when the map changes; and that passed-on line encompasses the
10367 # whole contiguous range of code points that have the same map for that
10368 # property. This means a slight amount of extra setup, and having to
10369 # flush these buffers on EOF, testing if the maps have changed, plus
10370 # remembering state information in the closure. But it means a lot less
10371 # real time in not having to change the data base for each property on
10372 # each line.
10373
10374 # Another complication is that there are already a few ranges designated
10375 # in the input. There are two lines for each, with the same maps except
10376 # the code point and name on each line. This was actually the hardest
10377 # thing to design around. The code points in those ranges may actually
10378 # have real maps not given by these two lines. These maps will either
56339b2c 10379 # be algorithmically determinable, or be in the extracted files furnished
99870f4d
KW
10380 # with the UCD. In the event of conflicts between these extracted files,
10381 # and this one, Unicode says that this one prevails. But it shouldn't
10382 # prevail for conflicts that occur in these ranges. The data from the
10383 # extracted files prevails in those cases. So, this program is structured
10384 # so that those files are processed first, storing maps. Then the other
10385 # files are processed, generally overwriting what the extracted files
10386 # stored. But just the range lines in this input file are processed
10387 # without overwriting. This is accomplished by adding a special string to
10388 # the lines output to tell process_generic_property_file() to turn off the
10389 # overwriting for just this one line.
10390 # A similar mechanism is used to tell it that the map is of a non-default
10391 # type.
10392
10393 sub setup_UnicodeData { # Called before any lines of the input are read
10394 my $file = shift;
10395 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10396
28093d0e
KW
10397 # Create a new property specially located that is a combination of the
10398 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10399 # Name_Alias properties. (The final duplicates elements of the
10400 # first.) A comment for it will later be constructed based on the
10401 # actual properties present and used
3e20195b 10402 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
10403 Default_Map => "",
10404 Directory => File::Spec->curdir(),
10405 File => 'Name',
301ba948 10406 Fate => $INTERNAL_ONLY,
28093d0e 10407 Perl_Extension => 1,
b1c167a3 10408 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
10409 Type => $STRING,
10410 );
9ba5575c 10411 $perl_charname->set_proxy_for('Name');
28093d0e 10412
99870f4d 10413 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 10414 Directory => File::Spec->curdir(),
99870f4d 10415 File => 'Decomposition',
a14f3cb1 10416 Format => $DECOMP_STRING_FORMAT,
301ba948 10417 Fate => $INTERNAL_ONLY,
99870f4d
KW
10418 Perl_Extension => 1,
10419 Default_Map => $CODE_POINT,
10420
0c07e538
KW
10421 # normalize.pm can't cope with these
10422 Output_Range_Counts => 0,
10423
99870f4d
KW
10424 # This is a specially formatted table
10425 # explicitly for normalize.pm, which
10426 # is expecting a particular format,
10427 # which means that mappings containing
10428 # multiple code points are in the main
10429 # body of the table
10430 Map_Type => $COMPUTE_NO_MULTI_CP,
10431 Type => $STRING,
f64b46a1 10432 To_Output_Map => $INTERNAL_MAP,
99870f4d 10433 );
5be997b0 10434 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
99870f4d
KW
10435 $Perl_decomp->add_comment(join_lines(<<END
10436This mapping is a combination of the Unicode 'Decomposition_Type' and
10437'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
8d6427a5 10438identical to the official Unicode 'Decomposition_Mapping' property except for
99870f4d
KW
10439two things:
10440 1) It omits the algorithmically determinable Hangul syllable decompositions,
10441which normalize.pm handles algorithmically.
10442 2) It contains the decomposition type as well. Non-canonical decompositions
10443begin with a word in angle brackets, like <super>, which denotes the
10444compatible decomposition type. If the map does not begin with the <angle
10445brackets>, the decomposition is canonical.
10446END
10447 ));
10448
10449 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10450 Default_Map => "",
10451 Perl_Extension => 1,
99870f4d
KW
10452 Directory => $map_directory,
10453 Type => $STRING,
ce712c88 10454 To_Output_Map => $OUTPUT_ADJUSTED,
99870f4d
KW
10455 );
10456 $Decimal_Digit->add_comment(join_lines(<<END
10457This file gives the mapping of all code points which represent a single
ce712c88
KW
10458decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10459points, and the mapping of each non-initial element of each range is actually
10460not to "0", but to the offset that element has from its corresponding DIGIT 0.
b0b13ada
KW
10461These code points are those that have Numeric_Type=Decimal; not special
10462things, like subscripts nor Roman numerals.
99870f4d
KW
10463END
10464 ));
10465
28093d0e
KW
10466 # These properties are not used for generating anything else, and are
10467 # usually not output. By making them last in the list, we can just
99870f4d 10468 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
10469 # generating a table(s) that is/are just going to get thrown away.
10470 if (! property_ref('Decomposition_Mapping')->to_output_map
10471 && ! property_ref('Name')->to_output_map)
10472 {
10473 $last_field = min($NAME, $DECOMP_MAP) - 1;
10474 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10475 $last_field = $DECOMP_MAP;
10476 } elsif (property_ref('Name')->to_output_map) {
10477 $last_field = $NAME;
99870f4d
KW
10478 }
10479 return;
10480 }
10481
10482 my $first_time = 1; # ? Is this the first line of the file
10483 my $in_range = 0; # ? Are we in one of the file's ranges
10484 my $previous_cp; # hex code point of previous line
10485 my $decimal_previous_cp = -1; # And its decimal equivalent
10486 my @start; # For each field, the current starting
10487 # code point in hex for the range
10488 # being accumulated.
10489 my @fields; # The input fields;
10490 my @previous_fields; # And those from the previous call
10491
10492 sub filter_UnicodeData_line {
10493 # Handle a single input line from UnicodeData.txt; see comments above
10494 # Conceptually this takes a single line from the file containing N
10495 # properties, and converts it into N lines with one property per line,
10496 # which is what the final handler expects. But there are
10497 # complications due to the quirkiness of the input file, and to save
10498 # time, it accumulates ranges where the property values don't change
10499 # and only emits lines when necessary. This is about an order of
10500 # magnitude fewer lines emitted.
10501
10502 my $file = shift;
10503 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10504
10505 # $_ contains the input line.
10506 # -1 in split means retain trailing null fields
10507 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10508
10509 #local $to_trace = 1 if main::DEBUG;
10510 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10511 if (@fields > $input_field_count) {
10512 $file->carp_bad_line('Extra fields');
10513 $_ = "";
10514 return;
10515 }
10516
10517 my $decimal_cp = hex $cp;
10518
10519 # We have to output all the buffered ranges when the next code point
10520 # is not exactly one after the previous one, which means there is a
10521 # gap in the ranges.
10522 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10523
10524 # The decomposition mapping field requires special handling. It looks
10525 # like either:
10526 #
10527 # <compat> 0032 0020
10528 # 0041 0300
10529 #
10530 # The decomposition type is enclosed in <brackets>; if missing, it
10531 # means the type is canonical. There are two decomposition mapping
10532 # tables: the one for use by Perl's normalize.pm has a special format
10533 # which is this field intact; the other, for general use is of
10534 # standard format. In either case we have to find the decomposition
10535 # type. Empty fields have None as their type, and map to the code
10536 # point itself
10537 if ($fields[$PERL_DECOMPOSITION] eq "") {
10538 $fields[$DECOMP_TYPE] = 'None';
10539 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10540 }
10541 else {
10542 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10543 =~ / < ( .+? ) > \s* ( .+ ) /x;
10544 if (! defined $fields[$DECOMP_TYPE]) {
10545 $fields[$DECOMP_TYPE] = 'Canonical';
10546 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10547 }
10548 else {
10549 $fields[$DECOMP_MAP] = $map;
10550 }
10551 }
10552
10553 # The 3 numeric fields also require special handling. The 2 digit
10554 # fields must be either empty or match the number field. This means
10555 # that if it is empty, they must be as well, and the numeric type is
10556 # None, and the numeric value is 'Nan'.
10557 # The decimal digit field must be empty or match the other digit
10558 # field. If the decimal digit field is non-empty, the code point is
10559 # a decimal digit, and the other two fields will have the same value.
10560 # If it is empty, but the other digit field is non-empty, the code
10561 # point is an 'other digit', and the number field will have the same
10562 # value as the other digit field. If the other digit field is empty,
10563 # but the number field is non-empty, the code point is a generic
10564 # numeric type.
10565 if ($fields[$NUMERIC] eq "") {
10566 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10567 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10568 ) {
10569 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10570 }
10571 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10572 $fields[$NUMERIC] = 'NaN';
10573 }
10574 else {
10575 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10576 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10577 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
04537e94 10578 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
99870f4d
KW
10579 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10580 }
10581 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10582 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10583 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10584 }
10585 else {
10586 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10587
10588 # Rationals require extra effort.
10589 register_fraction($fields[$NUMERIC])
10590 if $fields[$NUMERIC] =~ qr{/};
10591 }
10592 }
10593
10594 # For the properties that have empty fields in the file, and which
10595 # mean something different from empty, change them to that default.
10596 # Certain fields just haven't been empty so far in any Unicode
10597 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10598 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 10599 # the defaults; which are very unlikely to ever change.
99870f4d
KW
10600 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10601 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10602
10603 # UAX44 says that if title is empty, it is the same as whatever upper
10604 # is,
10605 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10606
10607 # There are a few pairs of lines like:
10608 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10609 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10610 # that define ranges. These should be processed after the fields are
10611 # adjusted above, as they may override some of them; but mostly what
28093d0e 10612 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
10613 # paired lines start with a '<', but this is also true of '<control>,
10614 # which isn't one of these special ones.
28093d0e 10615 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
10616
10617 # Some code points in this file have the pseudo-name
10618 # '<control>', but the official name for such ones is the null
898b2fa7
KW
10619 # string.
10620 $fields[$NAME] = $fields[$CHARNAME] = "";
99870f4d
KW
10621
10622 # We had better not be in between range lines.
10623 if ($in_range) {
28093d0e 10624 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10625 $in_range = 0;
10626 }
10627 }
28093d0e 10628 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
10629
10630 # Here is a non-range line. We had better not be in between range
10631 # lines.
10632 if ($in_range) {
28093d0e 10633 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10634 $in_range = 0;
10635 }
edb80b88
KW
10636 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10637
10638 # These are code points whose names end in their code points,
10639 # which means the names are algorithmically derivable from the
10640 # code points. To shorten the output Name file, the algorithm
10641 # for deriving these is placed in the file instead of each
10642 # code point, so they have map type $CP_IN_NAME
10643 $fields[$CHARNAME] = $CMD_DELIM
10644 . $MAP_TYPE_CMD
10645 . '='
10646 . $CP_IN_NAME
10647 . $CMD_DELIM
10648 . $fields[$CHARNAME];
10649 }
28093d0e 10650 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 10651 }
28093d0e
KW
10652 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10653 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
10654
10655 # Here we are at the beginning of a range pair.
10656 if ($in_range) {
28093d0e 10657 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10658 }
10659 $in_range = 1;
10660
10661 # Because the properties in the range do not overwrite any already
10662 # in the db, we must flush the buffers of what's already there, so
10663 # they get handled in the normal scheme.
10664 $force_output = 1;
10665
10666 }
28093d0e
KW
10667 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10668 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
10669 $_ = "";
10670 return;
10671 }
10672 else { # Here, we are at the last line of a range pair.
10673
10674 if (! $in_range) {
28093d0e 10675 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
10676 $_ = "";
10677 return;
10678 }
10679 $in_range = 0;
10680
28093d0e
KW
10681 $fields[$NAME] = $fields[$CHARNAME];
10682
99870f4d
KW
10683 # Check that the input is valid: that the closing of the range is
10684 # the same as the beginning.
10685 foreach my $i (0 .. $last_field) {
10686 next if $fields[$i] eq $previous_fields[$i];
10687 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10688 }
10689
10690 # The processing differs depending on the type of range,
28093d0e
KW
10691 # determined by its $CHARNAME
10692 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
10693
10694 # Check that the data looks right.
10695 if ($decimal_previous_cp != $SBase) {
10696 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10697 }
10698 if ($decimal_cp != $SBase + $SCount - 1) {
10699 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10700 }
10701
10702 # The Hangul syllable range has a somewhat complicated name
10703 # generation algorithm. Each code point in it has a canonical
10704 # decomposition also computable by an algorithm. The
10705 # perl decomposition map table built from these is used only
10706 # by normalize.pm, which has the algorithm built in it, so the
10707 # decomposition maps are not needed, and are large, so are
10708 # omitted from it. If the full decomposition map table is to
10709 # be output, the decompositions are generated for it, in the
10710 # EOF handling code for this input file.
10711
10712 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10713
10714 # This range is stored in our internal structure with its
10715 # own map type, different from all others.
28093d0e
KW
10716 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10717 = $CMD_DELIM
99870f4d
KW
10718 . $MAP_TYPE_CMD
10719 . '='
10720 . $HANGUL_SYLLABLE
10721 . $CMD_DELIM
28093d0e 10722 . $fields[$CHARNAME];
99870f4d 10723 }
28093d0e 10724 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
10725
10726 # The name for these contains the code point itself, and all
10727 # are defined to have the same base name, regardless of what
10728 # is in the file. They are stored in our internal structure
10729 # with a map type of $CP_IN_NAME
28093d0e
KW
10730 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10731 = $CMD_DELIM
99870f4d
KW
10732 . $MAP_TYPE_CMD
10733 . '='
10734 . $CP_IN_NAME
10735 . $CMD_DELIM
10736 . 'CJK UNIFIED IDEOGRAPH';
10737
10738 }
10739 elsif ($fields[$CATEGORY] eq 'Co'
10740 || $fields[$CATEGORY] eq 'Cs')
10741 {
10742 # The names of all the code points in these ranges are set to
10743 # null, as there are no names for the private use and
10744 # surrogate code points.
10745
28093d0e 10746 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
10747 }
10748 else {
28093d0e 10749 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
10750 }
10751
10752 # The first line of the range caused everything else to be output,
10753 # and then its values were stored as the beginning values for the
10754 # next set of ranges, which this one ends. Now, for each value,
10755 # add a command to tell the handler that these values should not
10756 # replace any existing ones in our database.
10757 foreach my $i (0 .. $last_field) {
10758 $previous_fields[$i] = $CMD_DELIM
10759 . $REPLACE_CMD
10760 . '='
10761 . $NO
10762 . $CMD_DELIM
10763 . $previous_fields[$i];
10764 }
10765
10766 # And change things so it looks like the entire range has been
10767 # gone through with this being the final part of it. Adding the
10768 # command above to each field will cause this range to be flushed
10769 # during the next iteration, as it guaranteed that the stored
10770 # field won't match whatever value the next one has.
10771 $previous_cp = $cp;
10772 $decimal_previous_cp = $decimal_cp;
10773
10774 # We are now set up for the next iteration; so skip the remaining
10775 # code in this subroutine that does the same thing, but doesn't
10776 # know about these ranges.
10777 $_ = "";
c1739a4a 10778
99870f4d
KW
10779 return;
10780 }
10781
10782 # On the very first line, we fake it so the code below thinks there is
10783 # nothing to output, and initialize so that when it does get output it
10784 # uses the first line's values for the lowest part of the range.
10785 # (One could avoid this by using peek(), but then one would need to
10786 # know the adjustments done above and do the same ones in the setup
10787 # routine; not worth it)
10788 if ($first_time) {
10789 $first_time = 0;
10790 @previous_fields = @fields;
10791 @start = ($cp) x scalar @fields;
10792 $decimal_previous_cp = $decimal_cp - 1;
10793 }
10794
10795 # For each field, output the stored up ranges that this code point
10796 # doesn't fit in. Earlier we figured out if all ranges should be
10797 # terminated because of changing the replace or map type styles, or if
10798 # there is a gap between this new code point and the previous one, and
10799 # that is stored in $force_output. But even if those aren't true, we
10800 # need to output the range if this new code point's value for the
10801 # given property doesn't match the stored range's.
10802 #local $to_trace = 1 if main::DEBUG;
10803 foreach my $i (0 .. $last_field) {
10804 my $field = $fields[$i];
10805 if ($force_output || $field ne $previous_fields[$i]) {
10806
10807 # Flush the buffer of stored values.
10808 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10809
10810 # Start a new range with this code point and its value
10811 $start[$i] = $cp;
10812 $previous_fields[$i] = $field;
10813 }
10814 }
10815
10816 # Set the values for the next time.
10817 $previous_cp = $cp;
10818 $decimal_previous_cp = $decimal_cp;
10819
10820 # The input line has generated whatever adjusted lines are needed, and
10821 # should not be looked at further.
10822 $_ = "";
10823 return;
10824 }
10825
10826 sub EOF_UnicodeData {
10827 # Called upon EOF to flush the buffers, and create the Hangul
10828 # decomposition mappings if needed.
10829
10830 my $file = shift;
10831 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10832
10833 # Flush the buffers.
17060217 10834 foreach my $i (0 .. $last_field) {
99870f4d
KW
10835 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10836 }
10837
10838 if (-e 'Jamo.txt') {
10839
10840 # The algorithm is published by Unicode, based on values in
10841 # Jamo.txt, (which should have been processed before this
10842 # subroutine), and the results left in %Jamo
10843 unless (%Jamo) {
10844 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10845 return;
10846 }
10847
10848 # If the full decomposition map table is being output, insert
10849 # into it the Hangul syllable mappings. This is to avoid having
10850 # to publish a subroutine in it to compute them. (which would
10851 # essentially be this code.) This uses the algorithm published by
4a4f4270
KW
10852 # Unicode. (No hangul syllables in version 1)
10853 if ($v_version ge v2.0.0
10854 && property_ref('Decomposition_Mapping')->to_output_map) {
99870f4d
KW
10855 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10856 use integer;
10857 my $SIndex = $S - $SBase;
10858 my $L = $LBase + $SIndex / $NCount;
10859 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10860 my $T = $TBase + $SIndex % $TCount;
10861
10862 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10863 my $decomposition = sprintf("%04X %04X", $L, $V);
10864 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10865 $file->insert_adjusted_lines(
10866 sprintf("%04X; Decomposition_Mapping; %s",
10867 $S,
10868 $decomposition));
10869 }
10870 }
10871 }
10872
10873 return;
10874 }
10875
10876 sub filter_v1_ucd {
10877 # Fix UCD lines in version 1. This is probably overkill, but this
10878 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10879 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10880 # removed. This program retains them
10881 # 2) didn't include ranges, which it should have, and which are now
10882 # added in @corrected_lines below. It was hand populated by
10883 # taking the data from Version 2, verified by analyzing
10884 # DAge.txt.
10885 # 3) There is a syntax error in the entry for U+09F8 which could
10886 # cause problems for utf8_heavy, and so is changed. It's
10887 # numeric value was simply a minus sign, without any number.
10888 # (Eventually Unicode changed the code point to non-numeric.)
10889 # 4) The decomposition types often don't match later versions
10890 # exactly, and the whole syntax of that field is different; so
10891 # the syntax is changed as well as the types to their later
10892 # terminology. Otherwise normalize.pm would be very unhappy
10893 # 5) Many ccc classes are different. These are left intact.
4a4f4270 10894 # 6) U+FF10..U+FF19 are missing their numeric values in all three
99870f4d
KW
10895 # fields. These are unchanged because it doesn't really cause
10896 # problems for Perl.
10897 # 7) A number of code points, such as controls, don't have their
4a4f4270
KW
10898 # Unicode Version 1 Names in this file. These are added.
10899 # 8) A number of Symbols were marked as Lm. This changes those in
10900 # the Latin1 range, so that regexes work.
10901 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
10902 # referred to by their lc equivalents. Not fixed.
99870f4d
KW
10903
10904 my @corrected_lines = split /\n/, <<'END';
109054E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
109069FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10907E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10908F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10909F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10910FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10911END
10912
10913 my $file = shift;
10914 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10915
10916 #local $to_trace = 1 if main::DEBUG;
10917 trace $_ if main::DEBUG && $to_trace;
10918
10919 # -1 => retain trailing null fields
10920 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10921
10922 # At the first place that is wrong in the input, insert all the
10923 # corrections, replacing the wrong line.
10924 if ($code_point eq '4E00') {
10925 my @copy = @corrected_lines;
10926 $_ = shift @copy;
10927 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10928
10929 $file->insert_lines(@copy);
10930 }
4a4f4270 10931 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
99870f4d 10932
4a4f4270
KW
10933 # There are no Lm characters in Latin1; these should be 'Sk', but
10934 # there isn't that in V1.
10935 $fields[$CATEGORY] = 'So';
10936 }
99870f4d
KW
10937
10938 if ($fields[$NUMERIC] eq '-') {
10939 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10940 }
10941
10942 if ($fields[$PERL_DECOMPOSITION] ne "") {
10943
10944 # Several entries have this change to superscript 2 or 3 in the
10945 # middle. Convert these to the modern version, which is to use
10946 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10947 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10948 # 'HHHH HHHH 00B3 HHHH'.
10949 # It turns out that all of these that don't have another
10950 # decomposition defined at the beginning of the line have the
10951 # <square> decomposition in later releases.
10952 if ($code_point ne '00B2' && $code_point ne '00B3') {
10953 if ($fields[$PERL_DECOMPOSITION]
10954 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10955 {
10956 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10957 $fields[$PERL_DECOMPOSITION] = '<square> '
10958 . $fields[$PERL_DECOMPOSITION];
10959 }
10960 }
10961 }
10962
10963 # If is like '<+circled> 0052 <-circled>', convert to
10964 # '<circled> 0052'
10965 $fields[$PERL_DECOMPOSITION] =~
4a4f4270 10966 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
99870f4d
KW
10967
10968 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10969 $fields[$PERL_DECOMPOSITION] =~
10970 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10971 or $fields[$PERL_DECOMPOSITION] =~
10972 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10973 or $fields[$PERL_DECOMPOSITION] =~
10974 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10975 or $fields[$PERL_DECOMPOSITION] =~
10976 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10977
10978 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10979 $fields[$PERL_DECOMPOSITION] =~
10980 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10981
10982 # Change names to modern form.
10983 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10984 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10985 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10986 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10987
10988 # One entry has weird braces
10989 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
4a4f4270
KW
10990
10991 # One entry at U+2116 has an extra <sup>
10992 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
99870f4d
KW
10993 }
10994
10995 $_ = join ';', $code_point, @fields;
10996 trace $_ if main::DEBUG && $to_trace;
10997 return;
10998 }
10999
be864b6c
KW
11000 sub filter_bad_Nd_ucd {
11001 # Early versions specified a value in the decimal digit field even
11002 # though the code point wasn't a decimal digit. Clear the field in
11003 # that situation, so that the main code doesn't think it is a decimal
11004 # digit.
11005
11006 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11007 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11008 $fields[$PERL_DECIMAL_DIGIT] = "";
11009 $_ = join ';', $code_point, @fields;
11010 }
11011 return;
11012 }
11013
08978fe6
KW
11014 my @U1_control_names = split /\n/, <<'END';
11015NULL
11016START OF HEADING
11017START OF TEXT
11018END OF TEXT
11019END OF TRANSMISSION
11020ENQUIRY
11021ACKNOWLEDGE
11022BELL
11023BACKSPACE
11024HORIZONTAL TABULATION
11025LINE FEED
11026VERTICAL TABULATION
11027FORM FEED
11028CARRIAGE RETURN
11029SHIFT OUT
11030SHIFT IN
11031DATA LINK ESCAPE
11032DEVICE CONTROL ONE
11033DEVICE CONTROL TWO
11034DEVICE CONTROL THREE
11035DEVICE CONTROL FOUR
11036NEGATIVE ACKNOWLEDGE
11037SYNCHRONOUS IDLE
11038END OF TRANSMISSION BLOCK
11039CANCEL
11040END OF MEDIUM
11041SUBSTITUTE
11042ESCAPE
11043FILE SEPARATOR
11044GROUP SEPARATOR
11045RECORD SEPARATOR
11046UNIT SEPARATOR
11047DELETE
11048BREAK PERMITTED HERE
11049NO BREAK HERE
11050INDEX
11051NEXT LINE
11052START OF SELECTED AREA
11053END OF SELECTED AREA
11054CHARACTER TABULATION SET
11055CHARACTER TABULATION WITH JUSTIFICATION
11056LINE TABULATION SET
11057PARTIAL LINE DOWN
11058PARTIAL LINE UP
11059REVERSE LINE FEED
11060SINGLE SHIFT TWO
11061SINGLE SHIFT THREE
11062DEVICE CONTROL STRING
11063PRIVATE USE ONE
11064PRIVATE USE TWO
11065SET TRANSMIT STATE
11066CANCEL CHARACTER
11067MESSAGE WAITING
11068START OF GUARDED AREA
11069END OF GUARDED AREA
11070START OF STRING
11071SINGLE CHARACTER INTRODUCER
11072CONTROL SEQUENCE INTRODUCER
11073STRING TERMINATOR
11074OPERATING SYSTEM COMMAND
11075PRIVACY MESSAGE
11076APPLICATION PROGRAM COMMAND
11077END
11078
11079 sub filter_early_U1_names {
11080 # Very early versions did not have the Unicode_1_name field specified.
11081 # They differed in which ones were present; make sure a U1 name
11082 # exists, so that Unicode::UCD::charinfo will work
11083
11084 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11085
11086
11087 # @U1_control names above are entirely positional, so we pull them out
11088 # in the exact order required, with gaps for the ones that don't have
11089 # names.
11090 if ($code_point =~ /^00[01]/
11091 || $code_point eq '007F'
11092 || $code_point =~ /^008[2-9A-F]/
11093 || $code_point =~ /^009[0-8A-F]/)
11094 {
11095 my $u1_name = shift @U1_control_names;
11096 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11097 $_ = join ';', $code_point, @fields;
11098 }
11099 return;
11100 }
11101
99870f4d
KW
11102 sub filter_v2_1_5_ucd {
11103 # A dozen entries in this 2.1.5 file had the mirrored and numeric
11104 # columns swapped; These all had mirrored be 'N'. So if the numeric
11105 # column appears to be N, swap it back.
11106
11107 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11108 if ($fields[$NUMERIC] eq 'N') {
11109 $fields[$NUMERIC] = $fields[$MIRRORED];
11110 $fields[$MIRRORED] = 'N';
11111 $_ = join ';', $code_point, @fields;
11112 }
11113 return;
11114 }
3ffed8c2
KW
11115
11116 sub filter_v6_ucd {
11117
fe3193b5
KW
11118 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11119 # it wasn't accepted, to allow for some deprecation cycles. This
11120 # function is not called after 5.16
3ffed8c2 11121
484741e1 11122 return if $_ !~ /^(?:0007|1F514|070F);/;
3ffed8c2
KW
11123
11124 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11125 if ($code_point eq '0007') {
dcd72625 11126 $fields[$CHARNAME] = "";
3ffed8c2 11127 }
484741e1
KW
11128 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11129 # http://www.unicode.org/versions/corrigendum8.html
11130 $fields[$BIDI] = "AL";
11131 }
88187f32 11132 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
3ffed8c2
KW
11133 $fields[$CHARNAME] = "";
11134 }
11135
11136 $_ = join ';', $code_point, @fields;
11137
11138 return;
11139 }
99870f4d
KW
11140} # End closure for UnicodeData
11141
37e2e78e
KW
11142sub process_GCB_test {
11143
11144 my $file = shift;
11145 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11146
11147 while ($file->next_line) {
11148 push @backslash_X_tests, $_;
11149 }
678f13d5 11150
37e2e78e
KW
11151 return;
11152}
11153
99870f4d
KW
11154sub process_NamedSequences {
11155 # NamedSequences.txt entries are just added to an array. Because these
11156 # don't look like the other tables, they have their own handler.
11157 # An example:
11158 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11159 #
11160 # This just adds the sequence to an array for later handling
11161
99870f4d
KW
11162 my $file = shift;
11163 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11164
11165 while ($file->next_line) {
11166 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
11167 if (@remainder) {
11168 $file->carp_bad_line(
11169 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11170 next;
11171 }
fb121860
KW
11172
11173 # Note single \t in keeping with special output format of
11174 # Perl_charnames. But it turns out that the code points don't have to
11175 # be 5 digits long, like the rest, based on the internal workings of
11176 # charnames.pm. This could be easily changed for consistency.
11177 push @named_sequences, "$sequence\t$name";
99870f4d
KW
11178 }
11179 return;
11180}
11181
11182{ # Closure
11183
11184 my $first_range;
11185
11186 sub filter_early_ea_lb {
11187 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
11188 # third field be the name of the code point, which can be ignored in
11189 # most cases. But it can be meaningful if it marks a range:
11190 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11191 # 3400;W;<CJK Ideograph Extension A, First>
11192 #
11193 # We need to see the First in the example above to know it's a range.
11194 # They did not use the later range syntaxes. This routine changes it
11195 # to use the modern syntax.
11196 # $1 is the Input_file object.
11197
11198 my @fields = split /\s*;\s*/;
11199 if ($fields[2] =~ /^<.*, First>/) {
11200 $first_range = $fields[0];
11201 $_ = "";
11202 }
11203 elsif ($fields[2] =~ /^<.*, Last>/) {
11204 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
11205 }
11206 else {
11207 undef $first_range;
11208 $_ = "$fields[0]; $fields[1]";
11209 }
11210
11211 return;
11212 }
11213}
11214
11215sub filter_old_style_arabic_shaping {
11216 # Early versions used a different term for the later one.
11217
11218 my @fields = split /\s*;\s*/;
11219 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11220 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
11221 $_ = join ';', @fields;
11222 return;
11223}
11224
11225sub filter_arabic_shaping_line {
11226 # ArabicShaping.txt has entries that look like:
11227 # 062A; TEH; D; BEH
11228 # The field containing 'TEH' is not used. The next field is Joining_Type
11229 # and the last is Joining_Group
11230 # This generates two lines to pass on, one for each property on the input
11231 # line.
11232
11233 my $file = shift;
11234 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11235
11236 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11237
11238 if (@fields > 4) {
11239 $file->carp_bad_line('Extra fields');
11240 $_ = "";
11241 return;
11242 }
11243
11244 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
11245 $_ = "$fields[0]; Joining_Type; $fields[2]";
11246
11247 return;
11248}
11249
d3fed3dd
KW
11250{ # Closure
11251 my $lc; # Table for lowercase mapping
11252 my $tc;
11253 my $uc;
154ab528 11254 my %special_casing_code_points;
d3fed3dd 11255
6c0259ad
KW
11256 sub setup_special_casing {
11257 # SpecialCasing.txt contains the non-simple case change mappings. The
11258 # simple ones are in UnicodeData.txt, which should already have been
11259 # read in to the full property data structures, so as to initialize
11260 # these with the simple ones. Then the SpecialCasing.txt entries
bc0cd415 11261 # add or overwrite the ones which have different full mappings.
6c0259ad
KW
11262
11263 # This routine sees if the simple mappings are to be output, and if
11264 # so, copies what has already been put into the full mapping tables,
11265 # while they still contain only the simple mappings.
11266
11267 # The reason it is done this way is that the simple mappings are
11268 # probably not going to be output, so it saves work to initialize the
11269 # full tables with the simple mappings, and then overwrite those
11270 # relatively few entries in them that have different full mappings,
11271 # and thus skip the simple mapping tables altogether.
11272
11273 my $file= shift;
11274 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 11275
6c0259ad
KW
11276 $lc = property_ref('lc');
11277 $tc = property_ref('tc');
11278 $uc = property_ref('uc');
11279
11280 # For each of the case change mappings...
66474459
KW
11281 foreach my $full_table ($lc, $tc, $uc) {
11282 my $full_name = $full_table->name;
11283 unless (defined $full_table && ! $full_table->is_empty) {
6c0259ad
KW
11284 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
11285 }
11286
cdc18eb6
KW
11287 # Create a table in the old-style format and with the original
11288 # file name for backwards compatibility with applications that
ae1bcb1f
KW
11289 # read it directly. The new tables contain both the simple and
11290 # full maps, and the old are missing simple maps when there is a
11291 # conflicting full one. Probably it would have been ok to add
11292 # those to the legacy version, as was already done in 5.14 to the
11293 # case folding one, but this was not done, out of an abundance of
11294 # caution. The tables are set up here before we deal with the
11295 # full maps so that as we handle those, we can override the simple
11296 # maps for them in the legacy table, and merely add them in the
11297 # new-style one.
cdc18eb6
KW
11298 my $legacy = Property->new("Legacy_" . $full_table->full_name,
11299 File => $full_table->full_name =~
11300 s/case_Mapping//r,
11301 Range_Size_1 => 1,
11302 Format => $HEX_FORMAT,
11303 Default_Map => $CODE_POINT,
11304 UCD => 0,
11305 Initialize => $full_table,
f64b46a1 11306 To_Output_Map => $EXTERNAL_MAP,
cdc18eb6
KW
11307 );
11308
ae1bcb1f
KW
11309 $full_table->add_comment(join_lines( <<END
11310This file includes both the simple and full case changing maps. The simple
11311ones are in the main body of the table below, and the full ones adding to or
11312overriding them are in the hash.
11313END
11314 ));
11315
6c0259ad
KW
11316 # The simple version's name in each mapping merely has an 's' in
11317 # front of the full one's
66474459 11318 my $simple_name = 's' . $full_name;
301ba948 11319 my $simple = property_ref($simple_name);
66474459 11320 $simple->initialize($full_table) if $simple->to_output_map();
6c0259ad 11321 }
99870f4d 11322
6c0259ad
KW
11323 return;
11324 }
99870f4d 11325
154ab528
KW
11326 sub filter_2_1_8_special_casing_line {
11327
11328 # This version had duplicate entries in this file. Delete all but the
11329 # first one
11330 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11331 # fields
11332 if (exists $special_casing_code_points{$fields[0]}) {
11333 $_ = "";
11334 return;
11335 }
11336
11337 $special_casing_code_points{$fields[0]} = 1;
11338 filter_special_casing_line(@_);
11339 }
11340
6c0259ad
KW
11341 sub filter_special_casing_line {
11342 # Change the format of $_ from SpecialCasing.txt into something that
11343 # the generic handler understands. Each input line contains three
11344 # case mappings. This will generate three lines to pass to the
11345 # generic handler for each of those.
99870f4d 11346
6c0259ad
KW
11347 # The input syntax (after stripping comments and trailing white space
11348 # is like one of the following (with the final two being entries that
11349 # we ignore):
11350 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11351 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11352 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11353 # Note the trailing semi-colon, unlike many of the input files. That
11354 # means that there will be an extra null field generated by the split
99870f4d 11355
6c0259ad
KW
11356 my $file = shift;
11357 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 11358
6c0259ad
KW
11359 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11360 # fields
11361
11362 # field #4 is when this mapping is conditional. If any of these get
11363 # implemented, it would be by hard-coding in the casing functions in
11364 # the Perl core, not through tables. But if there is a new condition
11365 # we don't know about, output a warning. We know about all the
11366 # conditions through 6.0
11367 if ($fields[4] ne "") {
11368 my @conditions = split ' ', $fields[4];
11369 if ($conditions[0] ne 'tr' # We know that these languages have
11370 # conditions, and some are multiple
11371 && $conditions[0] ne 'az'
11372 && $conditions[0] ne 'lt'
11373
11374 # And, we know about a single condition Final_Sigma, but
11375 # nothing else.
11376 && ($v_version gt v5.2.0
11377 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11378 {
11379 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11380 }
11381 elsif ($conditions[0] ne 'Final_Sigma') {
99870f4d 11382
6c0259ad
KW
11383 # Don't print out a message for Final_Sigma, because we
11384 # have hard-coded handling for it. (But the standard
11385 # could change what the rule should be, but it wouldn't
11386 # show up here anyway.
99870f4d 11387
6c0259ad 11388 print "# SKIPPING Special Casing: $_\n"
99870f4d 11389 if $verbosity >= $VERBOSE;
6c0259ad
KW
11390 }
11391 $_ = "";
11392 return;
11393 }
11394 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11395 $file->carp_bad_line('Extra fields');
11396 $_ = "";
11397 return;
99870f4d 11398 }
99870f4d 11399
107e47c9
KW
11400 my $decimal_code_point = hex $fields[0];
11401
11402 # Loop to handle each of the three mappings in the input line, in
11403 # order, with $i indicating the current field number.
11404 my $i = 0;
11405 for my $object ($lc, $tc, $uc) {
11406 $i++; # First time through, $i = 0 ... 3rd time = 3
11407
11408 my $value = $object->value_of($decimal_code_point);
11409 $value = ($value eq $CODE_POINT)
11410 ? $decimal_code_point
11411 : hex $value;
11412
11413 # If this isn't a multi-character mapping, it should already have
11414 # been read in.
11415 if ($fields[$i] !~ / /) {
11416 if ($value != hex $fields[$i]) {
11417 Carp::my_carp("Bad news. UnicodeData.txt thinks "
11418 . $object->name
11419 . "(0x$fields[0]) is $value"
11420 . " and SpecialCasing.txt thinks it is "
22bc5f56
KW
11421 . hex($fields[$i])
11422 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
107e47c9
KW
11423 }
11424 }
11425 else {
cdc18eb6 11426
ae1bcb1f
KW
11427 # The mapping goes into both the legacy table, in which it
11428 # replaces the simple one...
cdc18eb6 11429 $file->insert_adjusted_lines("$fields[0]; Legacy_"
107e47c9
KW
11430 . $object->full_name
11431 . "; $fields[$i]");
11432
ae1bcb1f
KW
11433 # ... and, the The regular table, in which it is additional,
11434 # beyond the simple mapping.
cdc18eb6
KW
11435 $file->insert_adjusted_lines("$fields[0]; "
11436 . $object->name
11437 . "; "
ae1bcb1f
KW
11438 . $CMD_DELIM
11439 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11440 . $CMD_DELIM
cdc18eb6 11441 . $fields[$i]);
107e47c9 11442 }
6c0259ad 11443 }
d3fed3dd 11444
107e47c9
KW
11445 # Everything has been handled by the insert_adjusted_lines()
11446 $_ = "";
11447
6c0259ad
KW
11448 return;
11449 }
d3fed3dd 11450}
99870f4d
KW
11451
11452sub filter_old_style_case_folding {
11453 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 11454 # and later style. Different letters were used in the earlier.
99870f4d
KW
11455
11456 my $file = shift;
11457 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11458
11459 my @fields = split /\s*;\s*/;
11460 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11461 $fields[1] = 'I';
11462 }
11463 elsif ($fields[1] eq 'L') {
11464 $fields[1] = 'C'; # L => C always
11465 }
11466 elsif ($fields[1] eq 'E') {
11467 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
11468 $fields[1] = 'F'
11469 }
11470 else {
11471 $fields[1] = 'C'
11472 }
11473 }
11474 else {
11475 $file->carp_bad_line("Expecting L or E in second field");
11476 $_ = "";
11477 return;
11478 }
11479 $_ = join("; ", @fields) . ';';
11480 return;
11481}
11482
11483{ # Closure for case folding
11484
11485 # Create the map for simple only if are going to output it, for otherwise
11486 # it takes no part in anything we do.
11487 my $to_output_simple;
802ba51d 11488 my $non_final_folds;
e94e94b5 11489 my $all_folds;
99870f4d 11490
99870f4d
KW
11491 sub setup_case_folding($) {
11492 # Read in the case foldings in CaseFolding.txt. This handles both
11493 # simple and full case folding.
11494
11495 $to_output_simple
11496 = property_ref('Simple_Case_Folding')->to_output_map;
11497
5be997b0
KW
11498 if (! $to_output_simple) {
11499 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11500 }
11501
802ba51d
KW
11502 $non_final_folds = $perl->add_match_table("_Perl_Non_Final_Folds",
11503 Perl_Extension => 1,
11504 Fate => $INTERNAL_ONLY,
d59563d0 11505 Description => "Code points that particpate in a multi-char fold and are not the final character of said fold",
802ba51d 11506 );
8e487a0f
KW
11507 $non_final_folds->add_comment(<<END
11508Note that this is not a closure. 'f' is in the list, but not things that fold
11509singly to 'f', namely 'F'
11510END
11511 );
e94e94b5
KW
11512 $all_folds = $perl->add_match_table("_Perl_Any_Folds",
11513 Perl_Extension => 1,
11514 Fate => $INTERNAL_ONLY,
11515 Description => "Code points that particpate in some fold",
11516 );
802ba51d 11517
6f2a3287
KW
11518 # If we ever wanted to show that these tables were combined, a new
11519 # property method could be created, like set_combined_props()
11520 property_ref('Case_Folding')->add_comment(join_lines( <<END
11521This file includes both the simple and full case folding maps. The simple
11522ones are in the main body of the table below, and the full ones adding to or
11523overriding them are in the hash.
11524END
11525 ));
99870f4d
KW
11526 return;
11527 }
11528
11529 sub filter_case_folding_line {
11530 # Called for each line in CaseFolding.txt
11531 # Input lines look like:
11532 # 0041; C; 0061; # LATIN CAPITAL LETTER A
11533 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11534 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11535 #
11536 # 'C' means that folding is the same for both simple and full
11537 # 'F' that it is only for full folding
11538 # 'S' that it is only for simple folding
11539 # 'T' is locale-dependent, and ignored
11540 # 'I' is a type of 'F' used in some early releases.
11541 # Note the trailing semi-colon, unlike many of the input files. That
11542 # means that there will be an extra null field generated by the split
11543 # below, which we ignore and hence is not an error.
11544
11545 my $file = shift;
11546 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11547
11548 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11549 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11550 $file->carp_bad_line('Extra fields');
11551 $_ = "";
11552 return;
11553 }
11554
005c991b 11555 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
99870f4d
KW
11556 $_ = "";
11557 return;
11558 }
11559
11560 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
3c099872
KW
11561 # I are all full foldings; S is single-char. For S, there is always
11562 # an F entry, so we must allow multiple values for the same code
11563 # point. Fortunately this table doesn't need further manipulation
11564 # which would preclude using multiple-values. The S is now included
11565 # so that _swash_inversion_hash() is able to construct closures
11566 # without having to worry about F mappings.
11567 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
e94e94b5 11568 $all_folds->add_range(hex $range, hex $range); # Assumes range is single
9470941f
KW
11569 $_ = "$range; Case_Folding; "
11570 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
e94e94b5 11571
802ba51d
KW
11572 if ($type eq 'F') {
11573 my @string = split " ", $map;
11574 for my $i (0 .. @string - 1 -1) {
e94e94b5
KW
11575 my $decimal = hex $string[$i];
11576 $non_final_folds->add_range($decimal, $decimal);
11577 $all_folds->add_range($decimal, $decimal);
802ba51d
KW
11578 }
11579 }
e94e94b5
KW
11580 else {
11581 $all_folds->add_range(hex $map, hex $map);
11582 }
99870f4d
KW
11583 }
11584 else {
11585 $_ = "";
3c099872 11586 $file->carp_bad_line('Expecting C F I S or T in second field');
99870f4d
KW
11587 }
11588
11589 # C and S are simple foldings, but simple case folding is not needed
11590 # unless we explicitly want its map table output.
11591 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
11592 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11593 }
11594
99870f4d
KW
11595 return;
11596 }
11597
99870f4d
KW
11598} # End case fold closure
11599
11600sub filter_jamo_line {
11601 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
11602 # from this file that is used in generating the Name property for Jamo
11603 # code points. But, it also is used to convert early versions' syntax
11604 # into the modern form. Here are two examples:
11605 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
11606 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
11607 #
11608 # The input is $_, the output is $_ filtered.
11609
11610 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11611
11612 # Let the caller handle unexpected input. In earlier versions, there was
11613 # a third field which is supposed to be a comment, but did not have a '#'
11614 # before it.
11615 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11616
11617 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
11618 # beginning.
11619
11620 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
11621 $fields[1] = 'R' if $fields[0] eq '1105';
11622
11623 # Add to structure so can generate Names from it.
11624 my $cp = hex $fields[0];
11625 my $short_name = $fields[1];
11626 $Jamo{$cp} = $short_name;
11627 if ($cp <= $LBase + $LCount) {
11628 $Jamo_L{$short_name} = $cp - $LBase;
11629 }
11630 elsif ($cp <= $VBase + $VCount) {
11631 $Jamo_V{$short_name} = $cp - $VBase;
11632 }
11633 elsif ($cp <= $TBase + $TCount) {
11634 $Jamo_T{$short_name} = $cp - $TBase;
11635 }
11636 else {
11637 Carp::my_carp_bug("Unexpected Jamo code point in $_");
11638 }
11639
11640
11641 # Reassemble using just the first two fields to look like a typical
11642 # property file line
11643 $_ = "$fields[0]; $fields[1]";
11644
11645 return;
11646}
11647
99870f4d
KW
11648sub register_fraction($) {
11649 # This registers the input rational number so that it can be passed on to
11650 # utf8_heavy.pl, both in rational and floating forms.
11651
11652 my $rational = shift;
11653
11654 my $float = eval $rational;
11655 $nv_floating_to_rational{$float} = $rational;
11656 return;
11657}
11658
11659sub filter_numeric_value_line {
11660 # DNumValues contains lines of a different syntax than the typical
11661 # property file:
11662 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
11663 #
11664 # This routine transforms $_ containing the anomalous syntax to the
11665 # typical, by filtering out the extra columns, and convert early version
11666 # decimal numbers to strings that look like rational numbers.
11667
11668 my $file = shift;
11669 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11670
11671 # Starting in 5.1, there is a rational field. Just use that, omitting the
11672 # extra columns. Otherwise convert the decimal number in the second field
11673 # to a rational, and omit extraneous columns.
11674 my @fields = split /\s*;\s*/, $_, -1;
11675 my $rational;
11676
11677 if ($v_version ge v5.1.0) {
11678 if (@fields != 4) {
11679 $file->carp_bad_line('Not 4 semi-colon separated fields');
11680 $_ = "";
11681 return;
11682 }
11683 $rational = $fields[3];
11684 $_ = join '; ', @fields[ 0, 3 ];
11685 }
11686 else {
11687
11688 # Here, is an older Unicode file, which has decimal numbers instead of
11689 # rationals in it. Use the fraction to calculate the denominator and
11690 # convert to rational.
11691
11692 if (@fields != 2 && @fields != 3) {
11693 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11694 $_ = "";
11695 return;
11696 }
11697
11698 my $codepoints = $fields[0];
11699 my $decimal = $fields[1];
11700 if ($decimal =~ s/\.0+$//) {
11701
11702 # Anything ending with a decimal followed by nothing but 0's is an
11703 # integer
11704 $_ = "$codepoints; $decimal";
11705 $rational = $decimal;
11706 }
11707 else {
11708
11709 my $denominator;
11710 if ($decimal =~ /\.50*$/) {
11711 $denominator = 2;
11712 }
11713
11714 # Here have the hardcoded repeating decimals in the fraction, and
11715 # the denominator they imply. There were only a few denominators
11716 # in the older Unicode versions of this file which this code
11717 # handles, so it is easy to convert them.
11718
11719 # The 4 is because of a round-off error in the Unicode 3.2 files
11720 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11721 $denominator = 3;
11722 }
11723 elsif ($decimal =~ /\.[27]50*$/) {
11724 $denominator = 4;
11725 }
11726 elsif ($decimal =~ /\.[2468]0*$/) {
11727 $denominator = 5;
11728 }
11729 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11730 $denominator = 6;
11731 }
11732 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11733 $denominator = 8;
11734 }
11735 if ($denominator) {
11736 my $sign = ($decimal < 0) ? "-" : "";
11737 my $numerator = int((abs($decimal) * $denominator) + .5);
11738 $rational = "$sign$numerator/$denominator";
11739 $_ = "$codepoints; $rational";
11740 }
11741 else {
11742 $file->carp_bad_line("Can't cope with number '$decimal'.");
11743 $_ = "";
11744 return;
11745 }
11746 }
11747 }
11748
11749 register_fraction($rational) if $rational =~ qr{/};
11750 return;
11751}
11752
11753{ # Closure
11754 my %unihan_properties;
99870f4d
KW
11755
11756 sub setup_unihan {
11757 # Do any special setup for Unihan properties.
11758
11759 # This property gives the wrong computed type, so override.
11760 my $usource = property_ref('kIRG_USource');
11761 $usource->set_type($STRING) if defined $usource;
11762
b2abbe5b
KW
11763 # This property is to be considered binary (it says so in
11764 # http://www.unicode.org/reports/tr38/)
46b2142f 11765 my $iicore = property_ref('kIICore');
99870f4d 11766 if (defined $iicore) {
46b2142f
KW
11767 $iicore->set_type($FORCED_BINARY);
11768 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11769
11770 # Unicode doesn't include the maps for this property, so don't
11771 # warn that they are missing.
11772 $iicore->set_pre_declared_maps(0);
11773 $iicore->add_comment(join_lines( <<END
11774This property contains enum values, but Unicode UAX #38 says it should be
11775interpreted as binary, so Perl creates tables for both 1) its enum values,
11776plus 2) true/false tables in which it is considered true for all code points
11777that have a non-null value
11778END
11779 ));
99870f4d
KW
11780 }
11781
11782 return;
11783 }
11784
11785 sub filter_unihan_line {
11786 # Change unihan db lines to look like the others in the db. Here is
11787 # an input sample:
11788 # U+341C kCangjie IEKN
11789
11790 # Tabs are used instead of semi-colons to separate fields; therefore
11791 # they may have semi-colons embedded in them. Change these to periods
11792 # so won't screw up the rest of the code.
11793 s/;/./g;
11794
11795 # Remove lines that don't look like ones we accept.
11796 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11797 $_ = "";
11798 return;
11799 }
11800
11801 # Extract the property, and save a reference to its object.
11802 my $property = $1;
11803 if (! exists $unihan_properties{$property}) {
11804 $unihan_properties{$property} = property_ref($property);
11805 }
11806
11807 # Don't do anything unless the property is one we're handling, which
11808 # we determine by seeing if there is an object defined for it or not
11809 if (! defined $unihan_properties{$property}) {
11810 $_ = "";
11811 return;
11812 }
11813
99870f4d
KW
11814 # Convert the tab separators to our standard semi-colons, and convert
11815 # the U+HHHH notation to the rest of the standard's HHHH
11816 s/\t/;/g;
11817 s/\b U \+ (?= $code_point_re )//xg;
11818
11819 #local $to_trace = 1 if main::DEBUG;
11820 trace $_ if main::DEBUG && $to_trace;
11821
11822 return;
11823 }
11824}
11825
11826sub filter_blocks_lines {
11827 # In the Blocks.txt file, the names of the blocks don't quite match the
11828 # names given in PropertyValueAliases.txt, so this changes them so they
11829 # do match: Blanks and hyphens are changed into underscores. Also makes
11830 # early release versions look like later ones
11831 #
11832 # $_ is transformed to the correct value.
11833
11834 my $file = shift;
11835 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11836
11837 if ($v_version lt v3.2.0) {
11838 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11839 $_ = "";
11840 return;
11841 }
11842
11843 # Old versions used a different syntax to mark the range.
11844 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11845 }
11846
11847 my @fields = split /\s*;\s*/, $_, -1;
11848 if (@fields != 2) {
11849 $file->carp_bad_line("Expecting exactly two fields");
11850 $_ = "";
11851 return;
11852 }
11853
11854 # Change hyphens and blanks in the block name field only
11855 $fields[1] =~ s/[ -]/_/g;
11856 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11857
11858 $_ = join("; ", @fields);
11859 return;
11860}
11861
11862{ # Closure
11863 my $current_property;
11864
11865 sub filter_old_style_proplist {
11866 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11867 # was in a completely different syntax. Ken Whistler of Unicode says
11868 # that it was something he used as an aid for his own purposes, but
0ff33a84
KW
11869 # was never an official part of the standard. Many of the properties
11870 # in it were incorporated into the later PropList.txt, but some were
11871 # not. This program uses this early file to generate property tables
11872 # that are otherwise not accessible in the early UCD's. It does this
11873 # for the ones that eventually became official, and don't appear to be
11874 # too different in their contents from the later official version, and
11875 # throws away the rest. It could be argued that the ones it generates
11876 # were probably not really official at that time, so should be
11877 # ignored. You can easily modify things to skip all of them by
11878 # changing this function to just set $_ to "", and return; and to skip
11879 # certain of them by by simply removing their declarations from
11880 # get_old_property_aliases().
11881 #
11882 # Here is a list of all the ones that are thrown away:
11883 # Alphabetic The definitions for this are very
11884 # defective, so better to not mislead
11885 # people into thinking it works.
11886 # Instead the Perl extension of the
11887 # same name is constructed from first
11888 # principles.
99870f4d
KW
11889 # Bidi=* duplicates UnicodeData.txt
11890 # Combining never made into official property;
11891 # is \P{ccc=0}
11892 # Composite never made into official property.
11893 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11894 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11895 # Delimiter never made into official property;
11896 # removed in 3.0.1
11897 # Format Control never made into official property;
11898 # similar to gc=cf
11899 # High Surrogate duplicates Blocks.txt
11900 # Ignorable Control never made into official property;
11901 # similar to di=y
11902 # ISO Control duplicates UnicodeData.txt: gc=cc
11903 # Left of Pair never made into official property;
11904 # Line Separator duplicates UnicodeData.txt: gc=zl
11905 # Low Surrogate duplicates Blocks.txt
11906 # Non-break was actually listed as a property
11907 # in 3.2, but without any code
11908 # points. Unicode denies that this
11909 # was ever an official property
11910 # Non-spacing duplicate UnicodeData.txt: gc=mn
11911 # Numeric duplicates UnicodeData.txt: gc=cc
11912 # Paired Punctuation never made into official property;
11913 # appears to be gc=ps + gc=pe
11914 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11915 # Private Use duplicates UnicodeData.txt: gc=co
11916 # Private Use High Surrogate duplicates Blocks.txt
11917 # Punctuation duplicates UnicodeData.txt: gc=p
11918 # Space different definition than eventual
11919 # one.
11920 # Titlecase duplicates UnicodeData.txt: gc=lt
c3783c6c 11921 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
98dc9551 11922 # Zero-width never made into official property;
99870f4d
KW
11923 # subset of gc=cf
11924 # Most of the properties have the same names in this file as in later
11925 # versions, but a couple do not.
11926 #
11927 # This subroutine filters $_, converting it from the old style into
11928 # the new style. Here's a sample of the old-style
11929 #
11930 # *******************************************
11931 #
11932 # Property dump for: 0x100000A0 (Join Control)
11933 #
11934 # 200C..200D (2 chars)
11935 #
11936 # In the example, the property is "Join Control". It is kept in this
11937 # closure between calls to the subroutine. The numbers beginning with
11938 # 0x were internal to Ken's program that generated this file.
11939
11940 # If this line contains the property name, extract it.
11941 if (/^Property dump for: [^(]*\((.*)\)/) {
11942 $_ = $1;
11943
11944 # Convert white space to underscores.
11945 s/ /_/g;
11946
11947 # Convert the few properties that don't have the same name as
11948 # their modern counterparts
11949 s/Identifier_Part/ID_Continue/
11950 or s/Not_a_Character/NChar/;
11951
11952 # If the name matches an existing property, use it.
11953 if (defined property_ref($_)) {
11954 trace "new property=", $_ if main::DEBUG && $to_trace;
11955 $current_property = $_;
11956 }
11957 else { # Otherwise discard it
11958 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11959 undef $current_property;
11960 }
11961 $_ = ""; # The property is saved for the next lines of the
11962 # file, but this defining line is of no further use,
11963 # so clear it so that the caller won't process it
11964 # further.
11965 }
11966 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11967
11968 # Here, the input line isn't a header defining a property for the
11969 # following section, and either we aren't in such a section, or
11970 # the line doesn't look like one that defines the code points in
11971 # such a section. Ignore this line.
11972 $_ = "";
11973 }
11974 else {
11975
11976 # Here, we have a line defining the code points for the current
11977 # stashed property. Anything starting with the first blank is
11978 # extraneous. Otherwise, it should look like a normal range to
11979 # the caller. Append the property name so that it looks just like
11980 # a modern PropList entry.
11981
11982 $_ =~ s/\s.*//;
11983 $_ .= "; $current_property";
11984 }
11985 trace $_ if main::DEBUG && $to_trace;
11986 return;
11987 }
11988} # End closure for old style proplist
11989
11990sub filter_old_style_normalization_lines {
11991 # For early releases of Unicode, the lines were like:
11992 # 74..2A76 ; NFKD_NO
11993 # For later releases this became:
11994 # 74..2A76 ; NFKD_QC; N
11995 # Filter $_ to look like those in later releases.
11996 # Similarly for MAYBEs
11997
11998 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11999
12000 # Also, the property FC_NFKC was abbreviated to FNC
12001 s/FNC/FC_NFKC/;
12002 return;
12003}
12004
82aed44a
KW
12005sub setup_script_extensions {
12006 # The Script_Extensions property starts out with a clone of the Script
12007 # property.
12008
4fec90df
KW
12009 my $scx = property_ref("Script_Extensions");
12010 $scx = Property->new("scx", Full_Name => "Script_Extensions")
d59563d0 12011 if ! defined $scx;
4fec90df
KW
12012 $scx->_set_format($STRING_WHITE_SPACE_LIST);
12013 $scx->initialize($script);
12014 $scx->set_default_map($script->default_map);
12015 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
82aed44a
KW
12016 $scx->add_comment(join_lines( <<END
12017The values for code points that appear in one script are just the same as for
12018the 'Script' property. Likewise the values for those that appear in many
12019scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
12020values of code points that appear in a few scripts are a space separated list
12021of those scripts.
12022END
12023 ));
12024
8d35804a 12025 # Initialize scx's tables and the aliases for them to be the same as sc's
4fec90df 12026 foreach my $table ($script->tables) {
82aed44a
KW
12027 my $scx_table = $scx->add_match_table($table->name,
12028 Full_Name => $table->full_name);
12029 foreach my $alias ($table->aliases) {
12030 $scx_table->add_alias($alias->name);
12031 }
12032 }
12033}
12034
fbe1e607
KW
12035sub filter_script_extensions_line {
12036 # The Scripts file comes with the full name for the scripts; the
12037 # ScriptExtensions, with the short name. The final mapping file is a
12038 # combination of these, and without adjustment, would have inconsistent
12039 # entries. This filters the latter file to convert to full names.
12040 # Entries look like this:
12041 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12042
12043 my @fields = split /\s*;\s*/;
ccb595a6
KW
12044
12045 # This script was erroneously omitted in this Unicode version.
12046 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12047
fbe1e607
KW
12048 my @full_names;
12049 foreach my $short_name (split " ", $fields[1]) {
12050 push @full_names, $script->table($short_name)->full_name;
12051 }
12052 $fields[1] = join " ", @full_names;
12053 $_ = join "; ", @fields;
12054
12055 return;
12056}
12057
30769324
KW
12058sub generate_hst {
12059
12060 # Populates the Hangul Syllable Type property from first principles
12061
12062 my $file= shift;
12063 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12064
12065 # These few ranges are hard-coded in.
12066 $file->insert_lines(split /\n/, <<'END'
120671100..1159 ; L
12068115F ; L
120691160..11A2 ; V
1207011A8..11F9 ; T
12071END
12072);
12073
12074 # The Hangul syllables in version 1 are completely different than what came
12075 # after, so just ignore them there.
12076 if ($v_version lt v2.0.0) {
12077 my $property = property_ref($file->property);
12078 push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12079 push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12080 return;
12081 }
12082
12083 # The algorithmically derived syllables are almost all LVT ones, so
12084 # initialize the whole range with that.
12085 $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12086 $SBase, $SBase + $SCount -1);
12087
12088 # Those ones that aren't LVT are LV, and they occur at intervals of
12089 # $TCount code points, starting with the first code point, at $SBase.
12090 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12091 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
12092 }
12093
12094 return;
12095}
12096
12097sub generate_GCB {
12098
12099 # Populates the Grapheme Cluster Break property from first principles
12100
12101 my $file= shift;
12102 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12103
12104 # All these definitions are from
12105 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12106 # from http://www.unicode.org/reports/tr29/tr29-4.html
12107
12108 foreach my $range ($gc->ranges) {
12109
12110 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
12111 # and gc=Cf
12112 if ($range->value =~ / ^ M [en] $ /x) {
12113 $file->insert_lines(sprintf "%04X..%04X; Extend",
12114 $range->start, $range->end);
12115 }
12116 elsif ($range->value =~ / ^ C [cf] $ /x) {
12117 $file->insert_lines(sprintf "%04X..%04X; Control",
12118 $range->start, $range->end);
12119 }
12120 }
12121 $file->insert_lines("2028; Control"); # Line Separator
12122 $file->insert_lines("2029; Control"); # Paragraph Separator
12123
12124 $file->insert_lines("000D; CR");
12125 $file->insert_lines("000A; LF");
12126
12127 # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12128 foreach my $code_point ( qw{
12129 40000
12130 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12131 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12132 }
12133 ) {
12134 my $category = $gc->value_of(hex $code_point);
12135 next if ! defined $category || $category eq 'Cn'; # But not if
12136 # unassigned in this
12137 # release
12138 $file->insert_lines("$code_point; Extend");
12139 }
12140
12141 my $hst = property_ref('Hangul_Syllable_Type');
12142 if ($hst->count > 0) {
12143 foreach my $range ($hst->ranges) {
12144 $file->insert_lines(sprintf "%04X..%04X; %s",
12145 $range->start, $range->end, $range->value);
12146 }
12147 }
12148 else {
12149 generate_hst($file);
12150 }
12151
12152 return;
12153}
12154
ce432655 12155sub setup_early_name_alias {
21a1aff7
KW
12156 my $file= shift;
12157 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12158
f11ca562
KW
12159 # This has the effect of pretending that the Name_Alias property was
12160 # available in all Unicode releases. Strictly speaking, this property
12161 # should not be availabe in early releases, but doing this allows
12162 # charnames.pm to work on older releases without change. Prior to v5.16
12163 # it had these names hard-coded inside it. Unicode 6.1 came along and
12164 # created these names, and so they were removed from charnames.
12165
b8ba2307 12166 my $aliases = property_ref('Name_Alias');
f11ca562
KW
12167 if (! defined $aliases) {
12168 $aliases = Property->new('Name_Alias', Default_Map => "");
12169 }
12170
21a1aff7 12171 $file->insert_lines(get_old_name_aliases());
b8ba2307 12172
b8ba2307
KW
12173 return;
12174}
12175
21a1aff7 12176sub get_old_name_aliases () {
f11ca562
KW
12177
12178 # The Unicode_1_Name field, contains most of these names. One would
12179 # expect, given the field's name, that its values would be fixed across
12180 # versions, giving the true Unicode version 1 name for the character.
12181 # Sadly, this is not the case. Actually Version 1.1.5 had no names for
12182 # any of the controls; Version 2.0 introduced names for the C0 controls,
12183 # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
12184 # changed some names: it
12185 # changed to parenthesized versions like "NEXT LINE" to
12186 # "NEXT LINE (NEL)";
12187 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12188 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
12189 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12190 # This list contains all the names that were defined so that
12191 # charnames::vianame(), etc. understand them all EVEN if this version of
12192 # Unicode didn't specify them (this could be construed as a bug).
12193 # mktables elsewhere gives preference to the Unicode_1_Name field over
12194 # these names, so that viacode() will return the correct value for that
12195 # version of Unicode, except when that version doesn't define a name,
12196 # viacode() will return one anyway (this also could be construed as a
12197 # bug). But these potential "bugs" allow for the smooth working of code
12198 # on earlier Unicode releases.
12199
21a1aff7
KW
12200 my @return = split /\n/, <<'END';
122010000;NULL;control
122020000;NUL;abbreviation
122030001;START OF HEADING;control
122040001;SOH;abbreviation
122050002;START OF TEXT;control
122060002;STX;abbreviation
122070003;END OF TEXT;control
122080003;ETX;abbreviation
122090004;END OF TRANSMISSION;control
122100004;EOT;abbreviation
122110005;ENQUIRY;control
122120005;ENQ;abbreviation
122130006;ACKNOWLEDGE;control
122140006;ACK;abbreviation
21a1aff7
KW
122150007;BELL;control
122160007;BEL;abbreviation
122170008;BACKSPACE;control
122180008;BS;abbreviation
122190009;CHARACTER TABULATION;control
122200009;HORIZONTAL TABULATION;control
122210009;HT;abbreviation
122220009;TAB;abbreviation
12223000A;LINE FEED;control
12224000A;LINE FEED (LF);control
12225000A;NEW LINE;control
12226000A;END OF LINE;control
12227000A;LF;abbreviation
12228000A;NL;abbreviation
12229000A;EOL;abbreviation
12230000B;LINE TABULATION;control
12231000B;VERTICAL TABULATION;control
12232000B;VT;abbreviation
12233000C;FORM FEED;control
12234000C;FORM FEED (FF);control
12235000C;FF;abbreviation
12236000D;CARRIAGE RETURN;control
12237000D;CARRIAGE RETURN (CR);control
12238000D;CR;abbreviation
12239000E;SHIFT OUT;control
12240000E;LOCKING-SHIFT ONE;control
12241000E;SO;abbreviation
12242000F;SHIFT IN;control
12243000F;LOCKING-SHIFT ZERO;control
12244000F;SI;abbreviation
122450010;DATA LINK ESCAPE;control
122460010;DLE;abbreviation
122470011;DEVICE CONTROL ONE;control
122480011;DC1;abbreviation
122490012;DEVICE CONTROL TWO;control
122500012;DC2;abbreviation
122510013;DEVICE CONTROL THREE;control
122520013;DC3;abbreviation
122530014;DEVICE CONTROL FOUR;control
122540014;DC4;abbreviation
122550015;NEGATIVE ACKNOWLEDGE;control
122560015;NAK;abbreviation
122570016;SYNCHRONOUS IDLE;control
122580016;SYN;abbreviation
122590017;END OF TRANSMISSION BLOCK;control
122600017;ETB;abbreviation
122610018;CANCEL;control
122620018;CAN;abbreviation
122630019;END OF MEDIUM;control
122640019;EOM;abbreviation
12265001A;SUBSTITUTE;control
12266001A;SUB;abbreviation
12267001B;ESCAPE;control
12268001B;ESC;abbreviation
12269001C;INFORMATION SEPARATOR FOUR;control
12270001C;FILE SEPARATOR;control
12271001C;FS;abbreviation
12272001D;INFORMATION SEPARATOR THREE;control
12273001D;GROUP SEPARATOR;control
12274001D;GS;abbreviation
12275001E;INFORMATION SEPARATOR TWO;control
12276001E;RECORD SEPARATOR;control
12277001E;RS;abbreviation
12278001F;INFORMATION SEPARATOR ONE;control
12279001F;UNIT SEPARATOR;control
12280001F;US;abbreviation
122810020;SP;abbreviation
12282007F;DELETE;control
12283007F;DEL;abbreviation
122840080;PADDING CHARACTER;figment
122850080;PAD;abbreviation
122860081;HIGH OCTET PRESET;figment
122870081;HOP;abbreviation
122880082;BREAK PERMITTED HERE;control
122890082;BPH;abbreviation
122900083;NO BREAK HERE;control
122910083;NBH;abbreviation
122920084;INDEX;control
122930084;IND;abbreviation
122940085;NEXT LINE;control
122950085;NEXT LINE (NEL);control
122960085;NEL;abbreviation
122970086;START OF SELECTED AREA;control
122980086;SSA;abbreviation
122990087;END OF SELECTED AREA;control
123000087;ESA;abbreviation
123010088;CHARACTER TABULATION SET;control
123020088;HORIZONTAL TABULATION SET;control
123030088;HTS;abbreviation
123040089;CHARACTER TABULATION WITH JUSTIFICATION;control
123050089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
123060089;HTJ;abbreviation
12307008A;LINE TABULATION SET;control
12308008A;VERTICAL TABULATION SET;control
12309008A;VTS;abbreviation
12310008B;PARTIAL LINE FORWARD;control
12311008B;PARTIAL LINE DOWN;control
12312008B;PLD;abbreviation
12313008C;PARTIAL LINE BACKWARD;control
12314008C;PARTIAL LINE UP;control
12315008C;PLU;abbreviation
12316008D;REVERSE LINE FEED;control
12317008D;REVERSE INDEX;control
12318008D;RI;abbreviation
12319008E;SINGLE SHIFT TWO;control
12320008E;SINGLE-SHIFT-2;control
12321008E;SS2;abbreviation
12322008F;SINGLE SHIFT THREE;control
12323008F;SINGLE-SHIFT-3;control
12324008F;SS3;abbreviation
123250090;DEVICE CONTROL STRING;control
123260090;DCS;abbreviation
123270091;PRIVATE USE ONE;control
123280091;PRIVATE USE-1;control
123290091;PU1;abbreviation
123300092;PRIVATE USE TWO;control
123310092;PRIVATE USE-2;control
123320092;PU2;abbreviation
123330093;SET TRANSMIT STATE;control
123340093;STS;abbreviation
123350094;CANCEL CHARACTER;control
123360094;CCH;abbreviation
123370095;MESSAGE WAITING;control
123380095;MW;abbreviation
123390096;START OF GUARDED AREA;control
123400096;START OF PROTECTED AREA;control
123410096;SPA;abbreviation
123420097;END OF GUARDED AREA;control
123430097;END OF PROTECTED AREA;control
123440097;EPA;abbreviation
123450098;START OF STRING;control
123460098;SOS;abbreviation
123470099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
123480099;SGC;abbreviation
12349009A;SINGLE CHARACTER INTRODUCER;control
12350009A;SCI;abbreviation
12351009B;CONTROL SEQUENCE INTRODUCER;control
12352009B;CSI;abbreviation
12353009C;STRING TERMINATOR;control
12354009C;ST;abbreviation
12355009D;OPERATING SYSTEM COMMAND;control
12356009D;OSC;abbreviation
12357009E;PRIVACY MESSAGE;control
12358009E;PM;abbreviation
12359009F;APPLICATION PROGRAM COMMAND;control
12360009F;APC;abbreviation
1236100A0;NBSP;abbreviation
1236200AD;SHY;abbreviation
12363200B;ZWSP;abbreviation
12364200C;ZWNJ;abbreviation
12365200D;ZWJ;abbreviation
12366200E;LRM;abbreviation
12367200F;RLM;abbreviation
12368202A;LRE;abbreviation
12369202B;RLE;abbreviation
12370202C;PDF;abbreviation
12371202D;LRO;abbreviation
12372202E;RLO;abbreviation
12373FEFF;BYTE ORDER MARK;alternate
12374FEFF;BOM;abbreviation
12375FEFF;ZWNBSP;abbreviation
12376END
12377
12378 if ($v_version ge v3.0.0) {
12379 push @return, split /\n/, <<'END';
12380180B; FVS1; abbreviation
12381180C; FVS2; abbreviation
12382180D; FVS3; abbreviation
12383180E; MVS; abbreviation
12384202F; NNBSP; abbreviation
12385END
12386 }
12387
12388 if ($v_version ge v3.2.0) {
12389 push @return, split /\n/, <<'END';
12390034F; CGJ; abbreviation
12391205F; MMSP; abbreviation
123922060; WJ; abbreviation
12393END
12394 # Add in VS1..VS16
12395 my $cp = 0xFE00 - 1;
12396 for my $i (1..16) {
12397 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12398 }
12399 }
12400 if ($v_version ge v4.0.0) { # Add in VS17..VS256
12401 my $cp = 0xE0100 - 17;
12402 for my $i (17..256) {
12403 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12404 }
12405 }
12406
f11ca562
KW
12407 # ALERT did not come along until 6.0, at which point it became preferred
12408 # over BELL, and was never in the Unicode_1_Name field. For the same
12409 # reasons, that the other names are made known to all releases by this
12410 # function, we make ALERT known too. By inserting it
12411 # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12412 my $alert = '0007; ALERT; control';
12413 if ($v_version lt v6.0.0) {
12414 push @return, $alert;
12415 }
12416 else {
12417 unshift @return, $alert;
12418 }
12419
21a1aff7
KW
12420 return @return;
12421}
12422
b8ba2307
KW
12423sub filter_later_version_name_alias_line {
12424
12425 # This file has an extra entry per line for the alias type. This is
12426 # handled by creating a compound entry: "$alias: $type"; First, split
12427 # the line into components.
12428 my ($range, $alias, $type, @remainder)
12429 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12430
12431 # This file contains multiple entries for some components, so tell the
12432 # downstream code to allow this in our internal tables; the
12433 # $MULTIPLE_AFTER preserves the input ordering.
12434 $_ = join ";", $range, $CMD_DELIM
12435 . $REPLACE_CMD
12436 . '='
12437 . $MULTIPLE_AFTER
12438 . $CMD_DELIM
12439 . "$alias: $type",
12440 @remainder;
12441 return;
58b75e36
KW
12442}
12443
12444sub filter_early_version_name_alias_line {
b8ba2307
KW
12445
12446 # Early versions did not have the trailing alias type field; implicitly it
21a1aff7
KW
12447 # was 'correction'. But our synthetic lines we add in this program do
12448 # have it, so test for the type field.
12449 $_ .= "; correction" if $_ !~ /;.*;/;
12450
b8ba2307 12451 filter_later_version_name_alias_line;
58b75e36 12452 return;
dcd72625
KW
12453}
12454
99870f4d
KW
12455sub finish_Unicode() {
12456 # This routine should be called after all the Unicode files have been read
12457 # in. It:
af921c2d
KW
12458 # 1) Creates properties that are missing from the version of Unicode being
12459 # compiled, and which, for whatever reason, are needed for the Perl
12460 # core to function properly. These are minimally populated as
12461 # necessary.
12462 # 2) Adds the mappings for code points missing from the files which have
99870f4d 12463 # defaults specified for them.
af921c2d 12464 # 3) At this this point all mappings are known, so it computes the type of
99870f4d 12465 # each property whose type hasn't been determined yet.
af921c2d 12466 # 4) Calculates all the regular expression match tables based on the
99870f4d 12467 # mappings.
af921c2d 12468 # 5) Calculates and adds the tables which are defined by Unicode, but
d59563d0
KW
12469 # which aren't derived by them, and certain derived tables that Perl
12470 # uses.
99870f4d 12471
a868e563
KW
12472 # Folding information was introduced later into Unicode data. To get
12473 # Perl's case ignore (/i) to work at all in releases that don't have
12474 # folding, use the best available alternative, which is lower casing.
12475 my $fold = property_ref('Case_Folding');
12476 if ($fold->is_empty) {
12477 $fold->initialize(property_ref('Lowercase_Mapping'));
12478 $fold->add_note(join_lines(<<END
12479WARNING: This table uses lower case as a substitute for missing fold
12480information
12481END
12482 ));
12483 }
12484
12485 # Multiple-character mapping was introduced later into Unicode data, so it
12486 # is by default the simple version. If to output the simple versions and
12487 # not present, just use the regular (which in these Unicode versions is
12488 # the simple as well).
12489 foreach my $map (qw { Uppercase_Mapping
12490 Lowercase_Mapping
12491 Titlecase_Mapping
12492 Case_Folding
12493 } )
12494 {
12495 my $simple = property_ref("Simple_$map");
12496 next if ! $simple->is_empty;
12497 if ($simple->to_output_map) {
12498 $simple->initialize(property_ref($map));
12499 }
12500 else {
12501 property_ref($map)->set_proxy_for($simple->name);
12502 }
12503 }
12504
99870f4d
KW
12505 # For each property, fill in any missing mappings, and calculate the re
12506 # match tables. If a property has more than one missing mapping, the
12507 # default is a reference to a data structure, and requires data from other
12508 # properties to resolve. The sort is used to cause these to be processed
12509 # last, after all the other properties have been calculated.
12510 # (Fortunately, the missing properties so far don't depend on each other.)
12511 foreach my $property
12512 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
12513 property_ref('*'))
12514 {
12515 # $perl has been defined, but isn't one of the Unicode properties that
12516 # need to be finished up.
12517 next if $property == $perl;
12518
9f877a47
KW
12519 # Nor do we need to do anything with properties that aren't going to
12520 # be output.
12521 next if $property->fate == $SUPPRESSED;
12522
99870f4d
KW
12523 # Handle the properties that have more than one possible default
12524 if (ref $property->default_map) {
12525 my $default_map = $property->default_map;
12526
12527 # These properties have stored in the default_map:
12528 # One or more of:
12529 # 1) A default map which applies to all code points in a
12530 # certain class
12531 # 2) an expression which will evaluate to the list of code
12532 # points in that class
12533 # And
12534 # 3) the default map which applies to every other missing code
12535 # point.
12536 #
12537 # Go through each list.
12538 while (my ($default, $eval) = $default_map->get_next_defaults) {
12539
12540 # Get the class list, and intersect it with all the so-far
12541 # unspecified code points yielding all the code points
12542 # in the class that haven't been specified.
12543 my $list = eval $eval;
12544 if ($@) {
12545 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
12546 last;
12547 }
12548
12549 # Narrow down the list to just those code points we don't have
12550 # maps for yet.
12551 $list = $list & $property->inverse_list;
12552
12553 # Add mappings to the property for each code point in the list
12554 foreach my $range ($list->ranges) {
56343c78
KW
12555 $property->add_map($range->start, $range->end, $default,
12556 Replace => $CROAK);
99870f4d
KW
12557 }
12558 }
12559
12560 # All remaining code points have the other mapping. Set that up
12561 # so the normal single-default mapping code will work on them
12562 $property->set_default_map($default_map->other_default);
12563
12564 # And fall through to do that
12565 }
12566
12567 # We should have enough data now to compute the type of the property.
12568 $property->compute_type;
12569 my $property_type = $property->type;
12570
12571 next if ! $property->to_create_match_tables;
12572
12573 # Here want to create match tables for this property
12574
12575 # The Unicode db always (so far, and they claim into the future) have
12576 # the default for missing entries in binary properties be 'N' (unless
12577 # there is a '@missing' line that specifies otherwise)
12578 if ($property_type == $BINARY && ! defined $property->default_map) {
12579 $property->set_default_map('N');
12580 }
12581
12582 # Add any remaining code points to the mapping, using the default for
5d7f7709 12583 # missing code points.
d8fb1cc3 12584 my $default_table;
99870f4d 12585 if (defined (my $default_map = $property->default_map)) {
1520492f 12586
f4c2a127 12587 # Make sure there is a match table for the default
f4c2a127
KW
12588 if (! defined ($default_table = $property->table($default_map))) {
12589 $default_table = $property->add_match_table($default_map);
12590 }
12591
a92d5c2e
KW
12592 # And, if the property is binary, the default table will just
12593 # be the complement of the other table.
12594 if ($property_type == $BINARY) {
12595 my $non_default_table;
12596
12597 # Find the non-default table.
12598 for my $table ($property->tables) {
12599 next if $table == $default_table;
12600 $non_default_table = $table;
12601 }
12602 $default_table->set_complement($non_default_table);
12603 }
862fd107 12604 else {
a92d5c2e 12605
3981d009
KW
12606 # This fills in any missing values with the default. It's not
12607 # necessary to do this with binary properties, as the default
12608 # is defined completely in terms of the Y table.
6189eadc 12609 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
3981d009 12610 $default_map, Replace => $NO);
862fd107 12611 }
99870f4d
KW
12612 }
12613
12614 # Have all we need to populate the match tables.
12615 my $property_name = $property->name;
56557540 12616 my $maps_should_be_defined = $property->pre_declared_maps;
99870f4d
KW
12617 foreach my $range ($property->ranges) {
12618 my $map = $range->value;
f5e9a6ca 12619 my $table = $property->table($map);
99870f4d
KW
12620 if (! defined $table) {
12621
12622 # Integral and rational property values are not necessarily
56557540
KW
12623 # defined in PropValueAliases, but whether all the other ones
12624 # should be depends on the property.
12625 if ($maps_should_be_defined
99870f4d
KW
12626 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
12627 {
12628 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
12629 }
f5e9a6ca 12630 $table = $property->add_match_table($map);
99870f4d
KW
12631 }
12632
862fd107 12633 next if $table->complement != 0; # Don't need to populate these
99870f4d
KW
12634 $table->add_range($range->start, $range->end);
12635 }
12636
06f26c45
KW
12637 # A forced binary property has additional true/false tables which
12638 # should have been set up when it was forced into binary. The false
12639 # table matches exactly the same set as the property's default table.
12640 # The true table matches the complement of that. The false table is
12641 # not the same as an additional set of aliases on top of the default
12642 # table, so use 'set_equivalent_to'. If it were implemented as
12643 # additional aliases, various things would have to be adjusted, but
12644 # especially, if the user wants to get a list of names for the table
12645 # using Unicode::UCD::prop_value_aliases(), s/he should get a
12646 # different set depending on whether they want the default table or
12647 # the false table.
12648 if ($property_type == $FORCED_BINARY) {
12649 $property->table('N')->set_equivalent_to($default_table,
12650 Related => 1);
12651 $property->table('Y')->set_complement($default_table);
12652 }
12653
807807b7
KW
12654 # For Perl 5.6 compatibility, all properties matchable in regexes can
12655 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
12656 # But warn if this creates a conflict with a (new) Unicode property
12657 # name, although it appears that Unicode has made a decision never to
12658 # begin a property name with 'Is_', so this shouldn't happen.
99870f4d
KW
12659 foreach my $alias ($property->aliases) {
12660 my $Is_name = 'Is_' . $alias->name;
807807b7 12661 if (defined (my $pre_existing = property_ref($Is_name))) {
99870f4d 12662 Carp::my_carp(<<END
807807b7
KW
12663There is already an alias named $Is_name (from " . $pre_existing . "), so
12664creating one for $property won't work. This is bad news. If it is not too
12665late, get Unicode to back off. Otherwise go back to the old scheme (findable
12666from the git blame log for this area of the code that suppressed individual
12667aliases that conflict with the new Unicode names. Proceeding anyway.
99870f4d
KW
12668END
12669 );
99870f4d
KW
12670 }
12671 } # End of loop through aliases for this property
12672 } # End of loop through all Unicode properties.
12673
12674 # Fill in the mappings that Unicode doesn't completely furnish. First the
12675 # single letter major general categories. If Unicode were to start
12676 # delivering the values, this would be redundant, but better that than to
12677 # try to figure out if should skip and not get it right. Ths could happen
12678 # if a new major category were to be introduced, and the hard-coded test
12679 # wouldn't know about it.
12680 # This routine depends on the standard names for the general categories
12681 # being what it thinks they are, like 'Cn'. The major categories are the
12682 # union of all the general category tables which have the same first
12683 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
12684 foreach my $minor_table ($gc->tables) {
12685 my $minor_name = $minor_table->name;
12686 next if length $minor_name == 1;
12687 if (length $minor_name != 2) {
12688 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
12689 next;
12690 }
12691
12692 my $major_name = uc(substr($minor_name, 0, 1));
12693 my $major_table = $gc->table($major_name);
12694 $major_table += $minor_table;
12695 }
12696
12697 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
12698 # defines it as LC)
12699 my $LC = $gc->table('LC');
12700 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
12701 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
12702
12703
12704 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
12705 # deliver the correct values in it
12706 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
12707
12708 # Lt not in release 1.
a5c376b7
KW
12709 if (defined $gc->table('Lt')) {
12710 $LC += $gc->table('Lt');
12711 $gc->table('Lt')->set_caseless_equivalent($LC);
12712 }
99870f4d
KW
12713 }
12714 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
12715
a5c376b7
KW
12716 $gc->table('Ll')->set_caseless_equivalent($LC);
12717 $gc->table('Lu')->set_caseless_equivalent($LC);
12718
99870f4d 12719 my $Cs = $gc->table('Cs');
99870f4d 12720
cdc18eb6
KW
12721 # Create digit and case fold tables with the original file names for
12722 # backwards compatibility with applications that read them directly.
12723 my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
12724 Default_Map => "",
12725 Perl_Extension => 1,
12726 File => 'Digit', # Trad. location
12727 Directory => $map_directory,
12728 UCD => 0,
12729 Type => $STRING,
f64b46a1 12730 To_Output_Map => $EXTERNAL_MAP,
cdc18eb6
KW
12731 Range_Size_1 => 1,
12732 Initialize => property_ref('Perl_Decimal_Digit'),
12733 );
12734 $Digit->add_comment(join_lines(<<END
12735This file gives the mapping of all code points which represent a single
12736decimal digit [0-9] to their respective digits. For example, the code point
12737U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
12738that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
12739numerals.
12740END
12741 ));
12742
12743 Property->new('Legacy_Case_Folding',
12744 File => "Fold",
12745 Directory => $map_directory,
12746 Default_Map => $CODE_POINT,
12747 UCD => 0,
12748 Range_Size_1 => 1,
12749 Type => $STRING,
f64b46a1 12750 To_Output_Map => $EXTERNAL_MAP,
cdc18eb6
KW
12751 Format => $HEX_FORMAT,
12752 Initialize => property_ref('cf'),
12753 );
12754
82aed44a
KW
12755 # The Script_Extensions property started out as a clone of the Script
12756 # property. But processing its data file caused some elements to be
12757 # replaced with different data. (These elements were for the Common and
12758 # Inherited properties.) This data is a qw() list of all the scripts that
12759 # the code points in the given range are in. An example line is:
12760 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
12761 #
12762 # The code above has created a new match table named "Arab Syrc Thaa"
12763 # which contains 060C. (The cloned table started out with this code point
12764 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
12765 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
12766 # match table. This is repeated for all these tables and ranges. The map
12767 # data is retained in the map table for reference, but the spurious match
12768 # tables are deleted.
12769
12770 my $scx = property_ref("Script_Extensions");
d53a7e7d 12771 if (defined $scx) {
c3a37f64
KW
12772 foreach my $table ($scx->tables) {
12773 next unless $table->name =~ /\s/; # All the new and only the new
12774 # tables have a space in their
12775 # names
12776 my @scripts = split /\s+/, $table->name;
12777 foreach my $script (@scripts) {
12778 my $script_table = $scx->table($script);
12779 $script_table += $table;
12780 }
12781 $scx->delete_match_table($table);
82aed44a 12782 }
d53a7e7d 12783 }
82aed44a
KW
12784
12785 return;
99870f4d
KW
12786}
12787
fc862497
KW
12788sub pre_3_dot_1_Nl () {
12789
12790 # Return a range list for gc=nl for Unicode versions prior to 3.1, which
12791 # is when Unicode's became fully usable. These code points were
12792 # determined by inspection and experimentation. gc=nl is important for
12793 # certain Perl-extension properties that should be available in all
12794 # releases.
12795
12796 my $Nl = Range_List->new();
12797 if (defined (my $official = $gc->table('Nl'))) {
12798 $Nl += $official;
12799 }
12800 else {
12801 $Nl->add_range(0x2160, 0x2182);
12802 $Nl->add_range(0x3007, 0x3007);
12803 $Nl->add_range(0x3021, 0x3029);
12804 }
12805 $Nl->add_range(0xFE20, 0xFE23);
12806 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
12807 # these were added
12808 return $Nl;
12809}
12810
99870f4d
KW
12811sub compile_perl() {
12812 # Create perl-defined tables. Almost all are part of the pseudo-property
12813 # named 'perl' internally to this program. Many of these are recommended
12814 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
12815 # on those found there.
12816 # Almost all of these are equivalent to some Unicode property.
12817 # A number of these properties have equivalents restricted to the ASCII
12818 # range, with their names prefaced by 'Posix', to signify that these match
12819 # what the Posix standard says they should match. A couple are
12820 # effectively this, but the name doesn't have 'Posix' in it because there
cbc24f92
KW
12821 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
12822 # to the full Unicode range, by our guesses as to what is appropriate.
99870f4d
KW
12823
12824 # 'Any' is all code points. As an error check, instead of just setting it
12825 # to be that, construct it to be the union of all the major categories
7fc6cb55 12826 $Any = $perl->add_match_table('Any',
6189eadc 12827 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
99870f4d
KW
12828 Matches_All => 1);
12829
12830 foreach my $major_table ($gc->tables) {
12831
12832 # Major categories are the ones with single letter names.
12833 next if length($major_table->name) != 1;
12834
12835 $Any += $major_table;
12836 }
12837
6189eadc 12838 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
99870f4d
KW
12839 Carp::my_carp_bug("Generated highest code point ("
12840 . sprintf("%X", $Any->max)
6189eadc 12841 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
99870f4d
KW
12842 }
12843 if ($Any->range_count != 1 || $Any->min != 0) {
12844 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
12845 }
12846
12847 $Any->add_alias('All');
12848
12849 # Assigned is the opposite of gc=unassigned
12850 my $Assigned = $perl->add_match_table('Assigned',
12851 Description => "All assigned code points",
12852 Initialize => ~ $gc->table('Unassigned'),
12853 );
12854
12855 # Our internal-only property should be treated as more than just a
8050d00f 12856 # synonym; grandfather it in to the pod.
b15a0a3b
KW
12857 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
12858 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
99870f4d
KW
12859 ->set_equivalent_to(property_ref('ccc')->table('Above'),
12860 Related => 1);
12861
12862 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
12863 if (defined $block) { # This is equivalent to the block if have it.
12864 my $Unicode_ASCII = $block->table('Basic_Latin');
12865 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
12866 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
12867 }
12868 }
12869
12870 # Very early releases didn't have blocks, so initialize ASCII ourselves if
12871 # necessary
12872 if ($ASCII->is_empty) {
428230fa 12873 $ASCII->add_range(0, 127);
99870f4d
KW
12874 }
12875
99870f4d
KW
12876 # Get the best available case definitions. Early Unicode versions didn't
12877 # have Uppercase and Lowercase defined, so use the general category
4e165f0a
KW
12878 # instead for them, modified by hard-coding in the code points each is
12879 # missing.
99870f4d
KW
12880 my $Lower = $perl->add_match_table('Lower');
12881 my $Unicode_Lower = property_ref('Lowercase');
12882 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
12883 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
a5c376b7 12884
99870f4d
KW
12885 }
12886 else {
4e165f0a
KW
12887 $Lower += $gc->table('Lowercase_Letter');
12888
12889 # There are quite a few code points in Lower, that aren't in gc=lc,
12890 # and not all are in all releases.
12891 foreach my $code_point ( 0x00AA,
12892 0x00BA,
12893 0x02B0 .. 0x02B8,
12894 0x02C0 .. 0x02C1,
12895 0x02E0 .. 0x02E4,
12896 0x0345,
12897 0x037A,
12898 0x1D2C .. 0x1D6A,
12899 0x1D78,
12900 0x1D9B .. 0x1DBF,
12901 0x2071,
12902 0x207F,
12903 0x2090 .. 0x209C,
12904 0x2170 .. 0x217F,
12905 0x24D0 .. 0x24E9,
12906 0x2C7C .. 0x2C7D,
12907 0xA770,
12908 0xA7F8 .. 0xA7F9,
12909 ) {
12910 # Don't include the code point unless it is assigned in this
12911 # release
12912 my $category = $gc->value_of(hex $code_point);
12913 next if ! defined $category || $category eq 'Cn';
12914
12915 $Lower += $code_point;
12916 }
99870f4d 12917 }
cbc24f92 12918 $Lower->add_alias('XPosixLower');
a5c376b7 12919 my $Posix_Lower = $perl->add_match_table("PosixLower",
ad5e8af1
KW
12920 Description => "[a-z]",
12921 Initialize => $Lower & $ASCII,
12922 );
99870f4d
KW
12923
12924 my $Upper = $perl->add_match_table('Upper');
12925 my $Unicode_Upper = property_ref('Uppercase');
12926 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
12927 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
12928 }
12929 else {
4e165f0a
KW
12930
12931 # Unlike Lower, there are only two ranges in Upper that aren't in
12932 # gc=Lu, and all code points were assigned in all releases.
12933 $Upper += $gc->table('Uppercase_Letter');
12934 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
12935 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
99870f4d 12936 }
cbc24f92 12937 $Upper->add_alias('XPosixUpper');
a5c376b7 12938 my $Posix_Upper = $perl->add_match_table("PosixUpper",
ad5e8af1
KW
12939 Description => "[A-Z]",
12940 Initialize => $Upper & $ASCII,
12941 );
99870f4d
KW
12942
12943 # Earliest releases didn't have title case. Initialize it to empty if not
12944 # otherwise present
4364919a
KW
12945 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
12946 Description => '(= \p{Gc=Lt})');
99870f4d 12947 my $lt = $gc->table('Lt');
a5c376b7
KW
12948
12949 # Earlier versions of mktables had this related to $lt since they have
c12f2655
KW
12950 # identical code points, but their caseless equivalents are not the same,
12951 # one being 'Cased' and the other being 'LC', and so now must be kept as
12952 # separate entities.
2e2778b2
KW
12953 if (defined $lt) {
12954 $Title += $lt;
12955 }
12956 else {
12957 push @tables_that_may_be_empty, $Title->complete_name;
12958 }
99870f4d 12959
a5c376b7 12960 my $Unicode_Cased = property_ref('Cased');
f53af35e
KW
12961 if (defined $Unicode_Cased) {
12962 my $yes = $Unicode_Cased->table('Y');
12963 my $no = $Unicode_Cased->table('N');
12964 $Title->set_caseless_equivalent($yes);
12965 if (defined $Unicode_Upper) {
12966 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
12967 $Unicode_Upper->table('N')->set_caseless_equivalent($no);
12968 }
12969 $Upper->set_caseless_equivalent($yes);
12970 if (defined $Unicode_Lower) {
12971 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
12972 $Unicode_Lower->table('N')->set_caseless_equivalent($no);
12973 }
12974 $Lower->set_caseless_equivalent($yes);
12975 }
12976 else {
12977 # If this Unicode version doesn't have Cased, set up the Perl
12978 # extension from first principles. From Unicode 5.1: Definition D120:
12979 # A character C is defined to be cased if and only if C has the
12980 # Lowercase or Uppercase property or has a General_Category value of
12981 # Titlecase_Letter.
99870f4d
KW
12982 my $cased = $perl->add_match_table('Cased',
12983 Initialize => $Lower + $Upper + $Title,
12984 Description => 'Uppercase or Lowercase or Titlecase',
12985 );
f53af35e
KW
12986 # $notcased is purely for the caseless equivalents below
12987 my $notcased = $perl->add_match_table('_Not_Cased',
12988 Initialize => ~ $cased,
12989 Fate => $INTERNAL_ONLY,
12990 Description => 'All not-cased code points');
12991 $Title->set_caseless_equivalent($cased);
12992 if (defined $Unicode_Upper) {
12993 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
12994 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
12995 }
12996 $Upper->set_caseless_equivalent($cased);
12997 if (defined $Unicode_Lower) {
12998 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
12999 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
13000 }
13001 $Lower->set_caseless_equivalent($cased);
99870f4d
KW
13002 }
13003
13004 # Similarly, set up our own Case_Ignorable property if this Unicode
13005 # version doesn't have it. From Unicode 5.1: Definition D121: A character
13006 # C is defined to be case-ignorable if C has the value MidLetter or the
13007 # value MidNumLet for the Word_Break property or its General_Category is
13008 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
13009 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
13010
8050d00f
KW
13011 # Perl has long had an internal-only alias for this property; grandfather
13012 # it in to the pod, but discourage its use.
13013 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
b15a0a3b
KW
13014 Re_Pod_Entry => 1,
13015 Fate => $INTERNAL_ONLY,
13016 Status => $DISCOURAGED);
99870f4d
KW
13017 my $case_ignorable = property_ref('Case_Ignorable');
13018 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13019 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13020 Related => 1);
13021 }
13022 else {
13023
13024 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13025
13026 # The following three properties are not in early releases
13027 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13028 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13029 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13030
13031 # For versions 4.1 - 5.0, there is no MidNumLet property, and
13032 # correspondingly the case-ignorable definition lacks that one. For
13033 # 4.0, it appears that it was meant to be the same definition, but was
13034 # inadvertently omitted from the standard's text, so add it if the
13035 # property actually is there
13036 my $wb = property_ref('Word_Break');
13037 if (defined $wb) {
13038 my $midlet = $wb->table('MidLetter');
13039 $perl_case_ignorable += $midlet if defined $midlet;
13040 my $midnumlet = $wb->table('MidNumLet');
13041 $perl_case_ignorable += $midnumlet if defined $midnumlet;
13042 }
13043 else {
13044
13045 # In earlier versions of the standard, instead of the above two
13046 # properties , just the following characters were used:
13047 $perl_case_ignorable += 0x0027 # APOSTROPHE
13048 + 0x00AD # SOFT HYPHEN (SHY)
13049 + 0x2019; # RIGHT SINGLE QUOTATION MARK
13050 }
13051 }
13052
13053 # The remaining perl defined tables are mostly based on Unicode TR 18,
13054 # "Annex C: Compatibility Properties". All of these have two versions,
13055 # one whose name generally begins with Posix that is posix-compliant, and
13056 # one that matches Unicode characters beyond the Posix, ASCII range
13057
ad5e8af1 13058 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
13059
13060 # Alphabetic was not present in early releases
13061 my $Alphabetic = property_ref('Alphabetic');
13062 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13063 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13064 }
13065 else {
13066
fc862497
KW
13067 # The Alphabetic property doesn't exist for early releases, so
13068 # generate it. The actual definition, in 5.2 terms is:
13069 #
13070 # gc=L + gc=Nl + Other_Alphabetic
13071 #
13072 # Other_Alphabetic is also not defined in these early releases, but it
13073 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13074 # those last two as well, then subtract the relatively few of them that
13075 # shouldn't have been added. (The gc=So range is the circled capital
13076 # Latin characters. Early releases mistakenly didn't also include the
13077 # lower-case versions of these characters, and so we don't either, to
13078 # maintain consistency with those releases that first had this
13079 # property.
99870f4d 13080 $Alpha->initialize($gc->table('Letter')
fc862497
KW
13081 + pre_3_dot_1_Nl()
13082 + $gc->table('Mn')
13083 + $gc->table('Mc')
13084 );
13085 $Alpha->add_range(0x24D0, 0x24E9); # gc=So
13086 foreach my $range ( [ 0x0300, 0x0344 ],
13087 [ 0x0346, 0x034E ],
13088 [ 0x0360, 0x0362 ],
13089 [ 0x0483, 0x0486 ],
13090 [ 0x0591, 0x05AF ],
13091 [ 0x06DF, 0x06E0 ],
13092 [ 0x06EA, 0x06EC ],
13093 [ 0x0740, 0x074A ],
13094 0x093C,
13095 0x094D,
13096 [ 0x0951, 0x0954 ],
13097 0x09BC,
13098 0x09CD,
13099 0x0A3C,
13100 0x0A4D,
13101 0x0ABC,
13102 0x0ACD,
13103 0x0B3C,
13104 0x0B4D,
13105 0x0BCD,
13106 0x0C4D,
13107 0x0CCD,
13108 0x0D4D,
13109 0x0DCA,
13110 [ 0x0E47, 0x0E4C ],
13111 0x0E4E,
13112 [ 0x0EC8, 0x0ECC ],
13113 [ 0x0F18, 0x0F19 ],
13114 0x0F35,
13115 0x0F37,
13116 0x0F39,
13117 [ 0x0F3E, 0x0F3F ],
13118 [ 0x0F82, 0x0F84 ],
13119 [ 0x0F86, 0x0F87 ],
13120 0x0FC6,
13121 0x1037,
13122 0x1039,
13123 [ 0x17C9, 0x17D3 ],
13124 [ 0x20D0, 0x20DC ],
13125 0x20E1,
13126 [ 0x302A, 0x302F ],
13127 [ 0x3099, 0x309A ],
13128 [ 0xFE20, 0xFE23 ],
13129 [ 0x1D165, 0x1D169 ],
13130 [ 0x1D16D, 0x1D172 ],
13131 [ 0x1D17B, 0x1D182 ],
13132 [ 0x1D185, 0x1D18B ],
13133 [ 0x1D1AA, 0x1D1AD ],
13134 ) {
13135 if (ref $range) {
13136 $Alpha->delete_range($range->[0], $range->[1]);
13137 }
13138 else {
13139 $Alpha->delete_range($range, $range);
13140 }
13141 }
ad5e8af1 13142 $Alpha->add_description('Alphabetic');
0ff33a84 13143 $Alpha->add_alias('Alphabetic');
99870f4d 13144 }
cbc24f92 13145 $Alpha->add_alias('XPosixAlpha');
a5c376b7 13146 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
ad5e8af1
KW
13147 Description => "[A-Za-z]",
13148 Initialize => $Alpha & $ASCII,
13149 );
a5c376b7
KW
13150 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13151 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
99870f4d
KW
13152
13153 my $Alnum = $perl->add_match_table('Alnum',
56339b2c 13154 Description => 'Alphabetic and (decimal) Numeric',
99870f4d
KW
13155 Initialize => $Alpha + $gc->table('Decimal_Number'),
13156 );
cbc24f92 13157 $Alnum->add_alias('XPosixAlnum');
ad5e8af1
KW
13158 $perl->add_match_table("PosixAlnum",
13159 Description => "[A-Za-z0-9]",
13160 Initialize => $Alnum & $ASCII,
13161 );
99870f4d
KW
13162
13163 my $Word = $perl->add_match_table('Word',
d35dd6c6
KW
13164 Description => '\w, including beyond ASCII;'
13165 . ' = \p{Alnum} + \pM + \p{Pc}',
99870f4d
KW
13166 Initialize => $Alnum + $gc->table('Mark'),
13167 );
cbc24f92 13168 $Word->add_alias('XPosixWord');
99870f4d 13169 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
545b221f
KW
13170 if (defined $Pc) {
13171 $Word += $Pc;
13172 }
13173 else {
13174 $Word += ord('_'); # Make sure this is a $Word
13175 }
7a4d6ad6
KW
13176 my $JC = property_ref('Join_Control'); # Wasn't in release 1
13177 if (defined $JC) {
13178 $Word += $JC->table('Y');
13179 }
13180 else {
13181 $Word += 0x200C + 0x200D;
13182 }
99870f4d 13183
f38f76ae 13184 # This is a Perl extension, so the name doesn't begin with Posix.
cbc24f92 13185 my $PerlWord = $perl->add_match_table('PerlWord',
99870f4d
KW
13186 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13187 Initialize => $Word & $ASCII,
13188 );
cbc24f92 13189 $PerlWord->add_alias('PosixWord');
99870f4d
KW
13190
13191 my $Blank = $perl->add_match_table('Blank',
13192 Description => '\h, Horizontal white space',
13193
13194 # 200B is Zero Width Space which is for line
13195 # break control, and was listed as
13196 # Space_Separator in early releases
13197 Initialize => $gc->table('Space_Separator')
13198 + 0x0009 # TAB
13199 - 0x200B, # ZWSP
13200 );
13201 $Blank->add_alias('HorizSpace'); # Another name for it.
cbc24f92 13202 $Blank->add_alias('XPosixBlank');
ad5e8af1
KW
13203 $perl->add_match_table("PosixBlank",
13204 Description => "\\t and ' '",
13205 Initialize => $Blank & $ASCII,
13206 );
99870f4d
KW
13207
13208 my $VertSpace = $perl->add_match_table('VertSpace',
13209 Description => '\v',
13210 Initialize => $gc->table('Line_Separator')
13211 + $gc->table('Paragraph_Separator')
13212 + 0x000A # LINE FEED
13213 + 0x000B # VERTICAL TAB
13214 + 0x000C # FORM FEED
13215 + 0x000D # CARRIAGE RETURN
13216 + 0x0085, # NEL
13217 );
13218 # No Posix equivalent for vertical space
13219
13220 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
13221 Description => '\s including beyond ASCII plus vertical tab',
13222 Initialize => $Blank + $VertSpace,
99870f4d 13223 );
cbc24f92 13224 $Space->add_alias('XPosixSpace');
ad5e8af1 13225 $perl->add_match_table("PosixSpace",
f38f76ae 13226 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
13227 Initialize => $Space & $ASCII,
13228 );
99870f4d
KW
13229
13230 # Perl's traditional space doesn't include Vertical Tab
cbc24f92 13231 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
99870f4d 13232 Description => '\s, including beyond ASCII',
075b9d7d
KW
13233 #Initialize => $Space - 0x000B,
13234 Initialize => $Space,
99870f4d 13235 );
cbc24f92
KW
13236 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
13237 my $PerlSpace = $perl->add_match_table('PerlSpace',
075b9d7d 13238 Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
de25ec47 13239 Initialize => $XPerlSpace & $ASCII,
99870f4d
KW
13240 );
13241
cbc24f92 13242
99870f4d 13243 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 13244 Description => 'Control characters');
99870f4d 13245 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
cbc24f92 13246 $Cntrl->add_alias('XPosixCntrl');
ad5e8af1 13247 $perl->add_match_table("PosixCntrl",
f38f76ae 13248 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
13249 Initialize => $Cntrl & $ASCII,
13250 );
99870f4d
KW
13251
13252 # $controls is a temporary used to construct Graph.
13253 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13254 + $gc->table('Control'));
13255 # Cs not in release 1
13256 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13257
13258 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
13259 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 13260 Description => 'Characters that are graphical',
99870f4d
KW
13261 Initialize => ~ ($Space + $controls),
13262 );
cbc24f92 13263 $Graph->add_alias('XPosixGraph');
ad5e8af1 13264 $perl->add_match_table("PosixGraph",
f38f76ae
KW
13265 Description =>
13266 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
13267 Initialize => $Graph & $ASCII,
13268 );
99870f4d 13269
3e20195b 13270 $print = $perl->add_match_table('Print',
ad5e8af1 13271 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 13272 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 13273 );
cbc24f92 13274 $print->add_alias('XPosixPrint');
ad5e8af1 13275 $perl->add_match_table("PosixPrint",
66fd7fd0 13276 Description =>
f38f76ae 13277 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 13278 Initialize => $print & $ASCII,
ad5e8af1 13279 );
99870f4d
KW
13280
13281 my $Punct = $perl->add_match_table('Punct');
13282 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13283
13284 # \p{punct} doesn't include the symbols, which posix does
cbc24f92
KW
13285 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13286 Description => '\p{Punct} + ASCII-range \p{Symbol}',
13287 Initialize => $gc->table('Punctuation')
13288 + ($ASCII & $gc->table('Symbol')),
bb080638 13289 Perl_Extension => 1
cbc24f92 13290 );
bb080638 13291 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
f38f76ae 13292 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
cbc24f92 13293 Initialize => $ASCII & $XPosixPunct,
ad5e8af1 13294 );
99870f4d
KW
13295
13296 my $Digit = $perl->add_match_table('Digit',
f3a73f6e 13297 Description => '[0-9] + all other decimal digits');
99870f4d 13298 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
cbc24f92 13299 $Digit->add_alias('XPosixDigit');
ad5e8af1
KW
13300 my $PosixDigit = $perl->add_match_table("PosixDigit",
13301 Description => '[0-9]',
13302 Initialize => $Digit & $ASCII,
13303 );
99870f4d 13304
eadadd41
KW
13305 # Hex_Digit was not present in first release
13306 my $Xdigit = $perl->add_match_table('XDigit');
cbc24f92 13307 $Xdigit->add_alias('XPosixXDigit');
eadadd41
KW
13308 my $Hex = property_ref('Hex_Digit');
13309 if (defined $Hex && ! $Hex->is_empty) {
13310 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
13311 }
13312 else {
eadadd41
KW
13313 # (Have to use hex instead of e.g. '0', because could be running on an
13314 # non-ASCII machine, and we want the Unicode (ASCII) values)
13315 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
13316 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13317 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d 13318 }
4efcc33b
KW
13319
13320 # AHex was not present in early releases
13321 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13322 my $AHex = property_ref('ASCII_Hex_Digit');
13323 if (defined $AHex && ! $AHex->is_empty) {
13324 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13325 }
13326 else {
13327 $PosixXDigit->initialize($Xdigit & $ASCII);
aff65f9f
KW
13328 $PosixXDigit->add_alias('AHex');
13329 $PosixXDigit->add_alias('Ascii_Hex_Digit');
4efcc33b
KW
13330 }
13331 $PosixXDigit->add_description('[0-9A-Fa-f]');
99870f4d 13332
99870f4d
KW
13333 my $dt = property_ref('Decomposition_Type');
13334 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13335 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13336 Perl_Extension => 1,
d57ccc9a 13337 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
13338 );
13339
13340 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13341 # than SD appeared, construct it ourselves, based on the first release SD
8050d00f 13342 # was in. A pod entry is grandfathered in for it
33e96e72 13343 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
301ba948
KW
13344 Perl_Extension => 1,
13345 Fate => $INTERNAL_ONLY,
13346 Status => $DISCOURAGED);
99870f4d
KW
13347 my $soft_dotted = property_ref('Soft_Dotted');
13348 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13349 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13350 }
13351 else {
13352
c3783c6c
KW
13353 # This list came from 3.2 Soft_Dotted; all of these code points are in
13354 # all releases
99870f4d
KW
13355 $CanonDCIJ->initialize([ 0x0069,
13356 0x006A,
13357 0x012F,
13358 0x0268,
13359 0x0456,
13360 0x0458,
13361 0x1E2D,
13362 0x1ECB,
13363 ]);
13364 $CanonDCIJ = $CanonDCIJ & $Assigned;
13365 }
13366
a19e6569 13367 # For backward compatibility, Perl has its own definition for IDStart.
c3783c6c
KW
13368 # It is regular XID_Start plus the underscore, but all characters must be
13369 # Word characters as well
0646dc10 13370 my $XID_Start = property_ref('XID_Start');
8fdef279 13371 my $perl_xids = $perl->add_match_table('_Perl_IDStart',
5a5f34c4
KW
13372 Perl_Extension => 1,
13373 Fate => $INTERNAL_ONLY,
13374 Initialize => ord('_')
13375 );
0646dc10
KW
13376 if (defined $XID_Start
13377 || defined ($XID_Start = property_ref('ID_Start')))
13378 {
8fdef279 13379 $perl_xids += $XID_Start->table('Y');
0646dc10
KW
13380 }
13381 else {
13382 # For Unicode versions that don't have the property, construct our own
c157f5d2
KW
13383 # from first principles. The actual definition is:
13384 # Letters
13385 # + letter numbers (Nl)
13386 # - Pattern_Syntax
13387 # - Pattern_White_Space
13388 # + stability extensions
13389 # - NKFC modifications
13390 #
13391 # What we do in the code below is to include the identical code points
13392 # that are in the first release that had Unicode's version of this
13393 # property, essentially extrapolating backwards. There were no
13394 # stability extensions until v4.1, so none are included; likewise in
13395 # no Unicode version so far do subtracting PatSyn and PatWS make any
13396 # difference, so those also are ignored.
13397 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
13398
13399 # We do subtract the NFKC modifications that are in the first version
13400 # that had this property. We don't bother to test if they are in the
13401 # version in question, because if they aren't, the operation is a
13402 # no-op. The NKFC modifications are discussed in
13403 # http://www.unicode.org/reports/tr31/#NFKC_Modifications
13404 foreach my $range ( 0x037A,
13405 0x0E33,
13406 0x0EB3,
13407 [ 0xFC5E, 0xFC63 ],
13408 [ 0xFDFA, 0xFE70 ],
13409 [ 0xFE72, 0xFE76 ],
13410 0xFE78,
13411 0xFE7A,
13412 0xFE7C,
13413 0xFE7E,
13414 [ 0xFF9E, 0xFF9F ],
13415 ) {
13416 if (ref $range) {
13417 $perl_xids->delete_range($range->[0], $range->[1]);
13418 }
13419 else {
13420 $perl_xids->delete_range($range, $range);
13421 }
13422 }
0646dc10 13423 }
c157f5d2 13424
8fdef279 13425 $perl_xids &= $Word;
ee24a51c 13426
0686c5ae
KW
13427 my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
13428 Perl_Extension => 1,
13429 Fate => $INTERNAL_ONLY);
13430 my $XIDC = property_ref('XID_Continue');
13431 if (defined $XIDC
13432 || defined ($XIDC = property_ref('ID_Continue')))
13433 {
13434 $perl_xidc += $XIDC->table('Y');
13435 }
13436 else {
13437 # Similarly, we construct our own XIDC if necessary for early Unicode
13438 # versions. The definition is:
13439 # everything in XIDS
13440 # + Gc=Mn
13441 # + Gc=Mc
13442 # + Gc=Nd
13443 # + Gc=Pc
13444 # - Pattern_Syntax
13445 # - Pattern_White_Space
13446 # + stability extensions
13447 # - NFKC modifications
13448 #
13449 # The same thing applies to this as with XIDS for the PatSyn, PatWS,
13450 # and stability extensions. There is a somewhat different set of NFKC
13451 # mods to remove (and add in this case). The ones below make this
13452 # have identical code points as in the first release that defined it.
13453 $perl_xidc += $perl_xids
13454 + $gc->table('L')
13455 + $gc->table('Mn')
13456 + $gc->table('Mc')
13457 + $gc->table('Nd')
13458 + 0x00B7
13459 ;
13460 if (defined (my $pc = $gc->table('Pc'))) {
13461 $perl_xidc += $pc;
13462 }
13463 else { # 1.1.5 didn't have Pc, but these should have been in it
13464 $perl_xidc += 0xFF3F;
13465 $perl_xidc->add_range(0x203F, 0x2040);
13466 $perl_xidc->add_range(0xFE33, 0xFE34);
13467 $perl_xidc->add_range(0xFE4D, 0xFE4F);
13468 }
13469
13470 # Subtract the NFKC mods
13471 foreach my $range ( 0x037A,
13472 [ 0xFC5E, 0xFC63 ],
13473 [ 0xFDFA, 0xFE1F ],
13474 0xFE70,
13475 [ 0xFE72, 0xFE76 ],
13476 0xFE78,
13477 0xFE7A,
13478 0xFE7C,
13479 0xFE7E,
13480 ) {
13481 if (ref $range) {
13482 $perl_xidc->delete_range($range->[0], $range->[1]);
13483 }
13484 else {
13485 $perl_xidc->delete_range($range, $range);
13486 }
13487 }
13488 }
13489
c88f7420
KW
13490 $perl_xidc &= $Word;
13491
8f78a100
KW
13492 # These two tables are for matching \X, which is based on the 'extended'
13493 # grapheme cluster, which came in 5.1; create empty ones if not already
13494 # present. The straight 'grapheme cluster' (non-extended) is used prior
13495 # to 5.1, and differs from the extended (see
30769324
KW
13496 # http://www.unicode.org/reports/tr29/) only by these two tables, so we
13497 # get the older definition automatically when they are empty.
99870f4d 13498 my $gcb = property_ref('Grapheme_Cluster_Break');
30769324
KW
13499 my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
13500 Perl_Extension => 1,
13501 Fate => $INTERNAL_ONLY);
13502 if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
13503 $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
13504 }
13505 else {
13506 push @tables_that_may_be_empty, $perl_prepend->complete_name;
13507 }
13508
8f78a100
KW
13509 # All the tables with _X_ in their names are used in defining \X handling,
13510 # and are based on the Unicode GCB property. Basically, \X matches:
13511 # CR-LF
13512 # | Prepend* Begin Extend*
13513 # | .
1e958ea9 13514 # Begin is: ( Special_Begin | ! Control )
27d4fc33
KW
13515 # Begin is also: ( Regular_Begin | Special_Begin )
13516 # where Regular_Begin is defined as ( ! Control - Special_Begin )
1e958ea9 13517 # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
8f78a100
KW
13518 # Extend is: ( Grapheme_Extend | Spacing_Mark )
13519 # Control is: [ GCB_Control CR LF ]
13520 # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
30769324 13521
30769324
KW
13522 foreach my $gcb_name (qw{ L V T LV LVT }) {
13523
13524 # The perl internal extension's name is the gcb table name prepended
13525 # with an '_X_'
13526 my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
13527 Perl_Extension => 1,
13528 Fate => $INTERNAL_ONLY,
13529 Initialize => $gcb->table($gcb_name),
13530 );
13531 # Version 1 had mostly different Hangul syllables that were removed
13532 # from later versions, so some of the tables may not apply.
13533 if ($v_version lt v2.0) {
13534 push @tables_that_may_be_empty, $perl_table->complete_name;
13535 }
99870f4d 13536 }
8f78a100
KW
13537
13538 # More GCB. Populate a combined hangul syllables table
13539 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
13540 Perl_Extension => 1,
13541 Fate => $INTERNAL_ONLY);
9c3680f9 13542 $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
f0fd9933 13543 $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
99870f4d 13544
1e958ea9
KW
13545 my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
13546 Fate => $INTERNAL_ONLY);
13547 $ri += $gcb->table('RI') if $v_version ge v6.2;
13548
13549 my $specials_begin = $perl->add_match_table('_X_Special_Begin',
13550 Perl_Extension => 1,
13551 Fate => $INTERNAL_ONLY,
13552 Initialize => $lv_lvt_v
13553 + $gcb->table('L')
13554 + $gcb->table('T')
13555 + $ri
13556 );
13557 $specials_begin->add_comment(join_lines( <<END
13558For use in \\X; matches first character of potential multi-character
13559sequences that can begin an extended grapheme cluster. They need special
13560handling because of their complicated nature.
13561END
13562 ));
27d4fc33
KW
13563 my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
13564 Perl_Extension => 1,
1e958ea9 13565 Fate => $INTERNAL_ONLY,
27d4fc33
KW
13566 Initialize => ~ $gcb->table('Control')
13567 - $specials_begin
1e958ea9
KW
13568 - $gcb->table('CR')
13569 - $gcb->table('LF')
13570 );
27d4fc33 13571 $regular_begin->add_comment(join_lines( <<END
1e958ea9 13572For use in \\X; matches first character of anything that can begin an extended
27d4fc33 13573grapheme cluster, except those that require special handling.
1e958ea9
KW
13574END
13575 ));
8f78a100 13576
1e958ea9
KW
13577 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
13578 Fate => $INTERNAL_ONLY,
13579 Initialize => $gcb->table('Extend')
13580 );
8f78a100
KW
13581 if (defined (my $sm = $gcb->table('SpacingMark'))) {
13582 $extend += $sm;
13583 }
13584 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
13585
13586 # End of GCB \X processing
13587
f39c755e 13588 my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
99870f4d
KW
13589
13590 if (@named_sequences) {
13591 push @composition, 'Named_Sequence';
13592 foreach my $sequence (@named_sequences) {
13593 $perl_charname->add_anomalous_entry($sequence);
13594 }
13595 }
13596
13597 my $alias_sentence = "";
21a1aff7 13598 my %abbreviations;
99870f4d 13599 my $alias = property_ref('Name_Alias');
cf01e057
KW
13600 $perl_charname->set_proxy_for('Name_Alias');
13601
13602 # Add each entry in Name_Alias to Perl_Charnames. Where these go with
13603 # respect to any existing entry depends on the entry type. Corrections go
13604 # before said entry, as they should be returned in preference over the
13605 # existing entry. (A correction to a correction should be later in the
13606 # Name_Alias table, so it will correctly precede the erroneous correction
13607 # in Perl_Charnames.)
13608 #
13609 # Abbreviations go after everything else, so they are saved temporarily in
13610 # a hash for later.
13611 #
232ed87f
KW
13612 # Everything else is added added afterwards, which preserves the input
13613 # ordering
b8ba2307 13614
086bd819 13615 foreach my $range ($alias->ranges) {
cf01e057
KW
13616 next if $range->value eq "";
13617 my $code_point = $range->start;
13618 if ($code_point != $range->end) {
13619 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
13620 }
13621 my ($value, $type) = split ': ', $range->value;
13622 my $replace_type;
13623 if ($type eq 'correction') {
13624 $replace_type = $MULTIPLE_BEFORE;
13625 }
13626 elsif ($type eq 'abbreviation') {
4cc560b2 13627
cf01e057
KW
13628 # Save for later
13629 $abbreviations{$value} = $code_point;
13630 next;
b8ba2307 13631 }
cf01e057
KW
13632 else {
13633 $replace_type = $MULTIPLE_AFTER;
13634 }
13635
13636 # Actually add; before or after current entry(ies) as determined
13637 # above.
13638
13639 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
13640 }
d7ddd2c4
KW
13641 $alias_sentence = <<END;
13642The Name_Alias property adds duplicate code point entries that are
13643alternatives to the original name. If an addition is a corrected
13644name, it will be physically first in the table. The original (less correct,
13645but still valid) name will be next; then any alternatives, in no particular
13646order; and finally any abbreviations, again in no particular order.
13647END
21a1aff7
KW
13648
13649 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
13650 # precedence before 6.1, so should be first in the file; the other names
13651 # have precedence starting in 6.1,
13652 my $before_or_after = ($v_version lt v6.1.0)
13653 ? $MULTIPLE_BEFORE
13654 : $MULTIPLE_AFTER;
b8ba2307 13655
4cc560b2
KW
13656 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
13657 my $code_point = $range->start;
13658 my $unicode_1_value = $range->value;
13659 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
898b2fa7 13660
4cc560b2
KW
13661 if ($code_point != $range->end) {
13662 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
13663 }
898b2fa7 13664
4cc560b2
KW
13665 # To handle EBCDIC, we don't hard code in the code points of the
13666 # controls; instead realizing that all of them are below 256.
13667 last if $code_point > 255;
898b2fa7 13668
4cc560b2
KW
13669 # We only add in the controls.
13670 next if $gc->value_of($code_point) ne 'Cc';
898b2fa7 13671
fe3193b5
KW
13672 # We reject this Unicode1 name for later Perls, as it is used for
13673 # another code point
13674 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
13675
4cc560b2
KW
13676 # This won't add an exact duplicate.
13677 $perl_charname->add_duplicate($code_point, $unicode_1_value,
13678 Replace => $before_or_after);
13679 }
898b2fa7 13680
f447a8a2
KW
13681 # But in this version only, the ALERT has precedence over BELL, the
13682 # Unicode_1_Name that would otherwise have precedence.
13683 if ($v_version eq v6.0.0) {
13684 $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
13685 }
13686
4cc560b2
KW
13687 # Now that have everything added, add in abbreviations after
13688 # everything else.
13689 foreach my $value (keys %abbreviations) {
13690 $perl_charname->add_duplicate($abbreviations{$value}, $value,
13691 Replace => $MULTIPLE_AFTER);
99870f4d 13692 }
7620cb10 13693
99870f4d
KW
13694 my $comment;
13695 if (@composition <= 2) { # Always at least 2
13696 $comment = join " and ", @composition;
13697 }
13698 else {
13699 $comment = join ", ", @composition[0 .. scalar @composition - 2];
13700 $comment .= ", and $composition[-1]";
13701 }
13702
99870f4d
KW
13703 $perl_charname->add_comment(join_lines( <<END
13704This file is for charnames.pm. It is the union of the $comment properties.
7620cb10
KW
13705Unicode_1_Name entries are used only for nameless code points in the Name
13706property.
99870f4d 13707$alias_sentence
a03f0b9f
KW
13708This file doesn't include the algorithmically determinable names. For those,
13709use 'unicore/Name.pm'
13710END
13711 ));
13712 property_ref('Name')->add_comment(join_lines( <<END
13713This file doesn't include the algorithmically determinable names. For those,
13714use 'unicore/Name.pm'
99870f4d
KW
13715END
13716 ));
13717
99870f4d
KW
13718 # Construct the Present_In property from the Age property.
13719 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
13720 my $default_map = $age->default_map;
13721 my $in = Property->new('In',
13722 Default_Map => $default_map,
13723 Full_Name => "Present_In",
99870f4d
KW
13724 Perl_Extension => 1,
13725 Type => $ENUM,
13726 Initialize => $age,
13727 );
13728 $in->add_comment(join_lines(<<END
c12f2655 13729THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
7881e901 13730same as for $age, and not for what $in really means. This is because anything
99870f4d
KW
13731defined in a given release should have multiple values: that release and all
13732higher ones. But only one value per code point can be represented in a table
13733like this.
13734END
13735 ));
13736
13737 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
13738 # lowest numbered (earliest) come first, with the non-numeric one
13739 # last.
13740 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
13741 ? 1
13742 : ($b->name !~ /^[\d.]*$/)
13743 ? -1
13744 : $a->name <=> $b->name
13745 } $age->tables;
13746
13747 # The Present_In property is the cumulative age properties. The first
13748 # one hence is identical to the first age one.
13749 my $previous_in = $in->add_match_table($first_age->name);
13750 $previous_in->set_equivalent_to($first_age, Related => 1);
13751
13752 my $description_start = "Code point's usage introduced in version ";
13753 $first_age->add_description($description_start . $first_age->name);
13754
98dc9551 13755 # To construct the accumulated values, for each of the age tables
99870f4d
KW
13756 # starting with the 2nd earliest, merge the earliest with it, to get
13757 # all those code points existing in the 2nd earliest. Repeat merging
13758 # the new 2nd earliest with the 3rd earliest to get all those existing
13759 # in the 3rd earliest, and so on.
13760 foreach my $current_age (@rest_ages) {
13761 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
13762
13763 my $current_in = $in->add_match_table(
13764 $current_age->name,
13765 Initialize => $current_age + $previous_in,
13766 Description => $description_start
13767 . $current_age->name
13768 . ' or earlier',
13769 );
13770 $previous_in = $current_in;
13771
13772 # Add clarifying material for the corresponding age file. This is
13773 # in part because of the confusing and contradictory information
13774 # given in the Standard's documentation itself, as of 5.2.
13775 $current_age->add_description(
13776 "Code point's usage was introduced in version "
13777 . $current_age->name);
13778 $current_age->add_note("See also $in");
13779
13780 }
13781
13782 # And finally the code points whose usages have yet to be decided are
13783 # the same in both properties. Note that permanently unassigned code
13784 # points actually have their usage assigned (as being permanently
13785 # unassigned), so that these tables are not the same as gc=cn.
13786 my $unassigned = $in->add_match_table($default_map);
13787 my $age_default = $age->table($default_map);
13788 $age_default->add_description(<<END
13789Code point's usage has not been assigned in any Unicode release thus far.
13790END
13791 );
13792 $unassigned->set_equivalent_to($age_default, Related => 1);
13793 }
13794
674b7f6d
KW
13795 # See L<perlfunc/quotemeta>
13796 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
13797 Perl_Extension => 1,
13798 Fate => $INTERNAL_ONLY,
13799
13800 # Initialize to what's common in
13801 # all Unicode releases.
13802 Initialize =>
13803 $Space
13804 + $gc->table('Control')
13805 );
13806
13807 # In early releases without the proper Unicode properties, just set to \W.
13808 if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
13809 || ! defined (my $patws = property_ref('Pattern_White_Space'))
13810 || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
13811 {
13812 $quotemeta += ~ $Word;
13813 }
13814 else {
13815 $quotemeta += $patsyn->table('Y')
13816 + $patws->table('Y')
13817 + $di->table('Y')
13818 + ((~ $Word) & $ASCII);
13819 }
99870f4d
KW
13820
13821 # Finished creating all the perl properties. All non-internal non-string
13822 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
13823 # an underscore.) These do not get a separate entry in the pod file
13824 foreach my $table ($perl->tables) {
13825 foreach my $alias ($table->aliases) {
13826 next if $alias->name =~ /^_/;
13827 $table->add_alias('Is_' . $alias->name,
33e96e72 13828 Re_Pod_Entry => 0,
fd1e3e84 13829 UCD => 0,
99870f4d 13830 Status => $alias->status,
0eac1e20 13831 OK_as_Filename => 0);
99870f4d
KW
13832 }
13833 }
13834
c4019d52
KW
13835 # Here done with all the basic stuff. Ready to populate the information
13836 # about each character if annotating them.
558712cf 13837 if ($annotate) {
c4019d52
KW
13838
13839 # See comments at its declaration
13840 $annotate_ranges = Range_Map->new;
13841
13842 # This separates out the non-characters from the other unassigneds, so
13843 # can give different annotations for each.
13844 $unassigned_sans_noncharacters = Range_List->new(
1d025d66
KW
13845 Initialize => $gc->table('Unassigned'));
13846 if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
13847 $unassigned_sans_noncharacters &= $nonchars->table('N');
13848 }
c4019d52 13849
6189eadc 13850 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
c4019d52
KW
13851 $i = populate_char_info($i); # Note sets $i so may cause skips
13852 }
13853 }
13854
99870f4d
KW
13855 return;
13856}
13857
13858sub add_perl_synonyms() {
13859 # A number of Unicode tables have Perl synonyms that are expressed in
13860 # the single-form, \p{name}. These are:
13861 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
13862 # \p{Is_Name} as synonyms
13863 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
13864 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
13865 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
13866 # conflict, \p{Value} and \p{Is_Value} as well
13867 #
13868 # This routine generates these synonyms, warning of any unexpected
13869 # conflicts.
13870
13871 # Construct the list of tables to get synonyms for. Start with all the
13872 # binary and the General_Category ones.
06f26c45
KW
13873 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
13874 property_ref('*');
99870f4d
KW
13875 push @tables, $gc->tables;
13876
13877 # If the version of Unicode includes the Script property, add its tables
359523e2 13878 push @tables, $script->tables if defined $script;
99870f4d
KW
13879
13880 # The Block tables are kept separate because they are treated differently.
13881 # And the earliest versions of Unicode didn't include them, so add only if
13882 # there are some.
13883 my @blocks;
13884 push @blocks, $block->tables if defined $block;
13885
13886 # Here, have the lists of tables constructed. Process blocks last so that
13887 # if there are name collisions with them, blocks have lowest priority.
13888 # Should there ever be other collisions, manual intervention would be
13889 # required. See the comments at the beginning of the program for a
13890 # possible way to handle those semi-automatically.
13891 foreach my $table (@tables, @blocks) {
13892
13893 # For non-binary properties, the synonym is just the name of the
13894 # table, like Greek, but for binary properties the synonym is the name
13895 # of the property, and means the code points in its 'Y' table.
13896 my $nominal = $table;
13897 my $nominal_property = $nominal->property;
13898 my $actual;
13899 if (! $nominal->isa('Property')) {
13900 $actual = $table;
13901 }
13902 else {
13903
13904 # Here is a binary property. Use the 'Y' table. Verify that is
13905 # there
13906 my $yes = $nominal->table('Y');
13907 unless (defined $yes) { # Must be defined, but is permissible to
13908 # be empty.
13909 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
13910 next;
13911 }
13912 $actual = $yes;
13913 }
13914
13915 foreach my $alias ($nominal->aliases) {
13916
13917 # Attempt to create a table in the perl directory for the
13918 # candidate table, using whatever aliases in it that don't
13919 # conflict. Also add non-conflicting aliases for all these
13920 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
13921 PREFIX:
13922 foreach my $prefix ("", 'Is_', 'In_') {
13923
13924 # Only Block properties can have added 'In_' aliases.
13925 next if $prefix eq 'In_' and $nominal_property != $block;
13926
13927 my $proposed_name = $prefix . $alias->name;
13928
13929 # No Is_Is, In_In, nor combinations thereof
13930 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
13931 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
13932
13933 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
13934
13935 # Get a reference to any existing table in the perl
13936 # directory with the desired name.
13937 my $pre_existing = $perl->table($proposed_name);
13938
13939 if (! defined $pre_existing) {
13940
13941 # No name collision, so ok to add the perl synonym.
13942
33e96e72 13943 my $make_re_pod_entry;
0eac1e20 13944 my $ok_as_filename;
4cd1260a 13945 my $status = $alias->status;
99870f4d
KW
13946 if ($nominal_property == $block) {
13947
13948 # For block properties, the 'In' form is preferred for
13949 # external use; the pod file contains wild cards for
13950 # this and the 'Is' form so no entries for those; and
13951 # we don't want people using the name without the
13952 # 'In', so discourage that.
13953 if ($prefix eq "") {
33e96e72 13954 $make_re_pod_entry = 1;
99870f4d 13955 $status = $status || $DISCOURAGED;
0eac1e20 13956 $ok_as_filename = 0;
99870f4d
KW
13957 }
13958 elsif ($prefix eq 'In_') {
33e96e72 13959 $make_re_pod_entry = 0;
99870f4d 13960 $status = $status || $NORMAL;
0eac1e20 13961 $ok_as_filename = 1;
99870f4d
KW
13962 }
13963 else {
33e96e72 13964 $make_re_pod_entry = 0;
99870f4d 13965 $status = $status || $DISCOURAGED;
0eac1e20 13966 $ok_as_filename = 0;
99870f4d
KW
13967 }
13968 }
13969 elsif ($prefix ne "") {
13970
13971 # The 'Is' prefix is handled in the pod by a wild
13972 # card, and we won't use it for an external name
33e96e72 13973 $make_re_pod_entry = 0;
99870f4d 13974 $status = $status || $NORMAL;
0eac1e20 13975 $ok_as_filename = 0;
99870f4d
KW
13976 }
13977 else {
13978
13979 # Here, is an empty prefix, non block. This gets its
13980 # own pod entry and can be used for an external name.
33e96e72 13981 $make_re_pod_entry = 1;
99870f4d 13982 $status = $status || $NORMAL;
0eac1e20 13983 $ok_as_filename = 1;
99870f4d
KW
13984 }
13985
13986 # Here, there isn't a perl pre-existing table with the
13987 # name. Look through the list of equivalents of this
13988 # table to see if one is a perl table.
13989 foreach my $equivalent ($actual->leader->equivalents) {
13990 next if $equivalent->property != $perl;
13991
13992 # Here, have found a table for $perl. Add this alias
13993 # to it, and are done with this prefix.
13994 $equivalent->add_alias($proposed_name,
33e96e72 13995 Re_Pod_Entry => $make_re_pod_entry,
fd1e3e84
KW
13996
13997 # Currently don't output these in the
13998 # ucd pod, as are strongly discouraged
13999 # from being used
14000 UCD => 0,
14001
99870f4d 14002 Status => $status,
0eac1e20 14003 OK_as_Filename => $ok_as_filename);
99870f4d
KW
14004 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
14005 next PREFIX;
14006 }
14007
14008 # Here, $perl doesn't already have a table that is a
14009 # synonym for this property, add one.
14010 my $added_table = $perl->add_match_table($proposed_name,
33e96e72 14011 Re_Pod_Entry => $make_re_pod_entry,
fd1e3e84
KW
14012
14013 # See UCD comment just above
14014 UCD => 0,
14015
99870f4d 14016 Status => $status,
0eac1e20 14017 OK_as_Filename => $ok_as_filename);
99870f4d
KW
14018 # And it will be related to the actual table, since it is
14019 # based on it.
14020 $added_table->set_equivalent_to($actual, Related => 1);
14021 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
14022 next;
14023 } # End of no pre-existing.
14024
14025 # Here, there is a pre-existing table that has the proposed
14026 # name. We could be in trouble, but not if this is just a
14027 # synonym for another table that we have already made a child
14028 # of the pre-existing one.
6505c6e2 14029 if ($pre_existing->is_set_equivalent_to($actual)) {
99870f4d
KW
14030 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
14031 $pre_existing->add_alias($proposed_name);
14032 next;
14033 }
14034
14035 # Here, there is a name collision, but it still could be ok if
14036 # the tables match the identical set of code points, in which
14037 # case, we can combine the names. Compare each table's code
14038 # point list to see if they are identical.
14039 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
14040 if ($pre_existing->matches_identically_to($actual)) {
14041
14042 # Here, they do match identically. Not a real conflict.
14043 # Make the perl version a child of the Unicode one, except
14044 # in the non-obvious case of where the perl name is
14045 # already a synonym of another Unicode property. (This is
14046 # excluded by the test for it being its own parent.) The
14047 # reason for this exclusion is that then the two Unicode
14048 # properties become related; and we don't really know if
14049 # they are or not. We generate documentation based on
14050 # relatedness, and this would be misleading. Code
14051 # later executed in the process will cause the tables to
14052 # be represented by a single file anyway, without making
14053 # it look in the pod like they are necessarily related.
14054 if ($pre_existing->parent == $pre_existing
14055 && ($pre_existing->property == $perl
14056 || $actual->property == $perl))
14057 {
14058 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14059 $pre_existing->set_equivalent_to($actual, Related => 1);
14060 }
14061 elsif (main::DEBUG && $to_trace) {
14062 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14063 trace $pre_existing->parent;
14064 }
14065 next PREFIX;
14066 }
14067
14068 # Here they didn't match identically, there is a real conflict
14069 # between our new name and a pre-existing property.
14070 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14071 $pre_existing->add_conflicting($nominal->full_name,
14072 'p',
14073 $actual);
14074
14075 # Don't output a warning for aliases for the block
14076 # properties (unless they start with 'In_') as it is
14077 # expected that there will be conflicts and the block
14078 # form loses.
14079 if ($verbosity >= $NORMAL_VERBOSITY
14080 && ($actual->property != $block || $prefix eq 'In_'))
14081 {
14082 print simple_fold(join_lines(<<END
90b20726
KW
14083There is already an alias named $proposed_name (from $pre_existing),
14084so not creating this alias for $actual
99870f4d
KW
14085END
14086 ), "", 4);
14087 }
14088
14089 # Keep track for documentation purposes.
14090 $has_In_conflicts++ if $prefix eq 'In_';
14091 $has_Is_conflicts++ if $prefix eq 'Is_';
14092 }
14093 }
14094 }
14095
14096 # There are some properties which have No and Yes (and N and Y) as
14097 # property values, but aren't binary, and could possibly be confused with
14098 # binary ones. So create caveats for them. There are tables that are
14099 # named 'No', and tables that are named 'N', but confusion is not likely
14100 # unless they are the same table. For example, N meaning Number or
14101 # Neutral is not likely to cause confusion, so don't add caveats to things
14102 # like them.
06f26c45
KW
14103 foreach my $property (grep { $_->type != $BINARY
14104 && $_->type != $FORCED_BINARY }
14105 property_ref('*'))
14106 {
99870f4d
KW
14107 my $yes = $property->table('Yes');
14108 if (defined $yes) {
14109 my $y = $property->table('Y');
14110 if (defined $y && $yes == $y) {
14111 foreach my $alias ($property->aliases) {
14112 $yes->add_conflicting($alias->name);
14113 }
14114 }
14115 }
14116 my $no = $property->table('No');
14117 if (defined $no) {
14118 my $n = $property->table('N');
14119 if (defined $n && $no == $n) {
14120 foreach my $alias ($property->aliases) {
14121 $no->add_conflicting($alias->name, 'P');
14122 }
14123 }
14124 }
14125 }
14126
14127 return;
14128}
14129
14130sub register_file_for_name($$$) {
14131 # Given info about a table and a datafile that it should be associated
98dc9551 14132 # with, register that association
99870f4d
KW
14133
14134 my $table = shift;
14135 my $directory_ref = shift; # Array of the directory path for the file
e6ebc4c0 14136 my $file = shift; # The file name in the final directory.
99870f4d
KW
14137 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14138
14139 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14140
14141 if ($table->isa('Property')) {
14142 $table->set_file_path(@$directory_ref, $file);
48cf9da9 14143 push @map_properties, $table;
315bfd4e
KW
14144
14145 # No swash means don't do the rest of this.
14146 return if $table->fate != $ORDINARY;
14147
14148 # Get the path to the file
14149 my @path = $table->file_path;
14150
14151 # Use just the file name if no subdirectory.
14152 shift @path if $path[0] eq File::Spec->curdir();
14153
14154 my $file = join '/', @path;
14155
14156 # Create a hash entry for utf8_heavy to get the file that stores this
14157 # property's map table
14158 foreach my $alias ($table->aliases) {
14159 my $name = $alias->name;
14160 $loose_property_to_file_of{standardize($name)} = $file;
14161 }
14162
89cf10cc
KW
14163 # And a way for utf8_heavy to find the proper key in the SwashInfo
14164 # hash for this property.
14165 $file_to_swash_name{$file} = "To" . $table->swash_name;
99870f4d
KW
14166 return;
14167 }
14168
14169 # Do all of the work for all equivalent tables when called with the leader
14170 # table, so skip if isn't the leader.
14171 return if $table->leader != $table;
14172
a92d5c2e
KW
14173 # If this is a complement of another file, use that other file instead,
14174 # with a ! prepended to it.
14175 my $complement;
14176 if (($complement = $table->complement) != 0) {
14177 my @directories = $complement->file_path;
14178
14179 # This assumes that the 0th element is something like 'lib',
14180 # the 1th element the property name (in its own directory), like
14181 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
14182 # appended to it later.
14183 $directories[1] =~ s/^/!/;
14184 $file = pop @directories;
14185 $directory_ref =\@directories;
14186 }
14187
99870f4d
KW
14188 # Join all the file path components together, using slashes.
14189 my $full_filename = join('/', @$directory_ref, $file);
14190
14191 # All go in the same subdirectory of unicore
14192 if ($directory_ref->[0] ne $matches_directory) {
14193 Carp::my_carp("Unexpected directory in "
14194 . join('/', @{$directory_ref}, $file));
14195 }
14196
14197 # For this table and all its equivalents ...
14198 foreach my $table ($table, $table->equivalents) {
14199
14200 # Associate it with its file internally. Don't include the
14201 # $matches_directory first component
14202 $table->set_file_path(@$directory_ref, $file);
c15fda25
KW
14203
14204 # No swash means don't do the rest of this.
14205 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
14206
99870f4d
KW
14207 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14208
14209 my $property = $table->property;
ae51efca
KW
14210 my $property_name = ($property == $perl)
14211 ? "" # 'perl' is never explicitly stated
14212 : standardize($property->name) . '=';
99870f4d 14213
c15fda25
KW
14214 my $is_default = 0; # Is this table the default one for the property?
14215
14216 # To calculate $is_default, we find if this table is the same as the
14217 # default one for the property. But this is complicated by the
14218 # possibility that there is a master table for this one, and the
14219 # information is stored there instead of here.
9e4a1e86
KW
14220 my $parent = $table->parent;
14221 my $leader_prop = $parent->property;
c15fda25
KW
14222 my $default_map = $leader_prop->default_map;
14223 if (defined $default_map) {
14224 my $default_table = $leader_prop->table($default_map);
14225 $is_default = 1 if defined $default_table && $parent == $default_table;
14226 }
9e4a1e86
KW
14227
14228 # Calculate the loose name for this table. Mostly it's just its name,
14229 # standardized. But in the case of Perl tables that are single-form
14230 # equivalents to Unicode properties, it is the latter's name.
14231 my $loose_table_name =
14232 ($property != $perl || $leader_prop == $perl)
14233 ? standardize($table->name)
14234 : standardize($parent->name);
14235
99870f4d
KW
14236 my $deprecated = ($table->status eq $DEPRECATED)
14237 ? $table->status_info
14238 : "";
d867ccfb 14239 my $caseless_equivalent = $table->caseless_equivalent;
99870f4d
KW
14240
14241 # And for each of the table's aliases... This inner loop eventually
14242 # goes through all aliases in the UCD that we generate regex match
14243 # files for
14244 foreach my $alias ($table->aliases) {
c85f591a 14245 my $standard = utf8_heavy_name($table, $alias);
99870f4d
KW
14246
14247 # Generate an entry in either the loose or strict hashes, which
14248 # will translate the property and alias names combination into the
14249 # file where the table for them is stored.
99870f4d 14250 if ($alias->loose_match) {
99870f4d
KW
14251 if (exists $loose_to_file_of{$standard}) {
14252 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14253 }
14254 else {
14255 $loose_to_file_of{$standard} = $sub_filename;
14256 }
14257 }
14258 else {
99870f4d
KW
14259 if (exists $stricter_to_file_of{$standard}) {
14260 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14261 }
14262 else {
14263 $stricter_to_file_of{$standard} = $sub_filename;
14264
14265 # Tightly coupled with how utf8_heavy.pl works, for a
14266 # floating point number that is a whole number, get rid of
14267 # the trailing decimal point and 0's, so that utf8_heavy
14268 # will work. Also note that this assumes that such a
14269 # number is matched strictly; so if that were to change,
14270 # this would be wrong.
c85f591a 14271 if ((my $integer_name = $alias->name)
99870f4d
KW
14272 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14273 {
ae51efca 14274 $stricter_to_file_of{$property_name . $integer_name}
c12f2655 14275 = $sub_filename;
99870f4d
KW
14276 }
14277 }
14278 }
14279
9e4a1e86
KW
14280 # For Unicode::UCD, create a mapping of the prop=value to the
14281 # canonical =value for that property.
14282 if ($standard =~ /=/) {
14283
14284 # This could happen if a strict name mapped into an existing
14285 # loose name. In that event, the strict names would have to
14286 # be moved to a new hash.
14287 if (exists($loose_to_standard_value{$standard})) {
14288 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
14289 }
14290 $loose_to_standard_value{$standard} = $loose_table_name;
14291 }
14292
99870f4d 14293 # Keep a list of the deprecated properties and their filenames
a92d5c2e 14294 if ($deprecated && $complement == 0) {
99870f4d
KW
14295 $utf8::why_deprecated{$sub_filename} = $deprecated;
14296 }
d867ccfb
KW
14297
14298 # And a substitute table, if any, for case-insensitive matching
14299 if ($caseless_equivalent != 0) {
14300 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14301 }
c15fda25
KW
14302
14303 # Add to defaults list if the table this alias belongs to is the
14304 # default one
14305 $loose_defaults{$standard} = 1 if $is_default;
99870f4d
KW
14306 }
14307 }
14308
14309 return;
14310}
14311
14312{ # Closure
14313 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
14314 # conflicts
14315 my %full_dir_name_of; # Full length names of directories used.
14316
14317 sub construct_filename($$$) {
14318 # Return a file name for a table, based on the table name, but perhaps
14319 # changed to get rid of non-portable characters in it, and to make
14320 # sure that it is unique on a file system that allows the names before
14321 # any period to be at most 8 characters (DOS). While we're at it
14322 # check and complain if there are any directory conflicts.
14323
14324 my $name = shift; # The name to start with
14325 my $mutable = shift; # Boolean: can it be changed? If no, but
14326 # yet it must be to work properly, a warning
14327 # is given
14328 my $directories_ref = shift; # A reference to an array containing the
14329 # path to the file, with each element one path
14330 # component. This is used because the same
14331 # name can be used in different directories.
14332 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14333
14334 my $warn = ! defined wantarray; # If true, then if the name is
14335 # changed, a warning is issued as well.
14336
14337 if (! defined $name) {
14338 Carp::my_carp("Undefined name in directory "
14339 . File::Spec->join(@$directories_ref)
14340 . ". '_' used");
14341 return '_';
14342 }
14343
14344 # Make sure that no directory names conflict with each other. Look at
14345 # each directory in the input file's path. If it is already in use,
14346 # assume it is correct, and is merely being re-used, but if we
14347 # truncate it to 8 characters, and find that there are two directories
14348 # that are the same for the first 8 characters, but differ after that,
14349 # then that is a problem.
14350 foreach my $directory (@$directories_ref) {
14351 my $short_dir = substr($directory, 0, 8);
14352 if (defined $full_dir_name_of{$short_dir}) {
14353 next if $full_dir_name_of{$short_dir} eq $directory;
14354 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
14355 }
14356 else {
14357 $full_dir_name_of{$short_dir} = $directory;
14358 }
14359 }
14360
14361 my $path = join '/', @$directories_ref;
14362 $path .= '/' if $path;
14363
14364 # Remove interior underscores.
14365 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
14366
14367 # Change any non-word character into an underscore, and truncate to 8.
14368 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
14369 substr($filename, 8) = "" if length($filename) > 8;
14370
14371 # Make sure the basename doesn't conflict with something we
14372 # might have already written. If we have, say,
14373 # InGreekExtended1
14374 # InGreekExtended2
14375 # they become
14376 # InGreekE
14377 # InGreek2
14378 my $warned = 0;
14379 while (my $num = $base_names{$path}{lc $filename}++) {
14380 $num++; # so basenames with numbers start with '2', which
14381 # just looks more natural.
14382
14383 # Want to append $num, but if it'll make the basename longer
14384 # than 8 characters, pre-truncate $filename so that the result
14385 # is acceptable.
14386 my $delta = length($filename) + length($num) - 8;
14387 if ($delta > 0) {
14388 substr($filename, -$delta) = $num;
14389 }
14390 else {
14391 $filename .= $num;
14392 }
14393 if ($warn && ! $warned) {
14394 $warned = 1;
14395 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
14396 }
14397 }
14398
14399 return $filename if $mutable;
14400
14401 # If not changeable, must return the input name, but warn if needed to
14402 # change it beyond shortening it.
14403 if ($name ne $filename
14404 && substr($name, 0, length($filename)) ne $filename) {
14405 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
14406 }
14407 return $name;
14408 }
14409}
14410
14411# The pod file contains a very large table. Many of the lines in that table
14412# would exceed a typical output window's size, and so need to be wrapped with
14413# a hanging indent to make them look good. The pod language is really
14414# insufficient here. There is no general construct to do that in pod, so it
14415# is done here by beginning each such line with a space to cause the result to
14416# be output without formatting, and doing all the formatting here. This leads
14417# to the result that if the eventual display window is too narrow it won't
14418# look good, and if the window is too wide, no advantage is taken of that
14419# extra width. A further complication is that the output may be indented by
14420# the formatter so that there is less space than expected. What I (khw) have
14421# done is to assume that that indent is a particular number of spaces based on
14422# what it is in my Linux system; people can always resize their windows if
14423# necessary, but this is obviously less than desirable, but the best that can
14424# be expected.
14425my $automatic_pod_indent = 8;
14426
14427# Try to format so that uses fewest lines, but few long left column entries
14428# slide into the right column. An experiment on 5.1 data yielded the
14429# following percentages that didn't cut into the other side along with the
14430# associated first-column widths
14431# 69% = 24
14432# 80% not too bad except for a few blocks
14433# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
14434# 95% = 37;
14435my $indent_info_column = 27; # 75% of lines didn't have overlap
14436
14437my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
14438 # The 3 is because of:
14439 # 1 for the leading space to tell the pod formatter to
14440 # output as-is
14441 # 1 for the flag
14442 # 1 for the space between the flag and the main data
14443
14444sub format_pod_line ($$$;$$) {
14445 # Take a pod line and return it, formatted properly
14446
14447 my $first_column_width = shift;
14448 my $entry = shift; # Contents of left column
14449 my $info = shift; # Contents of right column
14450
14451 my $status = shift || ""; # Any flag
14452
14453 my $loose_match = shift; # Boolean.
14454 $loose_match = 1 unless defined $loose_match;
14455
14456 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14457
14458 my $flags = "";
14459 $flags .= $STRICTER if ! $loose_match;
14460
14461 $flags .= $status if $status;
14462
14463 # There is a blank in the left column to cause the pod formatter to
14464 # output the line as-is.
14465 return sprintf " %-*s%-*s %s\n",
14466 # The first * in the format is replaced by this, the -1 is
14467 # to account for the leading blank. There isn't a
14468 # hard-coded blank after this to separate the flags from
14469 # the rest of the line, so that in the unlikely event that
14470 # multiple flags are shown on the same line, they both
14471 # will get displayed at the expense of that separation,
14472 # but since they are left justified, a blank will be
14473 # inserted in the normal case.
14474 $FILLER - 1,
14475 $flags,
14476
14477 # The other * in the format is replaced by this number to
14478 # cause the first main column to right fill with blanks.
14479 # The -1 is for the guaranteed blank following it.
14480 $first_column_width - $FILLER - 1,
14481 $entry,
14482 $info;
14483}
14484
14485my @zero_match_tables; # List of tables that have no matches in this release
14486
d1476e4d 14487sub make_re_pod_entries($) {
99870f4d
KW
14488 # This generates the entries for the pod file for a given table.
14489 # Also done at this time are any children tables. The output looks like:
14490 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
14491
14492 my $input_table = shift; # Table the entry is for
14493 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14494
14495 # Generate parent and all its children at the same time.
14496 return if $input_table->parent != $input_table;
14497
14498 my $property = $input_table->property;
14499 my $type = $property->type;
14500 my $full_name = $property->full_name;
14501
14502 my $count = $input_table->count;
14503 my $string_count = clarify_number($count);
14504 my $status = $input_table->status;
14505 my $status_info = $input_table->status_info;
56ca34ca 14506 my $caseless_equivalent = $input_table->caseless_equivalent;
99870f4d 14507
f1738f8f
KW
14508 # Don't mention a placeholder equivalent as it isn't to be listed in the
14509 # pod
14510 $caseless_equivalent = 0 if $caseless_equivalent != 0
14511 && $caseless_equivalent->fate > $ORDINARY;
14512
99870f4d
KW
14513 my $entry_for_first_table; # The entry for the first table output.
14514 # Almost certainly, it is the parent.
14515
14516 # For each related table (including itself), we will generate a pod entry
14517 # for each name each table goes by
14518 foreach my $table ($input_table, $input_table->children) {
14519
d4da3f74
KW
14520 # utf8_heavy.pl cannot deal with null string property values, so skip
14521 # any tables that have no non-null names.
14522 next if ! grep { $_->name ne "" } $table->aliases;
99870f4d
KW
14523
14524 # First, gather all the info that applies to this table as a whole.
14525
c24fade0
KW
14526 push @zero_match_tables, $table if $count == 0
14527 # Don't mention special tables
14528 # as being zero length
14529 && $table->fate == $ORDINARY;
99870f4d
KW
14530
14531 my $table_property = $table->property;
14532
14533 # The short name has all the underscores removed, while the full name
14534 # retains them. Later, we decide whether to output a short synonym
14535 # for the full one, we need to compare apples to apples, so we use the
14536 # short name's length including underscores.
14537 my $table_property_short_name_length;
14538 my $table_property_short_name
14539 = $table_property->short_name(\$table_property_short_name_length);
14540 my $table_property_full_name = $table_property->full_name;
14541
14542 # Get how much savings there is in the short name over the full one
14543 # (delta will always be <= 0)
14544 my $table_property_short_delta = $table_property_short_name_length
14545 - length($table_property_full_name);
14546 my @table_description = $table->description;
14547 my @table_note = $table->note;
14548
14549 # Generate an entry for each alias in this table.
14550 my $entry_for_first_alias; # saves the first one encountered.
14551 foreach my $alias ($table->aliases) {
14552
14553 # Skip if not to go in pod.
33e96e72 14554 next unless $alias->make_re_pod_entry;
99870f4d
KW
14555
14556 # Start gathering all the components for the entry
14557 my $name = $alias->name;
14558
d4da3f74
KW
14559 # Skip if name is empty, as can't be accessed by regexes.
14560 next if $name eq "";
14561
99870f4d
KW
14562 my $entry; # Holds the left column, may include extras
14563 my $entry_ref; # To refer to the left column's contents from
14564 # another entry; has no extras
14565
14566 # First the left column of the pod entry. Tables for the $perl
14567 # property always use the single form.
14568 if ($table_property == $perl) {
14569 $entry = "\\p{$name}";
14570 $entry_ref = "\\p{$name}";
14571 }
14572 else { # Compound form.
14573
14574 # Only generate one entry for all the aliases that mean true
14575 # or false in binary properties. Append a '*' to indicate
14576 # some are missing. (The heading comment notes this.)
60e471b3 14577 my $rhs;
99870f4d
KW
14578 if ($type == $BINARY) {
14579 next if $name ne 'N' && $name ne 'Y';
60e471b3 14580 $rhs = "$name*";
99870f4d 14581 }
06f26c45 14582 elsif ($type != $FORCED_BINARY) {
60e471b3 14583 $rhs = $name;
99870f4d 14584 }
06f26c45
KW
14585 else {
14586
14587 # Forced binary properties require special handling. It
14588 # has two sets of tables, one set is true/false; and the
14589 # other set is everything else. Entries are generated for
14590 # each set. Use the Bidi_Mirrored property (which appears
14591 # in all Unicode versions) to get a list of the aliases
14592 # for the true/false tables. Of these, only output the N
14593 # and Y ones, the same as, a regular binary property. And
14594 # output all the rest, same as a non-binary property.
14595 my $bm = property_ref("Bidi_Mirrored");
14596 if ($name eq 'N' || $name eq 'Y') {
14597 $rhs = "$name*";
14598 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
14599 $bm->table("N")->aliases)
14600 {
14601 next;
14602 }
14603 else {
14604 $rhs = $name;
14605 }
14606 }
99870f4d
KW
14607
14608 # Colon-space is used to give a little more space to be easier
14609 # to read;
14610 $entry = "\\p{"
14611 . $table_property_full_name
60e471b3 14612 . ": $rhs}";
99870f4d
KW
14613
14614 # But for the reference to this entry, which will go in the
14615 # right column, where space is at a premium, use equals
14616 # without a space
14617 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
14618 }
14619
14620 # Then the right (info) column. This is stored as components of
14621 # an array for the moment, then joined into a string later. For
14622 # non-internal only properties, begin the info with the entry for
14623 # the first table we encountered (if any), as things are ordered
14624 # so that that one is the most descriptive. This leads to the
14625 # info column of an entry being a more descriptive version of the
14626 # name column
14627 my @info;
14628 if ($name =~ /^_/) {
14629 push @info,
14630 '(For internal use by Perl, not necessarily stable)';
14631 }
14632 elsif ($entry_for_first_alias) {
14633 push @info, $entry_for_first_alias;
14634 }
14635
14636 # If this entry is equivalent to another, add that to the info,
14637 # using the first such table we encountered
14638 if ($entry_for_first_table) {
14639 if (@info) {
14640 push @info, "(= $entry_for_first_table)";
14641 }
14642 else {
14643 push @info, $entry_for_first_table;
14644 }
14645 }
14646
14647 # If the name is a large integer, add an equivalent with an
14648 # exponent for better readability
14649 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
14650 push @info, sprintf "(= %.1e)", $name
14651 }
14652
14653 my $parenthesized = "";
14654 if (! $entry_for_first_alias) {
14655
14656 # This is the first alias for the current table. The alias
14657 # array is ordered so that this is the fullest, most
14658 # descriptive alias, so it gets the fullest info. The other
14659 # aliases are mostly merely pointers to this one, using the
14660 # information already added above.
14661
14662 # Display any status message, but only on the parent table
14663 if ($status && ! $entry_for_first_table) {
14664 push @info, $status_info;
14665 }
14666
14667 # Put out any descriptive info
14668 if (@table_description || @table_note) {
14669 push @info, join "; ", @table_description, @table_note;
14670 }
14671
14672 # Look to see if there is a shorter name we can point people
14673 # at
14674 my $standard_name = standardize($name);
14675 my $short_name;
14676 my $proposed_short = $table->short_name;
14677 if (defined $proposed_short) {
14678 my $standard_short = standardize($proposed_short);
14679
14680 # If the short name is shorter than the standard one, or
14681 # even it it's not, but the combination of it and its
14682 # short property name (as in \p{prop=short} ($perl doesn't
14683 # have this form)) saves at least two characters, then,
14684 # cause it to be listed as a shorter synonym.
14685 if (length $standard_short < length $standard_name
14686 || ($table_property != $perl
14687 && (length($standard_short)
14688 - length($standard_name)
14689 + $table_property_short_delta) # (<= 0)
14690 < -2))
14691 {
14692 $short_name = $proposed_short;
14693 if ($table_property != $perl) {
14694 $short_name = $table_property_short_name
14695 . "=$short_name";
14696 }
14697 $short_name = "\\p{$short_name}";
14698 }
14699 }
14700
14701 # And if this is a compound form name, see if there is a
14702 # single form equivalent
14703 my $single_form;
14704 if ($table_property != $perl) {
14705
14706 # Special case the binary N tables, so that will print
14707 # \P{single}, but use the Y table values to populate
c12f2655 14708 # 'single', as we haven't likewise populated the N table.
06f26c45
KW
14709 # For forced binary tables, we can't just look at the N
14710 # table, but must see if this table is equivalent to the N
14711 # one, as there are two equivalent beasts in these
14712 # properties.
99870f4d
KW
14713 my $test_table;
14714 my $p;
06f26c45
KW
14715 if ( ($type == $BINARY
14716 && $input_table == $property->table('No'))
14717 || ($type == $FORCED_BINARY
14718 && $property->table('No')->
14719 is_set_equivalent_to($input_table)))
99870f4d
KW
14720 {
14721 $test_table = $property->table('Yes');
14722 $p = 'P';
14723 }
14724 else {
14725 $test_table = $input_table;
14726 $p = 'p';
14727 }
14728
14729 # Look for a single form amongst all the children.
14730 foreach my $table ($test_table->children) {
14731 next if $table->property != $perl;
14732 my $proposed_name = $table->short_name;
14733 next if ! defined $proposed_name;
14734
14735 # Don't mention internal-only properties as a possible
14736 # single form synonym
14737 next if substr($proposed_name, 0, 1) eq '_';
14738
14739 $proposed_name = "\\$p\{$proposed_name}";
14740 if (! defined $single_form
14741 || length($proposed_name) < length $single_form)
14742 {
14743 $single_form = $proposed_name;
14744
14745 # The goal here is to find a single form; not the
14746 # shortest possible one. We've already found a
14747 # short name. So, stop at the first single form
14748 # found, which is likely to be closer to the
14749 # original.
14750 last;
14751 }
14752 }
14753 }
14754
14755 # Ouput both short and single in the same parenthesized
14756 # expression, but with only one of 'Single', 'Short' if there
14757 # are both items.
14758 if ($short_name || $single_form || $table->conflicting) {
99870f4d
KW
14759 $parenthesized .= "Short: $short_name" if $short_name;
14760 if ($short_name && $single_form) {
14761 $parenthesized .= ', ';
14762 }
14763 elsif ($single_form) {
14764 $parenthesized .= 'Single: ';
14765 }
14766 $parenthesized .= $single_form if $single_form;
14767 }
14768 }
14769
56ca34ca
KW
14770 if ($caseless_equivalent != 0) {
14771 $parenthesized .= '; ' if $parenthesized ne "";
14772 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
14773 }
14774
99870f4d
KW
14775
14776 # Warn if this property isn't the same as one that a
14777 # semi-casual user might expect. The other components of this
14778 # parenthesized structure are calculated only for the first entry
14779 # for this table, but the conflicting is deemed important enough
14780 # to go on every entry.
14781 my $conflicting = join " NOR ", $table->conflicting;
14782 if ($conflicting) {
e5228720 14783 $parenthesized .= '; ' if $parenthesized ne "";
99870f4d
KW
14784 $parenthesized .= "NOT $conflicting";
14785 }
99870f4d 14786
e5228720 14787 push @info, "($parenthesized)" if $parenthesized;
d57ccc9a 14788
0f88d393
KW
14789 if ($name =~ /_$/ && $alias->loose_match) {
14790 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
14791 }
14792
d57ccc9a
KW
14793 if ($table_property != $perl && $table->perl_extension) {
14794 push @info, '(Perl extension)';
14795 }
2cf724d4 14796 push @info, "($string_count)";
99870f4d
KW
14797
14798 # Now, we have both the entry and info so add them to the
14799 # list of all the properties.
14800 push @match_properties,
14801 format_pod_line($indent_info_column,
14802 $entry,
14803 join( " ", @info),
14804 $alias->status,
14805 $alias->loose_match);
14806
14807 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
14808 } # End of looping through the aliases for this table.
14809
14810 if (! $entry_for_first_table) {
14811 $entry_for_first_table = $entry_for_first_alias;
14812 }
14813 } # End of looping through all the related tables
14814 return;
14815}
14816
2df7880f
KW
14817sub make_ucd_table_pod_entries {
14818 my $table = shift;
14819
ee94c7d1
KW
14820 # Generate the entries for the UCD section of the pod for $table. This
14821 # also calculates if names are ambiguous, so has to be called even if the
14822 # pod is not being output
14823
14824 my $short_name = $table->name;
14825 my $standard_short_name = standardize($short_name);
14826 my $full_name = $table->full_name;
14827 my $standard_full_name = standardize($full_name);
14828
14829 my $full_info = ""; # Text of info column for full-name entries
14830 my $other_info = ""; # Text of info column for short-name entries
14831 my $short_info = ""; # Text of info column for other entries
14832 my $meaning = ""; # Synonym of this table
2df7880f
KW
14833
14834 my $property = ($table->isa('Property'))
14835 ? $table
14836 : $table->parent->property;
14837
ee94c7d1
KW
14838 my $perl_extension = $table->perl_extension;
14839
14840 # Get the more official name for for perl extensions that aren't
14841 # stand-alone properties
14842 if ($perl_extension && $property != $table) {
14843 if ($property == $perl ||$property->type == $BINARY) {
14844 $meaning = $table->complete_name;
14845 }
14846 else {
14847 $meaning = $property->full_name . "=$full_name";
14848 }
14849 }
14850
14851 # There are three types of info column. One for the short name, one for
14852 # the full name, and one for everything else. They mostly are the same,
14853 # so initialize in the same loop.
14854 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
14855 if ($perl_extension && $property != $table) {
14856
14857 # Add the synonymous name for the non-full name entries; and to
14858 # the full-name entry if it adds extra information
14859 if ($info_ref == \$other_info
14860 || ($info_ref == \$short_info
14861 && $standard_short_name ne $standard_full_name)
14862 || standardize($meaning) ne $standard_full_name
14863 ) {
14864 $$info_ref .= "$meaning.";
14865 }
14866 }
14867 elsif ($info_ref != \$full_info) {
14868
14869 # Otherwise, the non-full name columns include the full name
14870 $$info_ref .= $full_name;
14871 }
14872
14873 # And the full-name entry includes the short name, if different
14874 if ($info_ref == \$full_info
14875 && $standard_short_name ne $standard_full_name)
14876 {
14877 $full_info =~ s/\.\Z//;
14878 $full_info .= " " if $full_info;
14879 $full_info .= "(Short: $short_name)";
14880 }
14881
14882 if ($table->perl_extension) {
14883 $$info_ref =~ s/\.\Z//;
14884 $$info_ref .= ". " if $$info_ref;
14885 $$info_ref .= "(Perl extension)";
14886 }
14887 }
14888
14889 # Add any extra annotations to the full name entry
14890 foreach my $more_info ($table->description,
14891 $table->note,
14892 $table->status_info)
14893 {
14894 next unless $more_info;
14895 $full_info =~ s/\.\Z//;
14896 $full_info .= ". " if $full_info;
14897 $full_info .= $more_info;
14898 }
14899
14900 # These keep track if have created full and short name pod entries for the
14901 # property
14902 my $done_full = 0;
14903 my $done_short = 0;
14904
2df7880f
KW
14905 # Every possible name is kept track of, even those that aren't going to be
14906 # output. This way we can be sure to find the ambiguities.
14907 foreach my $alias ($table->aliases) {
14908 my $name = $alias->name;
14909 my $standard = standardize($name);
ee94c7d1
KW
14910 my $info;
14911 my $output_this = $alias->ucd;
14912
14913 # If the full and short names are the same, we want to output the full
14914 # one's entry, so it has priority.
14915 if ($standard eq $standard_full_name) {
14916 next if $done_full;
14917 $done_full = 1;
14918 $info = $full_info;
14919 }
14920 elsif ($standard eq $standard_short_name) {
14921 next if $done_short;
14922 $done_short = 1;
14923 next if $standard_short_name eq $standard_full_name;
14924 $info = $short_info;
14925 }
14926 else {
14927 $info = $other_info;
14928 }
2df7880f 14929
ee94c7d1
KW
14930 # Here, we have set up the two columns for this entry. But if an
14931 # entry already exists for this name, we have to decide which one
14932 # we're going to later output.
2df7880f
KW
14933 if (exists $ucd_pod{$standard}) {
14934
14935 # If the two entries refer to the same property, it's not going to
ee94c7d1
KW
14936 # be ambiguous. (Likely it's because the names when standardized
14937 # are the same.) But that means if they are different properties,
14938 # there is ambiguity.
2df7880f
KW
14939 if ($ucd_pod{$standard}->{'property'} != $property) {
14940
ee94c7d1
KW
14941 # Here, we have an ambiguity. This code assumes that one is
14942 # scheduled to be output and one not and that one is a perl
14943 # extension (which is not to be output) and the other isn't.
14944 # If those assumptions are wrong, things have to be rethought.
14945 if ($ucd_pod{$standard}{'output_this'} == $output_this
14946 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
14947 || $output_this == $perl_extension)
14948 {
d59563d0 14949 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
ee94c7d1
KW
14950 }
14951
14952 # We modifiy the info column of the one being output to
14953 # indicate the ambiguity. Set $which to point to that one's
14954 # info.
14955 my $which;
14956 if ($ucd_pod{$standard}{'output_this'}) {
14957 $which = \$ucd_pod{$standard}->{'info'};
14958 }
14959 else {
14960 $which = \$info;
14961 $meaning = $ucd_pod{$standard}{'meaning'};
14962 }
14963
14964 chomp $$which;
14965 $$which =~ s/\.\Z//;
14966 $$which .= "; NOT '$standard' meaning '$meaning'";
14967
2df7880f
KW
14968 $ambiguous_names{$standard} = 1;
14969 }
14970
ee94c7d1
KW
14971 # Use the non-perl-extension variant
14972 next unless $ucd_pod{$standard}{'perl_extension'};
2df7880f
KW
14973 }
14974
ee94c7d1
KW
14975 # Store enough information about this entry that we can later look for
14976 # ambiguities, and output it properly.
14977 $ucd_pod{$standard} = { 'name' => $name,
14978 'info' => $info,
14979 'meaning' => $meaning,
14980 'output_this' => $output_this,
14981 'perl_extension' => $perl_extension,
2df7880f 14982 'property' => $property,
ee94c7d1 14983 'status' => $alias->status,
2df7880f
KW
14984 };
14985 } # End of looping through all this table's aliases
14986
14987 return;
14988}
14989
99870f4d
KW
14990sub pod_alphanumeric_sort {
14991 # Sort pod entries alphanumerically.
14992
99f78760
KW
14993 # The first few character columns are filler, plus the '\p{'; and get rid
14994 # of all the trailing stuff, starting with the trailing '}', so as to sort
14995 # on just 'Name=Value'
14996 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 14997 $a =~ s/}.*//;
99f78760 14998 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
14999 $b =~ s/}.*//;
15000
99f78760
KW
15001 # Determine if the two operands are both internal only or both not.
15002 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
15003 # should be the underscore that begins internal only
15004 my $a_is_internal = (substr($a, 0, 1) eq '_');
15005 my $b_is_internal = (substr($b, 0, 1) eq '_');
15006
15007 # Sort so the internals come last in the table instead of first (which the
15008 # leading underscore would otherwise indicate).
15009 if ($a_is_internal != $b_is_internal) {
15010 return 1 if $a_is_internal;
15011 return -1
15012 }
15013
99870f4d 15014 # Determine if the two operands are numeric property values or not.
99f78760 15015 # A numeric property will look like xyz: 3. But the number
99870f4d 15016 # can begin with an optional minus sign, and may have a
99f78760 15017 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
15018 # isn't numeric, use alphabetic sort.
15019 my ($a_initial, $a_number) =
99f78760 15020 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
15021 return $a cmp $b unless defined $a_number;
15022 my ($b_initial, $b_number) =
99f78760 15023 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
15024 return $a cmp $b unless defined $b_number;
15025
15026 # Here they are both numeric, but use alphabetic sort if the
15027 # initial parts don't match
15028 return $a cmp $b if $a_initial ne $b_initial;
15029
15030 # Convert rationals to floating for the comparison.
15031 $a_number = eval $a_number if $a_number =~ qr{/};
15032 $b_number = eval $b_number if $b_number =~ qr{/};
15033
15034 return $a_number <=> $b_number;
15035}
15036
15037sub make_pod () {
15038 # Create the .pod file. This generates the various subsections and then
15039 # combines them in one big HERE document.
15040
07c070a8
KW
15041 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
15042
99870f4d
KW
15043 return unless defined $pod_directory;
15044 print "Making pod file\n" if $verbosity >= $PROGRESS;
15045
15046 my $exception_message =
15047 '(Any exceptions are individually noted beginning with the word NOT.)';
15048 my @block_warning;
15049 if (-e 'Blocks.txt') {
15050
15051 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
15052 # if the global $has_In_conflicts indicates we have them.
15053 push @match_properties, format_pod_line($indent_info_column,
15054 '\p{In_*}',
15055 '\p{Block: *}'
15056 . (($has_In_conflicts)
15057 ? " $exception_message"
15058 : ""));
15059 @block_warning = << "END";
15060
77173124
KW
15061Matches in the Block property have shortcuts that begin with "In_". For
15062example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
15063backward compatibility, if there is no conflict with another shortcut, these
15064may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
15065are numerous such conflicting shortcuts. Use of these forms for Block is
15066discouraged, and are flagged as such, not only because of the potential
15067confusion as to what is meant, but also because a later release of Unicode may
15068preempt the shortcut, and your program would no longer be correct. Use the
15069"In_" form instead to avoid this, or even more clearly, use the compound form,
15070e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
15071about this.
99870f4d
KW
15072END
15073 }
07c070a8 15074 my $text = $Is_flags_text;
99870f4d
KW
15075 $text = "$exception_message $text" if $has_Is_conflicts;
15076
15077 # And the 'Is_ line';
15078 push @match_properties, format_pod_line($indent_info_column,
15079 '\p{Is_*}',
15080 "\\p{*} $text");
15081
15082 # Sort the properties array for output. It is sorted alphabetically
15083 # except numerically for numeric properties, and only output unique lines.
15084 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15085
15086 my $formatted_properties = simple_fold(\@match_properties,
15087 "",
15088 # indent succeeding lines by two extra
15089 # which looks better
15090 $indent_info_column + 2,
15091
15092 # shorten the line length by how much
15093 # the formatter indents, so the folded
15094 # line will fit in the space
15095 # presumably available
15096 $automatic_pod_indent);
15097 # Add column headings, indented to be a little more centered, but not
15098 # exactly
15099 $formatted_properties = format_pod_line($indent_info_column,
15100 ' NAME',
15101 ' INFO')
15102 . "\n"
15103 . $formatted_properties;
15104
15105 # Generate pod documentation lines for the tables that match nothing
0090c5d1 15106 my $zero_matches = "";
99870f4d
KW
15107 if (@zero_match_tables) {
15108 @zero_match_tables = uniques(@zero_match_tables);
15109 $zero_matches = join "\n\n",
15110 map { $_ = '=item \p{' . $_->complete_name . "}" }
15111 sort { $a->complete_name cmp $b->complete_name }
c0de960f 15112 @zero_match_tables;
99870f4d
KW
15113
15114 $zero_matches = <<END;
15115
77173124 15116=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
99870f4d
KW
15117
15118Unicode has some property-value pairs that currently don't match anything.
c12f2655
KW
15119This happens generally either because they are obsolete, or they exist for
15120symmetry with other forms, but no language has yet been encoded that uses
15121them. In this version of Unicode, the following match zero code points:
99870f4d
KW
15122
15123=over 4
15124
15125$zero_matches
15126
15127=back
15128
15129END
15130 }
15131
15132 # Generate list of properties that we don't accept, grouped by the reasons
15133 # why. This is so only put out the 'why' once, and then list all the
15134 # properties that have that reason under it.
15135
15136 my %why_list; # The keys are the reasons; the values are lists of
15137 # properties that have the key as their reason
15138
15139 # For each property, add it to the list that are suppressed for its reason
15140 # The sort will cause the alphabetically first properties to be added to
15141 # each list first, so each list will be sorted.
15142 foreach my $property (sort keys %why_suppressed) {
15143 push @{$why_list{$why_suppressed{$property}}}, $property;
15144 }
15145
15146 # For each reason (sorted by the first property that has that reason)...
15147 my @bad_re_properties;
15148 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15149 keys %why_list)
15150 {
54ce19c9 15151 # Add to the output, all the properties that have that reason.
99870f4d
KW
15152 my $has_item = 0; # Flag if actually output anything.
15153 foreach my $name (@{$why_list{$why}}) {
15154
15155 # Split compound names into $property and $table components
15156 my $property = $name;
15157 my $table;
15158 if ($property =~ / (.*) = (.*) /x) {
15159 $property = $1;
15160 $table = $2;
15161 }
15162
15163 # This release of Unicode may not have a property that is
15164 # suppressed, so don't reference a non-existent one.
15165 $property = property_ref($property);
15166 next if ! defined $property;
15167
15168 # And since this list is only for match tables, don't list the
15169 # ones that don't have match tables.
15170 next if ! $property->to_create_match_tables;
15171
15172 # Find any abbreviation, and turn it into a compound name if this
15173 # is a property=value pair.
15174 my $short_name = $property->name;
15175 $short_name .= '=' . $property->table($table)->name if $table;
15176
54ce19c9
KW
15177 # Start with an empty line.
15178 push @bad_re_properties, "\n\n" unless $has_item;
15179
99870f4d
KW
15180 # And add the property as an item for the reason.
15181 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15182 $has_item = 1;
15183 }
15184
15185 # And add the reason under the list of properties, if such a list
15186 # actually got generated. Note that the header got added
15187 # unconditionally before. But pod ignores extra blank lines, so no
15188 # harm.
15189 push @bad_re_properties, "\n$why\n" if $has_item;
15190
15191 } # End of looping through each reason.
15192
54ce19c9
KW
15193 if (! @bad_re_properties) {
15194 push @bad_re_properties,
15195 "*** This installation accepts ALL non-Unihan properties ***";
15196 }
15197 else {
15198 # Add =over only if non-empty to avoid an empty =over/=back section,
15199 # which is considered bad form.
15200 unshift @bad_re_properties, "\n=over 4\n";
15201 push @bad_re_properties, "\n=back\n";
15202 }
15203
8d099389
KW
15204 # Similiarly, generate a list of files that we don't use, grouped by the
15205 # reasons why. First, create a hash whose keys are the reasons, and whose
15206 # values are anonymous arrays of all the files that share that reason.
15207 my %grouped_by_reason;
15208 foreach my $file (keys %ignored_files) {
15209 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15210 }
1fec9f60
KW
15211 foreach my $file (keys %skipped_files) {
15212 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15213 }
8d099389
KW
15214
15215 # Then, sort each group.
15216 foreach my $group (keys %grouped_by_reason) {
15217 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15218 @{$grouped_by_reason{$group}} ;
15219 }
15220
15221 # Finally, create the output text. For each reason (sorted by the
15222 # alphabetically first file that has that reason)...
15223 my @unused_files;
15224 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15225 cmp lc $grouped_by_reason{$b}->[0]
15226 }
15227 keys %grouped_by_reason)
15228 {
15229 # Add all the files that have that reason to the output. Start
15230 # with an empty line.
15231 push @unused_files, "\n\n";
15232 push @unused_files, map { "\n=item F<$_> \n" }
15233 @{$grouped_by_reason{$reason}};
15234 # And add the reason under the list of files
15235 push @unused_files, "\n$reason\n";
15236 }
15237
ee94c7d1
KW
15238 # Similarly, create the output text for the UCD section of the pod
15239 my @ucd_pod;
15240 foreach my $key (keys %ucd_pod) {
15241 next unless $ucd_pod{$key}->{'output_this'};
15242 push @ucd_pod, format_pod_line($indent_info_column,
15243 $ucd_pod{$key}->{'name'},
15244 $ucd_pod{$key}->{'info'},
15245 $ucd_pod{$key}->{'status'},
15246 );
15247 }
15248
15249 # Sort alphabetically, and fold for output
15250 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15251 my $ucd_pod = simple_fold(\@ucd_pod,
15252 ' ',
15253 $indent_info_column,
15254 $automatic_pod_indent);
15255 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
15256 . "\n"
15257 . $ucd_pod;
12916dad
MS
15258 local $" = "";
15259
99870f4d
KW
15260 # Everything is ready to assemble.
15261 my @OUT = << "END";
15262=begin comment
15263
15264$HEADER
15265
15266To change this file, edit $0 instead.
15267
15268=end comment
15269
15270=head1 NAME
15271
8d099389 15272$pod_file - Index of Unicode Version $string_version character properties in Perl
99870f4d
KW
15273
15274=head1 DESCRIPTION
15275
8d099389
KW
15276This document provides information about the portion of the Unicode database
15277that deals with character properties, that is the portion that is defined on
15278single code points. (L</Other information in the Unicode data base>
15279below briefly mentions other data that Unicode provides.)
99870f4d 15280
8d099389
KW
15281Perl can provide access to all non-provisional Unicode character properties,
15282though not all are enabled by default. The omitted ones are the Unihan
15283properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15284deprecated or Unicode-internal properties. (An installation may choose to
ea5acc0f 15285recompile Perl's tables to change this. See L<Unicode character
8d099389
KW
15286properties that are NOT accepted by Perl>.)
15287
ee94c7d1
KW
15288For most purposes, access to Unicode properties from the Perl core is through
15289regular expression matches, as described in the next section.
15290For some special purposes, and to access the properties that are not suitable
15291for regular expression matching, all the Unicode character properties that
15292Perl handles are accessible via the standard L<Unicode::UCD> module, as
15293described in the section L</Properties accessible through Unicode::UCD>.
15294
8d099389
KW
15295Perl also provides some additional extensions and short-cut synonyms
15296for Unicode properties.
99870f4d
KW
15297
15298This document merely lists all available properties and does not attempt to
15299explain what each property really means. There is a brief description of each
043f3b3f
KW
15300Perl extension; see L<perlunicode/Other Properties> for more information on
15301these. There is some detail about Blocks, Scripts, General_Category,
99870f4d 15302and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
043f3b3f
KW
15303official Unicode properties, refer to the Unicode standard. A good starting
15304place is L<$unicode_reference_url>.
99870f4d
KW
15305
15306Note that you can define your own properties; see
15307L<perlunicode/"User-Defined Character Properties">.
15308
77173124 15309=head1 Properties accessible through C<\\p{}> and C<\\P{}>
99870f4d 15310
77173124
KW
15311The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15312most of the Unicode character properties. The table below shows all these
15313constructs, both single and compound forms.
99870f4d
KW
15314
15315B<Compound forms> consist of two components, separated by an equals sign or a
15316colon. The first component is the property name, and the second component is
15317the particular value of the property to match against, for example,
77173124 15318C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
99870f4d
KW
15319whose Script property is Greek.
15320
77173124 15321B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
99870f4d 15322their equivalent compound forms. The table shows these equivalences. (In our
77173124 15323example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
99870f4d 15324There are also a few Perl-defined single forms that are not shortcuts for a
77173124 15325compound form. One such is C<\\p{Word}>. These are also listed in the table.
99870f4d
KW
15326
15327In parsing these constructs, Perl always ignores Upper/lower case differences
77173124
KW
15328everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
15329C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
15330the left brace completely changes the meaning of the construct, from "match"
15331(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
15332for improved legibility.
99870f4d
KW
15333
15334Also, white space, hyphens, and underscores are also normally ignored
15335everywhere between the {braces}, and hence can be freely added or removed
15336even if the C</x> modifier hasn't been specified on the regular expression.
15337But $a_bold_stricter at the beginning of an entry in the table below
15338means that tighter (stricter) rules are used for that entry:
15339
15340=over 4
15341
77173124 15342=item Single form (C<\\p{name}>) tighter rules:
99870f4d
KW
15343
15344White space, hyphens, and underscores ARE significant
15345except for:
15346
15347=over 4
15348
15349=item * white space adjacent to a non-word character
15350
15351=item * underscores separating digits in numbers
15352
15353=back
15354
15355That means, for example, that you can freely add or remove white space
15356adjacent to (but within) the braces without affecting the meaning.
15357
77173124 15358=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
99870f4d
KW
15359
15360The tighter rules given above for the single form apply to everything to the
15361right of the colon or equals; the looser rules still apply to everything to
15362the left.
15363
15364That means, for example, that you can freely add or remove white space
15365adjacent to (but within) the braces and the colon or equal sign.
15366
15367=back
15368
78bb419c
KW
15369Some properties are considered obsolete by Unicode, but still available.
15370There are several varieties of obsolescence:
99870f4d
KW
15371
15372=over 4
15373
99870f4d
KW
15374=item Stabilized
15375
f8c38b14 15376A property may be stabilized. Such a determination does not indicate
5f7264c7
KW
15377that the property should or should not be used; instead it is a declaration
15378that the property will not be maintained nor extended for newly encoded
15379characters. Such properties are marked with $a_bold_stabilized in the
15380table.
99870f4d
KW
15381
15382=item Deprecated
15383
f8c38b14 15384A property may be deprecated, perhaps because its original intent
78bb419c
KW
15385has been replaced by another property, or because its specification was
15386somehow defective. This means that its use is strongly
99870f4d
KW
15387discouraged, so much so that a warning will be issued if used, unless the
15388regular expression is in the scope of a C<S<no warnings 'deprecated'>>
15389statement. $A_bold_deprecated flags each such entry in the table, and
15390the entry there for the longest, most descriptive version of the property will
15391give the reason it is deprecated, and perhaps advice. Perl may issue such a
15392warning, even for properties that aren't officially deprecated by Unicode,
15393when there used to be characters or code points that were matched by them, but
15394no longer. This is to warn you that your program may not work like it did on
15395earlier Unicode releases.
15396
15397A deprecated property may be made unavailable in a future Perl version, so it
15398is best to move away from them.
15399
c12f2655
KW
15400A deprecated property may also be stabilized, but this fact is not shown.
15401
15402=item Obsolete
15403
15404Properties marked with $a_bold_obsolete in the table are considered (plain)
15405obsolete. Generally this designation is given to properties that Unicode once
15406used for internal purposes (but not any longer).
15407
99870f4d
KW
15408=back
15409
15410Some Perl extensions are present for backwards compatibility and are
c12f2655
KW
15411discouraged from being used, but are not obsolete. $A_bold_discouraged
15412flags each such entry in the table. Future Unicode versions may force
15413some of these extensions to be removed without warning, replaced by another
15414property with the same name that means something different. Use the
15415equivalent shown instead.
99870f4d
KW
15416
15417@block_warning
15418
77173124 15419The table below has two columns. The left column contains the C<\\p{}>
98dc9551 15420constructs to look up, possibly preceded by the flags mentioned above; and
99870f4d
KW
15421the right column contains information about them, like a description, or
15422synonyms. It shows both the single and compound forms for each property that
15423has them. If the left column is a short name for a property, the right column
15424will give its longer, more descriptive name; and if the left column is the
15425longest name, the right column will show any equivalent shortest name, in both
15426single and compound forms if applicable.
15427
15428The right column will also caution you if a property means something different
15429than what might normally be expected.
15430
d57ccc9a
KW
15431All single forms are Perl extensions; a few compound forms are as well, and
15432are noted as such.
15433
99870f4d
KW
15434Numbers in (parentheses) indicate the total number of code points matched by
15435the property. For emphasis, those properties that match no code points at all
15436are listed as well in a separate section following the table.
15437
56ca34ca
KW
15438Most properties match the same code points regardless of whether C<"/i">
15439case-insensitive matching is specified or not. But a few properties are
15440affected. These are shown with the notation
15441
15442 (/i= other_property)
15443
15444in the second column. Under case-insensitive matching they match the
15445same code pode points as the property "other_property".
15446
99870f4d 15447There is no description given for most non-Perl defined properties (See
77173124 15448L<$unicode_reference_url> for that).
d73e5302 15449
99870f4d
KW
15450For compactness, 'B<*>' is used as a wildcard instead of showing all possible
15451combinations. For example, entries like:
d73e5302 15452
99870f4d 15453 \\p{Gc: *} \\p{General_Category: *}
5beb625e 15454
99870f4d
KW
15455mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
15456for the latter is also valid for the former. Similarly,
5beb625e 15457
99870f4d 15458 \\p{Is_*} \\p{*}
5beb625e 15459
77173124
KW
15460means that if and only if, for example, C<\\p{Foo}> exists, then
15461C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
15462And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
15463C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
15464underscore.
5beb625e 15465
99870f4d
KW
15466Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
15467And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
15468'N*' to indicate this, and doesn't have separate entries for the other
15469possibilities. Note that not all properties which have values 'Yes' and 'No'
15470are binary, and they have all their values spelled out without using this wild
15471card, and a C<NOT> clause in their description that highlights their not being
15472binary. These also require the compound form to match them, whereas true
15473binary properties have both single and compound forms available.
5beb625e 15474
99870f4d
KW
15475Note that all non-essential underscores are removed in the display of the
15476short names below.
5beb625e 15477
c12f2655 15478B<Legend summary:>
5beb625e 15479
99870f4d 15480=over 4
cf25bb62 15481
21405004 15482=item Z<>B<*> is a wild-card
cf25bb62 15483
99870f4d
KW
15484=item B<(\\d+)> in the info column gives the number of code points matched by
15485this property.
cf25bb62 15486
99870f4d 15487=item B<$DEPRECATED> means this is deprecated.
cf25bb62 15488
99870f4d 15489=item B<$OBSOLETE> means this is obsolete.
cf25bb62 15490
99870f4d 15491=item B<$STABILIZED> means this is stabilized.
cf25bb62 15492
99870f4d 15493=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 15494
c12f2655
KW
15495=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
15496stable.
5beb625e 15497
99870f4d 15498=back
da7fcca4 15499
99870f4d 15500$formatted_properties
cf25bb62 15501
99870f4d 15502$zero_matches
cf25bb62 15503
ee94c7d1
KW
15504=head1 Properties accessible through Unicode::UCD
15505
15506All the Unicode character properties mentioned above (except for those marked
15507as for internal use by Perl) are also accessible by
15508L<Unicode::UCD/prop_invlist()>.
15509
15510Due to their nature, not all Unicode character properties are suitable for
15511regular expression matches, nor C<prop_invlist()>. The remaining
15512non-provisional, non-internal ones are accessible via
15513L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
15514hasn't included; see L<below for which those are|/Unicode character properties
15515that are NOT accepted by Perl>).
15516
15517For compatibility with other parts of Perl, all the single forms given in the
15518table in the L<section above|/Properties accessible through \\p{} and \\P{}>
15519are recognized. BUT, there are some ambiguities between some Perl extensions
15520and the Unicode properties, all of which are silently resolved in favor of the
15521official Unicode property. To avoid surprises, you should only use
15522C<prop_invmap()> for forms listed in the table below, which omits the
15523non-recommended ones. The affected forms are the Perl single form equivalents
15524of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
15525C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
15526whose short name is C<sc>. The table indicates the current ambiguities in the
15527INFO column, beginning with the word C<"NOT">.
15528
15529The standard Unicode properties listed below are documented in
15530L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
15531L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
15532L<perlunicode/Other Properties>;
15533
15534The first column in the table is a name for the property; the second column is
15535an alternative name, if any, plus possibly some annotations. The alternative
15536name is the property's full name, unless that would simply repeat the first
15537column, in which case the second column indicates the property's short name
15538(if different). The annotations are given only in the entry for the full
15539name. If a property is obsolete, etc, the entry will be flagged with the same
15540characters used in the table in the L<section above|/Properties accessible
15541through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
15542
15543$ucd_pod
15544
15545=head1 Properties accessible through other means
15546
15547Certain properties are accessible also via core function calls. These are:
78bb419c 15548
99870f4d
KW
15549 Lowercase_Mapping lc() and lcfirst()
15550 Titlecase_Mapping ucfirst()
15551 Uppercase_Mapping uc()
12ac2576 15552
043f3b3f 15553Also, Case_Folding is accessible through the C</i> modifier in regular
4ce498ef
RS
15554expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
15555operator.
cf25bb62 15556
043f3b3f 15557And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
fbb93542
KW
15558interpolation in double-quoted strings and regular expressions; and functions
15559C<charnames::viacode()>, C<charnames::vianame()>, and
15560C<charnames::string_vianame()> (which require a C<use charnames ();> to be
15561specified.
cf25bb62 15562
ee94c7d1
KW
15563Finally, most properties related to decomposition are accessible via
15564L<Unicode::Normalize>.
15565
ea5acc0f 15566=head1 Unicode character properties that are NOT accepted by Perl
d2d499f5 15567
99870f4d
KW
15568Perl will generate an error for a few character properties in Unicode when
15569used in a regular expression. The non-Unihan ones are listed below, with the
15570reasons they are not accepted, perhaps with work-arounds. The short names for
15571the properties are listed enclosed in (parentheses).
c12f2655
KW
15572As described after the list, an installation can change the defaults and choose
15573to accept any of these. The list is machine generated based on the
15574choices made for the installation that generated this document.
ae6979a8 15575
99870f4d 15576@bad_re_properties
a3a8c5f0 15577
b7986f4f
KW
15578An installation can choose to allow any of these to be matched by downloading
15579the Unicode database from L<http://www.unicode.org/Public/> to
f3514a2f
KW
15580C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
15581controlling lists contained in the program
15582C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
15583(C<\%Config> is available from the Config module).
d73e5302 15584
8d099389
KW
15585=head1 Other information in the Unicode data base
15586
15587The Unicode data base is delivered in two different formats. The XML version
15588is valid for more modern Unicode releases. The other version is a collection
15589of files. The two are intended to give equivalent information. Perl uses the
15590older form; this allows you to recompile Perl to use early Unicode releases.
15591
15592The only non-character property that Perl currently supports is Named
15593Sequences, in which a sequence of code points
15594is given a name and generally treated as a single entity. (Perl supports
15595these via the C<\\N{...}> double-quotish construct,
15596L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
15597
15598Below is a list of the files in the Unicode data base that Perl doesn't
15599currently use, along with very brief descriptions of their purposes.
15600Some of the names of the files have been shortened from those that Unicode
15601uses, in order to allow them to be distinguishable from similarly named files
15602on file systems for which only the first 8 characters of a name are
15603significant.
15604
15605=over 4
15606
15607@unused_files
15608
15609=back
15610
99870f4d 15611=head1 SEE ALSO
d73e5302 15612
99870f4d 15613L<$unicode_reference_url>
12ac2576 15614
99870f4d 15615L<perlrecharclass>
12ac2576 15616
99870f4d 15617L<perlunicode>
d73e5302 15618
99870f4d 15619END
d73e5302 15620
9218f1cf
KW
15621 # And write it. The 0 means no utf8.
15622 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
99870f4d
KW
15623 return;
15624}
d73e5302 15625
99870f4d
KW
15626sub make_Heavy () {
15627 # Create and write Heavy.pl, which passes info about the tables to
15628 # utf8_heavy.pl
12ac2576 15629
143b2c48
KW
15630 # Stringify structures for output
15631 my $loose_property_name_of
15632 = simple_dumper(\%loose_property_name_of, ' ' x 4);
15633 chomp $loose_property_name_of;
15634
15635 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
15636 chomp $stricter_to_file_of;
15637
15638 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
15639 chomp $loose_to_file_of;
15640
15641 my $nv_floating_to_rational
15642 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
15643 chomp $nv_floating_to_rational;
15644
15645 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
15646 chomp $why_deprecated;
15647
15648 # We set the key to the file when we associated files with tables, but we
15649 # couldn't do the same for the value then, as we might not have the file
15650 # for the alternate table figured out at that time.
15651 foreach my $cased (keys %caseless_equivalent_to) {
15652 my @path = $caseless_equivalent_to{$cased}->file_path;
15653 my $path = join '/', @path[1, -1];
15654 $caseless_equivalent_to{$cased} = $path;
15655 }
15656 my $caseless_equivalent_to
15657 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
15658 chomp $caseless_equivalent_to;
15659
315bfd4e
KW
15660 my $loose_property_to_file_of
15661 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
15662 chomp $loose_property_to_file_of;
15663
89cf10cc
KW
15664 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
15665 chomp $file_to_swash_name;
15666
99870f4d
KW
15667 my @heavy = <<END;
15668$HEADER
126c3d4e 15669$INTERNAL_ONLY_HEADER
d73e5302 15670
01da8b85 15671# This file is for the use of utf8_heavy.pl and Unicode::UCD
12ac2576 15672
c12f2655
KW
15673# Maps Unicode (not Perl single-form extensions) property names in loose
15674# standard form to their corresponding standard names
99870f4d 15675\%utf8::loose_property_name_of = (
143b2c48 15676$loose_property_name_of
99870f4d 15677);
12ac2576 15678
99870f4d
KW
15679# Maps property, table to file for those using stricter matching
15680\%utf8::stricter_to_file_of = (
143b2c48 15681$stricter_to_file_of
99870f4d 15682);
12ac2576 15683
99870f4d
KW
15684# Maps property, table to file for those using loose matching
15685\%utf8::loose_to_file_of = (
143b2c48 15686$loose_to_file_of
99870f4d 15687);
12ac2576 15688
99870f4d
KW
15689# Maps floating point to fractional form
15690\%utf8::nv_floating_to_rational = (
143b2c48 15691$nv_floating_to_rational
99870f4d 15692);
12ac2576 15693
99870f4d
KW
15694# If a floating point number doesn't have enough digits in it to get this
15695# close to a fraction, it isn't considered to be that fraction even if all the
15696# digits it does have match.
15697\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 15698
99870f4d
KW
15699# Deprecated tables to generate a warning for. The key is the file containing
15700# the table, so as to avoid duplication, as many property names can map to the
15701# file, but we only need one entry for all of them.
15702\%utf8::why_deprecated = (
143b2c48 15703$why_deprecated
99870f4d 15704);
12ac2576 15705
143b2c48 15706# A few properties have different behavior under /i matching. This maps
d867ccfb
KW
15707# those to substitute files to use under /i.
15708\%utf8::caseless_equivalent = (
143b2c48 15709$caseless_equivalent_to
d867ccfb
KW
15710);
15711
315bfd4e
KW
15712# Property names to mapping files
15713\%utf8::loose_property_to_file_of = (
15714$loose_property_to_file_of
15715);
15716
89cf10cc
KW
15717# Files to the swash names within them.
15718\%utf8::file_to_swash_name = (
15719$file_to_swash_name
15720);
15721
99870f4d
KW
157221;
15723END
12ac2576 15724
9218f1cf 15725 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
99870f4d 15726 return;
12ac2576
JP
15727}
15728
52dc8b5d 15729sub make_Name_pm () {
6f424f62 15730 # Create and write Name.pm, which contains subroutines and data to use in
52dc8b5d
KW
15731 # conjunction with Name.pl
15732
bb1dd3da
KW
15733 # Maybe there's nothing to do.
15734 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
15735
52dc8b5d
KW
15736 my @name = <<END;
15737$HEADER
126c3d4e 15738$INTERNAL_ONLY_HEADER
52dc8b5d 15739END
0f6f7bc2 15740
fb848dce
KW
15741 # Convert these structures to output format.
15742 my $code_points_ending_in_code_point =
15743 main::simple_dumper(\@code_points_ending_in_code_point,
15744 ' ' x 8);
15745 my $names = main::simple_dumper(\%names_ending_in_code_point,
15746 ' ' x 8);
15747 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
0f6f7bc2 15748 ' ' x 8);
0f6f7bc2 15749
fb848dce
KW
15750 # Do the same with the Hangul names,
15751 my $jamo;
15752 my $jamo_l;
15753 my $jamo_v;
15754 my $jamo_t;
15755 my $jamo_re;
15756 if ($has_hangul_syllables) {
0f6f7bc2 15757
fb848dce
KW
15758 # Construct a regular expression of all the possible
15759 # combinations of the Hangul syllables.
15760 my @L_re; # Leading consonants
15761 for my $i ($LBase .. $LBase + $LCount - 1) {
15762 push @L_re, $Jamo{$i}
15763 }
15764 my @V_re; # Middle vowels
15765 for my $i ($VBase .. $VBase + $VCount - 1) {
15766 push @V_re, $Jamo{$i}
15767 }
15768 my @T_re; # Trailing consonants
15769 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
15770 push @T_re, $Jamo{$i}
15771 }
0f6f7bc2 15772
fb848dce
KW
15773 # The whole re is made up of the L V T combination.
15774 $jamo_re = '('
15775 . join ('|', sort @L_re)
15776 . ')('
15777 . join ('|', sort @V_re)
15778 . ')('
15779 . join ('|', sort @T_re)
15780 . ')?';
0f6f7bc2 15781
fb848dce
KW
15782 # These hashes needed by the algorithm were generated
15783 # during reading of the Jamo.txt file
15784 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
15785 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
15786 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
15787 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
15788 }
0f6f7bc2 15789
6f424f62 15790 push @name, <<END;
0f6f7bc2 15791
e7a078a0
KW
15792package charnames;
15793
6f424f62
KW
15794# This module contains machine-generated tables and code for the
15795# algorithmically-determinable Unicode character names. The following
15796# routines can be used to translate between name and code point and vice versa
0f6f7bc2
KW
15797
15798{ # Closure
15799
92199589
KW
15800 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
15801 # two must be 10; if there are 5, the first must not be a 0. Written this
15802 # way to decrease backtracking. The first regex allows the code point to
15803 # be at the end of a word, but to work properly, the word shouldn't end
15804 # with a valid hex character. The second one won't match a code point at
15805 # the end of a word, and doesn't have the run-on issue
0f6f7bc2
KW
15806 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
15807 my \$code_point_re = qr/$code_point_re/;
15808
b58a05e5
KW
15809 # In the following hash, the keys are the bases of names which include
15810 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
0f6f7bc2
KW
15811 # of each key is another hash which is used to get the low and high ends
15812 # for each range of code points that apply to the name.
15813 my %names_ending_in_code_point = (
15814$names
15815 );
15816
15817 # The following hash is a copy of the previous one, except is for loose
15818 # matching, so each name has blanks and dashes squeezed out
15819 my %loose_names_ending_in_code_point = (
15820$loose_names
15821 );
15822
15823 # And the following array gives the inverse mapping from code points to
15824 # names. Lowest code points are first
15825 my \@code_points_ending_in_code_point = (
15826$code_points_ending_in_code_point
15827 );
15828END
fb848dce
KW
15829 # Earlier releases didn't have Jamos. No sense outputting
15830 # them unless will be used.
15831 if ($has_hangul_syllables) {
6f424f62 15832 push @name, <<END;
0f6f7bc2
KW
15833
15834 # Convert from code point to Jamo short name for use in composing Hangul
15835 # syllable names
15836 my %Jamo = (
15837$jamo
15838 );
15839
15840 # Leading consonant (can be null)
15841 my %Jamo_L = (
15842$jamo_l
15843 );
15844
15845 # Vowel
15846 my %Jamo_V = (
15847$jamo_v
15848 );
15849
15850 # Optional trailing consonant
15851 my %Jamo_T = (
15852$jamo_t
15853 );
15854
15855 # Computed re that splits up a Hangul name into LVT or LV syllables
15856 my \$syllable_re = qr/$jamo_re/;
15857
15858 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
15859 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
15860
15861 # These constants names and values were taken from the Unicode standard,
15862 # version 5.1, section 3.12. They are used in conjunction with Hangul
15863 # syllables
15864 my \$SBase = $SBase_string;
15865 my \$LBase = $LBase_string;
15866 my \$VBase = $VBase_string;
15867 my \$TBase = $TBase_string;
15868 my \$SCount = $SCount;
15869 my \$LCount = $LCount;
15870 my \$VCount = $VCount;
15871 my \$TCount = $TCount;
15872 my \$NCount = \$VCount * \$TCount;
15873END
fb848dce 15874 } # End of has Jamos
0f6f7bc2 15875
6f424f62 15876 push @name, << 'END';
0f6f7bc2
KW
15877
15878 sub name_to_code_point_special {
15879 my ($name, $loose) = @_;
15880
15881 # Returns undef if not one of the specially handled names; otherwise
15882 # returns the code point equivalent to the input name
15883 # $loose is non-zero if to use loose matching, 'name' in that case
15884 # must be input as upper case with all blanks and dashes squeezed out.
15885END
fb848dce 15886 if ($has_hangul_syllables) {
6f424f62 15887 push @name, << 'END';
0f6f7bc2
KW
15888
15889 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
15890 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
15891 {
15892 return if $name !~ qr/^$syllable_re$/;
15893 my $L = $Jamo_L{$1};
15894 my $V = $Jamo_V{$2};
15895 my $T = (defined $3) ? $Jamo_T{$3} : 0;
15896 return ($L * $VCount + $V) * $TCount + $T + $SBase;
15897 }
15898END
fb848dce 15899 }
6f424f62 15900 push @name, << 'END';
0f6f7bc2
KW
15901
15902 # Name must end in 'code_point' for this to handle.
15903 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
15904 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
15905
15906 my $base = $1;
15907 my $code_point = CORE::hex $2;
15908 my $names_ref;
15909
15910 if ($loose) {
15911 $names_ref = \%loose_names_ending_in_code_point;
15912 }
15913 else {
15914 return if $base !~ s/-$//;
15915 $names_ref = \%names_ending_in_code_point;
15916 }
15917
15918 # Name must be one of the ones which has the code point in it.
15919 return if ! $names_ref->{$base};
15920
15921 # Look through the list of ranges that apply to this name to see if
15922 # the code point is in one of them.
15923 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
15924 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
15925 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
15926
15927 # Here, the code point is in the range.
15928 return $code_point;
15929 }
15930
15931 # Here, looked like the name had a code point number in it, but
15932 # did not match one of the valid ones.
15933 return;
15934 }
15935
15936 sub code_point_to_name_special {
15937 my $code_point = shift;
15938
15939 # Returns the name of a code point if algorithmically determinable;
15940 # undef if not
15941END
fb848dce 15942 if ($has_hangul_syllables) {
6f424f62 15943 push @name, << 'END';
0f6f7bc2
KW
15944
15945 # If in the Hangul range, calculate the name based on Unicode's
15946 # algorithm
15947 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
15948 use integer;
15949 my $SIndex = $code_point - $SBase;
15950 my $L = $LBase + $SIndex / $NCount;
15951 my $V = $VBase + ($SIndex % $NCount) / $TCount;
15952 my $T = $TBase + $SIndex % $TCount;
15953 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
15954 $name .= $Jamo{$T} if $T != $TBase;
15955 return $name;
15956 }
15957END
fb848dce 15958 }
6f424f62 15959 push @name, << 'END';
0f6f7bc2
KW
15960
15961 # Look through list of these code points for one in range.
15962 foreach my $hash (@code_points_ending_in_code_point) {
15963 return if $code_point < $hash->{'low'};
15964 if ($code_point <= $hash->{'high'}) {
15965 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
15966 }
15967 }
15968 return; # None found
15969 }
15970} # End closure
15971
6f424f62 159721;
0f6f7bc2 15973END
52dc8b5d
KW
15974
15975 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
15976 return;
15977}
15978
9f077a68
KW
15979sub make_UCD () {
15980 # Create and write UCD.pl, which passes info about the tables to
15981 # Unicode::UCD
15982
f7be2375
KW
15983 # Create a mapping from each alias of Perl single-form extensions to all
15984 # its equivalent aliases, for quick look-up.
15985 my %perlprop_to_aliases;
15986 foreach my $table ($perl->tables) {
15987
15988 # First create the list of the aliases of each extension
15989 my @aliases_list; # List of legal aliases for this extension
15990
15991 my $table_name = $table->name;
15992 my $standard_table_name = standardize($table_name);
15993 my $table_full_name = $table->full_name;
15994 my $standard_table_full_name = standardize($table_full_name);
15995
15996 # Make sure that the list has both the short and full names
15997 push @aliases_list, $table_name, $table_full_name;
15998
15999 my $found_ucd = 0; # ? Did we actually get an alias that should be
16000 # output for this table
16001
16002 # Go through all the aliases (including the two just added), and add
16003 # any new unique ones to the list
16004 foreach my $alias ($table->aliases) {
16005
16006 # Skip non-legal names
0eac1e20 16007 next unless $alias->ok_as_filename;
f7be2375
KW
16008 next unless $alias->ucd;
16009
16010 $found_ucd = 1; # have at least one legal name
16011
16012 my $name = $alias->name;
16013 my $standard = standardize($name);
16014
16015 # Don't repeat a name that is equivalent to one already on the
16016 # list
16017 next if $standard eq $standard_table_name;
16018 next if $standard eq $standard_table_full_name;
16019
16020 push @aliases_list, $name;
16021 }
16022
16023 # If there were no legal names, don't output anything.
16024 next unless $found_ucd;
16025
16026 # To conserve memory in the program reading these in, omit full names
16027 # that are identical to the short name, when those are the only two
16028 # aliases for the property.
16029 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
16030 pop @aliases_list;
16031 }
16032
16033 # Here, @aliases_list is the list of all the aliases that this
16034 # extension legally has. Now can create a map to it from each legal
16035 # standardized alias
16036 foreach my $alias ($table->aliases) {
16037 next unless $alias->ucd;
0eac1e20 16038 next unless $alias->ok_as_filename;
f7be2375
KW
16039 push @{$perlprop_to_aliases{standardize($alias->name)}},
16040 @aliases_list;
16041 }
16042 }
16043
55a40252
KW
16044 # Make a list of all combinations of properties/values that are suppressed.
16045 my @suppressed;
10d7eb0d 16046 if (! $debug_skip) { # This tends to fail in this debug mode
b4e741dd 16047 foreach my $property_name (keys %why_suppressed) {
55a40252 16048
b4e741dd
KW
16049 # Just the value
16050 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
55a40252 16051
b4e741dd
KW
16052 # The hash may contain properties not in this release of Unicode
16053 next unless defined (my $property = property_ref($property_name));
55a40252 16054
b4e741dd
KW
16055 # Find all combinations
16056 foreach my $prop_alias ($property->aliases) {
16057 my $prop_alias_name = standardize($prop_alias->name);
55a40252 16058
b4e741dd
KW
16059 # If no =value, there's just one combination possibe for this
16060 if (! $value_name) {
55a40252 16061
b4e741dd
KW
16062 # The property may be suppressed, but there may be a proxy
16063 # for it, so it shouldn't be listed as suppressed
16064 next if $prop_alias->ucd;
16065 push @suppressed, $prop_alias_name;
16066 }
16067 else { # Otherwise
16068 foreach my $value_alias
16069 ($property->table($value_name)->aliases)
16070 {
16071 next if $value_alias->ucd;
55a40252 16072
b4e741dd
KW
16073 push @suppressed, "$prop_alias_name="
16074 . standardize($value_alias->name);
16075 }
55a40252
KW
16076 }
16077 }
16078 }
16079 }
16080
6a40599f
KW
16081 # Convert the structure below (designed for Name.pm) to a form that UCD
16082 # wants, so it doesn't have to modify it at all; i.e. so that it includes
16083 # an element for the Hangul syllables in the appropriate place, and
16084 # otherwise changes the name to include the "-<code point>" suffix.
16085 my @algorithm_names;
16086 my $done_hangul = 0;
16087
16088 # Copy it linearly.
16089 for my $i (0 .. @code_points_ending_in_code_point - 1) {
16090
16091 # Insert the hanguls in the correct place.
16092 if (! $done_hangul
16093 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16094 {
16095 $done_hangul = 1;
16096 push @algorithm_names, { low => $SBase,
16097 high => $SBase + $SCount - 1,
16098 name => '<hangul syllable>',
16099 };
16100 }
16101
16102 # Copy the current entry, modified.
16103 push @algorithm_names, {
16104 low => $code_points_ending_in_code_point[$i]->{'low'},
16105 high => $code_points_ending_in_code_point[$i]->{'high'},
16106 name =>
16107 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16108 };
16109 }
16110
9e4a1e86
KW
16111 # Serialize these structures for output.
16112 my $loose_to_standard_value
16113 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16114 chomp $loose_to_standard_value;
16115
86a52d1e
KW
16116 my $string_property_loose_to_name
16117 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16118 chomp $string_property_loose_to_name;
16119
f7be2375
KW
16120 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16121 chomp $perlprop_to_aliases;
16122
5d1df013
KW
16123 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16124 chomp $prop_aliases;
16125
1e863613
KW
16126 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16127 chomp $prop_value_aliases;
16128
55a40252
KW
16129 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16130 chomp $suppressed;
16131
6a40599f
KW
16132 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16133 chomp $algorithm_names;
16134
2df7880f
KW
16135 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16136 chomp $ambiguous_names;
16137
c15fda25
KW
16138 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16139 chomp $loose_defaults;
16140
9f077a68
KW
16141 my @ucd = <<END;
16142$HEADER
16143$INTERNAL_ONLY_HEADER
16144
16145# This file is for the use of Unicode::UCD
16146
16147# Highest legal Unicode code point
16148\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16149
16150# Hangul syllables
16151\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16152\$Unicode::UCD::HANGUL_COUNT = $SCount;
16153
9e4a1e86
KW
16154# Keys are all the possible "prop=value" combinations, in loose form; values
16155# are the standard loose name for the 'value' part of the key
16156\%Unicode::UCD::loose_to_standard_value = (
16157$loose_to_standard_value
16158);
16159
86a52d1e
KW
16160# String property loose names to standard loose name
16161\%Unicode::UCD::string_property_loose_to_name = (
16162$string_property_loose_to_name
16163);
16164
f7be2375
KW
16165# Keys are Perl extensions in loose form; values are each one's list of
16166# aliases
16167\%Unicode::UCD::loose_perlprop_to_name = (
16168$perlprop_to_aliases
16169);
16170
5d1df013
KW
16171# Keys are standard property name; values are each one's aliases
16172\%Unicode::UCD::prop_aliases = (
16173$prop_aliases
16174);
16175
1e863613
KW
16176# Keys of top level are standard property name; values are keys to another
16177# hash, Each one is one of the property's values, in standard form. The
16178# values are that prop-val's aliases. If only one specified, the short and
16179# long alias are identical.
16180\%Unicode::UCD::prop_value_aliases = (
16181$prop_value_aliases
16182);
16183
6a40599f
KW
16184# Ordered (by code point ordinal) list of the ranges of code points whose
16185# names are algorithmically determined. Each range entry is an anonymous hash
16186# of the start and end points and a template for the names within it.
16187\@Unicode::UCD::algorithmic_named_code_points = (
16188$algorithm_names
16189);
16190
2df7880f
KW
16191# The properties that as-is have two meanings, and which must be disambiguated
16192\%Unicode::UCD::ambiguous_names = (
16193$ambiguous_names
16194);
16195
c15fda25
KW
16196# Keys are the prop-val combinations which are the default values for the
16197# given property, expressed in standard loose form
16198\%Unicode::UCD::loose_defaults = (
16199$loose_defaults
16200);
16201
55a40252
KW
16202# All combinations of names that are suppressed.
16203# This is actually for UCD.t, so it knows which properties shouldn't have
16204# entries. If it got any bigger, would probably want to put it in its own
16205# file to use memory only when it was needed, in testing.
16206\@Unicode::UCD::suppressed_properties = (
16207$suppressed
16208);
16209
9f077a68
KW
162101;
16211END
16212
16213 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
16214 return;
16215}
52dc8b5d 16216
99870f4d
KW
16217sub write_all_tables() {
16218 # Write out all the tables generated by this program to files, as well as
16219 # the supporting data structures, pod file, and .t file.
16220
16221 my @writables; # List of tables that actually get written
16222 my %match_tables_to_write; # Used to collapse identical match tables
16223 # into one file. Each key is a hash function
16224 # result to partition tables into buckets.
16225 # Each value is an array of the tables that
16226 # fit in the bucket.
16227
16228 # For each property ...
16229 # (sort so that if there is an immutable file name, it has precedence, so
16230 # some other property can't come in and take over its file name. If b's
16231 # file name is defined, will return 1, meaning to take it first; don't
7fc6cb55
KW
16232 # care if both defined, as they had better be different anyway. And the
16233 # property named 'Perl' needs to be first (it doesn't have any immutable
16234 # file name) because empty properties are defined in terms of it's table
16235 # named 'Any'.)
99870f4d 16236 PROPERTY:
7fc6cb55
KW
16237 foreach my $property (sort { return -1 if $a == $perl;
16238 return 1 if $b == $perl;
16239 return defined $b->file
16240 } property_ref('*'))
16241 {
99870f4d
KW
16242 my $type = $property->type;
16243
16244 # And for each table for that property, starting with the mapping
16245 # table for it ...
16246 TABLE:
16247 foreach my $table($property,
16248
16249 # and all the match tables for it (if any), sorted so
16250 # the ones with the shortest associated file name come
16251 # first. The length sorting prevents problems of a
16252 # longer file taking a name that might have to be used
16253 # by a shorter one. The alphabetic sorting prevents
16254 # differences between releases
16255 sort { my $ext_a = $a->external_name;
16256 return 1 if ! defined $ext_a;
16257 my $ext_b = $b->external_name;
16258 return -1 if ! defined $ext_b;
a92d5c2e
KW
16259
16260 # But return the non-complement table before
16261 # the complement one, as the latter is defined
16262 # in terms of the former, and needs to have
16263 # the information for the former available.
16264 return 1 if $a->complement != 0;
16265 return -1 if $b->complement != 0;
16266
0a695432
KW
16267 # Similarly, return a subservient table after
16268 # a leader
16269 return 1 if $a->leader != $a;
16270 return -1 if $b->leader != $b;
16271
99870f4d
KW
16272 my $cmp = length $ext_a <=> length $ext_b;
16273
16274 # Return result if lengths not equal
16275 return $cmp if $cmp;
16276
16277 # Alphabetic if lengths equal
16278 return $ext_a cmp $ext_b
16279 } $property->tables
16280 )
16281 {
12ac2576 16282
99870f4d
KW
16283 # Here we have a table associated with a property. It could be
16284 # the map table (done first for each property), or one of the
16285 # other tables. Determine which type.
16286 my $is_property = $table->isa('Property');
16287
16288 my $name = $table->name;
16289 my $complete_name = $table->complete_name;
16290
16291 # See if should suppress the table if is empty, but warn if it
16292 # contains something.
0332277c
KW
16293 my $suppress_if_empty_warn_if_not
16294 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
99870f4d
KW
16295
16296 # Calculate if this table should have any code points associated
16297 # with it or not.
16298 my $expected_empty =
16299
16300 # $perl should be empty, as well as properties that we just
16301 # don't do anything with
16302 ($is_property
16303 && ($table == $perl
16304 || grep { $complete_name eq $_ }
16305 @unimplemented_properties
16306 )
16307 )
16308
16309 # Match tables in properties we skipped populating should be
16310 # empty
16311 || (! $is_property && ! $property->to_create_match_tables)
16312
16313 # Tables and properties that are expected to have no code
16314 # points should be empty
16315 || $suppress_if_empty_warn_if_not
16316 ;
16317
16318 # Set a boolean if this table is the complement of an empty binary
16319 # table
16320 my $is_complement_of_empty_binary =
16321 $type == $BINARY &&
16322 (($table == $property->table('Y')
16323 && $property->table('N')->is_empty)
16324 || ($table == $property->table('N')
16325 && $property->table('Y')->is_empty));
16326
99870f4d
KW
16327 if ($table->is_empty) {
16328
99870f4d 16329 if ($suppress_if_empty_warn_if_not) {
301ba948
KW
16330 $table->set_fate($SUPPRESSED,
16331 $suppress_if_empty_warn_if_not);
99870f4d 16332 }
12ac2576 16333
c12f2655 16334 # Suppress (by skipping them) expected empty tables.
99870f4d
KW
16335 next TABLE if $expected_empty;
16336
16337 # And setup to later output a warning for those that aren't
16338 # known to be allowed to be empty. Don't do the warning if
16339 # this table is a child of another one to avoid duplicating
16340 # the warning that should come from the parent one.
16341 if (($table == $property || $table->parent == $table)
301ba948 16342 && $table->fate != $SUPPRESSED
395dfc19 16343 && $table->fate != $MAP_PROXIED
99870f4d
KW
16344 && ! grep { $complete_name =~ /^$_$/ }
16345 @tables_that_may_be_empty)
16346 {
16347 push @unhandled_properties, "$table";
16348 }
7fc6cb55
KW
16349
16350 # An empty table is just the complement of everything.
16351 $table->set_complement($Any) if $table != $property;
99870f4d
KW
16352 }
16353 elsif ($expected_empty) {
16354 my $because = "";
16355 if ($suppress_if_empty_warn_if_not) {
0332277c 16356 $because = " because $suppress_if_empty_warn_if_not";
99870f4d 16357 }
12ac2576 16358
99870f4d
KW
16359 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
16360 }
12ac2576 16361
14479722
KW
16362 # Some tables should match everything
16363 my $expected_full =
1583a95b
KW
16364 ($table->fate == $SUPPRESSED)
16365 ? 0
e75669bd
KW
16366 : ($is_property)
16367 ? # All these types of map tables will be full because
16368 # they will have been populated with defaults
16369 ($type == $ENUM || $type == $FORCED_BINARY)
16370
16371 : # A match table should match everything if its method
16372 # shows it should
16373 ($table->matches_all
16374
16375 # The complement of an empty binary table will match
16376 # everything
16377 || $is_complement_of_empty_binary
16378 )
14479722
KW
16379 ;
16380
99870f4d
KW
16381 my $count = $table->count;
16382 if ($expected_full) {
16383 if ($count != $MAX_UNICODE_CODEPOINTS) {
16384 Carp::my_carp("$table matches only "
16385 . clarify_number($count)
16386 . " Unicode code points but should match "
16387 . clarify_number($MAX_UNICODE_CODEPOINTS)
16388 . " (off by "
16389 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
16390 . "). Proceeding anyway.");
16391 }
12ac2576 16392
99870f4d
KW
16393 # Here is expected to be full. If it is because it is the
16394 # complement of an (empty) binary table that is to be
16395 # suppressed, then suppress this one as well.
16396 if ($is_complement_of_empty_binary) {
16397 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
16398 my $opposing = $property->table($opposing_name);
16399 my $opposing_status = $opposing->status;
16400 if ($opposing_status) {
16401 $table->set_status($opposing_status,
16402 $opposing->status_info);
16403 }
16404 }
16405 }
26e2a4d8
KW
16406 elsif ($count == $MAX_UNICODE_CODEPOINTS
16407 && ($table == $property || $table->leader == $table)
c6005d23 16408 && $table->property->status ne $PLACEHOLDER)
26e2a4d8 16409 {
99870f4d 16410 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
99870f4d 16411 }
d73e5302 16412
c6005d23 16413 if ($table->fate >= $SUPPRESSED) {
99870f4d
KW
16414 if (! $is_property) {
16415 my @children = $table->children;
16416 foreach my $child (@children) {
c6005d23 16417 if ($child->fate < $SUPPRESSED) {
99870f4d
KW
16418 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
16419 }
16420 }
16421 }
16422 next TABLE;
d73e5302 16423
99870f4d 16424 }
2df7880f 16425
99870f4d
KW
16426 if (! $is_property) {
16427
2df7880f
KW
16428 make_ucd_table_pod_entries($table) if $table->property == $perl;
16429
99870f4d
KW
16430 # Several things need to be done just once for each related
16431 # group of match tables. Do them on the parent.
16432 if ($table->parent == $table) {
16433
16434 # Add an entry in the pod file for the table; it also does
16435 # the children.
d1476e4d 16436 make_re_pod_entries($table) if defined $pod_directory;
99870f4d
KW
16437
16438 # See if the the table matches identical code points with
16439 # something that has already been output. In that case,
16440 # no need to have two files with the same code points in
16441 # them. We use the table's hash() method to store these
16442 # in buckets, so that it is quite likely that if two
16443 # tables are in the same bucket they will be identical, so
16444 # don't have to compare tables frequently. The tables
16445 # have to have the same status to share a file, so add
16446 # this to the bucket hash. (The reason for this latter is
16447 # that Heavy.pl associates a status with a file.)
06671cbc
KW
16448 # We don't check tables that are inverses of others, as it
16449 # would lead to some coding complications, and checking
16450 # all the regular ones should find everything.
16451 if ($table->complement == 0) {
21be712a 16452 my $hash = $table->hash . ';' . $table->status;
99870f4d 16453
21be712a
KW
16454 # Look at each table that is in the same bucket as
16455 # this one would be.
16456 foreach my $comparison
16457 (@{$match_tables_to_write{$hash}})
16458 {
16459 if ($table->matches_identically_to($comparison)) {
16460 $table->set_equivalent_to($comparison,
99870f4d 16461 Related => 0);
21be712a
KW
16462 next TABLE;
16463 }
99870f4d 16464 }
d73e5302 16465
21be712a
KW
16466 # Here, not equivalent, add this table to the bucket.
16467 push @{$match_tables_to_write{$hash}}, $table;
06671cbc 16468 }
99870f4d
KW
16469 }
16470 }
16471 else {
16472
16473 # Here is the property itself.
16474 # Don't write out or make references to the $perl property
16475 next if $table == $perl;
16476
2df7880f
KW
16477 make_ucd_table_pod_entries($table);
16478
382cadab
KW
16479 # There is a mapping stored of the various synonyms to the
16480 # standardized name of the property for utf8_heavy.pl.
16481 # Also, the pod file contains entries of the form:
16482 # \p{alias: *} \p{full: *}
16483 # rather than show every possible combination of things.
99870f4d 16484
382cadab 16485 my @property_aliases = $property->aliases;
99870f4d 16486
382cadab
KW
16487 my $full_property_name = $property->full_name;
16488 my $property_name = $property->name;
16489 my $standard_property_name = standardize($property_name);
5d1df013
KW
16490 my $standard_property_full_name
16491 = standardize($full_property_name);
16492
16493 # We also create for Unicode::UCD a list of aliases for
16494 # the property. The list starts with the property name;
16495 # then its full name.
16496 my @property_list;
16497 my @standard_list;
16498 if ( $property->fate <= $MAP_PROXIED) {
16499 @property_list = ($property_name, $full_property_name);
16500 @standard_list = ($standard_property_name,
16501 $standard_property_full_name);
16502 }
99870f4d 16503
382cadab
KW
16504 # For each synonym ...
16505 for my $i (0 .. @property_aliases - 1) {
16506 my $alias = $property_aliases[$i];
16507 my $alias_name = $alias->name;
16508 my $alias_standard = standardize($alias_name);
99870f4d 16509
382cadab 16510
5d1df013
KW
16511 # Add other aliases to the list of property aliases
16512 if ($property->fate <= $MAP_PROXIED
16513 && ! grep { $alias_standard eq $_ } @standard_list)
16514 {
16515 push @property_list, $alias_name;
16516 push @standard_list, $alias_standard;
16517 }
382cadab
KW
16518
16519 # For utf8_heavy, set the mapping of the alias to the
16520 # property
86a52d1e
KW
16521 if ($type == $STRING) {
16522 if ($property->fate <= $MAP_PROXIED) {
16523 $string_property_loose_to_name{$alias_standard}
16524 = $standard_property_name;
16525 }
16526 }
16527 else {
99870f4d
KW
16528 if (exists ($loose_property_name_of{$alias_standard}))
16529 {
16530 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
16531 }
16532 else {
16533 $loose_property_name_of{$alias_standard}
16534 = $standard_property_name;
16535 }
16536
d1476e4d 16537 # Now for the re pod entry for this alias. Skip if not
23e33b60
KW
16538 # outputting a pod; skip the first one, which is the
16539 # full name so won't have an entry like: '\p{full: *}
16540 # \p{full: *}', and skip if don't want an entry for
16541 # this one.
16542 next if $i == 0
16543 || ! defined $pod_directory
33e96e72 16544 || ! $alias->make_re_pod_entry;
99870f4d 16545
01d970b5 16546 my $rhs = "\\p{$full_property_name: *}";
d57ccc9a
KW
16547 if ($property != $perl && $table->perl_extension) {
16548 $rhs .= ' (Perl extension)';
16549 }
99870f4d
KW
16550 push @match_properties,
16551 format_pod_line($indent_info_column,
16552 '\p{' . $alias->name . ': *}',
d57ccc9a 16553 $rhs,
99870f4d
KW
16554 $alias->status);
16555 }
382cadab 16556 }
d73e5302 16557
5d1df013
KW
16558 # The list of all possible names is attached to each alias, so
16559 # lookup is easy
16560 if (@property_list) {
16561 push @{$prop_aliases{$standard_list[0]}}, @property_list;
16562 }
16563
1e863613
KW
16564 if ($property->fate <= $MAP_PROXIED) {
16565
16566 # Similarly, we create for Unicode::UCD a list of
16567 # property-value aliases.
16568
16569 my $property_full_name = $property->full_name;
16570
16571 # Look at each table in the property...
16572 foreach my $table ($property->tables) {
16573 my @values_list;
16574 my $table_full_name = $table->full_name;
16575 my $standard_table_full_name
16576 = standardize($table_full_name);
16577 my $table_name = $table->name;
16578 my $standard_table_name = standardize($table_name);
16579
16580 # The list starts with the table name and its full
16581 # name.
16582 push @values_list, $table_name, $table_full_name;
16583
16584 # We add to the table each unique alias that isn't
16585 # discouraged from use.
16586 foreach my $alias ($table->aliases) {
16587 next if $alias->status
16588 && $alias->status eq $DISCOURAGED;
16589 my $name = $alias->name;
16590 my $standard = standardize($name);
16591 next if $standard eq $standard_table_name;
16592 next if $standard eq $standard_table_full_name;
16593 push @values_list, $name;
16594 }
5d1df013 16595
1e863613
KW
16596 # Here @values_list is a list of all the aliases for
16597 # the table. That is, all the property-values given
16598 # by this table. By agreement with Unicode::UCD,
16599 # if the name and full name are identical, and there
16600 # are no other names, drop the duplcate entry to save
16601 # memory.
16602 if (@values_list == 2
16603 && $values_list[0] eq $values_list[1])
16604 {
16605 pop @values_list
16606 }
16607
16608 # To save memory, unlike the similar list for property
16609 # aliases above, only the standard forms hve the list.
16610 # This forces an extra step of converting from input
16611 # name to standard name, but the savings are
16612 # considerable. (There is only marginal savings if we
16613 # did this with the property aliases.)
16614 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
16615 }
16616 }
d73e5302 16617
c12f2655 16618 # Don't write out a mapping file if not desired.
99870f4d
KW
16619 next if ! $property->to_output_map;
16620 }
d73e5302 16621
99870f4d
KW
16622 # Here, we know we want to write out the table, but don't do it
16623 # yet because there may be other tables that come along and will
16624 # want to share the file, and the file's comments will change to
16625 # mention them. So save for later.
16626 push @writables, $table;
16627
16628 } # End of looping through the property and all its tables.
16629 } # End of looping through all properties.
16630
16631 # Now have all the tables that will have files written for them. Do it.
16632 foreach my $table (@writables) {
16633 my @directory;
16634 my $filename;
16635 my $property = $table->property;
16636 my $is_property = ($table == $property);
16637 if (! $is_property) {
16638
16639 # Match tables for the property go in lib/$subdirectory, which is
16640 # the property's name. Don't use the standard file name for this,
16641 # as may get an unfamiliar alias
16642 @directory = ($matches_directory, $property->external_name);
16643 }
16644 else {
d73e5302 16645
99870f4d
KW
16646 @directory = $table->directory;
16647 $filename = $table->file;
16648 }
d73e5302 16649
98dc9551 16650 # Use specified filename if available, or default to property's
99870f4d
KW
16651 # shortest name. We need an 8.3 safe filename (which means "an 8
16652 # safe" filename, since after the dot is only 'pl', which is < 3)
16653 # The 2nd parameter is if the filename shouldn't be changed, and
16654 # it shouldn't iff there is a hard-coded name for this table.
16655 $filename = construct_filename(
16656 $filename || $table->external_name,
16657 ! $filename, # mutable if no filename
16658 \@directory);
d73e5302 16659
99870f4d 16660 register_file_for_name($table, \@directory, $filename);
d73e5302 16661
99870f4d
KW
16662 # Only need to write one file when shared by more than one
16663 # property
a92d5c2e
KW
16664 next if ! $is_property
16665 && ($table->leader != $table || $table->complement != 0);
d73e5302 16666
99870f4d
KW
16667 # Construct a nice comment to add to the file
16668 $table->set_final_comment;
16669
16670 $table->write;
cf25bb62 16671 }
d73e5302 16672
d73e5302 16673
99870f4d
KW
16674 # Write out the pod file
16675 make_pod;
16676
9f077a68 16677 # And Heavy.pl, Name.pm, UCD.pl
99870f4d 16678 make_Heavy;
52dc8b5d 16679 make_Name_pm;
9f077a68 16680 make_UCD;
d73e5302 16681
99870f4d 16682 make_property_test_script() if $make_test_script;
6b5ab373 16683 make_normalization_test_script() if $make_norm_test_script;
99870f4d 16684 return;
cf25bb62 16685}
d73e5302 16686
99870f4d
KW
16687my @white_space_separators = ( # This used only for making the test script.
16688 "",
16689 ' ',
16690 "\t",
16691 ' '
16692 );
d73e5302 16693
99870f4d
KW
16694sub generate_separator($) {
16695 # This used only for making the test script. It generates the colon or
16696 # equal separator between the property and property value, with random
16697 # white space surrounding the separator
d73e5302 16698
99870f4d 16699 my $lhs = shift;
d73e5302 16700
99870f4d 16701 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 16702
99870f4d
KW
16703 # Choose space before and after randomly
16704 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
16705 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 16706
99870f4d
KW
16707 # And return the whole complex, half the time using a colon, half the
16708 # equals
16709 return $spaces_before
16710 . (rand() < 0.5) ? '=' : ':'
16711 . $spaces_after;
16712}
76ccdbe2 16713
430ada4c 16714sub generate_tests($$$$$) {
99870f4d
KW
16715 # This used only for making the test script. It generates test cases that
16716 # are expected to compile successfully in perl. Note that the lhs and
16717 # rhs are assumed to already be as randomized as the caller wants.
16718
99870f4d
KW
16719 my $lhs = shift; # The property: what's to the left of the colon
16720 # or equals separator
16721 my $rhs = shift; # The property value; what's to the right
16722 my $valid_code = shift; # A code point that's known to be in the
16723 # table given by lhs=rhs; undef if table is
16724 # empty
16725 my $invalid_code = shift; # A code point known to not be in the table;
16726 # undef if the table is all code points
16727 my $warning = shift;
16728
16729 # Get the colon or equal
16730 my $separator = generate_separator($lhs);
16731
16732 # The whole 'property=value'
16733 my $name = "$lhs$separator$rhs";
16734
430ada4c 16735 my @output;
99870f4d
KW
16736 # Create a complete set of tests, with complements.
16737 if (defined $valid_code) {
430ada4c
NC
16738 push @output, <<"EOC"
16739Expect(1, $valid_code, '\\p{$name}', $warning);
16740Expect(0, $valid_code, '\\p{^$name}', $warning);
16741Expect(0, $valid_code, '\\P{$name}', $warning);
16742Expect(1, $valid_code, '\\P{^$name}', $warning);
16743EOC
99870f4d
KW
16744 }
16745 if (defined $invalid_code) {
430ada4c
NC
16746 push @output, <<"EOC"
16747Expect(0, $invalid_code, '\\p{$name}', $warning);
16748Expect(1, $invalid_code, '\\p{^$name}', $warning);
16749Expect(1, $invalid_code, '\\P{$name}', $warning);
16750Expect(0, $invalid_code, '\\P{^$name}', $warning);
16751EOC
16752 }
16753 return @output;
99870f4d 16754}
cf25bb62 16755
430ada4c 16756sub generate_error($$$) {
99870f4d
KW
16757 # This used only for making the test script. It generates test cases that
16758 # are expected to not only not match, but to be syntax or similar errors
16759
99870f4d
KW
16760 my $lhs = shift; # The property: what's to the left of the
16761 # colon or equals separator
16762 my $rhs = shift; # The property value; what's to the right
16763 my $already_in_error = shift; # Boolean; if true it's known that the
16764 # unmodified lhs and rhs will cause an error.
16765 # This routine should not force another one
16766 # Get the colon or equal
16767 my $separator = generate_separator($lhs);
16768
16769 # Since this is an error only, don't bother to randomly decide whether to
16770 # put the error on the left or right side; and assume that the rhs is
16771 # loosely matched, again for convenience rather than rigor.
16772 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
16773
16774 my $property = $lhs . $separator . $rhs;
16775
430ada4c
NC
16776 return <<"EOC";
16777Error('\\p{$property}');
16778Error('\\P{$property}');
16779EOC
d73e5302
JH
16780}
16781
99870f4d
KW
16782# These are used only for making the test script
16783# XXX Maybe should also have a bad strict seps, which includes underscore.
16784
16785my @good_loose_seps = (
16786 " ",
16787 "-",
16788 "\t",
16789 "",
16790 "_",
16791 );
16792my @bad_loose_seps = (
16793 "/a/",
16794 ':=',
16795 );
16796
16797sub randomize_stricter_name {
16798 # This used only for making the test script. Take the input name and
16799 # return a randomized, but valid version of it under the stricter matching
16800 # rules.
16801
16802 my $name = shift;
16803 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16804
16805 # If the name looks like a number (integer, floating, or rational), do
16806 # some extra work
16807 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
16808 my $sign = $1;
16809 my $number = $2;
16810 my $separator = $3;
16811
16812 # If there isn't a sign, part of the time add a plus
16813 # Note: Not testing having any denominator having a minus sign
16814 if (! $sign) {
16815 $sign = '+' if rand() <= .3;
16816 }
16817
16818 # And add 0 or more leading zeros.
16819 $name = $sign . ('0' x int rand(10)) . $number;
16820
16821 if (defined $separator) {
16822 my $extra_zeros = '0' x int rand(10);
cf25bb62 16823
99870f4d
KW
16824 if ($separator eq '.') {
16825
16826 # Similarly, add 0 or more trailing zeros after a decimal
16827 # point
16828 $name .= $extra_zeros;
16829 }
16830 else {
16831
16832 # Or, leading zeros before the denominator
16833 $name =~ s,/,/$extra_zeros,;
16834 }
16835 }
cf25bb62 16836 }
d73e5302 16837
99870f4d
KW
16838 # For legibility of the test, only change the case of whole sections at a
16839 # time. To do this, first split into sections. The split returns the
16840 # delimiters
16841 my @sections;
16842 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
16843 trace $section if main::DEBUG && $to_trace;
16844
16845 if (length $section > 1 && $section !~ /\D/) {
16846
16847 # If the section is a sequence of digits, about half the time
16848 # randomly add underscores between some of them.
16849 if (rand() > .5) {
16850
16851 # Figure out how many underscores to add. max is 1 less than
16852 # the number of digits. (But add 1 at the end to make sure
16853 # result isn't 0, and compensate earlier by subtracting 2
16854 # instead of 1)
16855 my $num_underscores = int rand(length($section) - 2) + 1;
16856
16857 # And add them evenly throughout, for convenience, not rigor
16858 use integer;
16859 my $spacing = (length($section) - 1)/ $num_underscores;
16860 my $temp = $section;
16861 $section = "";
16862 for my $i (1 .. $num_underscores) {
16863 $section .= substr($temp, 0, $spacing, "") . '_';
16864 }
16865 $section .= $temp;
16866 }
16867 push @sections, $section;
16868 }
16869 else {
d73e5302 16870
99870f4d
KW
16871 # Here not a sequence of digits. Change the case of the section
16872 # randomly
16873 my $switch = int rand(4);
16874 if ($switch == 0) {
16875 push @sections, uc $section;
16876 }
16877 elsif ($switch == 1) {
16878 push @sections, lc $section;
16879 }
16880 elsif ($switch == 2) {
16881 push @sections, ucfirst $section;
16882 }
16883 else {
16884 push @sections, $section;
16885 }
16886 }
cf25bb62 16887 }
99870f4d
KW
16888 trace "returning", join "", @sections if main::DEBUG && $to_trace;
16889 return join "", @sections;
16890}
71d929cb 16891
99870f4d
KW
16892sub randomize_loose_name($;$) {
16893 # This used only for making the test script
71d929cb 16894
99870f4d
KW
16895 my $name = shift;
16896 my $want_error = shift; # if true, make an error
16897 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16898
16899 $name = randomize_stricter_name($name);
5beb625e
JH
16900
16901 my @parts;
99870f4d 16902 push @parts, $good_loose_seps[rand(@good_loose_seps)];
45376db6
KW
16903
16904 # Preserve trailing ones for the sake of not stripping the underscore from
16905 # 'L_'
16906 for my $part (split /[-\s_]+ (?= . )/, $name) {
5beb625e 16907 if (@parts) {
99870f4d
KW
16908 if ($want_error and rand() < 0.3) {
16909 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
16910 $want_error = 0;
16911 }
16912 else {
16913 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
16914 }
16915 }
99870f4d 16916 push @parts, $part;
5beb625e 16917 }
99870f4d
KW
16918 my $new = join("", @parts);
16919 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 16920
99870f4d 16921 if ($want_error) {
5beb625e 16922 if (rand() >= 0.5) {
99870f4d
KW
16923 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
16924 }
16925 else {
16926 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
16927 }
16928 }
16929 return $new;
16930}
16931
99870f4d
KW
16932# Used to make sure don't generate duplicate test cases.
16933my %test_generated;
5beb625e 16934
99870f4d
KW
16935sub make_property_test_script() {
16936 # This used only for making the test script
16937 # this written directly -- it's huge.
5beb625e 16938
99870f4d 16939 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 16940
99870f4d
KW
16941 # This uses randomness to test different possibilities without testing all
16942 # possibilities. To ensure repeatability, set the seed to 0. But if
16943 # tests are added, it will perturb all later ones in the .t file
16944 srand 0;
5beb625e 16945
3df51b85
KW
16946 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
16947
99870f4d
KW
16948 # Keep going down an order of magnitude
16949 # until find that adding this quantity to
16950 # 1 remains 1; but put an upper limit on
16951 # this so in case this algorithm doesn't
16952 # work properly on some platform, that we
16953 # won't loop forever.
16954 my $digits = 0;
16955 my $min_floating_slop = 1;
16956 while (1+ $min_floating_slop != 1
16957 && $digits++ < 50)
5beb625e 16958 {
99870f4d
KW
16959 my $next = $min_floating_slop / 10;
16960 last if $next == 0; # If underflows,
16961 # use previous one
16962 $min_floating_slop = $next;
5beb625e 16963 }
430ada4c
NC
16964
16965 # It doesn't matter whether the elements of this array contain single lines
16966 # or multiple lines. main::write doesn't count the lines.
16967 my @output;
99870f4d
KW
16968
16969 foreach my $property (property_ref('*')) {
16970 foreach my $table ($property->tables) {
16971
16972 # Find code points that match, and don't match this table.
16973 my $valid = $table->get_valid_code_point;
16974 my $invalid = $table->get_invalid_code_point;
16975 my $warning = ($table->status eq $DEPRECATED)
16976 ? "'deprecated'"
16977 : '""';
16978
16979 # Test each possible combination of the property's aliases with
16980 # the table's. If this gets to be too many, could do what is done
16981 # in the set_final_comment() for Tables
16982 my @table_aliases = $table->aliases;
16983 my @property_aliases = $table->property->aliases;
807807b7
KW
16984
16985 # Every property can be optionally be prefixed by 'Is_', so test
16986 # that those work, by creating such a new alias for each
16987 # pre-existing one.
16988 push @property_aliases, map { Alias->new("Is_" . $_->name,
16989 $_->loose_match,
33e96e72 16990 $_->make_re_pod_entry,
0eac1e20 16991 $_->ok_as_filename,
fd1e3e84
KW
16992 $_->status,
16993 $_->ucd,
16994 )
807807b7 16995 } @property_aliases;
99870f4d
KW
16996 my $max = max(scalar @table_aliases, scalar @property_aliases);
16997 for my $j (0 .. $max - 1) {
16998
16999 # The current alias for property is the next one on the list,
17000 # or if beyond the end, start over. Similarly for table
17001 my $property_name
17002 = $property_aliases[$j % @property_aliases]->name;
17003
17004 $property_name = "" if $table->property == $perl;
17005 my $table_alias = $table_aliases[$j % @table_aliases];
17006 my $table_name = $table_alias->name;
17007 my $loose_match = $table_alias->loose_match;
17008
17009 # If the table doesn't have a file, any test for it is
17010 # already guaranteed to be in error
17011 my $already_error = ! $table->file_path;
17012
17013 # Generate error cases for this alias.
430ada4c
NC
17014 push @output, generate_error($property_name,
17015 $table_name,
17016 $already_error);
99870f4d
KW
17017
17018 # If the table is guaranteed to always generate an error,
17019 # quit now without generating success cases.
17020 next if $already_error;
17021
17022 # Now for the success cases.
17023 my $random;
17024 if ($loose_match) {
17025
17026 # For loose matching, create an extra test case for the
17027 # standard name.
17028 my $standard = standardize($table_name);
17029
17030 # $test_name should be a unique combination for each test
17031 # case; used just to avoid duplicate tests
17032 my $test_name = "$property_name=$standard";
17033
17034 # Don't output duplicate test cases.
17035 if (! exists $test_generated{$test_name}) {
17036 $test_generated{$test_name} = 1;
430ada4c
NC
17037 push @output, generate_tests($property_name,
17038 $standard,
17039 $valid,
17040 $invalid,
17041 $warning,
17042 );
5beb625e 17043 }
99870f4d
KW
17044 $random = randomize_loose_name($table_name)
17045 }
17046 else { # Stricter match
17047 $random = randomize_stricter_name($table_name);
99598c8c 17048 }
99598c8c 17049
99870f4d
KW
17050 # Now for the main test case for this alias.
17051 my $test_name = "$property_name=$random";
17052 if (! exists $test_generated{$test_name}) {
17053 $test_generated{$test_name} = 1;
430ada4c
NC
17054 push @output, generate_tests($property_name,
17055 $random,
17056 $valid,
17057 $invalid,
17058 $warning,
17059 );
99870f4d
KW
17060
17061 # If the name is a rational number, add tests for the
17062 # floating point equivalent.
17063 if ($table_name =~ qr{/}) {
17064
17065 # Calculate the float, and find just the fraction.
17066 my $float = eval $table_name;
17067 my ($whole, $fraction)
17068 = $float =~ / (.*) \. (.*) /x;
17069
17070 # Starting with one digit after the decimal point,
17071 # create a test for each possible precision (number of
17072 # digits past the decimal point) until well beyond the
17073 # native number found on this machine. (If we started
17074 # with 0 digits, it would be an integer, which could
17075 # well match an unrelated table)
17076 PLACE:
17077 for my $i (1 .. $min_floating_slop + 3) {
17078 my $table_name = sprintf("%.*f", $i, $float);
17079 if ($i < $MIN_FRACTION_LENGTH) {
17080
17081 # If the test case has fewer digits than the
17082 # minimum acceptable precision, it shouldn't
17083 # succeed, so we expect an error for it.
17084 # E.g., 2/3 = .7 at one decimal point, and we
17085 # shouldn't say it matches .7. We should make
17086 # it be .667 at least before agreeing that the
17087 # intent was to match 2/3. But at the
17088 # less-than- acceptable level of precision, it
17089 # might actually match an unrelated number.
17090 # So don't generate a test case if this
17091 # conflating is possible. In our example, we
17092 # don't want 2/3 matching 7/10, if there is
17093 # a 7/10 code point.
17094 for my $existing
17095 (keys %nv_floating_to_rational)
17096 {
17097 next PLACE
17098 if abs($table_name - $existing)
17099 < $MAX_FLOATING_SLOP;
17100 }
430ada4c
NC
17101 push @output, generate_error($property_name,
17102 $table_name,
17103 1 # 1 => already an error
17104 );
99870f4d
KW
17105 }
17106 else {
17107
17108 # Here the number of digits exceeds the
17109 # minimum we think is needed. So generate a
17110 # success test case for it.
430ada4c
NC
17111 push @output, generate_tests($property_name,
17112 $table_name,
17113 $valid,
17114 $invalid,
17115 $warning,
17116 );
99870f4d
KW
17117 }
17118 }
99598c8c
JH
17119 }
17120 }
99870f4d
KW
17121 }
17122 }
17123 }
37e2e78e 17124
9218f1cf
KW
17125 &write($t_path,
17126 0, # Not utf8;
17127 [<DATA>,
17128 @output,
17129 (map {"Test_X('$_');\n"} @backslash_X_tests),
17130 "Finished();\n"]);
99870f4d
KW
17131 return;
17132}
99598c8c 17133
6b5ab373
KW
17134sub make_normalization_test_script() {
17135 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17136
17137 my $n_path = 'TestNorm.pl';
17138
17139 unshift @normalization_tests, <<'END';
17140use utf8;
17141use Test::More;
17142
17143sub ord_string { # Convert packed ords to printable string
17144 use charnames ();
17145 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17146 unpack "U*", shift) . "'";
17147 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
17148}
17149
17150sub Test_N {
17151 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17152 my $display_source = ord_string($source);
17153 my $display_nfc = ord_string($nfc);
17154 my $display_nfd = ord_string($nfd);
17155 my $display_nfkc = ord_string($nfkc);
17156 my $display_nfkd = ord_string($nfkd);
17157
17158 use Unicode::Normalize;
17159 # NFC
17160 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
17161 # nfkc == toNFC(nfkc) == toNFC(nfkd)
17162 #
17163 # NFD
17164 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
17165 # nfkd == toNFD(nfkc) == toNFD(nfkd)
17166 #
17167 # NFKC
17168 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17169 # toNFKC(nfkc) == toNFKC(nfkd)
17170 #
17171 # NFKD
17172 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17173 # toNFKD(nfkc) == toNFKD(nfkd)
17174
17175 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17176 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17177 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17178 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17179 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17180
17181 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17182 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17183 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17184 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17185 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17186
17187 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17188 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17189 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17190 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17191 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17192
17193 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17194 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17195 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17196 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17197 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17198}
17199END
17200
17201 &write($n_path,
17202 1, # Is utf8;
17203 [
17204 @normalization_tests,
17205 'done_testing();'
17206 ]);
17207 return;
17208}
17209
99870f4d
KW
17210# This is a list of the input files and how to handle them. The files are
17211# processed in their order in this list. Some reordering is possible if
17212# desired, but the v0 files should be first, and the extracted before the
17213# others except DAge.txt (as data in an extracted file can be over-ridden by
17214# the non-extracted. Some other files depend on data derived from an earlier
17215# file, like UnicodeData requires data from Jamo, and the case changing and
dbe7a391 17216# folding requires data from Unicode. Mostly, it is safest to order by first
99870f4d
KW
17217# version releases in (except the Jamo). DAge.txt is read before the
17218# extracted ones because of the rarely used feature $compare_versions. In the
17219# unlikely event that there were ever an extracted file that contained the Age
17220# property information, it would have to go in front of DAge.
17221#
17222# The version strings allow the program to know whether to expect a file or
17223# not, but if a file exists in the directory, it will be processed, even if it
17224# is in a version earlier than expected, so you can copy files from a later
17225# release into an earlier release's directory.
17226my @input_file_objects = (
17227 Input_file->new('PropertyAliases.txt', v0,
17228 Handler => \&process_PropertyAliases,
17229 ),
17230 Input_file->new(undef, v0, # No file associated with this
3df51b85 17231 Progress_Message => 'Finishing property setup',
99870f4d
KW
17232 Handler => \&finish_property_setup,
17233 ),
17234 Input_file->new('PropValueAliases.txt', v0,
17235 Handler => \&process_PropValueAliases,
17236 Has_Missings_Defaults => $NOT_IGNORED,
17237 ),
17238 Input_file->new('DAge.txt', v3.2.0,
17239 Has_Missings_Defaults => $NOT_IGNORED,
17240 Property => 'Age'
17241 ),
17242 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
17243 Property => 'General_Category',
17244 ),
17245 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
17246 Property => 'Canonical_Combining_Class',
17247 Has_Missings_Defaults => $NOT_IGNORED,
17248 ),
17249 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
17250 Property => 'Numeric_Type',
17251 Has_Missings_Defaults => $NOT_IGNORED,
17252 ),
17253 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
17254 Property => 'East_Asian_Width',
17255 Has_Missings_Defaults => $NOT_IGNORED,
17256 ),
17257 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
17258 Property => 'Line_Break',
17259 Has_Missings_Defaults => $NOT_IGNORED,
17260 ),
17261 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
17262 Property => 'Bidi_Class',
17263 Has_Missings_Defaults => $NOT_IGNORED,
17264 ),
17265 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
17266 Property => 'Decomposition_Type',
17267 Has_Missings_Defaults => $NOT_IGNORED,
17268 ),
17269 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
17270 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
17271 Property => 'Numeric_Value',
17272 Each_Line_Handler => \&filter_numeric_value_line,
17273 Has_Missings_Defaults => $NOT_IGNORED,
17274 ),
17275 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
17276 Property => 'Joining_Group',
17277 Has_Missings_Defaults => $NOT_IGNORED,
17278 ),
17279
17280 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
17281 Property => 'Joining_Type',
17282 Has_Missings_Defaults => $NOT_IGNORED,
17283 ),
17284 Input_file->new('Jamo.txt', v2.0.0,
17285 Property => 'Jamo_Short_Name',
17286 Each_Line_Handler => \&filter_jamo_line,
17287 ),
17288 Input_file->new('UnicodeData.txt', v1.1.5,
17289 Pre_Handler => \&setup_UnicodeData,
17290
17291 # We clean up this file for some early versions.
17292 Each_Line_Handler => [ (($v_version lt v2.0.0 )
17293 ? \&filter_v1_ucd
17294 : ($v_version eq v2.1.5)
17295 ? \&filter_v2_1_5_ucd
3ffed8c2
KW
17296
17297 # And for 5.14 Perls with 6.0,
17298 # have to also make changes
fe3193b5
KW
17299 : ($v_version ge v6.0.0
17300 && $^V lt v5.17.0)
3ffed8c2
KW
17301 ? \&filter_v6_ucd
17302 : undef),
99870f4d 17303
08978fe6
KW
17304 # Early versions did not have the
17305 # proper Unicode_1 names for the
17306 # controls
17307 (($v_version lt v3.0.0)
17308 ? \&filter_early_U1_names
17309 : undef),
17310
be864b6c
KW
17311 # Early versions did not correctly
17312 # use the later method for giving
17313 # decimal digit values
17314 (($v_version le v3.2.0)
17315 ? \&filter_bad_Nd_ucd
17316 : undef),
17317
99870f4d
KW
17318 # And the main filter
17319 \&filter_UnicodeData_line,
17320 ],
17321 EOF_Handler => \&EOF_UnicodeData,
17322 ),
17323 Input_file->new('ArabicShaping.txt', v2.0.0,
17324 Each_Line_Handler =>
17325 [ ($v_version lt 4.1.0)
17326 ? \&filter_old_style_arabic_shaping
17327 : undef,
17328 \&filter_arabic_shaping_line,
17329 ],
17330 Has_Missings_Defaults => $NOT_IGNORED,
17331 ),
17332 Input_file->new('Blocks.txt', v2.0.0,
17333 Property => 'Block',
17334 Has_Missings_Defaults => $NOT_IGNORED,
17335 Each_Line_Handler => \&filter_blocks_lines
17336 ),
17337 Input_file->new('PropList.txt', v2.0.0,
17338 Each_Line_Handler => (($v_version lt v3.1.0)
17339 ? \&filter_old_style_proplist
17340 : undef),
17341 ),
17342 Input_file->new('Unihan.txt', v2.0.0,
17343 Pre_Handler => \&setup_unihan,
17344 Optional => 1,
17345 Each_Line_Handler => \&filter_unihan_line,
17346 ),
17347 Input_file->new('SpecialCasing.txt', v2.1.8,
154ab528
KW
17348 Each_Line_Handler => ($v_version eq 2.1.8)
17349 ? \&filter_2_1_8_special_casing_line
17350 : \&filter_special_casing_line,
99870f4d 17351 Pre_Handler => \&setup_special_casing,
dbf17f82 17352 Has_Missings_Defaults => $IGNORED,
99870f4d
KW
17353 ),
17354 Input_file->new(
17355 'LineBreak.txt', v3.0.0,
17356 Has_Missings_Defaults => $NOT_IGNORED,
17357 Property => 'Line_Break',
17358 # Early versions had problematic syntax
17359 Each_Line_Handler => (($v_version lt v3.1.0)
17360 ? \&filter_early_ea_lb
17361 : undef),
17362 ),
17363 Input_file->new('EastAsianWidth.txt', v3.0.0,
17364 Property => 'East_Asian_Width',
17365 Has_Missings_Defaults => $NOT_IGNORED,
17366 # Early versions had problematic syntax
17367 Each_Line_Handler => (($v_version lt v3.1.0)
17368 ? \&filter_early_ea_lb
17369 : undef),
17370 ),
17371 Input_file->new('CompositionExclusions.txt', v3.0.0,
17372 Property => 'Composition_Exclusion',
17373 ),
17374 Input_file->new('BidiMirroring.txt', v3.0.1,
17375 Property => 'Bidi_Mirroring_Glyph',
1e958ea9
KW
17376 Has_Missings_Defaults => ($v_version lt v6.2.0)
17377 ? $NO_DEFAULTS
17378 # Is <none> which doesn't mean
17379 # anything to us, we will use the
17380 # null string
17381 : $IGNORED,
17382
99870f4d 17383 ),
38a91a4e 17384 Input_file->new("NormTest.txt", v3.0.0,
6b5ab373
KW
17385 Handler => \&process_NormalizationsTest,
17386 Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
37e2e78e 17387 ),
99870f4d
KW
17388 Input_file->new('CaseFolding.txt', v3.0.1,
17389 Pre_Handler => \&setup_case_folding,
17390 Each_Line_Handler =>
17391 [ ($v_version lt v3.1.0)
17392 ? \&filter_old_style_case_folding
17393 : undef,
17394 \&filter_case_folding_line
17395 ],
dbf17f82 17396 Has_Missings_Defaults => $IGNORED,
99870f4d
KW
17397 ),
17398 Input_file->new('DCoreProperties.txt', v3.1.0,
17399 # 5.2 changed this file
17400 Has_Missings_Defaults => (($v_version ge v5.2.0)
17401 ? $NOT_IGNORED
17402 : $NO_DEFAULTS),
17403 ),
17404 Input_file->new('Scripts.txt', v3.1.0,
17405 Property => 'Script',
17406 Has_Missings_Defaults => $NOT_IGNORED,
17407 ),
17408 Input_file->new('DNormalizationProps.txt', v3.1.0,
17409 Has_Missings_Defaults => $NOT_IGNORED,
17410 Each_Line_Handler => (($v_version lt v4.0.1)
17411 ? \&filter_old_style_normalization_lines
17412 : undef),
17413 ),
30769324 17414 Input_file->new('HangulSyllableType.txt', v0,
99870f4d 17415 Has_Missings_Defaults => $NOT_IGNORED,
30769324
KW
17416 Property => 'Hangul_Syllable_Type',
17417 Pre_Handler => ($v_version lt v4.0.0)
17418 ? \&generate_hst
17419 : undef,
17420 ),
99870f4d
KW
17421 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
17422 Property => 'Word_Break',
17423 Has_Missings_Defaults => $NOT_IGNORED,
17424 ),
30769324 17425 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
99870f4d
KW
17426 Property => 'Grapheme_Cluster_Break',
17427 Has_Missings_Defaults => $NOT_IGNORED,
30769324
KW
17428 Pre_Handler => ($v_version lt v4.1.0)
17429 ? \&generate_GCB
17430 : undef,
99870f4d 17431 ),
37e2e78e
KW
17432 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
17433 Handler => \&process_GCB_test,
17434 ),
17435 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
09ca89ce 17436 Skip => 'Validation Tests',
37e2e78e
KW
17437 ),
17438 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
09ca89ce 17439 Skip => 'Validation Tests',
37e2e78e
KW
17440 ),
17441 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
09ca89ce 17442 Skip => 'Validation Tests',
37e2e78e 17443 ),
99870f4d
KW
17444 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
17445 Property => 'Sentence_Break',
17446 Has_Missings_Defaults => $NOT_IGNORED,
17447 ),
17448 Input_file->new('NamedSequences.txt', v4.1.0,
17449 Handler => \&process_NamedSequences
17450 ),
f11ca562 17451 Input_file->new('NameAliases.txt', v0,
99870f4d 17452 Property => 'Name_Alias',
b8ba2307 17453 Pre_Handler => ($v_version le v6.0.0)
ce432655 17454 ? \&setup_early_name_alias
dcd72625 17455 : undef,
b8ba2307
KW
17456 Each_Line_Handler => ($v_version le v6.0.0)
17457 ? \&filter_early_version_name_alias_line
17458 : \&filter_later_version_name_alias_line,
99870f4d 17459 ),
37e2e78e 17460 Input_file->new("BidiTest.txt", v5.2.0,
09ca89ce 17461 Skip => 'Validation Tests',
37e2e78e 17462 ),
99870f4d
KW
17463 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
17464 Optional => 1,
17465 Each_Line_Handler => \&filter_unihan_line,
17466 ),
17467 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
17468 Optional => 1,
17469 Each_Line_Handler => \&filter_unihan_line,
17470 ),
17471 Input_file->new('UnihanIRGSources.txt', v5.2.0,
17472 Optional => 1,
17473 Pre_Handler => \&setup_unihan,
17474 Each_Line_Handler => \&filter_unihan_line,
17475 ),
17476 Input_file->new('UnihanNumericValues.txt', v5.2.0,
17477 Optional => 1,
17478 Each_Line_Handler => \&filter_unihan_line,
17479 ),
17480 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
17481 Optional => 1,
17482 Each_Line_Handler => \&filter_unihan_line,
17483 ),
17484 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
17485 Optional => 1,
17486 Each_Line_Handler => \&filter_unihan_line,
17487 ),
17488 Input_file->new('UnihanReadings.txt', v5.2.0,
17489 Optional => 1,
17490 Each_Line_Handler => \&filter_unihan_line,
17491 ),
17492 Input_file->new('UnihanVariants.txt', v5.2.0,
17493 Optional => 1,
17494 Each_Line_Handler => \&filter_unihan_line,
17495 ),
82aed44a
KW
17496 Input_file->new('ScriptExtensions.txt', v6.0.0,
17497 Property => 'Script_Extensions',
17498 Pre_Handler => \&setup_script_extensions,
fbe1e607 17499 Each_Line_Handler => \&filter_script_extensions_line,
4fec90df
KW
17500 Has_Missings_Defaults => (($v_version le v6.0.0)
17501 ? $NO_DEFAULTS
17502 : $IGNORED),
82aed44a 17503 ),
3111abc0
KW
17504 # The two Indic files are actually available starting in v6.0.0, but their
17505 # property values are missing from PropValueAliases.txt in that release,
17506 # so that further work would have to be done to get them to work properly
17507 # for that release.
17508 Input_file->new('IndicMatraCategory.txt', v6.1.0,
17509 Property => 'Indic_Matra_Category',
17510 Has_Missings_Defaults => $NOT_IGNORED,
17511 Skip => "Provisional; for the analysis and processing of Indic scripts",
17512 ),
17513 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
17514 Property => 'Indic_Syllabic_Category',
17515 Has_Missings_Defaults => $NOT_IGNORED,
17516 Skip => "Provisional; for the analysis and processing of Indic scripts",
17517 ),
99870f4d 17518);
99598c8c 17519
99870f4d
KW
17520# End of all the preliminaries.
17521# Do it...
99598c8c 17522
99870f4d
KW
17523if ($compare_versions) {
17524 Carp::my_carp(<<END
17525Warning. \$compare_versions is set. Output is not suitable for production
17526END
17527 );
17528}
99598c8c 17529
99870f4d
KW
17530# Put into %potential_files a list of all the files in the directory structure
17531# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 17532# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
17533my @ignored_files_full_names = map { File::Spec->rel2abs(
17534 internal_file_to_platform($_))
17535 } keys %ignored_files;
17536File::Find::find({
17537 wanted=>sub {
37e2e78e 17538 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 17539 my $full = lc(File::Spec->rel2abs($_));
99870f4d 17540 $potential_files{$full} = 1
37e2e78e 17541 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
17542 return;
17543 }
17544}, File::Spec->curdir());
99598c8c 17545
99870f4d 17546my @mktables_list_output_files;
cdcef19a 17547my $old_start_time = 0;
cf25bb62 17548
3644ba60
KW
17549if (! -e $file_list) {
17550 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
17551 $write_unchanged_files = 1;
17552} elsif ($write_unchanged_files) {
99870f4d
KW
17553 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
17554}
17555else {
17556 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
17557 my $file_handle;
23e33b60 17558 if (! open $file_handle, "<", $file_list) {
3644ba60 17559 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
17560 $glob_list = 1;
17561 }
17562 else {
17563 my @input;
17564
17565 # Read and parse mktables.lst, placing the results from the first part
17566 # into @input, and the second part into @mktables_list_output_files
17567 for my $list ( \@input, \@mktables_list_output_files ) {
17568 while (<$file_handle>) {
17569 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
17570 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
17571 $old_start_time = $1;
17572 }
99870f4d
KW
17573 next if /^ \s* (?: \# .* )? $/x;
17574 last if /^ =+ $/x;
17575 my ( $file ) = split /\t/;
17576 push @$list, $file;
cf25bb62 17577 }
99870f4d
KW
17578 @$list = uniques(@$list);
17579 next;
cf25bb62
JH
17580 }
17581
99870f4d
KW
17582 # Look through all the input files
17583 foreach my $input (@input) {
17584 next if $input eq 'version'; # Already have checked this.
cf25bb62 17585
99870f4d
KW
17586 # Ignore if doesn't exist. The checking about whether we care or
17587 # not is done via the Input_file object.
17588 next if ! file_exists($input);
5beb625e 17589
99870f4d
KW
17590 # The paths are stored with relative names, and with '/' as the
17591 # delimiter; convert to absolute on this machine
517956bf 17592 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
faf3cf6b
KW
17593 $potential_files{lc $full} = 1
17594 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 17595 }
5beb625e 17596 }
cf25bb62 17597
99870f4d
KW
17598 close $file_handle;
17599}
17600
17601if ($glob_list) {
17602
17603 # Here wants to process all .txt files in the directory structure.
17604 # Convert them to full path names. They are stored in the platform's
17605 # relative style
f86864ac
KW
17606 my @known_files;
17607 foreach my $object (@input_file_objects) {
17608 my $file = $object->file;
17609 next unless defined $file;
17610 push @known_files, File::Spec->rel2abs($file);
17611 }
99870f4d
KW
17612
17613 my @unknown_input_files;
faf3cf6b
KW
17614 foreach my $file (keys %potential_files) { # The keys are stored in lc
17615 next if grep { $file eq lc($_) } @known_files;
99870f4d
KW
17616
17617 # Here, the file is unknown to us. Get relative path name
17618 $file = File::Spec->abs2rel($file);
17619 push @unknown_input_files, $file;
17620
17621 # What will happen is we create a data structure for it, and add it to
17622 # the list of input files to process. First get the subdirectories
17623 # into an array
17624 my (undef, $directories, undef) = File::Spec->splitpath($file);
17625 $directories =~ s;/$;;; # Can have extraneous trailing '/'
17626 my @directories = File::Spec->splitdir($directories);
17627
17628 # If the file isn't extracted (meaning none of the directories is the
17629 # extracted one), just add it to the end of the list of inputs.
17630 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 17631 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
17632 }
17633 else {
17634
17635 # Here, the file is extracted. It needs to go ahead of most other
17636 # processing. Search for the first input file that isn't a
17637 # special required property (that is, find one whose first_release
17638 # is non-0), and isn't extracted. Also, the Age property file is
17639 # processed before the extracted ones, just in case
17640 # $compare_versions is set.
17641 for (my $i = 0; $i < @input_file_objects; $i++) {
17642 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
17643 && lc($input_file_objects[$i]->file) ne 'dage.txt'
17644 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 17645 {
99f78760 17646 splice @input_file_objects, $i, 0,
37e2e78e 17647 Input_file->new($file, v0);
99870f4d
KW
17648 last;
17649 }
cf25bb62 17650 }
99870f4d 17651
cf25bb62 17652 }
d2d499f5 17653 }
99870f4d 17654 if (@unknown_input_files) {
23e33b60 17655 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
17656
17657The following files are unknown as to how to handle. Assuming they are
17658typical property files. You'll know by later error messages if it worked or
17659not:
17660END
99f78760 17661 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
17662 }
17663} # End of looking through directory structure for more .txt files.
5beb625e 17664
99870f4d
KW
17665# Create the list of input files from the objects we have defined, plus
17666# version
97766bb9 17667my @input_files = qw(version Makefile);
99870f4d
KW
17668foreach my $object (@input_file_objects) {
17669 my $file = $object->file;
17670 next if ! defined $file; # Not all objects have files
17671 next if $object->optional && ! -e $file;
17672 push @input_files, $file;
17673}
5beb625e 17674
99870f4d
KW
17675if ( $verbosity >= $VERBOSE ) {
17676 print "Expecting ".scalar( @input_files )." input files. ",
17677 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
17678}
cf25bb62 17679
aeab6150
KW
17680# We set $most_recent to be the most recently changed input file, including
17681# this program itself (done much earlier in this file)
99870f4d 17682foreach my $in (@input_files) {
cdcef19a
KW
17683 next unless -e $in; # Keep going even if missing a file
17684 my $mod_time = (stat $in)[9];
aeab6150 17685 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
17686
17687 # See that the input files have distinct names, to warn someone if they
17688 # are adding a new one
17689 if ($make_list) {
17690 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
17691 $directories =~ s;/$;;; # Can have extraneous trailing '/'
17692 my @directories = File::Spec->splitdir($directories);
17693 my $base = $file =~ s/\.txt$//;
17694 construct_filename($file, 'mutable', \@directories);
cf25bb62 17695 }
99870f4d 17696}
cf25bb62 17697
97766bb9
KW
17698# We use 'Makefile' just to see if it has changed since the last time we
17699# rebuilt. Now discard it.
17700@input_files = grep { $_ ne 'Makefile' } @input_files;
17701
dff6c046 17702my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 17703 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 17704 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 17705
99870f4d
KW
17706# Now we check to see if any output files are older than youngest, if
17707# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 17708if (! $rebuild) {
99870f4d
KW
17709 foreach my $out (@mktables_list_output_files) {
17710 if ( ! file_exists($out)) {
17711 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 17712 $rebuild = 1;
99870f4d
KW
17713 last;
17714 }
17715 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
17716 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
17717 if ( (stat $out)[9] <= $most_recent ) {
17718 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 17719 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 17720 $rebuild = 1;
99870f4d 17721 last;
cf25bb62 17722 }
cf25bb62 17723 }
99870f4d 17724}
d1d1cd7a 17725if (! $rebuild) {
1265e11f 17726 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
17727 exit(0);
17728}
17729print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 17730
99870f4d
KW
17731# Ready to do the major processing. First create the perl pseudo-property.
17732$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 17733
99870f4d
KW
17734# Process each input file
17735foreach my $file (@input_file_objects) {
17736 $file->run;
d2d499f5
JH
17737}
17738
99870f4d 17739# Finish the table generation.
c4051cc5 17740
99870f4d
KW
17741print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
17742finish_Unicode();
c4051cc5 17743
99870f4d
KW
17744print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
17745compile_perl();
c4051cc5 17746
99870f4d
KW
17747print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
17748add_perl_synonyms();
c4051cc5 17749
99870f4d
KW
17750print "Writing tables\n" if $verbosity >= $PROGRESS;
17751write_all_tables();
c4051cc5 17752
99870f4d
KW
17753# Write mktables.lst
17754if ( $file_list and $make_list ) {
c4051cc5 17755
99870f4d
KW
17756 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
17757 foreach my $file (@input_files, @files_actually_output) {
17758 my (undef, $directories, $file) = File::Spec->splitpath($file);
17759 my @directories = File::Spec->splitdir($directories);
17760 $file = join '/', @directories, $file;
17761 }
17762
17763 my $ofh;
17764 if (! open $ofh,">",$file_list) {
17765 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
17766 return
17767 }
17768 else {
cdcef19a 17769 my $localtime = localtime $start_time;
99870f4d
KW
17770 print $ofh <<"END";
17771#
17772# $file_list -- File list for $0.
97050450 17773#
cdcef19a 17774# Autogenerated starting on $start_time ($localtime)
97050450
YO
17775#
17776# - First section is input files
99870f4d 17777# ($0 itself is not listed but is automatically considered an input)
98dc9551 17778# - Section separator is /^=+\$/
97050450
YO
17779# - Second section is a list of output files.
17780# - Lines matching /^\\s*#/ are treated as comments
17781# which along with blank lines are ignored.
17782#
17783
17784# Input files:
17785
99870f4d
KW
17786END
17787 print $ofh "$_\n" for sort(@input_files);
17788 print $ofh "\n=================================\n# Output files:\n\n";
17789 print $ofh "$_\n" for sort @files_actually_output;
17790 print $ofh "\n# ",scalar(@input_files)," input files\n",
17791 "# ",scalar(@files_actually_output)+1," output files\n\n",
17792 "# End list\n";
17793 close $ofh
17794 or Carp::my_carp("Failed to close $ofh: $!");
17795
17796 print "Filelist has ",scalar(@input_files)," input files and ",
17797 scalar(@files_actually_output)+1," output files\n"
17798 if $verbosity >= $VERBOSE;
17799 }
17800}
17801
17802# Output these warnings unless -q explicitly specified.
c83dffeb 17803if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
99870f4d
KW
17804 if (@unhandled_properties) {
17805 print "\nProperties and tables that unexpectedly have no code points\n";
17806 foreach my $property (sort @unhandled_properties) {
17807 print $property, "\n";
17808 }
17809 }
17810
17811 if (%potential_files) {
17812 print "\nInput files that are not considered:\n";
17813 foreach my $file (sort keys %potential_files) {
17814 print File::Spec->abs2rel($file), "\n";
17815 }
17816 }
17817 print "\nAll done\n" if $verbosity >= $VERBOSE;
17818}
5beb625e 17819exit(0);
cf25bb62 17820
99870f4d 17821# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 17822__DATA__
99870f4d 17823
5beb625e
JH
17824use strict;
17825use warnings;
17826
66fd7fd0
KW
17827# If run outside the normal test suite on an ASCII platform, you can
17828# just create a latin1_to_native() function that just returns its
17829# inputs, because that's the only function used from test.pl
17830require "test.pl";
17831
37e2e78e
KW
17832# Test qr/\X/ and the \p{} regular expression constructs. This file is
17833# constructed by mktables from the tables it generates, so if mktables is
17834# buggy, this won't necessarily catch those bugs. Tests are generated for all
17835# feasible properties; a few aren't currently feasible; see
17836# is_code_point_usable() in mktables for details.
99870f4d
KW
17837
17838# Standard test packages are not used because this manipulates SIG_WARN. It
17839# exits 0 if every non-skipped test succeeded; -1 if any failed.
17840
5beb625e
JH
17841my $Tests = 0;
17842my $Fails = 0;
99870f4d 17843
99870f4d
KW
17844sub Expect($$$$) {
17845 my $expected = shift;
17846 my $ord = shift;
17847 my $regex = shift;
17848 my $warning_type = shift; # Type of warning message, like 'deprecated'
17849 # or empty if none
17850 my $line = (caller)[2];
66fd7fd0 17851 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 17852
99870f4d 17853 # Convert the code point to hex form
23e33b60 17854 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 17855
99870f4d 17856 my @tests = "";
5beb625e 17857
37e2e78e
KW
17858 # The first time through, use all warnings. If the input should generate
17859 # a warning, add another time through with them turned off
99870f4d
KW
17860 push @tests, "no warnings '$warning_type';" if $warning_type;
17861
17862 foreach my $no_warnings (@tests) {
17863
17864 # Store any warning messages instead of outputting them
17865 local $SIG{__WARN__} = $SIG{__WARN__};
17866 my $warning_message;
17867 $SIG{__WARN__} = sub { $warning_message = $_[0] };
17868
17869 $Tests++;
17870
17871 # A string eval is needed because of the 'no warnings'.
17872 # Assumes no parens in the regular expression
17873 my $result = eval "$no_warnings
17874 my \$RegObj = qr($regex);
17875 $string =~ \$RegObj ? 1 : 0";
17876 if (not defined $result) {
17877 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
17878 $Fails++;
17879 }
17880 elsif ($result ^ $expected) {
17881 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
17882 $Fails++;
17883 }
17884 elsif ($warning_message) {
17885 if (! $warning_type || ($warning_type && $no_warnings)) {
17886 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
17887 $Fails++;
17888 }
17889 else {
17890 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
17891 }
17892 }
17893 elsif ($warning_type && ! $no_warnings) {
17894 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
17895 $Fails++;
17896 }
17897 else {
17898 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
17899 }
5beb625e 17900 }
99870f4d 17901 return;
5beb625e 17902}
d73e5302 17903
99870f4d
KW
17904sub Error($) {
17905 my $regex = shift;
5beb625e 17906 $Tests++;
99870f4d 17907 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 17908 $Fails++;
99870f4d
KW
17909 my $line = (caller)[2];
17910 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 17911 }
99870f4d
KW
17912 else {
17913 my $line = (caller)[2];
17914 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
17915 }
17916 return;
5beb625e
JH
17917}
17918
37e2e78e
KW
17919# GCBTest.txt character that separates grapheme clusters
17920my $breakable_utf8 = my $breakable = chr(0xF7);
17921utf8::upgrade($breakable_utf8);
17922
17923# GCBTest.txt character that indicates that the adjoining code points are part
17924# of the same grapheme cluster
17925my $nobreak_utf8 = my $nobreak = chr(0xD7);
17926utf8::upgrade($nobreak_utf8);
17927
17928sub Test_X($) {
17929 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
17930 # Each such line is a sequence of code points given by their hex numbers,
17931 # separated by the two characters defined just before this subroutine that
17932 # indicate that either there can or cannot be a break between the adjacent
17933 # code points. If there isn't a break, that means the sequence forms an
17934 # extended grapheme cluster, which means that \X should match the whole
17935 # thing. If there is a break, \X should stop there. This is all
17936 # converted by this routine into a match:
17937 # $string =~ /(\X)/,
17938 # Each \X should match the next cluster; and that is what is checked.
17939
17940 my $template = shift;
17941
17942 my $line = (caller)[2];
17943
17944 # The line contains characters above the ASCII range, but in Latin1. It
17945 # may or may not be in utf8, and if it is, it may or may not know it. So,
17946 # convert these characters to 8 bits. If knows is in utf8, simply
17947 # downgrade.
17948 if (utf8::is_utf8($template)) {
17949 utf8::downgrade($template);
17950 } else {
17951
17952 # Otherwise, if it is in utf8, but doesn't know it, the next lines
17953 # convert the two problematic characters to their 8-bit equivalents.
17954 # If it isn't in utf8, they don't harm anything.
17955 use bytes;
17956 $template =~ s/$nobreak_utf8/$nobreak/g;
17957 $template =~ s/$breakable_utf8/$breakable/g;
17958 }
17959
17960 # Get rid of the leading and trailing breakables
17961 $template =~ s/^ \s* $breakable \s* //x;
17962 $template =~ s/ \s* $breakable \s* $ //x;
17963
17964 # And no-breaks become just a space.
17965 $template =~ s/ \s* $nobreak \s* / /xg;
17966
17967 # Split the input into segments that are breakable between them.
17968 my @segments = split /\s*$breakable\s*/, $template;
17969
17970 my $string = "";
17971 my $display_string = "";
17972 my @should_match;
17973 my @should_display;
17974
17975 # Convert the code point sequence in each segment into a Perl string of
17976 # characters
17977 foreach my $segment (@segments) {
17978 my @code_points = split /\s+/, $segment;
17979 my $this_string = "";
17980 my $this_display = "";
17981 foreach my $code_point (@code_points) {
66fd7fd0 17982 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
17983 $this_display .= "\\x{$code_point}";
17984 }
17985
17986 # The next cluster should match the string in this segment.
17987 push @should_match, $this_string;
17988 push @should_display, $this_display;
17989 $string .= $this_string;
17990 $display_string .= $this_display;
17991 }
17992
17993 # If a string can be represented in both non-ut8 and utf8, test both cases
17994 UPGRADE:
17995 for my $to_upgrade (0 .. 1) {
678f13d5 17996
37e2e78e
KW
17997 if ($to_upgrade) {
17998
17999 # If already in utf8, would just be a repeat
18000 next UPGRADE if utf8::is_utf8($string);
18001
18002 utf8::upgrade($string);
18003 }
18004
18005 # Finally, do the \X match.
18006 my @matches = $string =~ /(\X)/g;
18007
18008 # Look through each matched cluster to verify that it matches what we
18009 # expect.
18010 my $min = (@matches < @should_match) ? @matches : @should_match;
18011 for my $i (0 .. $min - 1) {
18012 $Tests++;
18013 if ($matches[$i] eq $should_match[$i]) {
18014 print "ok $Tests - ";
18015 if ($i == 0) {
18016 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
18017 } else {
18018 print "And \\X #", $i + 1,
18019 }
18020 print " correctly matched $should_display[$i]; line $line\n";
18021 } else {
18022 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
18023 unpack("U*", $matches[$i]));
18024 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
18025 $i + 1,
18026 " should have matched $should_display[$i]",
18027 " but instead matched $matches[$i]",
18028 ". Abandoning rest of line $line\n";
18029 next UPGRADE;
18030 }
18031 }
18032
18033 # And the number of matches should equal the number of expected matches.
18034 $Tests++;
18035 if (@matches == @should_match) {
18036 print "ok $Tests - Nothing was left over; line $line\n";
18037 } else {
18038 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
18039 }
18040 }
18041
18042 return;
18043}
18044
99870f4d 18045sub Finished() {
f86864ac 18046 print "1..$Tests\n";
99870f4d 18047 exit($Fails ? -1 : 0);
5beb625e 18048}
99870f4d
KW
18049
18050Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 18051Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
18052Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18053Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 18054Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726