This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlunicode: nit
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a 19my $start_time;
98dc9551 20BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
21 # get it as close as possible
22 $start_time= time;
23}
24
23e33b60 25require 5.010_001;
d73e5302 26use strict;
99870f4d 27use warnings;
cf25bb62 28use Carp;
bd9ebcfd 29use Config;
99870f4d
KW
30use File::Find;
31use File::Path;
d07a55ed 32use File::Spec;
99870f4d 33use Text::Tabs;
6b64c11c 34use re "/aa";
99870f4d
KW
35
36sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 37my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d
KW
38
39##########################################################################
40#
41# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42# from the Unicode database files (lib/unicore/.../*.txt), It also generates
43# a pod file and a .t file
44#
45# The structure of this file is:
46# First these introductory comments; then
47# code needed for everywhere, such as debugging stuff; then
48# code to handle input parameters; then
49# data structures likely to be of external interest (some of which depend on
50# the input parameters, so follows them; then
51# more data structures and subroutine and package (class) definitions; then
52# the small actual loop to process the input files and finish up; then
53# a __DATA__ section, for the .t tests
54#
5f7264c7 55# This program works on all releases of Unicode through at least 6.0. The
99870f4d
KW
56# outputs have been scrutinized most intently for release 5.1. The others
57# have been checked for somewhat more than just sanity. It can handle all
58# existing Unicode character properties in those releases.
59#
99870f4d
KW
60# This program is mostly about Unicode character (or code point) properties.
61# A property describes some attribute or quality of a code point, like if it
62# is lowercase or not, its name, what version of Unicode it was first defined
63# in, or what its uppercase equivalent is. Unicode deals with these disparate
64# possibilities by making all properties into mappings from each code point
65# into some corresponding value. In the case of it being lowercase or not,
66# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67# property maps each Unicode code point to a single value, called a "property
68# value". (Hence each Unicode property is a true mathematical function with
69# exactly one value per code point.)
70#
71# When using a property in a regular expression, what is desired isn't the
72# mapping of the code point to its property's value, but the reverse (or the
73# mathematical "inverse relation"): starting with the property value, "Does a
74# code point map to it?" These are written in a "compound" form:
75# \p{property=value}, e.g., \p{category=punctuation}. This program generates
76# files containing the lists of code points that map to each such regular
77# expression property value, one file per list
78#
79# There is also a single form shortcut that Perl adds for many of the commonly
80# used properties. This happens for all binary properties, plus script,
81# general_category, and block properties.
82#
83# Thus the outputs of this program are files. There are map files, mostly in
84# the 'To' directory; and there are list files for use in regular expression
85# matching, all in subdirectories of the 'lib' directory, with each
86# subdirectory being named for the property that the lists in it are for.
87# Bookkeeping, test, and documentation files are also generated.
88
89my $matches_directory = 'lib'; # Where match (\p{}) files go.
90my $map_directory = 'To'; # Where map files go.
91
92# DATA STRUCTURES
93#
94# The major data structures of this program are Property, of course, but also
95# Table. There are two kinds of tables, very similar to each other.
96# "Match_Table" is the data structure giving the list of code points that have
97# a particular property value, mentioned above. There is also a "Map_Table"
98# data structure which gives the property's mapping from code point to value.
99# There are two structures because the match tables need to be combined in
100# various ways, such as constructing unions, intersections, complements, etc.,
101# and the map ones don't. And there would be problems, perhaps subtle, if
102# a map table were inadvertently operated on in some of those ways.
103# The use of separate classes with operations defined on one but not the other
104# prevents accidentally confusing the two.
105#
106# At the heart of each table's data structure is a "Range_List", which is just
107# an ordered list of "Ranges", plus ancillary information, and methods to
108# operate on them. A Range is a compact way to store property information.
109# Each range has a starting code point, an ending code point, and a value that
110# is meant to apply to all the code points between the two end points,
111# inclusive. For a map table, this value is the property value for those
112# code points. Two such ranges could be written like this:
113# 0x41 .. 0x5A, 'Upper',
114# 0x61 .. 0x7A, 'Lower'
115#
116# Each range also has a type used as a convenience to classify the values.
117# Most ranges in this program will be Type 0, or normal, but there are some
118# ranges that have a non-zero type. These are used only in map tables, and
119# are for mappings that don't fit into the normal scheme of things. Mappings
120# that require a hash entry to communicate with utf8.c are one example;
121# another example is mappings for charnames.pm to use which indicate a name
122# that is algorithmically determinable from its code point (and vice-versa).
123# These are used to significantly compact these tables, instead of listing
124# each one of the tens of thousands individually.
125#
126# In a match table, the value of a range is irrelevant (and hence the type as
127# well, which will always be 0), and arbitrarily set to the null string.
128# Using the example above, there would be two match tables for those two
129# entries, one named Upper would contain the 0x41..0x5A range, and the other
130# named Lower would contain 0x61..0x7A.
131#
132# Actually, there are two types of range lists, "Range_Map" is the one
133# associated with map tables, and "Range_List" with match tables.
134# Again, this is so that methods can be defined on one and not the other so as
135# to prevent operating on them in incorrect ways.
136#
137# Eventually, most tables are written out to files to be read by utf8_heavy.pl
138# in the perl core. All tables could in theory be written, but some are
139# suppressed because there is no current practical use for them. It is easy
140# to change which get written by changing various lists that are near the top
141# of the actual code in this file. The table data structures contain enough
142# ancillary information to allow them to be treated as separate entities for
143# writing, such as the path to each one's file. There is a heading in each
144# map table that gives the format of its entries, and what the map is for all
145# the code points missing from it. (This allows tables to be more compact.)
678f13d5 146#
99870f4d
KW
147# The Property data structure contains one or more tables. All properties
148# contain a map table (except the $perl property which is a
149# pseudo-property containing only match tables), and any properties that
150# are usable in regular expression matches also contain various matching
151# tables, one for each value the property can have. A binary property can
152# have two values, True and False (or Y and N, which are preferred by Unicode
153# terminology). Thus each of these properties will have a map table that
154# takes every code point and maps it to Y or N (but having ranges cuts the
155# number of entries in that table way down), and two match tables, one
156# which has a list of all the code points that map to Y, and one for all the
157# code points that map to N. (For each of these, a third table is also
158# generated for the pseudo Perl property. It contains the identical code
159# points as the Y table, but can be written, not in the compound form, but in
160# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
161# properties have several possible values, some have many, and properties like
162# Name have a different value for every named code point. Those will not,
163# unless the controlling lists are changed, have their match tables written
164# out. But all the ones which can be used in regular expression \p{} and \P{}
c12f2655
KW
165# constructs will. Prior to 5.14, generally a property would have either its
166# map table or its match tables written but not both. Again, what gets
167# written is controlled by lists which can easily be changed. Starting in
168# 5.14, advantage was taken of this, and all the map tables needed to
169# reconstruct the Unicode db are now written out, while suppressing the
170# Unicode .txt files that contain the data. Our tables are much more compact
171# than the .txt files, so a significant space savings was achieved.
172
173# Properties have a 'Type', like binary, or string, or enum depending on how
174# many match tables there are and the content of the maps. This 'Type' is
175# different than a range 'Type', so don't get confused by the two concepts
176# having the same name.
678f13d5 177#
99870f4d
KW
178# For information about the Unicode properties, see Unicode's UAX44 document:
179
180my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
181
182# As stated earlier, this program will work on any release of Unicode so far.
183# Most obvious problems in earlier data have NOT been corrected except when
184# necessary to make Perl or this program work reasonably. For example, no
c12f2655
KW
185# folding information was given in early releases, so this program substitutes
186# lower case instead, just so that a regular expression with the /i option
187# will do something that actually gives the right results in many cases.
188# There are also a couple other corrections for version 1.1.5, commented at
189# the point they are made. As an example of corrections that weren't made
190# (but could be) is this statement from DerivedAge.txt: "The supplementary
191# private use code points and the non-character code points were assigned in
192# version 2.0, but not specifically listed in the UCD until versions 3.0 and
193# 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
194# on Unicode version glitches is further down in these introductory comments.
99870f4d 195#
5f7264c7
KW
196# This program works on all non-provisional properties as of 6.0, though the
197# files for some are suppressed from apparent lack of demand for them. You
198# can change which are output by changing lists in this program.
678f13d5 199#
dc85bd38 200# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
201# loose matchings rules (from Unicode TR18):
202#
203# The recommended names for UCD properties and property values are in
204# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
205# [PropValue]. There are both abbreviated names and longer, more
206# descriptive names. It is strongly recommended that both names be
207# recognized, and that loose matching of property names be used,
208# whereby the case distinctions, whitespace, hyphens, and underbar
209# are ignored.
210# The program still allows Fuzzy to override its determination of if loose
211# matching should be used, but it isn't currently used, as it is no longer
212# needed; the calculations it makes are good enough.
678f13d5 213#
99870f4d
KW
214# SUMMARY OF HOW IT WORKS:
215#
216# Process arguments
217#
218# A list is constructed containing each input file that is to be processed
219#
220# Each file on the list is processed in a loop, using the associated handler
221# code for each:
222# The PropertyAliases.txt and PropValueAliases.txt files are processed
223# first. These files name the properties and property values.
224# Objects are created of all the property and property value names
225# that the rest of the input should expect, including all synonyms.
226# The other input files give mappings from properties to property
227# values. That is, they list code points and say what the mapping
228# is under the given property. Some files give the mappings for
229# just one property; and some for many. This program goes through
230# each file and populates the properties from them. Some properties
231# are listed in more than one file, and Unicode has set up a
232# precedence as to which has priority if there is a conflict. Thus
233# the order of processing matters, and this program handles the
234# conflict possibility by processing the overriding input files
235# last, so that if necessary they replace earlier values.
236# After this is all done, the program creates the property mappings not
237# furnished by Unicode, but derivable from what it does give.
238# The tables of code points that match each property value in each
239# property that is accessible by regular expressions are created.
240# The Perl-defined properties are created and populated. Many of these
241# require data determined from the earlier steps
242# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 243# and Unicode are reconciled and warned about.
99870f4d
KW
244# All the properties are written to files
245# Any other files are written, and final warnings issued.
678f13d5 246#
99870f4d
KW
247# For clarity, a number of operators have been overloaded to work on tables:
248# ~ means invert (take all characters not in the set). The more
249# conventional '!' is not used because of the possibility of confusing
250# it with the actual boolean operation.
251# + means union
252# - means subtraction
253# & means intersection
254# The precedence of these is the order listed. Parentheses should be
255# copiously used. These are not a general scheme. The operations aren't
256# defined for a number of things, deliberately, to avoid getting into trouble.
257# Operations are done on references and affect the underlying structures, so
258# that the copy constructors for them have been overloaded to not return a new
259# clone, but the input object itself.
678f13d5 260#
99870f4d
KW
261# The bool operator is deliberately not overloaded to avoid confusion with
262# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
263#
264# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
265#
266# This program needs to be able to run under miniperl. Therefore, it uses a
267# minimum of other modules, and hence implements some things itself that could
268# be gotten from CPAN
269#
270# This program uses inputs published by the Unicode Consortium. These can
271# change incompatibly between releases without the Perl maintainers realizing
272# it. Therefore this program is now designed to try to flag these. It looks
273# at the directories where the inputs are, and flags any unrecognized files.
274# It keeps track of all the properties in the files it handles, and flags any
275# that it doesn't know how to handle. It also flags any input lines that
276# don't match the expected syntax, among other checks.
277#
278# It is also designed so if a new input file matches one of the known
279# templates, one hopefully just needs to add it to a list to have it
280# processed.
281#
282# As mentioned earlier, some properties are given in more than one file. In
283# particular, the files in the extracted directory are supposedly just
284# reformattings of the others. But they contain information not easily
285# derivable from the other files, including results for Unihan, which this
286# program doesn't ordinarily look at, and for unassigned code points. They
287# also have historically had errors or been incomplete. In an attempt to
288# create the best possible data, this program thus processes them first to
289# glean information missing from the other files; then processes those other
290# files to override any errors in the extracted ones. Much of the design was
291# driven by this need to store things and then possibly override them.
292#
293# It tries to keep fatal errors to a minimum, to generate something usable for
294# testing purposes. It always looks for files that could be inputs, and will
295# warn about any that it doesn't know how to handle (the -q option suppresses
296# the warning).
99870f4d 297#
678f13d5
KW
298# Why is there more than one type of range?
299# This simplified things. There are some very specialized code points that
300# have to be handled specially for output, such as Hangul syllable names.
301# By creating a range type (done late in the development process), it
302# allowed this to be stored with the range, and overridden by other input.
303# Originally these were stored in another data structure, and it became a
304# mess trying to decide if a second file that was for the same property was
305# overriding the earlier one or not.
306#
307# Why are there two kinds of tables, match and map?
308# (And there is a base class shared by the two as well.) As stated above,
309# they actually are for different things. Development proceeded much more
310# smoothly when I (khw) realized the distinction. Map tables are used to
311# give the property value for every code point (actually every code point
312# that doesn't map to a default value). Match tables are used for regular
313# expression matches, and are essentially the inverse mapping. Separating
314# the two allows more specialized methods, and error checks so that one
315# can't just take the intersection of two map tables, for example, as that
316# is nonsensical.
99870f4d 317#
23e33b60
KW
318# DEBUGGING
319#
678f13d5
KW
320# This program is written so it will run under miniperl. Occasionally changes
321# will cause an error where the backtrace doesn't work well under miniperl.
322# To diagnose the problem, you can instead run it under regular perl, if you
323# have one compiled.
324#
325# There is a good trace facility. To enable it, first sub DEBUG must be set
326# to return true. Then a line like
327#
328# local $to_trace = 1 if main::DEBUG;
329#
330# can be added to enable tracing in its lexical scope or until you insert
331# another line:
332#
333# local $to_trace = 0 if main::DEBUG;
334#
335# then use a line like "trace $a, @b, %c, ...;
336#
337# Some of the more complex subroutines already have trace statements in them.
338# Permanent trace statements should be like:
339#
340# trace ... if main::DEBUG && $to_trace;
341#
342# If there is just one or a few files that you're debugging, you can easily
343# cause most everything else to be skipped. Change the line
344#
345# my $debug_skip = 0;
346#
347# to 1, and every file whose object is in @input_file_objects and doesn't have
348# a, 'non_skip => 1,' in its constructor will be skipped.
349#
b4a0206c 350# To compare the output tables, it may be useful to specify the -annotate
c4019d52
KW
351# flag. This causes the tables to expand so there is one entry for each
352# non-algorithmically named code point giving, currently its name, and its
353# graphic representation if printable (and you have a font that knows about
354# it). This makes it easier to see what the particular code points are in
355# each output table. The tables are usable, but because they don't have
356# ranges (for the most part), a Perl using them will run slower. Non-named
357# code points are annotated with a description of their status, and contiguous
358# ones with the same description will be output as a range rather than
359# individually. Algorithmically named characters are also output as ranges,
360# except when there are just a few contiguous ones.
361#
99870f4d
KW
362# FUTURE ISSUES
363#
364# The program would break if Unicode were to change its names so that
365# interior white space, underscores, or dashes differences were significant
366# within property and property value names.
367#
368# It might be easier to use the xml versions of the UCD if this program ever
369# would need heavy revision, and the ability to handle old versions was not
370# required.
371#
372# There is the potential for name collisions, in that Perl has chosen names
373# that Unicode could decide it also likes. There have been such collisions in
374# the past, with mostly Perl deciding to adopt the Unicode definition of the
375# name. However in the 5.2 Unicode beta testing, there were a number of such
376# collisions, which were withdrawn before the final release, because of Perl's
377# and other's protests. These all involved new properties which began with
378# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
379# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
380# Unicode document, so they are unlikely to be used by Unicode for another
381# purpose. However, they might try something beginning with 'In', or use any
382# of the other Perl-defined properties. This program will warn you of name
383# collisions, and refuse to generate tables with them, but manual intervention
384# will be required in this event. One scheme that could be implemented, if
385# necessary, would be to have this program generate another file, or add a
386# field to mktables.lst that gives the date of first definition of a property.
387# Each new release of Unicode would use that file as a basis for the next
388# iteration. And the Perl synonym addition code could sort based on the age
389# of the property, so older properties get priority, and newer ones that clash
390# would be refused; hence existing code would not be impacted, and some other
391# synonym would have to be used for the new property. This is ugly, and
392# manual intervention would certainly be easier to do in the short run; lets
393# hope it never comes to this.
678f13d5 394#
99870f4d
KW
395# A NOTE ON UNIHAN
396#
397# This program can generate tables from the Unihan database. But it doesn't
398# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
399# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
400# database was split into 8 different files, all beginning with the letters
401# 'Unihan'. This program will read those file(s) if present, but it needs to
402# know which of the many properties in the file(s) should have tables created
403# for them. It will create tables for any properties listed in
404# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
405# @cjk_properties array and the @cjk_property_values array. Thus, if a
406# property you want is not in those files of the release you are building
407# against, you must add it to those two arrays. Starting in 4.0, the
408# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
409# is present in the directory, a table will be generated for that property.
410# In 5.2, several more properties were added. For your convenience, the two
5f7264c7 411# arrays are initialized with all the 6.0 listed properties that are also in
99870f4d
KW
412# earlier releases. But these are commented out. You can just uncomment the
413# ones you want, or use them as a template for adding entries for other
414# properties.
415#
416# You may need to adjust the entries to suit your purposes. setup_unihan(),
417# and filter_unihan_line() are the functions where this is done. This program
418# already does some adjusting to make the lines look more like the rest of the
419# Unicode DB; You can see what that is in filter_unihan_line()
420#
421# There is a bug in the 3.2 data file in which some values for the
422# kPrimaryNumeric property have commas and an unexpected comment. A filter
423# could be added for these; or for a particular installation, the Unihan.txt
424# file could be edited to fix them.
99870f4d 425#
678f13d5
KW
426# HOW TO ADD A FILE TO BE PROCESSED
427#
428# A new file from Unicode needs to have an object constructed for it in
429# @input_file_objects, probably at the end or at the end of the extracted
430# ones. The program should warn you if its name will clash with others on
431# restrictive file systems, like DOS. If so, figure out a better name, and
432# add lines to the README.perl file giving that. If the file is a character
433# property, it should be in the format that Unicode has by default
434# standardized for such files for the more recently introduced ones.
435# If so, the Input_file constructor for @input_file_objects can just be the
436# file name and release it first appeared in. If not, then it should be
437# possible to construct an each_line_handler() to massage the line into the
438# standardized form.
439#
440# For non-character properties, more code will be needed. You can look at
441# the existing entries for clues.
442#
443# UNICODE VERSIONS NOTES
444#
445# The Unicode UCD has had a number of errors in it over the versions. And
446# these remain, by policy, in the standard for that version. Therefore it is
447# risky to correct them, because code may be expecting the error. So this
448# program doesn't generally make changes, unless the error breaks the Perl
449# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
450# for U+1105, which causes real problems for the algorithms for Jamo
451# calculations, so it is changed here.
452#
453# But it isn't so clear cut as to what to do about concepts that are
454# introduced in a later release; should they extend back to earlier releases
455# where the concept just didn't exist? It was easier to do this than to not,
456# so that's what was done. For example, the default value for code points not
457# in the files for various properties was probably undefined until changed by
458# some version. No_Block for blocks is such an example. This program will
459# assign No_Block even in Unicode versions that didn't have it. This has the
460# benefit that code being written doesn't have to special case earlier
461# versions; and the detriment that it doesn't match the Standard precisely for
462# the affected versions.
463#
464# Here are some observations about some of the issues in early versions:
465#
6426c51b 466# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
678f13d5
KW
467# the reason is that the CJK block starting at 4E00 was removed from PropList,
468# and was not put back in until 3.1.0
469#
470# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
471# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
472# reason is that 3.2 introduced U+205F=medium math space, which was not
473# classed as white space, but Perl figured out that it should have been. 4.0
474# reclassified it correctly.
475#
476# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
477# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
478# was left with no code points, as all the ones that mapped to 202 stayed
479# mapped to 202. Thus if your program used the numeric name for the class,
480# it would not have been affected, but if it used the mnemonic, it would have
481# been.
482#
483# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
484# points which eventually came to have this script property value, instead
485# mapped to "Unknown". But in the next release all these code points were
486# moved to \p{sc=common} instead.
99870f4d
KW
487#
488# The default for missing code points for BidiClass is complicated. Starting
489# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
490# tries to do the best it can for earlier releases. It is done in
491# process_PropertyAliases()
492#
493##############################################################################
494
495my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
496 # and errors
497my $MAX_LINE_WIDTH = 78;
498
499# Debugging aid to skip most files so as to not be distracted by them when
500# concentrating on the ones being debugged. Add
501# non_skip => 1,
502# to the constructor for those files you want processed when you set this.
503# Files with a first version number of 0 are special: they are always
c12f2655
KW
504# processed regardless of the state of this flag. Generally, Jamo.txt and
505# UnicodeData.txt must not be skipped if you want this program to not die
506# before normal completion.
99870f4d
KW
507my $debug_skip = 0;
508
509# Set to 1 to enable tracing.
510our $to_trace = 0;
511
512{ # Closure for trace: debugging aid
513 my $print_caller = 1; # ? Include calling subroutine name
514 my $main_with_colon = 'main::';
515 my $main_colon_length = length($main_with_colon);
516
517 sub trace {
518 return unless $to_trace; # Do nothing if global flag not set
519
520 my @input = @_;
521
522 local $DB::trace = 0;
523 $DB::trace = 0; # Quiet 'used only once' message
524
525 my $line_number;
526
527 # Loop looking up the stack to get the first non-trace caller
528 my $caller_line;
529 my $caller_name;
530 my $i = 0;
531 do {
532 $line_number = $caller_line;
533 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
534 $caller = $main_with_colon unless defined $caller;
535
536 $caller_name = $caller;
537
538 # get rid of pkg
539 $caller_name =~ s/.*:://;
540 if (substr($caller_name, 0, $main_colon_length)
541 eq $main_with_colon)
542 {
543 $caller_name = substr($caller_name, $main_colon_length);
544 }
545
546 } until ($caller_name ne 'trace');
547
548 # If the stack was empty, we were called from the top level
549 $caller_name = 'main' if ($caller_name eq ""
550 || $caller_name eq 'trace');
551
552 my $output = "";
553 foreach my $string (@input) {
554 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
555 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
556 $output .= simple_dumper($string);
557 }
558 else {
559 $string = "$string" if ref $string;
560 $string = $UNDEF unless defined $string;
561 chomp $string;
562 $string = '""' if $string eq "";
563 $output .= " " if $output ne ""
564 && $string ne ""
565 && substr($output, -1, 1) ne " "
566 && substr($string, 0, 1) ne " ";
567 $output .= $string;
568 }
569 }
570
99f78760
KW
571 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
572 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
573 print STDERR $output, "\n";
574 return;
575 }
576}
577
578# This is for a rarely used development feature that allows you to compare two
579# versions of the Unicode standard without having to deal with changes caused
c12f2655
KW
580# by the code points introduced in the later version. Change the 0 to a
581# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
582# code points introduced in that release and earlier will be used; later ones
583# are thrown away. You use the version number of the earliest one you want to
584# compare; then run this program on directory structures containing each
585# release, and compare the outputs. These outputs will therefore include only
586# the code points common to both releases, and you can see the changes caused
587# just by the underlying release semantic changes. For versions earlier than
588# 3.2, you must copy a version of DAge.txt into the directory.
589my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
99870f4d
KW
590my $compare_versions = DEBUG
591 && $string_compare_versions
592 && pack "C*", split /\./, $string_compare_versions;
593
594sub uniques {
595 # Returns non-duplicated input values. From "Perl Best Practices:
596 # Encapsulated Cleverness". p. 455 in first edition.
597
598 my %seen;
0e407844
NC
599 # Arguably this breaks encapsulation, if the goal is to permit multiple
600 # distinct objects to stringify to the same value, and be interchangeable.
601 # However, for this program, no two objects stringify identically, and all
602 # lists passed to this function are either objects or strings. So this
603 # doesn't affect correctness, but it does give a couple of percent speedup.
604 no overloading;
99870f4d
KW
605 return grep { ! $seen{$_}++ } @_;
606}
607
608$0 = File::Spec->canonpath($0);
609
610my $make_test_script = 0; # ? Should we output a test script
611my $write_unchanged_files = 0; # ? Should we update the output files even if
612 # we don't think they have changed
613my $use_directory = ""; # ? Should we chdir somewhere.
614my $pod_directory; # input directory to store the pod file.
615my $pod_file = 'perluniprops';
616my $t_path; # Path to the .t test file
617my $file_list = 'mktables.lst'; # File to store input and output file names.
618 # This is used to speed up the build, by not
619 # executing the main body of the program if
620 # nothing on the list has changed since the
621 # previous build
622my $make_list = 1; # ? Should we write $file_list. Set to always
623 # make a list so that when the pumpking is
624 # preparing a release, s/he won't have to do
625 # special things
626my $glob_list = 0; # ? Should we try to include unknown .txt files
627 # in the input.
bd9ebcfd
KW
628my $output_range_counts = $debugging_build; # ? Should we include the number
629 # of code points in ranges in
630 # the output
558712cf 631my $annotate = 0; # ? Should character names be in the output
9ef2b94f 632
99870f4d
KW
633# Verbosity levels; 0 is quiet
634my $NORMAL_VERBOSITY = 1;
635my $PROGRESS = 2;
636my $VERBOSE = 3;
637
638my $verbosity = $NORMAL_VERBOSITY;
639
640# Process arguments
641while (@ARGV) {
cf25bb62
JH
642 my $arg = shift @ARGV;
643 if ($arg eq '-v') {
99870f4d
KW
644 $verbosity = $VERBOSE;
645 }
646 elsif ($arg eq '-p') {
647 $verbosity = $PROGRESS;
648 $| = 1; # Flush buffers as we go.
649 }
650 elsif ($arg eq '-q') {
651 $verbosity = 0;
652 }
653 elsif ($arg eq '-w') {
654 $write_unchanged_files = 1; # update the files even if havent changed
655 }
656 elsif ($arg eq '-check') {
6ae7e459
YO
657 my $this = shift @ARGV;
658 my $ok = shift @ARGV;
659 if ($this ne $ok) {
660 print "Skipping as check params are not the same.\n";
661 exit(0);
662 }
00a8df5c 663 }
99870f4d
KW
664 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
665 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
666 }
3df51b85
KW
667 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
668 {
99870f4d 669 $make_test_script = 1;
99870f4d
KW
670 }
671 elsif ($arg eq '-makelist') {
672 $make_list = 1;
673 }
674 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
675 -d $use_directory or croak "Unknown directory '$use_directory'";
676 }
677 elsif ($arg eq '-L') {
678
679 # Existence not tested until have chdir'd
680 $file_list = shift;
681 }
682 elsif ($arg eq '-globlist') {
683 $glob_list = 1;
684 }
685 elsif ($arg eq '-c') {
686 $output_range_counts = ! $output_range_counts
687 }
b4a0206c 688 elsif ($arg eq '-annotate') {
558712cf 689 $annotate = 1;
bd9ebcfd
KW
690 $debugging_build = 1;
691 $output_range_counts = 1;
9ef2b94f 692 }
99870f4d
KW
693 else {
694 my $with_c = 'with';
695 $with_c .= 'out' if $output_range_counts; # Complements the state
696 croak <<END;
697usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
698 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
699 [-check A B ]
700 -c : Output comments $with_c number of code points in ranges
701 -q : Quiet Mode: Only output serious warnings.
702 -p : Set verbosity level to normal plus show progress.
703 -v : Set Verbosity level high: Show progress and non-serious
704 warnings
705 -w : Write files regardless
706 -C dir : Change to this directory before proceeding. All relative paths
707 except those specified by the -P and -T options will be done
708 with respect to this directory.
709 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 710 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
711 -L filelist : Use alternate 'filelist' instead of standard one
712 -globlist : Take as input all non-Test *.txt files in current and sub
713 directories
3df51b85
KW
714 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
715 overrides -T
99870f4d 716 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 717 -annotate : Output an annotation for each character in the table files;
c4019d52
KW
718 useful for debugging mktables, looking at diffs; but is slow,
719 memory intensive; resulting tables are usable but slow and
720 very large.
99870f4d
KW
721 -check A B : Executes $0 only if A and B are the same
722END
723 }
724}
725
726# Stores the most-recently changed file. If none have changed, can skip the
727# build
aeab6150 728my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
729
730# Change directories now, because need to read 'version' early.
731if ($use_directory) {
3df51b85 732 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
733 $pod_directory = File::Spec->rel2abs($pod_directory);
734 }
3df51b85 735 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 736 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 737 }
99870f4d 738 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 739 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 740 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 741 }
3df51b85 742 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 743 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 744 }
00a8df5c
YO
745}
746
99870f4d
KW
747# Get Unicode version into regular and v-string. This is done now because
748# various tables below get populated based on it. These tables are populated
749# here to be near the top of the file, and so easily seeable by those needing
750# to modify things.
751open my $VERSION, "<", "version"
752 or croak "$0: can't open required file 'version': $!\n";
753my $string_version = <$VERSION>;
754close $VERSION;
755chomp $string_version;
756my $v_version = pack "C*", split /\./, $string_version; # v string
757
758# The following are the complete names of properties with property values that
759# are known to not match any code points in some versions of Unicode, but that
760# may change in the future so they should be matchable, hence an empty file is
761# generated for them.
762my @tables_that_may_be_empty = (
763 'Joining_Type=Left_Joining',
764 );
765push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
766push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
767push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
768 if $v_version ge v4.1.0;
82aed44a
KW
769push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
770 if $v_version ge v6.0.0;
99870f4d
KW
771
772# The lists below are hashes, so the key is the item in the list, and the
773# value is the reason why it is in the list. This makes generation of
774# documentation easier.
775
776my %why_suppressed; # No file generated for these.
777
778# Files aren't generated for empty extraneous properties. This is arguable.
779# Extraneous properties generally come about because a property is no longer
780# used in a newer version of Unicode. If we generated a file without code
781# points, programs that used to work on that property will still execute
782# without errors. It just won't ever match (or will always match, with \P{}).
783# This means that the logic is now likely wrong. I (khw) think its better to
784# find this out by getting an error message. Just move them to the table
785# above to change this behavior
786my %why_suppress_if_empty_warn_if_not = (
787
788 # It is the only property that has ever officially been removed from the
789 # Standard. The database never contained any code points for it.
790 'Special_Case_Condition' => 'Obsolete',
791
792 # Apparently never official, but there were code points in some versions of
793 # old-style PropList.txt
794 'Non_Break' => 'Obsolete',
795);
796
797# These would normally go in the warn table just above, but they were changed
798# a long time before this program was written, so warnings about them are
799# moot.
800if ($v_version gt v3.2.0) {
801 push @tables_that_may_be_empty,
802 'Canonical_Combining_Class=Attached_Below_Left'
803}
804
5f7264c7 805# These are listed in the Property aliases file in 6.0, but Unihan is ignored
99870f4d
KW
806# unless explicitly added.
807if ($v_version ge v5.2.0) {
808 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 809 foreach my $table (qw (
99870f4d
KW
810 kAccountingNumeric
811 kOtherNumeric
812 kPrimaryNumeric
813 kCompatibilityVariant
814 kIICore
815 kIRG_GSource
816 kIRG_HSource
817 kIRG_JSource
818 kIRG_KPSource
819 kIRG_MSource
820 kIRG_KSource
821 kIRG_TSource
822 kIRG_USource
823 kIRG_VSource
824 kRSUnicode
ea25a9b2 825 ))
99870f4d
KW
826 {
827 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
828 }
ca12659b
NC
829}
830
272501f6
KW
831# Enum values for to_output_map() method in the Map_Table package.
832my $EXTERNAL_MAP = 1;
833my $INTERNAL_MAP = 2;
834
fcf1973c
KW
835# To override computed values for writing the map tables for these properties.
836# The default for enum map tables is to write them out, so that the Unicode
837# .txt files can be removed, but all the data to compute any property value
838# for any code point is available in a more compact form.
839my %global_to_output_map = (
840 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
c12f2655
KW
841 # get stuck supporting it if things change. Since it is a STRING
842 # property, it normally would be listed in the pod, but INTERNAL_MAP
843 # suppresses that.
fcf1973c
KW
844 Unicode_1_Name => $INTERNAL_MAP,
845
846 Present_In => 0, # Suppress, as easily computed from Age
fcf1973c 847 Block => 0, # Suppress, as Blocks.txt is retained.
53d34b6c
KW
848
849 # Suppress, as mapping can be found instead from the
850 # Perl_Decomposition_Mapping file
851 Decomposition_Type => 0,
fcf1973c
KW
852);
853
99870f4d 854# Properties that this program ignores.
230e0c16
KW
855my @unimplemented_properties;
856
857# With this release, it is automatically handled if the Unihan db is
858# downloaded
859push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
d73e5302 860
99870f4d
KW
861# There are several types of obsolete properties defined by Unicode. These
862# must be hand-edited for every new Unicode release.
863my %why_deprecated; # Generates a deprecated warning message if used.
864my %why_stabilized; # Documentation only
865my %why_obsolete; # Documentation only
866
867{ # Closure
868 my $simple = 'Perl uses the more complete version of this property';
869 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
870
871 my $other_properties = 'other properties';
872 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 873 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
874
875 %why_deprecated = (
5f7264c7 876 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
877 'Jamo_Short_Name' => $contributory,
878 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
879 'Other_Alphabetic' => $contributory,
880 'Other_Default_Ignorable_Code_Point' => $contributory,
881 'Other_Grapheme_Extend' => $contributory,
882 'Other_ID_Continue' => $contributory,
883 'Other_ID_Start' => $contributory,
884 'Other_Lowercase' => $contributory,
885 'Other_Math' => $contributory,
886 'Other_Uppercase' => $contributory,
e22aaf5c
KW
887 'Expands_On_NFC' => $why_no_expand,
888 'Expands_On_NFD' => $why_no_expand,
889 'Expands_On_NFKC' => $why_no_expand,
890 'Expands_On_NFKD' => $why_no_expand,
99870f4d
KW
891 );
892
893 %why_suppressed = (
5f7264c7 894 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
895 # contains the same information, but without the algorithmically
896 # determinable Hangul syllables'. This file is not published, so it's
897 # existence is not noted in the comment.
e0b29447 898 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
99870f4d 899
3111abc0
KW
900 'Indic_Matra_Category' => "Provisional",
901 'Indic_Syllabic_Category' => "Provisional",
902
5f8d1a89
KW
903 # Don't suppress ISO_Comment, as otherwise special handling is needed
904 # to differentiate between it and gc=c, which can be written as 'isc',
905 # which is the same characters as ISO_Comment's short name.
99870f4d 906
fbb93542 907 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
e0b29447
KW
908
909 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
910 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
911 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
912 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
99870f4d 913
5f7264c7 914 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
915 );
916
1704a0ea
KW
917 foreach my $property (
918
919 # The following are suppressed because they were made contributory
920 # or deprecated by Unicode before Perl ever thought about
921 # supporting them.
922 'Jamo_Short_Name',
923 'Grapheme_Link',
924 'Expands_On_NFC',
925 'Expands_On_NFD',
926 'Expands_On_NFKC',
927 'Expands_On_NFKD',
928
929 # The following are suppressed because they have been marked
930 # as deprecated for a sufficient amount of time
931 'Other_Alphabetic',
932 'Other_Default_Ignorable_Code_Point',
933 'Other_Grapheme_Extend',
934 'Other_ID_Continue',
935 'Other_ID_Start',
936 'Other_Lowercase',
937 'Other_Math',
938 'Other_Uppercase',
e22aaf5c 939 ) {
99870f4d
KW
940 $why_suppressed{$property} = $why_deprecated{$property};
941 }
cf25bb62 942
99870f4d
KW
943 # Customize the message for all the 'Other_' properties
944 foreach my $property (keys %why_deprecated) {
945 next if (my $main_property = $property) !~ s/^Other_//;
946 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
947 }
948}
949
950if ($v_version ge 4.0.0) {
951 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
952 if ($v_version ge 6.0.0) {
953 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
954 }
99870f4d 955}
5f7264c7 956if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 957 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7 958 if ($v_version ge 6.0.0) {
63f74647 959 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
5f7264c7 960 }
99870f4d
KW
961}
962
963# Probably obsolete forever
964if ($v_version ge v4.1.0) {
82aed44a
KW
965 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
966}
967if ($v_version ge v6.0.0) {
2b352efd
KW
968 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
969 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
99870f4d
KW
970}
971
972# This program can create files for enumerated-like properties, such as
973# 'Numeric_Type'. This file would be the same format as for a string
974# property, with a mapping from code point to its value, so you could look up,
975# for example, the script a code point is in. But no one so far wants this
976# mapping, or they have found another way to get it since this is a new
977# feature. So no file is generated except if it is in this list.
978my @output_mapped_properties = split "\n", <<END;
979END
980
c12f2655
KW
981# If you are using the Unihan database in a Unicode version before 5.2, you
982# need to add the properties that you want to extract from it to this table.
983# For your convenience, the properties in the 6.0 PropertyAliases.txt file are
984# listed, commented out
99870f4d
KW
985my @cjk_properties = split "\n", <<'END';
986#cjkAccountingNumeric; kAccountingNumeric
987#cjkOtherNumeric; kOtherNumeric
988#cjkPrimaryNumeric; kPrimaryNumeric
989#cjkCompatibilityVariant; kCompatibilityVariant
990#cjkIICore ; kIICore
991#cjkIRG_GSource; kIRG_GSource
992#cjkIRG_HSource; kIRG_HSource
993#cjkIRG_JSource; kIRG_JSource
994#cjkIRG_KPSource; kIRG_KPSource
995#cjkIRG_KSource; kIRG_KSource
996#cjkIRG_TSource; kIRG_TSource
997#cjkIRG_USource; kIRG_USource
998#cjkIRG_VSource; kIRG_VSource
999#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1000END
1001
1002# Similarly for the property values. For your convenience, the lines in the
5f7264c7 1003# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
c12f2655 1004# '#' marks (for Unicode versions before 5.2)
99870f4d
KW
1005my @cjk_property_values = split "\n", <<'END';
1006## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1007## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1008## @missing: 0000..10FFFF; cjkIICore; <none>
1009## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1010## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1011## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1012## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1013## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1014## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1015## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1016## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1017## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1018## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1019## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1020END
1021
1022# The input files don't list every code point. Those not listed are to be
1023# defaulted to some value. Below are hard-coded what those values are for
1024# non-binary properties as of 5.1. Starting in 5.0, there are
1025# machine-parsable comment lines in the files the give the defaults; so this
1026# list shouldn't have to be extended. The claim is that all missing entries
1027# for binary properties will default to 'N'. Unicode tried to change that in
1028# 5.2, but the beta period produced enough protest that they backed off.
1029#
1030# The defaults for the fields that appear in UnicodeData.txt in this hash must
1031# be in the form that it expects. The others may be synonyms.
1032my $CODE_POINT = '<code point>';
1033my %default_mapping = (
1034 Age => "Unassigned",
1035 # Bidi_Class => Complicated; set in code
1036 Bidi_Mirroring_Glyph => "",
1037 Block => 'No_Block',
1038 Canonical_Combining_Class => 0,
1039 Case_Folding => $CODE_POINT,
1040 Decomposition_Mapping => $CODE_POINT,
1041 Decomposition_Type => 'None',
1042 East_Asian_Width => "Neutral",
1043 FC_NFKC_Closure => $CODE_POINT,
1044 General_Category => 'Cn',
1045 Grapheme_Cluster_Break => 'Other',
1046 Hangul_Syllable_Type => 'NA',
1047 ISO_Comment => "",
1048 Jamo_Short_Name => "",
1049 Joining_Group => "No_Joining_Group",
1050 # Joining_Type => Complicated; set in code
1051 kIICore => 'N', # Is converted to binary
1052 #Line_Break => Complicated; set in code
1053 Lowercase_Mapping => $CODE_POINT,
1054 Name => "",
1055 Name_Alias => "",
1056 NFC_QC => 'Yes',
1057 NFD_QC => 'Yes',
1058 NFKC_QC => 'Yes',
1059 NFKD_QC => 'Yes',
1060 Numeric_Type => 'None',
1061 Numeric_Value => 'NaN',
1062 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1063 Sentence_Break => 'Other',
1064 Simple_Case_Folding => $CODE_POINT,
1065 Simple_Lowercase_Mapping => $CODE_POINT,
1066 Simple_Titlecase_Mapping => $CODE_POINT,
1067 Simple_Uppercase_Mapping => $CODE_POINT,
1068 Titlecase_Mapping => $CODE_POINT,
1069 Unicode_1_Name => "",
1070 Unicode_Radical_Stroke => "",
1071 Uppercase_Mapping => $CODE_POINT,
1072 Word_Break => 'Other',
1073);
1074
1075# Below are files that Unicode furnishes, but this program ignores, and why
1076my %ignored_files = (
73ba1144
KW
1077 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1078 'Index.txt' => 'Alphabetical index of Unicode characters',
1079 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1080 'NamesList.txt' => 'Annotated list of characters',
1081 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1082 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1083 'ReadMe.txt' => 'Documentation',
1084 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1085 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
73ba1144
KW
1086 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1087 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1088 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1089 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
99870f4d
KW
1090);
1091
1fec9f60
KW
1092my %skipped_files; # List of files that we skip
1093
678f13d5 1094### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1095
1096my $HEADER=<<"EOF";
1097# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1098# This file is machine-generated by $0 from the Unicode
1099# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1100EOF
1101
126c3d4e 1102my $INTERNAL_ONLY_HEADER = <<"EOF";
99870f4d
KW
1103
1104# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
fac53429
KW
1105# This file is for internal use by core Perl only. The format and even the
1106# name or existence of this file are subject to change without notice. Don't
1107# use it directly.
99870f4d
KW
1108EOF
1109
1110my $DEVELOPMENT_ONLY=<<"EOF";
1111# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1112# This file contains information artificially constrained to code points
1113# present in Unicode release $string_compare_versions.
1114# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1115# not be used for production.
b6922eda
KW
1116
1117EOF
1118
6189eadc
KW
1119my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1120my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1121my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
99870f4d
KW
1122
1123# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1124# two must be 10; if there are 5, the first must not be a 0. Written this way
92199589
KW
1125# to decrease backtracking. The first regex allows the code point to be at
1126# the end of a word, but to work properly, the word shouldn't end with a valid
1127# hex character. The second one won't match a code point at the end of a
1128# word, and doesn't have the run-on issue
8c32d378
KW
1129my $run_on_code_point_re =
1130 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1131my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1132
1133# This matches the beginning of the line in the Unicode db files that give the
1134# defaults for code points not listed (i.e., missing) in the file. The code
1135# depends on this ending with a semi-colon, so it can assume it is a valid
1136# field when the line is split() by semi-colons
1137my $missing_defaults_prefix =
6189eadc 1138 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
99870f4d
KW
1139
1140# Property types. Unicode has more types, but these are sufficient for our
1141# purposes.
1142my $UNKNOWN = -1; # initialized to illegal value
1143my $NON_STRING = 1; # Either binary or enum
1144my $BINARY = 2;
06f26c45
KW
1145my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1146 # tables, additional true and false tables are
1147 # generated so that false is anything matching the
1148 # default value, and true is everything else.
1149my $ENUM = 4; # Include catalog
1150my $STRING = 5; # Anything else: string or misc
99870f4d
KW
1151
1152# Some input files have lines that give default values for code points not
1153# contained in the file. Sometimes these should be ignored.
1154my $NO_DEFAULTS = 0; # Must evaluate to false
1155my $NOT_IGNORED = 1;
1156my $IGNORED = 2;
1157
1158# Range types. Each range has a type. Most ranges are type 0, for normal,
1159# and will appear in the main body of the tables in the output files, but
1160# there are other types of ranges as well, listed below, that are specially
1161# handled. There are pseudo-types as well that will never be stored as a
1162# type, but will affect the calculation of the type.
1163
1164# 0 is for normal, non-specials
1165my $MULTI_CP = 1; # Sequence of more than code point
1166my $HANGUL_SYLLABLE = 2;
1167my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1168my $NULL = 4; # The map is to the null string; utf8.c can't
1169 # handle these, nor is there an accepted syntax
1170 # for them in \p{} constructs
f86864ac 1171my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1172 # otherwise be $MULTI_CP type are instead type 0
1173
1174# process_generic_property_file() can accept certain overrides in its input.
1175# Each of these must begin AND end with $CMD_DELIM.
1176my $CMD_DELIM = "\a";
1177my $REPLACE_CMD = 'replace'; # Override the Replace
1178my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1179
1180my $NO = 0;
1181my $YES = 1;
1182
1183# Values for the Replace argument to add_range.
1184# $NO # Don't replace; add only the code points not
1185 # already present.
1186my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1187 # the comments at the subroutine definition.
1188my $UNCONDITIONALLY = 2; # Replace without conditions.
1189my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1190 # already there
56343c78 1191my $CROAK = 5; # Die with an error if is already there
99870f4d
KW
1192
1193# Flags to give property statuses. The phrases are to remind maintainers that
1194# if the flag is changed, the indefinite article referring to it in the
1195# documentation may need to be as well.
1196my $NORMAL = "";
99870f4d
KW
1197my $DEPRECATED = 'D';
1198my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1199my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1200my $DISCOURAGED = 'X';
1201my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1202my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1203my $STRICTER = 'T';
1204my $a_bold_stricter = "a 'B<$STRICTER>'";
1205my $A_bold_stricter = "A 'B<$STRICTER>'";
1206my $STABILIZED = 'S';
1207my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1208my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1209my $OBSOLETE = 'O';
1210my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1211my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1212
1213my %status_past_participles = (
1214 $DISCOURAGED => 'discouraged',
99870f4d
KW
1215 $STABILIZED => 'stabilized',
1216 $OBSOLETE => 'obsolete',
37e2e78e 1217 $DEPRECATED => 'deprecated',
99870f4d
KW
1218);
1219
395dfc19
KW
1220# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1221# externally documented.
301ba948 1222my $ORDINARY = 0; # The normal fate.
395dfc19
KW
1223my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1224 # but there is a file written that can be used to
1225 # reconstruct this table
301ba948
KW
1226my $SUPPRESSED = 3; # The file for this table is not written out.
1227my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
1228 # for Perl's internal use only
1229my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
1230 # Unicode version that doesn't have it, but we need it
1231 # to be defined, if empty, to have things work.
1232 # Implies no pod entry generated
1233
f5817e0a
KW
1234# The format of the values of the tables:
1235my $EMPTY_FORMAT = "";
99870f4d
KW
1236my $BINARY_FORMAT = 'b';
1237my $DECIMAL_FORMAT = 'd';
1238my $FLOAT_FORMAT = 'f';
1239my $INTEGER_FORMAT = 'i';
1240my $HEX_FORMAT = 'x';
1241my $RATIONAL_FORMAT = 'r';
1242my $STRING_FORMAT = 's';
a14f3cb1 1243my $DECOMP_STRING_FORMAT = 'c';
c3ff2976 1244my $STRING_WHITE_SPACE_LIST = 'sw';
99870f4d
KW
1245
1246my %map_table_formats = (
1247 $BINARY_FORMAT => 'binary',
1248 $DECIMAL_FORMAT => 'single decimal digit',
1249 $FLOAT_FORMAT => 'floating point number',
1250 $INTEGER_FORMAT => 'integer',
add63c13 1251 $HEX_FORMAT => 'non-negative hex whole number; a code point',
99870f4d 1252 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1253 $STRING_FORMAT => 'string',
92f9d56c 1254 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
c3ff2976 1255 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
99870f4d
KW
1256);
1257
1258# Unicode didn't put such derived files in a separate directory at first.
1259my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1260my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1261my $AUXILIARY = 'auxiliary';
1262
1263# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
9e4a1e86 1264# and into UCD.pl for the use of UCD.pm
99870f4d
KW
1265my %loose_to_file_of; # loosely maps table names to their respective
1266 # files
1267my %stricter_to_file_of; # same; but for stricter mapping.
315bfd4e 1268my %loose_property_to_file_of; # Maps a loose property name to its map file
89cf10cc
KW
1269my %file_to_swash_name; # Maps the file name to its corresponding key name
1270 # in the hash %utf8::SwashInfo
99870f4d
KW
1271my %nv_floating_to_rational; # maps numeric values floating point numbers to
1272 # their rational equivalent
c12f2655
KW
1273my %loose_property_name_of; # Loosely maps (non_string) property names to
1274 # standard form
86a52d1e 1275my %string_property_loose_to_name; # Same, for string properties.
c15fda25
KW
1276my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1277 # the property name in standard loose form, and
1278 # 'value' is the default value for that property,
1279 # also in standard loose form.
9e4a1e86
KW
1280my %loose_to_standard_value; # loosely maps table names to the canonical
1281 # alias for them
2df7880f
KW
1282my %ambiguous_names; # keys are alias names (in standard form) that
1283 # have more than one possible meaning.
5d1df013
KW
1284my %prop_aliases; # Keys are standard property name; values are each
1285 # one's aliases
1e863613
KW
1286my %prop_value_aliases; # Keys of top level are standard property name;
1287 # values are keys to another hash, Each one is
1288 # one of the property's values, in standard form.
1289 # The values are that prop-val's aliases.
2df7880f 1290my %ucd_pod; # Holds entries that will go into the UCD section of the pod
99870f4d 1291
d867ccfb
KW
1292# Most properties are immune to caseless matching, otherwise you would get
1293# nonsensical results, as properties are a function of a code point, not
1294# everything that is caselessly equivalent to that code point. For example,
1295# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1296# be true because 's' and 'S' are equivalent caselessly. However,
1297# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1298# extend that concept to those very few properties that are like this. Each
1299# such property will match the full range caselessly. They are hard-coded in
1300# the program; it's not worth trying to make it general as it's extremely
1301# unlikely that they will ever change.
1302my %caseless_equivalent_to;
1303
99870f4d
KW
1304# These constants names and values were taken from the Unicode standard,
1305# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1306# syllables. The '_string' versions are so generated tables can retain the
1307# hex format, which is the more familiar value
1308my $SBase_string = "0xAC00";
1309my $SBase = CORE::hex $SBase_string;
1310my $LBase_string = "0x1100";
1311my $LBase = CORE::hex $LBase_string;
1312my $VBase_string = "0x1161";
1313my $VBase = CORE::hex $VBase_string;
1314my $TBase_string = "0x11A7";
1315my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1316my $SCount = 11172;
1317my $LCount = 19;
1318my $VCount = 21;
1319my $TCount = 28;
1320my $NCount = $VCount * $TCount;
1321
1322# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1323# with the above published constants.
1324my %Jamo;
1325my %Jamo_L; # Leading consonants
1326my %Jamo_V; # Vowels
1327my %Jamo_T; # Trailing consonants
1328
bb1dd3da
KW
1329# For code points whose name contains its ordinal as a '-ABCD' suffix.
1330# The key is the base name of the code point, and the value is an
1331# array giving all the ranges that use this base name. Each range
1332# is actually a hash giving the 'low' and 'high' values of it.
1333my %names_ending_in_code_point;
1334my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1335 # removed from the names
1336# Inverse mapping. The list of ranges that have these kinds of
1337# names. Each element contains the low, high, and base names in an
1338# anonymous hash.
1339my @code_points_ending_in_code_point;
1340
1341# Boolean: does this Unicode version have the hangul syllables, and are we
1342# writing out a table for them?
1343my $has_hangul_syllables = 0;
1344
1345# Does this Unicode version have code points whose names end in their
1346# respective code points, and are we writing out a table for them? 0 for no;
1347# otherwise points to first property that a table is needed for them, so that
1348# if multiple tables are needed, we don't create duplicates
1349my $needing_code_points_ending_in_code_point = 0;
1350
37e2e78e 1351my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1352my @unhandled_properties; # Will contain a list of properties found in
1353 # the input that we didn't process.
f86864ac 1354my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1355 # listed in the pod
1356my @map_properties; # Properties that get map files written
1357my @named_sequences; # NamedSequences.txt contents.
1358my %potential_files; # Generated list of all .txt files in the directory
1359 # structure so we can warn if something is being
1360 # ignored.
1361my @files_actually_output; # List of files we generated.
1362my @more_Names; # Some code point names are compound; this is used
1363 # to store the extra components of them.
1364my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1365 # the minimum before we consider it equivalent to a
1366 # candidate rational
1367my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1368
1369# These store references to certain commonly used property objects
1370my $gc;
1371my $perl;
1372my $block;
3e20195b
KW
1373my $perl_charname;
1374my $print;
7fc6cb55 1375my $Any;
359523e2 1376my $script;
99870f4d
KW
1377
1378# Are there conflicting names because of beginning with 'In_', or 'Is_'
1379my $has_In_conflicts = 0;
1380my $has_Is_conflicts = 0;
1381
1382sub internal_file_to_platform ($) {
1383 # Convert our file paths which have '/' separators to those of the
1384 # platform.
1385
1386 my $file = shift;
1387 return undef unless defined $file;
1388
1389 return File::Spec->join(split '/', $file);
d07a55ed 1390}
5beb625e 1391
99870f4d
KW
1392sub file_exists ($) { # platform independent '-e'. This program internally
1393 # uses slash as a path separator.
1394 my $file = shift;
1395 return 0 if ! defined $file;
1396 return -e internal_file_to_platform($file);
1397}
5beb625e 1398
99870f4d 1399sub objaddr($) {
23e33b60
KW
1400 # Returns the address of the blessed input object.
1401 # It doesn't check for blessedness because that would do a string eval
1402 # every call, and the program is structured so that this is never called
1403 # for a non-blessed object.
99870f4d 1404
23e33b60 1405 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1406
1407 # Numifying a ref gives its address.
051df77b 1408 return pack 'J', $_[0];
99870f4d
KW
1409}
1410
558712cf 1411# These are used only if $annotate is true.
c4019d52
KW
1412# The entire range of Unicode characters is examined to populate these
1413# after all the input has been processed. But most can be skipped, as they
1414# have the same descriptive phrases, such as being unassigned
1415my @viacode; # Contains the 1 million character names
1416my @printable; # boolean: And are those characters printable?
1417my @annotate_char_type; # Contains a type of those characters, specifically
1418 # for the purposes of annotation.
1419my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1420 # name for the purposes of annotation. They map to the
c4019d52
KW
1421 # upper edge of the range, so that the end point can
1422 # be immediately found. This is used to skip ahead to
1423 # the end of a range, and avoid processing each
1424 # individual code point in it.
1425my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1426 # characters, but excluding those which are
1427 # also noncharacter code points
1428
1429# The annotation types are an extension of the regular range types, though
1430# some of the latter are folded into one. Make the new types negative to
1431# avoid conflicting with the regular types
1432my $SURROGATE_TYPE = -1;
1433my $UNASSIGNED_TYPE = -2;
1434my $PRIVATE_USE_TYPE = -3;
1435my $NONCHARACTER_TYPE = -4;
1436my $CONTROL_TYPE = -5;
1437my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1438
1439sub populate_char_info ($) {
558712cf 1440 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1441 # input code point's info that are needed for outputting more detailed
1442 # comments. If calling context wants a return, it is the end point of
1443 # any contiguous range of characters that share essentially the same info
1444
1445 my $i = shift;
1446 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1447
1448 $viacode[$i] = $perl_charname->value_of($i) || "";
1449
1450 # A character is generally printable if Unicode says it is,
1451 # but below we make sure that most Unicode general category 'C' types
1452 # aren't.
1453 $printable[$i] = $print->contains($i);
1454
1455 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1456
1457 # Only these two regular types are treated specially for annotations
1458 # purposes
1459 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1460 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1461
1462 # Give a generic name to all code points that don't have a real name.
1463 # We output ranges, if applicable, for these. Also calculate the end
1464 # point of the range.
1465 my $end;
1466 if (! $viacode[$i]) {
1467 if ($gc-> table('Surrogate')->contains($i)) {
1468 $viacode[$i] = 'Surrogate';
1469 $annotate_char_type[$i] = $SURROGATE_TYPE;
1470 $printable[$i] = 0;
1471 $end = $gc->table('Surrogate')->containing_range($i)->end;
1472 }
1473 elsif ($gc-> table('Private_use')->contains($i)) {
1474 $viacode[$i] = 'Private Use';
1475 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1476 $printable[$i] = 0;
1477 $end = $gc->table('Private_Use')->containing_range($i)->end;
1478 }
1479 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1480 contains($i))
1481 {
1482 $viacode[$i] = 'Noncharacter';
1483 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1484 $printable[$i] = 0;
1485 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1486 containing_range($i)->end;
1487 }
1488 elsif ($gc-> table('Control')->contains($i)) {
1489 $viacode[$i] = 'Control';
1490 $annotate_char_type[$i] = $CONTROL_TYPE;
1491 $printable[$i] = 0;
1492 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1493 }
1494 elsif ($gc-> table('Unassigned')->contains($i)) {
1495 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1496 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1497 $printable[$i] = 0;
1498
1499 # Because we name the unassigned by the blocks they are in, it
1500 # can't go past the end of that block, and it also can't go past
1501 # the unassigned range it is in. The special table makes sure
1502 # that the non-characters, which are unassigned, are separated
1503 # out.
1504 $end = min($block->containing_range($i)->end,
1505 $unassigned_sans_noncharacters-> containing_range($i)->
1506 end);
13ca76ff
KW
1507 }
1508 else {
1509 Carp::my_carp_bug("Can't figure out how to annotate "
1510 . sprintf("U+%04X", $i)
1511 . ". Proceeding anyway.");
c4019d52
KW
1512 $viacode[$i] = 'UNKNOWN';
1513 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1514 $printable[$i] = 0;
1515 }
1516 }
1517
1518 # Here, has a name, but if it's one in which the code point number is
1519 # appended to the name, do that.
1520 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1521 $viacode[$i] .= sprintf("-%04X", $i);
1522 $end = $perl_charname->containing_range($i)->end;
1523 }
1524
1525 # And here, has a name, but if it's a hangul syllable one, replace it with
1526 # the correct name from the Unicode algorithm
1527 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1528 use integer;
1529 my $SIndex = $i - $SBase;
1530 my $L = $LBase + $SIndex / $NCount;
1531 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1532 my $T = $TBase + $SIndex % $TCount;
1533 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1534 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1535 $end = $perl_charname->containing_range($i)->end;
1536 }
1537
1538 return if ! defined wantarray;
1539 return $i if ! defined $end; # If not a range, return the input
1540
1541 # Save this whole range so can find the end point quickly
1542 $annotate_ranges->add_map($i, $end, $end);
1543
1544 return $end;
1545}
1546
23e33b60
KW
1547# Commented code below should work on Perl 5.8.
1548## This 'require' doesn't necessarily work in miniperl, and even if it does,
1549## the native perl version of it (which is what would operate under miniperl)
1550## is extremely slow, as it does a string eval every call.
1551#my $has_fast_scalar_util = $\18 !~ /miniperl/
1552# && defined eval "require Scalar::Util";
1553#
1554#sub objaddr($) {
1555# # Returns the address of the blessed input object. Uses the XS version if
1556# # available. It doesn't check for blessedness because that would do a
1557# # string eval every call, and the program is structured so that this is
1558# # never called for a non-blessed object.
1559#
1560# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1561#
1562# # Check at least that is a ref.
1563# my $pkg = ref($_[0]) or return undef;
1564#
1565# # Change to a fake package to defeat any overloaded stringify
1566# bless $_[0], 'main::Fake';
1567#
1568# # Numifying a ref gives its address.
051df77b 1569# my $addr = pack 'J', $_[0];
23e33b60
KW
1570#
1571# # Return to original class
1572# bless $_[0], $pkg;
1573# return $addr;
1574#}
1575
99870f4d
KW
1576sub max ($$) {
1577 my $a = shift;
1578 my $b = shift;
1579 return $a if $a >= $b;
1580 return $b;
1581}
1582
1583sub min ($$) {
1584 my $a = shift;
1585 my $b = shift;
1586 return $a if $a <= $b;
1587 return $b;
1588}
1589
1590sub clarify_number ($) {
1591 # This returns the input number with underscores inserted every 3 digits
1592 # in large (5 digits or more) numbers. Input must be entirely digits, not
1593 # checked.
1594
1595 my $number = shift;
1596 my $pos = length($number) - 3;
1597 return $number if $pos <= 1;
1598 while ($pos > 0) {
1599 substr($number, $pos, 0) = '_';
1600 $pos -= 3;
5beb625e 1601 }
99870f4d 1602 return $number;
99598c8c
JH
1603}
1604
12ac2576 1605
99870f4d 1606package Carp;
7ebf06b3 1607
99870f4d
KW
1608# These routines give a uniform treatment of messages in this program. They
1609# are placed in the Carp package to cause the stack trace to not include them,
1610# although an alternative would be to use another package and set @CARP_NOT
1611# for it.
12ac2576 1612
99870f4d 1613our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1614
99f78760
KW
1615# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1616# and overload trying to load Scalar:Util under miniperl. See
1617# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1618undef $overload::VERSION;
1619
99870f4d
KW
1620sub my_carp {
1621 my $message = shift || "";
1622 my $nofold = shift || 0;
7ebf06b3 1623
99870f4d
KW
1624 if ($message) {
1625 $message = main::join_lines($message);
1626 $message =~ s/^$0: *//; # Remove initial program name
1627 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1628 $message = "\n$0: $message;";
12ac2576 1629
99870f4d
KW
1630 # Fold the message with program name, semi-colon end punctuation
1631 # (which looks good with the message that carp appends to it), and a
1632 # hanging indent for continuation lines.
1633 $message = main::simple_fold($message, "", 4) unless $nofold;
1634 $message =~ s/\n$//; # Remove the trailing nl so what carp
1635 # appends is to the same line
1636 }
12ac2576 1637
99870f4d 1638 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1639
99870f4d
KW
1640 carp $message;
1641 return;
1642}
7ebf06b3 1643
99870f4d
KW
1644sub my_carp_bug {
1645 # This is called when it is clear that the problem is caused by a bug in
1646 # this program.
7ebf06b3 1647
99870f4d
KW
1648 my $message = shift;
1649 $message =~ s/^$0: *//;
1650 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1651 carp $message;
1652 return;
1653}
7ebf06b3 1654
99870f4d
KW
1655sub carp_too_few_args {
1656 if (@_ != 2) {
1657 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1658 return;
12ac2576 1659 }
7ebf06b3 1660
99870f4d
KW
1661 my $args_ref = shift;
1662 my $count = shift;
7ebf06b3 1663
99870f4d
KW
1664 my_carp_bug("Need at least $count arguments to "
1665 . (caller 1)[3]
1666 . ". Instead got: '"
1667 . join ', ', @$args_ref
1668 . "'. No action taken.");
1669 return;
12ac2576
JP
1670}
1671
99870f4d
KW
1672sub carp_extra_args {
1673 my $args_ref = shift;
1674 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1675
99870f4d
KW
1676 unless (ref $args_ref) {
1677 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1678 return;
1679 }
1680 my ($package, $file, $line) = caller;
1681 my $subroutine = (caller 1)[3];
cf25bb62 1682
99870f4d
KW
1683 my $list;
1684 if (ref $args_ref eq 'HASH') {
1685 foreach my $key (keys %$args_ref) {
1686 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1687 }
99870f4d 1688 $list = join ', ', each %{$args_ref};
cf25bb62 1689 }
99870f4d
KW
1690 elsif (ref $args_ref eq 'ARRAY') {
1691 foreach my $arg (@$args_ref) {
1692 $arg = $UNDEF unless defined $arg;
1693 }
1694 $list = join ', ', @$args_ref;
1695 }
1696 else {
1697 my_carp_bug("Can't cope with ref "
1698 . ref($args_ref)
1699 . " . argument to 'carp_extra_args'. Not checking arguments.");
1700 return;
1701 }
1702
1703 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1704 return;
d73e5302
JH
1705}
1706
99870f4d
KW
1707package main;
1708
1709{ # Closure
1710
1711 # This program uses the inside-out method for objects, as recommended in
1712 # "Perl Best Practices". This closure aids in generating those. There
1713 # are two routines. setup_package() is called once per package to set
1714 # things up, and then set_access() is called for each hash representing a
1715 # field in the object. These routines arrange for the object to be
1716 # properly destroyed when no longer used, and for standard accessor
1717 # functions to be generated. If you need more complex accessors, just
1718 # write your own and leave those accesses out of the call to set_access().
1719 # More details below.
1720
1721 my %constructor_fields; # fields that are to be used in constructors; see
1722 # below
1723
1724 # The values of this hash will be the package names as keys to other
1725 # hashes containing the name of each field in the package as keys, and
1726 # references to their respective hashes as values.
1727 my %package_fields;
1728
1729 sub setup_package {
1730 # Sets up the package, creating standard DESTROY and dump methods
1731 # (unless already defined). The dump method is used in debugging by
1732 # simple_dumper().
1733 # The optional parameters are:
1734 # a) a reference to a hash, that gets populated by later
1735 # set_access() calls with one of the accesses being
1736 # 'constructor'. The caller can then refer to this, but it is
1737 # not otherwise used by these two routines.
1738 # b) a reference to a callback routine to call during destruction
1739 # of the object, before any fields are actually destroyed
1740
1741 my %args = @_;
1742 my $constructor_ref = delete $args{'Constructor_Fields'};
1743 my $destroy_callback = delete $args{'Destroy_Callback'};
1744 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1745
1746 my %fields;
1747 my $package = (caller)[0];
1748
1749 $package_fields{$package} = \%fields;
1750 $constructor_fields{$package} = $constructor_ref;
1751
1752 unless ($package->can('DESTROY')) {
1753 my $destroy_name = "${package}::DESTROY";
1754 no strict "refs";
1755
1756 # Use typeglob to give the anonymous subroutine the name we want
1757 *$destroy_name = sub {
1758 my $self = shift;
ffe43484 1759 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1760
1761 $self->$destroy_callback if $destroy_callback;
1762 foreach my $field (keys %{$package_fields{$package}}) {
1763 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1764 delete $package_fields{$package}{$field}{$addr};
1765 }
1766 return;
1767 }
1768 }
1769
1770 unless ($package->can('dump')) {
1771 my $dump_name = "${package}::dump";
1772 no strict "refs";
1773 *$dump_name = sub {
1774 my $self = shift;
1775 return dump_inside_out($self, $package_fields{$package}, @_);
1776 }
1777 }
1778 return;
1779 }
1780
1781 sub set_access {
1782 # Arrange for the input field to be garbage collected when no longer
1783 # needed. Also, creates standard accessor functions for the field
1784 # based on the optional parameters-- none if none of these parameters:
1785 # 'addable' creates an 'add_NAME()' accessor function.
1786 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1787 # function.
1788 # 'settable' creates a 'set_NAME()' accessor function.
1789 # 'constructor' doesn't create an accessor function, but adds the
1790 # field to the hash that was previously passed to
1791 # setup_package();
1792 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1793 # 'add' etc. all mean 'addable'.
1794 # The read accessor function will work on both array and scalar
1795 # values. If another accessor in the parameter list is 'a', the read
1796 # access assumes an array. You can also force it to be array access
1797 # by specifying 'readable_array' instead of 'readable'
1798 #
1799 # A sort-of 'protected' access can be set-up by preceding the addable,
1800 # readable or settable with some initial portion of 'protected_' (but,
1801 # the underscore is required), like 'p_a', 'pro_set', etc. The
1802 # "protection" is only by convention. All that happens is that the
1803 # accessor functions' names begin with an underscore. So instead of
1804 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1805 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1806 # end of each package, and then storing the __LINE__ ranges and
1807 # checking them on every accessor. But that is way overkill.)
1808
1809 # We create anonymous subroutines as the accessors and then use
1810 # typeglobs to assign them to the proper package and name
1811
1812 my $name = shift; # Name of the field
1813 my $field = shift; # Reference to the inside-out hash containing the
1814 # field
1815
1816 my $package = (caller)[0];
1817
1818 if (! exists $package_fields{$package}) {
1819 croak "$0: Must call 'setup_package' before 'set_access'";
1820 }
d73e5302 1821
99870f4d
KW
1822 # Stash the field so DESTROY can get it.
1823 $package_fields{$package}{$name} = $field;
cf25bb62 1824
99870f4d
KW
1825 # Remaining arguments are the accessors. For each...
1826 foreach my $access (@_) {
1827 my $access = lc $access;
cf25bb62 1828
99870f4d 1829 my $protected = "";
cf25bb62 1830
99870f4d
KW
1831 # Match the input as far as it goes.
1832 if ($access =~ /^(p[^_]*)_/) {
1833 $protected = $1;
1834 if (substr('protected_', 0, length $protected)
1835 eq $protected)
1836 {
1837
1838 # Add 1 for the underscore not included in $protected
1839 $access = substr($access, length($protected) + 1);
1840 $protected = '_';
1841 }
1842 else {
1843 $protected = "";
1844 }
1845 }
1846
1847 if (substr('addable', 0, length $access) eq $access) {
1848 my $subname = "${package}::${protected}add_$name";
1849 no strict "refs";
1850
1851 # add_ accessor. Don't add if already there, which we
1852 # determine using 'eq' for scalars and '==' otherwise.
1853 *$subname = sub {
1854 use strict "refs";
1855 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1856 my $self = shift;
1857 my $value = shift;
ffe43484 1858 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1859 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1860 if (ref $value) {
f998e60c 1861 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1862 }
1863 else {
f998e60c 1864 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1865 }
f998e60c 1866 push @{$field->{$addr}}, $value;
99870f4d
KW
1867 return;
1868 }
1869 }
1870 elsif (substr('constructor', 0, length $access) eq $access) {
1871 if ($protected) {
1872 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1873 }
1874 else {
1875 $constructor_fields{$package}{$name} = $field;
1876 }
1877 }
1878 elsif (substr('readable_array', 0, length $access) eq $access) {
1879
1880 # Here has read access. If one of the other parameters for
1881 # access is array, or this one specifies array (by being more
1882 # than just 'readable_'), then create a subroutine that
1883 # assumes the data is an array. Otherwise just a scalar
1884 my $subname = "${package}::${protected}$name";
1885 if (grep { /^a/i } @_
1886 or length($access) > length('readable_'))
1887 {
1888 no strict "refs";
1889 *$subname = sub {
1890 use strict "refs";
23e33b60 1891 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1892 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1893 if (ref $field->{$addr} ne 'ARRAY') {
1894 my $type = ref $field->{$addr};
1895 $type = 'scalar' unless $type;
1896 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1897 return;
1898 }
1899 return scalar @{$field->{$addr}} unless wantarray;
1900
1901 # Make a copy; had problems with caller modifying the
1902 # original otherwise
1903 my @return = @{$field->{$addr}};
1904 return @return;
1905 }
1906 }
1907 else {
1908
1909 # Here not an array value, a simpler function.
1910 no strict "refs";
1911 *$subname = sub {
1912 use strict "refs";
23e33b60 1913 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1914 no overloading;
051df77b 1915 return $field->{pack 'J', $_[0]};
99870f4d
KW
1916 }
1917 }
1918 }
1919 elsif (substr('settable', 0, length $access) eq $access) {
1920 my $subname = "${package}::${protected}set_$name";
1921 no strict "refs";
1922 *$subname = sub {
1923 use strict "refs";
23e33b60
KW
1924 if (main::DEBUG) {
1925 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1926 Carp::carp_extra_args(\@_) if @_ > 2;
1927 }
1928 # $self is $_[0]; $value is $_[1]
f998e60c 1929 no overloading;
051df77b 1930 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1931 return;
1932 }
1933 }
1934 else {
1935 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1936 }
cf25bb62 1937 }
99870f4d 1938 return;
cf25bb62 1939 }
99870f4d
KW
1940}
1941
1942package Input_file;
1943
1944# All input files use this object, which stores various attributes about them,
1945# and provides for convenient, uniform handling. The run method wraps the
1946# processing. It handles all the bookkeeping of opening, reading, and closing
1947# the file, returning only significant input lines.
1948#
1949# Each object gets a handler which processes the body of the file, and is
1950# called by run(). Most should use the generic, default handler, which has
1951# code scrubbed to handle things you might not expect. A handler should
1952# basically be a while(next_line()) {...} loop.
1953#
1954# You can also set up handlers to
1955# 1) call before the first line is read for pre processing
1956# 2) call to adjust each line of the input before the main handler gets them
1957# 3) call upon EOF before the main handler exits its loop
1958# 4) call at the end for post processing
1959#
1960# $_ is used to store the input line, and is to be filtered by the
1961# each_line_handler()s. So, if the format of the line is not in the desired
1962# format for the main handler, these are used to do that adjusting. They can
1963# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1964# so the $_ output of one is used as the input to the next. None of the other
1965# handlers are stackable, but could easily be changed to be so.
1966#
1967# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1968# which insert the parameters as lines to be processed before the next input
1969# file line is read. This allows the EOF handler to flush buffers, for
1970# example. The difference between the two routines is that the lines inserted
1971# by insert_lines() are subjected to the each_line_handler()s. (So if you
1972# called it from such a handler, you would get infinite recursion.) Lines
1973# inserted by insert_adjusted_lines() go directly to the main handler without
1974# any adjustments. If the post-processing handler calls any of these, there
1975# will be no effect. Some error checking for these conditions could be added,
1976# but it hasn't been done.
1977#
1978# carp_bad_line() should be called to warn of bad input lines, which clears $_
1979# to prevent further processing of the line. This routine will output the
1980# message as a warning once, and then keep a count of the lines that have the
1981# same message, and output that count at the end of the file's processing.
1982# This keeps the number of messages down to a manageable amount.
1983#
1984# get_missings() should be called to retrieve any @missing input lines.
1985# Messages will be raised if this isn't done if the options aren't to ignore
1986# missings.
1987
1988sub trace { return main::trace(@_); }
1989
99870f4d
KW
1990{ # Closure
1991 # Keep track of fields that are to be put into the constructor.
1992 my %constructor_fields;
1993
1994 main::setup_package(Constructor_Fields => \%constructor_fields);
1995
1996 my %file; # Input file name, required
1997 main::set_access('file', \%file, qw{ c r });
1998
1999 my %first_released; # Unicode version file was first released in, required
2000 main::set_access('first_released', \%first_released, qw{ c r });
2001
2002 my %handler; # Subroutine to process the input file, defaults to
2003 # 'process_generic_property_file'
2004 main::set_access('handler', \%handler, qw{ c });
2005
2006 my %property;
2007 # name of property this file is for. defaults to none, meaning not
2008 # applicable, or is otherwise determinable, for example, from each line.
2009 main::set_access('property', \%property, qw{ c });
2010
2011 my %optional;
2012 # If this is true, the file is optional. If not present, no warning is
2013 # output. If it is present, the string given by this parameter is
2014 # evaluated, and if false the file is not processed.
2015 main::set_access('optional', \%optional, 'c', 'r');
2016
2017 my %non_skip;
2018 # This is used for debugging, to skip processing of all but a few input
2019 # files. Add 'non_skip => 1' to the constructor for those files you want
2020 # processed when you set the $debug_skip global.
2021 main::set_access('non_skip', \%non_skip, 'c');
2022
37e2e78e 2023 my %skip;
09ca89ce
KW
2024 # This is used to skip processing of this input file semi-permanently,
2025 # when it evaluates to true. The value should be the reason the file is
2026 # being skipped. It is used for files that we aren't planning to process
2027 # anytime soon, but want to allow to be in the directory and not raise a
2028 # message that we are not handling. Mostly for test files. This is in
2029 # contrast to the non_skip element, which is supposed to be used very
2030 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2031 # pretty much will never look at can be placed in the global
1fec9f60 2032 # %ignored_files instead. Ones used here will be added to %skipped files
37e2e78e
KW
2033 main::set_access('skip', \%skip, 'c');
2034
99870f4d
KW
2035 my %each_line_handler;
2036 # list of subroutines to look at and filter each non-comment line in the
2037 # file. defaults to none. The subroutines are called in order, each is
2038 # to adjust $_ for the next one, and the final one adjusts it for
2039 # 'handler'
2040 main::set_access('each_line_handler', \%each_line_handler, 'c');
2041
2042 my %has_missings_defaults;
2043 # ? Are there lines in the file giving default values for code points
2044 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2045 # the norm, but IGNORED means it has such lines, but the handler doesn't
2046 # use them. Having these three states allows us to catch changes to the
2047 # UCD that this program should track
2048 main::set_access('has_missings_defaults',
2049 \%has_missings_defaults, qw{ c r });
2050
2051 my %pre_handler;
2052 # Subroutine to call before doing anything else in the file. If undef, no
2053 # such handler is called.
2054 main::set_access('pre_handler', \%pre_handler, qw{ c });
2055
2056 my %eof_handler;
2057 # Subroutine to call upon getting an EOF on the input file, but before
2058 # that is returned to the main handler. This is to allow buffers to be
2059 # flushed. The handler is expected to call insert_lines() or
2060 # insert_adjusted() with the buffered material
2061 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2062
2063 my %post_handler;
2064 # Subroutine to call after all the lines of the file are read in and
2065 # processed. If undef, no such handler is called.
2066 main::set_access('post_handler', \%post_handler, qw{ c });
2067
2068 my %progress_message;
2069 # Message to print to display progress in lieu of the standard one
2070 main::set_access('progress_message', \%progress_message, qw{ c });
2071
2072 my %handle;
2073 # cache open file handle, internal. Is undef if file hasn't been
2074 # processed at all, empty if has;
2075 main::set_access('handle', \%handle);
2076
2077 my %added_lines;
2078 # cache of lines added virtually to the file, internal
2079 main::set_access('added_lines', \%added_lines);
2080
2081 my %errors;
2082 # cache of errors found, internal
2083 main::set_access('errors', \%errors);
2084
2085 my %missings;
2086 # storage of '@missing' defaults lines
2087 main::set_access('missings', \%missings);
2088
2089 sub new {
2090 my $class = shift;
2091
2092 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2093 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2094
2095 # Set defaults
2096 $handler{$addr} = \&main::process_generic_property_file;
2097 $non_skip{$addr} = 0;
37e2e78e 2098 $skip{$addr} = 0;
99870f4d
KW
2099 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2100 $handle{$addr} = undef;
2101 $added_lines{$addr} = [ ];
2102 $each_line_handler{$addr} = [ ];
2103 $errors{$addr} = { };
2104 $missings{$addr} = [ ];
2105
2106 # Two positional parameters.
99f78760 2107 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2108 $file{$addr} = main::internal_file_to_platform(shift);
2109 $first_released{$addr} = shift;
2110
2111 # The rest of the arguments are key => value pairs
2112 # %constructor_fields has been set up earlier to list all possible
2113 # ones. Either set or push, depending on how the default has been set
2114 # up just above.
2115 my %args = @_;
2116 foreach my $key (keys %args) {
2117 my $argument = $args{$key};
2118
2119 # Note that the fields are the lower case of the constructor keys
2120 my $hash = $constructor_fields{lc $key};
2121 if (! defined $hash) {
2122 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2123 next;
2124 }
2125 if (ref $hash->{$addr} eq 'ARRAY') {
2126 if (ref $argument eq 'ARRAY') {
2127 foreach my $argument (@{$argument}) {
2128 next if ! defined $argument;
2129 push @{$hash->{$addr}}, $argument;
2130 }
2131 }
2132 else {
2133 push @{$hash->{$addr}}, $argument if defined $argument;
2134 }
2135 }
2136 else {
2137 $hash->{$addr} = $argument;
2138 }
2139 delete $args{$key};
2140 };
2141
2142 # If the file has a property for it, it means that the property is not
2143 # listed in the file's entries. So add a handler to the list of line
2144 # handlers to insert the property name into the lines, to provide a
2145 # uniform interface to the final processing subroutine.
2146 # the final code doesn't have to worry about that.
2147 if ($property{$addr}) {
2148 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2149 }
2150
2151 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2152 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 2153 }
99870f4d 2154
09ca89ce
KW
2155 # If skipping, set to optional, and add to list of ignored files,
2156 # including its reason
2157 if ($skip{$addr}) {
2158 $optional{$addr} = 1;
1fec9f60 2159 $skipped_files{$file{$addr}} = $skip{$addr}
09ca89ce 2160 }
37e2e78e 2161
99870f4d 2162 return $self;
d73e5302
JH
2163 }
2164
cf25bb62 2165
99870f4d
KW
2166 use overload
2167 fallback => 0,
2168 qw("") => "_operator_stringify",
2169 "." => \&main::_operator_dot,
2170 ;
cf25bb62 2171
99870f4d
KW
2172 sub _operator_stringify {
2173 my $self = shift;
cf25bb62 2174
99870f4d 2175 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2176 }
d73e5302 2177
99870f4d
KW
2178 # flag to make sure extracted files are processed early
2179 my $seen_non_extracted_non_age = 0;
d73e5302 2180
99870f4d
KW
2181 sub run {
2182 # Process the input object $self. This opens and closes the file and
2183 # calls all the handlers for it. Currently, this can only be called
2184 # once per file, as it destroy's the EOF handler
d73e5302 2185
99870f4d
KW
2186 my $self = shift;
2187 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2188
ffe43484 2189 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2190
99870f4d 2191 my $file = $file{$addr};
d73e5302 2192
99870f4d
KW
2193 # Don't process if not expecting this file (because released later
2194 # than this Unicode version), and isn't there. This means if someone
2195 # copies it into an earlier version's directory, we will go ahead and
2196 # process it.
2197 return if $first_released{$addr} gt $v_version && ! -e $file;
2198
2199 # If in debugging mode and this file doesn't have the non-skip
2200 # flag set, and isn't one of the critical files, skip it.
2201 if ($debug_skip
2202 && $first_released{$addr} ne v0
2203 && ! $non_skip{$addr})
2204 {
2205 print "Skipping $file in debugging\n" if $verbosity;
2206 return;
2207 }
2208
2209 # File could be optional
37e2e78e 2210 if ($optional{$addr}) {
99870f4d
KW
2211 return unless -e $file;
2212 my $result = eval $optional{$addr};
2213 if (! defined $result) {
2214 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2215 return;
2216 }
2217 if (! $result) {
2218 if ($verbosity) {
2219 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2220 }
2221 return;
2222 }
2223 }
2224
2225 if (! defined $file || ! -e $file) {
2226
2227 # If the file doesn't exist, see if have internal data for it
2228 # (based on first_released being 0).
2229 if ($first_released{$addr} eq v0) {
2230 $handle{$addr} = 'pretend_is_open';
2231 }
2232 else {
2233 if (! $optional{$addr} # File could be optional
2234 && $v_version ge $first_released{$addr})
2235 {
2236 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2237 }
2238 return;
2239 }
2240 }
2241 else {
2242
37e2e78e
KW
2243 # Here, the file exists. Some platforms may change the case of
2244 # its name
99870f4d 2245 if ($seen_non_extracted_non_age) {
517956bf 2246 if ($file =~ /$EXTRACTED/i) {
99870f4d 2247 Carp::my_carp_bug(join_lines(<<END
99f78760 2248$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2249anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2250have subtle problems
2251END
2252 ));
2253 }
2254 }
2255 elsif ($EXTRACTED_DIR
2256 && $first_released{$addr} ne v0
517956bf
CB
2257 && $file !~ /$EXTRACTED/i
2258 && lc($file) ne 'dage.txt')
99870f4d
KW
2259 {
2260 # We don't set this (by the 'if' above) if we have no
2261 # extracted directory, so if running on an early version,
2262 # this test won't work. Not worth worrying about.
2263 $seen_non_extracted_non_age = 1;
2264 }
2265
2266 # And mark the file as having being processed, and warn if it
2267 # isn't a file we are expecting. As we process the files,
2268 # they are deleted from the hash, so any that remain at the
2269 # end of the program are files that we didn't process.
517956bf 2270 my $fkey = File::Spec->rel2abs($file);
faf3cf6b
KW
2271 my $expecting = delete $potential_files{lc($fkey)};
2272
678f13d5
KW
2273 Carp::my_carp("Was not expecting '$file'.") if
2274 ! $expecting
99870f4d
KW
2275 && ! defined $handle{$addr};
2276
37e2e78e
KW
2277 # Having deleted from expected files, we can quit if not to do
2278 # anything. Don't print progress unless really want verbosity
2279 if ($skip{$addr}) {
2280 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2281 return;
2282 }
2283
99870f4d
KW
2284 # Open the file, converting the slashes used in this program
2285 # into the proper form for the OS
2286 my $file_handle;
2287 if (not open $file_handle, "<", $file) {
2288 Carp::my_carp("Can't open $file. Skipping: $!");
2289 return 0;
2290 }
2291 $handle{$addr} = $file_handle; # Cache the open file handle
2292 }
2293
2294 if ($verbosity >= $PROGRESS) {
2295 if ($progress_message{$addr}) {
2296 print "$progress_message{$addr}\n";
2297 }
2298 else {
2299 # If using a virtual file, say so.
2300 print "Processing ", (-e $file)
2301 ? $file
2302 : "substitute $file",
2303 "\n";
2304 }
2305 }
2306
2307
2308 # Call any special handler for before the file.
2309 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2310
2311 # Then the main handler
2312 &{$handler{$addr}}($self);
2313
2314 # Then any special post-file handler.
2315 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2316
2317 # If any errors have been accumulated, output the counts (as the first
2318 # error message in each class was output when it was encountered).
2319 if ($errors{$addr}) {
2320 my $total = 0;
2321 my $types = 0;
2322 foreach my $error (keys %{$errors{$addr}}) {
2323 $total += $errors{$addr}->{$error};
2324 delete $errors{$addr}->{$error};
2325 $types++;
2326 }
2327 if ($total > 1) {
2328 my $message
2329 = "A total of $total lines had errors in $file. ";
2330
2331 $message .= ($types == 1)
2332 ? '(Only the first one was displayed.)'
2333 : '(Only the first of each type was displayed.)';
2334 Carp::my_carp($message);
2335 }
2336 }
2337
2338 if (@{$missings{$addr}}) {
2339 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2340 }
2341
2342 # If a real file handle, close it.
2343 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2344 ref $handle{$addr};
2345 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2346 # the file, as opposed to undef
2347 return;
2348 }
2349
2350 sub next_line {
2351 # Sets $_ to be the next logical input line, if any. Returns non-zero
2352 # if such a line exists. 'logical' means that any lines that have
2353 # been added via insert_lines() will be returned in $_ before the file
2354 # is read again.
2355
2356 my $self = shift;
2357 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2358
ffe43484 2359 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2360
2361 # Here the file is open (or if the handle is not a ref, is an open
2362 # 'virtual' file). Get the next line; any inserted lines get priority
2363 # over the file itself.
2364 my $adjusted;
2365
2366 LINE:
2367 while (1) { # Loop until find non-comment, non-empty line
2368 #local $to_trace = 1 if main::DEBUG;
2369 my $inserted_ref = shift @{$added_lines{$addr}};
2370 if (defined $inserted_ref) {
2371 ($adjusted, $_) = @{$inserted_ref};
2372 trace $adjusted, $_ if main::DEBUG && $to_trace;
2373 return 1 if $adjusted;
2374 }
2375 else {
2376 last if ! ref $handle{$addr}; # Don't read unless is real file
2377 last if ! defined ($_ = readline $handle{$addr});
2378 }
2379 chomp;
2380 trace $_ if main::DEBUG && $to_trace;
2381
2382 # See if this line is the comment line that defines what property
2383 # value that code points that are not listed in the file should
2384 # have. The format or existence of these lines is not guaranteed
2385 # by Unicode since they are comments, but the documentation says
2386 # that this was added for machine-readability, so probably won't
2387 # change. This works starting in Unicode Version 5.0. They look
2388 # like:
2389 #
2390 # @missing: 0000..10FFFF; Not_Reordered
2391 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2392 # @missing: 0000..10FFFF; ; NaN
2393 #
2394 # Save the line for a later get_missings() call.
2395 if (/$missing_defaults_prefix/) {
2396 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2397 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2398 }
2399 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2400 my @defaults = split /\s* ; \s*/x, $_;
2401
2402 # The first field is the @missing, which ends in a
2403 # semi-colon, so can safely shift.
2404 shift @defaults;
2405
2406 # Some of these lines may have empty field placeholders
2407 # which get in the way. An example is:
2408 # @missing: 0000..10FFFF; ; NaN
2409 # Remove them. Process starting from the top so the
2410 # splice doesn't affect things still to be looked at.
2411 for (my $i = @defaults - 1; $i >= 0; $i--) {
2412 next if $defaults[$i] ne "";
2413 splice @defaults, $i, 1;
2414 }
2415
2416 # What's left should be just the property (maybe) and the
2417 # default. Having only one element means it doesn't have
2418 # the property.
2419 my $default;
2420 my $property;
2421 if (@defaults >= 1) {
2422 if (@defaults == 1) {
2423 $default = $defaults[0];
2424 }
2425 else {
2426 $property = $defaults[0];
2427 $default = $defaults[1];
2428 }
2429 }
2430
2431 if (@defaults < 1
2432 || @defaults > 2
2433 || ($default =~ /^</
2434 && $default !~ /^<code *point>$/i
2435 && $default !~ /^<none>$/i))
2436 {
2437 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2438 }
2439 else {
2440
2441 # If the property is missing from the line, it should
2442 # be the one for the whole file
2443 $property = $property{$addr} if ! defined $property;
2444
2445 # Change <none> to the null string, which is what it
2446 # really means. If the default is the code point
2447 # itself, set it to <code point>, which is what
2448 # Unicode uses (but sometimes they've forgotten the
2449 # space)
2450 if ($default =~ /^<none>$/i) {
2451 $default = "";
2452 }
2453 elsif ($default =~ /^<code *point>$/i) {
2454 $default = $CODE_POINT;
2455 }
2456
2457 # Store them as a sub-arrays with both components.
2458 push @{$missings{$addr}}, [ $default, $property ];
2459 }
2460 }
2461
2462 # There is nothing for the caller to process on this comment
2463 # line.
2464 next;
2465 }
2466
2467 # Remove comments and trailing space, and skip this line if the
2468 # result is empty
2469 s/#.*//;
2470 s/\s+$//;
2471 next if /^$/;
2472
2473 # Call any handlers for this line, and skip further processing of
2474 # the line if the handler sets the line to null.
2475 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2476 &{$sub_ref}($self);
2477 next LINE if /^$/;
2478 }
2479
2480 # Here the line is ok. return success.
2481 return 1;
2482 } # End of looping through lines.
2483
2484 # If there is an EOF handler, call it (only once) and if it generates
2485 # more lines to process go back in the loop to handle them.
2486 if ($eof_handler{$addr}) {
2487 &{$eof_handler{$addr}}($self);
2488 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2489 goto LINE if $added_lines{$addr};
2490 }
2491
2492 # Return failure -- no more lines.
2493 return 0;
2494
2495 }
2496
2497# Not currently used, not fully tested.
2498# sub peek {
2499# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2500# # record. Not callable from an each_line_handler(), nor does it call
2501# # an each_line_handler() on the line.
2502#
2503# my $self = shift;
ffe43484 2504# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2505#
2506# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2507# my ($adjusted, $line) = @{$inserted_ref};
2508# next if $adjusted;
2509#
2510# # Remove comments and trailing space, and return a non-empty
2511# # resulting line
2512# $line =~ s/#.*//;
2513# $line =~ s/\s+$//;
2514# return $line if $line ne "";
2515# }
2516#
2517# return if ! ref $handle{$addr}; # Don't read unless is real file
2518# while (1) { # Loop until find non-comment, non-empty line
2519# local $to_trace = 1 if main::DEBUG;
2520# trace $_ if main::DEBUG && $to_trace;
2521# return if ! defined (my $line = readline $handle{$addr});
2522# chomp $line;
2523# push @{$added_lines{$addr}}, [ 0, $line ];
2524#
2525# $line =~ s/#.*//;
2526# $line =~ s/\s+$//;
2527# return $line if $line ne "";
2528# }
2529#
2530# return;
2531# }
2532
2533
2534 sub insert_lines {
2535 # Lines can be inserted so that it looks like they were in the input
2536 # file at the place it was when this routine is called. See also
2537 # insert_adjusted_lines(). Lines inserted via this routine go through
2538 # any each_line_handler()
2539
2540 my $self = shift;
2541
2542 # Each inserted line is an array, with the first element being 0 to
2543 # indicate that this line hasn't been adjusted, and needs to be
2544 # processed.
f998e60c 2545 no overloading;
051df77b 2546 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2547 return;
2548 }
2549
2550 sub insert_adjusted_lines {
2551 # Lines can be inserted so that it looks like they were in the input
2552 # file at the place it was when this routine is called. See also
2553 # insert_lines(). Lines inserted via this routine are already fully
2554 # adjusted, ready to be processed; each_line_handler()s handlers will
2555 # not be called. This means this is not a completely general
2556 # facility, as only the last each_line_handler on the stack should
2557 # call this. It could be made more general, by passing to each of the
2558 # line_handlers their position on the stack, which they would pass on
2559 # to this routine, and that would replace the boolean first element in
2560 # the anonymous array pushed here, so that the next_line routine could
2561 # use that to call only those handlers whose index is after it on the
2562 # stack. But this is overkill for what is needed now.
2563
2564 my $self = shift;
2565 trace $_[0] if main::DEBUG && $to_trace;
2566
2567 # Each inserted line is an array, with the first element being 1 to
2568 # indicate that this line has been adjusted
f998e60c 2569 no overloading;
051df77b 2570 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2571 return;
2572 }
2573
2574 sub get_missings {
2575 # Returns the stored up @missings lines' values, and clears the list.
2576 # The values are in an array, consisting of the default in the first
2577 # element, and the property in the 2nd. However, since these lines
2578 # can be stacked up, the return is an array of all these arrays.
2579
2580 my $self = shift;
2581 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2582
ffe43484 2583 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2584
2585 # If not accepting a list return, just return the first one.
2586 return shift @{$missings{$addr}} unless wantarray;
2587
2588 my @return = @{$missings{$addr}};
2589 undef @{$missings{$addr}};
2590 return @return;
2591 }
2592
2593 sub _insert_property_into_line {
2594 # Add a property field to $_, if this file requires it.
2595
f998e60c 2596 my $self = shift;
ffe43484 2597 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2598 my $property = $property{$addr};
99870f4d
KW
2599 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2600
2601 $_ =~ s/(;|$)/; $property$1/;
2602 return;
2603 }
2604
2605 sub carp_bad_line {
2606 # Output consistent error messages, using either a generic one, or the
2607 # one given by the optional parameter. To avoid gazillions of the
2608 # same message in case the syntax of a file is way off, this routine
2609 # only outputs the first instance of each message, incrementing a
2610 # count so the totals can be output at the end of the file.
2611
2612 my $self = shift;
2613 my $message = shift;
2614 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2615
ffe43484 2616 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2617
2618 $message = 'Unexpected line' unless $message;
2619
2620 # No trailing punctuation so as to fit with our addenda.
2621 $message =~ s/[.:;,]$//;
2622
2623 # If haven't seen this exact message before, output it now. Otherwise
2624 # increment the count of how many times it has occurred
2625 unless ($errors{$addr}->{$message}) {
2626 Carp::my_carp("$message in '$_' in "
f998e60c 2627 . $file{$addr}
99870f4d
KW
2628 . " at line $.. Skipping this line;");
2629 $errors{$addr}->{$message} = 1;
2630 }
2631 else {
2632 $errors{$addr}->{$message}++;
2633 }
2634
2635 # Clear the line to prevent any further (meaningful) processing of it.
2636 $_ = "";
2637
2638 return;
2639 }
2640} # End closure
2641
2642package Multi_Default;
2643
2644# Certain properties in early versions of Unicode had more than one possible
2645# default for code points missing from the files. In these cases, one
2646# default applies to everything left over after all the others are applied,
2647# and for each of the others, there is a description of which class of code
2648# points applies to it. This object helps implement this by storing the
2649# defaults, and for all but that final default, an eval string that generates
2650# the class that it applies to.
2651
2652
2653{ # Closure
2654
2655 main::setup_package();
2656
2657 my %class_defaults;
2658 # The defaults structure for the classes
2659 main::set_access('class_defaults', \%class_defaults);
2660
2661 my %other_default;
2662 # The default that applies to everything left over.
2663 main::set_access('other_default', \%other_default, 'r');
2664
2665
2666 sub new {
2667 # The constructor is called with default => eval pairs, terminated by
2668 # the left-over default. e.g.
2669 # Multi_Default->new(
2670 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2671 # - 0x200D',
2672 # 'R' => 'some other expression that evaluates to code points',
2673 # .
2674 # .
2675 # .
2676 # 'U'));
2677
2678 my $class = shift;
2679
2680 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2681 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2682
2683 while (@_ > 1) {
2684 my $default = shift;
2685 my $eval = shift;
2686 $class_defaults{$addr}->{$default} = $eval;
2687 }
2688
2689 $other_default{$addr} = shift;
2690
2691 return $self;
2692 }
2693
2694 sub get_next_defaults {
2695 # Iterates and returns the next class of defaults.
2696 my $self = shift;
2697 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2698
ffe43484 2699 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2700
2701 return each %{$class_defaults{$addr}};
2702 }
2703}
2704
2705package Alias;
2706
2707# An alias is one of the names that a table goes by. This class defines them
2708# including some attributes. Everything is currently setup in the
2709# constructor.
2710
2711
2712{ # Closure
2713
2714 main::setup_package();
2715
2716 my %name;
2717 main::set_access('name', \%name, 'r');
2718
2719 my %loose_match;
c12f2655 2720 # Should this name match loosely or not.
99870f4d
KW
2721 main::set_access('loose_match', \%loose_match, 'r');
2722
33e96e72
KW
2723 my %make_re_pod_entry;
2724 # Some aliases should not get their own entries in the re section of the
2725 # pod, because they are covered by a wild-card, and some we want to
2726 # discourage use of. Binary
f82fe4ba 2727 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
99870f4d 2728
fd1e3e84
KW
2729 my %ucd;
2730 # Is this documented to be accessible via Unicode::UCD
2731 main::set_access('ucd', \%ucd, 'r', 's');
2732
99870f4d
KW
2733 my %status;
2734 # Aliases have a status, like deprecated, or even suppressed (which means
2735 # they don't appear in documentation). Enum
2736 main::set_access('status', \%status, 'r');
2737
0eac1e20 2738 my %ok_as_filename;
99870f4d
KW
2739 # Similarly, some aliases should not be considered as usable ones for
2740 # external use, such as file names, or we don't want documentation to
2741 # recommend them. Boolean
0eac1e20 2742 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
99870f4d
KW
2743
2744 sub new {
2745 my $class = shift;
2746
2747 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2748 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2749
2750 $name{$addr} = shift;
2751 $loose_match{$addr} = shift;
33e96e72 2752 $make_re_pod_entry{$addr} = shift;
0eac1e20 2753 $ok_as_filename{$addr} = shift;
99870f4d 2754 $status{$addr} = shift;
fd1e3e84 2755 $ucd{$addr} = shift;
99870f4d
KW
2756
2757 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2758
2759 # Null names are never ok externally
0eac1e20 2760 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
99870f4d
KW
2761
2762 return $self;
2763 }
2764}
2765
2766package Range;
2767
2768# A range is the basic unit for storing code points, and is described in the
2769# comments at the beginning of the program. Each range has a starting code
2770# point; an ending code point (not less than the starting one); a value
2771# that applies to every code point in between the two end-points, inclusive;
2772# and an enum type that applies to the value. The type is for the user's
2773# convenience, and has no meaning here, except that a non-zero type is
2774# considered to not obey the normal Unicode rules for having standard forms.
2775#
2776# The same structure is used for both map and match tables, even though in the
2777# latter, the value (and hence type) is irrelevant and could be used as a
2778# comment. In map tables, the value is what all the code points in the range
2779# map to. Type 0 values have the standardized version of the value stored as
2780# well, so as to not have to recalculate it a lot.
2781
2782sub trace { return main::trace(@_); }
2783
2784{ # Closure
2785
2786 main::setup_package();
2787
2788 my %start;
2789 main::set_access('start', \%start, 'r', 's');
2790
2791 my %end;
2792 main::set_access('end', \%end, 'r', 's');
2793
2794 my %value;
2795 main::set_access('value', \%value, 'r');
2796
2797 my %type;
2798 main::set_access('type', \%type, 'r');
2799
2800 my %standard_form;
2801 # The value in internal standard form. Defined only if the type is 0.
2802 main::set_access('standard_form', \%standard_form);
2803
2804 # Note that if these fields change, the dump() method should as well
2805
2806 sub new {
2807 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2808 my $class = shift;
2809
2810 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2811 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2812
2813 $start{$addr} = shift;
2814 $end{$addr} = shift;
2815
2816 my %args = @_;
2817
2818 my $value = delete $args{'Value'}; # Can be 0
2819 $value = "" unless defined $value;
2820 $value{$addr} = $value;
2821
2822 $type{$addr} = delete $args{'Type'} || 0;
2823
2824 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2825
2826 if (! $type{$addr}) {
2827 $standard_form{$addr} = main::standardize($value);
2828 }
2829
2830 return $self;
2831 }
2832
2833 use overload
2834 fallback => 0,
2835 qw("") => "_operator_stringify",
2836 "." => \&main::_operator_dot,
2837 ;
2838
2839 sub _operator_stringify {
2840 my $self = shift;
ffe43484 2841 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2842
2843 # Output it like '0041..0065 (value)'
2844 my $return = sprintf("%04X", $start{$addr})
2845 . '..'
2846 . sprintf("%04X", $end{$addr});
2847 my $value = $value{$addr};
2848 my $type = $type{$addr};
2849 $return .= ' (';
2850 $return .= "$value";
2851 $return .= ", Type=$type" if $type != 0;
2852 $return .= ')';
2853
2854 return $return;
2855 }
2856
2857 sub standard_form {
2858 # The standard form is the value itself if the standard form is
2859 # undefined (that is if the value is special)
2860
2861 my $self = shift;
2862 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2863
ffe43484 2864 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2865
2866 return $standard_form{$addr} if defined $standard_form{$addr};
2867 return $value{$addr};
2868 }
2869
2870 sub dump {
2871 # Human, not machine readable. For machine readable, comment out this
2872 # entire routine and let the standard one take effect.
2873 my $self = shift;
2874 my $indent = shift;
2875 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2876
ffe43484 2877 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2878
2879 my $return = $indent
2880 . sprintf("%04X", $start{$addr})
2881 . '..'
2882 . sprintf("%04X", $end{$addr})
2883 . " '$value{$addr}';";
2884 if (! defined $standard_form{$addr}) {
2885 $return .= "(type=$type{$addr})";
2886 }
2887 elsif ($standard_form{$addr} ne $value{$addr}) {
2888 $return .= "(standard '$standard_form{$addr}')";
2889 }
2890 return $return;
2891 }
2892} # End closure
2893
2894package _Range_List_Base;
2895
2896# Base class for range lists. A range list is simply an ordered list of
2897# ranges, so that the ranges with the lowest starting numbers are first in it.
2898#
2899# When a new range is added that is adjacent to an existing range that has the
2900# same value and type, it merges with it to form a larger range.
2901#
2902# Ranges generally do not overlap, except that there can be multiple entries
2903# of single code point ranges. This is because of NameAliases.txt.
2904#
2905# In this program, there is a standard value such that if two different
2906# values, have the same standard value, they are considered equivalent. This
2907# value was chosen so that it gives correct results on Unicode data
2908
2909# There are a number of methods to manipulate range lists, and some operators
2910# are overloaded to handle them.
2911
99870f4d
KW
2912sub trace { return main::trace(@_); }
2913
2914{ # Closure
2915
2916 our $addr;
2917
2918 main::setup_package();
2919
2920 my %ranges;
2921 # The list of ranges
2922 main::set_access('ranges', \%ranges, 'readable_array');
2923
2924 my %max;
2925 # The highest code point in the list. This was originally a method, but
2926 # actual measurements said it was used a lot.
2927 main::set_access('max', \%max, 'r');
2928
2929 my %each_range_iterator;
2930 # Iterator position for each_range()
2931 main::set_access('each_range_iterator', \%each_range_iterator);
2932
2933 my %owner_name_of;
2934 # Name of parent this is attached to, if any. Solely for better error
2935 # messages.
2936 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2937
2938 my %_search_ranges_cache;
2939 # A cache of the previous result from _search_ranges(), for better
2940 # performance
2941 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2942
2943 sub new {
2944 my $class = shift;
2945 my %args = @_;
2946
2947 # Optional initialization data for the range list.
2948 my $initialize = delete $args{'Initialize'};
2949
2950 my $self;
2951
2952 # Use _union() to initialize. _union() returns an object of this
2953 # class, which means that it will call this constructor recursively.
2954 # But it won't have this $initialize parameter so that it won't
2955 # infinitely loop on this.
2956 return _union($class, $initialize, %args) if defined $initialize;
2957
2958 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2959 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2960
2961 # Optional parent object, only for debug info.
2962 $owner_name_of{$addr} = delete $args{'Owner'};
2963 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2964
2965 # Stringify, in case it is an object.
2966 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2967
2968 # This is used only for error messages, and so a colon is added
2969 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2970
2971 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2972
2973 # Max is initialized to a negative value that isn't adjacent to 0,
2974 # for simpler tests
2975 $max{$addr} = -2;
2976
2977 $_search_ranges_cache{$addr} = 0;
2978 $ranges{$addr} = [];
2979
2980 return $self;
2981 }
2982
2983 use overload
2984 fallback => 0,
2985 qw("") => "_operator_stringify",
2986 "." => \&main::_operator_dot,
2987 ;
2988
2989 sub _operator_stringify {
2990 my $self = shift;
ffe43484 2991 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2992
2993 return "Range_List attached to '$owner_name_of{$addr}'"
2994 if $owner_name_of{$addr};
2995 return "anonymous Range_List " . \$self;
2996 }
2997
2998 sub _union {
2999 # Returns the union of the input code points. It can be called as
3000 # either a constructor or a method. If called as a method, the result
3001 # will be a new() instance of the calling object, containing the union
3002 # of that object with the other parameter's code points; if called as
3003 # a constructor, the first parameter gives the class the new object
3004 # should be, and the second parameter gives the code points to go into
3005 # it.
3006 # In either case, there are two parameters looked at by this routine;
3007 # any additional parameters are passed to the new() constructor.
3008 #
3009 # The code points can come in the form of some object that contains
3010 # ranges, and has a conventionally named method to access them; or
3011 # they can be an array of individual code points (as integers); or
3012 # just a single code point.
3013 #
3014 # If they are ranges, this routine doesn't make any effort to preserve
3015 # the range values of one input over the other. Therefore this base
3016 # class should not allow _union to be called from other than
3017 # initialization code, so as to prevent two tables from being added
3018 # together where the range values matter. The general form of this
3019 # routine therefore belongs in a derived class, but it was moved here
3020 # to avoid duplication of code. The failure to overload this in this
3021 # class keeps it safe.
3022 #
3023
3024 my $self;
3025 my @args; # Arguments to pass to the constructor
3026
3027 my $class = shift;
3028
3029 # If a method call, will start the union with the object itself, and
3030 # the class of the new object will be the same as self.
3031 if (ref $class) {
3032 $self = $class;
3033 $class = ref $self;
3034 push @args, $self;
3035 }
3036
3037 # Add the other required parameter.
3038 push @args, shift;
3039 # Rest of parameters are passed on to the constructor
3040
3041 # Accumulate all records from both lists.
3042 my @records;
3043 for my $arg (@args) {
3044 #local $to_trace = 0 if main::DEBUG;
3045 trace "argument = $arg" if main::DEBUG && $to_trace;
3046 if (! defined $arg) {
3047 my $message = "";
3048 if (defined $self) {
f998e60c 3049 no overloading;
051df77b 3050 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
3051 }
3052 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
3053 return;
3054 }
3055 $arg = [ $arg ] if ! ref $arg;
3056 my $type = ref $arg;
3057 if ($type eq 'ARRAY') {
3058 foreach my $element (@$arg) {
3059 push @records, Range->new($element, $element);
3060 }
3061 }
3062 elsif ($arg->isa('Range')) {
3063 push @records, $arg;
3064 }
3065 elsif ($arg->can('ranges')) {
3066 push @records, $arg->ranges;
3067 }
3068 else {
3069 my $message = "";
3070 if (defined $self) {
f998e60c 3071 no overloading;
051df77b 3072 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
3073 }
3074 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3075 return;
3076 }
3077 }
3078
3079 # Sort with the range containing the lowest ordinal first, but if
3080 # two ranges start at the same code point, sort with the bigger range
3081 # of the two first, because it takes fewer cycles.
3082 @records = sort { ($a->start <=> $b->start)
3083 or
3084 # if b is shorter than a, b->end will be
3085 # less than a->end, and we want to select
3086 # a, so want to return -1
3087 ($b->end <=> $a->end)
3088 } @records;
3089
3090 my $new = $class->new(@_);
3091
3092 # Fold in records so long as they add new information.
3093 for my $set (@records) {
3094 my $start = $set->start;
3095 my $end = $set->end;
3096 my $value = $set->value;
3097 if ($start > $new->max) {
3098 $new->_add_delete('+', $start, $end, $value);
3099 }
3100 elsif ($end > $new->max) {
3101 $new->_add_delete('+', $new->max +1, $end, $value);
3102 }
3103 }
3104
3105 return $new;
3106 }
3107
3108 sub range_count { # Return the number of ranges in the range list
3109 my $self = shift;
3110 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3111
f998e60c 3112 no overloading;
051df77b 3113 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
3114 }
3115
3116 sub min {
3117 # Returns the minimum code point currently in the range list, or if
3118 # the range list is empty, 2 beyond the max possible. This is a
3119 # method because used so rarely, that not worth saving between calls,
3120 # and having to worry about changing it as ranges are added and
3121 # deleted.
3122
3123 my $self = shift;
3124 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3125
ffe43484 3126 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3127
3128 # If the range list is empty, return a large value that isn't adjacent
3129 # to any that could be in the range list, for simpler tests
6189eadc 3130 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
99870f4d
KW
3131 return $ranges{$addr}->[0]->start;
3132 }
3133
3134 sub contains {
3135 # Boolean: Is argument in the range list? If so returns $i such that:
3136 # range[$i]->end < $codepoint <= range[$i+1]->end
3137 # which is one beyond what you want; this is so that the 0th range
3138 # doesn't return false
3139 my $self = shift;
3140 my $codepoint = shift;
3141 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3142
99870f4d
KW
3143 my $i = $self->_search_ranges($codepoint);
3144 return 0 unless defined $i;
3145
3146 # The search returns $i, such that
3147 # range[$i-1]->end < $codepoint <= range[$i]->end
3148 # So is in the table if and only iff it is at least the start position
3149 # of range $i.
f998e60c 3150 no overloading;
051df77b 3151 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
3152 return $i + 1;
3153 }
3154
2f7a8815
KW
3155 sub containing_range {
3156 # Returns the range object that contains the code point, undef if none
3157
3158 my $self = shift;
3159 my $codepoint = shift;
3160 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3161
3162 my $i = $self->contains($codepoint);
3163 return unless $i;
3164
3165 # contains() returns 1 beyond where we should look
3166 no overloading;
3167 return $ranges{pack 'J', $self}->[$i-1];
3168 }
3169
99870f4d
KW
3170 sub value_of {
3171 # Returns the value associated with the code point, undef if none
3172
3173 my $self = shift;
3174 my $codepoint = shift;
3175 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3176
d69c231b
KW
3177 my $range = $self->containing_range($codepoint);
3178 return unless defined $range;
99870f4d 3179
d69c231b 3180 return $range->value;
99870f4d
KW
3181 }
3182
0a9dbafc
KW
3183 sub type_of {
3184 # Returns the type of the range containing the code point, undef if
3185 # the code point is not in the table
3186
3187 my $self = shift;
3188 my $codepoint = shift;
3189 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3190
3191 my $range = $self->containing_range($codepoint);
3192 return unless defined $range;
3193
3194 return $range->type;
3195 }
3196
99870f4d
KW
3197 sub _search_ranges {
3198 # Find the range in the list which contains a code point, or where it
3199 # should go if were to add it. That is, it returns $i, such that:
3200 # range[$i-1]->end < $codepoint <= range[$i]->end
3201 # Returns undef if no such $i is possible (e.g. at end of table), or
3202 # if there is an error.
3203
3204 my $self = shift;
3205 my $code_point = shift;
3206 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3207
ffe43484 3208 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3209
3210 return if $code_point > $max{$addr};
3211 my $r = $ranges{$addr}; # The current list of ranges
3212 my $range_list_size = scalar @$r;
3213 my $i;
3214
3215 use integer; # want integer division
3216
3217 # Use the cached result as the starting guess for this one, because,
3218 # an experiment on 5.1 showed that 90% of the time the cache was the
3219 # same as the result on the next call (and 7% it was one less).
3220 $i = $_search_ranges_cache{$addr};
3221 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3222 # from an intervening deletion
3223 #local $to_trace = 1 if main::DEBUG;
3224 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3225 return $i if $code_point <= $r->[$i]->end
3226 && ($i == 0 || $r->[$i-1]->end < $code_point);
3227
3228 # Here the cache doesn't yield the correct $i. Try adding 1.
3229 if ($i < $range_list_size - 1
3230 && $r->[$i]->end < $code_point &&
3231 $code_point <= $r->[$i+1]->end)
3232 {
3233 $i++;
3234 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3235 $_search_ranges_cache{$addr} = $i;
3236 return $i;
3237 }
3238
3239 # Here, adding 1 also didn't work. We do a binary search to
3240 # find the correct position, starting with current $i
3241 my $lower = 0;
3242 my $upper = $range_list_size - 1;
3243 while (1) {
3244 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3245
3246 if ($code_point <= $r->[$i]->end) {
3247
3248 # Here we have met the upper constraint. We can quit if we
3249 # also meet the lower one.
3250 last if $i == 0 || $r->[$i-1]->end < $code_point;
3251
3252 $upper = $i; # Still too high.
3253
3254 }
3255 else {
3256
3257 # Here, $r[$i]->end < $code_point, so look higher up.
3258 $lower = $i;
3259 }
3260
3261 # Split search domain in half to try again.
3262 my $temp = ($upper + $lower) / 2;
3263
3264 # No point in continuing unless $i changes for next time
3265 # in the loop.
3266 if ($temp == $i) {
3267
3268 # We can't reach the highest element because of the averaging.
3269 # So if one below the upper edge, force it there and try one
3270 # more time.
3271 if ($i == $range_list_size - 2) {
3272
3273 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3274 $i = $range_list_size - 1;
3275
3276 # Change $lower as well so if fails next time through,
3277 # taking the average will yield the same $i, and we will
3278 # quit with the error message just below.
3279 $lower = $i;
3280 next;
3281 }
3282 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3283 return;
3284 }
3285 $i = $temp;
3286 } # End of while loop
3287
3288 if (main::DEBUG && $to_trace) {
3289 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3290 trace "i= [ $i ]", $r->[$i];
3291 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3292 }
3293
3294 # Here we have found the offset. Cache it as a starting point for the
3295 # next call.
3296 $_search_ranges_cache{$addr} = $i;
3297 return $i;
3298 }
3299
3300 sub _add_delete {
3301 # Add, replace or delete ranges to or from a list. The $type
3302 # parameter gives which:
3303 # '+' => insert or replace a range, returning a list of any changed
3304 # ranges.
3305 # '-' => delete a range, returning a list of any deleted ranges.
3306 #
3307 # The next three parameters give respectively the start, end, and
3308 # value associated with the range. 'value' should be null unless the
3309 # operation is '+';
3310 #
3311 # The range list is kept sorted so that the range with the lowest
3312 # starting position is first in the list, and generally, adjacent
c1739a4a 3313 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3314 # exceptions below).
3315 #
c1739a4a 3316 # There are more parameters; all are key => value pairs:
99870f4d
KW
3317 # Type gives the type of the value. It is only valid for '+'.
3318 # All ranges have types; if this parameter is omitted, 0 is
3319 # assumed. Ranges with type 0 are assumed to obey the
3320 # Unicode rules for casing, etc; ranges with other types are
3321 # not. Otherwise, the type is arbitrary, for the caller's
3322 # convenience, and looked at only by this routine to keep
3323 # adjacent ranges of different types from being merged into
3324 # a single larger range, and when Replace =>
3325 # $IF_NOT_EQUIVALENT is specified (see just below).
3326 # Replace determines what to do if the range list already contains
3327 # ranges which coincide with all or portions of the input
3328 # range. It is only valid for '+':
3329 # => $NO means that the new value is not to replace
3330 # any existing ones, but any empty gaps of the
3331 # range list coinciding with the input range
3332 # will be filled in with the new value.
3333 # => $UNCONDITIONALLY means to replace the existing values with
3334 # this one unconditionally. However, if the
3335 # new and old values are identical, the
3336 # replacement is skipped to save cycles
3337 # => $IF_NOT_EQUIVALENT means to replace the existing values
3338 # with this one if they are not equivalent.
3339 # Ranges are equivalent if their types are the
c1739a4a 3340 # same, and they are the same string; or if
99870f4d
KW
3341 # both are type 0 ranges, if their Unicode
3342 # standard forms are identical. In this last
3343 # case, the routine chooses the more "modern"
3344 # one to use. This is because some of the
3345 # older files are formatted with values that
3346 # are, for example, ALL CAPs, whereas the
3347 # derived files have a more modern style,
3348 # which looks better. By looking for this
3349 # style when the pre-existing and replacement
3350 # standard forms are the same, we can move to
3351 # the modern style
3352 # => $MULTIPLE means that if this range duplicates an
3353 # existing one, but has a different value,
3354 # don't replace the existing one, but insert
3355 # this, one so that the same range can occur
53d84487
KW
3356 # multiple times. They are stored LIFO, so
3357 # that the final one inserted is the first one
3358 # returned in an ordered search of the table.
99870f4d
KW
3359 # => anything else is the same as => $IF_NOT_EQUIVALENT
3360 #
c1739a4a
KW
3361 # "same value" means identical for non-type-0 ranges, and it means
3362 # having the same standard forms for type-0 ranges.
99870f4d
KW
3363
3364 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3365
3366 my $self = shift;
3367 my $operation = shift; # '+' for add/replace; '-' for delete;
3368 my $start = shift;
3369 my $end = shift;
3370 my $value = shift;
3371
3372 my %args = @_;
3373
3374 $value = "" if not defined $value; # warning: $value can be "0"
3375
3376 my $replace = delete $args{'Replace'};
3377 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3378
3379 my $type = delete $args{'Type'};
3380 $type = 0 unless defined $type;
3381
3382 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3383
ffe43484 3384 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3385
3386 if ($operation ne '+' && $operation ne '-') {
3387 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3388 return;
3389 }
3390 unless (defined $start && defined $end) {
3391 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3392 return;
3393 }
3394 unless ($end >= $start) {
3395 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3396 return;
3397 }
3398 #local $to_trace = 1 if main::DEBUG;
3399
3400 if ($operation eq '-') {
3401 if ($replace != $IF_NOT_EQUIVALENT) {
3402 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3403 $replace = $IF_NOT_EQUIVALENT;
3404 }
3405 if ($type) {
3406 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3407 $type = 0;
3408 }
3409 if ($value ne "") {
3410 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3411 $value = "";
3412 }
3413 }
3414
3415 my $r = $ranges{$addr}; # The current list of ranges
3416 my $range_list_size = scalar @$r; # And its size
3417 my $max = $max{$addr}; # The current high code point in
3418 # the list of ranges
3419
3420 # Do a special case requiring fewer machine cycles when the new range
3421 # starts after the current highest point. The Unicode input data is
3422 # structured so this is common.
3423 if ($start > $max) {
3424
3425 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3426 return if $operation eq '-'; # Deleting a non-existing range is a
3427 # no-op
3428
3429 # If the new range doesn't logically extend the current final one
3430 # in the range list, create a new range at the end of the range
3431 # list. (max cleverly is initialized to a negative number not
3432 # adjacent to 0 if the range list is empty, so even adding a range
3433 # to an empty range list starting at 0 will have this 'if'
3434 # succeed.)
3435 if ($start > $max + 1 # non-adjacent means can't extend.
3436 || @{$r}[-1]->value ne $value # values differ, can't extend.
3437 || @{$r}[-1]->type != $type # types differ, can't extend.
3438 ) {
3439 push @$r, Range->new($start, $end,
3440 Value => $value,
3441 Type => $type);
3442 }
3443 else {
3444
3445 # Here, the new range starts just after the current highest in
3446 # the range list, and they have the same type and value.
3447 # Extend the current range to incorporate the new one.
3448 @{$r}[-1]->set_end($end);
3449 }
3450
3451 # This becomes the new maximum.
3452 $max{$addr} = $end;
3453
3454 return;
3455 }
3456 #local $to_trace = 0 if main::DEBUG;
3457
3458 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3459
3460 # Here, the input range isn't after the whole rest of the range list.
3461 # Most likely 'splice' will be needed. The rest of the routine finds
3462 # the needed splice parameters, and if necessary, does the splice.
3463 # First, find the offset parameter needed by the splice function for
3464 # the input range. Note that the input range may span multiple
3465 # existing ones, but we'll worry about that later. For now, just find
3466 # the beginning. If the input range is to be inserted starting in a
3467 # position not currently in the range list, it must (obviously) come
3468 # just after the range below it, and just before the range above it.
3469 # Slightly less obviously, it will occupy the position currently
3470 # occupied by the range that is to come after it. More formally, we
3471 # are looking for the position, $i, in the array of ranges, such that:
3472 #
3473 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3474 #
3475 # (The ordered relationships within existing ranges are also shown in
3476 # the equation above). However, if the start of the input range is
3477 # within an existing range, the splice offset should point to that
3478 # existing range's position in the list; that is $i satisfies a
3479 # somewhat different equation, namely:
3480 #
3481 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3482 #
3483 # More briefly, $start can come before or after r[$i]->start, and at
3484 # this point, we don't know which it will be. However, these
3485 # two equations share these constraints:
3486 #
3487 # r[$i-1]->end < $start <= r[$i]->end
3488 #
3489 # And that is good enough to find $i.
3490
3491 my $i = $self->_search_ranges($start);
3492 if (! defined $i) {
3493 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3494 return;
3495 }
3496
3497 # The search function returns $i such that:
3498 #
3499 # r[$i-1]->end < $start <= r[$i]->end
3500 #
3501 # That means that $i points to the first range in the range list
3502 # that could possibly be affected by this operation. We still don't
3503 # know if the start of the input range is within r[$i], or if it
3504 # points to empty space between r[$i-1] and r[$i].
3505 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3506
3507 # Special case the insertion of data that is not to replace any
3508 # existing data.
3509 if ($replace == $NO) { # If $NO, has to be operation '+'
3510 #local $to_trace = 1 if main::DEBUG;
3511 trace "Doesn't replace" if main::DEBUG && $to_trace;
3512
3513 # Here, the new range is to take effect only on those code points
3514 # that aren't already in an existing range. This can be done by
3515 # looking through the existing range list and finding the gaps in
3516 # the ranges that this new range affects, and then calling this
3517 # function recursively on each of those gaps, leaving untouched
3518 # anything already in the list. Gather up a list of the changed
3519 # gaps first so that changes to the internal state as new ranges
3520 # are added won't be a problem.
3521 my @gap_list;
3522
3523 # First, if the starting point of the input range is outside an
3524 # existing one, there is a gap from there to the beginning of the
3525 # existing range -- add a span to fill the part that this new
3526 # range occupies
3527 if ($start < $r->[$i]->start) {
3528 push @gap_list, Range->new($start,
3529 main::min($end,
3530 $r->[$i]->start - 1),
3531 Type => $type);
3532 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3533 }
3534
3535 # Then look through the range list for other gaps until we reach
3536 # the highest range affected by the input one.
3537 my $j;
3538 for ($j = $i+1; $j < $range_list_size; $j++) {
3539 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3540 last if $end < $r->[$j]->start;
3541
3542 # If there is a gap between when this range starts and the
3543 # previous one ends, add a span to fill it. Note that just
3544 # because there are two ranges doesn't mean there is a
3545 # non-zero gap between them. It could be that they have
3546 # different values or types
3547 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3548 push @gap_list,
3549 Range->new($r->[$j-1]->end + 1,
3550 $r->[$j]->start - 1,
3551 Type => $type);
3552 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3553 }
3554 }
3555
3556 # Here, we have either found an existing range in the range list,
3557 # beyond the area affected by the input one, or we fell off the
3558 # end of the loop because the input range affects the whole rest
3559 # of the range list. In either case, $j is 1 higher than the
3560 # highest affected range. If $j == $i, it means that there are no
3561 # affected ranges, that the entire insertion is in the gap between
3562 # r[$i-1], and r[$i], which we already have taken care of before
3563 # the loop.
3564 # On the other hand, if there are affected ranges, it might be
3565 # that there is a gap that needs filling after the final such
3566 # range to the end of the input range
3567 if ($r->[$j-1]->end < $end) {
3568 push @gap_list, Range->new(main::max($start,
3569 $r->[$j-1]->end + 1),
3570 $end,
3571 Type => $type);
3572 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3573 }
3574
3575 # Call recursively to fill in all the gaps.
3576 foreach my $gap (@gap_list) {
3577 $self->_add_delete($operation,
3578 $gap->start,
3579 $gap->end,
3580 $value,
3581 Type => $type);
3582 }
3583
3584 return;
3585 }
3586
53d84487
KW
3587 # Here, we have taken care of the case where $replace is $NO.
3588 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3589 # If inserting a multiple record, this is where it goes, before the
3590 # first (if any) existing one. This implies an insertion, and no
3591 # change to any existing ranges. Note that $i can be -1 if this new
3592 # range doesn't actually duplicate any existing, and comes at the
3593 # beginning of the list.
3594 if ($replace == $MULTIPLE) {
3595
3596 if ($start != $end) {
3597 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3598 return;
3599 }
3600
3601 # Don't add an exact duplicate, as it isn't really a multiple
3602 if ($end >= $r->[$i]->start) {
1f6798c4
KW
3603 my $existing_value = $r->[$i]->value;
3604 my $existing_type = $r->[$i]->type;
3605 return if $value eq $existing_value && $type eq $existing_type;
3606
3607 # If the multiple value is part of an existing range, we want
3608 # to split up that range, so that only the single code point
3609 # is affected. To do this, we first call ourselves
3610 # recursively to delete that code point from the table, having
3611 # preserved its current data above. Then we call ourselves
3612 # recursively again to add the new multiple, which we know by
3613 # the test just above is different than the current code
3614 # point's value, so it will become a range containing a single
3615 # code point: just itself. Finally, we add back in the
3616 # pre-existing code point, which will again be a single code
3617 # point range. Because 'i' likely will have changed as a
3618 # result of these operations, we can't just continue on, but
3619 # do this operation recursively as well.
53d84487 3620 if ($r->[$i]->start != $r->[$i]->end) {
1f6798c4
KW
3621 $self->_add_delete('-', $start, $end, "");
3622 $self->_add_delete('+', $start, $end, $value, Type => $type);
3623 return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
53d84487 3624 }
53d84487
KW
3625 }
3626
3627 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3628 my @return = splice @$r,
3629 $i,
3630 0,
3631 Range->new($start,
3632 $end,
3633 Value => $value,
3634 Type => $type);
3635 if (main::DEBUG && $to_trace) {
3636 trace "After splice:";
3637 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3638 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3639 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3640 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3641 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3642 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3643 }
3644 return @return;
3645 }
3646
3647 # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
3648 # delete, insert, and replace either unconditionally or if not
3649 # equivalent. $i still points to the first potential affected range.
3650 # Now find the highest range affected, which will determine the length
3651 # parameter to splice. (The input range can span multiple existing
3652 # ones.) If this isn't a deletion, while we are looking through the
3653 # range list, see also if this is a replacement rather than a clean
3654 # insertion; that is if it will change the values of at least one
3655 # existing range. Start off assuming it is an insert, until find it
3656 # isn't.
3657 my $clean_insert = $operation eq '+';
99870f4d
KW
3658 my $j; # This will point to the highest affected range
3659
3660 # For non-zero types, the standard form is the value itself;
3661 my $standard_form = ($type) ? $value : main::standardize($value);
3662
3663 for ($j = $i; $j < $range_list_size; $j++) {
3664 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3665
3666 # If find a range that it doesn't overlap into, we can stop
3667 # searching
3668 last if $end < $r->[$j]->start;
3669
969a34cc
KW
3670 # Here, overlaps the range at $j. If the values don't match,
3671 # and so far we think this is a clean insertion, it becomes a
3672 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3673 if ($clean_insert) {
99870f4d 3674 if ($r->[$j]->standard_form ne $standard_form) {
969a34cc 3675 $clean_insert = 0;
56343c78
KW
3676 if ($replace == $CROAK) {
3677 main::croak("The range to add "
3678 . sprintf("%04X", $start)
3679 . '-'
3680 . sprintf("%04X", $end)
3681 . " with value '$value' overlaps an existing range $r->[$j]");
3682 }
99870f4d
KW
3683 }
3684 else {
3685
3686 # Here, the two values are essentially the same. If the
3687 # two are actually identical, replacing wouldn't change
3688 # anything so skip it.
3689 my $pre_existing = $r->[$j]->value;
3690 if ($pre_existing ne $value) {
3691
3692 # Here the new and old standardized values are the
3693 # same, but the non-standardized values aren't. If
3694 # replacing unconditionally, then replace
3695 if( $replace == $UNCONDITIONALLY) {
969a34cc 3696 $clean_insert = 0;
99870f4d
KW
3697 }
3698 else {
3699
3700 # Here, are replacing conditionally. Decide to
3701 # replace or not based on which appears to look
3702 # the "nicest". If one is mixed case and the
3703 # other isn't, choose the mixed case one.
3704 my $new_mixed = $value =~ /[A-Z]/
3705 && $value =~ /[a-z]/;
3706 my $old_mixed = $pre_existing =~ /[A-Z]/
3707 && $pre_existing =~ /[a-z]/;
3708
3709 if ($old_mixed != $new_mixed) {
969a34cc 3710 $clean_insert = 0 if $new_mixed;
99870f4d 3711 if (main::DEBUG && $to_trace) {
969a34cc
KW
3712 if ($clean_insert) {
3713 trace "Retaining $pre_existing over $value";
99870f4d
KW
3714 }
3715 else {
969a34cc 3716 trace "Replacing $pre_existing with $value";
99870f4d
KW
3717 }
3718 }
3719 }
3720 else {
3721
3722 # Here casing wasn't different between the two.
3723 # If one has hyphens or underscores and the
3724 # other doesn't, choose the one with the
3725 # punctuation.
3726 my $new_punct = $value =~ /[-_]/;
3727 my $old_punct = $pre_existing =~ /[-_]/;
3728
3729 if ($old_punct != $new_punct) {
969a34cc 3730 $clean_insert = 0 if $new_punct;
99870f4d 3731 if (main::DEBUG && $to_trace) {
969a34cc
KW
3732 if ($clean_insert) {
3733 trace "Retaining $pre_existing over $value";
99870f4d
KW
3734 }
3735 else {
969a34cc 3736 trace "Replacing $pre_existing with $value";
99870f4d
KW
3737 }
3738 }
3739 } # else existing one is just as "good";
3740 # retain it to save cycles.
3741 }
3742 }
3743 }
3744 }
3745 }
3746 } # End of loop looking for highest affected range.
3747
3748 # Here, $j points to one beyond the highest range that this insertion
3749 # affects (hence to beyond the range list if that range is the final
3750 # one in the range list).
3751
3752 # The splice length is all the affected ranges. Get it before
3753 # subtracting, for efficiency, so we don't have to later add 1.
3754 my $length = $j - $i;
3755
3756 $j--; # $j now points to the highest affected range.
3757 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3758
99870f4d
KW
3759 # Here, have taken care of $NO and $MULTIPLE replaces.
3760 # $j points to the highest affected range. But it can be < $i or even
3761 # -1. These happen only if the insertion is entirely in the gap
3762 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3763 # above exited first time through with $end < $r->[$i]->start. (And
3764 # then we subtracted one from j) This implies also that $start <
3765 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3766 # $start, so the entire input range is in the gap.
3767 if ($j < $i) {
3768
3769 # Here the entire input range is in the gap before $i.
3770
3771 if (main::DEBUG && $to_trace) {
3772 if ($i) {
3773 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3774 }
3775 else {
3776 trace "Entire range is before $r->[$i]";
3777 }
3778 }
3779 return if $operation ne '+'; # Deletion of a non-existent range is
3780 # a no-op
3781 }
3782 else {
3783
969a34cc
KW
3784 # Here part of the input range is not in the gap before $i. Thus,
3785 # there is at least one affected one, and $j points to the highest
3786 # such one.
99870f4d
KW
3787
3788 # At this point, here is the situation:
3789 # This is not an insertion of a multiple, nor of tentative ($NO)
3790 # data.
3791 # $i points to the first element in the current range list that
3792 # may be affected by this operation. In fact, we know
3793 # that the range at $i is affected because we are in
3794 # the else branch of this 'if'
3795 # $j points to the highest affected range.
3796 # In other words,
3797 # r[$i-1]->end < $start <= r[$i]->end
3798 # And:
3799 # r[$i-1]->end < $start <= $end <= r[$j]->end
3800 #
3801 # Also:
969a34cc
KW
3802 # $clean_insert is a boolean which is set true if and only if
3803 # this is a "clean insertion", i.e., not a change nor a
3804 # deletion (multiple was handled above).
99870f4d
KW
3805
3806 # We now have enough information to decide if this call is a no-op
969a34cc
KW
3807 # or not. It is a no-op if this is an insertion of already
3808 # existing data.
99870f4d 3809
969a34cc 3810 if (main::DEBUG && $to_trace && $clean_insert
99870f4d
KW
3811 && $i == $j
3812 && $start >= $r->[$i]->start)
3813 {
3814 trace "no-op";
3815 }
969a34cc 3816 return if $clean_insert
99870f4d
KW
3817 && $i == $j # more than one affected range => not no-op
3818
3819 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3820 # Further, $start and/or $end is >= r[$i]->start
3821 # The test below hence guarantees that
3822 # r[$i]->start < $start <= $end <= r[$i]->end
3823 # This means the input range is contained entirely in
3824 # the one at $i, so is a no-op
3825 && $start >= $r->[$i]->start;
3826 }
3827
3828 # Here, we know that some action will have to be taken. We have
3829 # calculated the offset and length (though adjustments may be needed)
3830 # for the splice. Now start constructing the replacement list.
3831 my @replacement;
3832 my $splice_start = $i;
3833
3834 my $extends_below;
3835 my $extends_above;
3836
3837 # See if should extend any adjacent ranges.
3838 if ($operation eq '-') { # Don't extend deletions
3839 $extends_below = $extends_above = 0;
3840 }
3841 else { # Here, should extend any adjacent ranges. See if there are
3842 # any.
3843 $extends_below = ($i > 0
3844 # can't extend unless adjacent
3845 && $r->[$i-1]->end == $start -1
3846 # can't extend unless are same standard value
3847 && $r->[$i-1]->standard_form eq $standard_form
3848 # can't extend unless share type
3849 && $r->[$i-1]->type == $type);
3850 $extends_above = ($j+1 < $range_list_size
3851 && $r->[$j+1]->start == $end +1
3852 && $r->[$j+1]->standard_form eq $standard_form
23822bda 3853 && $r->[$j+1]->type == $type);
99870f4d
KW
3854 }
3855 if ($extends_below && $extends_above) { # Adds to both
3856 $splice_start--; # start replace at element below
3857 $length += 2; # will replace on both sides
3858 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3859
3860 # The result will fill in any gap, replacing both sides, and
3861 # create one large range.
3862 @replacement = Range->new($r->[$i-1]->start,
3863 $r->[$j+1]->end,
3864 Value => $value,
3865 Type => $type);
3866 }
3867 else {
3868
3869 # Here we know that the result won't just be the conglomeration of
3870 # a new range with both its adjacent neighbors. But it could
3871 # extend one of them.
3872
3873 if ($extends_below) {
3874
3875 # Here the new element adds to the one below, but not to the
3876 # one above. If inserting, and only to that one range, can
3877 # just change its ending to include the new one.
969a34cc 3878 if ($length == 0 && $clean_insert) {
99870f4d
KW
3879 $r->[$i-1]->set_end($end);
3880 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3881 return;
3882 }
3883 else {
3884 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3885 $splice_start--; # start replace at element below
3886 $length++; # will replace the element below
3887 $start = $r->[$i-1]->start;
3888 }
3889 }
3890 elsif ($extends_above) {
3891
3892 # Here the new element adds to the one above, but not below.
3893 # Mirror the code above
969a34cc 3894 if ($length == 0 && $clean_insert) {
99870f4d
KW
3895 $r->[$j+1]->set_start($start);
3896 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3897 return;
3898 }
3899 else {
3900 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3901 $length++; # will replace the element above
3902 $end = $r->[$j+1]->end;
3903 }
3904 }
3905
3906 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3907
3908 # Finally, here we know there will have to be a splice.
3909 # If the change or delete affects only the highest portion of the
3910 # first affected range, the range will have to be split. The
3911 # splice will remove the whole range, but will replace it by a new
3912 # range containing just the unaffected part. So, in this case,
3913 # add to the replacement list just this unaffected portion.
3914 if (! $extends_below
3915 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3916 {
3917 push @replacement,
3918 Range->new($r->[$i]->start,
3919 $start - 1,
3920 Value => $r->[$i]->value,
3921 Type => $r->[$i]->type);
3922 }
3923
3924 # In the case of an insert or change, but not a delete, we have to
3925 # put in the new stuff; this comes next.
3926 if ($operation eq '+') {
3927 push @replacement, Range->new($start,
3928 $end,
3929 Value => $value,
3930 Type => $type);
3931 }
3932
3933 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3934 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3935
3936 # And finally, if we're changing or deleting only a portion of the
3937 # highest affected range, it must be split, as the lowest one was.
3938 if (! $extends_above
3939 && $j >= 0 # Remember that j can be -1 if before first
3940 # current element
3941 && $end >= $r->[$j]->start
3942 && $end < $r->[$j]->end)
3943 {
3944 push @replacement,
3945 Range->new($end + 1,
3946 $r->[$j]->end,
3947 Value => $r->[$j]->value,
3948 Type => $r->[$j]->type);
3949 }
3950 }
3951
3952 # And do the splice, as calculated above
3953 if (main::DEBUG && $to_trace) {
3954 trace "replacing $length element(s) at $i with ";
3955 foreach my $replacement (@replacement) {
3956 trace " $replacement";
3957 }
3958 trace "Before splice:";
3959 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3960 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3961 trace "i =[", $i, "]", $r->[$i];
3962 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3963 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3964 }
3965
3966 my @return = splice @$r, $splice_start, $length, @replacement;
3967
3968 if (main::DEBUG && $to_trace) {
3969 trace "After splice:";
3970 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3971 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3972 trace "i =[", $i, "]", $r->[$i];
3973 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3974 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
e6451557 3975 trace "removed ", @return if @return;
99870f4d
KW
3976 }
3977
3978 # An actual deletion could have changed the maximum in the list.
3979 # There was no deletion if the splice didn't return something, but
3980 # otherwise recalculate it. This is done too rarely to worry about
3981 # performance.
3982 if ($operation eq '-' && @return) {
3983 $max{$addr} = $r->[-1]->end;
3984 }
3985 return @return;
3986 }
3987
3988 sub reset_each_range { # reset the iterator for each_range();
3989 my $self = shift;
3990 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3991
f998e60c 3992 no overloading;
051df77b 3993 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
3994 return;
3995 }
3996
3997 sub each_range {
3998 # Iterate over each range in a range list. Results are undefined if
3999 # the range list is changed during the iteration.
4000
4001 my $self = shift;
4002 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4003
ffe43484 4004 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4005
4006 return if $self->is_empty;
4007
4008 $each_range_iterator{$addr} = -1
4009 if ! defined $each_range_iterator{$addr};
4010 $each_range_iterator{$addr}++;
4011 return $ranges{$addr}->[$each_range_iterator{$addr}]
4012 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4013 undef $each_range_iterator{$addr};
4014 return;
4015 }
4016
4017 sub count { # Returns count of code points in range list
4018 my $self = shift;
4019 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4020
ffe43484 4021 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4022
4023 my $count = 0;
4024 foreach my $range (@{$ranges{$addr}}) {
4025 $count += $range->end - $range->start + 1;
4026 }
4027 return $count;
4028 }
4029
4030 sub delete_range { # Delete a range
4031 my $self = shift;
4032 my $start = shift;
4033 my $end = shift;
4034
4035 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4036
4037 return $self->_add_delete('-', $start, $end, "");
4038 }
4039
4040 sub is_empty { # Returns boolean as to if a range list is empty
4041 my $self = shift;
4042 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4043
f998e60c 4044 no overloading;
051df77b 4045 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
4046 }
4047
4048 sub hash {
4049 # Quickly returns a scalar suitable for separating tables into
4050 # buckets, i.e. it is a hash function of the contents of a table, so
4051 # there are relatively few conflicts.
4052
4053 my $self = shift;
4054 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4055
ffe43484 4056 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4057
4058 # These are quickly computable. Return looks like 'min..max;count'
4059 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4060 }
4061} # End closure for _Range_List_Base
4062
4063package Range_List;
4064use base '_Range_List_Base';
4065
4066# A Range_List is a range list for match tables; i.e. the range values are
4067# not significant. Thus a number of operations can be safely added to it,
4068# such as inversion, intersection. Note that union is also an unsafe
4069# operation when range values are cared about, and that method is in the base
4070# class, not here. But things are set up so that that method is callable only
4071# during initialization. Only in this derived class, is there an operation
4072# that combines two tables. A Range_Map can thus be used to initialize a
4073# Range_List, and its mappings will be in the list, but are not significant to
4074# this class.
4075
4076sub trace { return main::trace(@_); }
4077
4078{ # Closure
4079
4080 use overload
4081 fallback => 0,
4082 '+' => sub { my $self = shift;
4083 my $other = shift;
4084
4085 return $self->_union($other)
4086 },
4087 '&' => sub { my $self = shift;
4088 my $other = shift;
4089
4090 return $self->_intersect($other, 0);
4091 },
4092 '~' => "_invert",
4093 '-' => "_subtract",
4094 ;
4095
4096 sub _invert {
4097 # Returns a new Range_List that gives all code points not in $self.
4098
4099 my $self = shift;
4100
4101 my $new = Range_List->new;
4102
4103 # Go through each range in the table, finding the gaps between them
4104 my $max = -1; # Set so no gap before range beginning at 0
4105 for my $range ($self->ranges) {
4106 my $start = $range->start;
4107 my $end = $range->end;
4108
4109 # If there is a gap before this range, the inverse will contain
4110 # that gap.
4111 if ($start > $max + 1) {
4112 $new->add_range($max + 1, $start - 1);
4113 }
4114 $max = $end;
4115 }
4116
4117 # And finally, add the gap from the end of the table to the max
4118 # possible code point
6189eadc
KW
4119 if ($max < $MAX_UNICODE_CODEPOINT) {
4120 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
99870f4d
KW
4121 }
4122 return $new;
4123 }
4124
4125 sub _subtract {
4126 # Returns a new Range_List with the argument deleted from it. The
4127 # argument can be a single code point, a range, or something that has
4128 # a range, with the _range_list() method on it returning them
4129
4130 my $self = shift;
4131 my $other = shift;
4132 my $reversed = shift;
4133 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4134
4135 if ($reversed) {
4136 Carp::my_carp_bug("Can't cope with a "
4137 . __PACKAGE__
4138 . " being the second parameter in a '-'. Subtraction ignored.");
4139 return $self;
4140 }
4141
4142 my $new = Range_List->new(Initialize => $self);
4143
4144 if (! ref $other) { # Single code point
4145 $new->delete_range($other, $other);
4146 }
4147 elsif ($other->isa('Range')) {
4148 $new->delete_range($other->start, $other->end);
4149 }
4150 elsif ($other->can('_range_list')) {
4151 foreach my $range ($other->_range_list->ranges) {
4152 $new->delete_range($range->start, $range->end);
4153 }
4154 }
4155 else {
4156 Carp::my_carp_bug("Can't cope with a "
4157 . ref($other)
4158 . " argument to '-'. Subtraction ignored."
4159 );
4160 return $self;
4161 }
4162
4163 return $new;
4164 }
4165
4166 sub _intersect {
4167 # Returns either a boolean giving whether the two inputs' range lists
4168 # intersect (overlap), or a new Range_List containing the intersection
4169 # of the two lists. The optional final parameter being true indicates
4170 # to do the check instead of the intersection.
4171
4172 my $a_object = shift;
4173 my $b_object = shift;
4174 my $check_if_overlapping = shift;
4175 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4176 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4177
4178 if (! defined $b_object) {
4179 my $message = "";
4180 $message .= $a_object->_owner_name_of if defined $a_object;
4181 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4182 return;
4183 }
4184
4185 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4186 # Thus the intersection could be much more simply be written:
4187 # return ~(~$a_object + ~$b_object);
4188 # But, this is slower, and when taking the inverse of a large
4189 # range_size_1 table, back when such tables were always stored that
4190 # way, it became prohibitively slow, hence the code was changed to the
4191 # below
4192
4193 if ($b_object->isa('Range')) {
4194 $b_object = Range_List->new(Initialize => $b_object,
4195 Owner => $a_object->_owner_name_of);
4196 }
4197 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4198
4199 my @a_ranges = $a_object->ranges;
4200 my @b_ranges = $b_object->ranges;
4201
4202 #local $to_trace = 1 if main::DEBUG;
4203 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4204
4205 # Start with the first range in each list
4206 my $a_i = 0;
4207 my $range_a = $a_ranges[$a_i];
4208 my $b_i = 0;
4209 my $range_b = $b_ranges[$b_i];
4210
4211 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4212 if ! $check_if_overlapping;
4213
4214 # If either list is empty, there is no intersection and no overlap
4215 if (! defined $range_a || ! defined $range_b) {
4216 return $check_if_overlapping ? 0 : $new;
4217 }
4218 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4219
4220 # Otherwise, must calculate the intersection/overlap. Start with the
4221 # very first code point in each list
4222 my $a = $range_a->start;
4223 my $b = $range_b->start;
4224
4225 # Loop through all the ranges of each list; in each iteration, $a and
4226 # $b are the current code points in their respective lists
4227 while (1) {
4228
4229 # If $a and $b are the same code point, ...
4230 if ($a == $b) {
4231
4232 # it means the lists overlap. If just checking for overlap
4233 # know the answer now,
4234 return 1 if $check_if_overlapping;
4235
4236 # The intersection includes this code point plus anything else
4237 # common to both current ranges.
4238 my $start = $a;
4239 my $end = main::min($range_a->end, $range_b->end);
4240 if (! $check_if_overlapping) {
4241 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4242 $new->add_range($start, $end);
4243 }
4244
4245 # Skip ahead to the end of the current intersect
4246 $a = $b = $end;
4247
4248 # If the current intersect ends at the end of either range (as
4249 # it must for at least one of them), the next possible one
4250 # will be the beginning code point in it's list's next range.
4251 if ($a == $range_a->end) {
4252 $range_a = $a_ranges[++$a_i];
4253 last unless defined $range_a;
4254 $a = $range_a->start;
4255 }
4256 if ($b == $range_b->end) {
4257 $range_b = $b_ranges[++$b_i];
4258 last unless defined $range_b;
4259 $b = $range_b->start;
4260 }
4261
4262 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4263 }
4264 elsif ($a < $b) {
4265
4266 # Not equal, but if the range containing $a encompasses $b,
4267 # change $a to be the middle of the range where it does equal
4268 # $b, so the next iteration will get the intersection
4269 if ($range_a->end >= $b) {
4270 $a = $b;
4271 }
4272 else {
4273
4274 # Here, the current range containing $a is entirely below
4275 # $b. Go try to find a range that could contain $b.
4276 $a_i = $a_object->_search_ranges($b);
4277
4278 # If no range found, quit.
4279 last unless defined $a_i;
4280
4281 # The search returns $a_i, such that
4282 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4283 # Set $a to the beginning of this new range, and repeat.
4284 $range_a = $a_ranges[$a_i];
4285 $a = $range_a->start;
4286 }
4287 }
4288 else { # Here, $b < $a.
4289
4290 # Mirror image code to the leg just above
4291 if ($range_b->end >= $a) {
4292 $b = $a;
4293 }
4294 else {
4295 $b_i = $b_object->_search_ranges($a);
4296 last unless defined $b_i;
4297 $range_b = $b_ranges[$b_i];
4298 $b = $range_b->start;
4299 }
4300 }
4301 } # End of looping through ranges.
4302
4303 # Intersection fully computed, or now know that there is no overlap
4304 return $check_if_overlapping ? 0 : $new;
4305 }
4306
4307 sub overlaps {
4308 # Returns boolean giving whether the two arguments overlap somewhere
4309
4310 my $self = shift;
4311 my $other = shift;
4312 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4313
4314 return $self->_intersect($other, 1);
4315 }
4316
4317 sub add_range {
4318 # Add a range to the list.
4319
4320 my $self = shift;
4321 my $start = shift;
4322 my $end = shift;
4323 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4324
4325 return $self->_add_delete('+', $start, $end, "");
4326 }
4327
09aba7e4
KW
4328 sub matches_identically_to {
4329 # Return a boolean as to whether or not two Range_Lists match identical
4330 # sets of code points.
4331
4332 my $self = shift;
4333 my $other = shift;
4334 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4335
4336 # These are ordered in increasing real time to figure out (at least
4337 # until a patch changes that and doesn't change this)
4338 return 0 if $self->max != $other->max;
4339 return 0 if $self->min != $other->min;
4340 return 0 if $self->range_count != $other->range_count;
4341 return 0 if $self->count != $other->count;
4342
4343 # Here they could be identical because all the tests above passed.
4344 # The loop below is somewhat simpler since we know they have the same
4345 # number of elements. Compare range by range, until reach the end or
4346 # find something that differs.
4347 my @a_ranges = $self->ranges;
4348 my @b_ranges = $other->ranges;
4349 for my $i (0 .. @a_ranges - 1) {
4350 my $a = $a_ranges[$i];
4351 my $b = $b_ranges[$i];
4352 trace "self $a; other $b" if main::DEBUG && $to_trace;
c1c2d9e8
KW
4353 return 0 if ! defined $b
4354 || $a->start != $b->start
4355 || $a->end != $b->end;
09aba7e4
KW
4356 }
4357 return 1;
4358 }
4359
99870f4d
KW
4360 sub is_code_point_usable {
4361 # This used only for making the test script. See if the input
4362 # proposed trial code point is one that Perl will handle. If second
4363 # parameter is 0, it won't select some code points for various
4364 # reasons, noted below.
4365
4366 my $code = shift;
4367 my $try_hard = shift;
4368 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4369
4370 return 0 if $code < 0; # Never use a negative
4371
99870f4d
KW
4372 # shun null. I'm (khw) not sure why this was done, but NULL would be
4373 # the character very frequently used.
4374 return $try_hard if $code == 0x0000;
4375
99870f4d
KW
4376 # shun non-character code points.
4377 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4378 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4379
6189eadc 4380 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
99870f4d
KW
4381 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4382
4383 return 1;
4384 }
4385
4386 sub get_valid_code_point {
4387 # Return a code point that's part of the range list. Returns nothing
4388 # if the table is empty or we can't find a suitable code point. This
4389 # used only for making the test script.
4390
4391 my $self = shift;
4392 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4393
ffe43484 4394 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4395
4396 # On first pass, don't choose less desirable code points; if no good
4397 # one is found, repeat, allowing a less desirable one to be selected.
4398 for my $try_hard (0, 1) {
4399
4400 # Look through all the ranges for a usable code point.
4401 for my $set ($self->ranges) {
4402
4403 # Try the edge cases first, starting with the end point of the
4404 # range.
4405 my $end = $set->end;
4406 return $end if is_code_point_usable($end, $try_hard);
4407
4408 # End point didn't, work. Start at the beginning and try
4409 # every one until find one that does work.
4410 for my $trial ($set->start .. $end - 1) {
4411 return $trial if is_code_point_usable($trial, $try_hard);
4412 }
4413 }
4414 }
4415 return (); # If none found, give up.
4416 }
4417
4418 sub get_invalid_code_point {
4419 # Return a code point that's not part of the table. Returns nothing
4420 # if the table covers all code points or a suitable code point can't
4421 # be found. This used only for making the test script.
4422
4423 my $self = shift;
4424 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4425
4426 # Just find a valid code point of the inverse, if any.
4427 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4428 }
4429} # end closure for Range_List
4430
4431package Range_Map;
4432use base '_Range_List_Base';
4433
4434# A Range_Map is a range list in which the range values (called maps) are
4435# significant, and hence shouldn't be manipulated by our other code, which
4436# could be ambiguous or lose things. For example, in taking the union of two
4437# lists, which share code points, but which have differing values, which one
4438# has precedence in the union?
4439# It turns out that these operations aren't really necessary for map tables,
4440# and so this class was created to make sure they aren't accidentally
4441# applied to them.
4442
4443{ # Closure
4444
4445 sub add_map {
4446 # Add a range containing a mapping value to the list
4447
4448 my $self = shift;
4449 # Rest of parameters passed on
4450
4451 return $self->_add_delete('+', @_);
4452 }
4453
4454 sub add_duplicate {
4455 # Adds entry to a range list which can duplicate an existing entry
4456
4457 my $self = shift;
4458 my $code_point = shift;
4459 my $value = shift;
4460 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4461
4462 return $self->add_map($code_point, $code_point,
4463 $value, Replace => $MULTIPLE);
4464 }
4465} # End of closure for package Range_Map
4466
4467package _Base_Table;
4468
4469# A table is the basic data structure that gets written out into a file for
4470# use by the Perl core. This is the abstract base class implementing the
4471# common elements from the derived ones. A list of the methods to be
4472# furnished by an implementing class is just after the constructor.
4473
4474sub standardize { return main::standardize($_[0]); }
4475sub trace { return main::trace(@_); }
4476
4477{ # Closure
4478
4479 main::setup_package();
4480
4481 my %range_list;
4482 # Object containing the ranges of the table.
4483 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4484
4485 my %full_name;
4486 # The full table name.
4487 main::set_access('full_name', \%full_name, 'r');
4488
4489 my %name;
4490 # The table name, almost always shorter
4491 main::set_access('name', \%name, 'r');
4492
4493 my %short_name;
4494 # The shortest of all the aliases for this table, with underscores removed
4495 main::set_access('short_name', \%short_name);
4496
4497 my %nominal_short_name_length;
4498 # The length of short_name before removing underscores
4499 main::set_access('nominal_short_name_length',
4500 \%nominal_short_name_length);
4501
23e33b60
KW
4502 my %complete_name;
4503 # The complete name, including property.
4504 main::set_access('complete_name', \%complete_name, 'r');
4505
99870f4d
KW
4506 my %property;
4507 # Parent property this table is attached to.
4508 main::set_access('property', \%property, 'r');
4509
4510 my %aliases;
c12f2655
KW
4511 # Ordered list of alias objects of the table's name. The first ones in
4512 # the list are output first in comments
99870f4d
KW
4513 main::set_access('aliases', \%aliases, 'readable_array');
4514
4515 my %comment;
4516 # A comment associated with the table for human readers of the files
4517 main::set_access('comment', \%comment, 's');
4518
4519 my %description;
4520 # A comment giving a short description of the table's meaning for human
4521 # readers of the files.
4522 main::set_access('description', \%description, 'readable_array');
4523
4524 my %note;
4525 # A comment giving a short note about the table for human readers of the
4526 # files.
4527 main::set_access('note', \%note, 'readable_array');
4528
301ba948
KW
4529 my %fate;
4530 # Enum; there are a number of possibilities for what happens to this
4531 # table: it could be normal, or suppressed, or not for external use. See
4532 # values at definition for $SUPPRESSED.
4533 main::set_access('fate', \%fate, 'r');
99870f4d
KW
4534
4535 my %find_table_from_alias;
4536 # The parent property passes this pointer to a hash which this class adds
4537 # all its aliases to, so that the parent can quickly take an alias and
4538 # find this table.
4539 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4540
4541 my %locked;
4542 # After this table is made equivalent to another one; we shouldn't go
4543 # changing the contents because that could mean it's no longer equivalent
4544 main::set_access('locked', \%locked, 'r');
4545
4546 my %file_path;
4547 # This gives the final path to the file containing the table. Each
4548 # directory in the path is an element in the array
4549 main::set_access('file_path', \%file_path, 'readable_array');
4550
4551 my %status;
4552 # What is the table's status, normal, $OBSOLETE, etc. Enum
4553 main::set_access('status', \%status, 'r');
4554
4555 my %status_info;
4556 # A comment about its being obsolete, or whatever non normal status it has
4557 main::set_access('status_info', \%status_info, 'r');
4558
d867ccfb
KW
4559 my %caseless_equivalent;
4560 # The table this is equivalent to under /i matching, if any.
4561 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4562
99870f4d
KW
4563 my %range_size_1;
4564 # Is the table to be output with each range only a single code point?
4565 # This is done to avoid breaking existing code that may have come to rely
4566 # on this behavior in previous versions of this program.)
4567 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4568
4569 my %perl_extension;
4570 # A boolean set iff this table is a Perl extension to the Unicode
4571 # standard.
4572 main::set_access('perl_extension', \%perl_extension, 'r');
4573
0c07e538
KW
4574 my %output_range_counts;
4575 # A boolean set iff this table is to have comments written in the
4576 # output file that contain the number of code points in the range.
4577 # The constructor can override the global flag of the same name.
4578 main::set_access('output_range_counts', \%output_range_counts, 'r');
4579
f5817e0a
KW
4580 my %format;
4581 # The format of the entries of the table. This is calculated from the
4582 # data in the table (or passed in the constructor). This is an enum e.g.,
4583 # $STRING_FORMAT
4584 main::set_access('format', \%format, 'r', 'p_s');
4585
99870f4d
KW
4586 sub new {
4587 # All arguments are key => value pairs, which you can see below, most
33e96e72 4588 # of which match fields documented above. Otherwise: Re_Pod_Entry,
0eac1e20 4589 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
99870f4d
KW
4590 # documented in the Alias package
4591
4592 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4593
4594 my $class = shift;
4595
4596 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4597 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4598
4599 my %args = @_;
4600
4601 $name{$addr} = delete $args{'Name'};
4602 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4603 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4604 my $complete_name = $complete_name{$addr}
4605 = delete $args{'Complete_Name'};
f5817e0a 4606 $format{$addr} = delete $args{'Format'};
0c07e538 4607 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4608 $property{$addr} = delete $args{'_Property'};
4609 $range_list{$addr} = delete $args{'_Range_List'};
4610 $status{$addr} = delete $args{'Status'} || $NORMAL;
4611 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4612 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
d867ccfb 4613 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
301ba948 4614 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
fd1e3e84 4615 my $ucd = delete $args{'UCD'};
99870f4d
KW
4616
4617 my $description = delete $args{'Description'};
0eac1e20 4618 my $ok_as_filename = delete $args{'OK_as_Filename'};
99870f4d
KW
4619 my $loose_match = delete $args{'Fuzzy'};
4620 my $note = delete $args{'Note'};
33e96e72 4621 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
37e2e78e 4622 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4623
4624 # Shouldn't have any left over
4625 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4626
4627 # Can't use || above because conceivably the name could be 0, and
4628 # can't use // operator in case this program gets used in Perl 5.8
4629 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4630 $output_range_counts{$addr} = $output_range_counts if
4631 ! defined $output_range_counts{$addr};
99870f4d
KW
4632
4633 $aliases{$addr} = [ ];
4634 $comment{$addr} = [ ];
4635 $description{$addr} = [ ];
4636 $note{$addr} = [ ];
4637 $file_path{$addr} = [ ];
4638 $locked{$addr} = "";
4639
4640 push @{$description{$addr}}, $description if $description;
4641 push @{$note{$addr}}, $note if $note;
4642
301ba948 4643 if ($fate{$addr} == $PLACEHOLDER) {
37e2e78e
KW
4644
4645 # A placeholder table doesn't get documented, is a perl extension,
4646 # and quite likely will be empty
33e96e72 4647 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
37e2e78e 4648 $perl_extension = 1 if ! defined $perl_extension;
fd1e3e84 4649 $ucd = 0 if ! defined $ucd;
37e2e78e 4650 push @tables_that_may_be_empty, $complete_name{$addr};
301ba948
KW
4651 $self->add_comment(<<END);
4652This is a placeholder because it is not in Version $string_version of Unicode,
4653but is needed by the Perl core to work gracefully. Because it is not in this
4654version of Unicode, it will not be listed in $pod_file.pod
4655END
37e2e78e 4656 }
301ba948 4657 elsif (exists $why_suppressed{$complete_name}
98dc9551 4658 # Don't suppress if overridden
ec11e5f4
KW
4659 && ! grep { $_ eq $complete_name{$addr} }
4660 @output_mapped_properties)
301ba948
KW
4661 {
4662 $fate{$addr} = $SUPPRESSED;
4663 }
4664 elsif ($fate{$addr} == $SUPPRESSED
4665 && ! exists $why_suppressed{$property{$addr}->complete_name})
4666 {
4667 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4668 # perhaps Fate => [ $SUPPRESSED, "reason" ]
4669 }
4670
4671 # If hasn't set its status already, see if it is on one of the
4672 # lists of properties or tables that have particular statuses; if
4673 # not, is normal. The lists are prioritized so the most serious
4674 # ones are checked first
4675 if (! $status{$addr}) {
4676 if (exists $why_deprecated{$complete_name}) {
99870f4d
KW
4677 $status{$addr} = $DEPRECATED;
4678 }
4679 elsif (exists $why_stabilized{$complete_name}) {
4680 $status{$addr} = $STABILIZED;
4681 }
4682 elsif (exists $why_obsolete{$complete_name}) {
4683 $status{$addr} = $OBSOLETE;
4684 }
4685
4686 # Existence above doesn't necessarily mean there is a message
4687 # associated with it. Use the most serious message.
4688 if ($status{$addr}) {
301ba948 4689 if ($why_deprecated{$complete_name}) {
99870f4d
KW
4690 $status_info{$addr}
4691 = $why_deprecated{$complete_name};
4692 }
4693 elsif ($why_stabilized{$complete_name}) {
4694 $status_info{$addr}
4695 = $why_stabilized{$complete_name};
4696 }
4697 elsif ($why_obsolete{$complete_name}) {
4698 $status_info{$addr}
4699 = $why_obsolete{$complete_name};
4700 }
4701 }
4702 }
4703
37e2e78e
KW
4704 $perl_extension{$addr} = $perl_extension || 0;
4705
8050d00f 4706 # Don't list a property by default that is internal only
395dfc19 4707 if ($fate{$addr} > $MAP_PROXIED) {
301ba948 4708 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
fd1e3e84
KW
4709 $ucd = 0 if ! defined $ucd;
4710 }
4711 else {
4712 $ucd = 1 if ! defined $ucd;
301ba948 4713 }
8050d00f 4714
99870f4d
KW
4715 # By convention what typically gets printed only or first is what's
4716 # first in the list, so put the full name there for good output
4717 # clarity. Other routines rely on the full name being first on the
4718 # list
4719 $self->add_alias($full_name{$addr},
0eac1e20 4720 OK_as_Filename => $ok_as_filename,
99870f4d 4721 Fuzzy => $loose_match,
33e96e72 4722 Re_Pod_Entry => $make_re_pod_entry,
99870f4d 4723 Status => $status{$addr},
fd1e3e84 4724 UCD => $ucd,
99870f4d
KW
4725 );
4726
4727 # Then comes the other name, if meaningfully different.
4728 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4729 $self->add_alias($name{$addr},
0eac1e20 4730 OK_as_Filename => $ok_as_filename,
99870f4d 4731 Fuzzy => $loose_match,
33e96e72 4732 Re_Pod_Entry => $make_re_pod_entry,
99870f4d 4733 Status => $status{$addr},
fd1e3e84 4734 UCD => $ucd,
99870f4d
KW
4735 );
4736 }
4737
4738 return $self;
4739 }
4740
4741 # Here are the methods that are required to be defined by any derived
4742 # class
ea25a9b2 4743 for my $sub (qw(
668b3bfc 4744 handle_special_range
99870f4d 4745 append_to_body
99870f4d 4746 pre_body
ea25a9b2 4747 ))
668b3bfc
KW
4748 # write() knows how to write out normal ranges, but it calls
4749 # handle_special_range() when it encounters a non-normal one.
4750 # append_to_body() is called by it after it has handled all
4751 # ranges to add anything after the main portion of the table.
4752 # And finally, pre_body() is called after all this to build up
4753 # anything that should appear before the main portion of the
4754 # table. Doing it this way allows things in the middle to
4755 # affect what should appear before the main portion of the
99870f4d 4756 # table.
99870f4d
KW
4757 {
4758 no strict "refs";
4759 *$sub = sub {
4760 Carp::my_carp_bug( __LINE__
4761 . ": Must create method '$sub()' for "
4762 . ref shift);
4763 return;
4764 }
4765 }
4766
4767 use overload
4768 fallback => 0,
4769 "." => \&main::_operator_dot,
4770 '!=' => \&main::_operator_not_equal,
4771 '==' => \&main::_operator_equal,
4772 ;
4773
4774 sub ranges {
4775 # Returns the array of ranges associated with this table.
4776
f998e60c 4777 no overloading;
051df77b 4778 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4779 }
4780
4781 sub add_alias {
4782 # Add a synonym for this table.
4783
4784 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4785
4786 my $self = shift;
4787 my $name = shift; # The name to add.
4788 my $pointer = shift; # What the alias hash should point to. For
4789 # map tables, this is the parent property;
4790 # for match tables, it is the table itself.
4791
4792 my %args = @_;
4793 my $loose_match = delete $args{'Fuzzy'};
4794
33e96e72
KW
4795 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4796 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
99870f4d 4797
0eac1e20
KW
4798 my $ok_as_filename = delete $args{'OK_as_Filename'};
4799 $ok_as_filename = 1 unless defined $ok_as_filename;
99870f4d
KW
4800
4801 my $status = delete $args{'Status'};
4802 $status = $NORMAL unless defined $status;
4803
fd1e3e84
KW
4804 my $ucd = delete $args{'UCD'} // 1;
4805
99870f4d
KW
4806 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4807
4808 # Capitalize the first letter of the alias unless it is one of the CJK
4809 # ones which specifically begins with a lower 'k'. Do this because
4810 # Unicode has varied whether they capitalize first letters or not, and
4811 # have later changed their minds and capitalized them, but not the
4812 # other way around. So do it always and avoid changes from release to
4813 # release
4814 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4815
ffe43484 4816 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4817
4818 # Figure out if should be loosely matched if not already specified.
4819 if (! defined $loose_match) {
4820
4821 # Is a loose_match if isn't null, and doesn't begin with an
4822 # underscore and isn't just a number
4823 if ($name ne ""
4824 && substr($name, 0, 1) ne '_'
4825 && $name !~ qr{^[0-9_.+-/]+$})
4826 {
4827 $loose_match = 1;
4828 }
4829 else {
4830 $loose_match = 0;
4831 }
4832 }
4833
4834 # If this alias has already been defined, do nothing.
4835 return if defined $find_table_from_alias{$addr}->{$name};
4836
4837 # That includes if it is standardly equivalent to an existing alias,
4838 # in which case, add this name to the list, so won't have to search
4839 # for it again.
4840 my $standard_name = main::standardize($name);
4841 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4842 $find_table_from_alias{$addr}->{$name}
4843 = $find_table_from_alias{$addr}->{$standard_name};
4844 return;
4845 }
4846
4847 # Set the index hash for this alias for future quick reference.
4848 $find_table_from_alias{$addr}->{$name} = $pointer;
4849 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4850 local $to_trace = 0 if main::DEBUG;
4851 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4852 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4853
4854
4855 # Put the new alias at the end of the list of aliases unless the final
4856 # element begins with an underscore (meaning it is for internal perl
4857 # use) or is all numeric, in which case, put the new one before that
4858 # one. This floats any all-numeric or underscore-beginning aliases to
4859 # the end. This is done so that they are listed last in output lists,
4860 # to encourage the user to use a better name (either more descriptive
4861 # or not an internal-only one) instead. This ordering is relied on
4862 # implicitly elsewhere in this program, like in short_name()
4863 my $list = $aliases{$addr};
4864 my $insert_position = (@$list == 0
4865 || (substr($list->[-1]->name, 0, 1) ne '_'
4866 && $list->[-1]->name =~ /\D/))
4867 ? @$list
4868 : @$list - 1;
4869 splice @$list,
4870 $insert_position,
4871 0,
33e96e72 4872 Alias->new($name, $loose_match, $make_re_pod_entry,
0eac1e20 4873 $ok_as_filename, $status, $ucd);
99870f4d
KW
4874
4875 # This name may be shorter than any existing ones, so clear the cache
4876 # of the shortest, so will have to be recalculated.
f998e60c 4877 no overloading;
051df77b 4878 undef $short_name{pack 'J', $self};
99870f4d
KW
4879 return;
4880 }
4881
4882 sub short_name {
4883 # Returns a name suitable for use as the base part of a file name.
4884 # That is, shorter wins. It can return undef if there is no suitable
4885 # name. The name has all non-essential underscores removed.
4886
4887 # The optional second parameter is a reference to a scalar in which
4888 # this routine will store the length the returned name had before the
4889 # underscores were removed, or undef if the return is undef.
4890
4891 # The shortest name can change if new aliases are added. So using
4892 # this should be deferred until after all these are added. The code
4893 # that does that should clear this one's cache.
4894 # Any name with alphabetics is preferred over an all numeric one, even
4895 # if longer.
4896
4897 my $self = shift;
4898 my $nominal_length_ptr = shift;
4899 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4900
ffe43484 4901 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4902
4903 # For efficiency, don't recalculate, but this means that adding new
4904 # aliases could change what the shortest is, so the code that does
4905 # that needs to undef this.
4906 if (defined $short_name{$addr}) {
4907 if ($nominal_length_ptr) {
4908 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4909 }
4910 return $short_name{$addr};
4911 }
4912
4913 # Look at each alias
4914 foreach my $alias ($self->aliases()) {
4915
4916 # Don't use an alias that isn't ok to use for an external name.
0eac1e20 4917 next if ! $alias->ok_as_filename;
99870f4d
KW
4918
4919 my $name = main::Standardize($alias->name);
4920 trace $self, $name if main::DEBUG && $to_trace;
4921
4922 # Take the first one, or a shorter one that isn't numeric. This
4923 # relies on numeric aliases always being last in the array
4924 # returned by aliases(). Any alpha one will have precedence.
4925 if (! defined $short_name{$addr}
4926 || ($name =~ /\D/
4927 && length($name) < length($short_name{$addr})))
4928 {
4929 # Remove interior underscores.
4930 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4931
4932 $nominal_short_name_length{$addr} = length $name;
4933 }
4934 }
4935
ff485b9e
KW
4936 # If the short name isn't a nice one, perhaps an equivalent table has
4937 # a better one.
4938 if (! defined $short_name{$addr}
4939 || $short_name{$addr} eq ""
4940 || $short_name{$addr} eq "_")
4941 {
4942 my $return;
4943 foreach my $follower ($self->children) { # All equivalents
4944 my $follower_name = $follower->short_name;
4945 next unless defined $follower_name;
4946
4947 # Anything (except undefined) is better than underscore or
4948 # empty
4949 if (! defined $return || $return eq "_") {
4950 $return = $follower_name;
4951 next;
4952 }
4953
4954 # If the new follower name isn't "_" and is shorter than the
4955 # current best one, prefer the new one.
4956 next if $follower_name eq "_";
4957 next if length $follower_name > length $return;
4958 $return = $follower_name;
4959 }
4960 $short_name{$addr} = $return if defined $return;
4961 }
4962
99870f4d
KW
4963 # If no suitable external name return undef
4964 if (! defined $short_name{$addr}) {
4965 $$nominal_length_ptr = undef if $nominal_length_ptr;
4966 return;
4967 }
4968
c12f2655 4969 # Don't allow a null short name.
99870f4d
KW
4970 if ($short_name{$addr} eq "") {
4971 $short_name{$addr} = '_';
4972 $nominal_short_name_length{$addr} = 1;
4973 }
4974
4975 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4976
4977 if ($nominal_length_ptr) {
4978 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4979 }
4980 return $short_name{$addr};
4981 }
4982
4983 sub external_name {
4984 # Returns the external name that this table should be known by. This
c12f2655
KW
4985 # is usually the short_name, but not if the short_name is undefined,
4986 # in which case the external_name is arbitrarily set to the
4987 # underscore.
99870f4d
KW
4988
4989 my $self = shift;
4990 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4991
4992 my $short = $self->short_name;
4993 return $short if defined $short;
4994
4995 return '_';
4996 }
4997
4998 sub add_description { # Adds the parameter as a short description.
4999
5000 my $self = shift;
5001 my $description = shift;
5002 chomp $description;
5003 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5004
f998e60c 5005 no overloading;
051df77b 5006 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
5007
5008 return;
5009 }
5010
5011 sub add_note { # Adds the parameter as a short note.
5012
5013 my $self = shift;
5014 my $note = shift;
5015 chomp $note;
5016 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5017
f998e60c 5018 no overloading;
051df77b 5019 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
5020
5021 return;
5022 }
5023
5024 sub add_comment { # Adds the parameter as a comment.
5025
bd9ebcfd
KW
5026 return unless $debugging_build;
5027
99870f4d
KW
5028 my $self = shift;
5029 my $comment = shift;
5030 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5031
5032 chomp $comment;
f998e60c
KW
5033
5034 no overloading;
051df77b 5035 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
5036
5037 return;
5038 }
5039
5040 sub comment {
5041 # Return the current comment for this table. If called in list
5042 # context, returns the array of comments. In scalar, returns a string
5043 # of each element joined together with a period ending each.
5044
5045 my $self = shift;
5046 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5047
ffe43484 5048 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 5049 my @list = @{$comment{$addr}};
99870f4d
KW
5050 return @list if wantarray;
5051 my $return = "";
5052 foreach my $sentence (@list) {
5053 $return .= '. ' if $return;
5054 $return .= $sentence;
5055 $return =~ s/\.$//;
5056 }
5057 $return .= '.' if $return;
5058 return $return;
5059 }
5060
5061 sub initialize {
5062 # Initialize the table with the argument which is any valid
5063 # initialization for range lists.
5064
5065 my $self = shift;
ffe43484 5066 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5067 my $initialization = shift;
5068 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5069
5070 # Replace the current range list with a new one of the same exact
5071 # type.
f998e60c
KW
5072 my $class = ref $range_list{$addr};
5073 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
5074 Initialize => $initialization);
5075 return;
5076
5077 }
5078
5079 sub header {
5080 # The header that is output for the table in the file it is written
5081 # in.
5082
5083 my $self = shift;
5084 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5085
5086 my $return = "";
5087 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5088 $return .= $HEADER;
99870f4d
KW
5089 return $return;
5090 }
5091
5092 sub write {
668b3bfc
KW
5093 # Write a representation of the table to its file. It calls several
5094 # functions furnished by sub-classes of this abstract base class to
5095 # handle non-normal ranges, to add stuff before the table, and at its
5096 # end.
99870f4d
KW
5097
5098 my $self = shift;
5099 my $tab_stops = shift; # The number of tab stops over to put any
5100 # comment.
5101 my $suppress_value = shift; # Optional, if the value associated with
5102 # a range equals this one, don't write
5103 # the range
5104 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5105
ffe43484 5106 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5107
5108 # Start with the header
668b3bfc 5109 my @HEADER = $self->header;
99870f4d
KW
5110
5111 # Then the comments
668b3bfc 5112 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
99870f4d
KW
5113 if $comment{$addr};
5114
668b3bfc
KW
5115 # Things discovered processing the main body of the document may
5116 # affect what gets output before it, therefore pre_body() isn't called
5117 # until after all other processing of the table is done.
99870f4d 5118
c4019d52
KW
5119 # The main body looks like a 'here' document. If annotating, get rid
5120 # of the comments before passing to the caller, as some callers, such
5121 # as charnames.pm, can't cope with them. (Outputting range counts
5122 # also introduces comments, but these don't show up in the tables that
5123 # can't cope with comments, and there aren't that many of them that
5124 # it's worth the extra real time to get rid of them).
668b3bfc 5125 my @OUT;
558712cf 5126 if ($annotate) {
c4019d52
KW
5127 # Use the line below in Perls that don't have /r
5128 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5129 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5130 } else {
5131 push @OUT, "return <<'END';\n";
5132 }
99870f4d
KW
5133
5134 if ($range_list{$addr}->is_empty) {
5135
5136 # This is a kludge for empty tables to silence a warning in
5137 # utf8.c, which can't really deal with empty tables, but it can
5138 # deal with a table that matches nothing, as the inverse of 'Any'
5139 # does.
67a53d68 5140 push @OUT, "!utf8::Any\n";
99870f4d 5141 }
c69a9c68
KW
5142 elsif ($self->name eq 'N'
5143
5144 # To save disk space and table cache space, avoid putting out
5145 # binary N tables, but instead create a file which just inverts
5146 # the Y table. Since the file will still exist and occupy a
5147 # certain number of blocks, might as well output the whole
5148 # thing if it all will fit in one block. The number of
5149 # ranges below is an approximate number for that.
06f26c45
KW
5150 && ($self->property->type == $BINARY
5151 || $self->property->type == $FORCED_BINARY)
c69a9c68
KW
5152 # && $self->property->tables == 2 Can't do this because the
5153 # non-binary properties, like NFDQC aren't specifiable
5154 # by the notation
5155 && $range_list{$addr}->ranges > 15
5156 && ! $annotate) # Under --annotate, want to see everything
5157 {
5158 push @OUT, "!utf8::" . $self->property->name . "\n";
5159 }
99870f4d
KW
5160 else {
5161 my $range_size_1 = $range_size_1{$addr};
558712cf
KW
5162 my $format; # Used only in $annotate option
5163 my $include_name; # Used only in $annotate option
c4019d52 5164
558712cf 5165 if ($annotate) {
c4019d52
KW
5166
5167 # if annotating each code point, must print 1 per line.
5168 # The variable could point to a subroutine, and we don't want
5169 # to lose that fact, so only set if not set already
5170 $range_size_1 = 1 if ! $range_size_1;
5171
5172 $format = $self->format;
5173
5174 # The name of the character is output only for tables that
5175 # don't already include the name in the output.
5176 my $property = $self->property;
5177 $include_name =
5178 ! ($property == $perl_charname
5179 || $property == main::property_ref('Unicode_1_Name')
5180 || $property == main::property_ref('Name')
5181 || $property == main::property_ref('Name_Alias')
5182 );
5183 }
99870f4d
KW
5184
5185 # Output each range as part of the here document.
5a2b5ddb 5186 RANGE:
99870f4d 5187 for my $set ($range_list{$addr}->ranges) {
5a2b5ddb
KW
5188 if ($set->type != 0) {
5189 $self->handle_special_range($set);
5190 next RANGE;
5191 }
99870f4d
KW
5192 my $start = $set->start;
5193 my $end = $set->end;
5194 my $value = $set->value;
5195
5196 # Don't output ranges whose value is the one to suppress
c4019d52
KW
5197 next RANGE if defined $suppress_value
5198 && $value eq $suppress_value;
99870f4d 5199
c4019d52
KW
5200 # If there is a range and doesn't need a single point range
5201 # output
5202 if ($start != $end && ! $range_size_1) {
bd9ebcfd
KW
5203 push @OUT, sprintf "%04X\t%04X", $start, $end;
5204 $OUT[-1] .= "\t$value" if $value ne "";
99870f4d
KW
5205
5206 # Add a comment with the size of the range, if requested.
5207 # Expand Tabs to make sure they all start in the same
5208 # column, and then unexpand to use mostly tabs.
0c07e538 5209 if (! $output_range_counts{$addr}) {
99870f4d
KW
5210 $OUT[-1] .= "\n";
5211 }
5212 else {
5213 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5214 my $count = main::clarify_number($end - $start + 1);
5215 use integer;
5216
5217 my $width = $tab_stops * 8 - 1;
5218 $OUT[-1] = sprintf("%-*s # [%s]\n",
5219 $width,
5220 $OUT[-1],
5221 $count);
5222 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5223 }
c4019d52
KW
5224 next RANGE;
5225 }
5226
5227 # Here to output a single code point per line
5228
5229 # If not to annotate, use the simple formats
558712cf 5230 if (! $annotate) {
c4019d52
KW
5231
5232 # Use any passed in subroutine to output.
5233 if (ref $range_size_1 eq 'CODE') {
5234 for my $i ($start .. $end) {
5235 push @OUT, &{$range_size_1}($i, $value);
5236 }
5237 }
5238 else {
5239
5240 # Here, caller is ok with default output.
5241 for (my $i = $start; $i <= $end; $i++) {
5242 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5243 }
5244 }
5245 next RANGE;
5246 }
5247
5248 # Here, wants annotation.
5249 for (my $i = $start; $i <= $end; $i++) {
5250
5251 # Get character information if don't have it already
5252 main::populate_char_info($i)
5253 if ! defined $viacode[$i];
5254 my $type = $annotate_char_type[$i];
5255
5256 # Figure out if should output the next code points as part
5257 # of a range or not. If this is not in an annotation
5258 # range, then won't output as a range, so returns $i.
5259 # Otherwise use the end of the annotation range, but no
5260 # further than the maximum possible end point of the loop.
5261 my $range_end = main::min($annotate_ranges->value_of($i)
5262 || $i,
5263 $end);
5264
5265 # Use a range if it is a range, and either is one of the
5266 # special annotation ranges, or the range is at most 3
5267 # long. This last case causes the algorithmically named
5268 # code points to be output individually in spans of at
5269 # most 3, as they are the ones whose $type is > 0.
5270 if ($range_end != $i
5271 && ( $type < 0 || $range_end - $i > 2))
5272 {
5273 # Here is to output a range. We don't allow a
5274 # caller-specified output format--just use the
5275 # standard one.
5276 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5277 $range_end,
5278 $value;
5279 my $range_name = $viacode[$i];
5280
5281 # For the code points which end in their hex value, we
5282 # eliminate that from the output annotation, and
5283 # capitalize only the first letter of each word.
5284 if ($type == $CP_IN_NAME) {
5285 my $hex = sprintf "%04X", $i;
5286 $range_name =~ s/-$hex$//;
5287 my @words = split " ", $range_name;
5288 for my $word (@words) {
5289 $word = ucfirst(lc($word)) if $word ne 'CJK';
5290 }
5291 $range_name = join " ", @words;
5292 }
5293 elsif ($type == $HANGUL_SYLLABLE) {
5294 $range_name = "Hangul Syllable";
5295 }
5296
5297 $OUT[-1] .= " $range_name" if $range_name;
5298
5299 # Include the number of code points in the range
5300 my $count = main::clarify_number($range_end - $i + 1);
5301 $OUT[-1] .= " [$count]\n";
5302
5303 # Skip to the end of the range
5304 $i = $range_end;
5305 }
5306 else { # Not in a range.
5307 my $comment = "";
5308
5309 # When outputting the names of each character, use
5310 # the character itself if printable
5311 $comment .= "'" . chr($i) . "' " if $printable[$i];
5312
5313 # To make it more readable, use a minimum indentation
5314 my $comment_indent;
5315
5316 # Determine the annotation
5317 if ($format eq $DECOMP_STRING_FORMAT) {
5318
5319 # This is very specialized, with the type of
5320 # decomposition beginning the line enclosed in
5321 # <...>, and the code points that the code point
5322 # decomposes to separated by blanks. Create two
5323 # strings, one of the printable characters, and
5324 # one of their official names.
5325 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5326 my $tostr = "";
5327 my $to_name = "";
5328 my $to_chr = "";
5329 foreach my $to (split " ", $map) {
5330 $to = CORE::hex $to;
5331 $to_name .= " + " if $to_name;
5332 $to_chr .= chr($to);
5333 main::populate_char_info($to)
5334 if ! defined $viacode[$to];
5335 $to_name .= $viacode[$to];
5336 }
5337
5338 $comment .=
5339 "=> '$to_chr'; $viacode[$i] => $to_name";
5340 $comment_indent = 25; # Determined by experiment
5341 }
5342 else {
5343
5344 # Assume that any table that has hex format is a
5345 # mapping of one code point to another.
5346 if ($format eq $HEX_FORMAT) {
5347 my $decimal_value = CORE::hex $value;
5348 main::populate_char_info($decimal_value)
5349 if ! defined $viacode[$decimal_value];
5350 $comment .= "=> '"
5351 . chr($decimal_value)
5352 . "'; " if $printable[$decimal_value];
5353 }
5354 $comment .= $viacode[$i] if $include_name
5355 && $viacode[$i];
5356 if ($format eq $HEX_FORMAT) {
5357 my $decimal_value = CORE::hex $value;
5358 $comment .= " => $viacode[$decimal_value]"
5359 if $viacode[$decimal_value];
5360 }
5361
5362 # If including the name, no need to indent, as the
5363 # name will already be way across the line.
5364 $comment_indent = ($include_name) ? 0 : 60;
5365 }
5366
5367 # Use any passed in routine to output the base part of
5368 # the line.
5369 if (ref $range_size_1 eq 'CODE') {
5370 my $base_part = &{$range_size_1}($i, $value);
5371 chomp $base_part;
5372 push @OUT, $base_part;
5373 }
5374 else {
5375 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5376 }
5377
5378 # And add the annotation.
5379 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5380 $OUT[-1],
5381 $comment if $comment;
5382 $OUT[-1] .= "\n";
5383 }
99870f4d
KW
5384 }
5385 } # End of loop through all the table's ranges
5386 }
5387
5388 # Add anything that goes after the main body, but within the here
5389 # document,
5390 my $append_to_body = $self->append_to_body;
5391 push @OUT, $append_to_body if $append_to_body;
5392
5393 # And finish the here document.
5394 push @OUT, "END\n";
5395
668b3bfc
KW
5396 # Done with the main portion of the body. Can now figure out what
5397 # should appear before it in the file.
5398 my $pre_body = $self->pre_body;
5399 push @HEADER, $pre_body, "\n" if $pre_body;
668b3bfc 5400
6b0079b5
KW
5401 # All these files should have a .pl suffix added to them.
5402 my @file_with_pl = @{$file_path{$addr}};
5403 $file_with_pl[-1] .= '.pl';
99870f4d 5404
6b0079b5 5405 main::write(\@file_with_pl,
558712cf 5406 $annotate, # utf8 iff annotating
9218f1cf
KW
5407 \@HEADER,
5408 \@OUT);
99870f4d
KW
5409 return;
5410 }
5411
5412 sub set_status { # Set the table's status
5413 my $self = shift;
5414 my $status = shift; # The status enum value
5415 my $info = shift; # Any message associated with it.
5416 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5417
ffe43484 5418 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5419
5420 $status{$addr} = $status;
5421 $status_info{$addr} = $info;
5422 return;
5423 }
5424
301ba948
KW
5425 sub set_fate { # Set the fate of a table
5426 my $self = shift;
5427 my $fate = shift;
5428 my $reason = shift;
5429 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5430
5431 my $addr = do { no overloading; pack 'J', $self; };
5432
5433 return if $fate{$addr} == $fate; # If no-op
5434
395dfc19
KW
5435 # Can only change the ordinary fate, except if going to $MAP_PROXIED
5436 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
301ba948
KW
5437
5438 $fate{$addr} = $fate;
5439
395dfc19
KW
5440 # Don't document anything to do with a non-normal fated table
5441 if ($fate != $ORDINARY) {
fd1e3e84 5442 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
395dfc19 5443 foreach my $alias ($self->aliases) {
fd1e3e84 5444 $alias->set_ucd($put_in_pod);
395dfc19
KW
5445
5446 # MAP_PROXIED doesn't affect the match tables
5447 next if $fate == $MAP_PROXIED;
fd1e3e84 5448 $alias->set_make_re_pod_entry($put_in_pod);
395dfc19
KW
5449 }
5450 }
5451
301ba948
KW
5452 # Save the reason for suppression for output
5453 if ($fate == $SUPPRESSED && defined $reason) {
5454 $why_suppressed{$complete_name{$addr}} = $reason;
5455 }
5456
5457 return;
5458 }
5459
99870f4d
KW
5460 sub lock {
5461 # Don't allow changes to the table from now on. This stores a stack
5462 # trace of where it was called, so that later attempts to modify it
5463 # can immediately show where it got locked.
5464
5465 my $self = shift;
5466 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5467
ffe43484 5468 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5469
5470 $locked{$addr} = "";
5471
5472 my $line = (caller(0))[2];
5473 my $i = 1;
5474
5475 # Accumulate the stack trace
5476 while (1) {
5477 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5478
5479 last unless defined $caller;
5480
5481 $locked{$addr} .= " called from $caller() at line $line\n";
5482 $line = $caller_line;
5483 }
5484 $locked{$addr} .= " called from main at line $line\n";
5485
5486 return;
5487 }
5488
5489 sub carp_if_locked {
5490 # Return whether a table is locked or not, and, by the way, complain
5491 # if is locked
5492
5493 my $self = shift;
5494 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5495
ffe43484 5496 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5497
5498 return 0 if ! $locked{$addr};
5499 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5500 return 1;
5501 }
5502
5503 sub set_file_path { # Set the final directory path for this table
5504 my $self = shift;
5505 # Rest of parameters passed on
5506
f998e60c 5507 no overloading;
051df77b 5508 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
5509 return
5510 }
5511
5512 # Accessors for the range list stored in this table. First for
5513 # unconditional
ea25a9b2 5514 for my $sub (qw(
2f7a8815 5515 containing_range
99870f4d
KW
5516 contains
5517 count
5518 each_range
5519 hash
5520 is_empty
09aba7e4 5521 matches_identically_to
99870f4d
KW
5522 max
5523 min
5524 range_count
5525 reset_each_range
0a9dbafc 5526 type_of
99870f4d 5527 value_of
ea25a9b2 5528 ))
99870f4d
KW
5529 {
5530 no strict "refs";
5531 *$sub = sub {
5532 use strict "refs";
5533 my $self = shift;
ec40ee88 5534 return $self->_range_list->$sub(@_);
99870f4d
KW
5535 }
5536 }
5537
5538 # Then for ones that should fail if locked
ea25a9b2 5539 for my $sub (qw(
99870f4d 5540 delete_range
ea25a9b2 5541 ))
99870f4d
KW
5542 {
5543 no strict "refs";
5544 *$sub = sub {
5545 use strict "refs";
5546 my $self = shift;
5547
5548 return if $self->carp_if_locked;
f998e60c 5549 no overloading;
ec40ee88 5550 return $self->_range_list->$sub(@_);
99870f4d
KW
5551 }
5552 }
5553
5554} # End closure
5555
5556package Map_Table;
5557use base '_Base_Table';
5558
5559# A Map Table is a table that contains the mappings from code points to
5560# values. There are two weird cases:
5561# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5562# are written in the table's file at the end of the table nonetheless. It
5563# requires specially constructed code to handle these; utf8.c can not read
5564# these in, so they should not go in $map_directory. As of this writing,
5565# the only case that these happen is for named sequences used in
5566# charnames.pm. But this code doesn't enforce any syntax on these, so
5567# something else could come along that uses it.
5568# 2) Specials are anything that doesn't fit syntactically into the body of the
5569# table. The ranges for these have a map type of non-zero. The code below
5570# knows about and handles each possible type. In most cases, these are
5571# written as part of the header.
5572#
5573# A map table deliberately can't be manipulated at will unlike match tables.
5574# This is because of the ambiguities having to do with what to do with
5575# overlapping code points. And there just isn't a need for those things;
5576# what one wants to do is just query, add, replace, or delete mappings, plus
5577# write the final result.
5578# However, there is a method to get the list of possible ranges that aren't in
5579# this table to use for defaulting missing code point mappings. And,
5580# map_add_or_replace_non_nulls() does allow one to add another table to this
5581# one, but it is clearly very specialized, and defined that the other's
5582# non-null values replace this one's if there is any overlap.
5583
5584sub trace { return main::trace(@_); }
5585
5586{ # Closure
5587
5588 main::setup_package();
5589
5590 my %default_map;
5591 # Many input files omit some entries; this gives what the mapping for the
5592 # missing entries should be
5593 main::set_access('default_map', \%default_map, 'r');
5594
5595 my %anomalous_entries;
5596 # Things that go in the body of the table which don't fit the normal
5597 # scheme of things, like having a range. Not much can be done with these
5598 # once there except to output them. This was created to handle named
5599 # sequences.
5600 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5601 main::set_access('anomalous_entries', # Append singular, read plural
5602 \%anomalous_entries,
5603 'readable_array');
5604
99870f4d 5605 my %to_output_map;
8572ace0 5606 # Enum as to whether or not to write out this map table:
c12f2655 5607 # 0 don't output
8572ace0
KW
5608 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5609 # it should not be removed nor its format changed. This
5610 # is done for those files that have traditionally been
5611 # output.
5612 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5613 # with this file
99870f4d
KW
5614 main::set_access('to_output_map', \%to_output_map, 's');
5615
5616
5617 sub new {
5618 my $class = shift;
5619 my $name = shift;
5620
5621 my %args = @_;
5622
5623 # Optional initialization data for the table.
5624 my $initialize = delete $args{'Initialize'};
5625
99870f4d 5626 my $default_map = delete $args{'Default_Map'};
99870f4d 5627 my $property = delete $args{'_Property'};
23e33b60 5628 my $full_name = delete $args{'Full_Name'};
20863809 5629
99870f4d
KW
5630 # Rest of parameters passed on
5631
5632 my $range_list = Range_Map->new(Owner => $property);
5633
5634 my $self = $class->SUPER::new(
5635 Name => $name,
23e33b60
KW
5636 Complete_Name => $full_name,
5637 Full_Name => $full_name,
99870f4d
KW
5638 _Property => $property,
5639 _Range_List => $range_list,
5640 %args);
5641
ffe43484 5642 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5643
5644 $anomalous_entries{$addr} = [];
99870f4d 5645 $default_map{$addr} = $default_map;
99870f4d
KW
5646
5647 $self->initialize($initialize) if defined $initialize;
5648
5649 return $self;
5650 }
5651
5652 use overload
5653 fallback => 0,
5654 qw("") => "_operator_stringify",
5655 ;
5656
5657 sub _operator_stringify {
5658 my $self = shift;
5659
5660 my $name = $self->property->full_name;
5661 $name = '""' if $name eq "";
5662 return "Map table for Property '$name'";
5663 }
5664
99870f4d
KW
5665 sub add_alias {
5666 # Add a synonym for this table (which means the property itself)
5667 my $self = shift;
5668 my $name = shift;
5669 # Rest of parameters passed on.
5670
5671 $self->SUPER::add_alias($name, $self->property, @_);
5672 return;
5673 }
5674
5675 sub add_map {
5676 # Add a range of code points to the list of specially-handled code
5677 # points. $MULTI_CP is assumed if the type of special is not passed
5678 # in.
5679
5680 my $self = shift;
5681 my $lower = shift;
5682 my $upper = shift;
5683 my $string = shift;
5684 my %args = @_;
5685
5686 my $type = delete $args{'Type'} || 0;
5687 # Rest of parameters passed on
5688
5689 # Can't change the table if locked.
5690 return if $self->carp_if_locked;
5691
ffe43484 5692 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 5693
99870f4d
KW
5694 $self->_range_list->add_map($lower, $upper,
5695 $string,
5696 @_,
5697 Type => $type);
5698 return;
5699 }
5700
5701 sub append_to_body {
5702 # Adds to the written HERE document of the table's body any anomalous
5703 # entries in the table..
5704
5705 my $self = shift;
5706 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5707
ffe43484 5708 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5709
5710 return "" unless @{$anomalous_entries{$addr}};
5711 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5712 }
5713
5714 sub map_add_or_replace_non_nulls {
5715 # This adds the mappings in the table $other to $self. Non-null
5716 # mappings from $other override those in $self. It essentially merges
5717 # the two tables, with the second having priority except for null
5718 # mappings.
5719
5720 my $self = shift;
5721 my $other = shift;
5722 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5723
5724 return if $self->carp_if_locked;
5725
5726 if (! $other->isa(__PACKAGE__)) {
5727 Carp::my_carp_bug("$other should be a "
5728 . __PACKAGE__
5729 . ". Not a '"
5730 . ref($other)
5731 . "'. Not added;");
5732 return;
5733 }
5734
ffe43484
NC
5735 my $addr = do { no overloading; pack 'J', $self; };
5736 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5737
5738 local $to_trace = 0 if main::DEBUG;
5739
5740 my $self_range_list = $self->_range_list;
5741 my $other_range_list = $other->_range_list;
5742 foreach my $range ($other_range_list->ranges) {
5743 my $value = $range->value;
5744 next if $value eq "";
5745 $self_range_list->_add_delete('+',
5746 $range->start,
5747 $range->end,
5748 $value,
5749 Type => $range->type,
5750 Replace => $UNCONDITIONALLY);
5751 }
5752
99870f4d
KW
5753 return;
5754 }
5755
5756 sub set_default_map {
5757 # Define what code points that are missing from the input files should
5758 # map to
5759
5760 my $self = shift;
5761 my $map = shift;
5762 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5763
ffe43484 5764 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5765
5766 # Convert the input to the standard equivalent, if any (won't have any
5767 # for $STRING properties)
5768 my $standard = $self->_find_table_from_alias->{$map};
5769 $map = $standard->name if defined $standard;
5770
5771 # Warn if there already is a non-equivalent default map for this
5772 # property. Note that a default map can be a ref, which means that
5773 # what it actually means is delayed until later in the program, and it
5774 # IS permissible to override it here without a message.
5775 my $default_map = $default_map{$addr};
5776 if (defined $default_map
5777 && ! ref($default_map)
5778 && $default_map ne $map
5779 && main::Standardize($map) ne $default_map)
5780 {
5781 my $property = $self->property;
5782 my $map_table = $property->table($map);
5783 my $default_table = $property->table($default_map);
5784 if (defined $map_table
5785 && defined $default_table
5786 && $map_table != $default_table)
5787 {
5788 Carp::my_carp("Changing the default mapping for "
5789 . $property
5790 . " from $default_map to $map'");
5791 }
5792 }
5793
5794 $default_map{$addr} = $map;
5795
5796 # Don't also create any missing table for this map at this point,
5797 # because if we did, it could get done before the main table add is
5798 # done for PropValueAliases.txt; instead the caller will have to make
5799 # sure it exists, if desired.
5800 return;
5801 }
5802
5803 sub to_output_map {
5804 # Returns boolean: should we write this map table?
5805
5806 my $self = shift;
5807 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5808
ffe43484 5809 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5810
5811 # If overridden, use that
5812 return $to_output_map{$addr} if defined $to_output_map{$addr};
5813
5814 my $full_name = $self->full_name;
fcf1973c
KW
5815 return $global_to_output_map{$full_name}
5816 if defined $global_to_output_map{$full_name};
99870f4d 5817
20863809 5818 # If table says to output, do so; if says to suppress it, do so.
301ba948
KW
5819 my $fate = $self->fate;
5820 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
8572ace0 5821 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
395dfc19 5822 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
99870f4d
KW
5823
5824 my $type = $self->property->type;
5825
5826 # Don't want to output binary map tables even for debugging.
5827 return 0 if $type == $BINARY;
5828
5829 # But do want to output string ones.
8572ace0 5830 return $EXTERNAL_MAP if $type == $STRING;
99870f4d 5831
8572ace0
KW
5832 # Otherwise is an $ENUM, do output it, for Perl's purposes
5833 return $INTERNAL_MAP;
99870f4d
KW
5834 }
5835
5836 sub inverse_list {
5837 # Returns a Range_List that is gaps of the current table. That is,
5838 # the inversion
5839
5840 my $self = shift;
5841 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5842
5843 my $current = Range_List->new(Initialize => $self->_range_list,
5844 Owner => $self->property);
5845 return ~ $current;
5846 }
5847
8572ace0
KW
5848 sub header {
5849 my $self = shift;
5850 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5851
5852 my $return = $self->SUPER::header();
5853
ae92a9ae
KW
5854 if ($self->to_output_map == $INTERNAL_MAP) {
5855 $return .= $INTERNAL_ONLY_HEADER;
5856 }
5857 else {
5858 my $property_name = $self->property->full_name;
5859 $return .= <<END;
5860
5861# !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
5862
5863# This file is for internal use by core Perl only. It is retained for
5864# backwards compatibility with applications that may have come to rely on it,
5865# but its format and even its name or existence are subject to change without
5866# notice in a future Perl version. Don't use it directly. Instead, its
5867# contents are now retrievable through a stable API in the Unicode::UCD
5868# module: Unicode::UCD::prop_invmap('$property_name').
5869END
5870 }
8572ace0
KW
5871 return $return;
5872 }
5873
99870f4d
KW
5874 sub set_final_comment {
5875 # Just before output, create the comment that heads the file
5876 # containing this table.
5877
bd9ebcfd
KW
5878 return unless $debugging_build;
5879
99870f4d
KW
5880 my $self = shift;
5881 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5882
5883 # No sense generating a comment if aren't going to write it out.
5884 return if ! $self->to_output_map;
5885
ffe43484 5886 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5887
5888 my $property = $self->property;
5889
5890 # Get all the possible names for this property. Don't use any that
5891 # aren't ok for use in a file name, etc. This is perhaps causing that
5892 # flag to do double duty, and may have to be changed in the future to
5893 # have our own flag for just this purpose; but it works now to exclude
5894 # Perl generated synonyms from the lists for properties, where the
5895 # name is always the proper Unicode one.
0eac1e20 5896 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
99870f4d
KW
5897
5898 my $count = $self->count;
5899 my $default_map = $default_map{$addr};
5900
5901 # The ranges that map to the default aren't output, so subtract that
5902 # to get those actually output. A property with matching tables
5903 # already has the information calculated.
5904 if ($property->type != $STRING) {
5905 $count -= $property->table($default_map)->count;
5906 }
5907 elsif (defined $default_map) {
5908
5909 # But for $STRING properties, must calculate now. Subtract the
5910 # count from each range that maps to the default.
5911 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5912 if ($range->value eq $default_map) {
5913 $count -= $range->end +1 - $range->start;
5914 }
5915 }
5916
5917 }
5918
5919 # Get a string version of $count with underscores in large numbers,
5920 # for clarity.
5921 my $string_count = main::clarify_number($count);
5922
5923 my $code_points = ($count == 1)
5924 ? 'single code point'
5925 : "$string_count code points";
5926
5927 my $mapping;
5928 my $these_mappings;
5929 my $are;
5930 if (@property_aliases <= 1) {
5931 $mapping = 'mapping';
5932 $these_mappings = 'this mapping';
5933 $are = 'is'
5934 }
5935 else {
5936 $mapping = 'synonymous mappings';
5937 $these_mappings = 'these mappings';
5938 $are = 'are'
5939 }
5940 my $cp;
5941 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5942 $cp = "any code point in Unicode Version $string_version";
5943 }
5944 else {
5945 my $map_to;
5946 if ($default_map eq "") {
5947 $map_to = 'the null string';
5948 }
5949 elsif ($default_map eq $CODE_POINT) {
5950 $map_to = "itself";
5951 }
5952 else {
5953 $map_to = "'$default_map'";
5954 }
5955 if ($count == 1) {
5956 $cp = "the single code point";
5957 }
5958 else {
5959 $cp = "one of the $code_points";
5960 }
5961 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5962 }
5963
5964 my $comment = "";
5965
5966 my $status = $self->status;
5967 if ($status) {
5968 my $warn = uc $status_past_participles{$status};
5969 $comment .= <<END;
5970
5971!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5972 All property or property=value combinations contained in this file are $warn.
5973 See $unicode_reference_url for what this means.
5974
5975END
5976 }
5977 $comment .= "This file returns the $mapping:\n";
5978
5979 for my $i (0 .. @property_aliases - 1) {
5980 $comment .= sprintf("%-8s%s\n",
5981 " ",
5982 $property_aliases[$i]->name . '(cp)'
5983 );
5984 }
83b7c87d
KW
5985 my $full_name = $self->property->full_name;
5986 $comment .= "\nwhere 'cp' is $cp. Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
99870f4d
KW
5987
5988 # And append any commentary already set from the actual property.
5989 $comment .= "\n\n" . $self->comment if $self->comment;
5990 if ($self->description) {
5991 $comment .= "\n\n" . join " ", $self->description;
5992 }
5993 if ($self->note) {
5994 $comment .= "\n\n" . join " ", $self->note;
5995 }
5996 $comment .= "\n";
5997
5998 if (! $self->perl_extension) {
5999 $comment .= <<END;
6000
6001For information about what this property really means, see:
6002$unicode_reference_url
6003END
6004 }
6005
6006 if ($count) { # Format differs for empty table
6007 $comment.= "\nThe format of the ";
6008 if ($self->range_size_1) {
6009 $comment.= <<END;
6010main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6011is in hex; MAPPING is what CODE_POINT maps to.
6012END
6013 }
6014 else {
6015
6016 # There are tables which end up only having one element per
6017 # range, but it is not worth keeping track of for making just
6018 # this comment a little better.
6019 $comment.= <<END;
6020non-comment portions of the main body of lines of this file is:
6021START\\tSTOP\\tMAPPING where START is the starting code point of the
6022range, in hex; STOP is the ending point, or if omitted, the range has just one
6023code point; MAPPING is what each code point between START and STOP maps to.
6024END
0c07e538 6025 if ($self->output_range_counts) {
99870f4d
KW
6026 $comment .= <<END;
6027Numbers in comments in [brackets] indicate how many code points are in the
6028range (omitted when the range is a single code point or if the mapping is to
6029the null string).
6030END
6031 }
6032 }
6033 }
6034 $self->set_comment(main::join_lines($comment));
6035 return;
6036 }
6037
6038 my %swash_keys; # Makes sure don't duplicate swash names.
6039
668b3bfc
KW
6040 # The remaining variables are temporaries used while writing each table,
6041 # to output special ranges.
668b3bfc
KW
6042 my @multi_code_point_maps; # Map is to more than one code point.
6043
668b3bfc
KW
6044 sub handle_special_range {
6045 # Called in the middle of write when it finds a range it doesn't know
6046 # how to handle.
6047
6048 my $self = shift;
6049 my $range = shift;
6050 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6051
6052 my $addr = do { no overloading; pack 'J', $self; };
6053
6054 my $type = $range->type;
6055
6056 my $low = $range->start;
6057 my $high = $range->end;
6058 my $map = $range->value;
6059
6060 # No need to output the range if it maps to the default.
6061 return if $map eq $default_map{$addr};
6062
bb1dd3da
KW
6063 my $property = $self->property;
6064
668b3bfc
KW
6065 # Switch based on the map type...
6066 if ($type == $HANGUL_SYLLABLE) {
6067
6068 # These are entirely algorithmically determinable based on
6069 # some constants furnished by Unicode; for now, just set a
6070 # flag to indicate that have them. After everything is figured
bb1dd3da
KW
6071 # out, we will output the code that does the algorithm. (Don't
6072 # output them if not needed because we are suppressing this
6073 # property.)
6074 $has_hangul_syllables = 1 if $property->to_output_map;
668b3bfc
KW
6075 }
6076 elsif ($type == $CP_IN_NAME) {
6077
bb1dd3da 6078 # Code points whose name ends in their code point are also
668b3bfc
KW
6079 # algorithmically determinable, but need information about the map
6080 # to do so. Both the map and its inverse are stored in data
bb1dd3da
KW
6081 # structures output in the file. They are stored in the mean time
6082 # in global lists The lists will be written out later into Name.pm,
6083 # which is created only if needed. In order to prevent duplicates
6084 # in the list, only add to them for one property, should multiple
6085 # ones need them.
6086 if ($needing_code_points_ending_in_code_point == 0) {
6087 $needing_code_points_ending_in_code_point = $property;
6088 }
6089 if ($property == $needing_code_points_ending_in_code_point) {
6c1bafed
KW
6090 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6091 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6092
6093 my $squeezed = $map =~ s/[-\s]+//gr;
6094 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6095 $low;
6096 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6097 $high;
6098
6099 push @code_points_ending_in_code_point, { low => $low,
6100 high => $high,
6101 name => $map
6102 };
bb1dd3da 6103 }
668b3bfc
KW
6104 }
6105 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6106
6107 # Multi-code point maps and null string maps have an entry
6108 # for each code point in the range. They use the same
6109 # output format.
6110 for my $code_point ($low .. $high) {
6111
c12f2655
KW
6112 # The pack() below can't cope with surrogates. XXX This may
6113 # no longer be true
668b3bfc 6114 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
98dc9551 6115 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
668b3bfc
KW
6116 next;
6117 }
6118
6119 # Generate the hash entries for these in the form that
6120 # utf8.c understands.
6121 my $tostr = "";
6122 my $to_name = "";
6123 my $to_chr = "";
6124 foreach my $to (split " ", $map) {
6125 if ($to !~ /^$code_point_re$/) {
6126 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6127 next;
6128 }
6129 $tostr .= sprintf "\\x{%s}", $to;
6130 $to = CORE::hex $to;
558712cf 6131 if ($annotate) {
c4019d52
KW
6132 $to_name .= " + " if $to_name;
6133 $to_chr .= chr($to);
6134 main::populate_char_info($to)
6135 if ! defined $viacode[$to];
6136 $to_name .= $viacode[$to];
6137 }
668b3bfc
KW
6138 }
6139
6140 # I (khw) have never waded through this line to
6141 # understand it well enough to comment it.
6142 my $utf8 = sprintf(qq["%s" => "$tostr",],
6143 join("", map { sprintf "\\x%02X", $_ }
6144 unpack("U0C*", pack("U", $code_point))));
6145
6146 # Add a comment so that a human reader can more easily
6147 # see what's going on.
6148 push @multi_code_point_maps,
6149 sprintf("%-45s # U+%04X", $utf8, $code_point);
558712cf 6150 if (! $annotate) {
c4019d52
KW
6151 $multi_code_point_maps[-1] .= " => $map";
6152 }
6153 else {
6154 main::populate_char_info($code_point)
6155 if ! defined $viacode[$code_point];
6156 $multi_code_point_maps[-1] .= " '"
6157 . chr($code_point)
6158 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6159 }
668b3bfc
KW
6160 }
6161 }
6162 else {
6163 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
6164 }
6165
6166 return;
6167 }
6168
99870f4d
KW
6169 sub pre_body {
6170 # Returns the string that should be output in the file before the main
668b3bfc
KW
6171 # body of this table. It isn't called until the main body is
6172 # calculated, saving a pass. The string includes some hash entries
6173 # identifying the format of the body, and what the single value should
6174 # be for all ranges missing from it. It also includes any code points
6175 # which have map_types that don't go in the main table.
99870f4d
KW
6176
6177 my $self = shift;
6178 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6179
ffe43484 6180 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6181
6182 my $name = $self->property->swash_name;
6183
19f751d2
KW
6184 # Currently there is nothing in the pre_body unless a swash is being
6185 # generated.
6186 return unless defined $name;
6187
99870f4d
KW
6188 if (defined $swash_keys{$name}) {
6189 Carp::my_carp(join_lines(<<END
6190Already created a swash name '$name' for $swash_keys{$name}. This means that
6191the same name desired for $self shouldn't be used. Bad News. This must be
6192fixed before production use, but proceeding anyway
6193END
6194 ));
6195 }
6196 $swash_keys{$name} = "$self";
6197
99870f4d 6198 my $pre_body = "";
99870f4d 6199
668b3bfc
KW
6200 # Here we assume we were called after have gone through the whole
6201 # file. If we actually generated anything for each map type, add its
6202 # respective header and trailer
ec2f0128 6203 my $specials_name = "";
668b3bfc 6204 if (@multi_code_point_maps) {
ec2f0128 6205 $specials_name = "utf8::ToSpec$name";
668b3bfc 6206 $pre_body .= <<END;
99870f4d
KW
6207
6208# Some code points require special handling because their mappings are each to
6209# multiple code points. These do not appear in the main body, but are defined
6210# in the hash below.
6211
76591e2b
KW
6212# Each key is the string of N bytes that together make up the UTF-8 encoding
6213# for the code point. (i.e. the same as looking at the code point's UTF-8
6214# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
ec2f0128 6215\%$specials_name = (
99870f4d 6216END
668b3bfc
KW
6217 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6218 }
99870f4d 6219
668b3bfc
KW
6220 my $format = $self->format;
6221
6222 my $return = <<END;
6223# The name this swash is to be known by, with the format of the mappings in
6224# the main body of the table, and what all code points missing from this file
6225# map to.
6226\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6227END
ec2f0128
KW
6228 if ($specials_name) {
6229 $return .= <<END;
6230\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6231END
6232 }
668b3bfc
KW
6233 my $default_map = $default_map{$addr};
6234 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6235
6236 if ($default_map eq $CODE_POINT) {
6237 $return .= ' # code point maps to itself';
6238 }
6239 elsif ($default_map eq "") {
6240 $return .= ' # code point maps to the null string';
6241 }
6242 $return .= "\n";
6243
6244 $return .= $pre_body;
6245
6246 return $return;
6247 }
6248
6249 sub write {
6250 # Write the table to the file.
6251
6252 my $self = shift;
6253 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6254
6255 my $addr = do { no overloading; pack 'J', $self; };
6256
6257 # Clear the temporaries
668b3bfc 6258 undef @multi_code_point_maps;
99870f4d
KW
6259
6260 # Calculate the format of the table if not already done.
f5817e0a 6261 my $format = $self->format;
668b3bfc
KW
6262 my $type = $self->property->type;
6263 my $default_map = $self->default_map;
99870f4d
KW
6264 if (! defined $format) {
6265 if ($type == $BINARY) {
6266
6267 # Don't bother checking the values, because we elsewhere
6268 # verify that a binary table has only 2 values.
6269 $format = $BINARY_FORMAT;
6270 }
6271 else {
6272 my @ranges = $self->_range_list->ranges;
6273
6274 # default an empty table based on its type and default map
6275 if (! @ranges) {
6276
6277 # But it turns out that the only one we can say is a
6278 # non-string (besides binary, handled above) is when the
6279 # table is a string and the default map is to a code point
6280 if ($type == $STRING && $default_map eq $CODE_POINT) {
6281 $format = $HEX_FORMAT;
6282 }
6283 else {
6284 $format = $STRING_FORMAT;
6285 }
6286 }
6287 else {
6288
6289 # Start with the most restrictive format, and as we find
6290 # something that doesn't fit with that, change to the next
6291 # most restrictive, and so on.
6292 $format = $DECIMAL_FORMAT;
6293 foreach my $range (@ranges) {
668b3bfc
KW
6294 next if $range->type != 0; # Non-normal ranges don't
6295 # affect the main body
99870f4d
KW
6296 my $map = $range->value;
6297 if ($map ne $default_map) {
6298 last if $format eq $STRING_FORMAT; # already at
6299 # least
6300 # restrictive
6301 $format = $INTEGER_FORMAT
6302 if $format eq $DECIMAL_FORMAT
6303 && $map !~ / ^ [0-9] $ /x;
6304 $format = $FLOAT_FORMAT
6305 if $format eq $INTEGER_FORMAT
6306 && $map !~ / ^ -? [0-9]+ $ /x;
6307 $format = $RATIONAL_FORMAT
6308 if $format eq $FLOAT_FORMAT
6309 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6310 $format = $HEX_FORMAT
6311 if $format eq $RATIONAL_FORMAT
6312 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6313 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6314 && $map =~ /[^0-9A-F]/;
6315 }
6316 }
6317 }
6318 }
6319 } # end of calculating format
6320
668b3bfc 6321 if ($default_map eq $CODE_POINT
99870f4d 6322 && $format ne $HEX_FORMAT
668b3bfc
KW
6323 && ! defined $self->format) # manual settings are always
6324 # considered ok
99870f4d
KW
6325 {
6326 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6327 }
99870f4d 6328
668b3bfc 6329 $self->_set_format($format);
99870f4d 6330
0911a63d
KW
6331 # Core Perl has a different definition of mapping ranges than we do,
6332 # that is applicable mainly to mapping code points, so for tables
6333 # where it is possible that core Perl could be used to read it,
6334 # make it range size 1 to prevent possible confusion
6335 $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6336
99870f4d
KW
6337 return $self->SUPER::write(
6338 ($self->property == $block)
6339 ? 7 # block file needs more tab stops
6340 : 3,
668b3bfc 6341 $default_map); # don't write defaulteds
99870f4d
KW
6342 }
6343
6344 # Accessors for the underlying list that should fail if locked.
ea25a9b2 6345 for my $sub (qw(
99870f4d 6346 add_duplicate
ea25a9b2 6347 ))
99870f4d
KW
6348 {
6349 no strict "refs";
6350 *$sub = sub {
6351 use strict "refs";
6352 my $self = shift;
6353
6354 return if $self->carp_if_locked;
6355 return $self->_range_list->$sub(@_);
6356 }
6357 }
6358} # End closure for Map_Table
6359
6360package Match_Table;
6361use base '_Base_Table';
6362
6363# A Match table is one which is a list of all the code points that have
6364# the same property and property value, for use in \p{property=value}
6365# constructs in regular expressions. It adds very little data to the base
6366# structure, but many methods, as these lists can be combined in many ways to
6367# form new ones.
6368# There are only a few concepts added:
6369# 1) Equivalents and Relatedness.
6370# Two tables can match the identical code points, but have different names.
6371# This always happens when there is a perl single form extension
6372# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6373# tables are set to be related, with the Perl extension being a child, and
6374# the Unicode property being the parent.
6375#
6376# It may be that two tables match the identical code points and we don't
6377# know if they are related or not. This happens most frequently when the
6378# Block and Script properties have the exact range. But note that a
6379# revision to Unicode could add new code points to the script, which would
6380# now have to be in a different block (as the block was filled, or there
6381# would have been 'Unknown' script code points in it and they wouldn't have
6382# been identical). So we can't rely on any two properties from Unicode
6383# always matching the same code points from release to release, and thus
6384# these tables are considered coincidentally equivalent--not related. When
6385# two tables are unrelated but equivalent, one is arbitrarily chosen as the
6386# 'leader', and the others are 'equivalents'. This concept is useful
6387# to minimize the number of tables written out. Only one file is used for
6388# any identical set of code points, with entries in Heavy.pl mapping all
6389# the involved tables to it.
6390#
6391# Related tables will always be identical; we set them up to be so. Thus
6392# if the Unicode one is deprecated, the Perl one will be too. Not so for
6393# unrelated tables. Relatedness makes generating the documentation easier.
6394#
c12f2655
KW
6395# 2) Complement.
6396# Like equivalents, two tables may be the inverses of each other, the
6397# intersection between them is null, and the union is every Unicode code
6398# point. The two tables that occupy a binary property are necessarily like
6399# this. By specifying one table as the complement of another, we can avoid
6400# storing it on disk (using the other table and performing a fast
6401# transform), and some memory and calculations.
6402#
6403# 3) Conflicting. It may be that there will eventually be name clashes, with
99870f4d
KW
6404# the same name meaning different things. For a while, there actually were
6405# conflicts, but they have so far been resolved by changing Perl's or
6406# Unicode's definitions to match the other, but when this code was written,
6407# it wasn't clear that that was what was going to happen. (Unicode changed
6408# because of protests during their beta period.) Name clashes are warned
6409# about during compilation, and the documentation. The generated tables
6410# are sane, free of name clashes, because the code suppresses the Perl
6411# version. But manual intervention to decide what the actual behavior
6412# should be may be required should this happen. The introductory comments
6413# have more to say about this.
6414
6415sub standardize { return main::standardize($_[0]); }
6416sub trace { return main::trace(@_); }
6417
6418
6419{ # Closure
6420
6421 main::setup_package();
6422
6423 my %leader;
6424 # The leader table of this one; initially $self.
6425 main::set_access('leader', \%leader, 'r');
6426
6427 my %equivalents;
6428 # An array of any tables that have this one as their leader
6429 main::set_access('equivalents', \%equivalents, 'readable_array');
6430
6431 my %parent;
6432 # The parent table to this one, initially $self. This allows us to
c12f2655
KW
6433 # distinguish between equivalent tables that are related (for which this
6434 # is set to), and those which may not be, but share the same output file
6435 # because they match the exact same set of code points in the current
6436 # Unicode release.
99870f4d
KW
6437 main::set_access('parent', \%parent, 'r');
6438
6439 my %children;
6440 # An array of any tables that have this one as their parent
6441 main::set_access('children', \%children, 'readable_array');
6442
6443 my %conflicting;
6444 # Array of any tables that would have the same name as this one with
6445 # a different meaning. This is used for the generated documentation.
6446 main::set_access('conflicting', \%conflicting, 'readable_array');
6447
6448 my %matches_all;
6449 # Set in the constructor for tables that are expected to match all code
6450 # points.
6451 main::set_access('matches_all', \%matches_all, 'r');
6452
a92d5c2e
KW
6453 my %complement;
6454 # Points to the complement that this table is expressed in terms of; 0 if
6455 # none.
8ae00c8a 6456 main::set_access('complement', \%complement, 'r');
a92d5c2e 6457
99870f4d
KW
6458 sub new {
6459 my $class = shift;
6460
6461 my %args = @_;
6462
6463 # The property for which this table is a listing of property values.
6464 my $property = delete $args{'_Property'};
6465
23e33b60
KW
6466 my $name = delete $args{'Name'};
6467 my $full_name = delete $args{'Full_Name'};
6468 $full_name = $name if ! defined $full_name;
6469
99870f4d
KW
6470 # Optional
6471 my $initialize = delete $args{'Initialize'};
6472 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 6473 my $format = delete $args{'Format'};
99870f4d
KW
6474 # Rest of parameters passed on.
6475
6476 my $range_list = Range_List->new(Initialize => $initialize,
6477 Owner => $property);
6478
23e33b60
KW
6479 my $complete = $full_name;
6480 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6481 # but this helps debug if it
6482 # does
6483 # The complete name for a match table includes it's property in a
6484 # compound form 'property=table', except if the property is the
6485 # pseudo-property, perl, in which case it is just the single form,
6486 # 'table' (If you change the '=' must also change the ':' in lots of
6487 # places in this program that assume an equal sign)
6488 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 6489
99870f4d 6490 my $self = $class->SUPER::new(%args,
23e33b60
KW
6491 Name => $name,
6492 Complete_Name => $complete,
6493 Full_Name => $full_name,
99870f4d
KW
6494 _Property => $property,
6495 _Range_List => $range_list,
f5817e0a 6496 Format => $EMPTY_FORMAT,
99870f4d 6497 );
ffe43484 6498 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6499
6500 $conflicting{$addr} = [ ];
6501 $equivalents{$addr} = [ ];
6502 $children{$addr} = [ ];
6503 $matches_all{$addr} = $matches_all;
6504 $leader{$addr} = $self;
6505 $parent{$addr} = $self;
a92d5c2e 6506 $complement{$addr} = 0;
99870f4d 6507
f5817e0a
KW
6508 if (defined $format && $format ne $EMPTY_FORMAT) {
6509 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6510 }
6511
99870f4d
KW
6512 return $self;
6513 }
6514
6515 # See this program's beginning comment block about overloading these.
6516 use overload
6517 fallback => 0,
6518 qw("") => "_operator_stringify",
6519 '=' => sub {
6520 my $self = shift;
6521
6522 return if $self->carp_if_locked;
6523 return $self;
6524 },
6525
6526 '+' => sub {
6527 my $self = shift;
6528 my $other = shift;
6529
6530 return $self->_range_list + $other;
6531 },
6532 '&' => sub {
6533 my $self = shift;
6534 my $other = shift;
6535
6536 return $self->_range_list & $other;
6537 },
6538 '+=' => sub {
6539 my $self = shift;
6540 my $other = shift;
6541
6542 return if $self->carp_if_locked;
6543
ffe43484 6544 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6545
6546 if (ref $other) {
6547
6548 # Change the range list of this table to be the
6549 # union of the two.
6550 $self->_set_range_list($self->_range_list
6551 + $other);
6552 }
6553 else { # $other is just a simple value
6554 $self->add_range($other, $other);
6555 }
6556 return $self;
6557 },
6558 '-' => sub { my $self = shift;
6559 my $other = shift;
6560 my $reversed = shift;
6561
6562 if ($reversed) {
6563 Carp::my_carp_bug("Can't cope with a "
6564 . __PACKAGE__
6565 . " being the first parameter in a '-'. Subtraction ignored.");
6566 return;
6567 }
6568
6569 return $self->_range_list - $other;
6570 },
6571 '~' => sub { my $self = shift;
6572 return ~ $self->_range_list;
6573 },
6574 ;
6575
6576 sub _operator_stringify {
6577 my $self = shift;
6578
23e33b60 6579 my $name = $self->complete_name;
99870f4d
KW
6580 return "Table '$name'";
6581 }
6582
ec40ee88
KW
6583 sub _range_list {
6584 # Returns the range list associated with this table, which will be the
6585 # complement's if it has one.
6586
6587 my $self = shift;
6588 my $complement;
6589 if (($complement = $self->complement) != 0) {
6590 return ~ $complement->_range_list;
6591 }
6592 else {
6593 return $self->SUPER::_range_list;
6594 }
6595 }
6596
99870f4d
KW
6597 sub add_alias {
6598 # Add a synonym for this table. See the comments in the base class
6599
6600 my $self = shift;
6601 my $name = shift;
6602 # Rest of parameters passed on.
6603
6604 $self->SUPER::add_alias($name, $self, @_);
6605 return;
6606 }
6607
6608 sub add_conflicting {
6609 # Add the name of some other object to the list of ones that name
6610 # clash with this match table.
6611
6612 my $self = shift;
6613 my $conflicting_name = shift; # The name of the conflicting object
6614 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6615 my $conflicting_object = shift; # Optional, the conflicting object
6616 # itself. This is used to
6617 # disambiguate the text if the input
6618 # name is identical to any of the
6619 # aliases $self is known by.
6620 # Sometimes the conflicting object is
6621 # merely hypothetical, so this has to
6622 # be an optional parameter.
6623 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6624
ffe43484 6625 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6626
6627 # Check if the conflicting name is exactly the same as any existing
6628 # alias in this table (as long as there is a real object there to
6629 # disambiguate with).
6630 if (defined $conflicting_object) {
6631 foreach my $alias ($self->aliases) {
6632 if ($alias->name eq $conflicting_name) {
6633
6634 # Here, there is an exact match. This results in
6635 # ambiguous comments, so disambiguate by changing the
6636 # conflicting name to its object's complete equivalent.
6637 $conflicting_name = $conflicting_object->complete_name;
6638 last;
6639 }
6640 }
6641 }
6642
6643 # Convert to the \p{...} final name
6644 $conflicting_name = "\\$p" . "{$conflicting_name}";
6645
6646 # Only add once
6647 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6648
6649 push @{$conflicting{$addr}}, $conflicting_name;
6650
6651 return;
6652 }
6653
6505c6e2 6654 sub is_set_equivalent_to {
99870f4d
KW
6655 # Return boolean of whether or not the other object is a table of this
6656 # type and has been marked equivalent to this one.
6657
6658 my $self = shift;
6659 my $other = shift;
6660 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6661
6662 return 0 if ! defined $other; # Can happen for incomplete early
6663 # releases
6664 unless ($other->isa(__PACKAGE__)) {
6665 my $ref_other = ref $other;
6666 my $ref_self = ref $self;
6505c6e2 6667 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
99870f4d
KW
6668 return 0;
6669 }
6670
6671 # Two tables are equivalent if they have the same leader.
f998e60c 6672 no overloading;
051df77b 6673 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6674 return;
6675 }
6676
99870f4d
KW
6677 sub set_equivalent_to {
6678 # Set $self equivalent to the parameter table.
6679 # The required Related => 'x' parameter is a boolean indicating
6680 # whether these tables are related or not. If related, $other becomes
6681 # the 'parent' of $self; if unrelated it becomes the 'leader'
6682 #
6683 # Related tables share all characteristics except names; equivalents
6684 # not quite so many.
6685 # If they are related, one must be a perl extension. This is because
6686 # we can't guarantee that Unicode won't change one or the other in a
98dc9551 6687 # later release even if they are identical now.
99870f4d
KW
6688
6689 my $self = shift;
6690 my $other = shift;
6691
6692 my %args = @_;
6693 my $related = delete $args{'Related'};
6694
6695 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6696
6697 return if ! defined $other; # Keep on going; happens in some early
6698 # Unicode releases.
6699
6700 if (! defined $related) {
6701 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6702 $related = 0;
6703 }
6704
6705 # If already are equivalent, no need to re-do it; if subroutine
6706 # returns null, it found an error, also do nothing
6505c6e2 6707 my $are_equivalent = $self->is_set_equivalent_to($other);
99870f4d
KW
6708 return if ! defined $are_equivalent || $are_equivalent;
6709
ffe43484 6710 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6711 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d 6712
45e32b91
KW
6713 if ($related) {
6714 if ($current_leader->perl_extension) {
6715 if ($other->perl_extension) {
6716 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6717 return;
6718 }
7610e9e2
KW
6719 } elsif ($self->property != $other->property # Depending on
6720 # situation, might
6721 # be better to use
6722 # add_alias()
6723 # instead for same
6724 # property
6725 && ! $other->perl_extension)
6726 {
45e32b91
KW
6727 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6728 $related = 0;
6729 }
6730 }
6731
6732 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6733 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
6734 return;
99870f4d
KW
6735 }
6736
ffe43484
NC
6737 my $leader = do { no overloading; pack 'J', $current_leader; };
6738 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6739
6740 # Any tables that are equivalent to or children of this table must now
6741 # instead be equivalent to or (children) to the new leader (parent),
6742 # still equivalent. The equivalency includes their matches_all info,
301ba948 6743 # and for related tables, their fate and status.
99870f4d
KW
6744 # All related tables are of necessity equivalent, but the converse
6745 # isn't necessarily true
6746 my $status = $other->status;
6747 my $status_info = $other->status_info;
301ba948 6748 my $fate = $other->fate;
99870f4d 6749 my $matches_all = $matches_all{other_addr};
d867ccfb 6750 my $caseless_equivalent = $other->caseless_equivalent;
99870f4d
KW
6751 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6752 next if $table == $other;
6753 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6754
ffe43484 6755 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6756 $leader{$table_addr} = $other;
6757 $matches_all{$table_addr} = $matches_all;
6758 $self->_set_range_list($other->_range_list);
6759 push @{$equivalents{$other_addr}}, $table;
6760 if ($related) {
6761 $parent{$table_addr} = $other;
6762 push @{$children{$other_addr}}, $table;
6763 $table->set_status($status, $status_info);
301ba948
KW
6764
6765 # This reason currently doesn't get exposed outside; otherwise
6766 # would have to look up the parent's reason and use it instead.
6767 $table->set_fate($fate, "Parent's fate");
6768
d867ccfb 6769 $self->set_caseless_equivalent($caseless_equivalent);
99870f4d
KW
6770 }
6771 }
6772
6773 # Now that we've declared these to be equivalent, any changes to one
6774 # of the tables would invalidate that equivalency.
6775 $self->lock;
6776 $other->lock;
6777 return;
6778 }
6779
8ae00c8a
KW
6780 sub set_complement {
6781 # Set $self to be the complement of the parameter table. $self is
6782 # locked, as what it contains should all come from the other table.
6783
6784 my $self = shift;
6785 my $other = shift;
6786
6787 my %args = @_;
6788 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6789
6790 if ($other->complement != 0) {
6791 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
6792 return;
6793 }
6794 my $addr = do { no overloading; pack 'J', $self; };
6795 $complement{$addr} = $other;
6796 $self->lock;
6797 return;
6798 }
6799
99870f4d
KW
6800 sub add_range { # Add a range to the list for this table.
6801 my $self = shift;
6802 # Rest of parameters passed on
6803
6804 return if $self->carp_if_locked;
6805 return $self->_range_list->add_range(@_);
6806 }
6807
88c22f80
KW
6808 sub header {
6809 my $self = shift;
6810 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6811
6812 # All match tables are to be used only by the Perl core.
126c3d4e 6813 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
88c22f80
KW
6814 }
6815
99870f4d
KW
6816 sub pre_body { # Does nothing for match tables.
6817 return
6818 }
6819
6820 sub append_to_body { # Does nothing for match tables.
6821 return
6822 }
6823
301ba948
KW
6824 sub set_fate {
6825 my $self = shift;
6826 my $fate = shift;
6827 my $reason = shift;
6828 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6829
6830 $self->SUPER::set_fate($fate, $reason);
6831
6832 # All children share this fate
6833 foreach my $child ($self->children) {
6834 $child->set_fate($fate, $reason);
6835 }
6836 return;
6837 }
6838
99870f4d
KW
6839 sub write {
6840 my $self = shift;
6841 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6842
6843 return $self->SUPER::write(2); # 2 tab stops
6844 }
6845
6846 sub set_final_comment {
6847 # This creates a comment for the file that is to hold the match table
6848 # $self. It is somewhat convoluted to make the English read nicely,
6849 # but, heh, it's just a comment.
6850 # This should be called only with the leader match table of all the
6851 # ones that share the same file. It lists all such tables, ordered so
6852 # that related ones are together.
6853
bd9ebcfd
KW
6854 return unless $debugging_build;
6855
99870f4d
KW
6856 my $leader = shift; # Should only be called on the leader table of
6857 # an equivalent group
6858 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6859
ffe43484 6860 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6861
6862 if ($leader{$addr} != $leader) {
6863 Carp::my_carp_bug(<<END
6864set_final_comment() must be called on a leader table, which $leader is not.
6865It is equivalent to $leader{$addr}. No comment created
6866END
6867 );
6868 return;
6869 }
6870
6871 # Get the number of code points matched by each of the tables in this
6872 # file, and add underscores for clarity.
6873 my $count = $leader->count;
6874 my $string_count = main::clarify_number($count);
6875
6876 my $loose_count = 0; # how many aliases loosely matched
6877 my $compound_name = ""; # ? Are any names compound?, and if so, an
6878 # example
6879 my $properties_with_compound_names = 0; # count of these
6880
6881
6882 my %flags; # The status flags used in the file
6883 my $total_entries = 0; # number of entries written in the comment
6884 my $matches_comment = ""; # The portion of the comment about the
6885 # \p{}'s
6886 my @global_comments; # List of all the tables' comments that are
6887 # there before this routine was called.
6888
6889 # Get list of all the parent tables that are equivalent to this one
6890 # (including itself).
6891 my @parents = grep { $parent{main::objaddr $_} == $_ }
6892 main::uniques($leader, @{$equivalents{$addr}});
6893 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6894 # tables
6895
6896 for my $parent (@parents) {
6897
6898 my $property = $parent->property;
6899
6900 # Special case 'N' tables in properties with two match tables when
6901 # the other is a 'Y' one. These are likely to be binary tables,
6902 # but not necessarily. In either case, \P{} will match the
6903 # complement of \p{}, and so if something is a synonym of \p, the
6904 # complement of that something will be the synonym of \P. This
6905 # would be true of any property with just two match tables, not
6906 # just those whose values are Y and N; but that would require a
6907 # little extra work, and there are none such so far in Unicode.
6908 my $perl_p = 'p'; # which is it? \p{} or \P{}
6909 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6910
6911 if (scalar $property->tables == 2
6912 && $parent == $property->table('N')
6913 && defined (my $yes = $property->table('Y')))
6914 {
ffe43484 6915 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6916 @yes_perl_synonyms
6917 = grep { $_->property == $perl }
6918 main::uniques($yes,
6919 $parent{$yes_addr},
6920 $parent{$yes_addr}->children);
6921
6922 # But these synonyms are \P{} ,not \p{}
6923 $perl_p = 'P';
6924 }
6925
6926 my @description; # Will hold the table description
6927 my @note; # Will hold the table notes.
6928 my @conflicting; # Will hold the table conflicts.
6929
6930 # Look at the parent, any yes synonyms, and all the children
ffe43484 6931 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6932 for my $table ($parent,
6933 @yes_perl_synonyms,
f998e60c 6934 @{$children{$parent_addr}})
99870f4d 6935 {
ffe43484 6936 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6937 my $table_property = $table->property;
6938
6939 # Tables are separated by a blank line to create a grouping.
6940 $matches_comment .= "\n" if $matches_comment;
6941
6942 # The table is named based on the property and value
6943 # combination it is for, like script=greek. But there may be
6944 # a number of synonyms for each side, like 'sc' for 'script',
6945 # and 'grek' for 'greek'. Any combination of these is a valid
6946 # name for this table. In this case, there are three more,
6947 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6948 # listing all possible combinations in the comment, we make
6949 # sure that each synonym occurs at least once, and add
6950 # commentary that the other combinations are possible.
da912e1e
KW
6951 # Because regular expressions don't recognize things like
6952 # \p{jsn=}, only look at non-null right-hand-sides
99870f4d 6953 my @property_aliases = $table_property->aliases;
da912e1e 6954 my @table_aliases = grep { $_->name ne "" } $table->aliases;
99870f4d
KW
6955
6956 # The alias lists above are already ordered in the order we
6957 # want to output them. To ensure that each synonym is listed,
da912e1e
KW
6958 # we must use the max of the two numbers. But if there are no
6959 # legal synonyms (nothing in @table_aliases), then we don't
6960 # list anything.
6961 my $listed_combos = (@table_aliases)
6962 ? main::max(scalar @table_aliases,
6963 scalar @property_aliases)
6964 : 0;
99870f4d
KW
6965 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6966
da912e1e 6967
99870f4d
KW
6968 my $property_had_compound_name = 0;
6969
6970 for my $i (0 .. $listed_combos - 1) {
6971 $total_entries++;
6972
6973 # The current alias for the property is the next one on
6974 # the list, or if beyond the end, start over. Similarly
6975 # for the table (\p{prop=table})
6976 my $property_alias = $property_aliases
6977 [$i % @property_aliases]->name;
6978 my $table_alias_object = $table_aliases
6979 [$i % @table_aliases];
6980 my $table_alias = $table_alias_object->name;
6981 my $loose_match = $table_alias_object->loose_match;
6982
6983 if ($table_alias !~ /\D/) { # Clarify large numbers.
6984 $table_alias = main::clarify_number($table_alias)
6985 }
6986
6987 # Add a comment for this alias combination
6988 my $current_match_comment;
6989 if ($table_property == $perl) {
6990 $current_match_comment = "\\$perl_p"
6991 . "{$table_alias}";
6992 }
6993 else {
6994 $current_match_comment
6995 = "\\p{$property_alias=$table_alias}";
6996 $property_had_compound_name = 1;
6997 }
6998
6999 # Flag any abnormal status for this table.
7000 my $flag = $property->status
7001 || $table->status
7002 || $table_alias_object->status;
301ba948 7003 $flags{$flag} = $status_past_participles{$flag} if $flag;
99870f4d
KW
7004
7005 $loose_count++;
7006
7007 # Pretty up the comment. Note the \b; it says don't make
7008 # this line a continuation.
7009 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7010 $flag,
7011 " " x 7,
7012 $current_match_comment);
7013 } # End of generating the entries for this table.
7014
7015 # Save these for output after this group of related tables.
7016 push @description, $table->description;
7017 push @note, $table->note;
7018 push @conflicting, $table->conflicting;
7019
37e2e78e
KW
7020 # And this for output after all the tables.
7021 push @global_comments, $table->comment;
7022
99870f4d
KW
7023 # Compute an alternate compound name using the final property
7024 # synonym and the first table synonym with a colon instead of
7025 # the equal sign used elsewhere.
7026 if ($property_had_compound_name) {
7027 $properties_with_compound_names ++;
7028 if (! $compound_name || @property_aliases > 1) {
7029 $compound_name = $property_aliases[-1]->name
7030 . ': '
7031 . $table_aliases[0]->name;
7032 }
7033 }
7034 } # End of looping through all children of this table
7035
7036 # Here have assembled in $matches_comment all the related tables
7037 # to the current parent (preceded by the same info for all the
7038 # previous parents). Put out information that applies to all of
7039 # the current family.
7040 if (@conflicting) {
7041
7042 # But output the conflicting information now, as it applies to
7043 # just this table.
7044 my $conflicting = join ", ", @conflicting;
7045 if ($conflicting) {
7046 $matches_comment .= <<END;
7047
7048 Note that contrary to what you might expect, the above is NOT the same as
7049END
7050 $matches_comment .= "any of: " if @conflicting > 1;
7051 $matches_comment .= "$conflicting\n";
7052 }
7053 }
7054 if (@description) {
7055 $matches_comment .= "\n Meaning: "
7056 . join('; ', @description)
7057 . "\n";
7058 }
7059 if (@note) {
7060 $matches_comment .= "\n Note: "
7061 . join("\n ", @note)
7062 . "\n";
7063 }
7064 } # End of looping through all tables
7065
7066
7067 my $code_points;
7068 my $match;
7069 my $any_of_these;
7070 if ($count == 1) {
7071 $match = 'matches';
7072 $code_points = 'single code point';
7073 }
7074 else {
7075 $match = 'match';
7076 $code_points = "$string_count code points";
7077 }
7078
7079 my $synonyms;
7080 my $entries;
da912e1e 7081 if ($total_entries == 1) {
99870f4d
KW
7082 $synonyms = "";
7083 $entries = 'entry';
7084 $any_of_these = 'this'
7085 }
7086 else {
7087 $synonyms = " any of the following regular expression constructs";
7088 $entries = 'entries';
7089 $any_of_these = 'any of these'
7090 }
7091
6efd5c72 7092 my $comment = "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
99870f4d
KW
7093 if ($has_unrelated) {
7094 $comment .= <<END;
7095This file is for tables that are not necessarily related: To conserve
7096resources, every table that matches the identical set of code points in this
7097version of Unicode uses this file. Each one is listed in a separate group
7098below. It could be that the tables will match the same set of code points in
7099other Unicode releases, or it could be purely coincidence that they happen to
7100be the same in Unicode $string_version, and hence may not in other versions.
7101
7102END
7103 }
7104
7105 if (%flags) {
7106 foreach my $flag (sort keys %flags) {
7107 $comment .= <<END;
37e2e78e 7108'$flag' below means that this form is $flags{$flag}.
301ba948 7109Consult $pod_file.pod
99870f4d
KW
7110END
7111 }
7112 $comment .= "\n";
7113 }
7114
da912e1e
KW
7115 if ($total_entries == 0) {
7116 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7117 $comment .= <<END;
7118This file returns the $code_points in Unicode Version $string_version for
7119$leader, but it is inaccessible through Perl regular expressions, as
7120"\\p{prop=}" is not recognized.
7121END
7122
7123 } else {
7124 $comment .= <<END;
99870f4d
KW
7125This file returns the $code_points in Unicode Version $string_version that
7126$match$synonyms:
7127
7128$matches_comment
37e2e78e 7129$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
7130including if adding or subtracting white space, underscore, and hyphen
7131characters matters or doesn't matter, and other permissible syntactic
7132variants. Upper/lower case distinctions never matter.
7133END
7134
da912e1e 7135 }
99870f4d
KW
7136 if ($compound_name) {
7137 $comment .= <<END;
7138
7139A colon can be substituted for the equals sign, and
7140END
7141 if ($properties_with_compound_names > 1) {
7142 $comment .= <<END;
7143within each group above,
7144END
7145 }
7146 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7147
7148 # Note the \b below, it says don't make that line a continuation.
7149 $comment .= <<END;
7150anything to the left of the equals (or colon) can be combined with anything to
7151the right. Thus, for example,
7152$compound_name
7153\bis also valid.
7154END
7155 }
7156
7157 # And append any comment(s) from the actual tables. They are all
7158 # gathered here, so may not read all that well.
37e2e78e
KW
7159 if (@global_comments) {
7160 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7161 }
99870f4d
KW
7162
7163 if ($count) { # The format differs if no code points, and needs no
7164 # explanation in that case
7165 $comment.= <<END;
7166
7167The format of the lines of this file is:
7168END
7169 $comment.= <<END;
7170START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7171STOP is the ending point, or if omitted, the range has just one code point.
7172END
0c07e538 7173 if ($leader->output_range_counts) {
99870f4d
KW
7174 $comment .= <<END;
7175Numbers in comments in [brackets] indicate how many code points are in the
7176range.
7177END
7178 }
7179 }
7180
7181 $leader->set_comment(main::join_lines($comment));
7182 return;
7183 }
7184
7185 # Accessors for the underlying list
ea25a9b2 7186 for my $sub (qw(
99870f4d
KW
7187 get_valid_code_point
7188 get_invalid_code_point
ea25a9b2 7189 ))
99870f4d
KW
7190 {
7191 no strict "refs";
7192 *$sub = sub {
7193 use strict "refs";
7194 my $self = shift;
7195
7196 return $self->_range_list->$sub(@_);
7197 }
7198 }
7199} # End closure for Match_Table
7200
7201package Property;
7202
7203# The Property class represents a Unicode property, or the $perl
7204# pseudo-property. It contains a map table initialized empty at construction
7205# time, and for properties accessible through regular expressions, various
7206# match tables, created through the add_match_table() method, and referenced
7207# by the table('NAME') or tables() methods, the latter returning a list of all
7208# of the match tables. Otherwise table operations implicitly are for the map
7209# table.
7210#
7211# Most of the data in the property is actually about its map table, so it
7212# mostly just uses that table's accessors for most methods. The two could
7213# have been combined into one object, but for clarity because of their
7214# differing semantics, they have been kept separate. It could be argued that
7215# the 'file' and 'directory' fields should be kept with the map table.
7216#
7217# Each property has a type. This can be set in the constructor, or in the
7218# set_type accessor, but mostly it is figured out by the data. Every property
7219# starts with unknown type, overridden by a parameter to the constructor, or
7220# as match tables are added, or ranges added to the map table, the data is
7221# inspected, and the type changed. After the table is mostly or entirely
7222# filled, compute_type() should be called to finalize they analysis.
7223#
7224# There are very few operations defined. One can safely remove a range from
7225# the map table, and property_add_or_replace_non_nulls() adds the maps from another
7226# table to this one, replacing any in the intersection of the two.
7227
7228sub standardize { return main::standardize($_[0]); }
7229sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7230
7231{ # Closure
7232
7233 # This hash will contain as keys, all the aliases of all properties, and
7234 # as values, pointers to their respective property objects. This allows
7235 # quick look-up of a property from any of its names.
7236 my %alias_to_property_of;
7237
7238 sub dump_alias_to_property_of {
7239 # For debugging
7240
7241 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7242 return;
7243 }
7244
7245 sub property_ref {
7246 # This is a package subroutine, not called as a method.
7247 # If the single parameter is a literal '*' it returns a list of all
7248 # defined properties.
7249 # Otherwise, the single parameter is a name, and it returns a pointer
7250 # to the corresponding property object, or undef if none.
7251 #
7252 # Properties can have several different names. The 'standard' form of
7253 # each of them is stored in %alias_to_property_of as they are defined.
7254 # But it's possible that this subroutine will be called with some
7255 # variant, so if the initial lookup fails, it is repeated with the
98dc9551 7256 # standardized form of the input name. If found, besides returning the
99870f4d
KW
7257 # result, the input name is added to the list so future calls won't
7258 # have to do the conversion again.
7259
7260 my $name = shift;
7261
7262 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7263
7264 if (! defined $name) {
7265 Carp::my_carp_bug("Undefined input property. No action taken.");
7266 return;
7267 }
7268
7269 return main::uniques(values %alias_to_property_of) if $name eq '*';
7270
7271 # Return cached result if have it.
7272 my $result = $alias_to_property_of{$name};
7273 return $result if defined $result;
7274
7275 # Convert the input to standard form.
7276 my $standard_name = standardize($name);
7277
7278 $result = $alias_to_property_of{$standard_name};
7279 return unless defined $result; # Don't cache undefs
7280
7281 # Cache the result before returning it.
7282 $alias_to_property_of{$name} = $result;
7283 return $result;
7284 }
7285
7286
7287 main::setup_package();
7288
7289 my %map;
7290 # A pointer to the map table object for this property
7291 main::set_access('map', \%map);
7292
7293 my %full_name;
7294 # The property's full name. This is a duplicate of the copy kept in the
7295 # map table, but is needed because stringify needs it during
7296 # construction of the map table, and then would have a chicken before egg
7297 # problem.
7298 main::set_access('full_name', \%full_name, 'r');
7299
7300 my %table_ref;
7301 # This hash will contain as keys, all the aliases of any match tables
7302 # attached to this property, and as values, the pointers to their
7303 # respective tables. This allows quick look-up of a table from any of its
7304 # names.
7305 main::set_access('table_ref', \%table_ref);
7306
7307 my %type;
7308 # The type of the property, $ENUM, $BINARY, etc
7309 main::set_access('type', \%type, 'r');
7310
7311 my %file;
7312 # The filename where the map table will go (if actually written).
7313 # Normally defaulted, but can be overridden.
7314 main::set_access('file', \%file, 'r', 's');
7315
7316 my %directory;
7317 # The directory where the map table will go (if actually written).
7318 # Normally defaulted, but can be overridden.
7319 main::set_access('directory', \%directory, 's');
7320
7321 my %pseudo_map_type;
7322 # This is used to affect the calculation of the map types for all the
7323 # ranges in the table. It should be set to one of the values that signify
7324 # to alter the calculation.
7325 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7326
7327 my %has_only_code_point_maps;
7328 # A boolean used to help in computing the type of data in the map table.
7329 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7330
7331 my %unique_maps;
7332 # A list of the first few distinct mappings this property has. This is
7333 # used to disambiguate between binary and enum property types, so don't
7334 # have to keep more than three.
7335 main::set_access('unique_maps', \%unique_maps);
7336
56557540
KW
7337 my %pre_declared_maps;
7338 # A boolean that gives whether the input data should declare all the
7339 # tables used, or not. If the former, unknown ones raise a warning.
7340 main::set_access('pre_declared_maps',
047274f2 7341 \%pre_declared_maps, 'r', 's');
56557540 7342
99870f4d
KW
7343 sub new {
7344 # The only required parameter is the positionally first, name. All
7345 # other parameters are key => value pairs. See the documentation just
7346 # above for the meanings of the ones not passed directly on to the map
7347 # table constructor.
7348
7349 my $class = shift;
7350 my $name = shift || "";
7351
7352 my $self = property_ref($name);
7353 if (defined $self) {
7354 my $options_string = join ", ", @_;
7355 $options_string = ". Ignoring options $options_string" if $options_string;
7356 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7357 return $self;
7358 }
7359
7360 my %args = @_;
7361
7362 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 7363 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7364
7365 $directory{$addr} = delete $args{'Directory'};
7366 $file{$addr} = delete $args{'File'};
7367 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7368 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7369 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
56557540
KW
7370 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7371 # Starting in this release, property
7372 # values should be defined for all
7373 # properties, except those overriding this
7374 // $v_version ge v5.1.0;
c12f2655 7375
99870f4d
KW
7376 # Rest of parameters passed on.
7377
7378 $has_only_code_point_maps{$addr} = 1;
7379 $table_ref{$addr} = { };
7380 $unique_maps{$addr} = { };
7381
7382 $map{$addr} = Map_Table->new($name,
7383 Full_Name => $full_name{$addr},
7384 _Alias_Hash => \%alias_to_property_of,
7385 _Property => $self,
7386 %args);
7387 return $self;
7388 }
7389
7390 # See this program's beginning comment block about overloading the copy
7391 # constructor. Few operations are defined on properties, but a couple are
7392 # useful. It is safe to take the inverse of a property, and to remove a
7393 # single code point from it.
7394 use overload
7395 fallback => 0,
7396 qw("") => "_operator_stringify",
7397 "." => \&main::_operator_dot,
7398 '==' => \&main::_operator_equal,
7399 '!=' => \&main::_operator_not_equal,
7400 '=' => sub { return shift },
7401 '-=' => "_minus_and_equal",
7402 ;
7403
7404 sub _operator_stringify {
7405 return "Property '" . shift->full_name . "'";
7406 }
7407
7408 sub _minus_and_equal {
7409 # Remove a single code point from the map table of a property.
7410
7411 my $self = shift;
7412 my $other = shift;
7413 my $reversed = shift;
7414 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7415
7416 if (ref $other) {
7417 Carp::my_carp_bug("Can't cope with a "
7418 . ref($other)
7419 . " argument to '-='. Subtraction ignored.");
7420 return $self;
7421 }
98dc9551 7422 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
99870f4d
KW
7423 Carp::my_carp_bug("Can't cope with a "
7424 . __PACKAGE__
7425 . " being the first parameter in a '-='. Subtraction ignored.");
7426 return $self;
7427 }
7428 else {
f998e60c 7429 no overloading;
051df77b 7430 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
7431 }
7432 return $self;
7433 }
7434
7435 sub add_match_table {
7436 # Add a new match table for this property, with name given by the
7437 # parameter. It returns a pointer to the table.
7438
7439 my $self = shift;
7440 my $name = shift;
7441 my %args = @_;
7442
ffe43484 7443 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7444
7445 my $table = $table_ref{$addr}{$name};
7446 my $standard_name = main::standardize($name);
7447 if (defined $table
7448 || (defined ($table = $table_ref{$addr}{$standard_name})))
7449 {
7450 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7451 $table_ref{$addr}{$name} = $table;
7452 return $table;
7453 }
7454 else {
7455
7456 # See if this is a perl extension, if not passed in.
7457 my $perl_extension = delete $args{'Perl_Extension'};
7458 $perl_extension
7459 = $self->perl_extension if ! defined $perl_extension;
7460
7461 $table = Match_Table->new(
7462 Name => $name,
7463 Perl_Extension => $perl_extension,
7464 _Alias_Hash => $table_ref{$addr},
7465 _Property => $self,
7466
301ba948
KW
7467 # gets property's fate and status by default
7468 Fate => $self->fate,
99870f4d
KW
7469 Status => $self->status,
7470 _Status_Info => $self->status_info,
88c22f80 7471 %args);
99870f4d
KW
7472 return unless defined $table;
7473 }
7474
7475 # Save the names for quick look up
7476 $table_ref{$addr}{$standard_name} = $table;
7477 $table_ref{$addr}{$name} = $table;
7478
7479 # Perhaps we can figure out the type of this property based on the
7480 # fact of adding this match table. First, string properties don't
7481 # have match tables; second, a binary property can't have 3 match
7482 # tables
7483 if ($type{$addr} == $UNKNOWN) {
7484 $type{$addr} = $NON_STRING;
7485 }
7486 elsif ($type{$addr} == $STRING) {
7487 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7488 $type{$addr} = $NON_STRING;
7489 }
06f26c45 7490 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
99870f4d
KW
7491 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7492 && $type{$addr} == $BINARY)
7493 {
7494 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7495 $type{$addr} = $ENUM;
7496 }
7497 }
7498
7499 return $table;
7500 }
7501
4b9b0bc5
KW
7502 sub delete_match_table {
7503 # Delete the table referred to by $2 from the property $1.
7504
7505 my $self = shift;
7506 my $table_to_remove = shift;
7507 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7508
7509 my $addr = do { no overloading; pack 'J', $self; };
7510
7511 # Remove all names that refer to it.
7512 foreach my $key (keys %{$table_ref{$addr}}) {
7513 delete $table_ref{$addr}{$key}
7514 if $table_ref{$addr}{$key} == $table_to_remove;
7515 }
7516
7517 $table_to_remove->DESTROY;
7518 return;
7519 }
7520
99870f4d
KW
7521 sub table {
7522 # Return a pointer to the match table (with name given by the
7523 # parameter) associated with this property; undef if none.
7524
7525 my $self = shift;
7526 my $name = shift;
7527 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7528
ffe43484 7529 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7530
7531 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7532
7533 # If quick look-up failed, try again using the standard form of the
7534 # input name. If that succeeds, cache the result before returning so
7535 # won't have to standardize this input name again.
7536 my $standard_name = main::standardize($name);
7537 return unless defined $table_ref{$addr}{$standard_name};
7538
7539 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7540 return $table_ref{$addr}{$name};
7541 }
7542
7543 sub tables {
7544 # Return a list of pointers to all the match tables attached to this
7545 # property
7546
f998e60c 7547 no overloading;
051df77b 7548 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
7549 }
7550
7551 sub directory {
7552 # Returns the directory the map table for this property should be
7553 # output in. If a specific directory has been specified, that has
7554 # priority; 'undef' is returned if the type isn't defined;
7555 # or $map_directory for everything else.
7556
ffe43484 7557 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
7558
7559 return $directory{$addr} if defined $directory{$addr};
7560 return undef if $type{$addr} == $UNKNOWN;
7561 return $map_directory;
7562 }
7563
7564 sub swash_name {
7565 # Return the name that is used to both:
7566 # 1) Name the file that the map table is written to.
7567 # 2) The name of swash related stuff inside that file.
7568 # The reason for this is that the Perl core historically has used
7569 # certain names that aren't the same as the Unicode property names.
7570 # To continue using these, $file is hard-coded in this file for those,
7571 # but otherwise the standard name is used. This is different from the
7572 # external_name, so that the rest of the files, like in lib can use
7573 # the standard name always, without regard to historical precedent.
7574
7575 my $self = shift;
7576 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7577
ffe43484 7578 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 7579
19f751d2
KW
7580 # Swash names are used only on regular map tables; otherwise there
7581 # should be no access to the property map table from other parts of
7582 # Perl.
7583 return if $map{$addr}->fate != $ORDINARY;
7584
99870f4d
KW
7585 return $file{$addr} if defined $file{$addr};
7586 return $map{$addr}->external_name;
7587 }
7588
7589 sub to_create_match_tables {
7590 # Returns a boolean as to whether or not match tables should be
7591 # created for this property.
7592
7593 my $self = shift;
7594 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7595
7596 # The whole point of this pseudo property is match tables.
7597 return 1 if $self == $perl;
7598
ffe43484 7599 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7600
7601 # Don't generate tables of code points that match the property values
7602 # of a string property. Such a list would most likely have many
7603 # property values, each with just one or very few code points mapping
7604 # to it.
7605 return 0 if $type{$addr} == $STRING;
7606
7607 # Don't generate anything for unimplemented properties.
7608 return 0 if grep { $self->complete_name eq $_ }
7609 @unimplemented_properties;
7610 # Otherwise, do.
7611 return 1;
7612 }
7613
7614 sub property_add_or_replace_non_nulls {
7615 # This adds the mappings in the property $other to $self. Non-null
7616 # mappings from $other override those in $self. It essentially merges
7617 # the two properties, with the second having priority except for null
7618 # mappings.
7619
7620 my $self = shift;
7621 my $other = shift;
7622 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7623
7624 if (! $other->isa(__PACKAGE__)) {
7625 Carp::my_carp_bug("$other should be a "
7626 . __PACKAGE__
7627 . ". Not a '"
7628 . ref($other)
7629 . "'. Not added;");
7630 return;
7631 }
7632
f998e60c 7633 no overloading;
051df77b 7634 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
7635 }
7636
5be997b0
KW
7637 sub set_proxy_for {
7638 # Certain tables are not generally written out to files, but
7639 # Unicode::UCD has the intelligence to know that the file for $self
7640 # can be used to reconstruct those tables. This routine just changes
7641 # things so that UCD pod entries for those suppressed tables are
7642 # generated, so the fact that a proxy is used is invisible to the
7643 # user.
7644
7645 my $self = shift;
7646
7647 foreach my $property_name (@_) {
7648 my $ref = property_ref($property_name);
7649 next if $ref->to_output_map;
7650 $ref->set_fate($MAP_PROXIED);
7651 }
7652 }
7653
99870f4d
KW
7654 sub set_type {
7655 # Set the type of the property. Mostly this is figured out by the
7656 # data in the table. But this is used to set it explicitly. The
7657 # reason it is not a standard accessor is that when setting a binary
7658 # property, we need to make sure that all the true/false aliases are
7659 # present, as they were omitted in early Unicode releases.
7660
7661 my $self = shift;
7662 my $type = shift;
7663 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7664
06f26c45
KW
7665 if ($type != $ENUM
7666 && $type != $BINARY
7667 && $type != $FORCED_BINARY
7668 && $type != $STRING)
7669 {
99870f4d
KW
7670 Carp::my_carp("Unrecognized type '$type'. Type not set");
7671 return;
7672 }
7673
051df77b 7674 { no overloading; $type{pack 'J', $self} = $type; }
06f26c45 7675 return if $type != $BINARY && $type != $FORCED_BINARY;
99870f4d
KW
7676
7677 my $yes = $self->table('Y');
7678 $yes = $self->table('Yes') if ! defined $yes;
01adf4be
KW
7679 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7680 if ! defined $yes;
7681
3c6bf941
KW
7682 # Add aliases in order wanted, duplicates will be ignored. We use a
7683 # binary property present in all releases for its ordered lists of
7684 # true/false aliases. Note, that could run into problems in
7685 # outputting things in that we don't distinguish between the name and
7686 # full name of these. Hopefully, if the table was already created
7687 # before this code is executed, it was done with these set properly.
7688 my $bm = property_ref("Bidi_Mirrored");
7689 foreach my $alias ($bm->table("Y")->aliases) {
7690 $yes->add_alias($alias->name);
7691 }
99870f4d
KW
7692 my $no = $self->table('N');
7693 $no = $self->table('No') if ! defined $no;
01adf4be 7694 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
3c6bf941
KW
7695 foreach my $alias ($bm->table("N")->aliases) {
7696 $no->add_alias($alias->name);
7697 }
c12f2655 7698
99870f4d
KW
7699 return;
7700 }
7701
7702 sub add_map {
7703 # Add a map to the property's map table. This also keeps
7704 # track of the maps so that the property type can be determined from
7705 # its data.
7706
7707 my $self = shift;
7708 my $start = shift; # First code point in range
7709 my $end = shift; # Final code point in range
7710 my $map = shift; # What the range maps to.
7711 # Rest of parameters passed on.
7712
ffe43484 7713 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7714
7715 # If haven't the type of the property, gather information to figure it
7716 # out.
7717 if ($type{$addr} == $UNKNOWN) {
7718
7719 # If the map contains an interior blank or dash, or most other
7720 # nonword characters, it will be a string property. This
7721 # heuristic may actually miss some string properties. If so, they
7722 # may need to have explicit set_types called for them. This
7723 # happens in the Unihan properties.
7724 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7725 || $map =~ / [^\w.\/\ -] /x)
7726 {
7727 $self->set_type($STRING);
7728
7729 # $unique_maps is used for disambiguating between ENUM and
7730 # BINARY later; since we know the property is not going to be
7731 # one of those, no point in keeping the data around
7732 undef $unique_maps{$addr};
7733 }
7734 else {
7735
7736 # Not necessarily a string. The final decision has to be
7737 # deferred until all the data are in. We keep track of if all
7738 # the values are code points for that eventual decision.
7739 $has_only_code_point_maps{$addr} &=
7740 $map =~ / ^ $code_point_re $/x;
7741
7742 # For the purposes of disambiguating between binary and other
7743 # enumerations at the end, we keep track of the first three
7744 # distinct property values. Once we get to three, we know
7745 # it's not going to be binary, so no need to track more.
7746 if (scalar keys %{$unique_maps{$addr}} < 3) {
7747 $unique_maps{$addr}{main::standardize($map)} = 1;
7748 }
7749 }
7750 }
7751
7752 # Add the mapping by calling our map table's method
7753 return $map{$addr}->add_map($start, $end, $map, @_);
7754 }
7755
7756 sub compute_type {
7757 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7758 # should be called after the property is mostly filled with its maps.
7759 # We have been keeping track of what the property values have been,
7760 # and now have the necessary information to figure out the type.
7761
7762 my $self = shift;
7763 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7764
ffe43484 7765 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7766
7767 my $type = $type{$addr};
7768
7769 # If already have figured these out, no need to do so again, but we do
7770 # a double check on ENUMS to make sure that a string property hasn't
7771 # improperly been classified as an ENUM, so continue on with those.
06f26c45
KW
7772 return if $type == $STRING
7773 || $type == $BINARY
7774 || $type == $FORCED_BINARY;
99870f4d
KW
7775
7776 # If every map is to a code point, is a string property.
7777 if ($type == $UNKNOWN
7778 && ($has_only_code_point_maps{$addr}
7779 || (defined $map{$addr}->default_map
7780 && $map{$addr}->default_map eq "")))
7781 {
7782 $self->set_type($STRING);
7783 }
7784 else {
7785
7786 # Otherwise, it is to some sort of enumeration. (The case where
7787 # it is a Unicode miscellaneous property, and treated like a
7788 # string in this program is handled in add_map()). Distinguish
7789 # between binary and some other enumeration type. Of course, if
7790 # there are more than two values, it's not binary. But more
7791 # subtle is the test that the default mapping is defined means it
7792 # isn't binary. This in fact may change in the future if Unicode
7793 # changes the way its data is structured. But so far, no binary
7794 # properties ever have @missing lines for them, so the default map
7795 # isn't defined for them. The few properties that are two-valued
7796 # and aren't considered binary have the default map defined
7797 # starting in Unicode 5.0, when the @missing lines appeared; and
7798 # this program has special code to put in a default map for them
7799 # for earlier than 5.0 releases.
7800 if ($type == $ENUM
7801 || scalar keys %{$unique_maps{$addr}} > 2
7802 || defined $self->default_map)
7803 {
7804 my $tables = $self->tables;
7805 my $count = $self->count;
7806 if ($verbosity && $count > 500 && $tables/$count > .1) {
7807 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7808 }
7809 $self->set_type($ENUM);
7810 }
7811 else {
7812 $self->set_type($BINARY);
7813 }
7814 }
7815 undef $unique_maps{$addr}; # Garbage collect
7816 return;
7817 }
7818
301ba948
KW
7819 sub set_fate {
7820 my $self = shift;
7821 my $fate = shift;
7822 my $reason = shift; # Ignored unless suppressing
7823 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7824
7825 my $addr = do { no overloading; pack 'J', $self; };
7826 if ($fate == $SUPPRESSED) {
7827 $why_suppressed{$self->complete_name} = $reason;
7828 }
7829
395dfc19
KW
7830 # Each table shares the property's fate, except that MAP_PROXIED
7831 # doesn't affect match tables
7832 $map{$addr}->set_fate($fate, $reason);
7833 if ($fate != $MAP_PROXIED) {
13a49092
KW
7834 foreach my $table ($map{$addr}, $self->tables) {
7835 $table->set_fate($fate, $reason);
7836 }
395dfc19 7837 }
301ba948
KW
7838 return;
7839 }
7840
7841
99870f4d
KW
7842 # Most of the accessors for a property actually apply to its map table.
7843 # Setup up accessor functions for those, referring to %map
ea25a9b2 7844 for my $sub (qw(
99870f4d
KW
7845 add_alias
7846 add_anomalous_entry
7847 add_comment
7848 add_conflicting
7849 add_description
7850 add_duplicate
7851 add_note
7852 aliases
7853 comment
7854 complete_name
2f7a8815 7855 containing_range
99870f4d
KW
7856 count
7857 default_map
7858 delete_range
7859 description
7860 each_range
7861 external_name
301ba948 7862 fate
99870f4d
KW
7863 file_path
7864 format
7865 initialize
7866 inverse_list
7867 is_empty
7868 name
7869 note
7870 perl_extension
7871 property
7872 range_count
7873 ranges
7874 range_size_1
7875 reset_each_range
7876 set_comment
99870f4d
KW
7877 set_default_map
7878 set_file_path
7879 set_final_comment
7880 set_range_size_1
7881 set_status
7882 set_to_output_map
7883 short_name
7884 status
7885 status_info
7886 to_output_map
0a9dbafc 7887 type_of
99870f4d
KW
7888 value_of
7889 write
ea25a9b2 7890 ))
99870f4d
KW
7891 # 'property' above is for symmetry, so that one can take
7892 # the property of a property and get itself, and so don't
7893 # have to distinguish between properties and tables in
7894 # calling code
7895 {
7896 no strict "refs";
7897 *$sub = sub {
7898 use strict "refs";
7899 my $self = shift;
f998e60c 7900 no overloading;
051df77b 7901 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7902 }
7903 }
7904
7905
7906} # End closure
7907
7908package main;
7909
7910sub join_lines($) {
7911 # Returns lines of the input joined together, so that they can be folded
7912 # properly.
7913 # This causes continuation lines to be joined together into one long line
7914 # for folding. A continuation line is any line that doesn't begin with a
7915 # space or "\b" (the latter is stripped from the output). This is so
7916 # lines can be be in a HERE document so as to fit nicely in the terminal
7917 # width, but be joined together in one long line, and then folded with
7918 # indents, '#' prefixes, etc, properly handled.
7919 # A blank separates the joined lines except if there is a break; an extra
7920 # blank is inserted after a period ending a line.
7921
98dc9551 7922 # Initialize the return with the first line.
99870f4d
KW
7923 my ($return, @lines) = split "\n", shift;
7924
7925 # If the first line is null, it was an empty line, add the \n back in
7926 $return = "\n" if $return eq "";
7927
7928 # Now join the remainder of the physical lines.
7929 for my $line (@lines) {
7930
7931 # An empty line means wanted a blank line, so add two \n's to get that
7932 # effect, and go to the next line.
7933 if (length $line == 0) {
7934 $return .= "\n\n";
7935 next;
7936 }
7937
7938 # Look at the last character of what we have so far.
7939 my $previous_char = substr($return, -1, 1);
7940
7941 # And at the next char to be output.
7942 my $next_char = substr($line, 0, 1);
7943
7944 if ($previous_char ne "\n") {
7945
7946 # Here didn't end wth a nl. If the next char a blank or \b, it
7947 # means that here there is a break anyway. So add a nl to the
7948 # output.
7949 if ($next_char eq " " || $next_char eq "\b") {
7950 $previous_char = "\n";
7951 $return .= $previous_char;
7952 }
7953
7954 # Add an extra space after periods.
7955 $return .= " " if $previous_char eq '.';
7956 }
7957
7958 # Here $previous_char is still the latest character to be output. If
7959 # it isn't a nl, it means that the next line is to be a continuation
7960 # line, with a blank inserted between them.
7961 $return .= " " if $previous_char ne "\n";
7962
7963 # Get rid of any \b
7964 substr($line, 0, 1) = "" if $next_char eq "\b";
7965
7966 # And append this next line.
7967 $return .= $line;
7968 }
7969
7970 return $return;
7971}
7972
7973sub simple_fold($;$$$) {
7974 # Returns a string of the input (string or an array of strings) folded
7975 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7976 # a \n
7977 # This is tailored for the kind of text written by this program,
7978 # especially the pod file, which can have very long names with
7979 # underscores in the middle, or words like AbcDefgHij.... We allow
7980 # breaking in the middle of such constructs if the line won't fit
7981 # otherwise. The break in such cases will come either just after an
7982 # underscore, or just before one of the Capital letters.
7983
7984 local $to_trace = 0 if main::DEBUG;
7985
7986 my $line = shift;
7987 my $prefix = shift; # Optional string to prepend to each output
7988 # line
7989 $prefix = "" unless defined $prefix;
7990
7991 my $hanging_indent = shift; # Optional number of spaces to indent
7992 # continuation lines
7993 $hanging_indent = 0 unless $hanging_indent;
7994
7995 my $right_margin = shift; # Optional number of spaces to narrow the
7996 # total width by.
7997 $right_margin = 0 unless defined $right_margin;
7998
7999 # Call carp with the 'nofold' option to avoid it from trying to call us
8000 # recursively
8001 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8002
8003 # The space available doesn't include what's automatically prepended
8004 # to each line, or what's reserved on the right.
8005 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8006 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8007
8008 if (DEBUG && $hanging_indent >= $max) {
8009 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8010 $hanging_indent = 0;
8011 }
8012
8013 # First, split into the current physical lines.
8014 my @line;
8015 if (ref $line) { # Better be an array, because not bothering to
8016 # test
8017 foreach my $line (@{$line}) {
8018 push @line, split /\n/, $line;
8019 }
8020 }
8021 else {
8022 @line = split /\n/, $line;
8023 }
8024
8025 #local $to_trace = 1 if main::DEBUG;
8026 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8027
8028 # Look at each current physical line.
8029 for (my $i = 0; $i < @line; $i++) {
8030 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8031 #local $to_trace = 1 if main::DEBUG;
8032 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8033
8034 # Remove prefix, because will be added back anyway, don't want
8035 # doubled prefix
8036 $line[$i] =~ s/^$prefix//;
8037
8038 # Remove trailing space
8039 $line[$i] =~ s/\s+\Z//;
8040
8041 # If the line is too long, fold it.
8042 if (length $line[$i] > $max) {
8043 my $remainder;
8044
8045 # Here needs to fold. Save the leading space in the line for
8046 # later.
8047 $line[$i] =~ /^ ( \s* )/x;
8048 my $leading_space = $1;
8049 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8050
8051 # If character at final permissible position is white space,
8052 # fold there, which will delete that white space
8053 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8054 $remainder = substr($line[$i], $max);
8055 $line[$i] = substr($line[$i], 0, $max - 1);
8056 }
8057 else {
8058
8059 # Otherwise fold at an acceptable break char closest to
8060 # the max length. Look at just the maximal initial
8061 # segment of the line
8062 my $segment = substr($line[$i], 0, $max - 1);
8063 if ($segment =~
8064 /^ ( .{$hanging_indent} # Don't look before the
8065 # indent.
8066 \ * # Don't look in leading
8067 # blanks past the indent
8068 [^ ] .* # Find the right-most
8069 (?: # acceptable break:
8070 [ \s = ] # space or equal
8071 | - (?! [.0-9] ) # or non-unary minus.
8072 ) # $1 includes the character
8073 )/x)
8074 {
8075 # Split into the initial part that fits, and remaining
8076 # part of the input
8077 $remainder = substr($line[$i], length $1);
8078 $line[$i] = $1;
8079 trace $line[$i] if DEBUG && $to_trace;
8080 trace $remainder if DEBUG && $to_trace;
8081 }
8082
8083 # If didn't find a good breaking spot, see if there is a
8084 # not-so-good breaking spot. These are just after
8085 # underscores or where the case changes from lower to
8086 # upper. Use \a as a soft hyphen, but give up
8087 # and don't break the line if there is actually a \a
8088 # already in the input. We use an ascii character for the
8089 # soft-hyphen to avoid any attempt by miniperl to try to
8090 # access the files that this program is creating.
8091 elsif ($segment !~ /\a/
8092 && ($segment =~ s/_/_\a/g
8093 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8094 {
8095 # Here were able to find at least one place to insert
8096 # our substitute soft hyphen. Find the right-most one
8097 # and replace it by a real hyphen.
8098 trace $segment if DEBUG && $to_trace;
8099 substr($segment,
8100 rindex($segment, "\a"),
8101 1) = '-';
8102
8103 # Then remove the soft hyphen substitutes.
8104 $segment =~ s/\a//g;
8105 trace $segment if DEBUG && $to_trace;
8106
8107 # And split into the initial part that fits, and
8108 # remainder of the line
8109 my $pos = rindex($segment, '-');
8110 $remainder = substr($line[$i], $pos);
8111 trace $remainder if DEBUG && $to_trace;
8112 $line[$i] = substr($segment, 0, $pos + 1);
8113 }
8114 }
8115
8116 # Here we know if we can fold or not. If we can, $remainder
8117 # is what remains to be processed in the next iteration.
8118 if (defined $remainder) {
8119 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8120
8121 # Insert the folded remainder of the line as a new element
8122 # of the array. (It may still be too long, but we will
8123 # deal with that next time through the loop.) Omit any
8124 # leading space in the remainder.
8125 $remainder =~ s/^\s+//;
8126 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8127
8128 # But then indent by whichever is larger of:
8129 # 1) the leading space on the input line;
8130 # 2) the hanging indent.
8131 # This preserves indentation in the original line.
8132 my $lead = ($leading_space)
8133 ? length $leading_space
8134 : $hanging_indent;
8135 $lead = max($lead, $hanging_indent);
8136 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8137 }
8138 }
8139
8140 # Ready to output the line. Get rid of any trailing space
8141 # And prefix by the required $prefix passed in.
8142 $line[$i] =~ s/\s+$//;
8143 $line[$i] = "$prefix$line[$i]\n";
8144 } # End of looping through all the lines.
8145
8146 return join "", @line;
8147}
8148
8149sub property_ref { # Returns a reference to a property object.
8150 return Property::property_ref(@_);
8151}
8152
8153sub force_unlink ($) {
8154 my $filename = shift;
8155 return unless file_exists($filename);
8156 return if CORE::unlink($filename);
8157
8158 # We might need write permission
8159 chmod 0777, $filename;
8160 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8161 return;
8162}
8163
9218f1cf 8164sub write ($$@) {
9abe8df8
KW
8165 # Given a filename and references to arrays of lines, write the lines of
8166 # each array to the file
99870f4d
KW
8167 # Filename can be given as an arrayref of directory names
8168
9218f1cf 8169 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
99870f4d 8170
9abe8df8 8171 my $file = shift;
9218f1cf 8172 my $use_utf8 = shift;
99870f4d
KW
8173
8174 # Get into a single string if an array, and get rid of, in Unix terms, any
8175 # leading '.'
8176 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8177 $file = File::Spec->canonpath($file);
8178
8179 # If has directories, make sure that they all exist
8180 (undef, my $directories, undef) = File::Spec->splitpath($file);
8181 File::Path::mkpath($directories) if $directories && ! -d $directories;
8182
8183 push @files_actually_output, $file;
8184
99870f4d
KW
8185 force_unlink ($file);
8186
8187 my $OUT;
8188 if (not open $OUT, ">", $file) {
8189 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8190 return;
8191 }
430ada4c 8192
9218f1cf
KW
8193 binmode $OUT, ":utf8" if $use_utf8;
8194
9abe8df8
KW
8195 while (defined (my $lines_ref = shift)) {
8196 unless (@$lines_ref) {
8197 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8198 }
8199
8200 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8201 }
430ada4c
NC
8202 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8203
99870f4d
KW
8204 print "$file written.\n" if $verbosity >= $VERBOSE;
8205
99870f4d
KW
8206 return;
8207}
8208
8209
8210sub Standardize($) {
8211 # This converts the input name string into a standardized equivalent to
8212 # use internally.
8213
8214 my $name = shift;
8215 unless (defined $name) {
8216 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8217 return;
8218 }
8219
8220 # Remove any leading or trailing white space
8221 $name =~ s/^\s+//g;
8222 $name =~ s/\s+$//g;
8223
98dc9551 8224 # Convert interior white space and hyphens into underscores.
99870f4d
KW
8225 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8226
8227 # Capitalize the letter following an underscore, and convert a sequence of
8228 # multiple underscores to a single one
8229 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8230
8231 # And capitalize the first letter, but not for the special cjk ones.
8232 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8233 return $name;
8234}
8235
8236sub standardize ($) {
8237 # Returns a lower-cased standardized name, without underscores. This form
8238 # is chosen so that it can distinguish between any real versus superficial
8239 # Unicode name differences. It relies on the fact that Unicode doesn't
8240 # have interior underscores, white space, nor dashes in any
8241 # stricter-matched name. It should not be used on Unicode code point
8242 # names (the Name property), as they mostly, but not always follow these
8243 # rules.
8244
8245 my $name = Standardize(shift);
8246 return if !defined $name;
8247
8248 $name =~ s/ (?<= .) _ (?= . ) //xg;
8249 return lc $name;
8250}
8251
c85f591a
KW
8252sub utf8_heavy_name ($$) {
8253 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8254 # perhaps this function should be placed somewhere, like Heavy.pl so that
8255 # utf8_heavy can use it directly without duplicating code that can get
8256 # out-of sync.
8257
8258 my $table = shift;
8259 my $alias = shift;
8260 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8261
8262 my $property = $table->property;
8263 $property = ($property == $perl)
8264 ? "" # 'perl' is never explicitly stated
8265 : standardize($property->name) . '=';
8266 if ($alias->loose_match) {
8267 return $property . standardize($alias->name);
8268 }
8269 else {
8270 return lc ($property . $alias->name);
8271 }
8272
8273 return;
8274}
8275
99870f4d
KW
8276{ # Closure
8277
7e3121cc 8278 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
99870f4d
KW
8279 my %already_output;
8280
8281 $main::simple_dumper_nesting = 0;
8282
8283 sub simple_dumper {
8284 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8285 # the real thing as we have to run under miniperl.
8286
8287 # It is designed so that on input it is at the beginning of a line,
8288 # and the final thing output in any call is a trailing ",\n".
8289
8290 my $item = shift;
8291 my $indent = shift;
7e3121cc 8292 $indent = "" if ! $debugging_build || ! defined $indent;
99870f4d
KW
8293
8294 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8295
8296 # nesting level is localized, so that as the call stack pops, it goes
8297 # back to the prior value.
8298 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8299 undef %already_output if $main::simple_dumper_nesting == 0;
8300 $main::simple_dumper_nesting++;
8301 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8302
8303 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8304
8305 # Determine the indent for recursive calls.
8306 my $next_indent = $indent . $indent_increment;
8307
8308 my $output;
8309 if (! ref $item) {
8310
8311 # Dump of scalar: just output it in quotes if not a number. To do
8312 # so we must escape certain characters, and therefore need to
8313 # operate on a copy to avoid changing the original
8314 my $copy = $item;
8315 $copy = $UNDEF unless defined $copy;
8316
02cc6656
KW
8317 # Quote non-integers (integers also have optional leading '-')
8318 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
99870f4d
KW
8319
8320 # Escape apostrophe and backslash
8321 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8322 $copy = "'$copy'";
8323 }
8324 $output = "$indent$copy,\n";
8325 }
8326 else {
8327
8328 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 8329 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 8330 if (defined $already_output{$addr}) {
99870f4d
KW
8331 return "${indent}ALREADY OUTPUT: $item\n";
8332 }
f998e60c 8333 $already_output{$addr} = $item;
99870f4d
KW
8334
8335 if (ref $item eq 'ARRAY') {
8336 my $using_brackets;
8337 $output = $indent;
8338 if ($main::simple_dumper_nesting > 1) {
8339 $output .= '[';
8340 $using_brackets = 1;
8341 }
8342 else {
8343 $using_brackets = 0;
8344 }
8345
8346 # If the array is empty, put the closing bracket on the same
8347 # line. Otherwise, recursively add each array element
8348 if (@$item == 0) {
8349 $output .= " ";
8350 }
8351 else {
8352 $output .= "\n";
8353 for (my $i = 0; $i < @$item; $i++) {
8354
8355 # Indent array elements one level
8356 $output .= &simple_dumper($item->[$i], $next_indent);
7e3121cc 8357 next if ! $debugging_build;
c12f2655
KW
8358 $output =~ s/\n$//; # Remove any trailing nl so
8359 $output .= " # [$i]\n"; # as to add a comment giving
8360 # the array index
99870f4d
KW
8361 }
8362 $output .= $indent; # Indent closing ']' to orig level
8363 }
8364 $output .= ']' if $using_brackets;
8365 $output .= ",\n";
8366 }
8367 elsif (ref $item eq 'HASH') {
8368 my $is_first_line;
8369 my $using_braces;
8370 my $body_indent;
8371
8372 # No surrounding braces at top level
8373 $output .= $indent;
8374 if ($main::simple_dumper_nesting > 1) {
8375 $output .= "{\n";
8376 $is_first_line = 0;
8377 $body_indent = $next_indent;
8378 $next_indent .= $indent_increment;
8379 $using_braces = 1;
8380 }
8381 else {
8382 $is_first_line = 1;
8383 $body_indent = $indent;
8384 $using_braces = 0;
8385 }
8386
8387 # Output hashes sorted alphabetically instead of apparently
8388 # random. Use caseless alphabetic sort
8389 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8390 {
8391 if ($is_first_line) {
8392 $is_first_line = 0;
8393 }
8394 else {
8395 $output .= "$body_indent";
8396 }
8397
8398 # The key must be a scalar, but this recursive call quotes
8399 # it
8400 $output .= &simple_dumper($key);
8401
8402 # And change the trailing comma and nl to the hash fat
8403 # comma for clarity, and so the value can be on the same
8404 # line
8405 $output =~ s/,\n$/ => /;
8406
8407 # Recursively call to get the value's dump.
8408 my $next = &simple_dumper($item->{$key}, $next_indent);
8409
8410 # If the value is all on one line, remove its indent, so
8411 # will follow the => immediately. If it takes more than
8412 # one line, start it on a new line.
8413 if ($next !~ /\n.*\n/) {
8414 $next =~ s/^ *//;
8415 }
8416 else {
8417 $output .= "\n";
8418 }
8419 $output .= $next;
8420 }
8421
8422 $output .= "$indent},\n" if $using_braces;
8423 }
8424 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8425 $output = $indent . ref($item) . "\n";
8426 # XXX see if blessed
8427 }
8428 elsif ($item->can('dump')) {
8429
8430 # By convention in this program, objects furnish a 'dump'
8431 # method. Since not doing any output at this level, just pass
8432 # on the input indent
8433 $output = $item->dump($indent);
8434 }
8435 else {
8436 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8437 }
8438 }
8439 return $output;
8440 }
8441}
8442
8443sub dump_inside_out {
8444 # Dump inside-out hashes in an object's state by converting them to a
8445 # regular hash and then calling simple_dumper on that.
8446
8447 my $object = shift;
8448 my $fields_ref = shift;
8449 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8450
ffe43484 8451 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
8452
8453 my %hash;
8454 foreach my $key (keys %$fields_ref) {
8455 $hash{$key} = $fields_ref->{$key}{$addr};
8456 }
8457
8458 return simple_dumper(\%hash, @_);
8459}
8460
8461sub _operator_dot {
8462 # Overloaded '.' method that is common to all packages. It uses the
8463 # package's stringify method.
8464
8465 my $self = shift;
8466 my $other = shift;
8467 my $reversed = shift;
8468 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8469
8470 $other = "" unless defined $other;
8471
8472 foreach my $which (\$self, \$other) {
8473 next unless ref $$which;
8474 if ($$which->can('_operator_stringify')) {
8475 $$which = $$which->_operator_stringify;
8476 }
8477 else {
8478 my $ref = ref $$which;
ffe43484 8479 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
8480 $$which = "$ref ($addr)";
8481 }
8482 }
8483 return ($reversed)
8484 ? "$other$self"
8485 : "$self$other";
8486}
8487
8488sub _operator_equal {
8489 # Generic overloaded '==' routine. To be equal, they must be the exact
8490 # same object
8491
8492 my $self = shift;
8493 my $other = shift;
8494
8495 return 0 unless defined $other;
8496 return 0 unless ref $other;
f998e60c 8497 no overloading;
2100aa98 8498 return $self == $other;
99870f4d
KW
8499}
8500
8501sub _operator_not_equal {
8502 my $self = shift;
8503 my $other = shift;
8504
8505 return ! _operator_equal($self, $other);
8506}
8507
8508sub process_PropertyAliases($) {
8509 # This reads in the PropertyAliases.txt file, which contains almost all
8510 # the character properties in Unicode and their equivalent aliases:
8511 # scf ; Simple_Case_Folding ; sfc
8512 #
8513 # Field 0 is the preferred short name for the property.
8514 # Field 1 is the full name.
8515 # Any succeeding ones are other accepted names.
8516
8517 my $file= shift;
8518 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8519
8520 # This whole file was non-existent in early releases, so use our own
8521 # internal one.
8522 $file->insert_lines(get_old_property_aliases())
8523 if ! -e 'PropertyAliases.txt';
8524
8525 # Add any cjk properties that may have been defined.
8526 $file->insert_lines(@cjk_properties);
8527
8528 while ($file->next_line) {
8529
8530 my @data = split /\s*;\s*/;
8531
8532 my $full = $data[1];
8533
8534 my $this = Property->new($data[0], Full_Name => $full);
8535
8536 # Start looking for more aliases after these two.
8537 for my $i (2 .. @data - 1) {
8538 $this->add_alias($data[$i]);
8539 }
8540
8541 }
8542 return;
8543}
8544
8545sub finish_property_setup {
8546 # Finishes setting up after PropertyAliases.
8547
8548 my $file = shift;
8549 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8550
8551 # This entry was missing from this file in earlier Unicode versions
8552 if (-e 'Jamo.txt') {
8553 my $jsn = property_ref('JSN');
8554 if (! defined $jsn) {
8555 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8556 }
8557 }
8558
5f7264c7 8559 # This entry is still missing as of 6.0, perhaps because no short name for
99870f4d
KW
8560 # it.
8561 if (-e 'NameAliases.txt') {
8562 my $aliases = property_ref('Name_Alias');
8563 if (! defined $aliases) {
8564 $aliases = Property->new('Name_Alias');
8565 }
8566 }
8567
8568 # These are used so much, that we set globals for them.
8569 $gc = property_ref('General_Category');
8570 $block = property_ref('Block');
359523e2 8571 $script = property_ref('Script');
99870f4d
KW
8572
8573 # Perl adds this alias.
8574 $gc->add_alias('Category');
8575
8576 # For backwards compatibility, these property files have particular names.
83b7c87d
KW
8577 property_ref('Uppercase_Mapping')->set_file('Upper'); # This is what
8578 # utf8.c calls it
8579 property_ref('Lowercase_Mapping')->set_file('Lower');
8580 property_ref('Titlecase_Mapping')->set_file('Title');
99870f4d
KW
8581
8582 my $fold = property_ref('Case_Folding');
8583 $fold->set_file('Fold') if defined $fold;
8584
d3cbe105
KW
8585 # Unicode::Normalize expects this file with this name and directory.
8586 my $ccc = property_ref('Canonical_Combining_Class');
8587 if (defined $ccc) {
8588 $ccc->set_file('CombiningClass');
8589 $ccc->set_directory(File::Spec->curdir());
8590 }
8591
2cd56239
KW
8592 # utf8.c has a different meaning for non range-size-1 for map properties
8593 # that this program doesn't currently handle; and even if it were changed
8594 # to do so, some other code may be using them expecting range size 1.
99870f4d
KW
8595 foreach my $property (qw {
8596 Case_Folding
8597 Lowercase_Mapping
8598 Titlecase_Mapping
8599 Uppercase_Mapping
8600 })
8601 {
8602 property_ref($property)->set_range_size_1(1);
8603 }
8604
8605 # These two properties aren't actually used in the core, but unfortunately
8606 # the names just above that are in the core interfere with these, so
8607 # choose different names. These aren't a problem unless the map tables
8608 # for these files get written out.
8609 my $lowercase = property_ref('Lowercase');
8610 $lowercase->set_file('IsLower') if defined $lowercase;
8611 my $uppercase = property_ref('Uppercase');
8612 $uppercase->set_file('IsUpper') if defined $uppercase;
8613
8614 # Set up the hard-coded default mappings, but only on properties defined
8615 # for this release
8616 foreach my $property (keys %default_mapping) {
8617 my $property_object = property_ref($property);
8618 next if ! defined $property_object;
8619 my $default_map = $default_mapping{$property};
8620 $property_object->set_default_map($default_map);
8621
8622 # A map of <code point> implies the property is string.
8623 if ($property_object->type == $UNKNOWN
8624 && $default_map eq $CODE_POINT)
8625 {
8626 $property_object->set_type($STRING);
8627 }
8628 }
8629
8630 # The following use the Multi_Default class to create objects for
8631 # defaults.
8632
8633 # Bidi class has a complicated default, but the derived file takes care of
8634 # the complications, leaving just 'L'.
8635 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8636 property_ref('Bidi_Class')->set_default_map('L');
8637 }
8638 else {
8639 my $default;
8640
8641 # The derived file was introduced in 3.1.1. The values below are
8642 # taken from table 3-8, TUS 3.0
8643 my $default_R =
8644 'my $default = Range_List->new;
8645 $default->add_range(0x0590, 0x05FF);
8646 $default->add_range(0xFB1D, 0xFB4F);'
8647 ;
8648
8649 # The defaults apply only to unassigned characters
a67f160a 8650 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
8651
8652 if ($v_version lt v3.0.0) {
8653 $default = Multi_Default->new(R => $default_R, 'L');
8654 }
8655 else {
8656
8657 # AL apparently not introduced until 3.0: TUS 2.x references are
8658 # not on-line to check it out
8659 my $default_AL =
8660 'my $default = Range_List->new;
8661 $default->add_range(0x0600, 0x07BF);
8662 $default->add_range(0xFB50, 0xFDFF);
8663 $default->add_range(0xFE70, 0xFEFF);'
8664 ;
8665
8666 # Non-character code points introduced in this release; aren't AL
8667 if ($v_version ge 3.1.0) {
8668 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8669 }
a67f160a 8670 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
8671 $default = Multi_Default->new(AL => $default_AL,
8672 R => $default_R,
8673 'L');
8674 }
8675 property_ref('Bidi_Class')->set_default_map($default);
8676 }
8677
8678 # Joining type has a complicated default, but the derived file takes care
8679 # of the complications, leaving just 'U' (or Non_Joining), except the file
8680 # is bad in 3.1.0
8681 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8682 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8683 property_ref('Joining_Type')->set_default_map('Non_Joining');
8684 }
8685 else {
8686
8687 # Otherwise, there are not one, but two possibilities for the
8688 # missing defaults: T and U.
8689 # The missing defaults that evaluate to T are given by:
8690 # T = Mn + Cf - ZWNJ - ZWJ
8691 # where Mn and Cf are the general category values. In other words,
8692 # any non-spacing mark or any format control character, except
8693 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8694 # WIDTH JOINER (joining type C).
8695 my $default = Multi_Default->new(
8696 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8697 'Non_Joining');
8698 property_ref('Joining_Type')->set_default_map($default);
8699 }
8700 }
8701
8702 # Line break has a complicated default in early releases. It is 'Unknown'
8703 # for non-assigned code points; 'AL' for assigned.
8704 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8705 my $lb = property_ref('Line_Break');
8706 if ($v_version gt 3.2.0) {
8707 $lb->set_default_map('Unknown');
8708 }
8709 else {
8710 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8711 'AL');
8712 $lb->set_default_map($default);
8713 }
8714
8715 # If has the URS property, make sure that the standard aliases are in
8716 # it, since not in the input tables in some versions.
8717 my $urs = property_ref('Unicode_Radical_Stroke');
8718 if (defined $urs) {
8719 $urs->add_alias('cjkRSUnicode');
8720 $urs->add_alias('kRSUnicode');
8721 }
8722 }
8723 return;
8724}
8725
8726sub get_old_property_aliases() {
8727 # Returns what would be in PropertyAliases.txt if it existed in very old
8728 # versions of Unicode. It was derived from the one in 3.2, and pared
8729 # down based on the data that was actually in the older releases.
8730 # An attempt was made to use the existence of files to mean inclusion or
8731 # not of various aliases, but if this was not sufficient, using version
8732 # numbers was resorted to.
8733
8734 my @return;
8735
8736 # These are to be used in all versions (though some are constructed by
8737 # this program if missing)
8738 push @return, split /\n/, <<'END';
8739bc ; Bidi_Class
8740Bidi_M ; Bidi_Mirrored
8741cf ; Case_Folding
8742ccc ; Canonical_Combining_Class
8743dm ; Decomposition_Mapping
8744dt ; Decomposition_Type
8745gc ; General_Category
8746isc ; ISO_Comment
8747lc ; Lowercase_Mapping
8748na ; Name
8749na1 ; Unicode_1_Name
8750nt ; Numeric_Type
8751nv ; Numeric_Value
8752sfc ; Simple_Case_Folding
8753slc ; Simple_Lowercase_Mapping
8754stc ; Simple_Titlecase_Mapping
8755suc ; Simple_Uppercase_Mapping
8756tc ; Titlecase_Mapping
8757uc ; Uppercase_Mapping
8758END
8759
8760 if (-e 'Blocks.txt') {
8761 push @return, "blk ; Block\n";
8762 }
8763 if (-e 'ArabicShaping.txt') {
8764 push @return, split /\n/, <<'END';
8765jg ; Joining_Group
8766jt ; Joining_Type
8767END
8768 }
8769 if (-e 'PropList.txt') {
8770
8771 # This first set is in the original old-style proplist.
8772 push @return, split /\n/, <<'END';
8773Alpha ; Alphabetic
8774Bidi_C ; Bidi_Control
8775Dash ; Dash
8776Dia ; Diacritic
8777Ext ; Extender
8778Hex ; Hex_Digit
8779Hyphen ; Hyphen
8780IDC ; ID_Continue
8781Ideo ; Ideographic
8782Join_C ; Join_Control
8783Math ; Math
8784QMark ; Quotation_Mark
8785Term ; Terminal_Punctuation
8786WSpace ; White_Space
8787END
8788 # The next sets were added later
8789 if ($v_version ge v3.0.0) {
8790 push @return, split /\n/, <<'END';
8791Upper ; Uppercase
8792Lower ; Lowercase
8793END
8794 }
8795 if ($v_version ge v3.0.1) {
8796 push @return, split /\n/, <<'END';
8797NChar ; Noncharacter_Code_Point
8798END
8799 }
8800 # The next sets were added in the new-style
8801 if ($v_version ge v3.1.0) {
8802 push @return, split /\n/, <<'END';
8803OAlpha ; Other_Alphabetic
8804OLower ; Other_Lowercase
8805OMath ; Other_Math
8806OUpper ; Other_Uppercase
8807END
8808 }
8809 if ($v_version ge v3.1.1) {
8810 push @return, "AHex ; ASCII_Hex_Digit\n";
8811 }
8812 }
8813 if (-e 'EastAsianWidth.txt') {
8814 push @return, "ea ; East_Asian_Width\n";
8815 }
8816 if (-e 'CompositionExclusions.txt') {
8817 push @return, "CE ; Composition_Exclusion\n";
8818 }
8819 if (-e 'LineBreak.txt') {
8820 push @return, "lb ; Line_Break\n";
8821 }
8822 if (-e 'BidiMirroring.txt') {
8823 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8824 }
8825 if (-e 'Scripts.txt') {
8826 push @return, "sc ; Script\n";
8827 }
8828 if (-e 'DNormalizationProps.txt') {
8829 push @return, split /\n/, <<'END';
8830Comp_Ex ; Full_Composition_Exclusion
8831FC_NFKC ; FC_NFKC_Closure
8832NFC_QC ; NFC_Quick_Check
8833NFD_QC ; NFD_Quick_Check
8834NFKC_QC ; NFKC_Quick_Check
8835NFKD_QC ; NFKD_Quick_Check
8836XO_NFC ; Expands_On_NFC
8837XO_NFD ; Expands_On_NFD
8838XO_NFKC ; Expands_On_NFKC
8839XO_NFKD ; Expands_On_NFKD
8840END
8841 }
8842 if (-e 'DCoreProperties.txt') {
8843 push @return, split /\n/, <<'END';
8844IDS ; ID_Start
8845XIDC ; XID_Continue
8846XIDS ; XID_Start
8847END
8848 # These can also appear in some versions of PropList.txt
8849 push @return, "Lower ; Lowercase\n"
8850 unless grep { $_ =~ /^Lower\b/} @return;
8851 push @return, "Upper ; Uppercase\n"
8852 unless grep { $_ =~ /^Upper\b/} @return;
8853 }
8854
8855 # This flag requires the DAge.txt file to be copied into the directory.
8856 if (DEBUG && $compare_versions) {
8857 push @return, 'age ; Age';
8858 }
8859
8860 return @return;
8861}
8862
8863sub process_PropValueAliases {
8864 # This file contains values that properties look like:
8865 # bc ; AL ; Arabic_Letter
8866 # blk; n/a ; Greek_And_Coptic ; Greek
8867 #
8868 # Field 0 is the property.
8869 # Field 1 is the short name of a property value or 'n/a' if no
8870 # short name exists;
8871 # Field 2 is the full property value name;
8872 # Any other fields are more synonyms for the property value.
8873 # Purely numeric property values are omitted from the file; as are some
8874 # others, fewer and fewer in later releases
8875
8876 # Entries for the ccc property have an extra field before the
8877 # abbreviation:
8878 # ccc; 0; NR ; Not_Reordered
8879 # It is the numeric value that the names are synonyms for.
8880
8881 # There are comment entries for values missing from this file:
8882 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8883 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8884
8885 my $file= shift;
8886 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8887
8888 # This whole file was non-existent in early releases, so use our own
8889 # internal one if necessary.
8890 if (! -e 'PropValueAliases.txt') {
8891 $file->insert_lines(get_old_property_value_aliases());
8892 }
8893
8894 # Add any explicit cjk values
8895 $file->insert_lines(@cjk_property_values);
8896
8897 # This line is used only for testing the code that checks for name
8898 # conflicts. There is a script Inherited, and when this line is executed
8899 # it causes there to be a name conflict with the 'Inherited' that this
8900 # program generates for this block property value
8901 #$file->insert_lines('blk; n/a; Herited');
8902
8903
8904 # Process each line of the file ...
8905 while ($file->next_line) {
8906
8907 my ($property, @data) = split /\s*;\s*/;
8908
66b4eb0a
KW
8909 # The ccc property has an extra field at the beginning, which is the
8910 # numeric value. Move it to be after the other two, mnemonic, fields,
8911 # so that those will be used as the property value's names, and the
8912 # number will be an extra alias. (Rightmost splice removes field 1-2,
8913 # returning them in a slice; left splice inserts that before anything,
8914 # thus shifting the former field 0 to after them.)
8915 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8916
8917 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
8918 # there is no short name, use the full one in element 1
027866c1
KW
8919 if ($data[0] eq "n/a") {
8920 $data[0] = $data[1];
8921 }
8922 elsif ($data[0] ne $data[1]
8923 && standardize($data[0]) eq standardize($data[1])
8924 && $data[1] !~ /[[:upper:]]/)
8925 {
8926 # Also, there is a bug in the file in which "n/a" is omitted, and
8927 # the two fields are identical except for case, and the full name
8928 # is all lower case. Copy the "short" name unto the full one to
8929 # give it some upper case.
8930
8931 $data[1] = $data[0];
8932 }
99870f4d
KW
8933
8934 # Earlier releases had the pseudo property 'qc' that should expand to
8935 # the ones that replace it below.
8936 if ($property eq 'qc') {
8937 if (lc $data[0] eq 'y') {
8938 $file->insert_lines('NFC_QC; Y ; Yes',
8939 'NFD_QC; Y ; Yes',
8940 'NFKC_QC; Y ; Yes',
8941 'NFKD_QC; Y ; Yes',
8942 );
8943 }
8944 elsif (lc $data[0] eq 'n') {
8945 $file->insert_lines('NFC_QC; N ; No',
8946 'NFD_QC; N ; No',
8947 'NFKC_QC; N ; No',
8948 'NFKD_QC; N ; No',
8949 );
8950 }
8951 elsif (lc $data[0] eq 'm') {
8952 $file->insert_lines('NFC_QC; M ; Maybe',
8953 'NFKC_QC; M ; Maybe',
8954 );
8955 }
8956 else {
8957 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8958 }
8959 next;
8960 }
8961
8962 # The first field is the short name, 2nd is the full one.
8963 my $property_object = property_ref($property);
8964 my $table = $property_object->add_match_table($data[0],
8965 Full_Name => $data[1]);
8966
8967 # Start looking for more aliases after these two.
8968 for my $i (2 .. @data - 1) {
8969 $table->add_alias($data[$i]);
8970 }
8971 } # End of looping through the file
8972
8973 # As noted in the comments early in the program, it generates tables for
8974 # the default values for all releases, even those for which the concept
8975 # didn't exist at the time. Here we add those if missing.
8976 my $age = property_ref('age');
8977 if (defined $age && ! defined $age->table('Unassigned')) {
8978 $age->add_match_table('Unassigned');
8979 }
8980 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8981 && ! defined $block->table('No_Block');
8982
8983
8984 # Now set the default mappings of the properties from the file. This is
8985 # done after the loop because a number of properties have only @missings
8986 # entries in the file, and may not show up until the end.
8987 my @defaults = $file->get_missings;
8988 foreach my $default_ref (@defaults) {
8989 my $default = $default_ref->[0];
8990 my $property = property_ref($default_ref->[1]);
8991 $property->set_default_map($default);
8992 }
8993 return;
8994}
8995
8996sub get_old_property_value_aliases () {
8997 # Returns what would be in PropValueAliases.txt if it existed in very old
8998 # versions of Unicode. It was derived from the one in 3.2, and pared
8999 # down. An attempt was made to use the existence of files to mean
9000 # inclusion or not of various aliases, but if this was not sufficient,
9001 # using version numbers was resorted to.
9002
9003 my @return = split /\n/, <<'END';
9004bc ; AN ; Arabic_Number
9005bc ; B ; Paragraph_Separator
9006bc ; CS ; Common_Separator
9007bc ; EN ; European_Number
9008bc ; ES ; European_Separator
9009bc ; ET ; European_Terminator
9010bc ; L ; Left_To_Right
9011bc ; ON ; Other_Neutral
9012bc ; R ; Right_To_Left
9013bc ; WS ; White_Space
9014
9015# The standard combining classes are very much different in v1, so only use
9016# ones that look right (not checked thoroughly)
9017ccc; 0; NR ; Not_Reordered
9018ccc; 1; OV ; Overlay
9019ccc; 7; NK ; Nukta
9020ccc; 8; KV ; Kana_Voicing
9021ccc; 9; VR ; Virama
9022ccc; 202; ATBL ; Attached_Below_Left
9023ccc; 216; ATAR ; Attached_Above_Right
9024ccc; 218; BL ; Below_Left
9025ccc; 220; B ; Below
9026ccc; 222; BR ; Below_Right
9027ccc; 224; L ; Left
9028ccc; 228; AL ; Above_Left
9029ccc; 230; A ; Above
9030ccc; 232; AR ; Above_Right
9031ccc; 234; DA ; Double_Above
9032
9033dt ; can ; canonical
9034dt ; enc ; circle
9035dt ; fin ; final
9036dt ; font ; font
9037dt ; fra ; fraction
9038dt ; init ; initial
9039dt ; iso ; isolated
9040dt ; med ; medial
9041dt ; n/a ; none
9042dt ; nb ; noBreak
9043dt ; sqr ; square
9044dt ; sub ; sub
9045dt ; sup ; super
9046
9047gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9048gc ; Cc ; Control
9049gc ; Cn ; Unassigned
9050gc ; Co ; Private_Use
9051gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9052gc ; LC ; Cased_Letter # Ll | Lt | Lu
9053gc ; Ll ; Lowercase_Letter
9054gc ; Lm ; Modifier_Letter
9055gc ; Lo ; Other_Letter
9056gc ; Lu ; Uppercase_Letter
9057gc ; M ; Mark # Mc | Me | Mn
9058gc ; Mc ; Spacing_Mark
9059gc ; Mn ; Nonspacing_Mark
9060gc ; N ; Number # Nd | Nl | No
9061gc ; Nd ; Decimal_Number
9062gc ; No ; Other_Number
9063gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9064gc ; Pd ; Dash_Punctuation
9065gc ; Pe ; Close_Punctuation
9066gc ; Po ; Other_Punctuation
9067gc ; Ps ; Open_Punctuation
9068gc ; S ; Symbol # Sc | Sk | Sm | So
9069gc ; Sc ; Currency_Symbol
9070gc ; Sm ; Math_Symbol
9071gc ; So ; Other_Symbol
9072gc ; Z ; Separator # Zl | Zp | Zs
9073gc ; Zl ; Line_Separator
9074gc ; Zp ; Paragraph_Separator
9075gc ; Zs ; Space_Separator
9076
9077nt ; de ; Decimal
9078nt ; di ; Digit
9079nt ; n/a ; None
9080nt ; nu ; Numeric
9081END
9082
9083 if (-e 'ArabicShaping.txt') {
9084 push @return, split /\n/, <<'END';
9085jg ; n/a ; AIN
9086jg ; n/a ; ALEF
9087jg ; n/a ; DAL
9088jg ; n/a ; GAF
9089jg ; n/a ; LAM
9090jg ; n/a ; MEEM
9091jg ; n/a ; NO_JOINING_GROUP
9092jg ; n/a ; NOON
9093jg ; n/a ; QAF
9094jg ; n/a ; SAD
9095jg ; n/a ; SEEN
9096jg ; n/a ; TAH
9097jg ; n/a ; WAW
9098
9099jt ; C ; Join_Causing
9100jt ; D ; Dual_Joining
9101jt ; L ; Left_Joining
9102jt ; R ; Right_Joining
9103jt ; U ; Non_Joining
9104jt ; T ; Transparent
9105END
9106 if ($v_version ge v3.0.0) {
9107 push @return, split /\n/, <<'END';
9108jg ; n/a ; ALAPH
9109jg ; n/a ; BEH
9110jg ; n/a ; BETH
9111jg ; n/a ; DALATH_RISH
9112jg ; n/a ; E
9113jg ; n/a ; FEH
9114jg ; n/a ; FINAL_SEMKATH
9115jg ; n/a ; GAMAL
9116jg ; n/a ; HAH
9117jg ; n/a ; HAMZA_ON_HEH_GOAL
9118jg ; n/a ; HE
9119jg ; n/a ; HEH
9120jg ; n/a ; HEH_GOAL
9121jg ; n/a ; HETH
9122jg ; n/a ; KAF
9123jg ; n/a ; KAPH
9124jg ; n/a ; KNOTTED_HEH
9125jg ; n/a ; LAMADH
9126jg ; n/a ; MIM
9127jg ; n/a ; NUN
9128jg ; n/a ; PE
9129jg ; n/a ; QAPH
9130jg ; n/a ; REH
9131jg ; n/a ; REVERSED_PE
9132jg ; n/a ; SADHE
9133jg ; n/a ; SEMKATH
9134jg ; n/a ; SHIN
9135jg ; n/a ; SWASH_KAF
9136jg ; n/a ; TAW
9137jg ; n/a ; TEH_MARBUTA
9138jg ; n/a ; TETH
9139jg ; n/a ; YEH
9140jg ; n/a ; YEH_BARREE
9141jg ; n/a ; YEH_WITH_TAIL
9142jg ; n/a ; YUDH
9143jg ; n/a ; YUDH_HE
9144jg ; n/a ; ZAIN
9145END
9146 }
9147 }
9148
9149
9150 if (-e 'EastAsianWidth.txt') {
9151 push @return, split /\n/, <<'END';
9152ea ; A ; Ambiguous
9153ea ; F ; Fullwidth
9154ea ; H ; Halfwidth
9155ea ; N ; Neutral
9156ea ; Na ; Narrow
9157ea ; W ; Wide
9158END
9159 }
9160
9161 if (-e 'LineBreak.txt') {
9162 push @return, split /\n/, <<'END';
9163lb ; AI ; Ambiguous
9164lb ; AL ; Alphabetic
9165lb ; B2 ; Break_Both
9166lb ; BA ; Break_After
9167lb ; BB ; Break_Before
9168lb ; BK ; Mandatory_Break
9169lb ; CB ; Contingent_Break
9170lb ; CL ; Close_Punctuation
9171lb ; CM ; Combining_Mark
9172lb ; CR ; Carriage_Return
9173lb ; EX ; Exclamation
9174lb ; GL ; Glue
9175lb ; HY ; Hyphen
9176lb ; ID ; Ideographic
9177lb ; IN ; Inseperable
9178lb ; IS ; Infix_Numeric
9179lb ; LF ; Line_Feed
9180lb ; NS ; Nonstarter
9181lb ; NU ; Numeric
9182lb ; OP ; Open_Punctuation
9183lb ; PO ; Postfix_Numeric
9184lb ; PR ; Prefix_Numeric
9185lb ; QU ; Quotation
9186lb ; SA ; Complex_Context
9187lb ; SG ; Surrogate
9188lb ; SP ; Space
9189lb ; SY ; Break_Symbols
9190lb ; XX ; Unknown
9191lb ; ZW ; ZWSpace
9192END
9193 }
9194
9195 if (-e 'DNormalizationProps.txt') {
9196 push @return, split /\n/, <<'END';
9197qc ; M ; Maybe
9198qc ; N ; No
9199qc ; Y ; Yes
9200END
9201 }
9202
9203 if (-e 'Scripts.txt') {
9204 push @return, split /\n/, <<'END';
9205sc ; Arab ; Arabic
9206sc ; Armn ; Armenian
9207sc ; Beng ; Bengali
9208sc ; Bopo ; Bopomofo
9209sc ; Cans ; Canadian_Aboriginal
9210sc ; Cher ; Cherokee
9211sc ; Cyrl ; Cyrillic
9212sc ; Deva ; Devanagari
9213sc ; Dsrt ; Deseret
9214sc ; Ethi ; Ethiopic
9215sc ; Geor ; Georgian
9216sc ; Goth ; Gothic
9217sc ; Grek ; Greek
9218sc ; Gujr ; Gujarati
9219sc ; Guru ; Gurmukhi
9220sc ; Hang ; Hangul
9221sc ; Hani ; Han
9222sc ; Hebr ; Hebrew
9223sc ; Hira ; Hiragana
9224sc ; Ital ; Old_Italic
9225sc ; Kana ; Katakana
9226sc ; Khmr ; Khmer
9227sc ; Knda ; Kannada
9228sc ; Laoo ; Lao
9229sc ; Latn ; Latin
9230sc ; Mlym ; Malayalam
9231sc ; Mong ; Mongolian
9232sc ; Mymr ; Myanmar
9233sc ; Ogam ; Ogham
9234sc ; Orya ; Oriya
9235sc ; Qaai ; Inherited
9236sc ; Runr ; Runic
9237sc ; Sinh ; Sinhala
9238sc ; Syrc ; Syriac
9239sc ; Taml ; Tamil
9240sc ; Telu ; Telugu
9241sc ; Thaa ; Thaana
9242sc ; Thai ; Thai
9243sc ; Tibt ; Tibetan
9244sc ; Yiii ; Yi
9245sc ; Zyyy ; Common
9246END
9247 }
9248
9249 if ($v_version ge v2.0.0) {
9250 push @return, split /\n/, <<'END';
9251dt ; com ; compat
9252dt ; nar ; narrow
9253dt ; sml ; small
9254dt ; vert ; vertical
9255dt ; wide ; wide
9256
9257gc ; Cf ; Format
9258gc ; Cs ; Surrogate
9259gc ; Lt ; Titlecase_Letter
9260gc ; Me ; Enclosing_Mark
9261gc ; Nl ; Letter_Number
9262gc ; Pc ; Connector_Punctuation
9263gc ; Sk ; Modifier_Symbol
9264END
9265 }
9266 if ($v_version ge v2.1.2) {
9267 push @return, "bc ; S ; Segment_Separator\n";
9268 }
9269 if ($v_version ge v2.1.5) {
9270 push @return, split /\n/, <<'END';
9271gc ; Pf ; Final_Punctuation
9272gc ; Pi ; Initial_Punctuation
9273END
9274 }
9275 if ($v_version ge v2.1.8) {
9276 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9277 }
9278
9279 if ($v_version ge v3.0.0) {
9280 push @return, split /\n/, <<'END';
9281bc ; AL ; Arabic_Letter
9282bc ; BN ; Boundary_Neutral
9283bc ; LRE ; Left_To_Right_Embedding
9284bc ; LRO ; Left_To_Right_Override
9285bc ; NSM ; Nonspacing_Mark
9286bc ; PDF ; Pop_Directional_Format
9287bc ; RLE ; Right_To_Left_Embedding
9288bc ; RLO ; Right_To_Left_Override
9289
9290ccc; 233; DB ; Double_Below
9291END
9292 }
9293
9294 if ($v_version ge v3.1.0) {
9295 push @return, "ccc; 226; R ; Right\n";
9296 }
9297
9298 return @return;
9299}
9300
b1c167a3
KW
9301sub output_perl_charnames_line ($$) {
9302
9303 # Output the entries in Perl_charnames specially, using 5 digits instead
9304 # of four. This makes the entries a constant length, and simplifies
9305 # charnames.pm which this table is for. Unicode can have 6 digit
9306 # ordinals, but they are all private use or noncharacters which do not
9307 # have names, so won't be in this table.
9308
73d9566f 9309 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
9310}
9311
99870f4d
KW
9312{ # Closure
9313 # This is used to store the range list of all the code points usable when
9314 # the little used $compare_versions feature is enabled.
9315 my $compare_versions_range_list;
9316
96cfc54a
KW
9317 # These are constants to the $property_info hash in this subroutine, to
9318 # avoid using a quoted-string which might have a typo.
9319 my $TYPE = 'type';
9320 my $DEFAULT_MAP = 'default_map';
9321 my $DEFAULT_TABLE = 'default_table';
9322 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9323 my $MISSINGS = 'missings';
9324
99870f4d
KW
9325 sub process_generic_property_file {
9326 # This processes a file containing property mappings and puts them
9327 # into internal map tables. It should be used to handle any property
9328 # files that have mappings from a code point or range thereof to
9329 # something else. This means almost all the UCD .txt files.
9330 # each_line_handlers() should be set to adjust the lines of these
9331 # files, if necessary, to what this routine understands:
9332 #
9333 # 0374 ; NFD_QC; N
9334 # 003C..003E ; Math
9335 #
92f9d56c 9336 # the fields are: "codepoint-range ; property; map"
99870f4d
KW
9337 #
9338 # meaning the codepoints in the range all have the value 'map' under
9339 # 'property'.
98dc9551 9340 # Beginning and trailing white space in each field are not significant.
99870f4d
KW
9341 # Note there is not a trailing semi-colon in the above. A trailing
9342 # semi-colon means the map is a null-string. An omitted map, as
9343 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9344 # table syntax. (This could have been hidden from this routine by
9345 # doing it in the $file object, but that would require parsing of the
9346 # line there, so would have to parse it twice, or change the interface
9347 # to pass this an array. So not done.)
9348 #
9349 # The map field may begin with a sequence of commands that apply to
9350 # this range. Each such command begins and ends with $CMD_DELIM.
9351 # These are used to indicate, for example, that the mapping for a
9352 # range has a non-default type.
9353 #
9354 # This loops through the file, calling it's next_line() method, and
9355 # then taking the map and adding it to the property's table.
9356 # Complications arise because any number of properties can be in the
9357 # file, in any order, interspersed in any way. The first time a
9358 # property is seen, it gets information about that property and
f86864ac 9359 # caches it for quick retrieval later. It also normalizes the maps
5d7f7709
KW
9360 # so that only one of many synonyms is stored. The Unicode input
9361 # files do use some multiple synonyms.
99870f4d
KW
9362
9363 my $file = shift;
9364 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9365
9366 my %property_info; # To keep track of what properties
9367 # have already had entries in the
9368 # current file, and info about each,
9369 # so don't have to recompute.
9370 my $property_name; # property currently being worked on
9371 my $property_type; # and its type
9372 my $previous_property_name = ""; # name from last time through loop
9373 my $property_object; # pointer to the current property's
9374 # object
9375 my $property_addr; # the address of that object
9376 my $default_map; # the string that code points missing
9377 # from the file map to
9378 my $default_table; # For non-string properties, a
9379 # reference to the match table that
9380 # will contain the list of code
9381 # points that map to $default_map.
9382
9383 # Get the next real non-comment line
9384 LINE:
9385 while ($file->next_line) {
9386
9387 # Default replacement type; means that if parts of the range have
9388 # already been stored in our tables, the new map overrides them if
9389 # they differ more than cosmetically
9390 my $replace = $IF_NOT_EQUIVALENT;
9391 my $map_type; # Default type for the map of this range
9392
9393 #local $to_trace = 1 if main::DEBUG;
9394 trace $_ if main::DEBUG && $to_trace;
9395
9396 # Split the line into components
9397 my ($range, $property_name, $map, @remainder)
9398 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9399
9400 # If more or less on the line than we are expecting, warn and skip
9401 # the line
9402 if (@remainder) {
9403 $file->carp_bad_line('Extra fields');
9404 next LINE;
9405 }
9406 elsif ( ! defined $property_name) {
9407 $file->carp_bad_line('Missing property');
9408 next LINE;
9409 }
9410
9411 # Examine the range.
9412 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9413 {
9414 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9415 next LINE;
9416 }
9417 my $low = hex $1;
9418 my $high = (defined $2) ? hex $2 : $low;
9419
9420 # For the very specialized case of comparing two Unicode
9421 # versions...
9422 if (DEBUG && $compare_versions) {
9423 if ($property_name eq 'Age') {
9424
9425 # Only allow code points at least as old as the version
9426 # specified.
9427 my $age = pack "C*", split(/\./, $map); # v string
9428 next LINE if $age gt $compare_versions;
9429 }
9430 else {
9431
9432 # Again, we throw out code points younger than those of
9433 # the specified version. By now, the Age property is
9434 # populated. We use the intersection of each input range
9435 # with this property to find what code points in it are
9436 # valid. To do the intersection, we have to convert the
9437 # Age property map to a Range_list. We only have to do
9438 # this once.
9439 if (! defined $compare_versions_range_list) {
9440 my $age = property_ref('Age');
9441 if (! -e 'DAge.txt') {
9442 croak "Need to have 'DAge.txt' file to do version comparison";
9443 }
9444 elsif ($age->count == 0) {
9445 croak "The 'Age' table is empty, but its file exists";
9446 }
9447 $compare_versions_range_list
9448 = Range_List->new(Initialize => $age);
9449 }
9450
9451 # An undefined map is always 'Y'
9452 $map = 'Y' if ! defined $map;
9453
9454 # Calculate the intersection of the input range with the
9455 # code points that are known in the specified version
9456 my @ranges = ($compare_versions_range_list
9457 & Range->new($low, $high))->ranges;
9458
9459 # If the intersection is empty, throw away this range
9460 next LINE unless @ranges;
9461
9462 # Only examine the first range this time through the loop.
9463 my $this_range = shift @ranges;
9464
9465 # Put any remaining ranges in the queue to be processed
9466 # later. Note that there is unnecessary work here, as we
9467 # will do the intersection again for each of these ranges
9468 # during some future iteration of the LINE loop, but this
9469 # code is not used in production. The later intersections
9470 # are guaranteed to not splinter, so this will not become
9471 # an infinite loop.
9472 my $line = join ';', $property_name, $map;
9473 foreach my $range (@ranges) {
9474 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9475 $range->start,
9476 $range->end,
9477 $line));
9478 }
9479
9480 # And process the first range, like any other.
9481 $low = $this_range->start;
9482 $high = $this_range->end;
9483 }
9484 } # End of $compare_versions
9485
9486 # If changing to a new property, get the things constant per
9487 # property
9488 if ($previous_property_name ne $property_name) {
9489
9490 $property_object = property_ref($property_name);
9491 if (! defined $property_object) {
9492 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9493 next LINE;
9494 }
051df77b 9495 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
9496
9497 # Defer changing names until have a line that is acceptable
9498 # (the 'next' statement above means is unacceptable)
9499 $previous_property_name = $property_name;
9500
9501 # If not the first time for this property, retrieve info about
9502 # it from the cache
96cfc54a
KW
9503 if (defined ($property_info{$property_addr}{$TYPE})) {
9504 $property_type = $property_info{$property_addr}{$TYPE};
9505 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
99870f4d 9506 $map_type
96cfc54a 9507 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
99870f4d 9508 $default_table
96cfc54a 9509 = $property_info{$property_addr}{$DEFAULT_TABLE};
99870f4d
KW
9510 }
9511 else {
9512
9513 # Here, is the first time for this property. Set up the
9514 # cache.
96cfc54a 9515 $property_type = $property_info{$property_addr}{$TYPE}
99870f4d
KW
9516 = $property_object->type;
9517 $map_type
96cfc54a 9518 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
99870f4d
KW
9519 = $property_object->pseudo_map_type;
9520
9521 # The Unicode files are set up so that if the map is not
9522 # defined, it is a binary property
9523 if (! defined $map && $property_type != $BINARY) {
9524 if ($property_type != $UNKNOWN
9525 && $property_type != $NON_STRING)
9526 {
9527 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9528 }
9529 else {
9530 $property_object->set_type($BINARY);
9531 $property_type
96cfc54a 9532 = $property_info{$property_addr}{$TYPE}
99870f4d
KW
9533 = $BINARY;
9534 }
9535 }
9536
9537 # Get any @missings default for this property. This
9538 # should precede the first entry for the property in the
9539 # input file, and is located in a comment that has been
9540 # stored by the Input_file class until we access it here.
9541 # It's possible that there is more than one such line
9542 # waiting for us; collect them all, and parse
9543 my @missings_list = $file->get_missings
9544 if $file->has_missings_defaults;
9545 foreach my $default_ref (@missings_list) {
9546 my $default = $default_ref->[0];
ffe43484 9547 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
9548
9549 # For string properties, the default is just what the
9550 # file says, but non-string properties should already
9551 # have set up a table for the default property value;
9552 # use the table for these, so can resolve synonyms
9553 # later to a single standard one.
9554 if ($property_type == $STRING
9555 || $property_type == $UNKNOWN)
9556 {
96cfc54a 9557 $property_info{$addr}{$MISSINGS} = $default;
99870f4d
KW
9558 }
9559 else {
96cfc54a 9560 $property_info{$addr}{$MISSINGS}
99870f4d
KW
9561 = $property_object->table($default);
9562 }
9563 }
9564
9565 # Finished storing all the @missings defaults in the input
9566 # file so far. Get the one for the current property.
96cfc54a 9567 my $missings = $property_info{$property_addr}{$MISSINGS};
99870f4d
KW
9568
9569 # But we likely have separately stored what the default
9570 # should be. (This is to accommodate versions of the
9571 # standard where the @missings lines are absent or
9572 # incomplete.) Hopefully the two will match. But check
9573 # it out.
9574 $default_map = $property_object->default_map;
9575
9576 # If the map is a ref, it means that the default won't be
9577 # processed until later, so undef it, so next few lines
9578 # will redefine it to something that nothing will match
9579 undef $default_map if ref $default_map;
9580
9581 # Create a $default_map if don't have one; maybe a dummy
9582 # that won't match anything.
9583 if (! defined $default_map) {
9584
9585 # Use any @missings line in the file.
9586 if (defined $missings) {
9587 if (ref $missings) {
9588 $default_map = $missings->full_name;
9589 $default_table = $missings;
9590 }
9591 else {
9592 $default_map = $missings;
9593 }
678f13d5 9594
99870f4d
KW
9595 # And store it with the property for outside use.
9596 $property_object->set_default_map($default_map);
9597 }
9598 else {
9599
9600 # Neither an @missings nor a default map. Create
9601 # a dummy one, so won't have to test definedness
9602 # in the main loop.
9603 $default_map = '_Perl This will never be in a file
9604 from Unicode';
9605 }
9606 }
9607
9608 # Here, we have $default_map defined, possibly in terms of
9609 # $missings, but maybe not, and possibly is a dummy one.
9610 if (defined $missings) {
9611
9612 # Make sure there is no conflict between the two.
9613 # $missings has priority.
9614 if (ref $missings) {
23e33b60
KW
9615 $default_table
9616 = $property_object->table($default_map);
99870f4d
KW
9617 if (! defined $default_table
9618 || $default_table != $missings)
9619 {
9620 if (! defined $default_table) {
9621 $default_table = $UNDEF;
9622 }
9623 $file->carp_bad_line(<<END
9624The \@missings line for $property_name in $file says that missings default to
9625$missings, but we expect it to be $default_table. $missings used.
9626END
9627 );
9628 $default_table = $missings;
9629 $default_map = $missings->full_name;
9630 }
96cfc54a 9631 $property_info{$property_addr}{$DEFAULT_TABLE}
99870f4d
KW
9632 = $default_table;
9633 }
9634 elsif ($default_map ne $missings) {
9635 $file->carp_bad_line(<<END
9636The \@missings line for $property_name in $file says that missings default to
9637$missings, but we expect it to be $default_map. $missings used.
9638END
9639 );
9640 $default_map = $missings;
9641 }
9642 }
9643
96cfc54a 9644 $property_info{$property_addr}{$DEFAULT_MAP}
99870f4d
KW
9645 = $default_map;
9646
9647 # If haven't done so already, find the table corresponding
9648 # to this map for non-string properties.
9649 if (! defined $default_table
9650 && $property_type != $STRING
9651 && $property_type != $UNKNOWN)
9652 {
9653 $default_table = $property_info{$property_addr}
96cfc54a 9654 {$DEFAULT_TABLE}
99870f4d
KW
9655 = $property_object->table($default_map);
9656 }
9657 } # End of is first time for this property
9658 } # End of switching properties.
9659
9660 # Ready to process the line.
9661 # The Unicode files are set up so that if the map is not defined,
9662 # it is a binary property with value 'Y'
9663 if (! defined $map) {
9664 $map = 'Y';
9665 }
9666 else {
9667
9668 # If the map begins with a special command to us (enclosed in
9669 # delimiters), extract the command(s).
a35d7f90
KW
9670 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9671 my $command = $1;
9672 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9673 $replace = $1;
99870f4d 9674 }
a35d7f90
KW
9675 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9676 $map_type = $1;
9677 }
9678 else {
9679 $file->carp_bad_line("Unknown command line: '$1'");
9680 next LINE;
9681 }
9682 }
99870f4d
KW
9683 }
9684
9685 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9686 {
9687
9688 # Here, we have a map to a particular code point, and the
9689 # default map is to a code point itself. If the range
9690 # includes the particular code point, change that portion of
9691 # the range to the default. This makes sure that in the final
9692 # table only the non-defaults are listed.
9693 my $decimal_map = hex $map;
9694 if ($low <= $decimal_map && $decimal_map <= $high) {
9695
9696 # If the range includes stuff before or after the map
9697 # we're changing, split it and process the split-off parts
9698 # later.
9699 if ($low < $decimal_map) {
9700 $file->insert_adjusted_lines(
9701 sprintf("%04X..%04X; %s; %s",
9702 $low,
9703 $decimal_map - 1,
9704 $property_name,
9705 $map));
9706 }
9707 if ($high > $decimal_map) {
9708 $file->insert_adjusted_lines(
9709 sprintf("%04X..%04X; %s; %s",
9710 $decimal_map + 1,
9711 $high,
9712 $property_name,
9713 $map));
9714 }
9715 $low = $high = $decimal_map;
9716 $map = $CODE_POINT;
9717 }
9718 }
9719
9720 # If we can tell that this is a synonym for the default map, use
9721 # the default one instead.
9722 if ($property_type != $STRING
9723 && $property_type != $UNKNOWN)
9724 {
9725 my $table = $property_object->table($map);
9726 if (defined $table && $table == $default_table) {
9727 $map = $default_map;
9728 }
9729 }
9730
9731 # And figure out the map type if not known.
9732 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9733 if ($map eq "") { # Nulls are always $NULL map type
9734 $map_type = $NULL;
9735 } # Otherwise, non-strings, and those that don't allow
9736 # $MULTI_CP, and those that aren't multiple code points are
9737 # 0
9738 elsif
9739 (($property_type != $STRING && $property_type != $UNKNOWN)
9740 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9741 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9742 {
9743 $map_type = 0;
9744 }
9745 else {
9746 $map_type = $MULTI_CP;
9747 }
9748 }
9749
9750 $property_object->add_map($low, $high,
9751 $map,
9752 Type => $map_type,
9753 Replace => $replace);
9754 } # End of loop through file's lines
9755
9756 return;
9757 }
9758}
9759
99870f4d
KW
9760{ # Closure for UnicodeData.txt handling
9761
9762 # This file was the first one in the UCD; its design leads to some
9763 # awkwardness in processing. Here is a sample line:
9764 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9765 # The fields in order are:
9766 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9767 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9768 my $CATEGORY = $i++; # category (e.g. "Lu")
9769 my $CCC = $i++; # Canonical combining class (e.g. "230")
9770 my $BIDI = $i++; # directional class (e.g. "L")
9771 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9772 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9773 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9774 # Dual-use in this program; see below
9775 my $NUMERIC = $i++; # numeric value
9776 my $MIRRORED = $i++; # ? mirrored
9777 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9778 my $COMMENT = $i++; # iso comment
9779 my $UPPER = $i++; # simple uppercase mapping
9780 my $LOWER = $i++; # simple lowercase mapping
9781 my $TITLE = $i++; # simple titlecase mapping
9782 my $input_field_count = $i;
9783
9784 # This routine in addition outputs these extra fields:
9785 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9786
9787 # These fields are modifications of ones above, and are usually
9788 # suppressed; they must come last, as for speed, the loop upper bound is
9789 # normally set to ignore them
9790 my $NAME = $i++; # This is the strict name field, not the one that
9791 # charnames uses.
9792 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9793 # by Unicode::Normalize
99870f4d
KW
9794 my $last_field = $i - 1;
9795
9796 # All these are read into an array for each line, with the indices defined
9797 # above. The empty fields in the example line above indicate that the
9798 # value is defaulted. The handler called for each line of the input
9799 # changes these to their defaults.
9800
9801 # Here are the official names of the properties, in a parallel array:
9802 my @field_names;
9803 $field_names[$BIDI] = 'Bidi_Class';
9804 $field_names[$CATEGORY] = 'General_Category';
9805 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9806 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9807 $field_names[$COMMENT] = 'ISO_Comment';
9808 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9809 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9810 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9811 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9812 $field_names[$NAME] = 'Name';
9813 $field_names[$NUMERIC] = 'Numeric_Value';
9814 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9815 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9816 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9817 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9818 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9819 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9820
28093d0e
KW
9821 # Some of these need a little more explanation:
9822 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9823 # property, but is used in calculating the Numeric_Type. Perl however,
9824 # creates a file from this field, so a Perl property is created from it.
9825 # Similarly, the Other_Digit field is used only for calculating the
9826 # Numeric_Type, and so it can be safely re-used as the place to store
9827 # the value for Numeric_Type; hence it is referred to as
9828 # $NUMERIC_TYPE_OTHER_DIGIT.
9829 # The input field named $PERL_DECOMPOSITION is a combination of both the
9830 # decomposition mapping and its type. Perl creates a file containing
9831 # exactly this field, so it is used for that. The two properties are
9832 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9833 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9834 # output it), as Perl doesn't use it directly.
9835 # The input field named here $CHARNAME is used to construct the
9836 # Perl_Charnames property, which is a combination of the Name property
9837 # (which the input field contains), and the Unicode_1_Name property, and
9838 # others from other files. Since, the strict Name property is not used
9839 # by Perl, this field is used for the table that Perl does use. The
9840 # strict Name property table is usually suppressed (unless the lists are
9841 # changed to output it), so it is accumulated in a separate field,
9842 # $NAME, which to save time is discarded unless the table is actually to
9843 # be output
99870f4d
KW
9844
9845 # This file is processed like most in this program. Control is passed to
9846 # process_generic_property_file() which calls filter_UnicodeData_line()
9847 # for each input line. This filter converts the input into line(s) that
9848 # process_generic_property_file() understands. There is also a setup
9849 # routine called before any of the file is processed, and a handler for
9850 # EOF processing, all in this closure.
9851
9852 # A huge speed-up occurred at the cost of some added complexity when these
9853 # routines were altered to buffer the outputs into ranges. Almost all the
9854 # lines of the input file apply to just one code point, and for most
9855 # properties, the map for the next code point up is the same as the
9856 # current one. So instead of creating a line for each property for each
9857 # input line, filter_UnicodeData_line() remembers what the previous map
9858 # of a property was, and doesn't generate a line to pass on until it has
9859 # to, as when the map changes; and that passed-on line encompasses the
9860 # whole contiguous range of code points that have the same map for that
9861 # property. This means a slight amount of extra setup, and having to
9862 # flush these buffers on EOF, testing if the maps have changed, plus
9863 # remembering state information in the closure. But it means a lot less
9864 # real time in not having to change the data base for each property on
9865 # each line.
9866
9867 # Another complication is that there are already a few ranges designated
9868 # in the input. There are two lines for each, with the same maps except
9869 # the code point and name on each line. This was actually the hardest
9870 # thing to design around. The code points in those ranges may actually
9871 # have real maps not given by these two lines. These maps will either
56339b2c 9872 # be algorithmically determinable, or be in the extracted files furnished
99870f4d
KW
9873 # with the UCD. In the event of conflicts between these extracted files,
9874 # and this one, Unicode says that this one prevails. But it shouldn't
9875 # prevail for conflicts that occur in these ranges. The data from the
9876 # extracted files prevails in those cases. So, this program is structured
9877 # so that those files are processed first, storing maps. Then the other
9878 # files are processed, generally overwriting what the extracted files
9879 # stored. But just the range lines in this input file are processed
9880 # without overwriting. This is accomplished by adding a special string to
9881 # the lines output to tell process_generic_property_file() to turn off the
9882 # overwriting for just this one line.
9883 # A similar mechanism is used to tell it that the map is of a non-default
9884 # type.
9885
9886 sub setup_UnicodeData { # Called before any lines of the input are read
9887 my $file = shift;
9888 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9889
28093d0e
KW
9890 # Create a new property specially located that is a combination of the
9891 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9892 # Name_Alias properties. (The final duplicates elements of the
9893 # first.) A comment for it will later be constructed based on the
9894 # actual properties present and used
3e20195b 9895 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9896 Default_Map => "",
9897 Directory => File::Spec->curdir(),
9898 File => 'Name',
301ba948 9899 Fate => $INTERNAL_ONLY,
28093d0e 9900 Perl_Extension => 1,
b1c167a3 9901 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9902 Type => $STRING,
9903 );
5be997b0 9904 $perl_charname->set_proxy_for('Name', 'Name_Alias');
28093d0e 9905
99870f4d 9906 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9907 Directory => File::Spec->curdir(),
99870f4d 9908 File => 'Decomposition',
a14f3cb1 9909 Format => $DECOMP_STRING_FORMAT,
301ba948 9910 Fate => $INTERNAL_ONLY,
99870f4d
KW
9911 Perl_Extension => 1,
9912 Default_Map => $CODE_POINT,
9913
0c07e538
KW
9914 # normalize.pm can't cope with these
9915 Output_Range_Counts => 0,
9916
99870f4d
KW
9917 # This is a specially formatted table
9918 # explicitly for normalize.pm, which
9919 # is expecting a particular format,
9920 # which means that mappings containing
9921 # multiple code points are in the main
9922 # body of the table
9923 Map_Type => $COMPUTE_NO_MULTI_CP,
9924 Type => $STRING,
9925 );
5be997b0 9926 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
99870f4d
KW
9927 $Perl_decomp->add_comment(join_lines(<<END
9928This mapping is a combination of the Unicode 'Decomposition_Type' and
9929'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
8d6427a5 9930identical to the official Unicode 'Decomposition_Mapping' property except for
99870f4d
KW
9931two things:
9932 1) It omits the algorithmically determinable Hangul syllable decompositions,
9933which normalize.pm handles algorithmically.
9934 2) It contains the decomposition type as well. Non-canonical decompositions
9935begin with a word in angle brackets, like <super>, which denotes the
9936compatible decomposition type. If the map does not begin with the <angle
9937brackets>, the decomposition is canonical.
9938END
9939 ));
9940
9941 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9942 Default_Map => "",
9943 Perl_Extension => 1,
9944 File => 'Digit', # Trad. location
9945 Directory => $map_directory,
9946 Type => $STRING,
9947 Range_Size_1 => 1,
9948 );
9949 $Decimal_Digit->add_comment(join_lines(<<END
9950This file gives the mapping of all code points which represent a single
9951decimal digit [0-9] to their respective digits. For example, the code point
9952U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9953that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9954numerals.
9955END
9956 ));
9957
28093d0e
KW
9958 # These properties are not used for generating anything else, and are
9959 # usually not output. By making them last in the list, we can just
99870f4d 9960 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
9961 # generating a table(s) that is/are just going to get thrown away.
9962 if (! property_ref('Decomposition_Mapping')->to_output_map
9963 && ! property_ref('Name')->to_output_map)
9964 {
9965 $last_field = min($NAME, $DECOMP_MAP) - 1;
9966 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9967 $last_field = $DECOMP_MAP;
9968 } elsif (property_ref('Name')->to_output_map) {
9969 $last_field = $NAME;
99870f4d
KW
9970 }
9971 return;
9972 }
9973
9974 my $first_time = 1; # ? Is this the first line of the file
9975 my $in_range = 0; # ? Are we in one of the file's ranges
9976 my $previous_cp; # hex code point of previous line
9977 my $decimal_previous_cp = -1; # And its decimal equivalent
9978 my @start; # For each field, the current starting
9979 # code point in hex for the range
9980 # being accumulated.
9981 my @fields; # The input fields;
9982 my @previous_fields; # And those from the previous call
9983
9984 sub filter_UnicodeData_line {
9985 # Handle a single input line from UnicodeData.txt; see comments above
9986 # Conceptually this takes a single line from the file containing N
9987 # properties, and converts it into N lines with one property per line,
9988 # which is what the final handler expects. But there are
9989 # complications due to the quirkiness of the input file, and to save
9990 # time, it accumulates ranges where the property values don't change
9991 # and only emits lines when necessary. This is about an order of
9992 # magnitude fewer lines emitted.
9993
9994 my $file = shift;
9995 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9996
9997 # $_ contains the input line.
9998 # -1 in split means retain trailing null fields
9999 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10000
10001 #local $to_trace = 1 if main::DEBUG;
10002 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10003 if (@fields > $input_field_count) {
10004 $file->carp_bad_line('Extra fields');
10005 $_ = "";
10006 return;
10007 }
10008
10009 my $decimal_cp = hex $cp;
10010
10011 # We have to output all the buffered ranges when the next code point
10012 # is not exactly one after the previous one, which means there is a
10013 # gap in the ranges.
10014 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10015
10016 # The decomposition mapping field requires special handling. It looks
10017 # like either:
10018 #
10019 # <compat> 0032 0020
10020 # 0041 0300
10021 #
10022 # The decomposition type is enclosed in <brackets>; if missing, it
10023 # means the type is canonical. There are two decomposition mapping
10024 # tables: the one for use by Perl's normalize.pm has a special format
10025 # which is this field intact; the other, for general use is of
10026 # standard format. In either case we have to find the decomposition
10027 # type. Empty fields have None as their type, and map to the code
10028 # point itself
10029 if ($fields[$PERL_DECOMPOSITION] eq "") {
10030 $fields[$DECOMP_TYPE] = 'None';
10031 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10032 }
10033 else {
10034 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10035 =~ / < ( .+? ) > \s* ( .+ ) /x;
10036 if (! defined $fields[$DECOMP_TYPE]) {
10037 $fields[$DECOMP_TYPE] = 'Canonical';
10038 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10039 }
10040 else {
10041 $fields[$DECOMP_MAP] = $map;
10042 }
10043 }
10044
10045 # The 3 numeric fields also require special handling. The 2 digit
10046 # fields must be either empty or match the number field. This means
10047 # that if it is empty, they must be as well, and the numeric type is
10048 # None, and the numeric value is 'Nan'.
10049 # The decimal digit field must be empty or match the other digit
10050 # field. If the decimal digit field is non-empty, the code point is
10051 # a decimal digit, and the other two fields will have the same value.
10052 # If it is empty, but the other digit field is non-empty, the code
10053 # point is an 'other digit', and the number field will have the same
10054 # value as the other digit field. If the other digit field is empty,
10055 # but the number field is non-empty, the code point is a generic
10056 # numeric type.
10057 if ($fields[$NUMERIC] eq "") {
10058 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10059 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10060 ) {
10061 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10062 }
10063 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10064 $fields[$NUMERIC] = 'NaN';
10065 }
10066 else {
10067 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10068 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10069 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10070 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10071 }
10072 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10073 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10074 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10075 }
10076 else {
10077 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10078
10079 # Rationals require extra effort.
10080 register_fraction($fields[$NUMERIC])
10081 if $fields[$NUMERIC] =~ qr{/};
10082 }
10083 }
10084
10085 # For the properties that have empty fields in the file, and which
10086 # mean something different from empty, change them to that default.
10087 # Certain fields just haven't been empty so far in any Unicode
10088 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10089 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 10090 # the defaults; which are very unlikely to ever change.
99870f4d
KW
10091 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10092 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10093
10094 # UAX44 says that if title is empty, it is the same as whatever upper
10095 # is,
10096 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10097
10098 # There are a few pairs of lines like:
10099 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10100 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10101 # that define ranges. These should be processed after the fields are
10102 # adjusted above, as they may override some of them; but mostly what
28093d0e 10103 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
10104 # paired lines start with a '<', but this is also true of '<control>,
10105 # which isn't one of these special ones.
28093d0e 10106 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
10107
10108 # Some code points in this file have the pseudo-name
10109 # '<control>', but the official name for such ones is the null
28093d0e 10110 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 10111 $fields[$NAME] = "";
28093d0e 10112 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
10113
10114 # We had better not be in between range lines.
10115 if ($in_range) {
28093d0e 10116 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10117 $in_range = 0;
10118 }
10119 }
28093d0e 10120 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
10121
10122 # Here is a non-range line. We had better not be in between range
10123 # lines.
10124 if ($in_range) {
28093d0e 10125 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10126 $in_range = 0;
10127 }
edb80b88
KW
10128 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10129
10130 # These are code points whose names end in their code points,
10131 # which means the names are algorithmically derivable from the
10132 # code points. To shorten the output Name file, the algorithm
10133 # for deriving these is placed in the file instead of each
10134 # code point, so they have map type $CP_IN_NAME
10135 $fields[$CHARNAME] = $CMD_DELIM
10136 . $MAP_TYPE_CMD
10137 . '='
10138 . $CP_IN_NAME
10139 . $CMD_DELIM
10140 . $fields[$CHARNAME];
10141 }
28093d0e 10142 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 10143 }
28093d0e
KW
10144 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10145 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
10146
10147 # Here we are at the beginning of a range pair.
10148 if ($in_range) {
28093d0e 10149 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10150 }
10151 $in_range = 1;
10152
10153 # Because the properties in the range do not overwrite any already
10154 # in the db, we must flush the buffers of what's already there, so
10155 # they get handled in the normal scheme.
10156 $force_output = 1;
10157
10158 }
28093d0e
KW
10159 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10160 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
10161 $_ = "";
10162 return;
10163 }
10164 else { # Here, we are at the last line of a range pair.
10165
10166 if (! $in_range) {
28093d0e 10167 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
10168 $_ = "";
10169 return;
10170 }
10171 $in_range = 0;
10172
28093d0e
KW
10173 $fields[$NAME] = $fields[$CHARNAME];
10174
99870f4d
KW
10175 # Check that the input is valid: that the closing of the range is
10176 # the same as the beginning.
10177 foreach my $i (0 .. $last_field) {
10178 next if $fields[$i] eq $previous_fields[$i];
10179 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10180 }
10181
10182 # The processing differs depending on the type of range,
28093d0e
KW
10183 # determined by its $CHARNAME
10184 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
10185
10186 # Check that the data looks right.
10187 if ($decimal_previous_cp != $SBase) {
10188 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10189 }
10190 if ($decimal_cp != $SBase + $SCount - 1) {
10191 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10192 }
10193
10194 # The Hangul syllable range has a somewhat complicated name
10195 # generation algorithm. Each code point in it has a canonical
10196 # decomposition also computable by an algorithm. The
10197 # perl decomposition map table built from these is used only
10198 # by normalize.pm, which has the algorithm built in it, so the
10199 # decomposition maps are not needed, and are large, so are
10200 # omitted from it. If the full decomposition map table is to
10201 # be output, the decompositions are generated for it, in the
10202 # EOF handling code for this input file.
10203
10204 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10205
10206 # This range is stored in our internal structure with its
10207 # own map type, different from all others.
28093d0e
KW
10208 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10209 = $CMD_DELIM
99870f4d
KW
10210 . $MAP_TYPE_CMD
10211 . '='
10212 . $HANGUL_SYLLABLE
10213 . $CMD_DELIM
28093d0e 10214 . $fields[$CHARNAME];
99870f4d 10215 }
28093d0e 10216 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
10217
10218 # The name for these contains the code point itself, and all
10219 # are defined to have the same base name, regardless of what
10220 # is in the file. They are stored in our internal structure
10221 # with a map type of $CP_IN_NAME
28093d0e
KW
10222 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10223 = $CMD_DELIM
99870f4d
KW
10224 . $MAP_TYPE_CMD
10225 . '='
10226 . $CP_IN_NAME
10227 . $CMD_DELIM
10228 . 'CJK UNIFIED IDEOGRAPH';
10229
10230 }
10231 elsif ($fields[$CATEGORY] eq 'Co'
10232 || $fields[$CATEGORY] eq 'Cs')
10233 {
10234 # The names of all the code points in these ranges are set to
10235 # null, as there are no names for the private use and
10236 # surrogate code points.
10237
28093d0e 10238 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
10239 }
10240 else {
28093d0e 10241 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
10242 }
10243
10244 # The first line of the range caused everything else to be output,
10245 # and then its values were stored as the beginning values for the
10246 # next set of ranges, which this one ends. Now, for each value,
10247 # add a command to tell the handler that these values should not
10248 # replace any existing ones in our database.
10249 foreach my $i (0 .. $last_field) {
10250 $previous_fields[$i] = $CMD_DELIM
10251 . $REPLACE_CMD
10252 . '='
10253 . $NO
10254 . $CMD_DELIM
10255 . $previous_fields[$i];
10256 }
10257
10258 # And change things so it looks like the entire range has been
10259 # gone through with this being the final part of it. Adding the
10260 # command above to each field will cause this range to be flushed
10261 # during the next iteration, as it guaranteed that the stored
10262 # field won't match whatever value the next one has.
10263 $previous_cp = $cp;
10264 $decimal_previous_cp = $decimal_cp;
10265
10266 # We are now set up for the next iteration; so skip the remaining
10267 # code in this subroutine that does the same thing, but doesn't
10268 # know about these ranges.
10269 $_ = "";
c1739a4a 10270
99870f4d
KW
10271 return;
10272 }
10273
10274 # On the very first line, we fake it so the code below thinks there is
10275 # nothing to output, and initialize so that when it does get output it
10276 # uses the first line's values for the lowest part of the range.
10277 # (One could avoid this by using peek(), but then one would need to
10278 # know the adjustments done above and do the same ones in the setup
10279 # routine; not worth it)
10280 if ($first_time) {
10281 $first_time = 0;
10282 @previous_fields = @fields;
10283 @start = ($cp) x scalar @fields;
10284 $decimal_previous_cp = $decimal_cp - 1;
10285 }
10286
10287 # For each field, output the stored up ranges that this code point
10288 # doesn't fit in. Earlier we figured out if all ranges should be
10289 # terminated because of changing the replace or map type styles, or if
10290 # there is a gap between this new code point and the previous one, and
10291 # that is stored in $force_output. But even if those aren't true, we
10292 # need to output the range if this new code point's value for the
10293 # given property doesn't match the stored range's.
10294 #local $to_trace = 1 if main::DEBUG;
10295 foreach my $i (0 .. $last_field) {
10296 my $field = $fields[$i];
10297 if ($force_output || $field ne $previous_fields[$i]) {
10298
10299 # Flush the buffer of stored values.
10300 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10301
10302 # Start a new range with this code point and its value
10303 $start[$i] = $cp;
10304 $previous_fields[$i] = $field;
10305 }
10306 }
10307
10308 # Set the values for the next time.
10309 $previous_cp = $cp;
10310 $decimal_previous_cp = $decimal_cp;
10311
10312 # The input line has generated whatever adjusted lines are needed, and
10313 # should not be looked at further.
10314 $_ = "";
10315 return;
10316 }
10317
10318 sub EOF_UnicodeData {
10319 # Called upon EOF to flush the buffers, and create the Hangul
10320 # decomposition mappings if needed.
10321
10322 my $file = shift;
10323 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10324
10325 # Flush the buffers.
10326 foreach my $i (1 .. $last_field) {
10327 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10328 }
10329
10330 if (-e 'Jamo.txt') {
10331
10332 # The algorithm is published by Unicode, based on values in
10333 # Jamo.txt, (which should have been processed before this
10334 # subroutine), and the results left in %Jamo
10335 unless (%Jamo) {
10336 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10337 return;
10338 }
10339
10340 # If the full decomposition map table is being output, insert
10341 # into it the Hangul syllable mappings. This is to avoid having
10342 # to publish a subroutine in it to compute them. (which would
10343 # essentially be this code.) This uses the algorithm published by
10344 # Unicode.
10345 if (property_ref('Decomposition_Mapping')->to_output_map) {
10346 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10347 use integer;
10348 my $SIndex = $S - $SBase;
10349 my $L = $LBase + $SIndex / $NCount;
10350 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10351 my $T = $TBase + $SIndex % $TCount;
10352
10353 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10354 my $decomposition = sprintf("%04X %04X", $L, $V);
10355 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10356 $file->insert_adjusted_lines(
10357 sprintf("%04X; Decomposition_Mapping; %s",
10358 $S,
10359 $decomposition));
10360 }
10361 }
10362 }
10363
10364 return;
10365 }
10366
10367 sub filter_v1_ucd {
10368 # Fix UCD lines in version 1. This is probably overkill, but this
10369 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10370 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10371 # removed. This program retains them
10372 # 2) didn't include ranges, which it should have, and which are now
10373 # added in @corrected_lines below. It was hand populated by
10374 # taking the data from Version 2, verified by analyzing
10375 # DAge.txt.
10376 # 3) There is a syntax error in the entry for U+09F8 which could
10377 # cause problems for utf8_heavy, and so is changed. It's
10378 # numeric value was simply a minus sign, without any number.
10379 # (Eventually Unicode changed the code point to non-numeric.)
10380 # 4) The decomposition types often don't match later versions
10381 # exactly, and the whole syntax of that field is different; so
10382 # the syntax is changed as well as the types to their later
10383 # terminology. Otherwise normalize.pm would be very unhappy
10384 # 5) Many ccc classes are different. These are left intact.
10385 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10386 # fields. These are unchanged because it doesn't really cause
10387 # problems for Perl.
10388 # 7) A number of code points, such as controls, don't have their
10389 # Unicode Version 1 Names in this file. These are unchanged.
10390
10391 my @corrected_lines = split /\n/, <<'END';
103924E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
103939FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10394E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10395F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10396F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10397FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10398END
10399
10400 my $file = shift;
10401 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10402
10403 #local $to_trace = 1 if main::DEBUG;
10404 trace $_ if main::DEBUG && $to_trace;
10405
10406 # -1 => retain trailing null fields
10407 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10408
10409 # At the first place that is wrong in the input, insert all the
10410 # corrections, replacing the wrong line.
10411 if ($code_point eq '4E00') {
10412 my @copy = @corrected_lines;
10413 $_ = shift @copy;
10414 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10415
10416 $file->insert_lines(@copy);
10417 }
10418
10419
10420 if ($fields[$NUMERIC] eq '-') {
10421 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10422 }
10423
10424 if ($fields[$PERL_DECOMPOSITION] ne "") {
10425
10426 # Several entries have this change to superscript 2 or 3 in the
10427 # middle. Convert these to the modern version, which is to use
10428 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10429 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10430 # 'HHHH HHHH 00B3 HHHH'.
10431 # It turns out that all of these that don't have another
10432 # decomposition defined at the beginning of the line have the
10433 # <square> decomposition in later releases.
10434 if ($code_point ne '00B2' && $code_point ne '00B3') {
10435 if ($fields[$PERL_DECOMPOSITION]
10436 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10437 {
10438 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10439 $fields[$PERL_DECOMPOSITION] = '<square> '
10440 . $fields[$PERL_DECOMPOSITION];
10441 }
10442 }
10443 }
10444
10445 # If is like '<+circled> 0052 <-circled>', convert to
10446 # '<circled> 0052'
10447 $fields[$PERL_DECOMPOSITION] =~
10448 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10449
10450 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10451 $fields[$PERL_DECOMPOSITION] =~
10452 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10453 or $fields[$PERL_DECOMPOSITION] =~
10454 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10455 or $fields[$PERL_DECOMPOSITION] =~
10456 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10457 or $fields[$PERL_DECOMPOSITION] =~
10458 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10459
10460 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10461 $fields[$PERL_DECOMPOSITION] =~
10462 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10463
10464 # Change names to modern form.
10465 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10466 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10467 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10468 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10469
10470 # One entry has weird braces
10471 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10472 }
10473
10474 $_ = join ';', $code_point, @fields;
10475 trace $_ if main::DEBUG && $to_trace;
10476 return;
10477 }
10478
10479 sub filter_v2_1_5_ucd {
10480 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10481 # columns swapped; These all had mirrored be 'N'. So if the numeric
10482 # column appears to be N, swap it back.
10483
10484 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10485 if ($fields[$NUMERIC] eq 'N') {
10486 $fields[$NUMERIC] = $fields[$MIRRORED];
10487 $fields[$MIRRORED] = 'N';
10488 $_ = join ';', $code_point, @fields;
10489 }
10490 return;
10491 }
3ffed8c2
KW
10492
10493 sub filter_v6_ucd {
10494
c12f2655
KW
10495 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10496 # accepted that yet to allow for some deprecation cycles.
3ffed8c2 10497
484741e1 10498 return if $_ !~ /^(?:0007|1F514|070F);/;
3ffed8c2
KW
10499
10500 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10501 if ($code_point eq '0007') {
dcd72625 10502 $fields[$CHARNAME] = "";
3ffed8c2 10503 }
484741e1
KW
10504 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10505 # http://www.unicode.org/versions/corrigendum8.html
10506 $fields[$BIDI] = "AL";
10507 }
10914c78 10508 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
3ffed8c2
KW
10509 $fields[$CHARNAME] = "";
10510 }
10511
10512 $_ = join ';', $code_point, @fields;
10513
10514 return;
10515 }
99870f4d
KW
10516} # End closure for UnicodeData
10517
37e2e78e
KW
10518sub process_GCB_test {
10519
10520 my $file = shift;
10521 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10522
10523 while ($file->next_line) {
10524 push @backslash_X_tests, $_;
10525 }
678f13d5 10526
37e2e78e
KW
10527 return;
10528}
10529
99870f4d
KW
10530sub process_NamedSequences {
10531 # NamedSequences.txt entries are just added to an array. Because these
10532 # don't look like the other tables, they have their own handler.
10533 # An example:
10534 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10535 #
10536 # This just adds the sequence to an array for later handling
10537
99870f4d
KW
10538 my $file = shift;
10539 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10540
10541 while ($file->next_line) {
10542 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10543 if (@remainder) {
10544 $file->carp_bad_line(
10545 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10546 next;
10547 }
fb121860
KW
10548
10549 # Note single \t in keeping with special output format of
10550 # Perl_charnames. But it turns out that the code points don't have to
10551 # be 5 digits long, like the rest, based on the internal workings of
10552 # charnames.pm. This could be easily changed for consistency.
10553 push @named_sequences, "$sequence\t$name";
99870f4d
KW
10554 }
10555 return;
10556}
10557
10558{ # Closure
10559
10560 my $first_range;
10561
10562 sub filter_early_ea_lb {
10563 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10564 # third field be the name of the code point, which can be ignored in
10565 # most cases. But it can be meaningful if it marks a range:
10566 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10567 # 3400;W;<CJK Ideograph Extension A, First>
10568 #
10569 # We need to see the First in the example above to know it's a range.
10570 # They did not use the later range syntaxes. This routine changes it
10571 # to use the modern syntax.
10572 # $1 is the Input_file object.
10573
10574 my @fields = split /\s*;\s*/;
10575 if ($fields[2] =~ /^<.*, First>/) {
10576 $first_range = $fields[0];
10577 $_ = "";
10578 }
10579 elsif ($fields[2] =~ /^<.*, Last>/) {
10580 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10581 }
10582 else {
10583 undef $first_range;
10584 $_ = "$fields[0]; $fields[1]";
10585 }
10586
10587 return;
10588 }
10589}
10590
10591sub filter_old_style_arabic_shaping {
10592 # Early versions used a different term for the later one.
10593
10594 my @fields = split /\s*;\s*/;
10595 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10596 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10597 $_ = join ';', @fields;
10598 return;
10599}
10600
10601sub filter_arabic_shaping_line {
10602 # ArabicShaping.txt has entries that look like:
10603 # 062A; TEH; D; BEH
10604 # The field containing 'TEH' is not used. The next field is Joining_Type
10605 # and the last is Joining_Group
10606 # This generates two lines to pass on, one for each property on the input
10607 # line.
10608
10609 my $file = shift;
10610 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10611
10612 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10613
10614 if (@fields > 4) {
10615 $file->carp_bad_line('Extra fields');
10616 $_ = "";
10617 return;
10618 }
10619
10620 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10621 $_ = "$fields[0]; Joining_Type; $fields[2]";
10622
10623 return;
10624}
10625
d3fed3dd
KW
10626{ # Closure
10627 my $lc; # Table for lowercase mapping
10628 my $tc;
10629 my $uc;
10630
6c0259ad
KW
10631 sub setup_special_casing {
10632 # SpecialCasing.txt contains the non-simple case change mappings. The
10633 # simple ones are in UnicodeData.txt, which should already have been
10634 # read in to the full property data structures, so as to initialize
10635 # these with the simple ones. Then the SpecialCasing.txt entries
10636 # overwrite the ones which have different full mappings.
10637
10638 # This routine sees if the simple mappings are to be output, and if
10639 # so, copies what has already been put into the full mapping tables,
10640 # while they still contain only the simple mappings.
10641
10642 # The reason it is done this way is that the simple mappings are
10643 # probably not going to be output, so it saves work to initialize the
10644 # full tables with the simple mappings, and then overwrite those
10645 # relatively few entries in them that have different full mappings,
10646 # and thus skip the simple mapping tables altogether.
10647
c12f2655
KW
10648 # New tables with just the simple mappings that are overridden by the
10649 # full ones are constructed. These are for Unicode::UCD, which
10650 # requires the simple mappings. The Case_Folding table is a combined
10651 # table of both the simple and full mappings, with the full ones being
10652 # in the hash, and the simple ones, even those overridden by the hash,
10653 # being in the base table. That same mechanism could have been
10654 # employed here, except that the docs have said that the generated
10655 # files are usuable directly by programs, so we dare not change the
10656 # format in any way.
10657
6c0259ad
KW
10658 my $file= shift;
10659 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10660
6c0259ad
KW
10661 $lc = property_ref('lc');
10662 $tc = property_ref('tc');
10663 $uc = property_ref('uc');
10664
10665 # For each of the case change mappings...
10666 foreach my $case_table ($lc, $tc, $uc) {
10667 my $case = $case_table->name;
10668 my $full = property_ref($case);
10669 unless (defined $full && ! $full->is_empty) {
10670 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10671 }
10672
10673 # The simple version's name in each mapping merely has an 's' in
10674 # front of the full one's
301ba948
KW
10675 my $simple_name = 's' . $case;
10676 my $simple = property_ref($simple_name);
6c0259ad
KW
10677 $simple->initialize($full) if $simple->to_output_map();
10678
10679 my $simple_only = Property->new("_s$case",
10680 Type => $STRING,
10681 Default_Map => $CODE_POINT,
10682 Perl_Extension => 1,
301ba948 10683 Fate => $INTERNAL_ONLY,
9c27f500 10684 Description => "This contains the simple mappings for $case for just the code points that have different full mappings");
6c0259ad
KW
10685 $simple_only->set_to_output_map($INTERNAL_MAP);
10686 $simple_only->add_comment(join_lines( <<END
d3fed3dd
KW
10687This file is for UCD.pm so that it can construct simple mappings that would
10688otherwise be lost because they are overridden by full mappings.
10689END
6c0259ad 10690 ));
5be997b0
KW
10691
10692 unless ($simple->to_output_map()) {
10693 $simple_only->set_proxy_for($simple_name);
10694 }
6c0259ad 10695 }
99870f4d 10696
6c0259ad
KW
10697 return;
10698 }
99870f4d 10699
6c0259ad
KW
10700 sub filter_special_casing_line {
10701 # Change the format of $_ from SpecialCasing.txt into something that
10702 # the generic handler understands. Each input line contains three
10703 # case mappings. This will generate three lines to pass to the
10704 # generic handler for each of those.
99870f4d 10705
6c0259ad
KW
10706 # The input syntax (after stripping comments and trailing white space
10707 # is like one of the following (with the final two being entries that
10708 # we ignore):
10709 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10710 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10711 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10712 # Note the trailing semi-colon, unlike many of the input files. That
10713 # means that there will be an extra null field generated by the split
99870f4d 10714
6c0259ad
KW
10715 my $file = shift;
10716 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10717
6c0259ad
KW
10718 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10719 # fields
10720
10721 # field #4 is when this mapping is conditional. If any of these get
10722 # implemented, it would be by hard-coding in the casing functions in
10723 # the Perl core, not through tables. But if there is a new condition
10724 # we don't know about, output a warning. We know about all the
10725 # conditions through 6.0
10726 if ($fields[4] ne "") {
10727 my @conditions = split ' ', $fields[4];
10728 if ($conditions[0] ne 'tr' # We know that these languages have
10729 # conditions, and some are multiple
10730 && $conditions[0] ne 'az'
10731 && $conditions[0] ne 'lt'
10732
10733 # And, we know about a single condition Final_Sigma, but
10734 # nothing else.
10735 && ($v_version gt v5.2.0
10736 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10737 {
10738 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10739 }
10740 elsif ($conditions[0] ne 'Final_Sigma') {
99870f4d 10741
6c0259ad
KW
10742 # Don't print out a message for Final_Sigma, because we
10743 # have hard-coded handling for it. (But the standard
10744 # could change what the rule should be, but it wouldn't
10745 # show up here anyway.
99870f4d 10746
6c0259ad 10747 print "# SKIPPING Special Casing: $_\n"
99870f4d 10748 if $verbosity >= $VERBOSE;
6c0259ad
KW
10749 }
10750 $_ = "";
10751 return;
10752 }
10753 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10754 $file->carp_bad_line('Extra fields');
10755 $_ = "";
10756 return;
99870f4d 10757 }
99870f4d 10758
6c0259ad
KW
10759 $_ = "$fields[0]; lc; $fields[1]";
10760 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10761 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
99870f4d 10762
6c0259ad
KW
10763 # Copy any simple case change to the special tables constructed if
10764 # being overridden by a multi-character case change.
10765 if ($fields[1] ne $fields[0]
10766 && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10767 {
10768 $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10769 }
10770 if ($fields[2] ne $fields[0]
10771 && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10772 {
10773 $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10774 }
10775 if ($fields[3] ne $fields[0]
10776 && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10777 {
10778 $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10779 }
d3fed3dd 10780
6c0259ad
KW
10781 return;
10782 }
d3fed3dd 10783}
99870f4d
KW
10784
10785sub filter_old_style_case_folding {
10786 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 10787 # and later style. Different letters were used in the earlier.
99870f4d
KW
10788
10789 my $file = shift;
10790 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10791
10792 my @fields = split /\s*;\s*/;
10793 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10794 $fields[1] = 'I';
10795 }
10796 elsif ($fields[1] eq 'L') {
10797 $fields[1] = 'C'; # L => C always
10798 }
10799 elsif ($fields[1] eq 'E') {
10800 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10801 $fields[1] = 'F'
10802 }
10803 else {
10804 $fields[1] = 'C'
10805 }
10806 }
10807 else {
10808 $file->carp_bad_line("Expecting L or E in second field");
10809 $_ = "";
10810 return;
10811 }
10812 $_ = join("; ", @fields) . ';';
10813 return;
10814}
10815
10816{ # Closure for case folding
10817
10818 # Create the map for simple only if are going to output it, for otherwise
10819 # it takes no part in anything we do.
10820 my $to_output_simple;
10821
99870f4d
KW
10822 sub setup_case_folding($) {
10823 # Read in the case foldings in CaseFolding.txt. This handles both
10824 # simple and full case folding.
10825
10826 $to_output_simple
10827 = property_ref('Simple_Case_Folding')->to_output_map;
10828
5be997b0
KW
10829 if (! $to_output_simple) {
10830 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
10831 }
10832
6f2a3287
KW
10833 # If we ever wanted to show that these tables were combined, a new
10834 # property method could be created, like set_combined_props()
10835 property_ref('Case_Folding')->add_comment(join_lines( <<END
10836This file includes both the simple and full case folding maps. The simple
10837ones are in the main body of the table below, and the full ones adding to or
10838overriding them are in the hash.
10839END
10840 ));
99870f4d
KW
10841 return;
10842 }
10843
10844 sub filter_case_folding_line {
10845 # Called for each line in CaseFolding.txt
10846 # Input lines look like:
10847 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10848 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10849 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10850 #
10851 # 'C' means that folding is the same for both simple and full
10852 # 'F' that it is only for full folding
10853 # 'S' that it is only for simple folding
10854 # 'T' is locale-dependent, and ignored
10855 # 'I' is a type of 'F' used in some early releases.
10856 # Note the trailing semi-colon, unlike many of the input files. That
10857 # means that there will be an extra null field generated by the split
10858 # below, which we ignore and hence is not an error.
10859
10860 my $file = shift;
10861 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10862
10863 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10864 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10865 $file->carp_bad_line('Extra fields');
10866 $_ = "";
10867 return;
10868 }
10869
10870 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10871 $_ = "";
10872 return;
10873 }
10874
10875 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
3c099872
KW
10876 # I are all full foldings; S is single-char. For S, there is always
10877 # an F entry, so we must allow multiple values for the same code
10878 # point. Fortunately this table doesn't need further manipulation
10879 # which would preclude using multiple-values. The S is now included
10880 # so that _swash_inversion_hash() is able to construct closures
10881 # without having to worry about F mappings.
10882 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10883 $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
99870f4d
KW
10884 }
10885 else {
10886 $_ = "";
3c099872 10887 $file->carp_bad_line('Expecting C F I S or T in second field');
99870f4d
KW
10888 }
10889
10890 # C and S are simple foldings, but simple case folding is not needed
10891 # unless we explicitly want its map table output.
10892 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10893 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10894 }
10895
99870f4d
KW
10896 return;
10897 }
10898
99870f4d
KW
10899} # End case fold closure
10900
10901sub filter_jamo_line {
10902 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10903 # from this file that is used in generating the Name property for Jamo
10904 # code points. But, it also is used to convert early versions' syntax
10905 # into the modern form. Here are two examples:
10906 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10907 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10908 #
10909 # The input is $_, the output is $_ filtered.
10910
10911 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10912
10913 # Let the caller handle unexpected input. In earlier versions, there was
10914 # a third field which is supposed to be a comment, but did not have a '#'
10915 # before it.
10916 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10917
10918 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10919 # beginning.
10920
10921 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10922 $fields[1] = 'R' if $fields[0] eq '1105';
10923
10924 # Add to structure so can generate Names from it.
10925 my $cp = hex $fields[0];
10926 my $short_name = $fields[1];
10927 $Jamo{$cp} = $short_name;
10928 if ($cp <= $LBase + $LCount) {
10929 $Jamo_L{$short_name} = $cp - $LBase;
10930 }
10931 elsif ($cp <= $VBase + $VCount) {
10932 $Jamo_V{$short_name} = $cp - $VBase;
10933 }
10934 elsif ($cp <= $TBase + $TCount) {
10935 $Jamo_T{$short_name} = $cp - $TBase;
10936 }
10937 else {
10938 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10939 }
10940
10941
10942 # Reassemble using just the first two fields to look like a typical
10943 # property file line
10944 $_ = "$fields[0]; $fields[1]";
10945
10946 return;
10947}
10948
99870f4d
KW
10949sub register_fraction($) {
10950 # This registers the input rational number so that it can be passed on to
10951 # utf8_heavy.pl, both in rational and floating forms.
10952
10953 my $rational = shift;
10954
10955 my $float = eval $rational;
10956 $nv_floating_to_rational{$float} = $rational;
10957 return;
10958}
10959
10960sub filter_numeric_value_line {
10961 # DNumValues contains lines of a different syntax than the typical
10962 # property file:
10963 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10964 #
10965 # This routine transforms $_ containing the anomalous syntax to the
10966 # typical, by filtering out the extra columns, and convert early version
10967 # decimal numbers to strings that look like rational numbers.
10968
10969 my $file = shift;
10970 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10971
10972 # Starting in 5.1, there is a rational field. Just use that, omitting the
10973 # extra columns. Otherwise convert the decimal number in the second field
10974 # to a rational, and omit extraneous columns.
10975 my @fields = split /\s*;\s*/, $_, -1;
10976 my $rational;
10977
10978 if ($v_version ge v5.1.0) {
10979 if (@fields != 4) {
10980 $file->carp_bad_line('Not 4 semi-colon separated fields');
10981 $_ = "";
10982 return;
10983 }
10984 $rational = $fields[3];
10985 $_ = join '; ', @fields[ 0, 3 ];
10986 }
10987 else {
10988
10989 # Here, is an older Unicode file, which has decimal numbers instead of
10990 # rationals in it. Use the fraction to calculate the denominator and
10991 # convert to rational.
10992
10993 if (@fields != 2 && @fields != 3) {
10994 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10995 $_ = "";
10996 return;
10997 }
10998
10999 my $codepoints = $fields[0];
11000 my $decimal = $fields[1];
11001 if ($decimal =~ s/\.0+$//) {
11002
11003 # Anything ending with a decimal followed by nothing but 0's is an
11004 # integer
11005 $_ = "$codepoints; $decimal";
11006 $rational = $decimal;
11007 }
11008 else {
11009
11010 my $denominator;
11011 if ($decimal =~ /\.50*$/) {
11012 $denominator = 2;
11013 }
11014
11015 # Here have the hardcoded repeating decimals in the fraction, and
11016 # the denominator they imply. There were only a few denominators
11017 # in the older Unicode versions of this file which this code
11018 # handles, so it is easy to convert them.
11019
11020 # The 4 is because of a round-off error in the Unicode 3.2 files
11021 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11022 $denominator = 3;
11023 }
11024 elsif ($decimal =~ /\.[27]50*$/) {
11025 $denominator = 4;
11026 }
11027 elsif ($decimal =~ /\.[2468]0*$/) {
11028 $denominator = 5;
11029 }
11030 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11031 $denominator = 6;
11032 }
11033 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11034 $denominator = 8;
11035 }
11036 if ($denominator) {
11037 my $sign = ($decimal < 0) ? "-" : "";
11038 my $numerator = int((abs($decimal) * $denominator) + .5);
11039 $rational = "$sign$numerator/$denominator";
11040 $_ = "$codepoints; $rational";
11041 }
11042 else {
11043 $file->carp_bad_line("Can't cope with number '$decimal'.");
11044 $_ = "";
11045 return;
11046 }
11047 }
11048 }
11049
11050 register_fraction($rational) if $rational =~ qr{/};
11051 return;
11052}
11053
11054{ # Closure
11055 my %unihan_properties;
99870f4d
KW
11056
11057 sub setup_unihan {
11058 # Do any special setup for Unihan properties.
11059
11060 # This property gives the wrong computed type, so override.
11061 my $usource = property_ref('kIRG_USource');
11062 $usource->set_type($STRING) if defined $usource;
11063
b2abbe5b
KW
11064 # This property is to be considered binary (it says so in
11065 # http://www.unicode.org/reports/tr38/)
46b2142f 11066 my $iicore = property_ref('kIICore');
99870f4d 11067 if (defined $iicore) {
46b2142f
KW
11068 $iicore->set_type($FORCED_BINARY);
11069 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11070
11071 # Unicode doesn't include the maps for this property, so don't
11072 # warn that they are missing.
11073 $iicore->set_pre_declared_maps(0);
11074 $iicore->add_comment(join_lines( <<END
11075This property contains enum values, but Unicode UAX #38 says it should be
11076interpreted as binary, so Perl creates tables for both 1) its enum values,
11077plus 2) true/false tables in which it is considered true for all code points
11078that have a non-null value
11079END
11080 ));
99870f4d
KW
11081 }
11082
11083 return;
11084 }
11085
11086 sub filter_unihan_line {
11087 # Change unihan db lines to look like the others in the db. Here is
11088 # an input sample:
11089 # U+341C kCangjie IEKN
11090
11091 # Tabs are used instead of semi-colons to separate fields; therefore
11092 # they may have semi-colons embedded in them. Change these to periods
11093 # so won't screw up the rest of the code.
11094 s/;/./g;
11095
11096 # Remove lines that don't look like ones we accept.
11097 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11098 $_ = "";
11099 return;
11100 }
11101
11102 # Extract the property, and save a reference to its object.
11103 my $property = $1;
11104 if (! exists $unihan_properties{$property}) {
11105 $unihan_properties{$property} = property_ref($property);
11106 }
11107
11108 # Don't do anything unless the property is one we're handling, which
11109 # we determine by seeing if there is an object defined for it or not
11110 if (! defined $unihan_properties{$property}) {
11111 $_ = "";
11112 return;
11113 }
11114
99870f4d
KW
11115 # Convert the tab separators to our standard semi-colons, and convert
11116 # the U+HHHH notation to the rest of the standard's HHHH
11117 s/\t/;/g;
11118 s/\b U \+ (?= $code_point_re )//xg;
11119
11120 #local $to_trace = 1 if main::DEBUG;
11121 trace $_ if main::DEBUG && $to_trace;
11122
11123 return;
11124 }
11125}
11126
11127sub filter_blocks_lines {
11128 # In the Blocks.txt file, the names of the blocks don't quite match the
11129 # names given in PropertyValueAliases.txt, so this changes them so they
11130 # do match: Blanks and hyphens are changed into underscores. Also makes
11131 # early release versions look like later ones
11132 #
11133 # $_ is transformed to the correct value.
11134
11135 my $file = shift;
11136 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11137
11138 if ($v_version lt v3.2.0) {
11139 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11140 $_ = "";
11141 return;
11142 }
11143
11144 # Old versions used a different syntax to mark the range.
11145 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11146 }
11147
11148 my @fields = split /\s*;\s*/, $_, -1;
11149 if (@fields != 2) {
11150 $file->carp_bad_line("Expecting exactly two fields");
11151 $_ = "";
11152 return;
11153 }
11154
11155 # Change hyphens and blanks in the block name field only
11156 $fields[1] =~ s/[ -]/_/g;
11157 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11158
11159 $_ = join("; ", @fields);
11160 return;
11161}
11162
11163{ # Closure
11164 my $current_property;
11165
11166 sub filter_old_style_proplist {
11167 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11168 # was in a completely different syntax. Ken Whistler of Unicode says
11169 # that it was something he used as an aid for his own purposes, but
11170 # was never an official part of the standard. However, comments in
11171 # DAge.txt indicate that non-character code points were available in
11172 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11173 # there except through this file (but on the other hand, they first
11174 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11175 # not. But the claim is that it was published as an aid to others who
11176 # might want some more information than was given in the official UCD
11177 # of the time. Many of the properties in it were incorporated into
11178 # the later PropList.txt, but some were not. This program uses this
11179 # early file to generate property tables that are otherwise not
11180 # accessible in the early UCD's, and most were probably not really
11181 # official at that time, so one could argue that it should be ignored,
11182 # and you can easily modify things to skip this. And there are bugs
11183 # in this file in various versions. (For example, the 2.1.9 version
11184 # removes from Alphabetic the CJK range starting at 4E00, and they
11185 # weren't added back in until 3.1.0.) Many of this file's properties
11186 # were later sanctioned, so this code generates tables for those
11187 # properties that aren't otherwise in the UCD of the time but
11188 # eventually did become official, and throws away the rest. Here is a
11189 # list of all the ones that are thrown away:
11190 # Bidi=* duplicates UnicodeData.txt
11191 # Combining never made into official property;
11192 # is \P{ccc=0}
11193 # Composite never made into official property.
11194 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11195 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11196 # Delimiter never made into official property;
11197 # removed in 3.0.1
11198 # Format Control never made into official property;
11199 # similar to gc=cf
11200 # High Surrogate duplicates Blocks.txt
11201 # Ignorable Control never made into official property;
11202 # similar to di=y
11203 # ISO Control duplicates UnicodeData.txt: gc=cc
11204 # Left of Pair never made into official property;
11205 # Line Separator duplicates UnicodeData.txt: gc=zl
11206 # Low Surrogate duplicates Blocks.txt
11207 # Non-break was actually listed as a property
11208 # in 3.2, but without any code
11209 # points. Unicode denies that this
11210 # was ever an official property
11211 # Non-spacing duplicate UnicodeData.txt: gc=mn
11212 # Numeric duplicates UnicodeData.txt: gc=cc
11213 # Paired Punctuation never made into official property;
11214 # appears to be gc=ps + gc=pe
11215 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11216 # Private Use duplicates UnicodeData.txt: gc=co
11217 # Private Use High Surrogate duplicates Blocks.txt
11218 # Punctuation duplicates UnicodeData.txt: gc=p
11219 # Space different definition than eventual
11220 # one.
11221 # Titlecase duplicates UnicodeData.txt: gc=lt
11222 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
98dc9551 11223 # Zero-width never made into official property;
99870f4d
KW
11224 # subset of gc=cf
11225 # Most of the properties have the same names in this file as in later
11226 # versions, but a couple do not.
11227 #
11228 # This subroutine filters $_, converting it from the old style into
11229 # the new style. Here's a sample of the old-style
11230 #
11231 # *******************************************
11232 #
11233 # Property dump for: 0x100000A0 (Join Control)
11234 #
11235 # 200C..200D (2 chars)
11236 #
11237 # In the example, the property is "Join Control". It is kept in this
11238 # closure between calls to the subroutine. The numbers beginning with
11239 # 0x were internal to Ken's program that generated this file.
11240
11241 # If this line contains the property name, extract it.
11242 if (/^Property dump for: [^(]*\((.*)\)/) {
11243 $_ = $1;
11244
11245 # Convert white space to underscores.
11246 s/ /_/g;
11247
11248 # Convert the few properties that don't have the same name as
11249 # their modern counterparts
11250 s/Identifier_Part/ID_Continue/
11251 or s/Not_a_Character/NChar/;
11252
11253 # If the name matches an existing property, use it.
11254 if (defined property_ref($_)) {
11255 trace "new property=", $_ if main::DEBUG && $to_trace;
11256 $current_property = $_;
11257 }
11258 else { # Otherwise discard it
11259 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11260 undef $current_property;
11261 }
11262 $_ = ""; # The property is saved for the next lines of the
11263 # file, but this defining line is of no further use,
11264 # so clear it so that the caller won't process it
11265 # further.
11266 }
11267 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11268
11269 # Here, the input line isn't a header defining a property for the
11270 # following section, and either we aren't in such a section, or
11271 # the line doesn't look like one that defines the code points in
11272 # such a section. Ignore this line.
11273 $_ = "";
11274 }
11275 else {
11276
11277 # Here, we have a line defining the code points for the current
11278 # stashed property. Anything starting with the first blank is
11279 # extraneous. Otherwise, it should look like a normal range to
11280 # the caller. Append the property name so that it looks just like
11281 # a modern PropList entry.
11282
11283 $_ =~ s/\s.*//;
11284 $_ .= "; $current_property";
11285 }
11286 trace $_ if main::DEBUG && $to_trace;
11287 return;
11288 }
11289} # End closure for old style proplist
11290
11291sub filter_old_style_normalization_lines {
11292 # For early releases of Unicode, the lines were like:
11293 # 74..2A76 ; NFKD_NO
11294 # For later releases this became:
11295 # 74..2A76 ; NFKD_QC; N
11296 # Filter $_ to look like those in later releases.
11297 # Similarly for MAYBEs
11298
11299 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11300
11301 # Also, the property FC_NFKC was abbreviated to FNC
11302 s/FNC/FC_NFKC/;
11303 return;
11304}
11305
82aed44a
KW
11306sub setup_script_extensions {
11307 # The Script_Extensions property starts out with a clone of the Script
11308 # property.
11309
11310 my $sc = property_ref("Script");
11311 my $scx = Property->new("scx", Full_Name => "Script_Extensions",
11312 Initialize => $sc,
11313 Default_Map => $sc->default_map,
11314 Pre_Declared_Maps => 0,
c3ff2976 11315 Format => $STRING_WHITE_SPACE_LIST,
82aed44a
KW
11316 );
11317 $scx->add_comment(join_lines( <<END
11318The values for code points that appear in one script are just the same as for
11319the 'Script' property. Likewise the values for those that appear in many
11320scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11321values of code points that appear in a few scripts are a space separated list
11322of those scripts.
11323END
11324 ));
11325
11326 # Make the scx's tables and aliases for them the same as sc's
11327 foreach my $table ($sc->tables) {
11328 my $scx_table = $scx->add_match_table($table->name,
11329 Full_Name => $table->full_name);
11330 foreach my $alias ($table->aliases) {
11331 $scx_table->add_alias($alias->name);
11332 }
11333 }
11334}
11335
fbe1e607
KW
11336sub filter_script_extensions_line {
11337 # The Scripts file comes with the full name for the scripts; the
11338 # ScriptExtensions, with the short name. The final mapping file is a
11339 # combination of these, and without adjustment, would have inconsistent
11340 # entries. This filters the latter file to convert to full names.
11341 # Entries look like this:
11342 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11343
11344 my @fields = split /\s*;\s*/;
11345 my @full_names;
11346 foreach my $short_name (split " ", $fields[1]) {
11347 push @full_names, $script->table($short_name)->full_name;
11348 }
11349 $fields[1] = join " ", @full_names;
11350 $_ = join "; ", @fields;
11351
11352 return;
11353}
11354
dcd72625
KW
11355sub setup_v6_name_alias {
11356 property_ref('Name_Alias')->add_map(7, 7, "ALERT");
11357}
11358
99870f4d
KW
11359sub finish_Unicode() {
11360 # This routine should be called after all the Unicode files have been read
11361 # in. It:
11362 # 1) Adds the mappings for code points missing from the files which have
11363 # defaults specified for them.
11364 # 2) At this this point all mappings are known, so it computes the type of
11365 # each property whose type hasn't been determined yet.
11366 # 3) Calculates all the regular expression match tables based on the
11367 # mappings.
11368 # 3) Calculates and adds the tables which are defined by Unicode, but
11369 # which aren't derived by them
11370
11371 # For each property, fill in any missing mappings, and calculate the re
11372 # match tables. If a property has more than one missing mapping, the
11373 # default is a reference to a data structure, and requires data from other
11374 # properties to resolve. The sort is used to cause these to be processed
11375 # last, after all the other properties have been calculated.
11376 # (Fortunately, the missing properties so far don't depend on each other.)
11377 foreach my $property
11378 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11379 property_ref('*'))
11380 {
11381 # $perl has been defined, but isn't one of the Unicode properties that
11382 # need to be finished up.
11383 next if $property == $perl;
11384
9f877a47
KW
11385 # Nor do we need to do anything with properties that aren't going to
11386 # be output.
11387 next if $property->fate == $SUPPRESSED;
11388
99870f4d
KW
11389 # Handle the properties that have more than one possible default
11390 if (ref $property->default_map) {
11391 my $default_map = $property->default_map;
11392
11393 # These properties have stored in the default_map:
11394 # One or more of:
11395 # 1) A default map which applies to all code points in a
11396 # certain class
11397 # 2) an expression which will evaluate to the list of code
11398 # points in that class
11399 # And
11400 # 3) the default map which applies to every other missing code
11401 # point.
11402 #
11403 # Go through each list.
11404 while (my ($default, $eval) = $default_map->get_next_defaults) {
11405
11406 # Get the class list, and intersect it with all the so-far
11407 # unspecified code points yielding all the code points
11408 # in the class that haven't been specified.
11409 my $list = eval $eval;
11410 if ($@) {
11411 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11412 last;
11413 }
11414
11415 # Narrow down the list to just those code points we don't have
11416 # maps for yet.
11417 $list = $list & $property->inverse_list;
11418
11419 # Add mappings to the property for each code point in the list
11420 foreach my $range ($list->ranges) {
56343c78
KW
11421 $property->add_map($range->start, $range->end, $default,
11422 Replace => $CROAK);
99870f4d
KW
11423 }
11424 }
11425
11426 # All remaining code points have the other mapping. Set that up
11427 # so the normal single-default mapping code will work on them
11428 $property->set_default_map($default_map->other_default);
11429
11430 # And fall through to do that
11431 }
11432
11433 # We should have enough data now to compute the type of the property.
11434 $property->compute_type;
11435 my $property_type = $property->type;
11436
11437 next if ! $property->to_create_match_tables;
11438
11439 # Here want to create match tables for this property
11440
11441 # The Unicode db always (so far, and they claim into the future) have
11442 # the default for missing entries in binary properties be 'N' (unless
11443 # there is a '@missing' line that specifies otherwise)
11444 if ($property_type == $BINARY && ! defined $property->default_map) {
11445 $property->set_default_map('N');
11446 }
11447
11448 # Add any remaining code points to the mapping, using the default for
5d7f7709 11449 # missing code points.
d8fb1cc3 11450 my $default_table;
99870f4d 11451 if (defined (my $default_map = $property->default_map)) {
1520492f 11452
f4c2a127 11453 # Make sure there is a match table for the default
f4c2a127
KW
11454 if (! defined ($default_table = $property->table($default_map))) {
11455 $default_table = $property->add_match_table($default_map);
11456 }
11457
a92d5c2e
KW
11458 # And, if the property is binary, the default table will just
11459 # be the complement of the other table.
11460 if ($property_type == $BINARY) {
11461 my $non_default_table;
11462
11463 # Find the non-default table.
11464 for my $table ($property->tables) {
11465 next if $table == $default_table;
11466 $non_default_table = $table;
11467 }
11468 $default_table->set_complement($non_default_table);
11469 }
862fd107 11470 else {
a92d5c2e 11471
3981d009
KW
11472 # This fills in any missing values with the default. It's not
11473 # necessary to do this with binary properties, as the default
11474 # is defined completely in terms of the Y table.
6189eadc 11475 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
3981d009 11476 $default_map, Replace => $NO);
862fd107 11477 }
99870f4d
KW
11478 }
11479
11480 # Have all we need to populate the match tables.
11481 my $property_name = $property->name;
56557540 11482 my $maps_should_be_defined = $property->pre_declared_maps;
99870f4d
KW
11483 foreach my $range ($property->ranges) {
11484 my $map = $range->value;
f5e9a6ca 11485 my $table = $property->table($map);
99870f4d
KW
11486 if (! defined $table) {
11487
11488 # Integral and rational property values are not necessarily
56557540
KW
11489 # defined in PropValueAliases, but whether all the other ones
11490 # should be depends on the property.
11491 if ($maps_should_be_defined
99870f4d
KW
11492 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11493 {
11494 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
11495 }
f5e9a6ca 11496 $table = $property->add_match_table($map);
99870f4d
KW
11497 }
11498
862fd107 11499 next if $table->complement != 0; # Don't need to populate these
99870f4d
KW
11500 $table->add_range($range->start, $range->end);
11501 }
11502
06f26c45
KW
11503 # A forced binary property has additional true/false tables which
11504 # should have been set up when it was forced into binary. The false
11505 # table matches exactly the same set as the property's default table.
11506 # The true table matches the complement of that. The false table is
11507 # not the same as an additional set of aliases on top of the default
11508 # table, so use 'set_equivalent_to'. If it were implemented as
11509 # additional aliases, various things would have to be adjusted, but
11510 # especially, if the user wants to get a list of names for the table
11511 # using Unicode::UCD::prop_value_aliases(), s/he should get a
11512 # different set depending on whether they want the default table or
11513 # the false table.
11514 if ($property_type == $FORCED_BINARY) {
11515 $property->table('N')->set_equivalent_to($default_table,
11516 Related => 1);
11517 $property->table('Y')->set_complement($default_table);
11518 }
11519
807807b7
KW
11520 # For Perl 5.6 compatibility, all properties matchable in regexes can
11521 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
11522 # But warn if this creates a conflict with a (new) Unicode property
11523 # name, although it appears that Unicode has made a decision never to
11524 # begin a property name with 'Is_', so this shouldn't happen.
99870f4d
KW
11525 foreach my $alias ($property->aliases) {
11526 my $Is_name = 'Is_' . $alias->name;
807807b7 11527 if (defined (my $pre_existing = property_ref($Is_name))) {
99870f4d 11528 Carp::my_carp(<<END
807807b7
KW
11529There is already an alias named $Is_name (from " . $pre_existing . "), so
11530creating one for $property won't work. This is bad news. If it is not too
11531late, get Unicode to back off. Otherwise go back to the old scheme (findable
11532from the git blame log for this area of the code that suppressed individual
11533aliases that conflict with the new Unicode names. Proceeding anyway.
99870f4d
KW
11534END
11535 );
99870f4d
KW
11536 }
11537 } # End of loop through aliases for this property
11538 } # End of loop through all Unicode properties.
11539
11540 # Fill in the mappings that Unicode doesn't completely furnish. First the
11541 # single letter major general categories. If Unicode were to start
11542 # delivering the values, this would be redundant, but better that than to
11543 # try to figure out if should skip and not get it right. Ths could happen
11544 # if a new major category were to be introduced, and the hard-coded test
11545 # wouldn't know about it.
11546 # This routine depends on the standard names for the general categories
11547 # being what it thinks they are, like 'Cn'. The major categories are the
11548 # union of all the general category tables which have the same first
11549 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11550 foreach my $minor_table ($gc->tables) {
11551 my $minor_name = $minor_table->name;
11552 next if length $minor_name == 1;
11553 if (length $minor_name != 2) {
11554 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11555 next;
11556 }
11557
11558 my $major_name = uc(substr($minor_name, 0, 1));
11559 my $major_table = $gc->table($major_name);
11560 $major_table += $minor_table;
11561 }
11562
11563 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11564 # defines it as LC)
11565 my $LC = $gc->table('LC');
11566 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11567 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11568
11569
11570 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11571 # deliver the correct values in it
11572 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11573
11574 # Lt not in release 1.
a5c376b7
KW
11575 if (defined $gc->table('Lt')) {
11576 $LC += $gc->table('Lt');
11577 $gc->table('Lt')->set_caseless_equivalent($LC);
11578 }
99870f4d
KW
11579 }
11580 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11581
a5c376b7
KW
11582 $gc->table('Ll')->set_caseless_equivalent($LC);
11583 $gc->table('Lu')->set_caseless_equivalent($LC);
11584
99870f4d 11585 my $Cs = $gc->table('Cs');
99870f4d
KW
11586
11587
11588 # Folding information was introduced later into Unicode data. To get
11589 # Perl's case ignore (/i) to work at all in releases that don't have
11590 # folding, use the best available alternative, which is lower casing.
11591 my $fold = property_ref('Simple_Case_Folding');
11592 if ($fold->is_empty) {
11593 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11594 $fold->add_note(join_lines(<<END
11595WARNING: This table uses lower case as a substitute for missing fold
11596information
11597END
11598 ));
11599 }
11600
11601 # Multiple-character mapping was introduced later into Unicode data. If
11602 # missing, use the single-characters maps as best available alternative
11603 foreach my $map (qw { Uppercase_Mapping
11604 Lowercase_Mapping
11605 Titlecase_Mapping
11606 Case_Folding
11607 } ) {
11608 my $full = property_ref($map);
11609 if ($full->is_empty) {
11610 my $simple = property_ref('Simple_' . $map);
11611 $full->initialize($simple);
11612 $full->add_comment($simple->comment) if ($simple->comment);
11613 $full->add_note(join_lines(<<END
11614WARNING: This table uses simple mapping (single-character only) as a
11615substitute for missing multiple-character information
11616END
11617 ));
11618 }
11619 }
82aed44a
KW
11620
11621 # The Script_Extensions property started out as a clone of the Script
11622 # property. But processing its data file caused some elements to be
11623 # replaced with different data. (These elements were for the Common and
11624 # Inherited properties.) This data is a qw() list of all the scripts that
11625 # the code points in the given range are in. An example line is:
11626 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
11627 #
11628 # The code above has created a new match table named "Arab Syrc Thaa"
11629 # which contains 060C. (The cloned table started out with this code point
11630 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
11631 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
11632 # match table. This is repeated for all these tables and ranges. The map
11633 # data is retained in the map table for reference, but the spurious match
11634 # tables are deleted.
11635
11636 my $scx = property_ref("Script_Extensions");
d53a7e7d 11637 if (defined $scx) {
c3a37f64
KW
11638 foreach my $table ($scx->tables) {
11639 next unless $table->name =~ /\s/; # All the new and only the new
11640 # tables have a space in their
11641 # names
11642 my @scripts = split /\s+/, $table->name;
11643 foreach my $script (@scripts) {
11644 my $script_table = $scx->table($script);
11645 $script_table += $table;
11646 }
11647 $scx->delete_match_table($table);
82aed44a 11648 }
d53a7e7d 11649 }
82aed44a
KW
11650
11651 return;
99870f4d
KW
11652}
11653
11654sub compile_perl() {
11655 # Create perl-defined tables. Almost all are part of the pseudo-property
11656 # named 'perl' internally to this program. Many of these are recommended
11657 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11658 # on those found there.
11659 # Almost all of these are equivalent to some Unicode property.
11660 # A number of these properties have equivalents restricted to the ASCII
11661 # range, with their names prefaced by 'Posix', to signify that these match
11662 # what the Posix standard says they should match. A couple are
11663 # effectively this, but the name doesn't have 'Posix' in it because there
cbc24f92
KW
11664 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
11665 # to the full Unicode range, by our guesses as to what is appropriate.
99870f4d
KW
11666
11667 # 'Any' is all code points. As an error check, instead of just setting it
11668 # to be that, construct it to be the union of all the major categories
7fc6cb55 11669 $Any = $perl->add_match_table('Any',
6189eadc 11670 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
99870f4d
KW
11671 Matches_All => 1);
11672
11673 foreach my $major_table ($gc->tables) {
11674
11675 # Major categories are the ones with single letter names.
11676 next if length($major_table->name) != 1;
11677
11678 $Any += $major_table;
11679 }
11680
6189eadc 11681 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
99870f4d
KW
11682 Carp::my_carp_bug("Generated highest code point ("
11683 . sprintf("%X", $Any->max)
6189eadc 11684 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
99870f4d
KW
11685 }
11686 if ($Any->range_count != 1 || $Any->min != 0) {
11687 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11688 }
11689
11690 $Any->add_alias('All');
11691
11692 # Assigned is the opposite of gc=unassigned
11693 my $Assigned = $perl->add_match_table('Assigned',
11694 Description => "All assigned code points",
11695 Initialize => ~ $gc->table('Unassigned'),
11696 );
11697
11698 # Our internal-only property should be treated as more than just a
8050d00f 11699 # synonym; grandfather it in to the pod.
b15a0a3b
KW
11700 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
11701 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
99870f4d
KW
11702 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11703 Related => 1);
11704
11705 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11706 if (defined $block) { # This is equivalent to the block if have it.
11707 my $Unicode_ASCII = $block->table('Basic_Latin');
11708 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11709 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11710 }
11711 }
11712
11713 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11714 # necessary
11715 if ($ASCII->is_empty) {
11716 $ASCII->initialize([ 0..127 ]);
11717 }
11718
99870f4d
KW
11719 # Get the best available case definitions. Early Unicode versions didn't
11720 # have Uppercase and Lowercase defined, so use the general category
11721 # instead for them.
11722 my $Lower = $perl->add_match_table('Lower');
11723 my $Unicode_Lower = property_ref('Lowercase');
11724 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11725 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
a5c376b7
KW
11726 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11727 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11728 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11729
99870f4d
KW
11730 }
11731 else {
11732 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11733 Related => 1);
11734 }
cbc24f92 11735 $Lower->add_alias('XPosixLower');
a5c376b7 11736 my $Posix_Lower = $perl->add_match_table("PosixLower",
ad5e8af1
KW
11737 Description => "[a-z]",
11738 Initialize => $Lower & $ASCII,
11739 );
99870f4d
KW
11740
11741 my $Upper = $perl->add_match_table('Upper');
11742 my $Unicode_Upper = property_ref('Uppercase');
11743 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11744 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
a5c376b7
KW
11745 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11746 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11747 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
99870f4d
KW
11748 }
11749 else {
11750 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11751 Related => 1);
11752 }
cbc24f92 11753 $Upper->add_alias('XPosixUpper');
a5c376b7 11754 my $Posix_Upper = $perl->add_match_table("PosixUpper",
ad5e8af1
KW
11755 Description => "[A-Z]",
11756 Initialize => $Upper & $ASCII,
11757 );
99870f4d
KW
11758
11759 # Earliest releases didn't have title case. Initialize it to empty if not
11760 # otherwise present
4364919a
KW
11761 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11762 Description => '(= \p{Gc=Lt})');
99870f4d 11763 my $lt = $gc->table('Lt');
a5c376b7
KW
11764
11765 # Earlier versions of mktables had this related to $lt since they have
c12f2655
KW
11766 # identical code points, but their caseless equivalents are not the same,
11767 # one being 'Cased' and the other being 'LC', and so now must be kept as
11768 # separate entities.
a5c376b7 11769 $Title += $lt if defined $lt;
99870f4d
KW
11770
11771 # If this Unicode version doesn't have Cased, set up our own. From
11772 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11773 # and only if C has the Lowercase or Uppercase property or has a
11774 # General_Category value of Titlecase_Letter.
a5c376b7
KW
11775 my $Unicode_Cased = property_ref('Cased');
11776 unless (defined $Unicode_Cased) {
99870f4d
KW
11777 my $cased = $perl->add_match_table('Cased',
11778 Initialize => $Lower + $Upper + $Title,
11779 Description => 'Uppercase or Lowercase or Titlecase',
11780 );
a5c376b7 11781 $Unicode_Cased = $cased;
99870f4d 11782 }
a5c376b7 11783 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
99870f4d
KW
11784
11785 # Similarly, set up our own Case_Ignorable property if this Unicode
11786 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11787 # C is defined to be case-ignorable if C has the value MidLetter or the
11788 # value MidNumLet for the Word_Break property or its General_Category is
11789 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11790 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11791
8050d00f
KW
11792 # Perl has long had an internal-only alias for this property; grandfather
11793 # it in to the pod, but discourage its use.
11794 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
b15a0a3b
KW
11795 Re_Pod_Entry => 1,
11796 Fate => $INTERNAL_ONLY,
11797 Status => $DISCOURAGED);
99870f4d
KW
11798 my $case_ignorable = property_ref('Case_Ignorable');
11799 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11800 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11801 Related => 1);
11802 }
11803 else {
11804
11805 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11806
11807 # The following three properties are not in early releases
11808 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11809 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11810 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11811
11812 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11813 # correspondingly the case-ignorable definition lacks that one. For
11814 # 4.0, it appears that it was meant to be the same definition, but was
11815 # inadvertently omitted from the standard's text, so add it if the
11816 # property actually is there
11817 my $wb = property_ref('Word_Break');
11818 if (defined $wb) {
11819 my $midlet = $wb->table('MidLetter');
11820 $perl_case_ignorable += $midlet if defined $midlet;
11821 my $midnumlet = $wb->table('MidNumLet');
11822 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11823 }
11824 else {
11825
11826 # In earlier versions of the standard, instead of the above two
11827 # properties , just the following characters were used:
11828 $perl_case_ignorable += 0x0027 # APOSTROPHE
11829 + 0x00AD # SOFT HYPHEN (SHY)
11830 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11831 }
11832 }
11833
11834 # The remaining perl defined tables are mostly based on Unicode TR 18,
11835 # "Annex C: Compatibility Properties". All of these have two versions,
11836 # one whose name generally begins with Posix that is posix-compliant, and
11837 # one that matches Unicode characters beyond the Posix, ASCII range
11838
ad5e8af1 11839 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
11840
11841 # Alphabetic was not present in early releases
11842 my $Alphabetic = property_ref('Alphabetic');
11843 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11844 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11845 }
11846 else {
11847
11848 # For early releases, we don't get it exactly right. The below
11849 # includes more than it should, which in 5.2 terms is: L + Nl +
11850 # Other_Alphabetic. Other_Alphabetic contains many characters from
11851 # Mn and Mc. It's better to match more than we should, than less than
11852 # we should.
11853 $Alpha->initialize($gc->table('Letter')
11854 + $gc->table('Mn')
11855 + $gc->table('Mc'));
11856 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 11857 $Alpha->add_description('Alphabetic');
99870f4d 11858 }
cbc24f92 11859 $Alpha->add_alias('XPosixAlpha');
a5c376b7 11860 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
ad5e8af1
KW
11861 Description => "[A-Za-z]",
11862 Initialize => $Alpha & $ASCII,
11863 );
a5c376b7
KW
11864 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11865 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
99870f4d
KW
11866
11867 my $Alnum = $perl->add_match_table('Alnum',
56339b2c 11868 Description => 'Alphabetic and (decimal) Numeric',
99870f4d
KW
11869 Initialize => $Alpha + $gc->table('Decimal_Number'),
11870 );
cbc24f92 11871 $Alnum->add_alias('XPosixAlnum');
ad5e8af1
KW
11872 $perl->add_match_table("PosixAlnum",
11873 Description => "[A-Za-z0-9]",
11874 Initialize => $Alnum & $ASCII,
11875 );
99870f4d
KW
11876
11877 my $Word = $perl->add_match_table('Word',
d35dd6c6
KW
11878 Description => '\w, including beyond ASCII;'
11879 . ' = \p{Alnum} + \pM + \p{Pc}',
99870f4d
KW
11880 Initialize => $Alnum + $gc->table('Mark'),
11881 );
cbc24f92 11882 $Word->add_alias('XPosixWord');
99870f4d
KW
11883 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11884 $Word += $Pc if defined $Pc;
11885
f38f76ae 11886 # This is a Perl extension, so the name doesn't begin with Posix.
cbc24f92 11887 my $PerlWord = $perl->add_match_table('PerlWord',
99870f4d
KW
11888 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11889 Initialize => $Word & $ASCII,
11890 );
cbc24f92 11891 $PerlWord->add_alias('PosixWord');
99870f4d
KW
11892
11893 my $Blank = $perl->add_match_table('Blank',
11894 Description => '\h, Horizontal white space',
11895
11896 # 200B is Zero Width Space which is for line
11897 # break control, and was listed as
11898 # Space_Separator in early releases
11899 Initialize => $gc->table('Space_Separator')
11900 + 0x0009 # TAB
11901 - 0x200B, # ZWSP
11902 );
11903 $Blank->add_alias('HorizSpace'); # Another name for it.
cbc24f92 11904 $Blank->add_alias('XPosixBlank');
ad5e8af1
KW
11905 $perl->add_match_table("PosixBlank",
11906 Description => "\\t and ' '",
11907 Initialize => $Blank & $ASCII,
11908 );
99870f4d
KW
11909
11910 my $VertSpace = $perl->add_match_table('VertSpace',
11911 Description => '\v',
11912 Initialize => $gc->table('Line_Separator')
11913 + $gc->table('Paragraph_Separator')
11914 + 0x000A # LINE FEED
11915 + 0x000B # VERTICAL TAB
11916 + 0x000C # FORM FEED
11917 + 0x000D # CARRIAGE RETURN
11918 + 0x0085, # NEL
11919 );
11920 # No Posix equivalent for vertical space
11921
11922 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
11923 Description => '\s including beyond ASCII plus vertical tab',
11924 Initialize => $Blank + $VertSpace,
99870f4d 11925 );
cbc24f92 11926 $Space->add_alias('XPosixSpace');
ad5e8af1 11927 $perl->add_match_table("PosixSpace",
f38f76ae 11928 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
11929 Initialize => $Space & $ASCII,
11930 );
99870f4d
KW
11931
11932 # Perl's traditional space doesn't include Vertical Tab
cbc24f92 11933 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
99870f4d
KW
11934 Description => '\s, including beyond ASCII',
11935 Initialize => $Space - 0x000B,
11936 );
cbc24f92
KW
11937 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
11938 my $PerlSpace = $perl->add_match_table('PerlSpace',
de25ec47
KW
11939 Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11940 Initialize => $XPerlSpace & $ASCII,
99870f4d
KW
11941 );
11942
cbc24f92 11943
99870f4d 11944 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 11945 Description => 'Control characters');
99870f4d 11946 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
cbc24f92 11947 $Cntrl->add_alias('XPosixCntrl');
ad5e8af1 11948 $perl->add_match_table("PosixCntrl",
f38f76ae 11949 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
11950 Initialize => $Cntrl & $ASCII,
11951 );
99870f4d
KW
11952
11953 # $controls is a temporary used to construct Graph.
11954 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11955 + $gc->table('Control'));
11956 # Cs not in release 1
11957 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11958
11959 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11960 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 11961 Description => 'Characters that are graphical',
99870f4d
KW
11962 Initialize => ~ ($Space + $controls),
11963 );
cbc24f92 11964 $Graph->add_alias('XPosixGraph');
ad5e8af1 11965 $perl->add_match_table("PosixGraph",
f38f76ae
KW
11966 Description =>
11967 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
11968 Initialize => $Graph & $ASCII,
11969 );
99870f4d 11970
3e20195b 11971 $print = $perl->add_match_table('Print',
ad5e8af1 11972 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 11973 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 11974 );
cbc24f92 11975 $print->add_alias('XPosixPrint');
ad5e8af1 11976 $perl->add_match_table("PosixPrint",
66fd7fd0 11977 Description =>
f38f76ae 11978 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 11979 Initialize => $print & $ASCII,
ad5e8af1 11980 );
99870f4d
KW
11981
11982 my $Punct = $perl->add_match_table('Punct');
11983 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11984
11985 # \p{punct} doesn't include the symbols, which posix does
cbc24f92
KW
11986 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
11987 Description => '\p{Punct} + ASCII-range \p{Symbol}',
11988 Initialize => $gc->table('Punctuation')
11989 + ($ASCII & $gc->table('Symbol')),
bb080638 11990 Perl_Extension => 1
cbc24f92 11991 );
bb080638 11992 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
f38f76ae 11993 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
cbc24f92 11994 Initialize => $ASCII & $XPosixPunct,
ad5e8af1 11995 );
99870f4d
KW
11996
11997 my $Digit = $perl->add_match_table('Digit',
f3a73f6e 11998 Description => '[0-9] + all other decimal digits');
99870f4d 11999 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
cbc24f92 12000 $Digit->add_alias('XPosixDigit');
ad5e8af1
KW
12001 my $PosixDigit = $perl->add_match_table("PosixDigit",
12002 Description => '[0-9]',
12003 Initialize => $Digit & $ASCII,
12004 );
99870f4d 12005
eadadd41
KW
12006 # Hex_Digit was not present in first release
12007 my $Xdigit = $perl->add_match_table('XDigit');
cbc24f92 12008 $Xdigit->add_alias('XPosixXDigit');
eadadd41
KW
12009 my $Hex = property_ref('Hex_Digit');
12010 if (defined $Hex && ! $Hex->is_empty) {
12011 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
12012 }
12013 else {
eadadd41
KW
12014 # (Have to use hex instead of e.g. '0', because could be running on an
12015 # non-ASCII machine, and we want the Unicode (ASCII) values)
12016 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
12017 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
12018 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d 12019 }
4efcc33b
KW
12020
12021 # AHex was not present in early releases
12022 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
12023 my $AHex = property_ref('ASCII_Hex_Digit');
12024 if (defined $AHex && ! $AHex->is_empty) {
12025 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
12026 }
12027 else {
12028 $PosixXDigit->initialize($Xdigit & $ASCII);
12029 }
12030 $PosixXDigit->add_description('[0-9A-Fa-f]');
99870f4d 12031
99870f4d
KW
12032 my $dt = property_ref('Decomposition_Type');
12033 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
12034 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
12035 Perl_Extension => 1,
d57ccc9a 12036 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
12037 );
12038
12039 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
12040 # than SD appeared, construct it ourselves, based on the first release SD
8050d00f 12041 # was in. A pod entry is grandfathered in for it
33e96e72 12042 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
301ba948
KW
12043 Perl_Extension => 1,
12044 Fate => $INTERNAL_ONLY,
12045 Status => $DISCOURAGED);
99870f4d
KW
12046 my $soft_dotted = property_ref('Soft_Dotted');
12047 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
12048 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
12049 }
12050 else {
12051
12052 # This list came from 3.2 Soft_Dotted.
12053 $CanonDCIJ->initialize([ 0x0069,
12054 0x006A,
12055 0x012F,
12056 0x0268,
12057 0x0456,
12058 0x0458,
12059 0x1E2D,
12060 0x1ECB,
12061 ]);
12062 $CanonDCIJ = $CanonDCIJ & $Assigned;
12063 }
12064
f86864ac 12065 # These are used in Unicode's definition of \X
6ba2d696 12066 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
301ba948 12067 Fate => $INTERNAL_ONLY);
6ba2d696 12068 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
301ba948 12069 Fate => $INTERNAL_ONLY);
37e2e78e 12070
ee24a51c
KW
12071 # For backward compatibility, Perl has its own definition for IDStart
12072 # First, we include the underscore, and then the regular XID_Start also
12073 # have to be Words
12074 $perl->add_match_table('_Perl_IDStart',
12075 Perl_Extension => 1,
301ba948 12076 Fate => $INTERNAL_ONLY,
ee24a51c
KW
12077 Initialize =>
12078 ord('_')
12079 + (property_ref('XID_Start')->table('Y') & $Word)
12080 );
12081
99870f4d 12082 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 12083
678f13d5 12084 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
12085 # definition differs too much from the traditional Perl one to use.
12086 if (defined $gcb && defined $gcb->table('SpacingMark')) {
12087
12088 # Note that assumes HST is defined; it came in an earlier release than
12089 # GCB. In the line below, two negatives means: yes hangul
12090 $begin += ~ property_ref('Hangul_Syllable_Type')
12091 ->table('Not_Applicable')
12092 + ~ ($gcb->table('Control')
12093 + $gcb->table('CR')
12094 + $gcb->table('LF'));
12095 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12096
12097 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12098 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
12099 }
12100 else { # Old definition, used on early releases.
f86864ac 12101 $extend += $gc->table('Mark')
37e2e78e
KW
12102 + 0x200C # ZWNJ
12103 + 0x200D; # ZWJ
12104 $begin += ~ $extend;
12105
12106 # Here we may have a release that has the regular grapheme cluster
12107 # defined, or a release that doesn't have anything defined.
12108 # We set things up so the Perl core degrades gracefully, possibly with
12109 # placeholders that match nothing.
12110
12111 if (! defined $gcb) {
12112 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
12113 }
12114 my $hst = property_ref('HST');
12115 if (!defined $hst) {
12116 $hst = Property->new('HST', Status => $PLACEHOLDER);
12117 $hst->add_match_table('Not_Applicable',
12118 Initialize => $Any,
12119 Matches_All => 1);
12120 }
12121
12122 # On some releases, here we may not have the needed tables for the
12123 # perl core, in some releases we may.
12124 foreach my $name (qw{ L LV LVT T V prepend }) {
12125 my $table = $gcb->table($name);
12126 if (! defined $table) {
12127 $table = $gcb->add_match_table($name);
12128 push @tables_that_may_be_empty, $table->complete_name;
12129 }
12130
12131 # The HST property predates the GCB one, and has identical tables
12132 # for some of them, so use it if we can.
12133 if ($table->is_empty
12134 && defined $hst
12135 && defined $hst->table($name))
12136 {
12137 $table += $hst->table($name);
12138 }
12139 }
12140 }
12141
12142 # More GCB. If we found some hangul syllables, populate a combined
12143 # table.
301ba948
KW
12144 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
12145 Perl_Extension => 1,
12146 Fate => $INTERNAL_ONLY);
37e2e78e
KW
12147 my $LV = $gcb->table('LV');
12148 if ($LV->is_empty) {
12149 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12150 } else {
12151 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12152 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
12153 }
12154
28093d0e 12155 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
12156 my @composition = ('Name', 'Unicode_1_Name');
12157
12158 if (@named_sequences) {
12159 push @composition, 'Named_Sequence';
12160 foreach my $sequence (@named_sequences) {
12161 $perl_charname->add_anomalous_entry($sequence);
12162 }
12163 }
12164
12165 my $alias_sentence = "";
12166 my $alias = property_ref('Name_Alias');
12167 if (defined $alias) {
12168 push @composition, 'Name_Alias';
12169 $alias->reset_each_range;
12170 while (my ($range) = $alias->each_range) {
12171 next if $range->value eq "";
12172 if ($range->start != $range->end) {
12173 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
12174 }
12175 $perl_charname->add_duplicate($range->start, $range->value);
12176 }
12177 $alias_sentence = <<END;
12178The Name_Alias property adds duplicate code point entries with a corrected
12179name. The original (less correct, but still valid) name will be physically
53d84487 12180last.
99870f4d
KW
12181END
12182 }
12183 my $comment;
12184 if (@composition <= 2) { # Always at least 2
12185 $comment = join " and ", @composition;
12186 }
12187 else {
12188 $comment = join ", ", @composition[0 .. scalar @composition - 2];
12189 $comment .= ", and $composition[-1]";
12190 }
12191
99870f4d
KW
12192 $perl_charname->add_comment(join_lines( <<END
12193This file is for charnames.pm. It is the union of the $comment properties.
12194Unicode_1_Name entries are used only for otherwise nameless code
12195points.
12196$alias_sentence
a03f0b9f
KW
12197This file doesn't include the algorithmically determinable names. For those,
12198use 'unicore/Name.pm'
12199END
12200 ));
12201 property_ref('Name')->add_comment(join_lines( <<END
12202This file doesn't include the algorithmically determinable names. For those,
12203use 'unicore/Name.pm'
99870f4d
KW
12204END
12205 ));
12206
99870f4d
KW
12207 # Construct the Present_In property from the Age property.
12208 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12209 my $default_map = $age->default_map;
12210 my $in = Property->new('In',
12211 Default_Map => $default_map,
12212 Full_Name => "Present_In",
99870f4d
KW
12213 Perl_Extension => 1,
12214 Type => $ENUM,
12215 Initialize => $age,
12216 );
12217 $in->add_comment(join_lines(<<END
c12f2655 12218THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
99870f4d
KW
12219same as for $age, and not for what $in really means. This is because anything
12220defined in a given release should have multiple values: that release and all
12221higher ones. But only one value per code point can be represented in a table
12222like this.
12223END
12224 ));
12225
12226 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
12227 # lowest numbered (earliest) come first, with the non-numeric one
12228 # last.
12229 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12230 ? 1
12231 : ($b->name !~ /^[\d.]*$/)
12232 ? -1
12233 : $a->name <=> $b->name
12234 } $age->tables;
12235
12236 # The Present_In property is the cumulative age properties. The first
12237 # one hence is identical to the first age one.
12238 my $previous_in = $in->add_match_table($first_age->name);
12239 $previous_in->set_equivalent_to($first_age, Related => 1);
12240
12241 my $description_start = "Code point's usage introduced in version ";
12242 $first_age->add_description($description_start . $first_age->name);
12243
98dc9551 12244 # To construct the accumulated values, for each of the age tables
99870f4d
KW
12245 # starting with the 2nd earliest, merge the earliest with it, to get
12246 # all those code points existing in the 2nd earliest. Repeat merging
12247 # the new 2nd earliest with the 3rd earliest to get all those existing
12248 # in the 3rd earliest, and so on.
12249 foreach my $current_age (@rest_ages) {
12250 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
12251
12252 my $current_in = $in->add_match_table(
12253 $current_age->name,
12254 Initialize => $current_age + $previous_in,
12255 Description => $description_start
12256 . $current_age->name
12257 . ' or earlier',
12258 );
12259 $previous_in = $current_in;
12260
12261 # Add clarifying material for the corresponding age file. This is
12262 # in part because of the confusing and contradictory information
12263 # given in the Standard's documentation itself, as of 5.2.
12264 $current_age->add_description(
12265 "Code point's usage was introduced in version "
12266 . $current_age->name);
12267 $current_age->add_note("See also $in");
12268
12269 }
12270
12271 # And finally the code points whose usages have yet to be decided are
12272 # the same in both properties. Note that permanently unassigned code
12273 # points actually have their usage assigned (as being permanently
12274 # unassigned), so that these tables are not the same as gc=cn.
12275 my $unassigned = $in->add_match_table($default_map);
12276 my $age_default = $age->table($default_map);
12277 $age_default->add_description(<<END
12278Code point's usage has not been assigned in any Unicode release thus far.
12279END
12280 );
12281 $unassigned->set_equivalent_to($age_default, Related => 1);
12282 }
12283
12284
12285 # Finished creating all the perl properties. All non-internal non-string
12286 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
12287 # an underscore.) These do not get a separate entry in the pod file
12288 foreach my $table ($perl->tables) {
12289 foreach my $alias ($table->aliases) {
12290 next if $alias->name =~ /^_/;
12291 $table->add_alias('Is_' . $alias->name,
33e96e72 12292 Re_Pod_Entry => 0,
fd1e3e84 12293 UCD => 0,
99870f4d 12294 Status => $alias->status,
0eac1e20 12295 OK_as_Filename => 0);
99870f4d
KW
12296 }
12297 }
12298
c4019d52
KW
12299 # Here done with all the basic stuff. Ready to populate the information
12300 # about each character if annotating them.
558712cf 12301 if ($annotate) {
c4019d52
KW
12302
12303 # See comments at its declaration
12304 $annotate_ranges = Range_Map->new;
12305
12306 # This separates out the non-characters from the other unassigneds, so
12307 # can give different annotations for each.
12308 $unassigned_sans_noncharacters = Range_List->new(
12309 Initialize => $gc->table('Unassigned')
12310 & property_ref('Noncharacter_Code_Point')->table('N'));
12311
6189eadc 12312 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
c4019d52
KW
12313 $i = populate_char_info($i); # Note sets $i so may cause skips
12314 }
12315 }
12316
99870f4d
KW
12317 return;
12318}
12319
12320sub add_perl_synonyms() {
12321 # A number of Unicode tables have Perl synonyms that are expressed in
12322 # the single-form, \p{name}. These are:
12323 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12324 # \p{Is_Name} as synonyms
12325 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12326 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12327 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12328 # conflict, \p{Value} and \p{Is_Value} as well
12329 #
12330 # This routine generates these synonyms, warning of any unexpected
12331 # conflicts.
12332
12333 # Construct the list of tables to get synonyms for. Start with all the
12334 # binary and the General_Category ones.
06f26c45
KW
12335 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
12336 property_ref('*');
99870f4d
KW
12337 push @tables, $gc->tables;
12338
12339 # If the version of Unicode includes the Script property, add its tables
359523e2 12340 push @tables, $script->tables if defined $script;
99870f4d
KW
12341
12342 # The Block tables are kept separate because they are treated differently.
12343 # And the earliest versions of Unicode didn't include them, so add only if
12344 # there are some.
12345 my @blocks;
12346 push @blocks, $block->tables if defined $block;
12347
12348 # Here, have the lists of tables constructed. Process blocks last so that
12349 # if there are name collisions with them, blocks have lowest priority.
12350 # Should there ever be other collisions, manual intervention would be
12351 # required. See the comments at the beginning of the program for a
12352 # possible way to handle those semi-automatically.
12353 foreach my $table (@tables, @blocks) {
12354
12355 # For non-binary properties, the synonym is just the name of the
12356 # table, like Greek, but for binary properties the synonym is the name
12357 # of the property, and means the code points in its 'Y' table.
12358 my $nominal = $table;
12359 my $nominal_property = $nominal->property;
12360 my $actual;
12361 if (! $nominal->isa('Property')) {
12362 $actual = $table;
12363 }
12364 else {
12365
12366 # Here is a binary property. Use the 'Y' table. Verify that is
12367 # there
12368 my $yes = $nominal->table('Y');
12369 unless (defined $yes) { # Must be defined, but is permissible to
12370 # be empty.
12371 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
12372 next;
12373 }
12374 $actual = $yes;
12375 }
12376
12377 foreach my $alias ($nominal->aliases) {
12378
12379 # Attempt to create a table in the perl directory for the
12380 # candidate table, using whatever aliases in it that don't
12381 # conflict. Also add non-conflicting aliases for all these
12382 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12383 PREFIX:
12384 foreach my $prefix ("", 'Is_', 'In_') {
12385
12386 # Only Block properties can have added 'In_' aliases.
12387 next if $prefix eq 'In_' and $nominal_property != $block;
12388
12389 my $proposed_name = $prefix . $alias->name;
12390
12391 # No Is_Is, In_In, nor combinations thereof
12392 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12393 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12394
12395 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12396
12397 # Get a reference to any existing table in the perl
12398 # directory with the desired name.
12399 my $pre_existing = $perl->table($proposed_name);
12400
12401 if (! defined $pre_existing) {
12402
12403 # No name collision, so ok to add the perl synonym.
12404
33e96e72 12405 my $make_re_pod_entry;
0eac1e20 12406 my $ok_as_filename;
4cd1260a 12407 my $status = $alias->status;
99870f4d
KW
12408 if ($nominal_property == $block) {
12409
12410 # For block properties, the 'In' form is preferred for
12411 # external use; the pod file contains wild cards for
12412 # this and the 'Is' form so no entries for those; and
12413 # we don't want people using the name without the
12414 # 'In', so discourage that.
12415 if ($prefix eq "") {
33e96e72 12416 $make_re_pod_entry = 1;
99870f4d 12417 $status = $status || $DISCOURAGED;
0eac1e20 12418 $ok_as_filename = 0;
99870f4d
KW
12419 }
12420 elsif ($prefix eq 'In_') {
33e96e72 12421 $make_re_pod_entry = 0;
99870f4d 12422 $status = $status || $NORMAL;
0eac1e20 12423 $ok_as_filename = 1;
99870f4d
KW
12424 }
12425 else {
33e96e72 12426 $make_re_pod_entry = 0;
99870f4d 12427 $status = $status || $DISCOURAGED;
0eac1e20 12428 $ok_as_filename = 0;
99870f4d
KW
12429 }
12430 }
12431 elsif ($prefix ne "") {
12432
12433 # The 'Is' prefix is handled in the pod by a wild
12434 # card, and we won't use it for an external name
33e96e72 12435 $make_re_pod_entry = 0;
99870f4d 12436 $status = $status || $NORMAL;
0eac1e20 12437 $ok_as_filename = 0;
99870f4d
KW
12438 }
12439 else {
12440
12441 # Here, is an empty prefix, non block. This gets its
12442 # own pod entry and can be used for an external name.
33e96e72 12443 $make_re_pod_entry = 1;
99870f4d 12444 $status = $status || $NORMAL;
0eac1e20 12445 $ok_as_filename = 1;
99870f4d
KW
12446 }
12447
12448 # Here, there isn't a perl pre-existing table with the
12449 # name. Look through the list of equivalents of this
12450 # table to see if one is a perl table.
12451 foreach my $equivalent ($actual->leader->equivalents) {
12452 next if $equivalent->property != $perl;
12453
12454 # Here, have found a table for $perl. Add this alias
12455 # to it, and are done with this prefix.
12456 $equivalent->add_alias($proposed_name,
33e96e72 12457 Re_Pod_Entry => $make_re_pod_entry,
fd1e3e84
KW
12458
12459 # Currently don't output these in the
12460 # ucd pod, as are strongly discouraged
12461 # from being used
12462 UCD => 0,
12463
99870f4d 12464 Status => $status,
0eac1e20 12465 OK_as_Filename => $ok_as_filename);
99870f4d
KW
12466 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12467 next PREFIX;
12468 }
12469
12470 # Here, $perl doesn't already have a table that is a
12471 # synonym for this property, add one.
12472 my $added_table = $perl->add_match_table($proposed_name,
33e96e72 12473 Re_Pod_Entry => $make_re_pod_entry,
fd1e3e84
KW
12474
12475 # See UCD comment just above
12476 UCD => 0,
12477
99870f4d 12478 Status => $status,
0eac1e20 12479 OK_as_Filename => $ok_as_filename);
99870f4d
KW
12480 # And it will be related to the actual table, since it is
12481 # based on it.
12482 $added_table->set_equivalent_to($actual, Related => 1);
12483 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12484 next;
12485 } # End of no pre-existing.
12486
12487 # Here, there is a pre-existing table that has the proposed
12488 # name. We could be in trouble, but not if this is just a
12489 # synonym for another table that we have already made a child
12490 # of the pre-existing one.
6505c6e2 12491 if ($pre_existing->is_set_equivalent_to($actual)) {
99870f4d
KW
12492 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12493 $pre_existing->add_alias($proposed_name);
12494 next;
12495 }
12496
12497 # Here, there is a name collision, but it still could be ok if
12498 # the tables match the identical set of code points, in which
12499 # case, we can combine the names. Compare each table's code
12500 # point list to see if they are identical.
12501 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12502 if ($pre_existing->matches_identically_to($actual)) {
12503
12504 # Here, they do match identically. Not a real conflict.
12505 # Make the perl version a child of the Unicode one, except
12506 # in the non-obvious case of where the perl name is
12507 # already a synonym of another Unicode property. (This is
12508 # excluded by the test for it being its own parent.) The
12509 # reason for this exclusion is that then the two Unicode
12510 # properties become related; and we don't really know if
12511 # they are or not. We generate documentation based on
12512 # relatedness, and this would be misleading. Code
12513 # later executed in the process will cause the tables to
12514 # be represented by a single file anyway, without making
12515 # it look in the pod like they are necessarily related.
12516 if ($pre_existing->parent == $pre_existing
12517 && ($pre_existing->property == $perl
12518 || $actual->property == $perl))
12519 {
12520 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12521 $pre_existing->set_equivalent_to($actual, Related => 1);
12522 }
12523 elsif (main::DEBUG && $to_trace) {
12524 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12525 trace $pre_existing->parent;
12526 }
12527 next PREFIX;
12528 }
12529
12530 # Here they didn't match identically, there is a real conflict
12531 # between our new name and a pre-existing property.
12532 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12533 $pre_existing->add_conflicting($nominal->full_name,
12534 'p',
12535 $actual);
12536
12537 # Don't output a warning for aliases for the block
12538 # properties (unless they start with 'In_') as it is
12539 # expected that there will be conflicts and the block
12540 # form loses.
12541 if ($verbosity >= $NORMAL_VERBOSITY
12542 && ($actual->property != $block || $prefix eq 'In_'))
12543 {
12544 print simple_fold(join_lines(<<END
12545There is already an alias named $proposed_name (from " . $pre_existing . "),
12546so not creating this alias for " . $actual
12547END
12548 ), "", 4);
12549 }
12550
12551 # Keep track for documentation purposes.
12552 $has_In_conflicts++ if $prefix eq 'In_';
12553 $has_Is_conflicts++ if $prefix eq 'Is_';
12554 }
12555 }
12556 }
12557
12558 # There are some properties which have No and Yes (and N and Y) as
12559 # property values, but aren't binary, and could possibly be confused with
12560 # binary ones. So create caveats for them. There are tables that are
12561 # named 'No', and tables that are named 'N', but confusion is not likely
12562 # unless they are the same table. For example, N meaning Number or
12563 # Neutral is not likely to cause confusion, so don't add caveats to things
12564 # like them.
06f26c45
KW
12565 foreach my $property (grep { $_->type != $BINARY
12566 && $_->type != $FORCED_BINARY }
12567 property_ref('*'))
12568 {
99870f4d
KW
12569 my $yes = $property->table('Yes');
12570 if (defined $yes) {
12571 my $y = $property->table('Y');
12572 if (defined $y && $yes == $y) {
12573 foreach my $alias ($property->aliases) {
12574 $yes->add_conflicting($alias->name);
12575 }
12576 }
12577 }
12578 my $no = $property->table('No');
12579 if (defined $no) {
12580 my $n = $property->table('N');
12581 if (defined $n && $no == $n) {
12582 foreach my $alias ($property->aliases) {
12583 $no->add_conflicting($alias->name, 'P');
12584 }
12585 }
12586 }
12587 }
12588
12589 return;
12590}
12591
12592sub register_file_for_name($$$) {
12593 # Given info about a table and a datafile that it should be associated
98dc9551 12594 # with, register that association
99870f4d
KW
12595
12596 my $table = shift;
12597 my $directory_ref = shift; # Array of the directory path for the file
e6ebc4c0 12598 my $file = shift; # The file name in the final directory.
99870f4d
KW
12599 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12600
12601 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12602
12603 if ($table->isa('Property')) {
12604 $table->set_file_path(@$directory_ref, $file);
48cf9da9 12605 push @map_properties, $table;
315bfd4e
KW
12606
12607 # No swash means don't do the rest of this.
12608 return if $table->fate != $ORDINARY;
12609
12610 # Get the path to the file
12611 my @path = $table->file_path;
12612
12613 # Use just the file name if no subdirectory.
12614 shift @path if $path[0] eq File::Spec->curdir();
12615
12616 my $file = join '/', @path;
12617
12618 # Create a hash entry for utf8_heavy to get the file that stores this
12619 # property's map table
12620 foreach my $alias ($table->aliases) {
12621 my $name = $alias->name;
12622 $loose_property_to_file_of{standardize($name)} = $file;
12623 }
12624
89cf10cc
KW
12625 # And a way for utf8_heavy to find the proper key in the SwashInfo
12626 # hash for this property.
12627 $file_to_swash_name{$file} = "To" . $table->swash_name;
99870f4d
KW
12628 return;
12629 }
12630
12631 # Do all of the work for all equivalent tables when called with the leader
12632 # table, so skip if isn't the leader.
12633 return if $table->leader != $table;
12634
a92d5c2e
KW
12635 # If this is a complement of another file, use that other file instead,
12636 # with a ! prepended to it.
12637 my $complement;
12638 if (($complement = $table->complement) != 0) {
12639 my @directories = $complement->file_path;
12640
12641 # This assumes that the 0th element is something like 'lib',
12642 # the 1th element the property name (in its own directory), like
12643 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12644 # appended to it later.
12645 $directories[1] =~ s/^/!/;
12646 $file = pop @directories;
12647 $directory_ref =\@directories;
12648 }
12649
99870f4d
KW
12650 # Join all the file path components together, using slashes.
12651 my $full_filename = join('/', @$directory_ref, $file);
12652
12653 # All go in the same subdirectory of unicore
12654 if ($directory_ref->[0] ne $matches_directory) {
12655 Carp::my_carp("Unexpected directory in "
12656 . join('/', @{$directory_ref}, $file));
12657 }
12658
12659 # For this table and all its equivalents ...
12660 foreach my $table ($table, $table->equivalents) {
12661
12662 # Associate it with its file internally. Don't include the
12663 # $matches_directory first component
12664 $table->set_file_path(@$directory_ref, $file);
c15fda25
KW
12665
12666 # No swash means don't do the rest of this.
12667 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
12668
99870f4d
KW
12669 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12670
12671 my $property = $table->property;
ae51efca
KW
12672 my $property_name = ($property == $perl)
12673 ? "" # 'perl' is never explicitly stated
12674 : standardize($property->name) . '=';
99870f4d 12675
c15fda25
KW
12676 my $is_default = 0; # Is this table the default one for the property?
12677
12678 # To calculate $is_default, we find if this table is the same as the
12679 # default one for the property. But this is complicated by the
12680 # possibility that there is a master table for this one, and the
12681 # information is stored there instead of here.
9e4a1e86
KW
12682 my $parent = $table->parent;
12683 my $leader_prop = $parent->property;
c15fda25
KW
12684 my $default_map = $leader_prop->default_map;
12685 if (defined $default_map) {
12686 my $default_table = $leader_prop->table($default_map);
12687 $is_default = 1 if defined $default_table && $parent == $default_table;
12688 }
9e4a1e86
KW
12689
12690 # Calculate the loose name for this table. Mostly it's just its name,
12691 # standardized. But in the case of Perl tables that are single-form
12692 # equivalents to Unicode properties, it is the latter's name.
12693 my $loose_table_name =
12694 ($property != $perl || $leader_prop == $perl)
12695 ? standardize($table->name)
12696 : standardize($parent->name);
12697
99870f4d
KW
12698 my $deprecated = ($table->status eq $DEPRECATED)
12699 ? $table->status_info
12700 : "";
d867ccfb 12701 my $caseless_equivalent = $table->caseless_equivalent;
99870f4d
KW
12702
12703 # And for each of the table's aliases... This inner loop eventually
12704 # goes through all aliases in the UCD that we generate regex match
12705 # files for
12706 foreach my $alias ($table->aliases) {
c85f591a 12707 my $standard = utf8_heavy_name($table, $alias);
99870f4d
KW
12708
12709 # Generate an entry in either the loose or strict hashes, which
12710 # will translate the property and alias names combination into the
12711 # file where the table for them is stored.
99870f4d 12712 if ($alias->loose_match) {
99870f4d
KW
12713 if (exists $loose_to_file_of{$standard}) {
12714 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12715 }
12716 else {
12717 $loose_to_file_of{$standard} = $sub_filename;
12718 }
12719 }
12720 else {
99870f4d
KW
12721 if (exists $stricter_to_file_of{$standard}) {
12722 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12723 }
12724 else {
12725 $stricter_to_file_of{$standard} = $sub_filename;
12726
12727 # Tightly coupled with how utf8_heavy.pl works, for a
12728 # floating point number that is a whole number, get rid of
12729 # the trailing decimal point and 0's, so that utf8_heavy
12730 # will work. Also note that this assumes that such a
12731 # number is matched strictly; so if that were to change,
12732 # this would be wrong.
c85f591a 12733 if ((my $integer_name = $alias->name)
99870f4d
KW
12734 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12735 {
ae51efca 12736 $stricter_to_file_of{$property_name . $integer_name}
c12f2655 12737 = $sub_filename;
99870f4d
KW
12738 }
12739 }
12740 }
12741
9e4a1e86
KW
12742 # For Unicode::UCD, create a mapping of the prop=value to the
12743 # canonical =value for that property.
12744 if ($standard =~ /=/) {
12745
12746 # This could happen if a strict name mapped into an existing
12747 # loose name. In that event, the strict names would have to
12748 # be moved to a new hash.
12749 if (exists($loose_to_standard_value{$standard})) {
12750 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
12751 }
12752 $loose_to_standard_value{$standard} = $loose_table_name;
12753 }
12754
99870f4d 12755 # Keep a list of the deprecated properties and their filenames
a92d5c2e 12756 if ($deprecated && $complement == 0) {
99870f4d
KW
12757 $utf8::why_deprecated{$sub_filename} = $deprecated;
12758 }
d867ccfb
KW
12759
12760 # And a substitute table, if any, for case-insensitive matching
12761 if ($caseless_equivalent != 0) {
12762 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12763 }
c15fda25
KW
12764
12765 # Add to defaults list if the table this alias belongs to is the
12766 # default one
12767 $loose_defaults{$standard} = 1 if $is_default;
99870f4d
KW
12768 }
12769 }
12770
12771 return;
12772}
12773
12774{ # Closure
12775 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12776 # conflicts
12777 my %full_dir_name_of; # Full length names of directories used.
12778
12779 sub construct_filename($$$) {
12780 # Return a file name for a table, based on the table name, but perhaps
12781 # changed to get rid of non-portable characters in it, and to make
12782 # sure that it is unique on a file system that allows the names before
12783 # any period to be at most 8 characters (DOS). While we're at it
12784 # check and complain if there are any directory conflicts.
12785
12786 my $name = shift; # The name to start with
12787 my $mutable = shift; # Boolean: can it be changed? If no, but
12788 # yet it must be to work properly, a warning
12789 # is given
12790 my $directories_ref = shift; # A reference to an array containing the
12791 # path to the file, with each element one path
12792 # component. This is used because the same
12793 # name can be used in different directories.
12794 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12795
12796 my $warn = ! defined wantarray; # If true, then if the name is
12797 # changed, a warning is issued as well.
12798
12799 if (! defined $name) {
12800 Carp::my_carp("Undefined name in directory "
12801 . File::Spec->join(@$directories_ref)
12802 . ". '_' used");
12803 return '_';
12804 }
12805
12806 # Make sure that no directory names conflict with each other. Look at
12807 # each directory in the input file's path. If it is already in use,
12808 # assume it is correct, and is merely being re-used, but if we
12809 # truncate it to 8 characters, and find that there are two directories
12810 # that are the same for the first 8 characters, but differ after that,
12811 # then that is a problem.
12812 foreach my $directory (@$directories_ref) {
12813 my $short_dir = substr($directory, 0, 8);
12814 if (defined $full_dir_name_of{$short_dir}) {
12815 next if $full_dir_name_of{$short_dir} eq $directory;
12816 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12817 }
12818 else {
12819 $full_dir_name_of{$short_dir} = $directory;
12820 }
12821 }
12822
12823 my $path = join '/', @$directories_ref;
12824 $path .= '/' if $path;
12825
12826 # Remove interior underscores.
12827 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12828
12829 # Change any non-word character into an underscore, and truncate to 8.
12830 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12831 substr($filename, 8) = "" if length($filename) > 8;
12832
12833 # Make sure the basename doesn't conflict with something we
12834 # might have already written. If we have, say,
12835 # InGreekExtended1
12836 # InGreekExtended2
12837 # they become
12838 # InGreekE
12839 # InGreek2
12840 my $warned = 0;
12841 while (my $num = $base_names{$path}{lc $filename}++) {
12842 $num++; # so basenames with numbers start with '2', which
12843 # just looks more natural.
12844
12845 # Want to append $num, but if it'll make the basename longer
12846 # than 8 characters, pre-truncate $filename so that the result
12847 # is acceptable.
12848 my $delta = length($filename) + length($num) - 8;
12849 if ($delta > 0) {
12850 substr($filename, -$delta) = $num;
12851 }
12852 else {
12853 $filename .= $num;
12854 }
12855 if ($warn && ! $warned) {
12856 $warned = 1;
12857 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12858 }
12859 }
12860
12861 return $filename if $mutable;
12862
12863 # If not changeable, must return the input name, but warn if needed to
12864 # change it beyond shortening it.
12865 if ($name ne $filename
12866 && substr($name, 0, length($filename)) ne $filename) {
12867 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12868 }
12869 return $name;
12870 }
12871}
12872
12873# The pod file contains a very large table. Many of the lines in that table
12874# would exceed a typical output window's size, and so need to be wrapped with
12875# a hanging indent to make them look good. The pod language is really
12876# insufficient here. There is no general construct to do that in pod, so it
12877# is done here by beginning each such line with a space to cause the result to
12878# be output without formatting, and doing all the formatting here. This leads
12879# to the result that if the eventual display window is too narrow it won't
12880# look good, and if the window is too wide, no advantage is taken of that
12881# extra width. A further complication is that the output may be indented by
12882# the formatter so that there is less space than expected. What I (khw) have
12883# done is to assume that that indent is a particular number of spaces based on
12884# what it is in my Linux system; people can always resize their windows if
12885# necessary, but this is obviously less than desirable, but the best that can
12886# be expected.
12887my $automatic_pod_indent = 8;
12888
12889# Try to format so that uses fewest lines, but few long left column entries
12890# slide into the right column. An experiment on 5.1 data yielded the
12891# following percentages that didn't cut into the other side along with the
12892# associated first-column widths
12893# 69% = 24
12894# 80% not too bad except for a few blocks
12895# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12896# 95% = 37;
12897my $indent_info_column = 27; # 75% of lines didn't have overlap
12898
12899my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12900 # The 3 is because of:
12901 # 1 for the leading space to tell the pod formatter to
12902 # output as-is
12903 # 1 for the flag
12904 # 1 for the space between the flag and the main data
12905
12906sub format_pod_line ($$$;$$) {
12907 # Take a pod line and return it, formatted properly
12908
12909 my $first_column_width = shift;
12910 my $entry = shift; # Contents of left column
12911 my $info = shift; # Contents of right column
12912
12913 my $status = shift || ""; # Any flag
12914
12915 my $loose_match = shift; # Boolean.
12916 $loose_match = 1 unless defined $loose_match;
12917
12918 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12919
12920 my $flags = "";
12921 $flags .= $STRICTER if ! $loose_match;
12922
12923 $flags .= $status if $status;
12924
12925 # There is a blank in the left column to cause the pod formatter to
12926 # output the line as-is.
12927 return sprintf " %-*s%-*s %s\n",
12928 # The first * in the format is replaced by this, the -1 is
12929 # to account for the leading blank. There isn't a
12930 # hard-coded blank after this to separate the flags from
12931 # the rest of the line, so that in the unlikely event that
12932 # multiple flags are shown on the same line, they both
12933 # will get displayed at the expense of that separation,
12934 # but since they are left justified, a blank will be
12935 # inserted in the normal case.
12936 $FILLER - 1,
12937 $flags,
12938
12939 # The other * in the format is replaced by this number to
12940 # cause the first main column to right fill with blanks.
12941 # The -1 is for the guaranteed blank following it.
12942 $first_column_width - $FILLER - 1,
12943 $entry,
12944 $info;
12945}
12946
12947my @zero_match_tables; # List of tables that have no matches in this release
12948
d1476e4d 12949sub make_re_pod_entries($) {
99870f4d
KW
12950 # This generates the entries for the pod file for a given table.
12951 # Also done at this time are any children tables. The output looks like:
12952 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
12953
12954 my $input_table = shift; # Table the entry is for
12955 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12956
12957 # Generate parent and all its children at the same time.
12958 return if $input_table->parent != $input_table;
12959
12960 my $property = $input_table->property;
12961 my $type = $property->type;
12962 my $full_name = $property->full_name;
12963
12964 my $count = $input_table->count;
12965 my $string_count = clarify_number($count);
12966 my $status = $input_table->status;
12967 my $status_info = $input_table->status_info;
56ca34ca 12968 my $caseless_equivalent = $input_table->caseless_equivalent;
99870f4d
KW
12969
12970 my $entry_for_first_table; # The entry for the first table output.
12971 # Almost certainly, it is the parent.
12972
12973 # For each related table (including itself), we will generate a pod entry
12974 # for each name each table goes by
12975 foreach my $table ($input_table, $input_table->children) {
12976
d4da3f74
KW
12977 # utf8_heavy.pl cannot deal with null string property values, so skip
12978 # any tables that have no non-null names.
12979 next if ! grep { $_->name ne "" } $table->aliases;
99870f4d
KW
12980
12981 # First, gather all the info that applies to this table as a whole.
12982
12983 push @zero_match_tables, $table if $count == 0;
12984
12985 my $table_property = $table->property;
12986
12987 # The short name has all the underscores removed, while the full name
12988 # retains them. Later, we decide whether to output a short synonym
12989 # for the full one, we need to compare apples to apples, so we use the
12990 # short name's length including underscores.
12991 my $table_property_short_name_length;
12992 my $table_property_short_name
12993 = $table_property->short_name(\$table_property_short_name_length);
12994 my $table_property_full_name = $table_property->full_name;
12995
12996 # Get how much savings there is in the short name over the full one
12997 # (delta will always be <= 0)
12998 my $table_property_short_delta = $table_property_short_name_length
12999 - length($table_property_full_name);
13000 my @table_description = $table->description;
13001 my @table_note = $table->note;
13002
13003 # Generate an entry for each alias in this table.
13004 my $entry_for_first_alias; # saves the first one encountered.
13005 foreach my $alias ($table->aliases) {
13006
13007 # Skip if not to go in pod.
33e96e72 13008 next unless $alias->make_re_pod_entry;
99870f4d
KW
13009
13010 # Start gathering all the components for the entry
13011 my $name = $alias->name;
13012
d4da3f74
KW
13013 # Skip if name is empty, as can't be accessed by regexes.
13014 next if $name eq "";
13015
99870f4d
KW
13016 my $entry; # Holds the left column, may include extras
13017 my $entry_ref; # To refer to the left column's contents from
13018 # another entry; has no extras
13019
13020 # First the left column of the pod entry. Tables for the $perl
13021 # property always use the single form.
13022 if ($table_property == $perl) {
13023 $entry = "\\p{$name}";
13024 $entry_ref = "\\p{$name}";
13025 }
13026 else { # Compound form.
13027
13028 # Only generate one entry for all the aliases that mean true
13029 # or false in binary properties. Append a '*' to indicate
13030 # some are missing. (The heading comment notes this.)
60e471b3 13031 my $rhs;
99870f4d
KW
13032 if ($type == $BINARY) {
13033 next if $name ne 'N' && $name ne 'Y';
60e471b3 13034 $rhs = "$name*";
99870f4d 13035 }
06f26c45 13036 elsif ($type != $FORCED_BINARY) {
60e471b3 13037 $rhs = $name;
99870f4d 13038 }
06f26c45
KW
13039 else {
13040
13041 # Forced binary properties require special handling. It
13042 # has two sets of tables, one set is true/false; and the
13043 # other set is everything else. Entries are generated for
13044 # each set. Use the Bidi_Mirrored property (which appears
13045 # in all Unicode versions) to get a list of the aliases
13046 # for the true/false tables. Of these, only output the N
13047 # and Y ones, the same as, a regular binary property. And
13048 # output all the rest, same as a non-binary property.
13049 my $bm = property_ref("Bidi_Mirrored");
13050 if ($name eq 'N' || $name eq 'Y') {
13051 $rhs = "$name*";
13052 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
13053 $bm->table("N")->aliases)
13054 {
13055 next;
13056 }
13057 else {
13058 $rhs = $name;
13059 }
13060 }
99870f4d
KW
13061
13062 # Colon-space is used to give a little more space to be easier
13063 # to read;
13064 $entry = "\\p{"
13065 . $table_property_full_name
60e471b3 13066 . ": $rhs}";
99870f4d
KW
13067
13068 # But for the reference to this entry, which will go in the
13069 # right column, where space is at a premium, use equals
13070 # without a space
13071 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
13072 }
13073
13074 # Then the right (info) column. This is stored as components of
13075 # an array for the moment, then joined into a string later. For
13076 # non-internal only properties, begin the info with the entry for
13077 # the first table we encountered (if any), as things are ordered
13078 # so that that one is the most descriptive. This leads to the
13079 # info column of an entry being a more descriptive version of the
13080 # name column
13081 my @info;
13082 if ($name =~ /^_/) {
13083 push @info,
13084 '(For internal use by Perl, not necessarily stable)';
13085 }
13086 elsif ($entry_for_first_alias) {
13087 push @info, $entry_for_first_alias;
13088 }
13089
13090 # If this entry is equivalent to another, add that to the info,
13091 # using the first such table we encountered
13092 if ($entry_for_first_table) {
13093 if (@info) {
13094 push @info, "(= $entry_for_first_table)";
13095 }
13096 else {
13097 push @info, $entry_for_first_table;
13098 }
13099 }
13100
13101 # If the name is a large integer, add an equivalent with an
13102 # exponent for better readability
13103 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
13104 push @info, sprintf "(= %.1e)", $name
13105 }
13106
13107 my $parenthesized = "";
13108 if (! $entry_for_first_alias) {
13109
13110 # This is the first alias for the current table. The alias
13111 # array is ordered so that this is the fullest, most
13112 # descriptive alias, so it gets the fullest info. The other
13113 # aliases are mostly merely pointers to this one, using the
13114 # information already added above.
13115
13116 # Display any status message, but only on the parent table
13117 if ($status && ! $entry_for_first_table) {
13118 push @info, $status_info;
13119 }
13120
13121 # Put out any descriptive info
13122 if (@table_description || @table_note) {
13123 push @info, join "; ", @table_description, @table_note;
13124 }
13125
13126 # Look to see if there is a shorter name we can point people
13127 # at
13128 my $standard_name = standardize($name);
13129 my $short_name;
13130 my $proposed_short = $table->short_name;
13131 if (defined $proposed_short) {
13132 my $standard_short = standardize($proposed_short);
13133
13134 # If the short name is shorter than the standard one, or
13135 # even it it's not, but the combination of it and its
13136 # short property name (as in \p{prop=short} ($perl doesn't
13137 # have this form)) saves at least two characters, then,
13138 # cause it to be listed as a shorter synonym.
13139 if (length $standard_short < length $standard_name
13140 || ($table_property != $perl
13141 && (length($standard_short)
13142 - length($standard_name)
13143 + $table_property_short_delta) # (<= 0)
13144 < -2))
13145 {
13146 $short_name = $proposed_short;
13147 if ($table_property != $perl) {
13148 $short_name = $table_property_short_name
13149 . "=$short_name";
13150 }
13151 $short_name = "\\p{$short_name}";
13152 }
13153 }
13154
13155 # And if this is a compound form name, see if there is a
13156 # single form equivalent
13157 my $single_form;
13158 if ($table_property != $perl) {
13159
13160 # Special case the binary N tables, so that will print
13161 # \P{single}, but use the Y table values to populate
c12f2655 13162 # 'single', as we haven't likewise populated the N table.
06f26c45
KW
13163 # For forced binary tables, we can't just look at the N
13164 # table, but must see if this table is equivalent to the N
13165 # one, as there are two equivalent beasts in these
13166 # properties.
99870f4d
KW
13167 my $test_table;
13168 my $p;
06f26c45
KW
13169 if ( ($type == $BINARY
13170 && $input_table == $property->table('No'))
13171 || ($type == $FORCED_BINARY
13172 && $property->table('No')->
13173 is_set_equivalent_to($input_table)))
99870f4d
KW
13174 {
13175 $test_table = $property->table('Yes');
13176 $p = 'P';
13177 }
13178 else {
13179 $test_table = $input_table;
13180 $p = 'p';
13181 }
13182
13183 # Look for a single form amongst all the children.
13184 foreach my $table ($test_table->children) {
13185 next if $table->property != $perl;
13186 my $proposed_name = $table->short_name;
13187 next if ! defined $proposed_name;
13188
13189 # Don't mention internal-only properties as a possible
13190 # single form synonym
13191 next if substr($proposed_name, 0, 1) eq '_';
13192
13193 $proposed_name = "\\$p\{$proposed_name}";
13194 if (! defined $single_form
13195 || length($proposed_name) < length $single_form)
13196 {
13197 $single_form = $proposed_name;
13198
13199 # The goal here is to find a single form; not the
13200 # shortest possible one. We've already found a
13201 # short name. So, stop at the first single form
13202 # found, which is likely to be closer to the
13203 # original.
13204 last;
13205 }
13206 }
13207 }
13208
13209 # Ouput both short and single in the same parenthesized
13210 # expression, but with only one of 'Single', 'Short' if there
13211 # are both items.
13212 if ($short_name || $single_form || $table->conflicting) {
99870f4d
KW
13213 $parenthesized .= "Short: $short_name" if $short_name;
13214 if ($short_name && $single_form) {
13215 $parenthesized .= ', ';
13216 }
13217 elsif ($single_form) {
13218 $parenthesized .= 'Single: ';
13219 }
13220 $parenthesized .= $single_form if $single_form;
13221 }
13222 }
13223
56ca34ca
KW
13224 if ($caseless_equivalent != 0) {
13225 $parenthesized .= '; ' if $parenthesized ne "";
13226 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
13227 }
13228
99870f4d
KW
13229
13230 # Warn if this property isn't the same as one that a
13231 # semi-casual user might expect. The other components of this
13232 # parenthesized structure are calculated only for the first entry
13233 # for this table, but the conflicting is deemed important enough
13234 # to go on every entry.
13235 my $conflicting = join " NOR ", $table->conflicting;
13236 if ($conflicting) {
e5228720 13237 $parenthesized .= '; ' if $parenthesized ne "";
99870f4d
KW
13238 $parenthesized .= "NOT $conflicting";
13239 }
99870f4d 13240
e5228720 13241 push @info, "($parenthesized)" if $parenthesized;
d57ccc9a 13242
0f88d393
KW
13243 if ($name =~ /_$/ && $alias->loose_match) {
13244 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13245 }
13246
d57ccc9a
KW
13247 if ($table_property != $perl && $table->perl_extension) {
13248 push @info, '(Perl extension)';
13249 }
2cf724d4 13250 push @info, "($string_count)";
99870f4d
KW
13251
13252 # Now, we have both the entry and info so add them to the
13253 # list of all the properties.
13254 push @match_properties,
13255 format_pod_line($indent_info_column,
13256 $entry,
13257 join( " ", @info),
13258 $alias->status,
13259 $alias->loose_match);
13260
13261 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13262 } # End of looping through the aliases for this table.
13263
13264 if (! $entry_for_first_table) {
13265 $entry_for_first_table = $entry_for_first_alias;
13266 }
13267 } # End of looping through all the related tables
13268 return;
13269}
13270
2df7880f
KW
13271sub make_ucd_table_pod_entries {
13272 my $table = shift;
13273
ee94c7d1
KW
13274 # Generate the entries for the UCD section of the pod for $table. This
13275 # also calculates if names are ambiguous, so has to be called even if the
13276 # pod is not being output
13277
13278 my $short_name = $table->name;
13279 my $standard_short_name = standardize($short_name);
13280 my $full_name = $table->full_name;
13281 my $standard_full_name = standardize($full_name);
13282
13283 my $full_info = ""; # Text of info column for full-name entries
13284 my $other_info = ""; # Text of info column for short-name entries
13285 my $short_info = ""; # Text of info column for other entries
13286 my $meaning = ""; # Synonym of this table
2df7880f
KW
13287
13288 my $property = ($table->isa('Property'))
13289 ? $table
13290 : $table->parent->property;
13291
ee94c7d1
KW
13292 my $perl_extension = $table->perl_extension;
13293
13294 # Get the more official name for for perl extensions that aren't
13295 # stand-alone properties
13296 if ($perl_extension && $property != $table) {
13297 if ($property == $perl ||$property->type == $BINARY) {
13298 $meaning = $table->complete_name;
13299 }
13300 else {
13301 $meaning = $property->full_name . "=$full_name";
13302 }
13303 }
13304
13305 # There are three types of info column. One for the short name, one for
13306 # the full name, and one for everything else. They mostly are the same,
13307 # so initialize in the same loop.
13308 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
13309 if ($perl_extension && $property != $table) {
13310
13311 # Add the synonymous name for the non-full name entries; and to
13312 # the full-name entry if it adds extra information
13313 if ($info_ref == \$other_info
13314 || ($info_ref == \$short_info
13315 && $standard_short_name ne $standard_full_name)
13316 || standardize($meaning) ne $standard_full_name
13317 ) {
13318 $$info_ref .= "$meaning.";
13319 }
13320 }
13321 elsif ($info_ref != \$full_info) {
13322
13323 # Otherwise, the non-full name columns include the full name
13324 $$info_ref .= $full_name;
13325 }
13326
13327 # And the full-name entry includes the short name, if different
13328 if ($info_ref == \$full_info
13329 && $standard_short_name ne $standard_full_name)
13330 {
13331 $full_info =~ s/\.\Z//;
13332 $full_info .= " " if $full_info;
13333 $full_info .= "(Short: $short_name)";
13334 }
13335
13336 if ($table->perl_extension) {
13337 $$info_ref =~ s/\.\Z//;
13338 $$info_ref .= ". " if $$info_ref;
13339 $$info_ref .= "(Perl extension)";
13340 }
13341 }
13342
13343 # Add any extra annotations to the full name entry
13344 foreach my $more_info ($table->description,
13345 $table->note,
13346 $table->status_info)
13347 {
13348 next unless $more_info;
13349 $full_info =~ s/\.\Z//;
13350 $full_info .= ". " if $full_info;
13351 $full_info .= $more_info;
13352 }
13353
13354 # These keep track if have created full and short name pod entries for the
13355 # property
13356 my $done_full = 0;
13357 my $done_short = 0;
13358
2df7880f
KW
13359 # Every possible name is kept track of, even those that aren't going to be
13360 # output. This way we can be sure to find the ambiguities.
13361 foreach my $alias ($table->aliases) {
13362 my $name = $alias->name;
13363 my $standard = standardize($name);
ee94c7d1
KW
13364 my $info;
13365 my $output_this = $alias->ucd;
13366
13367 # If the full and short names are the same, we want to output the full
13368 # one's entry, so it has priority.
13369 if ($standard eq $standard_full_name) {
13370 next if $done_full;
13371 $done_full = 1;
13372 $info = $full_info;
13373 }
13374 elsif ($standard eq $standard_short_name) {
13375 next if $done_short;
13376 $done_short = 1;
13377 next if $standard_short_name eq $standard_full_name;
13378 $info = $short_info;
13379 }
13380 else {
13381 $info = $other_info;
13382 }
2df7880f 13383
ee94c7d1
KW
13384 # Here, we have set up the two columns for this entry. But if an
13385 # entry already exists for this name, we have to decide which one
13386 # we're going to later output.
2df7880f
KW
13387 if (exists $ucd_pod{$standard}) {
13388
13389 # If the two entries refer to the same property, it's not going to
ee94c7d1
KW
13390 # be ambiguous. (Likely it's because the names when standardized
13391 # are the same.) But that means if they are different properties,
13392 # there is ambiguity.
2df7880f
KW
13393 if ($ucd_pod{$standard}->{'property'} != $property) {
13394
ee94c7d1
KW
13395 # Here, we have an ambiguity. This code assumes that one is
13396 # scheduled to be output and one not and that one is a perl
13397 # extension (which is not to be output) and the other isn't.
13398 # If those assumptions are wrong, things have to be rethought.
13399 if ($ucd_pod{$standard}{'output_this'} == $output_this
13400 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
13401 || $output_this == $perl_extension)
13402 {
13403 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output statuss and perl-extension combinations. Proceeding anyway.");
13404 }
13405
13406 # We modifiy the info column of the one being output to
13407 # indicate the ambiguity. Set $which to point to that one's
13408 # info.
13409 my $which;
13410 if ($ucd_pod{$standard}{'output_this'}) {
13411 $which = \$ucd_pod{$standard}->{'info'};
13412 }
13413 else {
13414 $which = \$info;
13415 $meaning = $ucd_pod{$standard}{'meaning'};
13416 }
13417
13418 chomp $$which;
13419 $$which =~ s/\.\Z//;
13420 $$which .= "; NOT '$standard' meaning '$meaning'";
13421
2df7880f
KW
13422 $ambiguous_names{$standard} = 1;
13423 }
13424
ee94c7d1
KW
13425 # Use the non-perl-extension variant
13426 next unless $ucd_pod{$standard}{'perl_extension'};
2df7880f
KW
13427 }
13428
ee94c7d1
KW
13429 # Store enough information about this entry that we can later look for
13430 # ambiguities, and output it properly.
13431 $ucd_pod{$standard} = { 'name' => $name,
13432 'info' => $info,
13433 'meaning' => $meaning,
13434 'output_this' => $output_this,
13435 'perl_extension' => $perl_extension,
2df7880f 13436 'property' => $property,
ee94c7d1 13437 'status' => $alias->status,
2df7880f
KW
13438 };
13439 } # End of looping through all this table's aliases
13440
13441 return;
13442}
13443
99870f4d
KW
13444sub pod_alphanumeric_sort {
13445 # Sort pod entries alphanumerically.
13446
99f78760
KW
13447 # The first few character columns are filler, plus the '\p{'; and get rid
13448 # of all the trailing stuff, starting with the trailing '}', so as to sort
13449 # on just 'Name=Value'
13450 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 13451 $a =~ s/}.*//;
99f78760 13452 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
13453 $b =~ s/}.*//;
13454
99f78760
KW
13455 # Determine if the two operands are both internal only or both not.
13456 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13457 # should be the underscore that begins internal only
13458 my $a_is_internal = (substr($a, 0, 1) eq '_');
13459 my $b_is_internal = (substr($b, 0, 1) eq '_');
13460
13461 # Sort so the internals come last in the table instead of first (which the
13462 # leading underscore would otherwise indicate).
13463 if ($a_is_internal != $b_is_internal) {
13464 return 1 if $a_is_internal;
13465 return -1
13466 }
13467
99870f4d 13468 # Determine if the two operands are numeric property values or not.
99f78760 13469 # A numeric property will look like xyz: 3. But the number
99870f4d 13470 # can begin with an optional minus sign, and may have a
99f78760 13471 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
13472 # isn't numeric, use alphabetic sort.
13473 my ($a_initial, $a_number) =
99f78760 13474 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13475 return $a cmp $b unless defined $a_number;
13476 my ($b_initial, $b_number) =
99f78760 13477 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13478 return $a cmp $b unless defined $b_number;
13479
13480 # Here they are both numeric, but use alphabetic sort if the
13481 # initial parts don't match
13482 return $a cmp $b if $a_initial ne $b_initial;
13483
13484 # Convert rationals to floating for the comparison.
13485 $a_number = eval $a_number if $a_number =~ qr{/};
13486 $b_number = eval $b_number if $b_number =~ qr{/};
13487
13488 return $a_number <=> $b_number;
13489}
13490
13491sub make_pod () {
13492 # Create the .pod file. This generates the various subsections and then
13493 # combines them in one big HERE document.
13494
07c070a8
KW
13495 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
13496
99870f4d
KW
13497 return unless defined $pod_directory;
13498 print "Making pod file\n" if $verbosity >= $PROGRESS;
13499
13500 my $exception_message =
13501 '(Any exceptions are individually noted beginning with the word NOT.)';
13502 my @block_warning;
13503 if (-e 'Blocks.txt') {
13504
13505 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
13506 # if the global $has_In_conflicts indicates we have them.
13507 push @match_properties, format_pod_line($indent_info_column,
13508 '\p{In_*}',
13509 '\p{Block: *}'
13510 . (($has_In_conflicts)
13511 ? " $exception_message"
13512 : ""));
13513 @block_warning = << "END";
13514
77173124
KW
13515Matches in the Block property have shortcuts that begin with "In_". For
13516example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
13517backward compatibility, if there is no conflict with another shortcut, these
13518may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
13519are numerous such conflicting shortcuts. Use of these forms for Block is
13520discouraged, and are flagged as such, not only because of the potential
13521confusion as to what is meant, but also because a later release of Unicode may
13522preempt the shortcut, and your program would no longer be correct. Use the
13523"In_" form instead to avoid this, or even more clearly, use the compound form,
13524e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
13525about this.
99870f4d
KW
13526END
13527 }
07c070a8 13528 my $text = $Is_flags_text;
99870f4d
KW
13529 $text = "$exception_message $text" if $has_Is_conflicts;
13530
13531 # And the 'Is_ line';
13532 push @match_properties, format_pod_line($indent_info_column,
13533 '\p{Is_*}',
13534 "\\p{*} $text");
13535
13536 # Sort the properties array for output. It is sorted alphabetically
13537 # except numerically for numeric properties, and only output unique lines.
13538 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13539
13540 my $formatted_properties = simple_fold(\@match_properties,
13541 "",
13542 # indent succeeding lines by two extra
13543 # which looks better
13544 $indent_info_column + 2,
13545
13546 # shorten the line length by how much
13547 # the formatter indents, so the folded
13548 # line will fit in the space
13549 # presumably available
13550 $automatic_pod_indent);
13551 # Add column headings, indented to be a little more centered, but not
13552 # exactly
13553 $formatted_properties = format_pod_line($indent_info_column,
13554 ' NAME',
13555 ' INFO')
13556 . "\n"
13557 . $formatted_properties;
13558
13559 # Generate pod documentation lines for the tables that match nothing
0090c5d1 13560 my $zero_matches = "";
99870f4d
KW
13561 if (@zero_match_tables) {
13562 @zero_match_tables = uniques(@zero_match_tables);
13563 $zero_matches = join "\n\n",
13564 map { $_ = '=item \p{' . $_->complete_name . "}" }
13565 sort { $a->complete_name cmp $b->complete_name }
c0de960f 13566 @zero_match_tables;
99870f4d
KW
13567
13568 $zero_matches = <<END;
13569
77173124 13570=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
99870f4d
KW
13571
13572Unicode has some property-value pairs that currently don't match anything.
c12f2655
KW
13573This happens generally either because they are obsolete, or they exist for
13574symmetry with other forms, but no language has yet been encoded that uses
13575them. In this version of Unicode, the following match zero code points:
99870f4d
KW
13576
13577=over 4
13578
13579$zero_matches
13580
13581=back
13582
13583END
13584 }
13585
13586 # Generate list of properties that we don't accept, grouped by the reasons
13587 # why. This is so only put out the 'why' once, and then list all the
13588 # properties that have that reason under it.
13589
13590 my %why_list; # The keys are the reasons; the values are lists of
13591 # properties that have the key as their reason
13592
13593 # For each property, add it to the list that are suppressed for its reason
13594 # The sort will cause the alphabetically first properties to be added to
13595 # each list first, so each list will be sorted.
13596 foreach my $property (sort keys %why_suppressed) {
13597 push @{$why_list{$why_suppressed{$property}}}, $property;
13598 }
13599
13600 # For each reason (sorted by the first property that has that reason)...
13601 my @bad_re_properties;
13602 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13603 keys %why_list)
13604 {
54ce19c9 13605 # Add to the output, all the properties that have that reason.
99870f4d
KW
13606 my $has_item = 0; # Flag if actually output anything.
13607 foreach my $name (@{$why_list{$why}}) {
13608
13609 # Split compound names into $property and $table components
13610 my $property = $name;
13611 my $table;
13612 if ($property =~ / (.*) = (.*) /x) {
13613 $property = $1;
13614 $table = $2;
13615 }
13616
13617 # This release of Unicode may not have a property that is
13618 # suppressed, so don't reference a non-existent one.
13619 $property = property_ref($property);
13620 next if ! defined $property;
13621
13622 # And since this list is only for match tables, don't list the
13623 # ones that don't have match tables.
13624 next if ! $property->to_create_match_tables;
13625
13626 # Find any abbreviation, and turn it into a compound name if this
13627 # is a property=value pair.
13628 my $short_name = $property->name;
13629 $short_name .= '=' . $property->table($table)->name if $table;
13630
54ce19c9
KW
13631 # Start with an empty line.
13632 push @bad_re_properties, "\n\n" unless $has_item;
13633
99870f4d
KW
13634 # And add the property as an item for the reason.
13635 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13636 $has_item = 1;
13637 }
13638
13639 # And add the reason under the list of properties, if such a list
13640 # actually got generated. Note that the header got added
13641 # unconditionally before. But pod ignores extra blank lines, so no
13642 # harm.
13643 push @bad_re_properties, "\n$why\n" if $has_item;
13644
13645 } # End of looping through each reason.
13646
54ce19c9
KW
13647 if (! @bad_re_properties) {
13648 push @bad_re_properties,
13649 "*** This installation accepts ALL non-Unihan properties ***";
13650 }
13651 else {
13652 # Add =over only if non-empty to avoid an empty =over/=back section,
13653 # which is considered bad form.
13654 unshift @bad_re_properties, "\n=over 4\n";
13655 push @bad_re_properties, "\n=back\n";
13656 }
13657
8d099389
KW
13658 # Similiarly, generate a list of files that we don't use, grouped by the
13659 # reasons why. First, create a hash whose keys are the reasons, and whose
13660 # values are anonymous arrays of all the files that share that reason.
13661 my %grouped_by_reason;
13662 foreach my $file (keys %ignored_files) {
13663 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
13664 }
1fec9f60
KW
13665 foreach my $file (keys %skipped_files) {
13666 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
13667 }
8d099389
KW
13668
13669 # Then, sort each group.
13670 foreach my $group (keys %grouped_by_reason) {
13671 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
13672 @{$grouped_by_reason{$group}} ;
13673 }
13674
13675 # Finally, create the output text. For each reason (sorted by the
13676 # alphabetically first file that has that reason)...
13677 my @unused_files;
13678 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
13679 cmp lc $grouped_by_reason{$b}->[0]
13680 }
13681 keys %grouped_by_reason)
13682 {
13683 # Add all the files that have that reason to the output. Start
13684 # with an empty line.
13685 push @unused_files, "\n\n";
13686 push @unused_files, map { "\n=item F<$_> \n" }
13687 @{$grouped_by_reason{$reason}};
13688 # And add the reason under the list of files
13689 push @unused_files, "\n$reason\n";
13690 }
13691
ee94c7d1
KW
13692 # Similarly, create the output text for the UCD section of the pod
13693 my @ucd_pod;
13694 foreach my $key (keys %ucd_pod) {
13695 next unless $ucd_pod{$key}->{'output_this'};
13696 push @ucd_pod, format_pod_line($indent_info_column,
13697 $ucd_pod{$key}->{'name'},
13698 $ucd_pod{$key}->{'info'},
13699 $ucd_pod{$key}->{'status'},
13700 );
13701 }
13702
13703 # Sort alphabetically, and fold for output
13704 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
13705 my $ucd_pod = simple_fold(\@ucd_pod,
13706 ' ',
13707 $indent_info_column,
13708 $automatic_pod_indent);
13709 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
13710 . "\n"
13711 . $ucd_pod;
12916dad
MS
13712 local $" = "";
13713
99870f4d
KW
13714 # Everything is ready to assemble.
13715 my @OUT = << "END";
13716=begin comment
13717
13718$HEADER
13719
13720To change this file, edit $0 instead.
13721
13722=end comment
13723
13724=head1 NAME
13725
8d099389 13726$pod_file - Index of Unicode Version $string_version character properties in Perl
99870f4d
KW
13727
13728=head1 DESCRIPTION
13729
8d099389
KW
13730This document provides information about the portion of the Unicode database
13731that deals with character properties, that is the portion that is defined on
13732single code points. (L</Other information in the Unicode data base>
13733below briefly mentions other data that Unicode provides.)
99870f4d 13734
8d099389
KW
13735Perl can provide access to all non-provisional Unicode character properties,
13736though not all are enabled by default. The omitted ones are the Unihan
13737properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
13738deprecated or Unicode-internal properties. (An installation may choose to
ea5acc0f 13739recompile Perl's tables to change this. See L<Unicode character
8d099389
KW
13740properties that are NOT accepted by Perl>.)
13741
ee94c7d1
KW
13742For most purposes, access to Unicode properties from the Perl core is through
13743regular expression matches, as described in the next section.
13744For some special purposes, and to access the properties that are not suitable
13745for regular expression matching, all the Unicode character properties that
13746Perl handles are accessible via the standard L<Unicode::UCD> module, as
13747described in the section L</Properties accessible through Unicode::UCD>.
13748
8d099389
KW
13749Perl also provides some additional extensions and short-cut synonyms
13750for Unicode properties.
99870f4d
KW
13751
13752This document merely lists all available properties and does not attempt to
13753explain what each property really means. There is a brief description of each
043f3b3f
KW
13754Perl extension; see L<perlunicode/Other Properties> for more information on
13755these. There is some detail about Blocks, Scripts, General_Category,
99870f4d 13756and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
043f3b3f
KW
13757official Unicode properties, refer to the Unicode standard. A good starting
13758place is L<$unicode_reference_url>.
99870f4d
KW
13759
13760Note that you can define your own properties; see
13761L<perlunicode/"User-Defined Character Properties">.
13762
77173124 13763=head1 Properties accessible through C<\\p{}> and C<\\P{}>
99870f4d 13764
77173124
KW
13765The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13766most of the Unicode character properties. The table below shows all these
13767constructs, both single and compound forms.
99870f4d
KW
13768
13769B<Compound forms> consist of two components, separated by an equals sign or a
13770colon. The first component is the property name, and the second component is
13771the particular value of the property to match against, for example,
77173124 13772C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
99870f4d
KW
13773whose Script property is Greek.
13774
77173124 13775B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
99870f4d 13776their equivalent compound forms. The table shows these equivalences. (In our
77173124 13777example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
99870f4d 13778There are also a few Perl-defined single forms that are not shortcuts for a
77173124 13779compound form. One such is C<\\p{Word}>. These are also listed in the table.
99870f4d
KW
13780
13781In parsing these constructs, Perl always ignores Upper/lower case differences
77173124
KW
13782everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
13783C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
13784the left brace completely changes the meaning of the construct, from "match"
13785(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
13786for improved legibility.
99870f4d
KW
13787
13788Also, white space, hyphens, and underscores are also normally ignored
13789everywhere between the {braces}, and hence can be freely added or removed
13790even if the C</x> modifier hasn't been specified on the regular expression.
13791But $a_bold_stricter at the beginning of an entry in the table below
13792means that tighter (stricter) rules are used for that entry:
13793
13794=over 4
13795
77173124 13796=item Single form (C<\\p{name}>) tighter rules:
99870f4d
KW
13797
13798White space, hyphens, and underscores ARE significant
13799except for:
13800
13801=over 4
13802
13803=item * white space adjacent to a non-word character
13804
13805=item * underscores separating digits in numbers
13806
13807=back
13808
13809That means, for example, that you can freely add or remove white space
13810adjacent to (but within) the braces without affecting the meaning.
13811
77173124 13812=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
99870f4d
KW
13813
13814The tighter rules given above for the single form apply to everything to the
13815right of the colon or equals; the looser rules still apply to everything to
13816the left.
13817
13818That means, for example, that you can freely add or remove white space
13819adjacent to (but within) the braces and the colon or equal sign.
13820
13821=back
13822
78bb419c
KW
13823Some properties are considered obsolete by Unicode, but still available.
13824There are several varieties of obsolescence:
99870f4d
KW
13825
13826=over 4
13827
99870f4d
KW
13828=item Stabilized
13829
f8c38b14 13830A property may be stabilized. Such a determination does not indicate
5f7264c7
KW
13831that the property should or should not be used; instead it is a declaration
13832that the property will not be maintained nor extended for newly encoded
13833characters. Such properties are marked with $a_bold_stabilized in the
13834table.
99870f4d
KW
13835
13836=item Deprecated
13837
f8c38b14 13838A property may be deprecated, perhaps because its original intent
78bb419c
KW
13839has been replaced by another property, or because its specification was
13840somehow defective. This means that its use is strongly
99870f4d
KW
13841discouraged, so much so that a warning will be issued if used, unless the
13842regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13843statement. $A_bold_deprecated flags each such entry in the table, and
13844the entry there for the longest, most descriptive version of the property will
13845give the reason it is deprecated, and perhaps advice. Perl may issue such a
13846warning, even for properties that aren't officially deprecated by Unicode,
13847when there used to be characters or code points that were matched by them, but
13848no longer. This is to warn you that your program may not work like it did on
13849earlier Unicode releases.
13850
13851A deprecated property may be made unavailable in a future Perl version, so it
13852is best to move away from them.
13853
c12f2655
KW
13854A deprecated property may also be stabilized, but this fact is not shown.
13855
13856=item Obsolete
13857
13858Properties marked with $a_bold_obsolete in the table are considered (plain)
13859obsolete. Generally this designation is given to properties that Unicode once
13860used for internal purposes (but not any longer).
13861
99870f4d
KW
13862=back
13863
13864Some Perl extensions are present for backwards compatibility and are
c12f2655
KW
13865discouraged from being used, but are not obsolete. $A_bold_discouraged
13866flags each such entry in the table. Future Unicode versions may force
13867some of these extensions to be removed without warning, replaced by another
13868property with the same name that means something different. Use the
13869equivalent shown instead.
99870f4d
KW
13870
13871@block_warning
13872
77173124 13873The table below has two columns. The left column contains the C<\\p{}>
98dc9551 13874constructs to look up, possibly preceded by the flags mentioned above; and
99870f4d
KW
13875the right column contains information about them, like a description, or
13876synonyms. It shows both the single and compound forms for each property that
13877has them. If the left column is a short name for a property, the right column
13878will give its longer, more descriptive name; and if the left column is the
13879longest name, the right column will show any equivalent shortest name, in both
13880single and compound forms if applicable.
13881
13882The right column will also caution you if a property means something different
13883than what might normally be expected.
13884
d57ccc9a
KW
13885All single forms are Perl extensions; a few compound forms are as well, and
13886are noted as such.
13887
99870f4d
KW
13888Numbers in (parentheses) indicate the total number of code points matched by
13889the property. For emphasis, those properties that match no code points at all
13890are listed as well in a separate section following the table.
13891
56ca34ca
KW
13892Most properties match the same code points regardless of whether C<"/i">
13893case-insensitive matching is specified or not. But a few properties are
13894affected. These are shown with the notation
13895
13896 (/i= other_property)
13897
13898in the second column. Under case-insensitive matching they match the
13899same code pode points as the property "other_property".
13900
99870f4d 13901There is no description given for most non-Perl defined properties (See
77173124 13902L<$unicode_reference_url> for that).
d73e5302 13903
99870f4d
KW
13904For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13905combinations. For example, entries like:
d73e5302 13906
99870f4d 13907 \\p{Gc: *} \\p{General_Category: *}
5beb625e 13908
99870f4d
KW
13909mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13910for the latter is also valid for the former. Similarly,
5beb625e 13911
99870f4d 13912 \\p{Is_*} \\p{*}
5beb625e 13913
77173124
KW
13914means that if and only if, for example, C<\\p{Foo}> exists, then
13915C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13916And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13917C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
13918underscore.
5beb625e 13919
99870f4d
KW
13920Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13921And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
13922'N*' to indicate this, and doesn't have separate entries for the other
13923possibilities. Note that not all properties which have values 'Yes' and 'No'
13924are binary, and they have all their values spelled out without using this wild
13925card, and a C<NOT> clause in their description that highlights their not being
13926binary. These also require the compound form to match them, whereas true
13927binary properties have both single and compound forms available.
5beb625e 13928
99870f4d
KW
13929Note that all non-essential underscores are removed in the display of the
13930short names below.
5beb625e 13931
c12f2655 13932B<Legend summary:>
5beb625e 13933
99870f4d 13934=over 4
cf25bb62 13935
21405004 13936=item Z<>B<*> is a wild-card
cf25bb62 13937
99870f4d
KW
13938=item B<(\\d+)> in the info column gives the number of code points matched by
13939this property.
cf25bb62 13940
99870f4d 13941=item B<$DEPRECATED> means this is deprecated.
cf25bb62 13942
99870f4d 13943=item B<$OBSOLETE> means this is obsolete.
cf25bb62 13944
99870f4d 13945=item B<$STABILIZED> means this is stabilized.
cf25bb62 13946
99870f4d 13947=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 13948
c12f2655
KW
13949=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13950stable.
5beb625e 13951
99870f4d 13952=back
da7fcca4 13953
99870f4d 13954$formatted_properties
cf25bb62 13955
99870f4d 13956$zero_matches
cf25bb62 13957
ee94c7d1
KW
13958=head1 Properties accessible through Unicode::UCD
13959
13960All the Unicode character properties mentioned above (except for those marked
13961as for internal use by Perl) are also accessible by
13962L<Unicode::UCD/prop_invlist()>.
13963
13964Due to their nature, not all Unicode character properties are suitable for
13965regular expression matches, nor C<prop_invlist()>. The remaining
13966non-provisional, non-internal ones are accessible via
13967L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
13968hasn't included; see L<below for which those are|/Unicode character properties
13969that are NOT accepted by Perl>).
13970
13971For compatibility with other parts of Perl, all the single forms given in the
13972table in the L<section above|/Properties accessible through \\p{} and \\P{}>
13973are recognized. BUT, there are some ambiguities between some Perl extensions
13974and the Unicode properties, all of which are silently resolved in favor of the
13975official Unicode property. To avoid surprises, you should only use
13976C<prop_invmap()> for forms listed in the table below, which omits the
13977non-recommended ones. The affected forms are the Perl single form equivalents
13978of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
13979C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
13980whose short name is C<sc>. The table indicates the current ambiguities in the
13981INFO column, beginning with the word C<"NOT">.
13982
13983The standard Unicode properties listed below are documented in
13984L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
13985L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
13986L<perlunicode/Other Properties>;
13987
13988The first column in the table is a name for the property; the second column is
13989an alternative name, if any, plus possibly some annotations. The alternative
13990name is the property's full name, unless that would simply repeat the first
13991column, in which case the second column indicates the property's short name
13992(if different). The annotations are given only in the entry for the full
13993name. If a property is obsolete, etc, the entry will be flagged with the same
13994characters used in the table in the L<section above|/Properties accessible
13995through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
13996
13997$ucd_pod
13998
13999=head1 Properties accessible through other means
14000
14001Certain properties are accessible also via core function calls. These are:
78bb419c 14002
99870f4d
KW
14003 Lowercase_Mapping lc() and lcfirst()
14004 Titlecase_Mapping ucfirst()
14005 Uppercase_Mapping uc()
12ac2576 14006
043f3b3f
KW
14007Also, Case_Folding is accessible through the C</i> modifier in regular
14008expressions.
cf25bb62 14009
043f3b3f 14010And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
fbb93542
KW
14011interpolation in double-quoted strings and regular expressions; and functions
14012C<charnames::viacode()>, C<charnames::vianame()>, and
14013C<charnames::string_vianame()> (which require a C<use charnames ();> to be
14014specified.
cf25bb62 14015
ee94c7d1
KW
14016Finally, most properties related to decomposition are accessible via
14017L<Unicode::Normalize>.
14018
ea5acc0f 14019=head1 Unicode character properties that are NOT accepted by Perl
d2d499f5 14020
99870f4d
KW
14021Perl will generate an error for a few character properties in Unicode when
14022used in a regular expression. The non-Unihan ones are listed below, with the
14023reasons they are not accepted, perhaps with work-arounds. The short names for
14024the properties are listed enclosed in (parentheses).
c12f2655
KW
14025As described after the list, an installation can change the defaults and choose
14026to accept any of these. The list is machine generated based on the
14027choices made for the installation that generated this document.
ae6979a8 14028
99870f4d 14029@bad_re_properties
a3a8c5f0 14030
b7986f4f
KW
14031An installation can choose to allow any of these to be matched by downloading
14032the Unicode database from L<http://www.unicode.org/Public/> to
f3514a2f
KW
14033C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
14034controlling lists contained in the program
14035C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
14036(C<\%Config> is available from the Config module).
d73e5302 14037
8d099389
KW
14038=head1 Other information in the Unicode data base
14039
14040The Unicode data base is delivered in two different formats. The XML version
14041is valid for more modern Unicode releases. The other version is a collection
14042of files. The two are intended to give equivalent information. Perl uses the
14043older form; this allows you to recompile Perl to use early Unicode releases.
14044
14045The only non-character property that Perl currently supports is Named
14046Sequences, in which a sequence of code points
14047is given a name and generally treated as a single entity. (Perl supports
14048these via the C<\\N{...}> double-quotish construct,
14049L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
14050
14051Below is a list of the files in the Unicode data base that Perl doesn't
14052currently use, along with very brief descriptions of their purposes.
14053Some of the names of the files have been shortened from those that Unicode
14054uses, in order to allow them to be distinguishable from similarly named files
14055on file systems for which only the first 8 characters of a name are
14056significant.
14057
14058=over 4
14059
14060@unused_files
14061
14062=back
14063
99870f4d 14064=head1 SEE ALSO
d73e5302 14065
99870f4d 14066L<$unicode_reference_url>
12ac2576 14067
99870f4d 14068L<perlrecharclass>
12ac2576 14069
99870f4d 14070L<perlunicode>
d73e5302 14071
99870f4d 14072END
d73e5302 14073
9218f1cf
KW
14074 # And write it. The 0 means no utf8.
14075 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
99870f4d
KW
14076 return;
14077}
d73e5302 14078
99870f4d
KW
14079sub make_Heavy () {
14080 # Create and write Heavy.pl, which passes info about the tables to
14081 # utf8_heavy.pl
12ac2576 14082
143b2c48
KW
14083 # Stringify structures for output
14084 my $loose_property_name_of
14085 = simple_dumper(\%loose_property_name_of, ' ' x 4);
14086 chomp $loose_property_name_of;
14087
14088 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
14089 chomp $stricter_to_file_of;
14090
14091 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
14092 chomp $loose_to_file_of;
14093
14094 my $nv_floating_to_rational
14095 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
14096 chomp $nv_floating_to_rational;
14097
14098 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
14099 chomp $why_deprecated;
14100
14101 # We set the key to the file when we associated files with tables, but we
14102 # couldn't do the same for the value then, as we might not have the file
14103 # for the alternate table figured out at that time.
14104 foreach my $cased (keys %caseless_equivalent_to) {
14105 my @path = $caseless_equivalent_to{$cased}->file_path;
14106 my $path = join '/', @path[1, -1];
14107 $caseless_equivalent_to{$cased} = $path;
14108 }
14109 my $caseless_equivalent_to
14110 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
14111 chomp $caseless_equivalent_to;
14112
315bfd4e
KW
14113 my $loose_property_to_file_of
14114 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
14115 chomp $loose_property_to_file_of;
14116
89cf10cc
KW
14117 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
14118 chomp $file_to_swash_name;
14119
99870f4d
KW
14120 my @heavy = <<END;
14121$HEADER
126c3d4e 14122$INTERNAL_ONLY_HEADER
d73e5302 14123
01da8b85 14124# This file is for the use of utf8_heavy.pl and Unicode::UCD
12ac2576 14125
c12f2655
KW
14126# Maps Unicode (not Perl single-form extensions) property names in loose
14127# standard form to their corresponding standard names
99870f4d 14128\%utf8::loose_property_name_of = (
143b2c48 14129$loose_property_name_of
99870f4d 14130);
12ac2576 14131
99870f4d
KW
14132# Maps property, table to file for those using stricter matching
14133\%utf8::stricter_to_file_of = (
143b2c48 14134$stricter_to_file_of
99870f4d 14135);
12ac2576 14136
99870f4d
KW
14137# Maps property, table to file for those using loose matching
14138\%utf8::loose_to_file_of = (
143b2c48 14139$loose_to_file_of
99870f4d 14140);
12ac2576 14141
99870f4d
KW
14142# Maps floating point to fractional form
14143\%utf8::nv_floating_to_rational = (
143b2c48 14144$nv_floating_to_rational
99870f4d 14145);
12ac2576 14146
99870f4d
KW
14147# If a floating point number doesn't have enough digits in it to get this
14148# close to a fraction, it isn't considered to be that fraction even if all the
14149# digits it does have match.
14150\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 14151
99870f4d
KW
14152# Deprecated tables to generate a warning for. The key is the file containing
14153# the table, so as to avoid duplication, as many property names can map to the
14154# file, but we only need one entry for all of them.
14155\%utf8::why_deprecated = (
143b2c48 14156$why_deprecated
99870f4d 14157);
12ac2576 14158
143b2c48 14159# A few properties have different behavior under /i matching. This maps
d867ccfb
KW
14160# those to substitute files to use under /i.
14161\%utf8::caseless_equivalent = (
143b2c48 14162$caseless_equivalent_to
d867ccfb
KW
14163);
14164
315bfd4e
KW
14165# Property names to mapping files
14166\%utf8::loose_property_to_file_of = (
14167$loose_property_to_file_of
14168);
14169
89cf10cc
KW
14170# Files to the swash names within them.
14171\%utf8::file_to_swash_name = (
14172$file_to_swash_name
14173);
14174
99870f4d
KW
141751;
14176END
12ac2576 14177
9218f1cf 14178 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
99870f4d 14179 return;
12ac2576
JP
14180}
14181
52dc8b5d 14182sub make_Name_pm () {
6f424f62 14183 # Create and write Name.pm, which contains subroutines and data to use in
52dc8b5d
KW
14184 # conjunction with Name.pl
14185
bb1dd3da
KW
14186 # Maybe there's nothing to do.
14187 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
14188
52dc8b5d
KW
14189 my @name = <<END;
14190$HEADER
126c3d4e 14191$INTERNAL_ONLY_HEADER
52dc8b5d 14192END
0f6f7bc2 14193
fb848dce
KW
14194 # Convert these structures to output format.
14195 my $code_points_ending_in_code_point =
14196 main::simple_dumper(\@code_points_ending_in_code_point,
14197 ' ' x 8);
14198 my $names = main::simple_dumper(\%names_ending_in_code_point,
14199 ' ' x 8);
14200 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
0f6f7bc2 14201 ' ' x 8);
0f6f7bc2 14202
fb848dce
KW
14203 # Do the same with the Hangul names,
14204 my $jamo;
14205 my $jamo_l;
14206 my $jamo_v;
14207 my $jamo_t;
14208 my $jamo_re;
14209 if ($has_hangul_syllables) {
0f6f7bc2 14210
fb848dce
KW
14211 # Construct a regular expression of all the possible
14212 # combinations of the Hangul syllables.
14213 my @L_re; # Leading consonants
14214 for my $i ($LBase .. $LBase + $LCount - 1) {
14215 push @L_re, $Jamo{$i}
14216 }
14217 my @V_re; # Middle vowels
14218 for my $i ($VBase .. $VBase + $VCount - 1) {
14219 push @V_re, $Jamo{$i}
14220 }
14221 my @T_re; # Trailing consonants
14222 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
14223 push @T_re, $Jamo{$i}
14224 }
0f6f7bc2 14225
fb848dce
KW
14226 # The whole re is made up of the L V T combination.
14227 $jamo_re = '('
14228 . join ('|', sort @L_re)
14229 . ')('
14230 . join ('|', sort @V_re)
14231 . ')('
14232 . join ('|', sort @T_re)
14233 . ')?';
0f6f7bc2 14234
fb848dce
KW
14235 # These hashes needed by the algorithm were generated
14236 # during reading of the Jamo.txt file
14237 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
14238 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
14239 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
14240 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
14241 }
0f6f7bc2 14242
6f424f62 14243 push @name, <<END;
0f6f7bc2 14244
e7a078a0
KW
14245package charnames;
14246
6f424f62
KW
14247# This module contains machine-generated tables and code for the
14248# algorithmically-determinable Unicode character names. The following
14249# routines can be used to translate between name and code point and vice versa
0f6f7bc2
KW
14250
14251{ # Closure
14252
92199589
KW
14253 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
14254 # two must be 10; if there are 5, the first must not be a 0. Written this
14255 # way to decrease backtracking. The first regex allows the code point to
14256 # be at the end of a word, but to work properly, the word shouldn't end
14257 # with a valid hex character. The second one won't match a code point at
14258 # the end of a word, and doesn't have the run-on issue
0f6f7bc2
KW
14259 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
14260 my \$code_point_re = qr/$code_point_re/;
14261
14262 # In the following hash, the keys are the bases of names which includes
14263 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
14264 # of each key is another hash which is used to get the low and high ends
14265 # for each range of code points that apply to the name.
14266 my %names_ending_in_code_point = (
14267$names
14268 );
14269
14270 # The following hash is a copy of the previous one, except is for loose
14271 # matching, so each name has blanks and dashes squeezed out
14272 my %loose_names_ending_in_code_point = (
14273$loose_names
14274 );
14275
14276 # And the following array gives the inverse mapping from code points to
14277 # names. Lowest code points are first
14278 my \@code_points_ending_in_code_point = (
14279$code_points_ending_in_code_point
14280 );
14281END
fb848dce
KW
14282 # Earlier releases didn't have Jamos. No sense outputting
14283 # them unless will be used.
14284 if ($has_hangul_syllables) {
6f424f62 14285 push @name, <<END;
0f6f7bc2
KW
14286
14287 # Convert from code point to Jamo short name for use in composing Hangul
14288 # syllable names
14289 my %Jamo = (
14290$jamo
14291 );
14292
14293 # Leading consonant (can be null)
14294 my %Jamo_L = (
14295$jamo_l
14296 );
14297
14298 # Vowel
14299 my %Jamo_V = (
14300$jamo_v
14301 );
14302
14303 # Optional trailing consonant
14304 my %Jamo_T = (
14305$jamo_t
14306 );
14307
14308 # Computed re that splits up a Hangul name into LVT or LV syllables
14309 my \$syllable_re = qr/$jamo_re/;
14310
14311 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
14312 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
14313
14314 # These constants names and values were taken from the Unicode standard,
14315 # version 5.1, section 3.12. They are used in conjunction with Hangul
14316 # syllables
14317 my \$SBase = $SBase_string;
14318 my \$LBase = $LBase_string;
14319 my \$VBase = $VBase_string;
14320 my \$TBase = $TBase_string;
14321 my \$SCount = $SCount;
14322 my \$LCount = $LCount;
14323 my \$VCount = $VCount;
14324 my \$TCount = $TCount;
14325 my \$NCount = \$VCount * \$TCount;
14326END
fb848dce 14327 } # End of has Jamos
0f6f7bc2 14328
6f424f62 14329 push @name, << 'END';
0f6f7bc2
KW
14330
14331 sub name_to_code_point_special {
14332 my ($name, $loose) = @_;
14333
14334 # Returns undef if not one of the specially handled names; otherwise
14335 # returns the code point equivalent to the input name
14336 # $loose is non-zero if to use loose matching, 'name' in that case
14337 # must be input as upper case with all blanks and dashes squeezed out.
14338END
fb848dce 14339 if ($has_hangul_syllables) {
6f424f62 14340 push @name, << 'END';
0f6f7bc2
KW
14341
14342 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
14343 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
14344 {
14345 return if $name !~ qr/^$syllable_re$/;
14346 my $L = $Jamo_L{$1};
14347 my $V = $Jamo_V{$2};
14348 my $T = (defined $3) ? $Jamo_T{$3} : 0;
14349 return ($L * $VCount + $V) * $TCount + $T + $SBase;
14350 }
14351END
fb848dce 14352 }
6f424f62 14353 push @name, << 'END';
0f6f7bc2
KW
14354
14355 # Name must end in 'code_point' for this to handle.
14356 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
14357 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
14358
14359 my $base = $1;
14360 my $code_point = CORE::hex $2;
14361 my $names_ref;
14362
14363 if ($loose) {
14364 $names_ref = \%loose_names_ending_in_code_point;
14365 }
14366 else {
14367 return if $base !~ s/-$//;
14368 $names_ref = \%names_ending_in_code_point;
14369 }
14370
14371 # Name must be one of the ones which has the code point in it.
14372 return if ! $names_ref->{$base};
14373
14374 # Look through the list of ranges that apply to this name to see if
14375 # the code point is in one of them.
14376 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
14377 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
14378 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
14379
14380 # Here, the code point is in the range.
14381 return $code_point;
14382 }
14383
14384 # Here, looked like the name had a code point number in it, but
14385 # did not match one of the valid ones.
14386 return;
14387 }
14388
14389 sub code_point_to_name_special {
14390 my $code_point = shift;
14391
14392 # Returns the name of a code point if algorithmically determinable;
14393 # undef if not
14394END
fb848dce 14395 if ($has_hangul_syllables) {
6f424f62 14396 push @name, << 'END';
0f6f7bc2
KW
14397
14398 # If in the Hangul range, calculate the name based on Unicode's
14399 # algorithm
14400 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
14401 use integer;
14402 my $SIndex = $code_point - $SBase;
14403 my $L = $LBase + $SIndex / $NCount;
14404 my $V = $VBase + ($SIndex % $NCount) / $TCount;
14405 my $T = $TBase + $SIndex % $TCount;
14406 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
14407 $name .= $Jamo{$T} if $T != $TBase;
14408 return $name;
14409 }
14410END
fb848dce 14411 }
6f424f62 14412 push @name, << 'END';
0f6f7bc2
KW
14413
14414 # Look through list of these code points for one in range.
14415 foreach my $hash (@code_points_ending_in_code_point) {
14416 return if $code_point < $hash->{'low'};
14417 if ($code_point <= $hash->{'high'}) {
14418 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
14419 }
14420 }
14421 return; # None found
14422 }
14423} # End closure
14424
6f424f62 144251;
0f6f7bc2 14426END
52dc8b5d
KW
14427
14428 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
14429 return;
14430}
14431
9f077a68
KW
14432sub make_UCD () {
14433 # Create and write UCD.pl, which passes info about the tables to
14434 # Unicode::UCD
14435
f7be2375
KW
14436 # Create a mapping from each alias of Perl single-form extensions to all
14437 # its equivalent aliases, for quick look-up.
14438 my %perlprop_to_aliases;
14439 foreach my $table ($perl->tables) {
14440
14441 # First create the list of the aliases of each extension
14442 my @aliases_list; # List of legal aliases for this extension
14443
14444 my $table_name = $table->name;
14445 my $standard_table_name = standardize($table_name);
14446 my $table_full_name = $table->full_name;
14447 my $standard_table_full_name = standardize($table_full_name);
14448
14449 # Make sure that the list has both the short and full names
14450 push @aliases_list, $table_name, $table_full_name;
14451
14452 my $found_ucd = 0; # ? Did we actually get an alias that should be
14453 # output for this table
14454
14455 # Go through all the aliases (including the two just added), and add
14456 # any new unique ones to the list
14457 foreach my $alias ($table->aliases) {
14458
14459 # Skip non-legal names
0eac1e20 14460 next unless $alias->ok_as_filename;
f7be2375
KW
14461 next unless $alias->ucd;
14462
14463 $found_ucd = 1; # have at least one legal name
14464
14465 my $name = $alias->name;
14466 my $standard = standardize($name);
14467
14468 # Don't repeat a name that is equivalent to one already on the
14469 # list
14470 next if $standard eq $standard_table_name;
14471 next if $standard eq $standard_table_full_name;
14472
14473 push @aliases_list, $name;
14474 }
14475
14476 # If there were no legal names, don't output anything.
14477 next unless $found_ucd;
14478
14479 # To conserve memory in the program reading these in, omit full names
14480 # that are identical to the short name, when those are the only two
14481 # aliases for the property.
14482 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
14483 pop @aliases_list;
14484 }
14485
14486 # Here, @aliases_list is the list of all the aliases that this
14487 # extension legally has. Now can create a map to it from each legal
14488 # standardized alias
14489 foreach my $alias ($table->aliases) {
14490 next unless $alias->ucd;
0eac1e20 14491 next unless $alias->ok_as_filename;
f7be2375
KW
14492 push @{$perlprop_to_aliases{standardize($alias->name)}},
14493 @aliases_list;
14494 }
14495 }
14496
55a40252
KW
14497 # Make a list of all combinations of properties/values that are suppressed.
14498 my @suppressed;
14499 foreach my $property_name (keys %why_suppressed) {
14500
14501 # Just the value
14502 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
14503
14504 # The hash may contain properties not in this release of Unicode
14505 next unless defined (my $property = property_ref($property_name));
14506
14507 # Find all combinations
14508 foreach my $prop_alias ($property->aliases) {
14509 my $prop_alias_name = standardize($prop_alias->name);
14510
14511 # If no =value, there's just one combination possibe for this
14512 if (! $value_name) {
14513
14514 # The property may be suppressed, but there may be a proxy for
14515 # it, so it shouldn't be listed as suppressed
14516 next if $prop_alias->ucd;
14517 push @suppressed, $prop_alias_name;
14518 }
14519 else { # Otherwise
14520 foreach my $value_alias ($property->table($value_name)->aliases)
14521 {
14522 next if $value_alias->ucd;
14523
14524 push @suppressed, "$prop_alias_name="
14525 . standardize($value_alias->name);
14526 }
14527 }
14528 }
14529 }
14530
6a40599f
KW
14531 # Convert the structure below (designed for Name.pm) to a form that UCD
14532 # wants, so it doesn't have to modify it at all; i.e. so that it includes
14533 # an element for the Hangul syllables in the appropriate place, and
14534 # otherwise changes the name to include the "-<code point>" suffix.
14535 my @algorithm_names;
14536 my $done_hangul = 0;
14537
14538 # Copy it linearly.
14539 for my $i (0 .. @code_points_ending_in_code_point - 1) {
14540
14541 # Insert the hanguls in the correct place.
14542 if (! $done_hangul
14543 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
14544 {
14545 $done_hangul = 1;
14546 push @algorithm_names, { low => $SBase,
14547 high => $SBase + $SCount - 1,
14548 name => '<hangul syllable>',
14549 };
14550 }
14551
14552 # Copy the current entry, modified.
14553 push @algorithm_names, {
14554 low => $code_points_ending_in_code_point[$i]->{'low'},
14555 high => $code_points_ending_in_code_point[$i]->{'high'},
14556 name =>
14557 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
14558 };
14559 }
14560
9e4a1e86
KW
14561 # Serialize these structures for output.
14562 my $loose_to_standard_value
14563 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
14564 chomp $loose_to_standard_value;
14565
86a52d1e
KW
14566 my $string_property_loose_to_name
14567 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
14568 chomp $string_property_loose_to_name;
14569
f7be2375
KW
14570 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
14571 chomp $perlprop_to_aliases;
14572
5d1df013
KW
14573 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
14574 chomp $prop_aliases;
14575
1e863613
KW
14576 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
14577 chomp $prop_value_aliases;
14578
55a40252
KW
14579 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
14580 chomp $suppressed;
14581
6a40599f
KW
14582 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
14583 chomp $algorithm_names;
14584
2df7880f
KW
14585 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
14586 chomp $ambiguous_names;
14587
c15fda25
KW
14588 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
14589 chomp $loose_defaults;
14590
9f077a68
KW
14591 my @ucd = <<END;
14592$HEADER
14593$INTERNAL_ONLY_HEADER
14594
14595# This file is for the use of Unicode::UCD
14596
14597# Highest legal Unicode code point
14598\$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
14599
14600# Hangul syllables
14601\$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
14602\$Unicode::UCD::HANGUL_COUNT = $SCount;
14603
9e4a1e86
KW
14604# Keys are all the possible "prop=value" combinations, in loose form; values
14605# are the standard loose name for the 'value' part of the key
14606\%Unicode::UCD::loose_to_standard_value = (
14607$loose_to_standard_value
14608);
14609
86a52d1e
KW
14610# String property loose names to standard loose name
14611\%Unicode::UCD::string_property_loose_to_name = (
14612$string_property_loose_to_name
14613);
14614
f7be2375
KW
14615# Keys are Perl extensions in loose form; values are each one's list of
14616# aliases
14617\%Unicode::UCD::loose_perlprop_to_name = (
14618$perlprop_to_aliases
14619);
14620
5d1df013
KW
14621# Keys are standard property name; values are each one's aliases
14622\%Unicode::UCD::prop_aliases = (
14623$prop_aliases
14624);
14625
1e863613
KW
14626# Keys of top level are standard property name; values are keys to another
14627# hash, Each one is one of the property's values, in standard form. The
14628# values are that prop-val's aliases. If only one specified, the short and
14629# long alias are identical.
14630\%Unicode::UCD::prop_value_aliases = (
14631$prop_value_aliases
14632);
14633
6a40599f
KW
14634# Ordered (by code point ordinal) list of the ranges of code points whose
14635# names are algorithmically determined. Each range entry is an anonymous hash
14636# of the start and end points and a template for the names within it.
14637\@Unicode::UCD::algorithmic_named_code_points = (
14638$algorithm_names
14639);
14640
2df7880f
KW
14641# The properties that as-is have two meanings, and which must be disambiguated
14642\%Unicode::UCD::ambiguous_names = (
14643$ambiguous_names
14644);
14645
c15fda25
KW
14646# Keys are the prop-val combinations which are the default values for the
14647# given property, expressed in standard loose form
14648\%Unicode::UCD::loose_defaults = (
14649$loose_defaults
14650);
14651
55a40252
KW
14652# All combinations of names that are suppressed.
14653# This is actually for UCD.t, so it knows which properties shouldn't have
14654# entries. If it got any bigger, would probably want to put it in its own
14655# file to use memory only when it was needed, in testing.
14656\@Unicode::UCD::suppressed_properties = (
14657$suppressed
14658);
14659
9f077a68
KW
146601;
14661END
14662
14663 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
14664 return;
14665}
52dc8b5d 14666
99870f4d
KW
14667sub write_all_tables() {
14668 # Write out all the tables generated by this program to files, as well as
14669 # the supporting data structures, pod file, and .t file.
14670
14671 my @writables; # List of tables that actually get written
14672 my %match_tables_to_write; # Used to collapse identical match tables
14673 # into one file. Each key is a hash function
14674 # result to partition tables into buckets.
14675 # Each value is an array of the tables that
14676 # fit in the bucket.
14677
14678 # For each property ...
14679 # (sort so that if there is an immutable file name, it has precedence, so
14680 # some other property can't come in and take over its file name. If b's
14681 # file name is defined, will return 1, meaning to take it first; don't
7fc6cb55
KW
14682 # care if both defined, as they had better be different anyway. And the
14683 # property named 'Perl' needs to be first (it doesn't have any immutable
14684 # file name) because empty properties are defined in terms of it's table
14685 # named 'Any'.)
99870f4d 14686 PROPERTY:
7fc6cb55
KW
14687 foreach my $property (sort { return -1 if $a == $perl;
14688 return 1 if $b == $perl;
14689 return defined $b->file
14690 } property_ref('*'))
14691 {
99870f4d
KW
14692 my $type = $property->type;
14693
14694 # And for each table for that property, starting with the mapping
14695 # table for it ...
14696 TABLE:
14697 foreach my $table($property,
14698
14699 # and all the match tables for it (if any), sorted so
14700 # the ones with the shortest associated file name come
14701 # first. The length sorting prevents problems of a
14702 # longer file taking a name that might have to be used
14703 # by a shorter one. The alphabetic sorting prevents
14704 # differences between releases
14705 sort { my $ext_a = $a->external_name;
14706 return 1 if ! defined $ext_a;
14707 my $ext_b = $b->external_name;
14708 return -1 if ! defined $ext_b;
a92d5c2e
KW
14709
14710 # But return the non-complement table before
14711 # the complement one, as the latter is defined
14712 # in terms of the former, and needs to have
14713 # the information for the former available.
14714 return 1 if $a->complement != 0;
14715 return -1 if $b->complement != 0;
14716
0a695432
KW
14717 # Similarly, return a subservient table after
14718 # a leader
14719 return 1 if $a->leader != $a;
14720 return -1 if $b->leader != $b;
14721
99870f4d
KW
14722 my $cmp = length $ext_a <=> length $ext_b;
14723
14724 # Return result if lengths not equal
14725 return $cmp if $cmp;
14726
14727 # Alphabetic if lengths equal
14728 return $ext_a cmp $ext_b
14729 } $property->tables
14730 )
14731 {
12ac2576 14732
99870f4d
KW
14733 # Here we have a table associated with a property. It could be
14734 # the map table (done first for each property), or one of the
14735 # other tables. Determine which type.
14736 my $is_property = $table->isa('Property');
14737
14738 my $name = $table->name;
14739 my $complete_name = $table->complete_name;
14740
14741 # See if should suppress the table if is empty, but warn if it
14742 # contains something.
0332277c
KW
14743 my $suppress_if_empty_warn_if_not
14744 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
99870f4d
KW
14745
14746 # Calculate if this table should have any code points associated
14747 # with it or not.
14748 my $expected_empty =
14749
14750 # $perl should be empty, as well as properties that we just
14751 # don't do anything with
14752 ($is_property
14753 && ($table == $perl
14754 || grep { $complete_name eq $_ }
14755 @unimplemented_properties
14756 )
14757 )
14758
14759 # Match tables in properties we skipped populating should be
14760 # empty
14761 || (! $is_property && ! $property->to_create_match_tables)
14762
14763 # Tables and properties that are expected to have no code
14764 # points should be empty
14765 || $suppress_if_empty_warn_if_not
14766 ;
14767
14768 # Set a boolean if this table is the complement of an empty binary
14769 # table
14770 my $is_complement_of_empty_binary =
14771 $type == $BINARY &&
14772 (($table == $property->table('Y')
14773 && $property->table('N')->is_empty)
14774 || ($table == $property->table('N')
14775 && $property->table('Y')->is_empty));
14776
99870f4d
KW
14777 if ($table->is_empty) {
14778
99870f4d 14779 if ($suppress_if_empty_warn_if_not) {
301ba948
KW
14780 $table->set_fate($SUPPRESSED,
14781 $suppress_if_empty_warn_if_not);
99870f4d 14782 }
12ac2576 14783
c12f2655 14784 # Suppress (by skipping them) expected empty tables.
99870f4d
KW
14785 next TABLE if $expected_empty;
14786
14787 # And setup to later output a warning for those that aren't
14788 # known to be allowed to be empty. Don't do the warning if
14789 # this table is a child of another one to avoid duplicating
14790 # the warning that should come from the parent one.
14791 if (($table == $property || $table->parent == $table)
301ba948 14792 && $table->fate != $SUPPRESSED
395dfc19 14793 && $table->fate != $MAP_PROXIED
99870f4d
KW
14794 && ! grep { $complete_name =~ /^$_$/ }
14795 @tables_that_may_be_empty)
14796 {
14797 push @unhandled_properties, "$table";
14798 }
7fc6cb55
KW
14799
14800 # An empty table is just the complement of everything.
14801 $table->set_complement($Any) if $table != $property;
99870f4d
KW
14802 }
14803 elsif ($expected_empty) {
14804 my $because = "";
14805 if ($suppress_if_empty_warn_if_not) {
0332277c 14806 $because = " because $suppress_if_empty_warn_if_not";
99870f4d 14807 }
12ac2576 14808
99870f4d
KW
14809 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
14810 }
12ac2576 14811
14479722
KW
14812 # Some tables should match everything
14813 my $expected_full =
1583a95b
KW
14814 ($table->fate == $SUPPRESSED)
14815 ? 0
e75669bd
KW
14816 : ($is_property)
14817 ? # All these types of map tables will be full because
14818 # they will have been populated with defaults
14819 ($type == $ENUM || $type == $FORCED_BINARY)
14820
14821 : # A match table should match everything if its method
14822 # shows it should
14823 ($table->matches_all
14824
14825 # The complement of an empty binary table will match
14826 # everything
14827 || $is_complement_of_empty_binary
14828 )
14479722
KW
14829 ;
14830
99870f4d
KW
14831 my $count = $table->count;
14832 if ($expected_full) {
14833 if ($count != $MAX_UNICODE_CODEPOINTS) {
14834 Carp::my_carp("$table matches only "
14835 . clarify_number($count)
14836 . " Unicode code points but should match "
14837 . clarify_number($MAX_UNICODE_CODEPOINTS)
14838 . " (off by "
14839 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
14840 . "). Proceeding anyway.");
14841 }
12ac2576 14842
99870f4d
KW
14843 # Here is expected to be full. If it is because it is the
14844 # complement of an (empty) binary table that is to be
14845 # suppressed, then suppress this one as well.
14846 if ($is_complement_of_empty_binary) {
14847 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
14848 my $opposing = $property->table($opposing_name);
14849 my $opposing_status = $opposing->status;
14850 if ($opposing_status) {
14851 $table->set_status($opposing_status,
14852 $opposing->status_info);
14853 }
14854 }
14855 }
14856 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
14857 if ($table == $property || $table->leader == $table) {
14858 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
14859 }
14860 }
d73e5302 14861
301ba948 14862 if ($table->fate == $SUPPRESSED) {
99870f4d
KW
14863 if (! $is_property) {
14864 my @children = $table->children;
14865 foreach my $child (@children) {
301ba948 14866 if ($child->fate != $SUPPRESSED) {
99870f4d
KW
14867 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
14868 }
14869 }
14870 }
14871 next TABLE;
d73e5302 14872
99870f4d 14873 }
2df7880f 14874
99870f4d
KW
14875 if (! $is_property) {
14876
2df7880f
KW
14877 make_ucd_table_pod_entries($table) if $table->property == $perl;
14878
99870f4d
KW
14879 # Several things need to be done just once for each related
14880 # group of match tables. Do them on the parent.
14881 if ($table->parent == $table) {
14882
14883 # Add an entry in the pod file for the table; it also does
14884 # the children.
d1476e4d 14885 make_re_pod_entries($table) if defined $pod_directory;
99870f4d
KW
14886
14887 # See if the the table matches identical code points with
14888 # something that has already been output. In that case,
14889 # no need to have two files with the same code points in
14890 # them. We use the table's hash() method to store these
14891 # in buckets, so that it is quite likely that if two
14892 # tables are in the same bucket they will be identical, so
14893 # don't have to compare tables frequently. The tables
14894 # have to have the same status to share a file, so add
14895 # this to the bucket hash. (The reason for this latter is
14896 # that Heavy.pl associates a status with a file.)
06671cbc
KW
14897 # We don't check tables that are inverses of others, as it
14898 # would lead to some coding complications, and checking
14899 # all the regular ones should find everything.
14900 if ($table->complement == 0) {
21be712a 14901 my $hash = $table->hash . ';' . $table->status;
99870f4d 14902
21be712a
KW
14903 # Look at each table that is in the same bucket as
14904 # this one would be.
14905 foreach my $comparison
14906 (@{$match_tables_to_write{$hash}})
14907 {
14908 if ($table->matches_identically_to($comparison)) {
14909 $table->set_equivalent_to($comparison,
99870f4d 14910 Related => 0);
21be712a
KW
14911 next TABLE;
14912 }
99870f4d 14913 }
d73e5302 14914
21be712a
KW
14915 # Here, not equivalent, add this table to the bucket.
14916 push @{$match_tables_to_write{$hash}}, $table;
06671cbc 14917 }
99870f4d
KW
14918 }
14919 }
14920 else {
14921
14922 # Here is the property itself.
14923 # Don't write out or make references to the $perl property
14924 next if $table == $perl;
14925
2df7880f
KW
14926 make_ucd_table_pod_entries($table);
14927
382cadab
KW
14928 # There is a mapping stored of the various synonyms to the
14929 # standardized name of the property for utf8_heavy.pl.
14930 # Also, the pod file contains entries of the form:
14931 # \p{alias: *} \p{full: *}
14932 # rather than show every possible combination of things.
99870f4d 14933
382cadab 14934 my @property_aliases = $property->aliases;
99870f4d 14935
382cadab
KW
14936 my $full_property_name = $property->full_name;
14937 my $property_name = $property->name;
14938 my $standard_property_name = standardize($property_name);
5d1df013
KW
14939 my $standard_property_full_name
14940 = standardize($full_property_name);
14941
14942 # We also create for Unicode::UCD a list of aliases for
14943 # the property. The list starts with the property name;
14944 # then its full name.
14945 my @property_list;
14946 my @standard_list;
14947 if ( $property->fate <= $MAP_PROXIED) {
14948 @property_list = ($property_name, $full_property_name);
14949 @standard_list = ($standard_property_name,
14950 $standard_property_full_name);
14951 }
99870f4d 14952
382cadab
KW
14953 # For each synonym ...
14954 for my $i (0 .. @property_aliases - 1) {
14955 my $alias = $property_aliases[$i];
14956 my $alias_name = $alias->name;
14957 my $alias_standard = standardize($alias_name);
99870f4d 14958
382cadab 14959
5d1df013
KW
14960 # Add other aliases to the list of property aliases
14961 if ($property->fate <= $MAP_PROXIED
14962 && ! grep { $alias_standard eq $_ } @standard_list)
14963 {
14964 push @property_list, $alias_name;
14965 push @standard_list, $alias_standard;
14966 }
382cadab
KW
14967
14968 # For utf8_heavy, set the mapping of the alias to the
14969 # property
86a52d1e
KW
14970 if ($type == $STRING) {
14971 if ($property->fate <= $MAP_PROXIED) {
14972 $string_property_loose_to_name{$alias_standard}
14973 = $standard_property_name;
14974 }
14975 }
14976 else {
99870f4d
KW
14977 if (exists ($loose_property_name_of{$alias_standard}))
14978 {
14979 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
14980 }
14981 else {
14982 $loose_property_name_of{$alias_standard}
14983 = $standard_property_name;
14984 }
14985
d1476e4d 14986 # Now for the re pod entry for this alias. Skip if not
23e33b60
KW
14987 # outputting a pod; skip the first one, which is the
14988 # full name so won't have an entry like: '\p{full: *}
14989 # \p{full: *}', and skip if don't want an entry for
14990 # this one.
14991 next if $i == 0
14992 || ! defined $pod_directory
33e96e72 14993 || ! $alias->make_re_pod_entry;
99870f4d 14994
01d970b5 14995 my $rhs = "\\p{$full_property_name: *}";
d57ccc9a
KW
14996 if ($property != $perl && $table->perl_extension) {
14997 $rhs .= ' (Perl extension)';
14998 }
99870f4d
KW
14999 push @match_properties,
15000 format_pod_line($indent_info_column,
15001 '\p{' . $alias->name . ': *}',
d57ccc9a 15002 $rhs,
99870f4d
KW
15003 $alias->status);
15004 }
382cadab 15005 }
d73e5302 15006
5d1df013
KW
15007 # The list of all possible names is attached to each alias, so
15008 # lookup is easy
15009 if (@property_list) {
15010 push @{$prop_aliases{$standard_list[0]}}, @property_list;
15011 }
15012
1e863613
KW
15013 if ($property->fate <= $MAP_PROXIED) {
15014
15015 # Similarly, we create for Unicode::UCD a list of
15016 # property-value aliases.
15017
15018 my $property_full_name = $property->full_name;
15019
15020 # Look at each table in the property...
15021 foreach my $table ($property->tables) {
15022 my @values_list;
15023 my $table_full_name = $table->full_name;
15024 my $standard_table_full_name
15025 = standardize($table_full_name);
15026 my $table_name = $table->name;
15027 my $standard_table_name = standardize($table_name);
15028
15029 # The list starts with the table name and its full
15030 # name.
15031 push @values_list, $table_name, $table_full_name;
15032
15033 # We add to the table each unique alias that isn't
15034 # discouraged from use.
15035 foreach my $alias ($table->aliases) {
15036 next if $alias->status
15037 && $alias->status eq $DISCOURAGED;
15038 my $name = $alias->name;
15039 my $standard = standardize($name);
15040 next if $standard eq $standard_table_name;
15041 next if $standard eq $standard_table_full_name;
15042 push @values_list, $name;
15043 }
5d1df013 15044
1e863613
KW
15045 # Here @values_list is a list of all the aliases for
15046 # the table. That is, all the property-values given
15047 # by this table. By agreement with Unicode::UCD,
15048 # if the name and full name are identical, and there
15049 # are no other names, drop the duplcate entry to save
15050 # memory.
15051 if (@values_list == 2
15052 && $values_list[0] eq $values_list[1])
15053 {
15054 pop @values_list
15055 }
15056
15057 # To save memory, unlike the similar list for property
15058 # aliases above, only the standard forms hve the list.
15059 # This forces an extra step of converting from input
15060 # name to standard name, but the savings are
15061 # considerable. (There is only marginal savings if we
15062 # did this with the property aliases.)
15063 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
15064 }
15065 }
d73e5302 15066
c12f2655 15067 # Don't write out a mapping file if not desired.
99870f4d
KW
15068 next if ! $property->to_output_map;
15069 }
d73e5302 15070
99870f4d
KW
15071 # Here, we know we want to write out the table, but don't do it
15072 # yet because there may be other tables that come along and will
15073 # want to share the file, and the file's comments will change to
15074 # mention them. So save for later.
15075 push @writables, $table;
15076
15077 } # End of looping through the property and all its tables.
15078 } # End of looping through all properties.
15079
15080 # Now have all the tables that will have files written for them. Do it.
15081 foreach my $table (@writables) {
15082 my @directory;
15083 my $filename;
15084 my $property = $table->property;
15085 my $is_property = ($table == $property);
15086 if (! $is_property) {
15087
15088 # Match tables for the property go in lib/$subdirectory, which is
15089 # the property's name. Don't use the standard file name for this,
15090 # as may get an unfamiliar alias
15091 @directory = ($matches_directory, $property->external_name);
15092 }
15093 else {
d73e5302 15094
99870f4d
KW
15095 @directory = $table->directory;
15096 $filename = $table->file;
15097 }
d73e5302 15098
98dc9551 15099 # Use specified filename if available, or default to property's
99870f4d
KW
15100 # shortest name. We need an 8.3 safe filename (which means "an 8
15101 # safe" filename, since after the dot is only 'pl', which is < 3)
15102 # The 2nd parameter is if the filename shouldn't be changed, and
15103 # it shouldn't iff there is a hard-coded name for this table.
15104 $filename = construct_filename(
15105 $filename || $table->external_name,
15106 ! $filename, # mutable if no filename
15107 \@directory);
d73e5302 15108
99870f4d 15109 register_file_for_name($table, \@directory, $filename);
d73e5302 15110
99870f4d
KW
15111 # Only need to write one file when shared by more than one
15112 # property
a92d5c2e
KW
15113 next if ! $is_property
15114 && ($table->leader != $table || $table->complement != 0);
d73e5302 15115
99870f4d
KW
15116 # Construct a nice comment to add to the file
15117 $table->set_final_comment;
15118
15119 $table->write;
cf25bb62 15120 }
d73e5302 15121
d73e5302 15122
99870f4d
KW
15123 # Write out the pod file
15124 make_pod;
15125
9f077a68 15126 # And Heavy.pl, Name.pm, UCD.pl
99870f4d 15127 make_Heavy;
52dc8b5d 15128 make_Name_pm;
9f077a68 15129 make_UCD;
d73e5302 15130
99870f4d
KW
15131 make_property_test_script() if $make_test_script;
15132 return;
cf25bb62 15133}
d73e5302 15134
99870f4d
KW
15135my @white_space_separators = ( # This used only for making the test script.
15136 "",
15137 ' ',
15138 "\t",
15139 ' '
15140 );
d73e5302 15141
99870f4d
KW
15142sub generate_separator($) {
15143 # This used only for making the test script. It generates the colon or
15144 # equal separator between the property and property value, with random
15145 # white space surrounding the separator
d73e5302 15146
99870f4d 15147 my $lhs = shift;
d73e5302 15148
99870f4d 15149 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 15150
99870f4d
KW
15151 # Choose space before and after randomly
15152 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
15153 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 15154
99870f4d
KW
15155 # And return the whole complex, half the time using a colon, half the
15156 # equals
15157 return $spaces_before
15158 . (rand() < 0.5) ? '=' : ':'
15159 . $spaces_after;
15160}
76ccdbe2 15161
430ada4c 15162sub generate_tests($$$$$) {
99870f4d
KW
15163 # This used only for making the test script. It generates test cases that
15164 # are expected to compile successfully in perl. Note that the lhs and
15165 # rhs are assumed to already be as randomized as the caller wants.
15166
99870f4d
KW
15167 my $lhs = shift; # The property: what's to the left of the colon
15168 # or equals separator
15169 my $rhs = shift; # The property value; what's to the right
15170 my $valid_code = shift; # A code point that's known to be in the
15171 # table given by lhs=rhs; undef if table is
15172 # empty
15173 my $invalid_code = shift; # A code point known to not be in the table;
15174 # undef if the table is all code points
15175 my $warning = shift;
15176
15177 # Get the colon or equal
15178 my $separator = generate_separator($lhs);
15179
15180 # The whole 'property=value'
15181 my $name = "$lhs$separator$rhs";
15182
430ada4c 15183 my @output;
99870f4d
KW
15184 # Create a complete set of tests, with complements.
15185 if (defined $valid_code) {
430ada4c
NC
15186 push @output, <<"EOC"
15187Expect(1, $valid_code, '\\p{$name}', $warning);
15188Expect(0, $valid_code, '\\p{^$name}', $warning);
15189Expect(0, $valid_code, '\\P{$name}', $warning);
15190Expect(1, $valid_code, '\\P{^$name}', $warning);
15191EOC
99870f4d
KW
15192 }
15193 if (defined $invalid_code) {
430ada4c
NC
15194 push @output, <<"EOC"
15195Expect(0, $invalid_code, '\\p{$name}', $warning);
15196Expect(1, $invalid_code, '\\p{^$name}', $warning);
15197Expect(1, $invalid_code, '\\P{$name}', $warning);
15198Expect(0, $invalid_code, '\\P{^$name}', $warning);
15199EOC
15200 }
15201 return @output;
99870f4d 15202}
cf25bb62 15203
430ada4c 15204sub generate_error($$$) {
99870f4d
KW
15205 # This used only for making the test script. It generates test cases that
15206 # are expected to not only not match, but to be syntax or similar errors
15207
99870f4d
KW
15208 my $lhs = shift; # The property: what's to the left of the
15209 # colon or equals separator
15210 my $rhs = shift; # The property value; what's to the right
15211 my $already_in_error = shift; # Boolean; if true it's known that the
15212 # unmodified lhs and rhs will cause an error.
15213 # This routine should not force another one
15214 # Get the colon or equal
15215 my $separator = generate_separator($lhs);
15216
15217 # Since this is an error only, don't bother to randomly decide whether to
15218 # put the error on the left or right side; and assume that the rhs is
15219 # loosely matched, again for convenience rather than rigor.
15220 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
15221
15222 my $property = $lhs . $separator . $rhs;
15223
430ada4c
NC
15224 return <<"EOC";
15225Error('\\p{$property}');
15226Error('\\P{$property}');
15227EOC
d73e5302
JH
15228}
15229
99870f4d
KW
15230# These are used only for making the test script
15231# XXX Maybe should also have a bad strict seps, which includes underscore.
15232
15233my @good_loose_seps = (
15234 " ",
15235 "-",
15236 "\t",
15237 "",
15238 "_",
15239 );
15240my @bad_loose_seps = (
15241 "/a/",
15242 ':=',
15243 );
15244
15245sub randomize_stricter_name {
15246 # This used only for making the test script. Take the input name and
15247 # return a randomized, but valid version of it under the stricter matching
15248 # rules.
15249
15250 my $name = shift;
15251 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15252
15253 # If the name looks like a number (integer, floating, or rational), do
15254 # some extra work
15255 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
15256 my $sign = $1;
15257 my $number = $2;
15258 my $separator = $3;
15259
15260 # If there isn't a sign, part of the time add a plus
15261 # Note: Not testing having any denominator having a minus sign
15262 if (! $sign) {
15263 $sign = '+' if rand() <= .3;
15264 }
15265
15266 # And add 0 or more leading zeros.
15267 $name = $sign . ('0' x int rand(10)) . $number;
15268
15269 if (defined $separator) {
15270 my $extra_zeros = '0' x int rand(10);
cf25bb62 15271
99870f4d
KW
15272 if ($separator eq '.') {
15273
15274 # Similarly, add 0 or more trailing zeros after a decimal
15275 # point
15276 $name .= $extra_zeros;
15277 }
15278 else {
15279
15280 # Or, leading zeros before the denominator
15281 $name =~ s,/,/$extra_zeros,;
15282 }
15283 }
cf25bb62 15284 }
d73e5302 15285
99870f4d
KW
15286 # For legibility of the test, only change the case of whole sections at a
15287 # time. To do this, first split into sections. The split returns the
15288 # delimiters
15289 my @sections;
15290 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
15291 trace $section if main::DEBUG && $to_trace;
15292
15293 if (length $section > 1 && $section !~ /\D/) {
15294
15295 # If the section is a sequence of digits, about half the time
15296 # randomly add underscores between some of them.
15297 if (rand() > .5) {
15298
15299 # Figure out how many underscores to add. max is 1 less than
15300 # the number of digits. (But add 1 at the end to make sure
15301 # result isn't 0, and compensate earlier by subtracting 2
15302 # instead of 1)
15303 my $num_underscores = int rand(length($section) - 2) + 1;
15304
15305 # And add them evenly throughout, for convenience, not rigor
15306 use integer;
15307 my $spacing = (length($section) - 1)/ $num_underscores;
15308 my $temp = $section;
15309 $section = "";
15310 for my $i (1 .. $num_underscores) {
15311 $section .= substr($temp, 0, $spacing, "") . '_';
15312 }
15313 $section .= $temp;
15314 }
15315 push @sections, $section;
15316 }
15317 else {
d73e5302 15318
99870f4d
KW
15319 # Here not a sequence of digits. Change the case of the section
15320 # randomly
15321 my $switch = int rand(4);
15322 if ($switch == 0) {
15323 push @sections, uc $section;
15324 }
15325 elsif ($switch == 1) {
15326 push @sections, lc $section;
15327 }
15328 elsif ($switch == 2) {
15329 push @sections, ucfirst $section;
15330 }
15331 else {
15332 push @sections, $section;
15333 }
15334 }
cf25bb62 15335 }
99870f4d
KW
15336 trace "returning", join "", @sections if main::DEBUG && $to_trace;
15337 return join "", @sections;
15338}
71d929cb 15339
99870f4d
KW
15340sub randomize_loose_name($;$) {
15341 # This used only for making the test script
71d929cb 15342
99870f4d
KW
15343 my $name = shift;
15344 my $want_error = shift; # if true, make an error
15345 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15346
15347 $name = randomize_stricter_name($name);
5beb625e
JH
15348
15349 my @parts;
99870f4d 15350 push @parts, $good_loose_seps[rand(@good_loose_seps)];
45376db6
KW
15351
15352 # Preserve trailing ones for the sake of not stripping the underscore from
15353 # 'L_'
15354 for my $part (split /[-\s_]+ (?= . )/, $name) {
5beb625e 15355 if (@parts) {
99870f4d
KW
15356 if ($want_error and rand() < 0.3) {
15357 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
15358 $want_error = 0;
15359 }
15360 else {
15361 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
15362 }
15363 }
99870f4d 15364 push @parts, $part;
5beb625e 15365 }
99870f4d
KW
15366 my $new = join("", @parts);
15367 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 15368
99870f4d 15369 if ($want_error) {
5beb625e 15370 if (rand() >= 0.5) {
99870f4d
KW
15371 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
15372 }
15373 else {
15374 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
15375 }
15376 }
15377 return $new;
15378}
15379
99870f4d
KW
15380# Used to make sure don't generate duplicate test cases.
15381my %test_generated;
5beb625e 15382
99870f4d
KW
15383sub make_property_test_script() {
15384 # This used only for making the test script
15385 # this written directly -- it's huge.
5beb625e 15386
99870f4d 15387 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 15388
99870f4d
KW
15389 # This uses randomness to test different possibilities without testing all
15390 # possibilities. To ensure repeatability, set the seed to 0. But if
15391 # tests are added, it will perturb all later ones in the .t file
15392 srand 0;
5beb625e 15393
3df51b85
KW
15394 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
15395
99870f4d
KW
15396 # Keep going down an order of magnitude
15397 # until find that adding this quantity to
15398 # 1 remains 1; but put an upper limit on
15399 # this so in case this algorithm doesn't
15400 # work properly on some platform, that we
15401 # won't loop forever.
15402 my $digits = 0;
15403 my $min_floating_slop = 1;
15404 while (1+ $min_floating_slop != 1
15405 && $digits++ < 50)
5beb625e 15406 {
99870f4d
KW
15407 my $next = $min_floating_slop / 10;
15408 last if $next == 0; # If underflows,
15409 # use previous one
15410 $min_floating_slop = $next;
5beb625e 15411 }
430ada4c
NC
15412
15413 # It doesn't matter whether the elements of this array contain single lines
15414 # or multiple lines. main::write doesn't count the lines.
15415 my @output;
99870f4d
KW
15416
15417 foreach my $property (property_ref('*')) {
15418 foreach my $table ($property->tables) {
15419
15420 # Find code points that match, and don't match this table.
15421 my $valid = $table->get_valid_code_point;
15422 my $invalid = $table->get_invalid_code_point;
15423 my $warning = ($table->status eq $DEPRECATED)
15424 ? "'deprecated'"
15425 : '""';
15426
15427 # Test each possible combination of the property's aliases with
15428 # the table's. If this gets to be too many, could do what is done
15429 # in the set_final_comment() for Tables
15430 my @table_aliases = $table->aliases;
15431 my @property_aliases = $table->property->aliases;
807807b7
KW
15432
15433 # Every property can be optionally be prefixed by 'Is_', so test
15434 # that those work, by creating such a new alias for each
15435 # pre-existing one.
15436 push @property_aliases, map { Alias->new("Is_" . $_->name,
15437 $_->loose_match,
33e96e72 15438 $_->make_re_pod_entry,
0eac1e20 15439 $_->ok_as_filename,
fd1e3e84
KW
15440 $_->status,
15441 $_->ucd,
15442 )
807807b7 15443 } @property_aliases;
99870f4d
KW
15444 my $max = max(scalar @table_aliases, scalar @property_aliases);
15445 for my $j (0 .. $max - 1) {
15446
15447 # The current alias for property is the next one on the list,
15448 # or if beyond the end, start over. Similarly for table
15449 my $property_name
15450 = $property_aliases[$j % @property_aliases]->name;
15451
15452 $property_name = "" if $table->property == $perl;
15453 my $table_alias = $table_aliases[$j % @table_aliases];
15454 my $table_name = $table_alias->name;
15455 my $loose_match = $table_alias->loose_match;
15456
15457 # If the table doesn't have a file, any test for it is
15458 # already guaranteed to be in error
15459 my $already_error = ! $table->file_path;
15460
15461 # Generate error cases for this alias.
430ada4c
NC
15462 push @output, generate_error($property_name,
15463 $table_name,
15464 $already_error);
99870f4d
KW
15465
15466 # If the table is guaranteed to always generate an error,
15467 # quit now without generating success cases.
15468 next if $already_error;
15469
15470 # Now for the success cases.
15471 my $random;
15472 if ($loose_match) {
15473
15474 # For loose matching, create an extra test case for the
15475 # standard name.
15476 my $standard = standardize($table_name);
15477
15478 # $test_name should be a unique combination for each test
15479 # case; used just to avoid duplicate tests
15480 my $test_name = "$property_name=$standard";
15481
15482 # Don't output duplicate test cases.
15483 if (! exists $test_generated{$test_name}) {
15484 $test_generated{$test_name} = 1;
430ada4c
NC
15485 push @output, generate_tests($property_name,
15486 $standard,
15487 $valid,
15488 $invalid,
15489 $warning,
15490 );
5beb625e 15491 }
99870f4d
KW
15492 $random = randomize_loose_name($table_name)
15493 }
15494 else { # Stricter match
15495 $random = randomize_stricter_name($table_name);
99598c8c 15496 }
99598c8c 15497
99870f4d
KW
15498 # Now for the main test case for this alias.
15499 my $test_name = "$property_name=$random";
15500 if (! exists $test_generated{$test_name}) {
15501 $test_generated{$test_name} = 1;
430ada4c
NC
15502 push @output, generate_tests($property_name,
15503 $random,
15504 $valid,
15505 $invalid,
15506 $warning,
15507 );
99870f4d
KW
15508
15509 # If the name is a rational number, add tests for the
15510 # floating point equivalent.
15511 if ($table_name =~ qr{/}) {
15512
15513 # Calculate the float, and find just the fraction.
15514 my $float = eval $table_name;
15515 my ($whole, $fraction)
15516 = $float =~ / (.*) \. (.*) /x;
15517
15518 # Starting with one digit after the decimal point,
15519 # create a test for each possible precision (number of
15520 # digits past the decimal point) until well beyond the
15521 # native number found on this machine. (If we started
15522 # with 0 digits, it would be an integer, which could
15523 # well match an unrelated table)
15524 PLACE:
15525 for my $i (1 .. $min_floating_slop + 3) {
15526 my $table_name = sprintf("%.*f", $i, $float);
15527 if ($i < $MIN_FRACTION_LENGTH) {
15528
15529 # If the test case has fewer digits than the
15530 # minimum acceptable precision, it shouldn't
15531 # succeed, so we expect an error for it.
15532 # E.g., 2/3 = .7 at one decimal point, and we
15533 # shouldn't say it matches .7. We should make
15534 # it be .667 at least before agreeing that the
15535 # intent was to match 2/3. But at the
15536 # less-than- acceptable level of precision, it
15537 # might actually match an unrelated number.
15538 # So don't generate a test case if this
15539 # conflating is possible. In our example, we
15540 # don't want 2/3 matching 7/10, if there is
15541 # a 7/10 code point.
15542 for my $existing
15543 (keys %nv_floating_to_rational)
15544 {
15545 next PLACE
15546 if abs($table_name - $existing)
15547 < $MAX_FLOATING_SLOP;
15548 }
430ada4c
NC
15549 push @output, generate_error($property_name,
15550 $table_name,
15551 1 # 1 => already an error
15552 );
99870f4d
KW
15553 }
15554 else {
15555
15556 # Here the number of digits exceeds the
15557 # minimum we think is needed. So generate a
15558 # success test case for it.
430ada4c
NC
15559 push @output, generate_tests($property_name,
15560 $table_name,
15561 $valid,
15562 $invalid,
15563 $warning,
15564 );
99870f4d
KW
15565 }
15566 }
99598c8c
JH
15567 }
15568 }
99870f4d
KW
15569 }
15570 }
15571 }
37e2e78e 15572
9218f1cf
KW
15573 &write($t_path,
15574 0, # Not utf8;
15575 [<DATA>,
15576 @output,
15577 (map {"Test_X('$_');\n"} @backslash_X_tests),
15578 "Finished();\n"]);
99870f4d
KW
15579 return;
15580}
99598c8c 15581
99870f4d
KW
15582# This is a list of the input files and how to handle them. The files are
15583# processed in their order in this list. Some reordering is possible if
15584# desired, but the v0 files should be first, and the extracted before the
15585# others except DAge.txt (as data in an extracted file can be over-ridden by
15586# the non-extracted. Some other files depend on data derived from an earlier
15587# file, like UnicodeData requires data from Jamo, and the case changing and
15588# folding requires data from Unicode. Mostly, it safest to order by first
15589# version releases in (except the Jamo). DAge.txt is read before the
15590# extracted ones because of the rarely used feature $compare_versions. In the
15591# unlikely event that there were ever an extracted file that contained the Age
15592# property information, it would have to go in front of DAge.
15593#
15594# The version strings allow the program to know whether to expect a file or
15595# not, but if a file exists in the directory, it will be processed, even if it
15596# is in a version earlier than expected, so you can copy files from a later
15597# release into an earlier release's directory.
15598my @input_file_objects = (
15599 Input_file->new('PropertyAliases.txt', v0,
15600 Handler => \&process_PropertyAliases,
15601 ),
15602 Input_file->new(undef, v0, # No file associated with this
3df51b85 15603 Progress_Message => 'Finishing property setup',
99870f4d
KW
15604 Handler => \&finish_property_setup,
15605 ),
15606 Input_file->new('PropValueAliases.txt', v0,
15607 Handler => \&process_PropValueAliases,
15608 Has_Missings_Defaults => $NOT_IGNORED,
15609 ),
15610 Input_file->new('DAge.txt', v3.2.0,
15611 Has_Missings_Defaults => $NOT_IGNORED,
15612 Property => 'Age'
15613 ),
15614 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
15615 Property => 'General_Category',
15616 ),
15617 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
15618 Property => 'Canonical_Combining_Class',
15619 Has_Missings_Defaults => $NOT_IGNORED,
15620 ),
15621 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
15622 Property => 'Numeric_Type',
15623 Has_Missings_Defaults => $NOT_IGNORED,
15624 ),
15625 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
15626 Property => 'East_Asian_Width',
15627 Has_Missings_Defaults => $NOT_IGNORED,
15628 ),
15629 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
15630 Property => 'Line_Break',
15631 Has_Missings_Defaults => $NOT_IGNORED,
15632 ),
15633 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
15634 Property => 'Bidi_Class',
15635 Has_Missings_Defaults => $NOT_IGNORED,
15636 ),
15637 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
15638 Property => 'Decomposition_Type',
15639 Has_Missings_Defaults => $NOT_IGNORED,
15640 ),
15641 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
15642 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
15643 Property => 'Numeric_Value',
15644 Each_Line_Handler => \&filter_numeric_value_line,
15645 Has_Missings_Defaults => $NOT_IGNORED,
15646 ),
15647 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
15648 Property => 'Joining_Group',
15649 Has_Missings_Defaults => $NOT_IGNORED,
15650 ),
15651
15652 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
15653 Property => 'Joining_Type',
15654 Has_Missings_Defaults => $NOT_IGNORED,
15655 ),
15656 Input_file->new('Jamo.txt', v2.0.0,
15657 Property => 'Jamo_Short_Name',
15658 Each_Line_Handler => \&filter_jamo_line,
15659 ),
15660 Input_file->new('UnicodeData.txt', v1.1.5,
15661 Pre_Handler => \&setup_UnicodeData,
15662
15663 # We clean up this file for some early versions.
15664 Each_Line_Handler => [ (($v_version lt v2.0.0 )
15665 ? \&filter_v1_ucd
15666 : ($v_version eq v2.1.5)
15667 ? \&filter_v2_1_5_ucd
3ffed8c2
KW
15668
15669 # And for 5.14 Perls with 6.0,
15670 # have to also make changes
15671 : ($v_version ge v6.0.0)
15672 ? \&filter_v6_ucd
15673 : undef),
99870f4d
KW
15674
15675 # And the main filter
15676 \&filter_UnicodeData_line,
15677 ],
15678 EOF_Handler => \&EOF_UnicodeData,
15679 ),
15680 Input_file->new('ArabicShaping.txt', v2.0.0,
15681 Each_Line_Handler =>
15682 [ ($v_version lt 4.1.0)
15683 ? \&filter_old_style_arabic_shaping
15684 : undef,
15685 \&filter_arabic_shaping_line,
15686 ],
15687 Has_Missings_Defaults => $NOT_IGNORED,
15688 ),
15689 Input_file->new('Blocks.txt', v2.0.0,
15690 Property => 'Block',
15691 Has_Missings_Defaults => $NOT_IGNORED,
15692 Each_Line_Handler => \&filter_blocks_lines
15693 ),
15694 Input_file->new('PropList.txt', v2.0.0,
15695 Each_Line_Handler => (($v_version lt v3.1.0)
15696 ? \&filter_old_style_proplist
15697 : undef),
15698 ),
15699 Input_file->new('Unihan.txt', v2.0.0,
15700 Pre_Handler => \&setup_unihan,
15701 Optional => 1,
15702 Each_Line_Handler => \&filter_unihan_line,
15703 ),
15704 Input_file->new('SpecialCasing.txt', v2.1.8,
15705 Each_Line_Handler => \&filter_special_casing_line,
15706 Pre_Handler => \&setup_special_casing,
15707 ),
15708 Input_file->new(
15709 'LineBreak.txt', v3.0.0,
15710 Has_Missings_Defaults => $NOT_IGNORED,
15711 Property => 'Line_Break',
15712 # Early versions had problematic syntax
15713 Each_Line_Handler => (($v_version lt v3.1.0)
15714 ? \&filter_early_ea_lb
15715 : undef),
15716 ),
15717 Input_file->new('EastAsianWidth.txt', v3.0.0,
15718 Property => 'East_Asian_Width',
15719 Has_Missings_Defaults => $NOT_IGNORED,
15720 # Early versions had problematic syntax
15721 Each_Line_Handler => (($v_version lt v3.1.0)
15722 ? \&filter_early_ea_lb
15723 : undef),
15724 ),
15725 Input_file->new('CompositionExclusions.txt', v3.0.0,
15726 Property => 'Composition_Exclusion',
15727 ),
15728 Input_file->new('BidiMirroring.txt', v3.0.1,
15729 Property => 'Bidi_Mirroring_Glyph',
15730 ),
37e2e78e 15731 Input_file->new("NormalizationTest.txt", v3.0.1,
09ca89ce 15732 Skip => 'Validation Tests',
37e2e78e 15733 ),
99870f4d
KW
15734 Input_file->new('CaseFolding.txt', v3.0.1,
15735 Pre_Handler => \&setup_case_folding,
15736 Each_Line_Handler =>
15737 [ ($v_version lt v3.1.0)
15738 ? \&filter_old_style_case_folding
15739 : undef,
15740 \&filter_case_folding_line
15741 ],
99870f4d
KW
15742 ),
15743 Input_file->new('DCoreProperties.txt', v3.1.0,
15744 # 5.2 changed this file
15745 Has_Missings_Defaults => (($v_version ge v5.2.0)
15746 ? $NOT_IGNORED
15747 : $NO_DEFAULTS),
15748 ),
15749 Input_file->new('Scripts.txt', v3.1.0,
15750 Property => 'Script',
15751 Has_Missings_Defaults => $NOT_IGNORED,
15752 ),
15753 Input_file->new('DNormalizationProps.txt', v3.1.0,
15754 Has_Missings_Defaults => $NOT_IGNORED,
15755 Each_Line_Handler => (($v_version lt v4.0.1)
15756 ? \&filter_old_style_normalization_lines
15757 : undef),
15758 ),
15759 Input_file->new('HangulSyllableType.txt', v4.0.0,
15760 Has_Missings_Defaults => $NOT_IGNORED,
15761 Property => 'Hangul_Syllable_Type'),
15762 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
15763 Property => 'Word_Break',
15764 Has_Missings_Defaults => $NOT_IGNORED,
15765 ),
15766 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
15767 Property => 'Grapheme_Cluster_Break',
15768 Has_Missings_Defaults => $NOT_IGNORED,
15769 ),
37e2e78e
KW
15770 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
15771 Handler => \&process_GCB_test,
15772 ),
15773 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
09ca89ce 15774 Skip => 'Validation Tests',
37e2e78e
KW
15775 ),
15776 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
09ca89ce 15777 Skip => 'Validation Tests',
37e2e78e
KW
15778 ),
15779 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
09ca89ce 15780 Skip => 'Validation Tests',
37e2e78e 15781 ),
99870f4d
KW
15782 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
15783 Property => 'Sentence_Break',
15784 Has_Missings_Defaults => $NOT_IGNORED,
15785 ),
15786 Input_file->new('NamedSequences.txt', v4.1.0,
15787 Handler => \&process_NamedSequences
15788 ),
15789 Input_file->new('NameAliases.txt', v5.0.0,
15790 Property => 'Name_Alias',
dcd72625
KW
15791 Pre_Handler => ($v_version ge v6.0.0)
15792 ? \&setup_v6_name_alias
15793 : undef,
99870f4d 15794 ),
37e2e78e 15795 Input_file->new("BidiTest.txt", v5.2.0,
09ca89ce 15796 Skip => 'Validation Tests',
37e2e78e 15797 ),
99870f4d
KW
15798 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
15799 Optional => 1,
15800 Each_Line_Handler => \&filter_unihan_line,
15801 ),
15802 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
15803 Optional => 1,
15804 Each_Line_Handler => \&filter_unihan_line,
15805 ),
15806 Input_file->new('UnihanIRGSources.txt', v5.2.0,
15807 Optional => 1,
15808 Pre_Handler => \&setup_unihan,
15809 Each_Line_Handler => \&filter_unihan_line,
15810 ),
15811 Input_file->new('UnihanNumericValues.txt', v5.2.0,
15812 Optional => 1,
15813 Each_Line_Handler => \&filter_unihan_line,
15814 ),
15815 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
15816 Optional => 1,
15817 Each_Line_Handler => \&filter_unihan_line,
15818 ),
15819 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
15820 Optional => 1,
15821 Each_Line_Handler => \&filter_unihan_line,
15822 ),
15823 Input_file->new('UnihanReadings.txt', v5.2.0,
15824 Optional => 1,
15825 Each_Line_Handler => \&filter_unihan_line,
15826 ),
15827 Input_file->new('UnihanVariants.txt', v5.2.0,
15828 Optional => 1,
15829 Each_Line_Handler => \&filter_unihan_line,
15830 ),
82aed44a
KW
15831 Input_file->new('ScriptExtensions.txt', v6.0.0,
15832 Property => 'Script_Extensions',
15833 Pre_Handler => \&setup_script_extensions,
fbe1e607 15834 Each_Line_Handler => \&filter_script_extensions_line,
82aed44a 15835 ),
3111abc0
KW
15836 # The two Indic files are actually available starting in v6.0.0, but their
15837 # property values are missing from PropValueAliases.txt in that release,
15838 # so that further work would have to be done to get them to work properly
15839 # for that release.
15840 Input_file->new('IndicMatraCategory.txt', v6.1.0,
15841 Property => 'Indic_Matra_Category',
15842 Has_Missings_Defaults => $NOT_IGNORED,
15843 Skip => "Provisional; for the analysis and processing of Indic scripts",
15844 ),
15845 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
15846 Property => 'Indic_Syllabic_Category',
15847 Has_Missings_Defaults => $NOT_IGNORED,
15848 Skip => "Provisional; for the analysis and processing of Indic scripts",
15849 ),
99870f4d 15850);
99598c8c 15851
99870f4d
KW
15852# End of all the preliminaries.
15853# Do it...
99598c8c 15854
99870f4d
KW
15855if ($compare_versions) {
15856 Carp::my_carp(<<END
15857Warning. \$compare_versions is set. Output is not suitable for production
15858END
15859 );
15860}
99598c8c 15861
99870f4d
KW
15862# Put into %potential_files a list of all the files in the directory structure
15863# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 15864# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
15865my @ignored_files_full_names = map { File::Spec->rel2abs(
15866 internal_file_to_platform($_))
15867 } keys %ignored_files;
15868File::Find::find({
15869 wanted=>sub {
37e2e78e 15870 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 15871 my $full = lc(File::Spec->rel2abs($_));
99870f4d 15872 $potential_files{$full} = 1
37e2e78e 15873 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
15874 return;
15875 }
15876}, File::Spec->curdir());
99598c8c 15877
99870f4d 15878my @mktables_list_output_files;
cdcef19a 15879my $old_start_time = 0;
cf25bb62 15880
3644ba60
KW
15881if (! -e $file_list) {
15882 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
15883 $write_unchanged_files = 1;
15884} elsif ($write_unchanged_files) {
99870f4d
KW
15885 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
15886}
15887else {
15888 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
15889 my $file_handle;
23e33b60 15890 if (! open $file_handle, "<", $file_list) {
3644ba60 15891 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
15892 $glob_list = 1;
15893 }
15894 else {
15895 my @input;
15896
15897 # Read and parse mktables.lst, placing the results from the first part
15898 # into @input, and the second part into @mktables_list_output_files
15899 for my $list ( \@input, \@mktables_list_output_files ) {
15900 while (<$file_handle>) {
15901 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
15902 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
15903 $old_start_time = $1;
15904 }
99870f4d
KW
15905 next if /^ \s* (?: \# .* )? $/x;
15906 last if /^ =+ $/x;
15907 my ( $file ) = split /\t/;
15908 push @$list, $file;
cf25bb62 15909 }
99870f4d
KW
15910 @$list = uniques(@$list);
15911 next;
cf25bb62
JH
15912 }
15913
99870f4d
KW
15914 # Look through all the input files
15915 foreach my $input (@input) {
15916 next if $input eq 'version'; # Already have checked this.
cf25bb62 15917
99870f4d
KW
15918 # Ignore if doesn't exist. The checking about whether we care or
15919 # not is done via the Input_file object.
15920 next if ! file_exists($input);
5beb625e 15921
99870f4d
KW
15922 # The paths are stored with relative names, and with '/' as the
15923 # delimiter; convert to absolute on this machine
517956bf 15924 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
faf3cf6b
KW
15925 $potential_files{lc $full} = 1
15926 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 15927 }
5beb625e 15928 }
cf25bb62 15929
99870f4d
KW
15930 close $file_handle;
15931}
15932
15933if ($glob_list) {
15934
15935 # Here wants to process all .txt files in the directory structure.
15936 # Convert them to full path names. They are stored in the platform's
15937 # relative style
f86864ac
KW
15938 my @known_files;
15939 foreach my $object (@input_file_objects) {
15940 my $file = $object->file;
15941 next unless defined $file;
15942 push @known_files, File::Spec->rel2abs($file);
15943 }
99870f4d
KW
15944
15945 my @unknown_input_files;
faf3cf6b
KW
15946 foreach my $file (keys %potential_files) { # The keys are stored in lc
15947 next if grep { $file eq lc($_) } @known_files;
99870f4d
KW
15948
15949 # Here, the file is unknown to us. Get relative path name
15950 $file = File::Spec->abs2rel($file);
15951 push @unknown_input_files, $file;
15952
15953 # What will happen is we create a data structure for it, and add it to
15954 # the list of input files to process. First get the subdirectories
15955 # into an array
15956 my (undef, $directories, undef) = File::Spec->splitpath($file);
15957 $directories =~ s;/$;;; # Can have extraneous trailing '/'
15958 my @directories = File::Spec->splitdir($directories);
15959
15960 # If the file isn't extracted (meaning none of the directories is the
15961 # extracted one), just add it to the end of the list of inputs.
15962 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 15963 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
15964 }
15965 else {
15966
15967 # Here, the file is extracted. It needs to go ahead of most other
15968 # processing. Search for the first input file that isn't a
15969 # special required property (that is, find one whose first_release
15970 # is non-0), and isn't extracted. Also, the Age property file is
15971 # processed before the extracted ones, just in case
15972 # $compare_versions is set.
15973 for (my $i = 0; $i < @input_file_objects; $i++) {
15974 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
15975 && lc($input_file_objects[$i]->file) ne 'dage.txt'
15976 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 15977 {
99f78760 15978 splice @input_file_objects, $i, 0,
37e2e78e 15979 Input_file->new($file, v0);
99870f4d
KW
15980 last;
15981 }
cf25bb62 15982 }
99870f4d 15983
cf25bb62 15984 }
d2d499f5 15985 }
99870f4d 15986 if (@unknown_input_files) {
23e33b60 15987 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
15988
15989The following files are unknown as to how to handle. Assuming they are
15990typical property files. You'll know by later error messages if it worked or
15991not:
15992END
99f78760 15993 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
15994 }
15995} # End of looking through directory structure for more .txt files.
5beb625e 15996
99870f4d
KW
15997# Create the list of input files from the objects we have defined, plus
15998# version
15999my @input_files = 'version';
16000foreach my $object (@input_file_objects) {
16001 my $file = $object->file;
16002 next if ! defined $file; # Not all objects have files
16003 next if $object->optional && ! -e $file;
16004 push @input_files, $file;
16005}
5beb625e 16006
99870f4d
KW
16007if ( $verbosity >= $VERBOSE ) {
16008 print "Expecting ".scalar( @input_files )." input files. ",
16009 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
16010}
cf25bb62 16011
aeab6150
KW
16012# We set $most_recent to be the most recently changed input file, including
16013# this program itself (done much earlier in this file)
99870f4d 16014foreach my $in (@input_files) {
cdcef19a
KW
16015 next unless -e $in; # Keep going even if missing a file
16016 my $mod_time = (stat $in)[9];
aeab6150 16017 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
16018
16019 # See that the input files have distinct names, to warn someone if they
16020 # are adding a new one
16021 if ($make_list) {
16022 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
16023 $directories =~ s;/$;;; # Can have extraneous trailing '/'
16024 my @directories = File::Spec->splitdir($directories);
16025 my $base = $file =~ s/\.txt$//;
16026 construct_filename($file, 'mutable', \@directories);
cf25bb62 16027 }
99870f4d 16028}
cf25bb62 16029
dff6c046 16030my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 16031 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 16032 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 16033
99870f4d
KW
16034# Now we check to see if any output files are older than youngest, if
16035# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 16036if (! $rebuild) {
99870f4d
KW
16037 foreach my $out (@mktables_list_output_files) {
16038 if ( ! file_exists($out)) {
16039 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 16040 $rebuild = 1;
99870f4d
KW
16041 last;
16042 }
16043 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
16044 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
16045 if ( (stat $out)[9] <= $most_recent ) {
16046 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 16047 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 16048 $rebuild = 1;
99870f4d 16049 last;
cf25bb62 16050 }
cf25bb62 16051 }
99870f4d 16052}
d1d1cd7a 16053if (! $rebuild) {
1265e11f 16054 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
16055 exit(0);
16056}
16057print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 16058
99870f4d
KW
16059# Ready to do the major processing. First create the perl pseudo-property.
16060$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 16061
99870f4d
KW
16062# Process each input file
16063foreach my $file (@input_file_objects) {
16064 $file->run;
d2d499f5
JH
16065}
16066
99870f4d 16067# Finish the table generation.
c4051cc5 16068
99870f4d
KW
16069print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
16070finish_Unicode();
c4051cc5 16071
99870f4d
KW
16072print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
16073compile_perl();
c4051cc5 16074
99870f4d
KW
16075print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
16076add_perl_synonyms();
c4051cc5 16077
99870f4d
KW
16078print "Writing tables\n" if $verbosity >= $PROGRESS;
16079write_all_tables();
c4051cc5 16080
99870f4d
KW
16081# Write mktables.lst
16082if ( $file_list and $make_list ) {
c4051cc5 16083
99870f4d
KW
16084 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
16085 foreach my $file (@input_files, @files_actually_output) {
16086 my (undef, $directories, $file) = File::Spec->splitpath($file);
16087 my @directories = File::Spec->splitdir($directories);
16088 $file = join '/', @directories, $file;
16089 }
16090
16091 my $ofh;
16092 if (! open $ofh,">",$file_list) {
16093 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
16094 return
16095 }
16096 else {
cdcef19a 16097 my $localtime = localtime $start_time;
99870f4d
KW
16098 print $ofh <<"END";
16099#
16100# $file_list -- File list for $0.
97050450 16101#
cdcef19a 16102# Autogenerated starting on $start_time ($localtime)
97050450
YO
16103#
16104# - First section is input files
99870f4d 16105# ($0 itself is not listed but is automatically considered an input)
98dc9551 16106# - Section separator is /^=+\$/
97050450
YO
16107# - Second section is a list of output files.
16108# - Lines matching /^\\s*#/ are treated as comments
16109# which along with blank lines are ignored.
16110#
16111
16112# Input files:
16113
99870f4d
KW
16114END
16115 print $ofh "$_\n" for sort(@input_files);
16116 print $ofh "\n=================================\n# Output files:\n\n";
16117 print $ofh "$_\n" for sort @files_actually_output;
16118 print $ofh "\n# ",scalar(@input_files)," input files\n",
16119 "# ",scalar(@files_actually_output)+1," output files\n\n",
16120 "# End list\n";
16121 close $ofh
16122 or Carp::my_carp("Failed to close $ofh: $!");
16123
16124 print "Filelist has ",scalar(@input_files)," input files and ",
16125 scalar(@files_actually_output)+1," output files\n"
16126 if $verbosity >= $VERBOSE;
16127 }
16128}
16129
16130# Output these warnings unless -q explicitly specified.
c83dffeb 16131if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
99870f4d
KW
16132 if (@unhandled_properties) {
16133 print "\nProperties and tables that unexpectedly have no code points\n";
16134 foreach my $property (sort @unhandled_properties) {
16135 print $property, "\n";
16136 }
16137 }
16138
16139 if (%potential_files) {
16140 print "\nInput files that are not considered:\n";
16141 foreach my $file (sort keys %potential_files) {
16142 print File::Spec->abs2rel($file), "\n";
16143 }
16144 }
16145 print "\nAll done\n" if $verbosity >= $VERBOSE;
16146}
5beb625e 16147exit(0);
cf25bb62 16148
99870f4d 16149# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 16150__DATA__
99870f4d 16151
5beb625e
JH
16152use strict;
16153use warnings;
16154
66fd7fd0
KW
16155# If run outside the normal test suite on an ASCII platform, you can
16156# just create a latin1_to_native() function that just returns its
16157# inputs, because that's the only function used from test.pl
16158require "test.pl";
16159
37e2e78e
KW
16160# Test qr/\X/ and the \p{} regular expression constructs. This file is
16161# constructed by mktables from the tables it generates, so if mktables is
16162# buggy, this won't necessarily catch those bugs. Tests are generated for all
16163# feasible properties; a few aren't currently feasible; see
16164# is_code_point_usable() in mktables for details.
99870f4d
KW
16165
16166# Standard test packages are not used because this manipulates SIG_WARN. It
16167# exits 0 if every non-skipped test succeeded; -1 if any failed.
16168
5beb625e
JH
16169my $Tests = 0;
16170my $Fails = 0;
99870f4d 16171
99870f4d
KW
16172sub Expect($$$$) {
16173 my $expected = shift;
16174 my $ord = shift;
16175 my $regex = shift;
16176 my $warning_type = shift; # Type of warning message, like 'deprecated'
16177 # or empty if none
16178 my $line = (caller)[2];
66fd7fd0 16179 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 16180
99870f4d 16181 # Convert the code point to hex form
23e33b60 16182 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 16183
99870f4d 16184 my @tests = "";
5beb625e 16185
37e2e78e
KW
16186 # The first time through, use all warnings. If the input should generate
16187 # a warning, add another time through with them turned off
99870f4d
KW
16188 push @tests, "no warnings '$warning_type';" if $warning_type;
16189
16190 foreach my $no_warnings (@tests) {
16191
16192 # Store any warning messages instead of outputting them
16193 local $SIG{__WARN__} = $SIG{__WARN__};
16194 my $warning_message;
16195 $SIG{__WARN__} = sub { $warning_message = $_[0] };
16196
16197 $Tests++;
16198
16199 # A string eval is needed because of the 'no warnings'.
16200 # Assumes no parens in the regular expression
16201 my $result = eval "$no_warnings
16202 my \$RegObj = qr($regex);
16203 $string =~ \$RegObj ? 1 : 0";
16204 if (not defined $result) {
16205 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
16206 $Fails++;
16207 }
16208 elsif ($result ^ $expected) {
16209 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
16210 $Fails++;
16211 }
16212 elsif ($warning_message) {
16213 if (! $warning_type || ($warning_type && $no_warnings)) {
16214 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
16215 $Fails++;
16216 }
16217 else {
16218 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
16219 }
16220 }
16221 elsif ($warning_type && ! $no_warnings) {
16222 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
16223 $Fails++;
16224 }
16225 else {
16226 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
16227 }
5beb625e 16228 }
99870f4d 16229 return;
5beb625e 16230}
d73e5302 16231
99870f4d
KW
16232sub Error($) {
16233 my $regex = shift;
5beb625e 16234 $Tests++;
99870f4d 16235 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 16236 $Fails++;
99870f4d
KW
16237 my $line = (caller)[2];
16238 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 16239 }
99870f4d
KW
16240 else {
16241 my $line = (caller)[2];
16242 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
16243 }
16244 return;
5beb625e
JH
16245}
16246
37e2e78e
KW
16247# GCBTest.txt character that separates grapheme clusters
16248my $breakable_utf8 = my $breakable = chr(0xF7);
16249utf8::upgrade($breakable_utf8);
16250
16251# GCBTest.txt character that indicates that the adjoining code points are part
16252# of the same grapheme cluster
16253my $nobreak_utf8 = my $nobreak = chr(0xD7);
16254utf8::upgrade($nobreak_utf8);
16255
16256sub Test_X($) {
16257 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
16258 # Each such line is a sequence of code points given by their hex numbers,
16259 # separated by the two characters defined just before this subroutine that
16260 # indicate that either there can or cannot be a break between the adjacent
16261 # code points. If there isn't a break, that means the sequence forms an
16262 # extended grapheme cluster, which means that \X should match the whole
16263 # thing. If there is a break, \X should stop there. This is all
16264 # converted by this routine into a match:
16265 # $string =~ /(\X)/,
16266 # Each \X should match the next cluster; and that is what is checked.
16267
16268 my $template = shift;
16269
16270 my $line = (caller)[2];
16271
16272 # The line contains characters above the ASCII range, but in Latin1. It
16273 # may or may not be in utf8, and if it is, it may or may not know it. So,
16274 # convert these characters to 8 bits. If knows is in utf8, simply
16275 # downgrade.
16276 if (utf8::is_utf8($template)) {
16277 utf8::downgrade($template);
16278 } else {
16279
16280 # Otherwise, if it is in utf8, but doesn't know it, the next lines
16281 # convert the two problematic characters to their 8-bit equivalents.
16282 # If it isn't in utf8, they don't harm anything.
16283 use bytes;
16284 $template =~ s/$nobreak_utf8/$nobreak/g;
16285 $template =~ s/$breakable_utf8/$breakable/g;
16286 }
16287
16288 # Get rid of the leading and trailing breakables
16289 $template =~ s/^ \s* $breakable \s* //x;
16290 $template =~ s/ \s* $breakable \s* $ //x;
16291
16292 # And no-breaks become just a space.
16293 $template =~ s/ \s* $nobreak \s* / /xg;
16294
16295 # Split the input into segments that are breakable between them.
16296 my @segments = split /\s*$breakable\s*/, $template;
16297
16298 my $string = "";
16299 my $display_string = "";
16300 my @should_match;
16301 my @should_display;
16302
16303 # Convert the code point sequence in each segment into a Perl string of
16304 # characters
16305 foreach my $segment (@segments) {
16306 my @code_points = split /\s+/, $segment;
16307 my $this_string = "";
16308 my $this_display = "";
16309 foreach my $code_point (@code_points) {
66fd7fd0 16310 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
16311 $this_display .= "\\x{$code_point}";
16312 }
16313
16314 # The next cluster should match the string in this segment.
16315 push @should_match, $this_string;
16316 push @should_display, $this_display;
16317 $string .= $this_string;
16318 $display_string .= $this_display;
16319 }
16320
16321 # If a string can be represented in both non-ut8 and utf8, test both cases
16322 UPGRADE:
16323 for my $to_upgrade (0 .. 1) {
678f13d5 16324
37e2e78e
KW
16325 if ($to_upgrade) {
16326
16327 # If already in utf8, would just be a repeat
16328 next UPGRADE if utf8::is_utf8($string);
16329
16330 utf8::upgrade($string);
16331 }
16332
16333 # Finally, do the \X match.
16334 my @matches = $string =~ /(\X)/g;
16335
16336 # Look through each matched cluster to verify that it matches what we
16337 # expect.
16338 my $min = (@matches < @should_match) ? @matches : @should_match;
16339 for my $i (0 .. $min - 1) {
16340 $Tests++;
16341 if ($matches[$i] eq $should_match[$i]) {
16342 print "ok $Tests - ";
16343 if ($i == 0) {
16344 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
16345 } else {
16346 print "And \\X #", $i + 1,
16347 }
16348 print " correctly matched $should_display[$i]; line $line\n";
16349 } else {
16350 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
16351 unpack("U*", $matches[$i]));
16352 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
16353 $i + 1,
16354 " should have matched $should_display[$i]",
16355 " but instead matched $matches[$i]",
16356 ". Abandoning rest of line $line\n";
16357 next UPGRADE;
16358 }
16359 }
16360
16361 # And the number of matches should equal the number of expected matches.
16362 $Tests++;
16363 if (@matches == @should_match) {
16364 print "ok $Tests - Nothing was left over; line $line\n";
16365 } else {
16366 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
16367 }
16368 }
16369
16370 return;
16371}
16372
99870f4d 16373sub Finished() {
f86864ac 16374 print "1..$Tests\n";
99870f4d 16375 exit($Fails ? -1 : 0);
5beb625e 16376}
99870f4d
KW
16377
16378Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 16379Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
16380Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
16381Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 16382Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726