This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Don't generate swash info unnecessarily
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a 19my $start_time;
98dc9551 20BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
21 # get it as close as possible
22 $start_time= time;
23}
24
23e33b60 25require 5.010_001;
d73e5302 26use strict;
99870f4d 27use warnings;
cf25bb62 28use Carp;
bd9ebcfd 29use Config;
99870f4d
KW
30use File::Find;
31use File::Path;
d07a55ed 32use File::Spec;
99870f4d
KW
33use Text::Tabs;
34
35sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 36my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d
KW
37
38##########################################################################
39#
40# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
41# from the Unicode database files (lib/unicore/.../*.txt), It also generates
42# a pod file and a .t file
43#
44# The structure of this file is:
45# First these introductory comments; then
46# code needed for everywhere, such as debugging stuff; then
47# code to handle input parameters; then
48# data structures likely to be of external interest (some of which depend on
49# the input parameters, so follows them; then
50# more data structures and subroutine and package (class) definitions; then
51# the small actual loop to process the input files and finish up; then
52# a __DATA__ section, for the .t tests
53#
5f7264c7 54# This program works on all releases of Unicode through at least 6.0. The
99870f4d
KW
55# outputs have been scrutinized most intently for release 5.1. The others
56# have been checked for somewhat more than just sanity. It can handle all
57# existing Unicode character properties in those releases.
58#
99870f4d
KW
59# This program is mostly about Unicode character (or code point) properties.
60# A property describes some attribute or quality of a code point, like if it
61# is lowercase or not, its name, what version of Unicode it was first defined
62# in, or what its uppercase equivalent is. Unicode deals with these disparate
63# possibilities by making all properties into mappings from each code point
64# into some corresponding value. In the case of it being lowercase or not,
65# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
66# property maps each Unicode code point to a single value, called a "property
67# value". (Hence each Unicode property is a true mathematical function with
68# exactly one value per code point.)
69#
70# When using a property in a regular expression, what is desired isn't the
71# mapping of the code point to its property's value, but the reverse (or the
72# mathematical "inverse relation"): starting with the property value, "Does a
73# code point map to it?" These are written in a "compound" form:
74# \p{property=value}, e.g., \p{category=punctuation}. This program generates
75# files containing the lists of code points that map to each such regular
76# expression property value, one file per list
77#
78# There is also a single form shortcut that Perl adds for many of the commonly
79# used properties. This happens for all binary properties, plus script,
80# general_category, and block properties.
81#
82# Thus the outputs of this program are files. There are map files, mostly in
83# the 'To' directory; and there are list files for use in regular expression
84# matching, all in subdirectories of the 'lib' directory, with each
85# subdirectory being named for the property that the lists in it are for.
86# Bookkeeping, test, and documentation files are also generated.
87
88my $matches_directory = 'lib'; # Where match (\p{}) files go.
89my $map_directory = 'To'; # Where map files go.
90
91# DATA STRUCTURES
92#
93# The major data structures of this program are Property, of course, but also
94# Table. There are two kinds of tables, very similar to each other.
95# "Match_Table" is the data structure giving the list of code points that have
96# a particular property value, mentioned above. There is also a "Map_Table"
97# data structure which gives the property's mapping from code point to value.
98# There are two structures because the match tables need to be combined in
99# various ways, such as constructing unions, intersections, complements, etc.,
100# and the map ones don't. And there would be problems, perhaps subtle, if
101# a map table were inadvertently operated on in some of those ways.
102# The use of separate classes with operations defined on one but not the other
103# prevents accidentally confusing the two.
104#
105# At the heart of each table's data structure is a "Range_List", which is just
106# an ordered list of "Ranges", plus ancillary information, and methods to
107# operate on them. A Range is a compact way to store property information.
108# Each range has a starting code point, an ending code point, and a value that
109# is meant to apply to all the code points between the two end points,
110# inclusive. For a map table, this value is the property value for those
111# code points. Two such ranges could be written like this:
112# 0x41 .. 0x5A, 'Upper',
113# 0x61 .. 0x7A, 'Lower'
114#
115# Each range also has a type used as a convenience to classify the values.
116# Most ranges in this program will be Type 0, or normal, but there are some
117# ranges that have a non-zero type. These are used only in map tables, and
118# are for mappings that don't fit into the normal scheme of things. Mappings
119# that require a hash entry to communicate with utf8.c are one example;
120# another example is mappings for charnames.pm to use which indicate a name
121# that is algorithmically determinable from its code point (and vice-versa).
122# These are used to significantly compact these tables, instead of listing
123# each one of the tens of thousands individually.
124#
125# In a match table, the value of a range is irrelevant (and hence the type as
126# well, which will always be 0), and arbitrarily set to the null string.
127# Using the example above, there would be two match tables for those two
128# entries, one named Upper would contain the 0x41..0x5A range, and the other
129# named Lower would contain 0x61..0x7A.
130#
131# Actually, there are two types of range lists, "Range_Map" is the one
132# associated with map tables, and "Range_List" with match tables.
133# Again, this is so that methods can be defined on one and not the other so as
134# to prevent operating on them in incorrect ways.
135#
136# Eventually, most tables are written out to files to be read by utf8_heavy.pl
137# in the perl core. All tables could in theory be written, but some are
138# suppressed because there is no current practical use for them. It is easy
139# to change which get written by changing various lists that are near the top
140# of the actual code in this file. The table data structures contain enough
141# ancillary information to allow them to be treated as separate entities for
142# writing, such as the path to each one's file. There is a heading in each
143# map table that gives the format of its entries, and what the map is for all
144# the code points missing from it. (This allows tables to be more compact.)
678f13d5 145#
99870f4d
KW
146# The Property data structure contains one or more tables. All properties
147# contain a map table (except the $perl property which is a
148# pseudo-property containing only match tables), and any properties that
149# are usable in regular expression matches also contain various matching
150# tables, one for each value the property can have. A binary property can
151# have two values, True and False (or Y and N, which are preferred by Unicode
152# terminology). Thus each of these properties will have a map table that
153# takes every code point and maps it to Y or N (but having ranges cuts the
154# number of entries in that table way down), and two match tables, one
155# which has a list of all the code points that map to Y, and one for all the
156# code points that map to N. (For each of these, a third table is also
157# generated for the pseudo Perl property. It contains the identical code
158# points as the Y table, but can be written, not in the compound form, but in
159# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
160# properties have several possible values, some have many, and properties like
161# Name have a different value for every named code point. Those will not,
162# unless the controlling lists are changed, have their match tables written
163# out. But all the ones which can be used in regular expression \p{} and \P{}
c12f2655
KW
164# constructs will. Prior to 5.14, generally a property would have either its
165# map table or its match tables written but not both. Again, what gets
166# written is controlled by lists which can easily be changed. Starting in
167# 5.14, advantage was taken of this, and all the map tables needed to
168# reconstruct the Unicode db are now written out, while suppressing the
169# Unicode .txt files that contain the data. Our tables are much more compact
170# than the .txt files, so a significant space savings was achieved.
171
172# Properties have a 'Type', like binary, or string, or enum depending on how
173# many match tables there are and the content of the maps. This 'Type' is
174# different than a range 'Type', so don't get confused by the two concepts
175# having the same name.
678f13d5 176#
99870f4d
KW
177# For information about the Unicode properties, see Unicode's UAX44 document:
178
179my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
180
181# As stated earlier, this program will work on any release of Unicode so far.
182# Most obvious problems in earlier data have NOT been corrected except when
183# necessary to make Perl or this program work reasonably. For example, no
c12f2655
KW
184# folding information was given in early releases, so this program substitutes
185# lower case instead, just so that a regular expression with the /i option
186# will do something that actually gives the right results in many cases.
187# There are also a couple other corrections for version 1.1.5, commented at
188# the point they are made. As an example of corrections that weren't made
189# (but could be) is this statement from DerivedAge.txt: "The supplementary
190# private use code points and the non-character code points were assigned in
191# version 2.0, but not specifically listed in the UCD until versions 3.0 and
192# 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
193# on Unicode version glitches is further down in these introductory comments.
99870f4d 194#
5f7264c7
KW
195# This program works on all non-provisional properties as of 6.0, though the
196# files for some are suppressed from apparent lack of demand for them. You
197# can change which are output by changing lists in this program.
678f13d5 198#
dc85bd38 199# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
200# loose matchings rules (from Unicode TR18):
201#
202# The recommended names for UCD properties and property values are in
203# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
204# [PropValue]. There are both abbreviated names and longer, more
205# descriptive names. It is strongly recommended that both names be
206# recognized, and that loose matching of property names be used,
207# whereby the case distinctions, whitespace, hyphens, and underbar
208# are ignored.
209# The program still allows Fuzzy to override its determination of if loose
210# matching should be used, but it isn't currently used, as it is no longer
211# needed; the calculations it makes are good enough.
678f13d5 212#
99870f4d
KW
213# SUMMARY OF HOW IT WORKS:
214#
215# Process arguments
216#
217# A list is constructed containing each input file that is to be processed
218#
219# Each file on the list is processed in a loop, using the associated handler
220# code for each:
221# The PropertyAliases.txt and PropValueAliases.txt files are processed
222# first. These files name the properties and property values.
223# Objects are created of all the property and property value names
224# that the rest of the input should expect, including all synonyms.
225# The other input files give mappings from properties to property
226# values. That is, they list code points and say what the mapping
227# is under the given property. Some files give the mappings for
228# just one property; and some for many. This program goes through
229# each file and populates the properties from them. Some properties
230# are listed in more than one file, and Unicode has set up a
231# precedence as to which has priority if there is a conflict. Thus
232# the order of processing matters, and this program handles the
233# conflict possibility by processing the overriding input files
234# last, so that if necessary they replace earlier values.
235# After this is all done, the program creates the property mappings not
236# furnished by Unicode, but derivable from what it does give.
237# The tables of code points that match each property value in each
238# property that is accessible by regular expressions are created.
239# The Perl-defined properties are created and populated. Many of these
240# require data determined from the earlier steps
241# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 242# and Unicode are reconciled and warned about.
99870f4d
KW
243# All the properties are written to files
244# Any other files are written, and final warnings issued.
678f13d5 245#
99870f4d
KW
246# For clarity, a number of operators have been overloaded to work on tables:
247# ~ means invert (take all characters not in the set). The more
248# conventional '!' is not used because of the possibility of confusing
249# it with the actual boolean operation.
250# + means union
251# - means subtraction
252# & means intersection
253# The precedence of these is the order listed. Parentheses should be
254# copiously used. These are not a general scheme. The operations aren't
255# defined for a number of things, deliberately, to avoid getting into trouble.
256# Operations are done on references and affect the underlying structures, so
257# that the copy constructors for them have been overloaded to not return a new
258# clone, but the input object itself.
678f13d5 259#
99870f4d
KW
260# The bool operator is deliberately not overloaded to avoid confusion with
261# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
262#
263# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
264#
265# This program needs to be able to run under miniperl. Therefore, it uses a
266# minimum of other modules, and hence implements some things itself that could
267# be gotten from CPAN
268#
269# This program uses inputs published by the Unicode Consortium. These can
270# change incompatibly between releases without the Perl maintainers realizing
271# it. Therefore this program is now designed to try to flag these. It looks
272# at the directories where the inputs are, and flags any unrecognized files.
273# It keeps track of all the properties in the files it handles, and flags any
274# that it doesn't know how to handle. It also flags any input lines that
275# don't match the expected syntax, among other checks.
276#
277# It is also designed so if a new input file matches one of the known
278# templates, one hopefully just needs to add it to a list to have it
279# processed.
280#
281# As mentioned earlier, some properties are given in more than one file. In
282# particular, the files in the extracted directory are supposedly just
283# reformattings of the others. But they contain information not easily
284# derivable from the other files, including results for Unihan, which this
285# program doesn't ordinarily look at, and for unassigned code points. They
286# also have historically had errors or been incomplete. In an attempt to
287# create the best possible data, this program thus processes them first to
288# glean information missing from the other files; then processes those other
289# files to override any errors in the extracted ones. Much of the design was
290# driven by this need to store things and then possibly override them.
291#
292# It tries to keep fatal errors to a minimum, to generate something usable for
293# testing purposes. It always looks for files that could be inputs, and will
294# warn about any that it doesn't know how to handle (the -q option suppresses
295# the warning).
99870f4d 296#
678f13d5
KW
297# Why is there more than one type of range?
298# This simplified things. There are some very specialized code points that
299# have to be handled specially for output, such as Hangul syllable names.
300# By creating a range type (done late in the development process), it
301# allowed this to be stored with the range, and overridden by other input.
302# Originally these were stored in another data structure, and it became a
303# mess trying to decide if a second file that was for the same property was
304# overriding the earlier one or not.
305#
306# Why are there two kinds of tables, match and map?
307# (And there is a base class shared by the two as well.) As stated above,
308# they actually are for different things. Development proceeded much more
309# smoothly when I (khw) realized the distinction. Map tables are used to
310# give the property value for every code point (actually every code point
311# that doesn't map to a default value). Match tables are used for regular
312# expression matches, and are essentially the inverse mapping. Separating
313# the two allows more specialized methods, and error checks so that one
314# can't just take the intersection of two map tables, for example, as that
315# is nonsensical.
99870f4d 316#
23e33b60
KW
317# DEBUGGING
318#
678f13d5
KW
319# This program is written so it will run under miniperl. Occasionally changes
320# will cause an error where the backtrace doesn't work well under miniperl.
321# To diagnose the problem, you can instead run it under regular perl, if you
322# have one compiled.
323#
324# There is a good trace facility. To enable it, first sub DEBUG must be set
325# to return true. Then a line like
326#
327# local $to_trace = 1 if main::DEBUG;
328#
329# can be added to enable tracing in its lexical scope or until you insert
330# another line:
331#
332# local $to_trace = 0 if main::DEBUG;
333#
334# then use a line like "trace $a, @b, %c, ...;
335#
336# Some of the more complex subroutines already have trace statements in them.
337# Permanent trace statements should be like:
338#
339# trace ... if main::DEBUG && $to_trace;
340#
341# If there is just one or a few files that you're debugging, you can easily
342# cause most everything else to be skipped. Change the line
343#
344# my $debug_skip = 0;
345#
346# to 1, and every file whose object is in @input_file_objects and doesn't have
347# a, 'non_skip => 1,' in its constructor will be skipped.
348#
b4a0206c 349# To compare the output tables, it may be useful to specify the -annotate
c4019d52
KW
350# flag. This causes the tables to expand so there is one entry for each
351# non-algorithmically named code point giving, currently its name, and its
352# graphic representation if printable (and you have a font that knows about
353# it). This makes it easier to see what the particular code points are in
354# each output table. The tables are usable, but because they don't have
355# ranges (for the most part), a Perl using them will run slower. Non-named
356# code points are annotated with a description of their status, and contiguous
357# ones with the same description will be output as a range rather than
358# individually. Algorithmically named characters are also output as ranges,
359# except when there are just a few contiguous ones.
360#
99870f4d
KW
361# FUTURE ISSUES
362#
363# The program would break if Unicode were to change its names so that
364# interior white space, underscores, or dashes differences were significant
365# within property and property value names.
366#
367# It might be easier to use the xml versions of the UCD if this program ever
368# would need heavy revision, and the ability to handle old versions was not
369# required.
370#
371# There is the potential for name collisions, in that Perl has chosen names
372# that Unicode could decide it also likes. There have been such collisions in
373# the past, with mostly Perl deciding to adopt the Unicode definition of the
374# name. However in the 5.2 Unicode beta testing, there were a number of such
375# collisions, which were withdrawn before the final release, because of Perl's
376# and other's protests. These all involved new properties which began with
377# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
378# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
379# Unicode document, so they are unlikely to be used by Unicode for another
380# purpose. However, they might try something beginning with 'In', or use any
381# of the other Perl-defined properties. This program will warn you of name
382# collisions, and refuse to generate tables with them, but manual intervention
383# will be required in this event. One scheme that could be implemented, if
384# necessary, would be to have this program generate another file, or add a
385# field to mktables.lst that gives the date of first definition of a property.
386# Each new release of Unicode would use that file as a basis for the next
387# iteration. And the Perl synonym addition code could sort based on the age
388# of the property, so older properties get priority, and newer ones that clash
389# would be refused; hence existing code would not be impacted, and some other
390# synonym would have to be used for the new property. This is ugly, and
391# manual intervention would certainly be easier to do in the short run; lets
392# hope it never comes to this.
678f13d5 393#
99870f4d
KW
394# A NOTE ON UNIHAN
395#
396# This program can generate tables from the Unihan database. But it doesn't
397# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
398# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
399# database was split into 8 different files, all beginning with the letters
400# 'Unihan'. This program will read those file(s) if present, but it needs to
401# know which of the many properties in the file(s) should have tables created
402# for them. It will create tables for any properties listed in
403# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
404# @cjk_properties array and the @cjk_property_values array. Thus, if a
405# property you want is not in those files of the release you are building
406# against, you must add it to those two arrays. Starting in 4.0, the
407# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
408# is present in the directory, a table will be generated for that property.
409# In 5.2, several more properties were added. For your convenience, the two
5f7264c7 410# arrays are initialized with all the 6.0 listed properties that are also in
99870f4d
KW
411# earlier releases. But these are commented out. You can just uncomment the
412# ones you want, or use them as a template for adding entries for other
413# properties.
414#
415# You may need to adjust the entries to suit your purposes. setup_unihan(),
416# and filter_unihan_line() are the functions where this is done. This program
417# already does some adjusting to make the lines look more like the rest of the
418# Unicode DB; You can see what that is in filter_unihan_line()
419#
420# There is a bug in the 3.2 data file in which some values for the
421# kPrimaryNumeric property have commas and an unexpected comment. A filter
422# could be added for these; or for a particular installation, the Unihan.txt
423# file could be edited to fix them.
99870f4d 424#
678f13d5
KW
425# HOW TO ADD A FILE TO BE PROCESSED
426#
427# A new file from Unicode needs to have an object constructed for it in
428# @input_file_objects, probably at the end or at the end of the extracted
429# ones. The program should warn you if its name will clash with others on
430# restrictive file systems, like DOS. If so, figure out a better name, and
431# add lines to the README.perl file giving that. If the file is a character
432# property, it should be in the format that Unicode has by default
433# standardized for such files for the more recently introduced ones.
434# If so, the Input_file constructor for @input_file_objects can just be the
435# file name and release it first appeared in. If not, then it should be
436# possible to construct an each_line_handler() to massage the line into the
437# standardized form.
438#
439# For non-character properties, more code will be needed. You can look at
440# the existing entries for clues.
441#
442# UNICODE VERSIONS NOTES
443#
444# The Unicode UCD has had a number of errors in it over the versions. And
445# these remain, by policy, in the standard for that version. Therefore it is
446# risky to correct them, because code may be expecting the error. So this
447# program doesn't generally make changes, unless the error breaks the Perl
448# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
449# for U+1105, which causes real problems for the algorithms for Jamo
450# calculations, so it is changed here.
451#
452# But it isn't so clear cut as to what to do about concepts that are
453# introduced in a later release; should they extend back to earlier releases
454# where the concept just didn't exist? It was easier to do this than to not,
455# so that's what was done. For example, the default value for code points not
456# in the files for various properties was probably undefined until changed by
457# some version. No_Block for blocks is such an example. This program will
458# assign No_Block even in Unicode versions that didn't have it. This has the
459# benefit that code being written doesn't have to special case earlier
460# versions; and the detriment that it doesn't match the Standard precisely for
461# the affected versions.
462#
463# Here are some observations about some of the issues in early versions:
464#
6426c51b 465# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
678f13d5
KW
466# the reason is that the CJK block starting at 4E00 was removed from PropList,
467# and was not put back in until 3.1.0
468#
469# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
470# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
471# reason is that 3.2 introduced U+205F=medium math space, which was not
472# classed as white space, but Perl figured out that it should have been. 4.0
473# reclassified it correctly.
474#
475# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
476# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
477# was left with no code points, as all the ones that mapped to 202 stayed
478# mapped to 202. Thus if your program used the numeric name for the class,
479# it would not have been affected, but if it used the mnemonic, it would have
480# been.
481#
482# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
483# points which eventually came to have this script property value, instead
484# mapped to "Unknown". But in the next release all these code points were
485# moved to \p{sc=common} instead.
99870f4d
KW
486#
487# The default for missing code points for BidiClass is complicated. Starting
488# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
489# tries to do the best it can for earlier releases. It is done in
490# process_PropertyAliases()
491#
492##############################################################################
493
494my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
495 # and errors
496my $MAX_LINE_WIDTH = 78;
497
498# Debugging aid to skip most files so as to not be distracted by them when
499# concentrating on the ones being debugged. Add
500# non_skip => 1,
501# to the constructor for those files you want processed when you set this.
502# Files with a first version number of 0 are special: they are always
c12f2655
KW
503# processed regardless of the state of this flag. Generally, Jamo.txt and
504# UnicodeData.txt must not be skipped if you want this program to not die
505# before normal completion.
99870f4d
KW
506my $debug_skip = 0;
507
508# Set to 1 to enable tracing.
509our $to_trace = 0;
510
511{ # Closure for trace: debugging aid
512 my $print_caller = 1; # ? Include calling subroutine name
513 my $main_with_colon = 'main::';
514 my $main_colon_length = length($main_with_colon);
515
516 sub trace {
517 return unless $to_trace; # Do nothing if global flag not set
518
519 my @input = @_;
520
521 local $DB::trace = 0;
522 $DB::trace = 0; # Quiet 'used only once' message
523
524 my $line_number;
525
526 # Loop looking up the stack to get the first non-trace caller
527 my $caller_line;
528 my $caller_name;
529 my $i = 0;
530 do {
531 $line_number = $caller_line;
532 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
533 $caller = $main_with_colon unless defined $caller;
534
535 $caller_name = $caller;
536
537 # get rid of pkg
538 $caller_name =~ s/.*:://;
539 if (substr($caller_name, 0, $main_colon_length)
540 eq $main_with_colon)
541 {
542 $caller_name = substr($caller_name, $main_colon_length);
543 }
544
545 } until ($caller_name ne 'trace');
546
547 # If the stack was empty, we were called from the top level
548 $caller_name = 'main' if ($caller_name eq ""
549 || $caller_name eq 'trace');
550
551 my $output = "";
552 foreach my $string (@input) {
553 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
554 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
555 $output .= simple_dumper($string);
556 }
557 else {
558 $string = "$string" if ref $string;
559 $string = $UNDEF unless defined $string;
560 chomp $string;
561 $string = '""' if $string eq "";
562 $output .= " " if $output ne ""
563 && $string ne ""
564 && substr($output, -1, 1) ne " "
565 && substr($string, 0, 1) ne " ";
566 $output .= $string;
567 }
568 }
569
99f78760
KW
570 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
571 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
572 print STDERR $output, "\n";
573 return;
574 }
575}
576
577# This is for a rarely used development feature that allows you to compare two
578# versions of the Unicode standard without having to deal with changes caused
c12f2655
KW
579# by the code points introduced in the later version. Change the 0 to a
580# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
581# code points introduced in that release and earlier will be used; later ones
582# are thrown away. You use the version number of the earliest one you want to
583# compare; then run this program on directory structures containing each
584# release, and compare the outputs. These outputs will therefore include only
585# the code points common to both releases, and you can see the changes caused
586# just by the underlying release semantic changes. For versions earlier than
587# 3.2, you must copy a version of DAge.txt into the directory.
588my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
99870f4d
KW
589my $compare_versions = DEBUG
590 && $string_compare_versions
591 && pack "C*", split /\./, $string_compare_versions;
592
593sub uniques {
594 # Returns non-duplicated input values. From "Perl Best Practices:
595 # Encapsulated Cleverness". p. 455 in first edition.
596
597 my %seen;
0e407844
NC
598 # Arguably this breaks encapsulation, if the goal is to permit multiple
599 # distinct objects to stringify to the same value, and be interchangeable.
600 # However, for this program, no two objects stringify identically, and all
601 # lists passed to this function are either objects or strings. So this
602 # doesn't affect correctness, but it does give a couple of percent speedup.
603 no overloading;
99870f4d
KW
604 return grep { ! $seen{$_}++ } @_;
605}
606
607$0 = File::Spec->canonpath($0);
608
609my $make_test_script = 0; # ? Should we output a test script
610my $write_unchanged_files = 0; # ? Should we update the output files even if
611 # we don't think they have changed
612my $use_directory = ""; # ? Should we chdir somewhere.
613my $pod_directory; # input directory to store the pod file.
614my $pod_file = 'perluniprops';
615my $t_path; # Path to the .t test file
616my $file_list = 'mktables.lst'; # File to store input and output file names.
617 # This is used to speed up the build, by not
618 # executing the main body of the program if
619 # nothing on the list has changed since the
620 # previous build
621my $make_list = 1; # ? Should we write $file_list. Set to always
622 # make a list so that when the pumpking is
623 # preparing a release, s/he won't have to do
624 # special things
625my $glob_list = 0; # ? Should we try to include unknown .txt files
626 # in the input.
bd9ebcfd
KW
627my $output_range_counts = $debugging_build; # ? Should we include the number
628 # of code points in ranges in
629 # the output
558712cf 630my $annotate = 0; # ? Should character names be in the output
9ef2b94f 631
99870f4d
KW
632# Verbosity levels; 0 is quiet
633my $NORMAL_VERBOSITY = 1;
634my $PROGRESS = 2;
635my $VERBOSE = 3;
636
637my $verbosity = $NORMAL_VERBOSITY;
638
639# Process arguments
640while (@ARGV) {
cf25bb62
JH
641 my $arg = shift @ARGV;
642 if ($arg eq '-v') {
99870f4d
KW
643 $verbosity = $VERBOSE;
644 }
645 elsif ($arg eq '-p') {
646 $verbosity = $PROGRESS;
647 $| = 1; # Flush buffers as we go.
648 }
649 elsif ($arg eq '-q') {
650 $verbosity = 0;
651 }
652 elsif ($arg eq '-w') {
653 $write_unchanged_files = 1; # update the files even if havent changed
654 }
655 elsif ($arg eq '-check') {
6ae7e459
YO
656 my $this = shift @ARGV;
657 my $ok = shift @ARGV;
658 if ($this ne $ok) {
659 print "Skipping as check params are not the same.\n";
660 exit(0);
661 }
00a8df5c 662 }
99870f4d
KW
663 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
664 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
665 }
3df51b85
KW
666 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
667 {
99870f4d 668 $make_test_script = 1;
99870f4d
KW
669 }
670 elsif ($arg eq '-makelist') {
671 $make_list = 1;
672 }
673 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
674 -d $use_directory or croak "Unknown directory '$use_directory'";
675 }
676 elsif ($arg eq '-L') {
677
678 # Existence not tested until have chdir'd
679 $file_list = shift;
680 }
681 elsif ($arg eq '-globlist') {
682 $glob_list = 1;
683 }
684 elsif ($arg eq '-c') {
685 $output_range_counts = ! $output_range_counts
686 }
b4a0206c 687 elsif ($arg eq '-annotate') {
558712cf 688 $annotate = 1;
bd9ebcfd
KW
689 $debugging_build = 1;
690 $output_range_counts = 1;
9ef2b94f 691 }
99870f4d
KW
692 else {
693 my $with_c = 'with';
694 $with_c .= 'out' if $output_range_counts; # Complements the state
695 croak <<END;
696usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
697 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
698 [-check A B ]
699 -c : Output comments $with_c number of code points in ranges
700 -q : Quiet Mode: Only output serious warnings.
701 -p : Set verbosity level to normal plus show progress.
702 -v : Set Verbosity level high: Show progress and non-serious
703 warnings
704 -w : Write files regardless
705 -C dir : Change to this directory before proceeding. All relative paths
706 except those specified by the -P and -T options will be done
707 with respect to this directory.
708 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 709 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
710 -L filelist : Use alternate 'filelist' instead of standard one
711 -globlist : Take as input all non-Test *.txt files in current and sub
712 directories
3df51b85
KW
713 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
714 overrides -T
99870f4d 715 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 716 -annotate : Output an annotation for each character in the table files;
c4019d52
KW
717 useful for debugging mktables, looking at diffs; but is slow,
718 memory intensive; resulting tables are usable but slow and
719 very large.
99870f4d
KW
720 -check A B : Executes $0 only if A and B are the same
721END
722 }
723}
724
725# Stores the most-recently changed file. If none have changed, can skip the
726# build
aeab6150 727my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
728
729# Change directories now, because need to read 'version' early.
730if ($use_directory) {
3df51b85 731 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
732 $pod_directory = File::Spec->rel2abs($pod_directory);
733 }
3df51b85 734 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 735 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 736 }
99870f4d 737 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 738 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 739 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 740 }
3df51b85 741 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 742 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 743 }
00a8df5c
YO
744}
745
99870f4d
KW
746# Get Unicode version into regular and v-string. This is done now because
747# various tables below get populated based on it. These tables are populated
748# here to be near the top of the file, and so easily seeable by those needing
749# to modify things.
750open my $VERSION, "<", "version"
751 or croak "$0: can't open required file 'version': $!\n";
752my $string_version = <$VERSION>;
753close $VERSION;
754chomp $string_version;
755my $v_version = pack "C*", split /\./, $string_version; # v string
756
757# The following are the complete names of properties with property values that
758# are known to not match any code points in some versions of Unicode, but that
759# may change in the future so they should be matchable, hence an empty file is
760# generated for them.
761my @tables_that_may_be_empty = (
762 'Joining_Type=Left_Joining',
763 );
764push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
765push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
766push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
767 if $v_version ge v4.1.0;
82aed44a
KW
768push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
769 if $v_version ge v6.0.0;
99870f4d
KW
770
771# The lists below are hashes, so the key is the item in the list, and the
772# value is the reason why it is in the list. This makes generation of
773# documentation easier.
774
775my %why_suppressed; # No file generated for these.
776
777# Files aren't generated for empty extraneous properties. This is arguable.
778# Extraneous properties generally come about because a property is no longer
779# used in a newer version of Unicode. If we generated a file without code
780# points, programs that used to work on that property will still execute
781# without errors. It just won't ever match (or will always match, with \P{}).
782# This means that the logic is now likely wrong. I (khw) think its better to
783# find this out by getting an error message. Just move them to the table
784# above to change this behavior
785my %why_suppress_if_empty_warn_if_not = (
786
787 # It is the only property that has ever officially been removed from the
788 # Standard. The database never contained any code points for it.
789 'Special_Case_Condition' => 'Obsolete',
790
791 # Apparently never official, but there were code points in some versions of
792 # old-style PropList.txt
793 'Non_Break' => 'Obsolete',
794);
795
796# These would normally go in the warn table just above, but they were changed
797# a long time before this program was written, so warnings about them are
798# moot.
799if ($v_version gt v3.2.0) {
800 push @tables_that_may_be_empty,
801 'Canonical_Combining_Class=Attached_Below_Left'
802}
803
5f7264c7 804# These are listed in the Property aliases file in 6.0, but Unihan is ignored
99870f4d
KW
805# unless explicitly added.
806if ($v_version ge v5.2.0) {
807 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 808 foreach my $table (qw (
99870f4d
KW
809 kAccountingNumeric
810 kOtherNumeric
811 kPrimaryNumeric
812 kCompatibilityVariant
813 kIICore
814 kIRG_GSource
815 kIRG_HSource
816 kIRG_JSource
817 kIRG_KPSource
818 kIRG_MSource
819 kIRG_KSource
820 kIRG_TSource
821 kIRG_USource
822 kIRG_VSource
823 kRSUnicode
ea25a9b2 824 ))
99870f4d
KW
825 {
826 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
827 }
ca12659b
NC
828}
829
272501f6
KW
830# Enum values for to_output_map() method in the Map_Table package.
831my $EXTERNAL_MAP = 1;
832my $INTERNAL_MAP = 2;
833
fcf1973c
KW
834# To override computed values for writing the map tables for these properties.
835# The default for enum map tables is to write them out, so that the Unicode
836# .txt files can be removed, but all the data to compute any property value
837# for any code point is available in a more compact form.
838my %global_to_output_map = (
839 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
c12f2655
KW
840 # get stuck supporting it if things change. Since it is a STRING
841 # property, it normally would be listed in the pod, but INTERNAL_MAP
842 # suppresses that.
fcf1973c
KW
843 Unicode_1_Name => $INTERNAL_MAP,
844
845 Present_In => 0, # Suppress, as easily computed from Age
fcf1973c 846 Block => 0, # Suppress, as Blocks.txt is retained.
53d34b6c
KW
847
848 # Suppress, as mapping can be found instead from the
849 # Perl_Decomposition_Mapping file
850 Decomposition_Type => 0,
fcf1973c
KW
851);
852
99870f4d 853# Properties that this program ignores.
230e0c16
KW
854my @unimplemented_properties;
855
856# With this release, it is automatically handled if the Unihan db is
857# downloaded
858push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
d73e5302 859
99870f4d
KW
860# There are several types of obsolete properties defined by Unicode. These
861# must be hand-edited for every new Unicode release.
862my %why_deprecated; # Generates a deprecated warning message if used.
863my %why_stabilized; # Documentation only
864my %why_obsolete; # Documentation only
865
866{ # Closure
867 my $simple = 'Perl uses the more complete version of this property';
868 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
869
870 my $other_properties = 'other properties';
871 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 872 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
873
874 %why_deprecated = (
5f7264c7 875 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
876 'Jamo_Short_Name' => $contributory,
877 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
878 'Other_Alphabetic' => $contributory,
879 'Other_Default_Ignorable_Code_Point' => $contributory,
880 'Other_Grapheme_Extend' => $contributory,
881 'Other_ID_Continue' => $contributory,
882 'Other_ID_Start' => $contributory,
883 'Other_Lowercase' => $contributory,
884 'Other_Math' => $contributory,
885 'Other_Uppercase' => $contributory,
e22aaf5c
KW
886 'Expands_On_NFC' => $why_no_expand,
887 'Expands_On_NFD' => $why_no_expand,
888 'Expands_On_NFKC' => $why_no_expand,
889 'Expands_On_NFKD' => $why_no_expand,
99870f4d
KW
890 );
891
892 %why_suppressed = (
5f7264c7 893 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
894 # contains the same information, but without the algorithmically
895 # determinable Hangul syllables'. This file is not published, so it's
896 # existence is not noted in the comment.
897 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
898
899 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
99870f4d
KW
900
901 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
902 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
903 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
904 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
905
906 'Name' => "Accessible via 'use charnames;'",
907 'Name_Alias' => "Accessible via 'use charnames;'",
908
5f7264c7 909 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
910 );
911
912 # The following are suppressed because they were made contributory or
913 # deprecated by Unicode before Perl ever thought about supporting them.
e22aaf5c
KW
914 foreach my $property ('Jamo_Short_Name',
915 'Grapheme_Link',
916 'Expands_On_NFC',
917 'Expands_On_NFD',
918 'Expands_On_NFKC',
919 'Expands_On_NFKD'
920 ) {
99870f4d
KW
921 $why_suppressed{$property} = $why_deprecated{$property};
922 }
cf25bb62 923
99870f4d
KW
924 # Customize the message for all the 'Other_' properties
925 foreach my $property (keys %why_deprecated) {
926 next if (my $main_property = $property) !~ s/^Other_//;
927 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
928 }
929}
930
931if ($v_version ge 4.0.0) {
932 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
933 if ($v_version ge 6.0.0) {
934 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
935 }
99870f4d 936}
5f7264c7 937if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 938 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7 939 if ($v_version ge 6.0.0) {
63f74647 940 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
5f7264c7 941 }
99870f4d
KW
942}
943
944# Probably obsolete forever
945if ($v_version ge v4.1.0) {
82aed44a
KW
946 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
947}
948if ($v_version ge v6.0.0) {
2b352efd
KW
949 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
950 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
99870f4d
KW
951}
952
953# This program can create files for enumerated-like properties, such as
954# 'Numeric_Type'. This file would be the same format as for a string
955# property, with a mapping from code point to its value, so you could look up,
956# for example, the script a code point is in. But no one so far wants this
957# mapping, or they have found another way to get it since this is a new
958# feature. So no file is generated except if it is in this list.
959my @output_mapped_properties = split "\n", <<END;
960END
961
c12f2655
KW
962# If you are using the Unihan database in a Unicode version before 5.2, you
963# need to add the properties that you want to extract from it to this table.
964# For your convenience, the properties in the 6.0 PropertyAliases.txt file are
965# listed, commented out
99870f4d
KW
966my @cjk_properties = split "\n", <<'END';
967#cjkAccountingNumeric; kAccountingNumeric
968#cjkOtherNumeric; kOtherNumeric
969#cjkPrimaryNumeric; kPrimaryNumeric
970#cjkCompatibilityVariant; kCompatibilityVariant
971#cjkIICore ; kIICore
972#cjkIRG_GSource; kIRG_GSource
973#cjkIRG_HSource; kIRG_HSource
974#cjkIRG_JSource; kIRG_JSource
975#cjkIRG_KPSource; kIRG_KPSource
976#cjkIRG_KSource; kIRG_KSource
977#cjkIRG_TSource; kIRG_TSource
978#cjkIRG_USource; kIRG_USource
979#cjkIRG_VSource; kIRG_VSource
980#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
981END
982
983# Similarly for the property values. For your convenience, the lines in the
5f7264c7 984# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
c12f2655 985# '#' marks (for Unicode versions before 5.2)
99870f4d
KW
986my @cjk_property_values = split "\n", <<'END';
987## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
988## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
989## @missing: 0000..10FFFF; cjkIICore; <none>
990## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
991## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
992## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
993## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
994## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
995## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
996## @missing: 0000..10FFFF; cjkIRG_USource; <none>
997## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
998## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
999## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1000## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1001END
1002
1003# The input files don't list every code point. Those not listed are to be
1004# defaulted to some value. Below are hard-coded what those values are for
1005# non-binary properties as of 5.1. Starting in 5.0, there are
1006# machine-parsable comment lines in the files the give the defaults; so this
1007# list shouldn't have to be extended. The claim is that all missing entries
1008# for binary properties will default to 'N'. Unicode tried to change that in
1009# 5.2, but the beta period produced enough protest that they backed off.
1010#
1011# The defaults for the fields that appear in UnicodeData.txt in this hash must
1012# be in the form that it expects. The others may be synonyms.
1013my $CODE_POINT = '<code point>';
1014my %default_mapping = (
1015 Age => "Unassigned",
1016 # Bidi_Class => Complicated; set in code
1017 Bidi_Mirroring_Glyph => "",
1018 Block => 'No_Block',
1019 Canonical_Combining_Class => 0,
1020 Case_Folding => $CODE_POINT,
1021 Decomposition_Mapping => $CODE_POINT,
1022 Decomposition_Type => 'None',
1023 East_Asian_Width => "Neutral",
1024 FC_NFKC_Closure => $CODE_POINT,
1025 General_Category => 'Cn',
1026 Grapheme_Cluster_Break => 'Other',
1027 Hangul_Syllable_Type => 'NA',
1028 ISO_Comment => "",
1029 Jamo_Short_Name => "",
1030 Joining_Group => "No_Joining_Group",
1031 # Joining_Type => Complicated; set in code
1032 kIICore => 'N', # Is converted to binary
1033 #Line_Break => Complicated; set in code
1034 Lowercase_Mapping => $CODE_POINT,
1035 Name => "",
1036 Name_Alias => "",
1037 NFC_QC => 'Yes',
1038 NFD_QC => 'Yes',
1039 NFKC_QC => 'Yes',
1040 NFKD_QC => 'Yes',
1041 Numeric_Type => 'None',
1042 Numeric_Value => 'NaN',
1043 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1044 Sentence_Break => 'Other',
1045 Simple_Case_Folding => $CODE_POINT,
1046 Simple_Lowercase_Mapping => $CODE_POINT,
1047 Simple_Titlecase_Mapping => $CODE_POINT,
1048 Simple_Uppercase_Mapping => $CODE_POINT,
1049 Titlecase_Mapping => $CODE_POINT,
1050 Unicode_1_Name => "",
1051 Unicode_Radical_Stroke => "",
1052 Uppercase_Mapping => $CODE_POINT,
1053 Word_Break => 'Other',
1054);
1055
1056# Below are files that Unicode furnishes, but this program ignores, and why
1057my %ignored_files = (
73ba1144
KW
1058 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1059 'Index.txt' => 'Alphabetical index of Unicode characters',
1060 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1061 'NamesList.txt' => 'Annotated list of characters',
1062 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1063 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1064 'ReadMe.txt' => 'Documentation',
1065 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1066 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1067 'IndicMatraCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
1068 'IndicSyllabicCategory.txt' => 'Provisional; for the analysis and processing of Indic scripts',
1069 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1070 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1071 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1072 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
99870f4d
KW
1073);
1074
678f13d5 1075### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1076
1077my $HEADER=<<"EOF";
1078# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1079# This file is machine-generated by $0 from the Unicode
1080# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1081EOF
1082
126c3d4e 1083my $INTERNAL_ONLY_HEADER = <<"EOF";
99870f4d
KW
1084
1085# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
fac53429
KW
1086# This file is for internal use by core Perl only. The format and even the
1087# name or existence of this file are subject to change without notice. Don't
1088# use it directly.
99870f4d
KW
1089EOF
1090
1091my $DEVELOPMENT_ONLY=<<"EOF";
1092# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1093# This file contains information artificially constrained to code points
1094# present in Unicode release $string_compare_versions.
1095# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1096# not be used for production.
b6922eda
KW
1097
1098EOF
1099
6189eadc
KW
1100my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1101my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1102my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
99870f4d
KW
1103
1104# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1105# two must be 10; if there are 5, the first must not be a 0. Written this way
92199589
KW
1106# to decrease backtracking. The first regex allows the code point to be at
1107# the end of a word, but to work properly, the word shouldn't end with a valid
1108# hex character. The second one won't match a code point at the end of a
1109# word, and doesn't have the run-on issue
8c32d378
KW
1110my $run_on_code_point_re =
1111 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1112my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1113
1114# This matches the beginning of the line in the Unicode db files that give the
1115# defaults for code points not listed (i.e., missing) in the file. The code
1116# depends on this ending with a semi-colon, so it can assume it is a valid
1117# field when the line is split() by semi-colons
1118my $missing_defaults_prefix =
6189eadc 1119 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
99870f4d
KW
1120
1121# Property types. Unicode has more types, but these are sufficient for our
1122# purposes.
1123my $UNKNOWN = -1; # initialized to illegal value
1124my $NON_STRING = 1; # Either binary or enum
1125my $BINARY = 2;
06f26c45
KW
1126my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1127 # tables, additional true and false tables are
1128 # generated so that false is anything matching the
1129 # default value, and true is everything else.
1130my $ENUM = 4; # Include catalog
1131my $STRING = 5; # Anything else: string or misc
99870f4d
KW
1132
1133# Some input files have lines that give default values for code points not
1134# contained in the file. Sometimes these should be ignored.
1135my $NO_DEFAULTS = 0; # Must evaluate to false
1136my $NOT_IGNORED = 1;
1137my $IGNORED = 2;
1138
1139# Range types. Each range has a type. Most ranges are type 0, for normal,
1140# and will appear in the main body of the tables in the output files, but
1141# there are other types of ranges as well, listed below, that are specially
1142# handled. There are pseudo-types as well that will never be stored as a
1143# type, but will affect the calculation of the type.
1144
1145# 0 is for normal, non-specials
1146my $MULTI_CP = 1; # Sequence of more than code point
1147my $HANGUL_SYLLABLE = 2;
1148my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1149my $NULL = 4; # The map is to the null string; utf8.c can't
1150 # handle these, nor is there an accepted syntax
1151 # for them in \p{} constructs
f86864ac 1152my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1153 # otherwise be $MULTI_CP type are instead type 0
1154
1155# process_generic_property_file() can accept certain overrides in its input.
1156# Each of these must begin AND end with $CMD_DELIM.
1157my $CMD_DELIM = "\a";
1158my $REPLACE_CMD = 'replace'; # Override the Replace
1159my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1160
1161my $NO = 0;
1162my $YES = 1;
1163
1164# Values for the Replace argument to add_range.
1165# $NO # Don't replace; add only the code points not
1166 # already present.
1167my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1168 # the comments at the subroutine definition.
1169my $UNCONDITIONALLY = 2; # Replace without conditions.
1170my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1171 # already there
56343c78 1172my $CROAK = 5; # Die with an error if is already there
99870f4d
KW
1173
1174# Flags to give property statuses. The phrases are to remind maintainers that
1175# if the flag is changed, the indefinite article referring to it in the
1176# documentation may need to be as well.
1177my $NORMAL = "";
99870f4d
KW
1178my $DEPRECATED = 'D';
1179my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1180my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1181my $DISCOURAGED = 'X';
1182my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1183my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1184my $STRICTER = 'T';
1185my $a_bold_stricter = "a 'B<$STRICTER>'";
1186my $A_bold_stricter = "A 'B<$STRICTER>'";
1187my $STABILIZED = 'S';
1188my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1189my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1190my $OBSOLETE = 'O';
1191my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1192my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1193
1194my %status_past_participles = (
1195 $DISCOURAGED => 'discouraged',
99870f4d
KW
1196 $STABILIZED => 'stabilized',
1197 $OBSOLETE => 'obsolete',
37e2e78e 1198 $DEPRECATED => 'deprecated',
99870f4d
KW
1199);
1200
301ba948
KW
1201# Table fates.
1202my $ORDINARY = 0; # The normal fate.
1203my $SUPPRESSED = 3; # The file for this table is not written out.
1204my $INTERNAL_ONLY = 4; # The file for this table is written out, but it is
1205 # for Perl's internal use only
1206my $PLACEHOLDER = 5; # A property that is defined as a placeholder in a
1207 # Unicode version that doesn't have it, but we need it
1208 # to be defined, if empty, to have things work.
1209 # Implies no pod entry generated
1210
f5817e0a
KW
1211# The format of the values of the tables:
1212my $EMPTY_FORMAT = "";
99870f4d
KW
1213my $BINARY_FORMAT = 'b';
1214my $DECIMAL_FORMAT = 'd';
1215my $FLOAT_FORMAT = 'f';
1216my $INTEGER_FORMAT = 'i';
1217my $HEX_FORMAT = 'x';
1218my $RATIONAL_FORMAT = 'r';
1219my $STRING_FORMAT = 's';
a14f3cb1 1220my $DECOMP_STRING_FORMAT = 'c';
c3ff2976 1221my $STRING_WHITE_SPACE_LIST = 'sw';
99870f4d
KW
1222
1223my %map_table_formats = (
1224 $BINARY_FORMAT => 'binary',
1225 $DECIMAL_FORMAT => 'single decimal digit',
1226 $FLOAT_FORMAT => 'floating point number',
1227 $INTEGER_FORMAT => 'integer',
add63c13 1228 $HEX_FORMAT => 'non-negative hex whole number; a code point',
99870f4d 1229 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1230 $STRING_FORMAT => 'string',
92f9d56c 1231 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
c3ff2976 1232 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
99870f4d
KW
1233);
1234
1235# Unicode didn't put such derived files in a separate directory at first.
1236my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1237my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1238my $AUXILIARY = 'auxiliary';
1239
1240# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1241my %loose_to_file_of; # loosely maps table names to their respective
1242 # files
1243my %stricter_to_file_of; # same; but for stricter mapping.
1244my %nv_floating_to_rational; # maps numeric values floating point numbers to
1245 # their rational equivalent
c12f2655
KW
1246my %loose_property_name_of; # Loosely maps (non_string) property names to
1247 # standard form
99870f4d 1248
d867ccfb
KW
1249# Most properties are immune to caseless matching, otherwise you would get
1250# nonsensical results, as properties are a function of a code point, not
1251# everything that is caselessly equivalent to that code point. For example,
1252# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1253# be true because 's' and 'S' are equivalent caselessly. However,
1254# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1255# extend that concept to those very few properties that are like this. Each
1256# such property will match the full range caselessly. They are hard-coded in
1257# the program; it's not worth trying to make it general as it's extremely
1258# unlikely that they will ever change.
1259my %caseless_equivalent_to;
1260
99870f4d
KW
1261# These constants names and values were taken from the Unicode standard,
1262# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1263# syllables. The '_string' versions are so generated tables can retain the
1264# hex format, which is the more familiar value
1265my $SBase_string = "0xAC00";
1266my $SBase = CORE::hex $SBase_string;
1267my $LBase_string = "0x1100";
1268my $LBase = CORE::hex $LBase_string;
1269my $VBase_string = "0x1161";
1270my $VBase = CORE::hex $VBase_string;
1271my $TBase_string = "0x11A7";
1272my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1273my $SCount = 11172;
1274my $LCount = 19;
1275my $VCount = 21;
1276my $TCount = 28;
1277my $NCount = $VCount * $TCount;
1278
1279# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1280# with the above published constants.
1281my %Jamo;
1282my %Jamo_L; # Leading consonants
1283my %Jamo_V; # Vowels
1284my %Jamo_T; # Trailing consonants
1285
bb1dd3da
KW
1286# For code points whose name contains its ordinal as a '-ABCD' suffix.
1287# The key is the base name of the code point, and the value is an
1288# array giving all the ranges that use this base name. Each range
1289# is actually a hash giving the 'low' and 'high' values of it.
1290my %names_ending_in_code_point;
1291my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1292 # removed from the names
1293# Inverse mapping. The list of ranges that have these kinds of
1294# names. Each element contains the low, high, and base names in an
1295# anonymous hash.
1296my @code_points_ending_in_code_point;
1297
1298# Boolean: does this Unicode version have the hangul syllables, and are we
1299# writing out a table for them?
1300my $has_hangul_syllables = 0;
1301
1302# Does this Unicode version have code points whose names end in their
1303# respective code points, and are we writing out a table for them? 0 for no;
1304# otherwise points to first property that a table is needed for them, so that
1305# if multiple tables are needed, we don't create duplicates
1306my $needing_code_points_ending_in_code_point = 0;
1307
37e2e78e 1308my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1309my @unhandled_properties; # Will contain a list of properties found in
1310 # the input that we didn't process.
f86864ac 1311my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1312 # listed in the pod
1313my @map_properties; # Properties that get map files written
1314my @named_sequences; # NamedSequences.txt contents.
1315my %potential_files; # Generated list of all .txt files in the directory
1316 # structure so we can warn if something is being
1317 # ignored.
1318my @files_actually_output; # List of files we generated.
1319my @more_Names; # Some code point names are compound; this is used
1320 # to store the extra components of them.
1321my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1322 # the minimum before we consider it equivalent to a
1323 # candidate rational
1324my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1325
1326# These store references to certain commonly used property objects
1327my $gc;
1328my $perl;
1329my $block;
3e20195b
KW
1330my $perl_charname;
1331my $print;
7fc6cb55 1332my $Any;
359523e2 1333my $script;
99870f4d
KW
1334
1335# Are there conflicting names because of beginning with 'In_', or 'Is_'
1336my $has_In_conflicts = 0;
1337my $has_Is_conflicts = 0;
1338
1339sub internal_file_to_platform ($) {
1340 # Convert our file paths which have '/' separators to those of the
1341 # platform.
1342
1343 my $file = shift;
1344 return undef unless defined $file;
1345
1346 return File::Spec->join(split '/', $file);
d07a55ed 1347}
5beb625e 1348
99870f4d
KW
1349sub file_exists ($) { # platform independent '-e'. This program internally
1350 # uses slash as a path separator.
1351 my $file = shift;
1352 return 0 if ! defined $file;
1353 return -e internal_file_to_platform($file);
1354}
5beb625e 1355
99870f4d 1356sub objaddr($) {
23e33b60
KW
1357 # Returns the address of the blessed input object.
1358 # It doesn't check for blessedness because that would do a string eval
1359 # every call, and the program is structured so that this is never called
1360 # for a non-blessed object.
99870f4d 1361
23e33b60 1362 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1363
1364 # Numifying a ref gives its address.
051df77b 1365 return pack 'J', $_[0];
99870f4d
KW
1366}
1367
558712cf 1368# These are used only if $annotate is true.
c4019d52
KW
1369# The entire range of Unicode characters is examined to populate these
1370# after all the input has been processed. But most can be skipped, as they
1371# have the same descriptive phrases, such as being unassigned
1372my @viacode; # Contains the 1 million character names
1373my @printable; # boolean: And are those characters printable?
1374my @annotate_char_type; # Contains a type of those characters, specifically
1375 # for the purposes of annotation.
1376my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1377 # name for the purposes of annotation. They map to the
c4019d52
KW
1378 # upper edge of the range, so that the end point can
1379 # be immediately found. This is used to skip ahead to
1380 # the end of a range, and avoid processing each
1381 # individual code point in it.
1382my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1383 # characters, but excluding those which are
1384 # also noncharacter code points
1385
1386# The annotation types are an extension of the regular range types, though
1387# some of the latter are folded into one. Make the new types negative to
1388# avoid conflicting with the regular types
1389my $SURROGATE_TYPE = -1;
1390my $UNASSIGNED_TYPE = -2;
1391my $PRIVATE_USE_TYPE = -3;
1392my $NONCHARACTER_TYPE = -4;
1393my $CONTROL_TYPE = -5;
1394my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1395
1396sub populate_char_info ($) {
558712cf 1397 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1398 # input code point's info that are needed for outputting more detailed
1399 # comments. If calling context wants a return, it is the end point of
1400 # any contiguous range of characters that share essentially the same info
1401
1402 my $i = shift;
1403 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1404
1405 $viacode[$i] = $perl_charname->value_of($i) || "";
1406
1407 # A character is generally printable if Unicode says it is,
1408 # but below we make sure that most Unicode general category 'C' types
1409 # aren't.
1410 $printable[$i] = $print->contains($i);
1411
1412 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1413
1414 # Only these two regular types are treated specially for annotations
1415 # purposes
1416 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1417 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1418
1419 # Give a generic name to all code points that don't have a real name.
1420 # We output ranges, if applicable, for these. Also calculate the end
1421 # point of the range.
1422 my $end;
1423 if (! $viacode[$i]) {
1424 if ($gc-> table('Surrogate')->contains($i)) {
1425 $viacode[$i] = 'Surrogate';
1426 $annotate_char_type[$i] = $SURROGATE_TYPE;
1427 $printable[$i] = 0;
1428 $end = $gc->table('Surrogate')->containing_range($i)->end;
1429 }
1430 elsif ($gc-> table('Private_use')->contains($i)) {
1431 $viacode[$i] = 'Private Use';
1432 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1433 $printable[$i] = 0;
1434 $end = $gc->table('Private_Use')->containing_range($i)->end;
1435 }
1436 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1437 contains($i))
1438 {
1439 $viacode[$i] = 'Noncharacter';
1440 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1441 $printable[$i] = 0;
1442 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1443 containing_range($i)->end;
1444 }
1445 elsif ($gc-> table('Control')->contains($i)) {
1446 $viacode[$i] = 'Control';
1447 $annotate_char_type[$i] = $CONTROL_TYPE;
1448 $printable[$i] = 0;
1449 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1450 }
1451 elsif ($gc-> table('Unassigned')->contains($i)) {
1452 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1453 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1454 $printable[$i] = 0;
1455
1456 # Because we name the unassigned by the blocks they are in, it
1457 # can't go past the end of that block, and it also can't go past
1458 # the unassigned range it is in. The special table makes sure
1459 # that the non-characters, which are unassigned, are separated
1460 # out.
1461 $end = min($block->containing_range($i)->end,
1462 $unassigned_sans_noncharacters-> containing_range($i)->
1463 end);
13ca76ff
KW
1464 }
1465 else {
1466 Carp::my_carp_bug("Can't figure out how to annotate "
1467 . sprintf("U+%04X", $i)
1468 . ". Proceeding anyway.");
c4019d52
KW
1469 $viacode[$i] = 'UNKNOWN';
1470 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1471 $printable[$i] = 0;
1472 }
1473 }
1474
1475 # Here, has a name, but if it's one in which the code point number is
1476 # appended to the name, do that.
1477 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1478 $viacode[$i] .= sprintf("-%04X", $i);
1479 $end = $perl_charname->containing_range($i)->end;
1480 }
1481
1482 # And here, has a name, but if it's a hangul syllable one, replace it with
1483 # the correct name from the Unicode algorithm
1484 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1485 use integer;
1486 my $SIndex = $i - $SBase;
1487 my $L = $LBase + $SIndex / $NCount;
1488 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1489 my $T = $TBase + $SIndex % $TCount;
1490 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1491 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1492 $end = $perl_charname->containing_range($i)->end;
1493 }
1494
1495 return if ! defined wantarray;
1496 return $i if ! defined $end; # If not a range, return the input
1497
1498 # Save this whole range so can find the end point quickly
1499 $annotate_ranges->add_map($i, $end, $end);
1500
1501 return $end;
1502}
1503
23e33b60
KW
1504# Commented code below should work on Perl 5.8.
1505## This 'require' doesn't necessarily work in miniperl, and even if it does,
1506## the native perl version of it (which is what would operate under miniperl)
1507## is extremely slow, as it does a string eval every call.
1508#my $has_fast_scalar_util = $\18 !~ /miniperl/
1509# && defined eval "require Scalar::Util";
1510#
1511#sub objaddr($) {
1512# # Returns the address of the blessed input object. Uses the XS version if
1513# # available. It doesn't check for blessedness because that would do a
1514# # string eval every call, and the program is structured so that this is
1515# # never called for a non-blessed object.
1516#
1517# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1518#
1519# # Check at least that is a ref.
1520# my $pkg = ref($_[0]) or return undef;
1521#
1522# # Change to a fake package to defeat any overloaded stringify
1523# bless $_[0], 'main::Fake';
1524#
1525# # Numifying a ref gives its address.
051df77b 1526# my $addr = pack 'J', $_[0];
23e33b60
KW
1527#
1528# # Return to original class
1529# bless $_[0], $pkg;
1530# return $addr;
1531#}
1532
99870f4d
KW
1533sub max ($$) {
1534 my $a = shift;
1535 my $b = shift;
1536 return $a if $a >= $b;
1537 return $b;
1538}
1539
1540sub min ($$) {
1541 my $a = shift;
1542 my $b = shift;
1543 return $a if $a <= $b;
1544 return $b;
1545}
1546
1547sub clarify_number ($) {
1548 # This returns the input number with underscores inserted every 3 digits
1549 # in large (5 digits or more) numbers. Input must be entirely digits, not
1550 # checked.
1551
1552 my $number = shift;
1553 my $pos = length($number) - 3;
1554 return $number if $pos <= 1;
1555 while ($pos > 0) {
1556 substr($number, $pos, 0) = '_';
1557 $pos -= 3;
5beb625e 1558 }
99870f4d 1559 return $number;
99598c8c
JH
1560}
1561
12ac2576 1562
99870f4d 1563package Carp;
7ebf06b3 1564
99870f4d
KW
1565# These routines give a uniform treatment of messages in this program. They
1566# are placed in the Carp package to cause the stack trace to not include them,
1567# although an alternative would be to use another package and set @CARP_NOT
1568# for it.
12ac2576 1569
99870f4d 1570our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1571
99f78760
KW
1572# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1573# and overload trying to load Scalar:Util under miniperl. See
1574# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1575undef $overload::VERSION;
1576
99870f4d
KW
1577sub my_carp {
1578 my $message = shift || "";
1579 my $nofold = shift || 0;
7ebf06b3 1580
99870f4d
KW
1581 if ($message) {
1582 $message = main::join_lines($message);
1583 $message =~ s/^$0: *//; # Remove initial program name
1584 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1585 $message = "\n$0: $message;";
12ac2576 1586
99870f4d
KW
1587 # Fold the message with program name, semi-colon end punctuation
1588 # (which looks good with the message that carp appends to it), and a
1589 # hanging indent for continuation lines.
1590 $message = main::simple_fold($message, "", 4) unless $nofold;
1591 $message =~ s/\n$//; # Remove the trailing nl so what carp
1592 # appends is to the same line
1593 }
12ac2576 1594
99870f4d 1595 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1596
99870f4d
KW
1597 carp $message;
1598 return;
1599}
7ebf06b3 1600
99870f4d
KW
1601sub my_carp_bug {
1602 # This is called when it is clear that the problem is caused by a bug in
1603 # this program.
7ebf06b3 1604
99870f4d
KW
1605 my $message = shift;
1606 $message =~ s/^$0: *//;
1607 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1608 carp $message;
1609 return;
1610}
7ebf06b3 1611
99870f4d
KW
1612sub carp_too_few_args {
1613 if (@_ != 2) {
1614 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1615 return;
12ac2576 1616 }
7ebf06b3 1617
99870f4d
KW
1618 my $args_ref = shift;
1619 my $count = shift;
7ebf06b3 1620
99870f4d
KW
1621 my_carp_bug("Need at least $count arguments to "
1622 . (caller 1)[3]
1623 . ". Instead got: '"
1624 . join ', ', @$args_ref
1625 . "'. No action taken.");
1626 return;
12ac2576
JP
1627}
1628
99870f4d
KW
1629sub carp_extra_args {
1630 my $args_ref = shift;
1631 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1632
99870f4d
KW
1633 unless (ref $args_ref) {
1634 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1635 return;
1636 }
1637 my ($package, $file, $line) = caller;
1638 my $subroutine = (caller 1)[3];
cf25bb62 1639
99870f4d
KW
1640 my $list;
1641 if (ref $args_ref eq 'HASH') {
1642 foreach my $key (keys %$args_ref) {
1643 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1644 }
99870f4d 1645 $list = join ', ', each %{$args_ref};
cf25bb62 1646 }
99870f4d
KW
1647 elsif (ref $args_ref eq 'ARRAY') {
1648 foreach my $arg (@$args_ref) {
1649 $arg = $UNDEF unless defined $arg;
1650 }
1651 $list = join ', ', @$args_ref;
1652 }
1653 else {
1654 my_carp_bug("Can't cope with ref "
1655 . ref($args_ref)
1656 . " . argument to 'carp_extra_args'. Not checking arguments.");
1657 return;
1658 }
1659
1660 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1661 return;
d73e5302
JH
1662}
1663
99870f4d
KW
1664package main;
1665
1666{ # Closure
1667
1668 # This program uses the inside-out method for objects, as recommended in
1669 # "Perl Best Practices". This closure aids in generating those. There
1670 # are two routines. setup_package() is called once per package to set
1671 # things up, and then set_access() is called for each hash representing a
1672 # field in the object. These routines arrange for the object to be
1673 # properly destroyed when no longer used, and for standard accessor
1674 # functions to be generated. If you need more complex accessors, just
1675 # write your own and leave those accesses out of the call to set_access().
1676 # More details below.
1677
1678 my %constructor_fields; # fields that are to be used in constructors; see
1679 # below
1680
1681 # The values of this hash will be the package names as keys to other
1682 # hashes containing the name of each field in the package as keys, and
1683 # references to their respective hashes as values.
1684 my %package_fields;
1685
1686 sub setup_package {
1687 # Sets up the package, creating standard DESTROY and dump methods
1688 # (unless already defined). The dump method is used in debugging by
1689 # simple_dumper().
1690 # The optional parameters are:
1691 # a) a reference to a hash, that gets populated by later
1692 # set_access() calls with one of the accesses being
1693 # 'constructor'. The caller can then refer to this, but it is
1694 # not otherwise used by these two routines.
1695 # b) a reference to a callback routine to call during destruction
1696 # of the object, before any fields are actually destroyed
1697
1698 my %args = @_;
1699 my $constructor_ref = delete $args{'Constructor_Fields'};
1700 my $destroy_callback = delete $args{'Destroy_Callback'};
1701 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1702
1703 my %fields;
1704 my $package = (caller)[0];
1705
1706 $package_fields{$package} = \%fields;
1707 $constructor_fields{$package} = $constructor_ref;
1708
1709 unless ($package->can('DESTROY')) {
1710 my $destroy_name = "${package}::DESTROY";
1711 no strict "refs";
1712
1713 # Use typeglob to give the anonymous subroutine the name we want
1714 *$destroy_name = sub {
1715 my $self = shift;
ffe43484 1716 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1717
1718 $self->$destroy_callback if $destroy_callback;
1719 foreach my $field (keys %{$package_fields{$package}}) {
1720 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1721 delete $package_fields{$package}{$field}{$addr};
1722 }
1723 return;
1724 }
1725 }
1726
1727 unless ($package->can('dump')) {
1728 my $dump_name = "${package}::dump";
1729 no strict "refs";
1730 *$dump_name = sub {
1731 my $self = shift;
1732 return dump_inside_out($self, $package_fields{$package}, @_);
1733 }
1734 }
1735 return;
1736 }
1737
1738 sub set_access {
1739 # Arrange for the input field to be garbage collected when no longer
1740 # needed. Also, creates standard accessor functions for the field
1741 # based on the optional parameters-- none if none of these parameters:
1742 # 'addable' creates an 'add_NAME()' accessor function.
1743 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1744 # function.
1745 # 'settable' creates a 'set_NAME()' accessor function.
1746 # 'constructor' doesn't create an accessor function, but adds the
1747 # field to the hash that was previously passed to
1748 # setup_package();
1749 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1750 # 'add' etc. all mean 'addable'.
1751 # The read accessor function will work on both array and scalar
1752 # values. If another accessor in the parameter list is 'a', the read
1753 # access assumes an array. You can also force it to be array access
1754 # by specifying 'readable_array' instead of 'readable'
1755 #
1756 # A sort-of 'protected' access can be set-up by preceding the addable,
1757 # readable or settable with some initial portion of 'protected_' (but,
1758 # the underscore is required), like 'p_a', 'pro_set', etc. The
1759 # "protection" is only by convention. All that happens is that the
1760 # accessor functions' names begin with an underscore. So instead of
1761 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1762 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1763 # end of each package, and then storing the __LINE__ ranges and
1764 # checking them on every accessor. But that is way overkill.)
1765
1766 # We create anonymous subroutines as the accessors and then use
1767 # typeglobs to assign them to the proper package and name
1768
1769 my $name = shift; # Name of the field
1770 my $field = shift; # Reference to the inside-out hash containing the
1771 # field
1772
1773 my $package = (caller)[0];
1774
1775 if (! exists $package_fields{$package}) {
1776 croak "$0: Must call 'setup_package' before 'set_access'";
1777 }
d73e5302 1778
99870f4d
KW
1779 # Stash the field so DESTROY can get it.
1780 $package_fields{$package}{$name} = $field;
cf25bb62 1781
99870f4d
KW
1782 # Remaining arguments are the accessors. For each...
1783 foreach my $access (@_) {
1784 my $access = lc $access;
cf25bb62 1785
99870f4d 1786 my $protected = "";
cf25bb62 1787
99870f4d
KW
1788 # Match the input as far as it goes.
1789 if ($access =~ /^(p[^_]*)_/) {
1790 $protected = $1;
1791 if (substr('protected_', 0, length $protected)
1792 eq $protected)
1793 {
1794
1795 # Add 1 for the underscore not included in $protected
1796 $access = substr($access, length($protected) + 1);
1797 $protected = '_';
1798 }
1799 else {
1800 $protected = "";
1801 }
1802 }
1803
1804 if (substr('addable', 0, length $access) eq $access) {
1805 my $subname = "${package}::${protected}add_$name";
1806 no strict "refs";
1807
1808 # add_ accessor. Don't add if already there, which we
1809 # determine using 'eq' for scalars and '==' otherwise.
1810 *$subname = sub {
1811 use strict "refs";
1812 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1813 my $self = shift;
1814 my $value = shift;
ffe43484 1815 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1816 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1817 if (ref $value) {
f998e60c 1818 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1819 }
1820 else {
f998e60c 1821 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1822 }
f998e60c 1823 push @{$field->{$addr}}, $value;
99870f4d
KW
1824 return;
1825 }
1826 }
1827 elsif (substr('constructor', 0, length $access) eq $access) {
1828 if ($protected) {
1829 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1830 }
1831 else {
1832 $constructor_fields{$package}{$name} = $field;
1833 }
1834 }
1835 elsif (substr('readable_array', 0, length $access) eq $access) {
1836
1837 # Here has read access. If one of the other parameters for
1838 # access is array, or this one specifies array (by being more
1839 # than just 'readable_'), then create a subroutine that
1840 # assumes the data is an array. Otherwise just a scalar
1841 my $subname = "${package}::${protected}$name";
1842 if (grep { /^a/i } @_
1843 or length($access) > length('readable_'))
1844 {
1845 no strict "refs";
1846 *$subname = sub {
1847 use strict "refs";
23e33b60 1848 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1849 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1850 if (ref $field->{$addr} ne 'ARRAY') {
1851 my $type = ref $field->{$addr};
1852 $type = 'scalar' unless $type;
1853 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1854 return;
1855 }
1856 return scalar @{$field->{$addr}} unless wantarray;
1857
1858 # Make a copy; had problems with caller modifying the
1859 # original otherwise
1860 my @return = @{$field->{$addr}};
1861 return @return;
1862 }
1863 }
1864 else {
1865
1866 # Here not an array value, a simpler function.
1867 no strict "refs";
1868 *$subname = sub {
1869 use strict "refs";
23e33b60 1870 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1871 no overloading;
051df77b 1872 return $field->{pack 'J', $_[0]};
99870f4d
KW
1873 }
1874 }
1875 }
1876 elsif (substr('settable', 0, length $access) eq $access) {
1877 my $subname = "${package}::${protected}set_$name";
1878 no strict "refs";
1879 *$subname = sub {
1880 use strict "refs";
23e33b60
KW
1881 if (main::DEBUG) {
1882 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1883 Carp::carp_extra_args(\@_) if @_ > 2;
1884 }
1885 # $self is $_[0]; $value is $_[1]
f998e60c 1886 no overloading;
051df77b 1887 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1888 return;
1889 }
1890 }
1891 else {
1892 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1893 }
cf25bb62 1894 }
99870f4d 1895 return;
cf25bb62 1896 }
99870f4d
KW
1897}
1898
1899package Input_file;
1900
1901# All input files use this object, which stores various attributes about them,
1902# and provides for convenient, uniform handling. The run method wraps the
1903# processing. It handles all the bookkeeping of opening, reading, and closing
1904# the file, returning only significant input lines.
1905#
1906# Each object gets a handler which processes the body of the file, and is
1907# called by run(). Most should use the generic, default handler, which has
1908# code scrubbed to handle things you might not expect. A handler should
1909# basically be a while(next_line()) {...} loop.
1910#
1911# You can also set up handlers to
1912# 1) call before the first line is read for pre processing
1913# 2) call to adjust each line of the input before the main handler gets them
1914# 3) call upon EOF before the main handler exits its loop
1915# 4) call at the end for post processing
1916#
1917# $_ is used to store the input line, and is to be filtered by the
1918# each_line_handler()s. So, if the format of the line is not in the desired
1919# format for the main handler, these are used to do that adjusting. They can
1920# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1921# so the $_ output of one is used as the input to the next. None of the other
1922# handlers are stackable, but could easily be changed to be so.
1923#
1924# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1925# which insert the parameters as lines to be processed before the next input
1926# file line is read. This allows the EOF handler to flush buffers, for
1927# example. The difference between the two routines is that the lines inserted
1928# by insert_lines() are subjected to the each_line_handler()s. (So if you
1929# called it from such a handler, you would get infinite recursion.) Lines
1930# inserted by insert_adjusted_lines() go directly to the main handler without
1931# any adjustments. If the post-processing handler calls any of these, there
1932# will be no effect. Some error checking for these conditions could be added,
1933# but it hasn't been done.
1934#
1935# carp_bad_line() should be called to warn of bad input lines, which clears $_
1936# to prevent further processing of the line. This routine will output the
1937# message as a warning once, and then keep a count of the lines that have the
1938# same message, and output that count at the end of the file's processing.
1939# This keeps the number of messages down to a manageable amount.
1940#
1941# get_missings() should be called to retrieve any @missing input lines.
1942# Messages will be raised if this isn't done if the options aren't to ignore
1943# missings.
1944
1945sub trace { return main::trace(@_); }
1946
99870f4d
KW
1947{ # Closure
1948 # Keep track of fields that are to be put into the constructor.
1949 my %constructor_fields;
1950
1951 main::setup_package(Constructor_Fields => \%constructor_fields);
1952
1953 my %file; # Input file name, required
1954 main::set_access('file', \%file, qw{ c r });
1955
1956 my %first_released; # Unicode version file was first released in, required
1957 main::set_access('first_released', \%first_released, qw{ c r });
1958
1959 my %handler; # Subroutine to process the input file, defaults to
1960 # 'process_generic_property_file'
1961 main::set_access('handler', \%handler, qw{ c });
1962
1963 my %property;
1964 # name of property this file is for. defaults to none, meaning not
1965 # applicable, or is otherwise determinable, for example, from each line.
1966 main::set_access('property', \%property, qw{ c });
1967
1968 my %optional;
1969 # If this is true, the file is optional. If not present, no warning is
1970 # output. If it is present, the string given by this parameter is
1971 # evaluated, and if false the file is not processed.
1972 main::set_access('optional', \%optional, 'c', 'r');
1973
1974 my %non_skip;
1975 # This is used for debugging, to skip processing of all but a few input
1976 # files. Add 'non_skip => 1' to the constructor for those files you want
1977 # processed when you set the $debug_skip global.
1978 main::set_access('non_skip', \%non_skip, 'c');
1979
37e2e78e 1980 my %skip;
09ca89ce
KW
1981 # This is used to skip processing of this input file semi-permanently,
1982 # when it evaluates to true. The value should be the reason the file is
1983 # being skipped. It is used for files that we aren't planning to process
1984 # anytime soon, but want to allow to be in the directory and not raise a
1985 # message that we are not handling. Mostly for test files. This is in
1986 # contrast to the non_skip element, which is supposed to be used very
1987 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
1988 # pretty much will never look at can be placed in the global
1989 # %ignored_files instead. Ones used here will be added to that list.
37e2e78e
KW
1990 main::set_access('skip', \%skip, 'c');
1991
99870f4d
KW
1992 my %each_line_handler;
1993 # list of subroutines to look at and filter each non-comment line in the
1994 # file. defaults to none. The subroutines are called in order, each is
1995 # to adjust $_ for the next one, and the final one adjusts it for
1996 # 'handler'
1997 main::set_access('each_line_handler', \%each_line_handler, 'c');
1998
1999 my %has_missings_defaults;
2000 # ? Are there lines in the file giving default values for code points
2001 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2002 # the norm, but IGNORED means it has such lines, but the handler doesn't
2003 # use them. Having these three states allows us to catch changes to the
2004 # UCD that this program should track
2005 main::set_access('has_missings_defaults',
2006 \%has_missings_defaults, qw{ c r });
2007
2008 my %pre_handler;
2009 # Subroutine to call before doing anything else in the file. If undef, no
2010 # such handler is called.
2011 main::set_access('pre_handler', \%pre_handler, qw{ c });
2012
2013 my %eof_handler;
2014 # Subroutine to call upon getting an EOF on the input file, but before
2015 # that is returned to the main handler. This is to allow buffers to be
2016 # flushed. The handler is expected to call insert_lines() or
2017 # insert_adjusted() with the buffered material
2018 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2019
2020 my %post_handler;
2021 # Subroutine to call after all the lines of the file are read in and
2022 # processed. If undef, no such handler is called.
2023 main::set_access('post_handler', \%post_handler, qw{ c });
2024
2025 my %progress_message;
2026 # Message to print to display progress in lieu of the standard one
2027 main::set_access('progress_message', \%progress_message, qw{ c });
2028
2029 my %handle;
2030 # cache open file handle, internal. Is undef if file hasn't been
2031 # processed at all, empty if has;
2032 main::set_access('handle', \%handle);
2033
2034 my %added_lines;
2035 # cache of lines added virtually to the file, internal
2036 main::set_access('added_lines', \%added_lines);
2037
2038 my %errors;
2039 # cache of errors found, internal
2040 main::set_access('errors', \%errors);
2041
2042 my %missings;
2043 # storage of '@missing' defaults lines
2044 main::set_access('missings', \%missings);
2045
2046 sub new {
2047 my $class = shift;
2048
2049 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2050 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2051
2052 # Set defaults
2053 $handler{$addr} = \&main::process_generic_property_file;
2054 $non_skip{$addr} = 0;
37e2e78e 2055 $skip{$addr} = 0;
99870f4d
KW
2056 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2057 $handle{$addr} = undef;
2058 $added_lines{$addr} = [ ];
2059 $each_line_handler{$addr} = [ ];
2060 $errors{$addr} = { };
2061 $missings{$addr} = [ ];
2062
2063 # Two positional parameters.
99f78760 2064 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2065 $file{$addr} = main::internal_file_to_platform(shift);
2066 $first_released{$addr} = shift;
2067
2068 # The rest of the arguments are key => value pairs
2069 # %constructor_fields has been set up earlier to list all possible
2070 # ones. Either set or push, depending on how the default has been set
2071 # up just above.
2072 my %args = @_;
2073 foreach my $key (keys %args) {
2074 my $argument = $args{$key};
2075
2076 # Note that the fields are the lower case of the constructor keys
2077 my $hash = $constructor_fields{lc $key};
2078 if (! defined $hash) {
2079 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2080 next;
2081 }
2082 if (ref $hash->{$addr} eq 'ARRAY') {
2083 if (ref $argument eq 'ARRAY') {
2084 foreach my $argument (@{$argument}) {
2085 next if ! defined $argument;
2086 push @{$hash->{$addr}}, $argument;
2087 }
2088 }
2089 else {
2090 push @{$hash->{$addr}}, $argument if defined $argument;
2091 }
2092 }
2093 else {
2094 $hash->{$addr} = $argument;
2095 }
2096 delete $args{$key};
2097 };
2098
2099 # If the file has a property for it, it means that the property is not
2100 # listed in the file's entries. So add a handler to the list of line
2101 # handlers to insert the property name into the lines, to provide a
2102 # uniform interface to the final processing subroutine.
2103 # the final code doesn't have to worry about that.
2104 if ($property{$addr}) {
2105 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2106 }
2107
2108 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2109 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 2110 }
99870f4d 2111
09ca89ce
KW
2112 # If skipping, set to optional, and add to list of ignored files,
2113 # including its reason
2114 if ($skip{$addr}) {
2115 $optional{$addr} = 1;
2116 $ignored_files{$file{$addr}} = $skip{$addr}
2117 }
37e2e78e 2118
99870f4d 2119 return $self;
d73e5302
JH
2120 }
2121
cf25bb62 2122
99870f4d
KW
2123 use overload
2124 fallback => 0,
2125 qw("") => "_operator_stringify",
2126 "." => \&main::_operator_dot,
2127 ;
cf25bb62 2128
99870f4d
KW
2129 sub _operator_stringify {
2130 my $self = shift;
cf25bb62 2131
99870f4d 2132 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2133 }
d73e5302 2134
99870f4d
KW
2135 # flag to make sure extracted files are processed early
2136 my $seen_non_extracted_non_age = 0;
d73e5302 2137
99870f4d
KW
2138 sub run {
2139 # Process the input object $self. This opens and closes the file and
2140 # calls all the handlers for it. Currently, this can only be called
2141 # once per file, as it destroy's the EOF handler
d73e5302 2142
99870f4d
KW
2143 my $self = shift;
2144 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2145
ffe43484 2146 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2147
99870f4d 2148 my $file = $file{$addr};
d73e5302 2149
99870f4d
KW
2150 # Don't process if not expecting this file (because released later
2151 # than this Unicode version), and isn't there. This means if someone
2152 # copies it into an earlier version's directory, we will go ahead and
2153 # process it.
2154 return if $first_released{$addr} gt $v_version && ! -e $file;
2155
2156 # If in debugging mode and this file doesn't have the non-skip
2157 # flag set, and isn't one of the critical files, skip it.
2158 if ($debug_skip
2159 && $first_released{$addr} ne v0
2160 && ! $non_skip{$addr})
2161 {
2162 print "Skipping $file in debugging\n" if $verbosity;
2163 return;
2164 }
2165
2166 # File could be optional
37e2e78e 2167 if ($optional{$addr}) {
99870f4d
KW
2168 return unless -e $file;
2169 my $result = eval $optional{$addr};
2170 if (! defined $result) {
2171 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2172 return;
2173 }
2174 if (! $result) {
2175 if ($verbosity) {
2176 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2177 }
2178 return;
2179 }
2180 }
2181
2182 if (! defined $file || ! -e $file) {
2183
2184 # If the file doesn't exist, see if have internal data for it
2185 # (based on first_released being 0).
2186 if ($first_released{$addr} eq v0) {
2187 $handle{$addr} = 'pretend_is_open';
2188 }
2189 else {
2190 if (! $optional{$addr} # File could be optional
2191 && $v_version ge $first_released{$addr})
2192 {
2193 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2194 }
2195 return;
2196 }
2197 }
2198 else {
2199
37e2e78e
KW
2200 # Here, the file exists. Some platforms may change the case of
2201 # its name
99870f4d 2202 if ($seen_non_extracted_non_age) {
517956bf 2203 if ($file =~ /$EXTRACTED/i) {
99870f4d 2204 Carp::my_carp_bug(join_lines(<<END
99f78760 2205$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2206anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2207have subtle problems
2208END
2209 ));
2210 }
2211 }
2212 elsif ($EXTRACTED_DIR
2213 && $first_released{$addr} ne v0
517956bf
CB
2214 && $file !~ /$EXTRACTED/i
2215 && lc($file) ne 'dage.txt')
99870f4d
KW
2216 {
2217 # We don't set this (by the 'if' above) if we have no
2218 # extracted directory, so if running on an early version,
2219 # this test won't work. Not worth worrying about.
2220 $seen_non_extracted_non_age = 1;
2221 }
2222
2223 # And mark the file as having being processed, and warn if it
2224 # isn't a file we are expecting. As we process the files,
2225 # they are deleted from the hash, so any that remain at the
2226 # end of the program are files that we didn't process.
517956bf
CB
2227 my $fkey = File::Spec->rel2abs($file);
2228 my $expecting = delete $potential_files{$fkey};
2229 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
678f13d5
KW
2230 Carp::my_carp("Was not expecting '$file'.") if
2231 ! $expecting
99870f4d
KW
2232 && ! defined $handle{$addr};
2233
37e2e78e
KW
2234 # Having deleted from expected files, we can quit if not to do
2235 # anything. Don't print progress unless really want verbosity
2236 if ($skip{$addr}) {
2237 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2238 return;
2239 }
2240
99870f4d
KW
2241 # Open the file, converting the slashes used in this program
2242 # into the proper form for the OS
2243 my $file_handle;
2244 if (not open $file_handle, "<", $file) {
2245 Carp::my_carp("Can't open $file. Skipping: $!");
2246 return 0;
2247 }
2248 $handle{$addr} = $file_handle; # Cache the open file handle
2249 }
2250
2251 if ($verbosity >= $PROGRESS) {
2252 if ($progress_message{$addr}) {
2253 print "$progress_message{$addr}\n";
2254 }
2255 else {
2256 # If using a virtual file, say so.
2257 print "Processing ", (-e $file)
2258 ? $file
2259 : "substitute $file",
2260 "\n";
2261 }
2262 }
2263
2264
2265 # Call any special handler for before the file.
2266 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2267
2268 # Then the main handler
2269 &{$handler{$addr}}($self);
2270
2271 # Then any special post-file handler.
2272 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2273
2274 # If any errors have been accumulated, output the counts (as the first
2275 # error message in each class was output when it was encountered).
2276 if ($errors{$addr}) {
2277 my $total = 0;
2278 my $types = 0;
2279 foreach my $error (keys %{$errors{$addr}}) {
2280 $total += $errors{$addr}->{$error};
2281 delete $errors{$addr}->{$error};
2282 $types++;
2283 }
2284 if ($total > 1) {
2285 my $message
2286 = "A total of $total lines had errors in $file. ";
2287
2288 $message .= ($types == 1)
2289 ? '(Only the first one was displayed.)'
2290 : '(Only the first of each type was displayed.)';
2291 Carp::my_carp($message);
2292 }
2293 }
2294
2295 if (@{$missings{$addr}}) {
2296 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2297 }
2298
2299 # If a real file handle, close it.
2300 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2301 ref $handle{$addr};
2302 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2303 # the file, as opposed to undef
2304 return;
2305 }
2306
2307 sub next_line {
2308 # Sets $_ to be the next logical input line, if any. Returns non-zero
2309 # if such a line exists. 'logical' means that any lines that have
2310 # been added via insert_lines() will be returned in $_ before the file
2311 # is read again.
2312
2313 my $self = shift;
2314 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2315
ffe43484 2316 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2317
2318 # Here the file is open (or if the handle is not a ref, is an open
2319 # 'virtual' file). Get the next line; any inserted lines get priority
2320 # over the file itself.
2321 my $adjusted;
2322
2323 LINE:
2324 while (1) { # Loop until find non-comment, non-empty line
2325 #local $to_trace = 1 if main::DEBUG;
2326 my $inserted_ref = shift @{$added_lines{$addr}};
2327 if (defined $inserted_ref) {
2328 ($adjusted, $_) = @{$inserted_ref};
2329 trace $adjusted, $_ if main::DEBUG && $to_trace;
2330 return 1 if $adjusted;
2331 }
2332 else {
2333 last if ! ref $handle{$addr}; # Don't read unless is real file
2334 last if ! defined ($_ = readline $handle{$addr});
2335 }
2336 chomp;
2337 trace $_ if main::DEBUG && $to_trace;
2338
2339 # See if this line is the comment line that defines what property
2340 # value that code points that are not listed in the file should
2341 # have. The format or existence of these lines is not guaranteed
2342 # by Unicode since they are comments, but the documentation says
2343 # that this was added for machine-readability, so probably won't
2344 # change. This works starting in Unicode Version 5.0. They look
2345 # like:
2346 #
2347 # @missing: 0000..10FFFF; Not_Reordered
2348 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2349 # @missing: 0000..10FFFF; ; NaN
2350 #
2351 # Save the line for a later get_missings() call.
2352 if (/$missing_defaults_prefix/) {
2353 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2354 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2355 }
2356 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2357 my @defaults = split /\s* ; \s*/x, $_;
2358
2359 # The first field is the @missing, which ends in a
2360 # semi-colon, so can safely shift.
2361 shift @defaults;
2362
2363 # Some of these lines may have empty field placeholders
2364 # which get in the way. An example is:
2365 # @missing: 0000..10FFFF; ; NaN
2366 # Remove them. Process starting from the top so the
2367 # splice doesn't affect things still to be looked at.
2368 for (my $i = @defaults - 1; $i >= 0; $i--) {
2369 next if $defaults[$i] ne "";
2370 splice @defaults, $i, 1;
2371 }
2372
2373 # What's left should be just the property (maybe) and the
2374 # default. Having only one element means it doesn't have
2375 # the property.
2376 my $default;
2377 my $property;
2378 if (@defaults >= 1) {
2379 if (@defaults == 1) {
2380 $default = $defaults[0];
2381 }
2382 else {
2383 $property = $defaults[0];
2384 $default = $defaults[1];
2385 }
2386 }
2387
2388 if (@defaults < 1
2389 || @defaults > 2
2390 || ($default =~ /^</
2391 && $default !~ /^<code *point>$/i
2392 && $default !~ /^<none>$/i))
2393 {
2394 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2395 }
2396 else {
2397
2398 # If the property is missing from the line, it should
2399 # be the one for the whole file
2400 $property = $property{$addr} if ! defined $property;
2401
2402 # Change <none> to the null string, which is what it
2403 # really means. If the default is the code point
2404 # itself, set it to <code point>, which is what
2405 # Unicode uses (but sometimes they've forgotten the
2406 # space)
2407 if ($default =~ /^<none>$/i) {
2408 $default = "";
2409 }
2410 elsif ($default =~ /^<code *point>$/i) {
2411 $default = $CODE_POINT;
2412 }
2413
2414 # Store them as a sub-arrays with both components.
2415 push @{$missings{$addr}}, [ $default, $property ];
2416 }
2417 }
2418
2419 # There is nothing for the caller to process on this comment
2420 # line.
2421 next;
2422 }
2423
2424 # Remove comments and trailing space, and skip this line if the
2425 # result is empty
2426 s/#.*//;
2427 s/\s+$//;
2428 next if /^$/;
2429
2430 # Call any handlers for this line, and skip further processing of
2431 # the line if the handler sets the line to null.
2432 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2433 &{$sub_ref}($self);
2434 next LINE if /^$/;
2435 }
2436
2437 # Here the line is ok. return success.
2438 return 1;
2439 } # End of looping through lines.
2440
2441 # If there is an EOF handler, call it (only once) and if it generates
2442 # more lines to process go back in the loop to handle them.
2443 if ($eof_handler{$addr}) {
2444 &{$eof_handler{$addr}}($self);
2445 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2446 goto LINE if $added_lines{$addr};
2447 }
2448
2449 # Return failure -- no more lines.
2450 return 0;
2451
2452 }
2453
2454# Not currently used, not fully tested.
2455# sub peek {
2456# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2457# # record. Not callable from an each_line_handler(), nor does it call
2458# # an each_line_handler() on the line.
2459#
2460# my $self = shift;
ffe43484 2461# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2462#
2463# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2464# my ($adjusted, $line) = @{$inserted_ref};
2465# next if $adjusted;
2466#
2467# # Remove comments and trailing space, and return a non-empty
2468# # resulting line
2469# $line =~ s/#.*//;
2470# $line =~ s/\s+$//;
2471# return $line if $line ne "";
2472# }
2473#
2474# return if ! ref $handle{$addr}; # Don't read unless is real file
2475# while (1) { # Loop until find non-comment, non-empty line
2476# local $to_trace = 1 if main::DEBUG;
2477# trace $_ if main::DEBUG && $to_trace;
2478# return if ! defined (my $line = readline $handle{$addr});
2479# chomp $line;
2480# push @{$added_lines{$addr}}, [ 0, $line ];
2481#
2482# $line =~ s/#.*//;
2483# $line =~ s/\s+$//;
2484# return $line if $line ne "";
2485# }
2486#
2487# return;
2488# }
2489
2490
2491 sub insert_lines {
2492 # Lines can be inserted so that it looks like they were in the input
2493 # file at the place it was when this routine is called. See also
2494 # insert_adjusted_lines(). Lines inserted via this routine go through
2495 # any each_line_handler()
2496
2497 my $self = shift;
2498
2499 # Each inserted line is an array, with the first element being 0 to
2500 # indicate that this line hasn't been adjusted, and needs to be
2501 # processed.
f998e60c 2502 no overloading;
051df77b 2503 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2504 return;
2505 }
2506
2507 sub insert_adjusted_lines {
2508 # Lines can be inserted so that it looks like they were in the input
2509 # file at the place it was when this routine is called. See also
2510 # insert_lines(). Lines inserted via this routine are already fully
2511 # adjusted, ready to be processed; each_line_handler()s handlers will
2512 # not be called. This means this is not a completely general
2513 # facility, as only the last each_line_handler on the stack should
2514 # call this. It could be made more general, by passing to each of the
2515 # line_handlers their position on the stack, which they would pass on
2516 # to this routine, and that would replace the boolean first element in
2517 # the anonymous array pushed here, so that the next_line routine could
2518 # use that to call only those handlers whose index is after it on the
2519 # stack. But this is overkill for what is needed now.
2520
2521 my $self = shift;
2522 trace $_[0] if main::DEBUG && $to_trace;
2523
2524 # Each inserted line is an array, with the first element being 1 to
2525 # indicate that this line has been adjusted
f998e60c 2526 no overloading;
051df77b 2527 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2528 return;
2529 }
2530
2531 sub get_missings {
2532 # Returns the stored up @missings lines' values, and clears the list.
2533 # The values are in an array, consisting of the default in the first
2534 # element, and the property in the 2nd. However, since these lines
2535 # can be stacked up, the return is an array of all these arrays.
2536
2537 my $self = shift;
2538 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2539
ffe43484 2540 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2541
2542 # If not accepting a list return, just return the first one.
2543 return shift @{$missings{$addr}} unless wantarray;
2544
2545 my @return = @{$missings{$addr}};
2546 undef @{$missings{$addr}};
2547 return @return;
2548 }
2549
2550 sub _insert_property_into_line {
2551 # Add a property field to $_, if this file requires it.
2552
f998e60c 2553 my $self = shift;
ffe43484 2554 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2555 my $property = $property{$addr};
99870f4d
KW
2556 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2557
2558 $_ =~ s/(;|$)/; $property$1/;
2559 return;
2560 }
2561
2562 sub carp_bad_line {
2563 # Output consistent error messages, using either a generic one, or the
2564 # one given by the optional parameter. To avoid gazillions of the
2565 # same message in case the syntax of a file is way off, this routine
2566 # only outputs the first instance of each message, incrementing a
2567 # count so the totals can be output at the end of the file.
2568
2569 my $self = shift;
2570 my $message = shift;
2571 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2572
ffe43484 2573 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2574
2575 $message = 'Unexpected line' unless $message;
2576
2577 # No trailing punctuation so as to fit with our addenda.
2578 $message =~ s/[.:;,]$//;
2579
2580 # If haven't seen this exact message before, output it now. Otherwise
2581 # increment the count of how many times it has occurred
2582 unless ($errors{$addr}->{$message}) {
2583 Carp::my_carp("$message in '$_' in "
f998e60c 2584 . $file{$addr}
99870f4d
KW
2585 . " at line $.. Skipping this line;");
2586 $errors{$addr}->{$message} = 1;
2587 }
2588 else {
2589 $errors{$addr}->{$message}++;
2590 }
2591
2592 # Clear the line to prevent any further (meaningful) processing of it.
2593 $_ = "";
2594
2595 return;
2596 }
2597} # End closure
2598
2599package Multi_Default;
2600
2601# Certain properties in early versions of Unicode had more than one possible
2602# default for code points missing from the files. In these cases, one
2603# default applies to everything left over after all the others are applied,
2604# and for each of the others, there is a description of which class of code
2605# points applies to it. This object helps implement this by storing the
2606# defaults, and for all but that final default, an eval string that generates
2607# the class that it applies to.
2608
2609
2610{ # Closure
2611
2612 main::setup_package();
2613
2614 my %class_defaults;
2615 # The defaults structure for the classes
2616 main::set_access('class_defaults', \%class_defaults);
2617
2618 my %other_default;
2619 # The default that applies to everything left over.
2620 main::set_access('other_default', \%other_default, 'r');
2621
2622
2623 sub new {
2624 # The constructor is called with default => eval pairs, terminated by
2625 # the left-over default. e.g.
2626 # Multi_Default->new(
2627 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2628 # - 0x200D',
2629 # 'R' => 'some other expression that evaluates to code points',
2630 # .
2631 # .
2632 # .
2633 # 'U'));
2634
2635 my $class = shift;
2636
2637 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2638 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2639
2640 while (@_ > 1) {
2641 my $default = shift;
2642 my $eval = shift;
2643 $class_defaults{$addr}->{$default} = $eval;
2644 }
2645
2646 $other_default{$addr} = shift;
2647
2648 return $self;
2649 }
2650
2651 sub get_next_defaults {
2652 # Iterates and returns the next class of defaults.
2653 my $self = shift;
2654 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2655
ffe43484 2656 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2657
2658 return each %{$class_defaults{$addr}};
2659 }
2660}
2661
2662package Alias;
2663
2664# An alias is one of the names that a table goes by. This class defines them
2665# including some attributes. Everything is currently setup in the
2666# constructor.
2667
2668
2669{ # Closure
2670
2671 main::setup_package();
2672
2673 my %name;
2674 main::set_access('name', \%name, 'r');
2675
2676 my %loose_match;
c12f2655 2677 # Should this name match loosely or not.
99870f4d
KW
2678 main::set_access('loose_match', \%loose_match, 'r');
2679
33e96e72
KW
2680 my %make_re_pod_entry;
2681 # Some aliases should not get their own entries in the re section of the
2682 # pod, because they are covered by a wild-card, and some we want to
2683 # discourage use of. Binary
2684 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r');
99870f4d
KW
2685
2686 my %status;
2687 # Aliases have a status, like deprecated, or even suppressed (which means
2688 # they don't appear in documentation). Enum
2689 main::set_access('status', \%status, 'r');
2690
2691 my %externally_ok;
2692 # Similarly, some aliases should not be considered as usable ones for
2693 # external use, such as file names, or we don't want documentation to
2694 # recommend them. Boolean
2695 main::set_access('externally_ok', \%externally_ok, 'r');
2696
2697 sub new {
2698 my $class = shift;
2699
2700 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2701 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2702
2703 $name{$addr} = shift;
2704 $loose_match{$addr} = shift;
33e96e72 2705 $make_re_pod_entry{$addr} = shift;
99870f4d
KW
2706 $externally_ok{$addr} = shift;
2707 $status{$addr} = shift;
2708
2709 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2710
2711 # Null names are never ok externally
2712 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2713
2714 return $self;
2715 }
2716}
2717
2718package Range;
2719
2720# A range is the basic unit for storing code points, and is described in the
2721# comments at the beginning of the program. Each range has a starting code
2722# point; an ending code point (not less than the starting one); a value
2723# that applies to every code point in between the two end-points, inclusive;
2724# and an enum type that applies to the value. The type is for the user's
2725# convenience, and has no meaning here, except that a non-zero type is
2726# considered to not obey the normal Unicode rules for having standard forms.
2727#
2728# The same structure is used for both map and match tables, even though in the
2729# latter, the value (and hence type) is irrelevant and could be used as a
2730# comment. In map tables, the value is what all the code points in the range
2731# map to. Type 0 values have the standardized version of the value stored as
2732# well, so as to not have to recalculate it a lot.
2733
2734sub trace { return main::trace(@_); }
2735
2736{ # Closure
2737
2738 main::setup_package();
2739
2740 my %start;
2741 main::set_access('start', \%start, 'r', 's');
2742
2743 my %end;
2744 main::set_access('end', \%end, 'r', 's');
2745
2746 my %value;
2747 main::set_access('value', \%value, 'r');
2748
2749 my %type;
2750 main::set_access('type', \%type, 'r');
2751
2752 my %standard_form;
2753 # The value in internal standard form. Defined only if the type is 0.
2754 main::set_access('standard_form', \%standard_form);
2755
2756 # Note that if these fields change, the dump() method should as well
2757
2758 sub new {
2759 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2760 my $class = shift;
2761
2762 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2763 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2764
2765 $start{$addr} = shift;
2766 $end{$addr} = shift;
2767
2768 my %args = @_;
2769
2770 my $value = delete $args{'Value'}; # Can be 0
2771 $value = "" unless defined $value;
2772 $value{$addr} = $value;
2773
2774 $type{$addr} = delete $args{'Type'} || 0;
2775
2776 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2777
2778 if (! $type{$addr}) {
2779 $standard_form{$addr} = main::standardize($value);
2780 }
2781
2782 return $self;
2783 }
2784
2785 use overload
2786 fallback => 0,
2787 qw("") => "_operator_stringify",
2788 "." => \&main::_operator_dot,
2789 ;
2790
2791 sub _operator_stringify {
2792 my $self = shift;
ffe43484 2793 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2794
2795 # Output it like '0041..0065 (value)'
2796 my $return = sprintf("%04X", $start{$addr})
2797 . '..'
2798 . sprintf("%04X", $end{$addr});
2799 my $value = $value{$addr};
2800 my $type = $type{$addr};
2801 $return .= ' (';
2802 $return .= "$value";
2803 $return .= ", Type=$type" if $type != 0;
2804 $return .= ')';
2805
2806 return $return;
2807 }
2808
2809 sub standard_form {
2810 # The standard form is the value itself if the standard form is
2811 # undefined (that is if the value is special)
2812
2813 my $self = shift;
2814 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2815
ffe43484 2816 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2817
2818 return $standard_form{$addr} if defined $standard_form{$addr};
2819 return $value{$addr};
2820 }
2821
2822 sub dump {
2823 # Human, not machine readable. For machine readable, comment out this
2824 # entire routine and let the standard one take effect.
2825 my $self = shift;
2826 my $indent = shift;
2827 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2828
ffe43484 2829 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2830
2831 my $return = $indent
2832 . sprintf("%04X", $start{$addr})
2833 . '..'
2834 . sprintf("%04X", $end{$addr})
2835 . " '$value{$addr}';";
2836 if (! defined $standard_form{$addr}) {
2837 $return .= "(type=$type{$addr})";
2838 }
2839 elsif ($standard_form{$addr} ne $value{$addr}) {
2840 $return .= "(standard '$standard_form{$addr}')";
2841 }
2842 return $return;
2843 }
2844} # End closure
2845
2846package _Range_List_Base;
2847
2848# Base class for range lists. A range list is simply an ordered list of
2849# ranges, so that the ranges with the lowest starting numbers are first in it.
2850#
2851# When a new range is added that is adjacent to an existing range that has the
2852# same value and type, it merges with it to form a larger range.
2853#
2854# Ranges generally do not overlap, except that there can be multiple entries
2855# of single code point ranges. This is because of NameAliases.txt.
2856#
2857# In this program, there is a standard value such that if two different
2858# values, have the same standard value, they are considered equivalent. This
2859# value was chosen so that it gives correct results on Unicode data
2860
2861# There are a number of methods to manipulate range lists, and some operators
2862# are overloaded to handle them.
2863
99870f4d
KW
2864sub trace { return main::trace(@_); }
2865
2866{ # Closure
2867
2868 our $addr;
2869
2870 main::setup_package();
2871
2872 my %ranges;
2873 # The list of ranges
2874 main::set_access('ranges', \%ranges, 'readable_array');
2875
2876 my %max;
2877 # The highest code point in the list. This was originally a method, but
2878 # actual measurements said it was used a lot.
2879 main::set_access('max', \%max, 'r');
2880
2881 my %each_range_iterator;
2882 # Iterator position for each_range()
2883 main::set_access('each_range_iterator', \%each_range_iterator);
2884
2885 my %owner_name_of;
2886 # Name of parent this is attached to, if any. Solely for better error
2887 # messages.
2888 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2889
2890 my %_search_ranges_cache;
2891 # A cache of the previous result from _search_ranges(), for better
2892 # performance
2893 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2894
2895 sub new {
2896 my $class = shift;
2897 my %args = @_;
2898
2899 # Optional initialization data for the range list.
2900 my $initialize = delete $args{'Initialize'};
2901
2902 my $self;
2903
2904 # Use _union() to initialize. _union() returns an object of this
2905 # class, which means that it will call this constructor recursively.
2906 # But it won't have this $initialize parameter so that it won't
2907 # infinitely loop on this.
2908 return _union($class, $initialize, %args) if defined $initialize;
2909
2910 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2911 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2912
2913 # Optional parent object, only for debug info.
2914 $owner_name_of{$addr} = delete $args{'Owner'};
2915 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2916
2917 # Stringify, in case it is an object.
2918 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2919
2920 # This is used only for error messages, and so a colon is added
2921 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2922
2923 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2924
2925 # Max is initialized to a negative value that isn't adjacent to 0,
2926 # for simpler tests
2927 $max{$addr} = -2;
2928
2929 $_search_ranges_cache{$addr} = 0;
2930 $ranges{$addr} = [];
2931
2932 return $self;
2933 }
2934
2935 use overload
2936 fallback => 0,
2937 qw("") => "_operator_stringify",
2938 "." => \&main::_operator_dot,
2939 ;
2940
2941 sub _operator_stringify {
2942 my $self = shift;
ffe43484 2943 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2944
2945 return "Range_List attached to '$owner_name_of{$addr}'"
2946 if $owner_name_of{$addr};
2947 return "anonymous Range_List " . \$self;
2948 }
2949
2950 sub _union {
2951 # Returns the union of the input code points. It can be called as
2952 # either a constructor or a method. If called as a method, the result
2953 # will be a new() instance of the calling object, containing the union
2954 # of that object with the other parameter's code points; if called as
2955 # a constructor, the first parameter gives the class the new object
2956 # should be, and the second parameter gives the code points to go into
2957 # it.
2958 # In either case, there are two parameters looked at by this routine;
2959 # any additional parameters are passed to the new() constructor.
2960 #
2961 # The code points can come in the form of some object that contains
2962 # ranges, and has a conventionally named method to access them; or
2963 # they can be an array of individual code points (as integers); or
2964 # just a single code point.
2965 #
2966 # If they are ranges, this routine doesn't make any effort to preserve
2967 # the range values of one input over the other. Therefore this base
2968 # class should not allow _union to be called from other than
2969 # initialization code, so as to prevent two tables from being added
2970 # together where the range values matter. The general form of this
2971 # routine therefore belongs in a derived class, but it was moved here
2972 # to avoid duplication of code. The failure to overload this in this
2973 # class keeps it safe.
2974 #
2975
2976 my $self;
2977 my @args; # Arguments to pass to the constructor
2978
2979 my $class = shift;
2980
2981 # If a method call, will start the union with the object itself, and
2982 # the class of the new object will be the same as self.
2983 if (ref $class) {
2984 $self = $class;
2985 $class = ref $self;
2986 push @args, $self;
2987 }
2988
2989 # Add the other required parameter.
2990 push @args, shift;
2991 # Rest of parameters are passed on to the constructor
2992
2993 # Accumulate all records from both lists.
2994 my @records;
2995 for my $arg (@args) {
2996 #local $to_trace = 0 if main::DEBUG;
2997 trace "argument = $arg" if main::DEBUG && $to_trace;
2998 if (! defined $arg) {
2999 my $message = "";
3000 if (defined $self) {
f998e60c 3001 no overloading;
051df77b 3002 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
3003 }
3004 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
3005 return;
3006 }
3007 $arg = [ $arg ] if ! ref $arg;
3008 my $type = ref $arg;
3009 if ($type eq 'ARRAY') {
3010 foreach my $element (@$arg) {
3011 push @records, Range->new($element, $element);
3012 }
3013 }
3014 elsif ($arg->isa('Range')) {
3015 push @records, $arg;
3016 }
3017 elsif ($arg->can('ranges')) {
3018 push @records, $arg->ranges;
3019 }
3020 else {
3021 my $message = "";
3022 if (defined $self) {
f998e60c 3023 no overloading;
051df77b 3024 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
3025 }
3026 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3027 return;
3028 }
3029 }
3030
3031 # Sort with the range containing the lowest ordinal first, but if
3032 # two ranges start at the same code point, sort with the bigger range
3033 # of the two first, because it takes fewer cycles.
3034 @records = sort { ($a->start <=> $b->start)
3035 or
3036 # if b is shorter than a, b->end will be
3037 # less than a->end, and we want to select
3038 # a, so want to return -1
3039 ($b->end <=> $a->end)
3040 } @records;
3041
3042 my $new = $class->new(@_);
3043
3044 # Fold in records so long as they add new information.
3045 for my $set (@records) {
3046 my $start = $set->start;
3047 my $end = $set->end;
3048 my $value = $set->value;
3049 if ($start > $new->max) {
3050 $new->_add_delete('+', $start, $end, $value);
3051 }
3052 elsif ($end > $new->max) {
3053 $new->_add_delete('+', $new->max +1, $end, $value);
3054 }
3055 }
3056
3057 return $new;
3058 }
3059
3060 sub range_count { # Return the number of ranges in the range list
3061 my $self = shift;
3062 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3063
f998e60c 3064 no overloading;
051df77b 3065 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
3066 }
3067
3068 sub min {
3069 # Returns the minimum code point currently in the range list, or if
3070 # the range list is empty, 2 beyond the max possible. This is a
3071 # method because used so rarely, that not worth saving between calls,
3072 # and having to worry about changing it as ranges are added and
3073 # deleted.
3074
3075 my $self = shift;
3076 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3077
ffe43484 3078 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3079
3080 # If the range list is empty, return a large value that isn't adjacent
3081 # to any that could be in the range list, for simpler tests
6189eadc 3082 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
99870f4d
KW
3083 return $ranges{$addr}->[0]->start;
3084 }
3085
3086 sub contains {
3087 # Boolean: Is argument in the range list? If so returns $i such that:
3088 # range[$i]->end < $codepoint <= range[$i+1]->end
3089 # which is one beyond what you want; this is so that the 0th range
3090 # doesn't return false
3091 my $self = shift;
3092 my $codepoint = shift;
3093 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3094
99870f4d
KW
3095 my $i = $self->_search_ranges($codepoint);
3096 return 0 unless defined $i;
3097
3098 # The search returns $i, such that
3099 # range[$i-1]->end < $codepoint <= range[$i]->end
3100 # So is in the table if and only iff it is at least the start position
3101 # of range $i.
f998e60c 3102 no overloading;
051df77b 3103 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
3104 return $i + 1;
3105 }
3106
2f7a8815
KW
3107 sub containing_range {
3108 # Returns the range object that contains the code point, undef if none
3109
3110 my $self = shift;
3111 my $codepoint = shift;
3112 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3113
3114 my $i = $self->contains($codepoint);
3115 return unless $i;
3116
3117 # contains() returns 1 beyond where we should look
3118 no overloading;
3119 return $ranges{pack 'J', $self}->[$i-1];
3120 }
3121
99870f4d
KW
3122 sub value_of {
3123 # Returns the value associated with the code point, undef if none
3124
3125 my $self = shift;
3126 my $codepoint = shift;
3127 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3128
d69c231b
KW
3129 my $range = $self->containing_range($codepoint);
3130 return unless defined $range;
99870f4d 3131
d69c231b 3132 return $range->value;
99870f4d
KW
3133 }
3134
0a9dbafc
KW
3135 sub type_of {
3136 # Returns the type of the range containing the code point, undef if
3137 # the code point is not in the table
3138
3139 my $self = shift;
3140 my $codepoint = shift;
3141 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3142
3143 my $range = $self->containing_range($codepoint);
3144 return unless defined $range;
3145
3146 return $range->type;
3147 }
3148
99870f4d
KW
3149 sub _search_ranges {
3150 # Find the range in the list which contains a code point, or where it
3151 # should go if were to add it. That is, it returns $i, such that:
3152 # range[$i-1]->end < $codepoint <= range[$i]->end
3153 # Returns undef if no such $i is possible (e.g. at end of table), or
3154 # if there is an error.
3155
3156 my $self = shift;
3157 my $code_point = shift;
3158 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3159
ffe43484 3160 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3161
3162 return if $code_point > $max{$addr};
3163 my $r = $ranges{$addr}; # The current list of ranges
3164 my $range_list_size = scalar @$r;
3165 my $i;
3166
3167 use integer; # want integer division
3168
3169 # Use the cached result as the starting guess for this one, because,
3170 # an experiment on 5.1 showed that 90% of the time the cache was the
3171 # same as the result on the next call (and 7% it was one less).
3172 $i = $_search_ranges_cache{$addr};
3173 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3174 # from an intervening deletion
3175 #local $to_trace = 1 if main::DEBUG;
3176 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3177 return $i if $code_point <= $r->[$i]->end
3178 && ($i == 0 || $r->[$i-1]->end < $code_point);
3179
3180 # Here the cache doesn't yield the correct $i. Try adding 1.
3181 if ($i < $range_list_size - 1
3182 && $r->[$i]->end < $code_point &&
3183 $code_point <= $r->[$i+1]->end)
3184 {
3185 $i++;
3186 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3187 $_search_ranges_cache{$addr} = $i;
3188 return $i;
3189 }
3190
3191 # Here, adding 1 also didn't work. We do a binary search to
3192 # find the correct position, starting with current $i
3193 my $lower = 0;
3194 my $upper = $range_list_size - 1;
3195 while (1) {
3196 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3197
3198 if ($code_point <= $r->[$i]->end) {
3199
3200 # Here we have met the upper constraint. We can quit if we
3201 # also meet the lower one.
3202 last if $i == 0 || $r->[$i-1]->end < $code_point;
3203
3204 $upper = $i; # Still too high.
3205
3206 }
3207 else {
3208
3209 # Here, $r[$i]->end < $code_point, so look higher up.
3210 $lower = $i;
3211 }
3212
3213 # Split search domain in half to try again.
3214 my $temp = ($upper + $lower) / 2;
3215
3216 # No point in continuing unless $i changes for next time
3217 # in the loop.
3218 if ($temp == $i) {
3219
3220 # We can't reach the highest element because of the averaging.
3221 # So if one below the upper edge, force it there and try one
3222 # more time.
3223 if ($i == $range_list_size - 2) {
3224
3225 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3226 $i = $range_list_size - 1;
3227
3228 # Change $lower as well so if fails next time through,
3229 # taking the average will yield the same $i, and we will
3230 # quit with the error message just below.
3231 $lower = $i;
3232 next;
3233 }
3234 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3235 return;
3236 }
3237 $i = $temp;
3238 } # End of while loop
3239
3240 if (main::DEBUG && $to_trace) {
3241 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3242 trace "i= [ $i ]", $r->[$i];
3243 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3244 }
3245
3246 # Here we have found the offset. Cache it as a starting point for the
3247 # next call.
3248 $_search_ranges_cache{$addr} = $i;
3249 return $i;
3250 }
3251
3252 sub _add_delete {
3253 # Add, replace or delete ranges to or from a list. The $type
3254 # parameter gives which:
3255 # '+' => insert or replace a range, returning a list of any changed
3256 # ranges.
3257 # '-' => delete a range, returning a list of any deleted ranges.
3258 #
3259 # The next three parameters give respectively the start, end, and
3260 # value associated with the range. 'value' should be null unless the
3261 # operation is '+';
3262 #
3263 # The range list is kept sorted so that the range with the lowest
3264 # starting position is first in the list, and generally, adjacent
c1739a4a 3265 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3266 # exceptions below).
3267 #
c1739a4a 3268 # There are more parameters; all are key => value pairs:
99870f4d
KW
3269 # Type gives the type of the value. It is only valid for '+'.
3270 # All ranges have types; if this parameter is omitted, 0 is
3271 # assumed. Ranges with type 0 are assumed to obey the
3272 # Unicode rules for casing, etc; ranges with other types are
3273 # not. Otherwise, the type is arbitrary, for the caller's
3274 # convenience, and looked at only by this routine to keep
3275 # adjacent ranges of different types from being merged into
3276 # a single larger range, and when Replace =>
3277 # $IF_NOT_EQUIVALENT is specified (see just below).
3278 # Replace determines what to do if the range list already contains
3279 # ranges which coincide with all or portions of the input
3280 # range. It is only valid for '+':
3281 # => $NO means that the new value is not to replace
3282 # any existing ones, but any empty gaps of the
3283 # range list coinciding with the input range
3284 # will be filled in with the new value.
3285 # => $UNCONDITIONALLY means to replace the existing values with
3286 # this one unconditionally. However, if the
3287 # new and old values are identical, the
3288 # replacement is skipped to save cycles
3289 # => $IF_NOT_EQUIVALENT means to replace the existing values
3290 # with this one if they are not equivalent.
3291 # Ranges are equivalent if their types are the
c1739a4a 3292 # same, and they are the same string; or if
99870f4d
KW
3293 # both are type 0 ranges, if their Unicode
3294 # standard forms are identical. In this last
3295 # case, the routine chooses the more "modern"
3296 # one to use. This is because some of the
3297 # older files are formatted with values that
3298 # are, for example, ALL CAPs, whereas the
3299 # derived files have a more modern style,
3300 # which looks better. By looking for this
3301 # style when the pre-existing and replacement
3302 # standard forms are the same, we can move to
3303 # the modern style
3304 # => $MULTIPLE means that if this range duplicates an
3305 # existing one, but has a different value,
3306 # don't replace the existing one, but insert
3307 # this, one so that the same range can occur
53d84487
KW
3308 # multiple times. They are stored LIFO, so
3309 # that the final one inserted is the first one
3310 # returned in an ordered search of the table.
99870f4d
KW
3311 # => anything else is the same as => $IF_NOT_EQUIVALENT
3312 #
c1739a4a
KW
3313 # "same value" means identical for non-type-0 ranges, and it means
3314 # having the same standard forms for type-0 ranges.
99870f4d
KW
3315
3316 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3317
3318 my $self = shift;
3319 my $operation = shift; # '+' for add/replace; '-' for delete;
3320 my $start = shift;
3321 my $end = shift;
3322 my $value = shift;
3323
3324 my %args = @_;
3325
3326 $value = "" if not defined $value; # warning: $value can be "0"
3327
3328 my $replace = delete $args{'Replace'};
3329 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3330
3331 my $type = delete $args{'Type'};
3332 $type = 0 unless defined $type;
3333
3334 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3335
ffe43484 3336 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3337
3338 if ($operation ne '+' && $operation ne '-') {
3339 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3340 return;
3341 }
3342 unless (defined $start && defined $end) {
3343 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3344 return;
3345 }
3346 unless ($end >= $start) {
3347 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3348 return;
3349 }
3350 #local $to_trace = 1 if main::DEBUG;
3351
3352 if ($operation eq '-') {
3353 if ($replace != $IF_NOT_EQUIVALENT) {
3354 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3355 $replace = $IF_NOT_EQUIVALENT;
3356 }
3357 if ($type) {
3358 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3359 $type = 0;
3360 }
3361 if ($value ne "") {
3362 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3363 $value = "";
3364 }
3365 }
3366
3367 my $r = $ranges{$addr}; # The current list of ranges
3368 my $range_list_size = scalar @$r; # And its size
3369 my $max = $max{$addr}; # The current high code point in
3370 # the list of ranges
3371
3372 # Do a special case requiring fewer machine cycles when the new range
3373 # starts after the current highest point. The Unicode input data is
3374 # structured so this is common.
3375 if ($start > $max) {
3376
3377 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3378 return if $operation eq '-'; # Deleting a non-existing range is a
3379 # no-op
3380
3381 # If the new range doesn't logically extend the current final one
3382 # in the range list, create a new range at the end of the range
3383 # list. (max cleverly is initialized to a negative number not
3384 # adjacent to 0 if the range list is empty, so even adding a range
3385 # to an empty range list starting at 0 will have this 'if'
3386 # succeed.)
3387 if ($start > $max + 1 # non-adjacent means can't extend.
3388 || @{$r}[-1]->value ne $value # values differ, can't extend.
3389 || @{$r}[-1]->type != $type # types differ, can't extend.
3390 ) {
3391 push @$r, Range->new($start, $end,
3392 Value => $value,
3393 Type => $type);
3394 }
3395 else {
3396
3397 # Here, the new range starts just after the current highest in
3398 # the range list, and they have the same type and value.
3399 # Extend the current range to incorporate the new one.
3400 @{$r}[-1]->set_end($end);
3401 }
3402
3403 # This becomes the new maximum.
3404 $max{$addr} = $end;
3405
3406 return;
3407 }
3408 #local $to_trace = 0 if main::DEBUG;
3409
3410 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3411
3412 # Here, the input range isn't after the whole rest of the range list.
3413 # Most likely 'splice' will be needed. The rest of the routine finds
3414 # the needed splice parameters, and if necessary, does the splice.
3415 # First, find the offset parameter needed by the splice function for
3416 # the input range. Note that the input range may span multiple
3417 # existing ones, but we'll worry about that later. For now, just find
3418 # the beginning. If the input range is to be inserted starting in a
3419 # position not currently in the range list, it must (obviously) come
3420 # just after the range below it, and just before the range above it.
3421 # Slightly less obviously, it will occupy the position currently
3422 # occupied by the range that is to come after it. More formally, we
3423 # are looking for the position, $i, in the array of ranges, such that:
3424 #
3425 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3426 #
3427 # (The ordered relationships within existing ranges are also shown in
3428 # the equation above). However, if the start of the input range is
3429 # within an existing range, the splice offset should point to that
3430 # existing range's position in the list; that is $i satisfies a
3431 # somewhat different equation, namely:
3432 #
3433 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3434 #
3435 # More briefly, $start can come before or after r[$i]->start, and at
3436 # this point, we don't know which it will be. However, these
3437 # two equations share these constraints:
3438 #
3439 # r[$i-1]->end < $start <= r[$i]->end
3440 #
3441 # And that is good enough to find $i.
3442
3443 my $i = $self->_search_ranges($start);
3444 if (! defined $i) {
3445 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3446 return;
3447 }
3448
3449 # The search function returns $i such that:
3450 #
3451 # r[$i-1]->end < $start <= r[$i]->end
3452 #
3453 # That means that $i points to the first range in the range list
3454 # that could possibly be affected by this operation. We still don't
3455 # know if the start of the input range is within r[$i], or if it
3456 # points to empty space between r[$i-1] and r[$i].
3457 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3458
3459 # Special case the insertion of data that is not to replace any
3460 # existing data.
3461 if ($replace == $NO) { # If $NO, has to be operation '+'
3462 #local $to_trace = 1 if main::DEBUG;
3463 trace "Doesn't replace" if main::DEBUG && $to_trace;
3464
3465 # Here, the new range is to take effect only on those code points
3466 # that aren't already in an existing range. This can be done by
3467 # looking through the existing range list and finding the gaps in
3468 # the ranges that this new range affects, and then calling this
3469 # function recursively on each of those gaps, leaving untouched
3470 # anything already in the list. Gather up a list of the changed
3471 # gaps first so that changes to the internal state as new ranges
3472 # are added won't be a problem.
3473 my @gap_list;
3474
3475 # First, if the starting point of the input range is outside an
3476 # existing one, there is a gap from there to the beginning of the
3477 # existing range -- add a span to fill the part that this new
3478 # range occupies
3479 if ($start < $r->[$i]->start) {
3480 push @gap_list, Range->new($start,
3481 main::min($end,
3482 $r->[$i]->start - 1),
3483 Type => $type);
3484 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3485 }
3486
3487 # Then look through the range list for other gaps until we reach
3488 # the highest range affected by the input one.
3489 my $j;
3490 for ($j = $i+1; $j < $range_list_size; $j++) {
3491 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3492 last if $end < $r->[$j]->start;
3493
3494 # If there is a gap between when this range starts and the
3495 # previous one ends, add a span to fill it. Note that just
3496 # because there are two ranges doesn't mean there is a
3497 # non-zero gap between them. It could be that they have
3498 # different values or types
3499 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3500 push @gap_list,
3501 Range->new($r->[$j-1]->end + 1,
3502 $r->[$j]->start - 1,
3503 Type => $type);
3504 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3505 }
3506 }
3507
3508 # Here, we have either found an existing range in the range list,
3509 # beyond the area affected by the input one, or we fell off the
3510 # end of the loop because the input range affects the whole rest
3511 # of the range list. In either case, $j is 1 higher than the
3512 # highest affected range. If $j == $i, it means that there are no
3513 # affected ranges, that the entire insertion is in the gap between
3514 # r[$i-1], and r[$i], which we already have taken care of before
3515 # the loop.
3516 # On the other hand, if there are affected ranges, it might be
3517 # that there is a gap that needs filling after the final such
3518 # range to the end of the input range
3519 if ($r->[$j-1]->end < $end) {
3520 push @gap_list, Range->new(main::max($start,
3521 $r->[$j-1]->end + 1),
3522 $end,
3523 Type => $type);
3524 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3525 }
3526
3527 # Call recursively to fill in all the gaps.
3528 foreach my $gap (@gap_list) {
3529 $self->_add_delete($operation,
3530 $gap->start,
3531 $gap->end,
3532 $value,
3533 Type => $type);
3534 }
3535
3536 return;
3537 }
3538
53d84487
KW
3539 # Here, we have taken care of the case where $replace is $NO.
3540 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3541 # If inserting a multiple record, this is where it goes, before the
3542 # first (if any) existing one. This implies an insertion, and no
3543 # change to any existing ranges. Note that $i can be -1 if this new
3544 # range doesn't actually duplicate any existing, and comes at the
3545 # beginning of the list.
3546 if ($replace == $MULTIPLE) {
3547
3548 if ($start != $end) {
3549 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3550 return;
3551 }
3552
3553 # Don't add an exact duplicate, as it isn't really a multiple
3554 if ($end >= $r->[$i]->start) {
1f6798c4
KW
3555 my $existing_value = $r->[$i]->value;
3556 my $existing_type = $r->[$i]->type;
3557 return if $value eq $existing_value && $type eq $existing_type;
3558
3559 # If the multiple value is part of an existing range, we want
3560 # to split up that range, so that only the single code point
3561 # is affected. To do this, we first call ourselves
3562 # recursively to delete that code point from the table, having
3563 # preserved its current data above. Then we call ourselves
3564 # recursively again to add the new multiple, which we know by
3565 # the test just above is different than the current code
3566 # point's value, so it will become a range containing a single
3567 # code point: just itself. Finally, we add back in the
3568 # pre-existing code point, which will again be a single code
3569 # point range. Because 'i' likely will have changed as a
3570 # result of these operations, we can't just continue on, but
3571 # do this operation recursively as well.
53d84487 3572 if ($r->[$i]->start != $r->[$i]->end) {
1f6798c4
KW
3573 $self->_add_delete('-', $start, $end, "");
3574 $self->_add_delete('+', $start, $end, $value, Type => $type);
3575 return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
53d84487 3576 }
53d84487
KW
3577 }
3578
3579 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3580 my @return = splice @$r,
3581 $i,
3582 0,
3583 Range->new($start,
3584 $end,
3585 Value => $value,
3586 Type => $type);
3587 if (main::DEBUG && $to_trace) {
3588 trace "After splice:";
3589 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3590 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3591 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3592 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3593 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3594 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3595 }
3596 return @return;
3597 }
3598
3599 # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
3600 # delete, insert, and replace either unconditionally or if not
3601 # equivalent. $i still points to the first potential affected range.
3602 # Now find the highest range affected, which will determine the length
3603 # parameter to splice. (The input range can span multiple existing
3604 # ones.) If this isn't a deletion, while we are looking through the
3605 # range list, see also if this is a replacement rather than a clean
3606 # insertion; that is if it will change the values of at least one
3607 # existing range. Start off assuming it is an insert, until find it
3608 # isn't.
3609 my $clean_insert = $operation eq '+';
99870f4d
KW
3610 my $j; # This will point to the highest affected range
3611
3612 # For non-zero types, the standard form is the value itself;
3613 my $standard_form = ($type) ? $value : main::standardize($value);
3614
3615 for ($j = $i; $j < $range_list_size; $j++) {
3616 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3617
3618 # If find a range that it doesn't overlap into, we can stop
3619 # searching
3620 last if $end < $r->[$j]->start;
3621
969a34cc
KW
3622 # Here, overlaps the range at $j. If the values don't match,
3623 # and so far we think this is a clean insertion, it becomes a
3624 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3625 if ($clean_insert) {
99870f4d 3626 if ($r->[$j]->standard_form ne $standard_form) {
969a34cc 3627 $clean_insert = 0;
56343c78
KW
3628 if ($replace == $CROAK) {
3629 main::croak("The range to add "
3630 . sprintf("%04X", $start)
3631 . '-'
3632 . sprintf("%04X", $end)
3633 . " with value '$value' overlaps an existing range $r->[$j]");
3634 }
99870f4d
KW
3635 }
3636 else {
3637
3638 # Here, the two values are essentially the same. If the
3639 # two are actually identical, replacing wouldn't change
3640 # anything so skip it.
3641 my $pre_existing = $r->[$j]->value;
3642 if ($pre_existing ne $value) {
3643
3644 # Here the new and old standardized values are the
3645 # same, but the non-standardized values aren't. If
3646 # replacing unconditionally, then replace
3647 if( $replace == $UNCONDITIONALLY) {
969a34cc 3648 $clean_insert = 0;
99870f4d
KW
3649 }
3650 else {
3651
3652 # Here, are replacing conditionally. Decide to
3653 # replace or not based on which appears to look
3654 # the "nicest". If one is mixed case and the
3655 # other isn't, choose the mixed case one.
3656 my $new_mixed = $value =~ /[A-Z]/
3657 && $value =~ /[a-z]/;
3658 my $old_mixed = $pre_existing =~ /[A-Z]/
3659 && $pre_existing =~ /[a-z]/;
3660
3661 if ($old_mixed != $new_mixed) {
969a34cc 3662 $clean_insert = 0 if $new_mixed;
99870f4d 3663 if (main::DEBUG && $to_trace) {
969a34cc
KW
3664 if ($clean_insert) {
3665 trace "Retaining $pre_existing over $value";
99870f4d
KW
3666 }
3667 else {
969a34cc 3668 trace "Replacing $pre_existing with $value";
99870f4d
KW
3669 }
3670 }
3671 }
3672 else {
3673
3674 # Here casing wasn't different between the two.
3675 # If one has hyphens or underscores and the
3676 # other doesn't, choose the one with the
3677 # punctuation.
3678 my $new_punct = $value =~ /[-_]/;
3679 my $old_punct = $pre_existing =~ /[-_]/;
3680
3681 if ($old_punct != $new_punct) {
969a34cc 3682 $clean_insert = 0 if $new_punct;
99870f4d 3683 if (main::DEBUG && $to_trace) {
969a34cc
KW
3684 if ($clean_insert) {
3685 trace "Retaining $pre_existing over $value";
99870f4d
KW
3686 }
3687 else {
969a34cc 3688 trace "Replacing $pre_existing with $value";
99870f4d
KW
3689 }
3690 }
3691 } # else existing one is just as "good";
3692 # retain it to save cycles.
3693 }
3694 }
3695 }
3696 }
3697 }
3698 } # End of loop looking for highest affected range.
3699
3700 # Here, $j points to one beyond the highest range that this insertion
3701 # affects (hence to beyond the range list if that range is the final
3702 # one in the range list).
3703
3704 # The splice length is all the affected ranges. Get it before
3705 # subtracting, for efficiency, so we don't have to later add 1.
3706 my $length = $j - $i;
3707
3708 $j--; # $j now points to the highest affected range.
3709 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3710
99870f4d
KW
3711 # Here, have taken care of $NO and $MULTIPLE replaces.
3712 # $j points to the highest affected range. But it can be < $i or even
3713 # -1. These happen only if the insertion is entirely in the gap
3714 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3715 # above exited first time through with $end < $r->[$i]->start. (And
3716 # then we subtracted one from j) This implies also that $start <
3717 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3718 # $start, so the entire input range is in the gap.
3719 if ($j < $i) {
3720
3721 # Here the entire input range is in the gap before $i.
3722
3723 if (main::DEBUG && $to_trace) {
3724 if ($i) {
3725 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3726 }
3727 else {
3728 trace "Entire range is before $r->[$i]";
3729 }
3730 }
3731 return if $operation ne '+'; # Deletion of a non-existent range is
3732 # a no-op
3733 }
3734 else {
3735
969a34cc
KW
3736 # Here part of the input range is not in the gap before $i. Thus,
3737 # there is at least one affected one, and $j points to the highest
3738 # such one.
99870f4d
KW
3739
3740 # At this point, here is the situation:
3741 # This is not an insertion of a multiple, nor of tentative ($NO)
3742 # data.
3743 # $i points to the first element in the current range list that
3744 # may be affected by this operation. In fact, we know
3745 # that the range at $i is affected because we are in
3746 # the else branch of this 'if'
3747 # $j points to the highest affected range.
3748 # In other words,
3749 # r[$i-1]->end < $start <= r[$i]->end
3750 # And:
3751 # r[$i-1]->end < $start <= $end <= r[$j]->end
3752 #
3753 # Also:
969a34cc
KW
3754 # $clean_insert is a boolean which is set true if and only if
3755 # this is a "clean insertion", i.e., not a change nor a
3756 # deletion (multiple was handled above).
99870f4d
KW
3757
3758 # We now have enough information to decide if this call is a no-op
969a34cc
KW
3759 # or not. It is a no-op if this is an insertion of already
3760 # existing data.
99870f4d 3761
969a34cc 3762 if (main::DEBUG && $to_trace && $clean_insert
99870f4d
KW
3763 && $i == $j
3764 && $start >= $r->[$i]->start)
3765 {
3766 trace "no-op";
3767 }
969a34cc 3768 return if $clean_insert
99870f4d
KW
3769 && $i == $j # more than one affected range => not no-op
3770
3771 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3772 # Further, $start and/or $end is >= r[$i]->start
3773 # The test below hence guarantees that
3774 # r[$i]->start < $start <= $end <= r[$i]->end
3775 # This means the input range is contained entirely in
3776 # the one at $i, so is a no-op
3777 && $start >= $r->[$i]->start;
3778 }
3779
3780 # Here, we know that some action will have to be taken. We have
3781 # calculated the offset and length (though adjustments may be needed)
3782 # for the splice. Now start constructing the replacement list.
3783 my @replacement;
3784 my $splice_start = $i;
3785
3786 my $extends_below;
3787 my $extends_above;
3788
3789 # See if should extend any adjacent ranges.
3790 if ($operation eq '-') { # Don't extend deletions
3791 $extends_below = $extends_above = 0;
3792 }
3793 else { # Here, should extend any adjacent ranges. See if there are
3794 # any.
3795 $extends_below = ($i > 0
3796 # can't extend unless adjacent
3797 && $r->[$i-1]->end == $start -1
3798 # can't extend unless are same standard value
3799 && $r->[$i-1]->standard_form eq $standard_form
3800 # can't extend unless share type
3801 && $r->[$i-1]->type == $type);
3802 $extends_above = ($j+1 < $range_list_size
3803 && $r->[$j+1]->start == $end +1
3804 && $r->[$j+1]->standard_form eq $standard_form
23822bda 3805 && $r->[$j+1]->type == $type);
99870f4d
KW
3806 }
3807 if ($extends_below && $extends_above) { # Adds to both
3808 $splice_start--; # start replace at element below
3809 $length += 2; # will replace on both sides
3810 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3811
3812 # The result will fill in any gap, replacing both sides, and
3813 # create one large range.
3814 @replacement = Range->new($r->[$i-1]->start,
3815 $r->[$j+1]->end,
3816 Value => $value,
3817 Type => $type);
3818 }
3819 else {
3820
3821 # Here we know that the result won't just be the conglomeration of
3822 # a new range with both its adjacent neighbors. But it could
3823 # extend one of them.
3824
3825 if ($extends_below) {
3826
3827 # Here the new element adds to the one below, but not to the
3828 # one above. If inserting, and only to that one range, can
3829 # just change its ending to include the new one.
969a34cc 3830 if ($length == 0 && $clean_insert) {
99870f4d
KW
3831 $r->[$i-1]->set_end($end);
3832 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3833 return;
3834 }
3835 else {
3836 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3837 $splice_start--; # start replace at element below
3838 $length++; # will replace the element below
3839 $start = $r->[$i-1]->start;
3840 }
3841 }
3842 elsif ($extends_above) {
3843
3844 # Here the new element adds to the one above, but not below.
3845 # Mirror the code above
969a34cc 3846 if ($length == 0 && $clean_insert) {
99870f4d
KW
3847 $r->[$j+1]->set_start($start);
3848 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3849 return;
3850 }
3851 else {
3852 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3853 $length++; # will replace the element above
3854 $end = $r->[$j+1]->end;
3855 }
3856 }
3857
3858 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3859
3860 # Finally, here we know there will have to be a splice.
3861 # If the change or delete affects only the highest portion of the
3862 # first affected range, the range will have to be split. The
3863 # splice will remove the whole range, but will replace it by a new
3864 # range containing just the unaffected part. So, in this case,
3865 # add to the replacement list just this unaffected portion.
3866 if (! $extends_below
3867 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3868 {
3869 push @replacement,
3870 Range->new($r->[$i]->start,
3871 $start - 1,
3872 Value => $r->[$i]->value,
3873 Type => $r->[$i]->type);
3874 }
3875
3876 # In the case of an insert or change, but not a delete, we have to
3877 # put in the new stuff; this comes next.
3878 if ($operation eq '+') {
3879 push @replacement, Range->new($start,
3880 $end,
3881 Value => $value,
3882 Type => $type);
3883 }
3884
3885 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3886 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3887
3888 # And finally, if we're changing or deleting only a portion of the
3889 # highest affected range, it must be split, as the lowest one was.
3890 if (! $extends_above
3891 && $j >= 0 # Remember that j can be -1 if before first
3892 # current element
3893 && $end >= $r->[$j]->start
3894 && $end < $r->[$j]->end)
3895 {
3896 push @replacement,
3897 Range->new($end + 1,
3898 $r->[$j]->end,
3899 Value => $r->[$j]->value,
3900 Type => $r->[$j]->type);
3901 }
3902 }
3903
3904 # And do the splice, as calculated above
3905 if (main::DEBUG && $to_trace) {
3906 trace "replacing $length element(s) at $i with ";
3907 foreach my $replacement (@replacement) {
3908 trace " $replacement";
3909 }
3910 trace "Before splice:";
3911 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3912 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3913 trace "i =[", $i, "]", $r->[$i];
3914 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3915 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3916 }
3917
3918 my @return = splice @$r, $splice_start, $length, @replacement;
3919
3920 if (main::DEBUG && $to_trace) {
3921 trace "After splice:";
3922 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3923 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3924 trace "i =[", $i, "]", $r->[$i];
3925 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3926 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
e6451557 3927 trace "removed ", @return if @return;
99870f4d
KW
3928 }
3929
3930 # An actual deletion could have changed the maximum in the list.
3931 # There was no deletion if the splice didn't return something, but
3932 # otherwise recalculate it. This is done too rarely to worry about
3933 # performance.
3934 if ($operation eq '-' && @return) {
3935 $max{$addr} = $r->[-1]->end;
3936 }
3937 return @return;
3938 }
3939
3940 sub reset_each_range { # reset the iterator for each_range();
3941 my $self = shift;
3942 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3943
f998e60c 3944 no overloading;
051df77b 3945 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
3946 return;
3947 }
3948
3949 sub each_range {
3950 # Iterate over each range in a range list. Results are undefined if
3951 # the range list is changed during the iteration.
3952
3953 my $self = shift;
3954 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3955
ffe43484 3956 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3957
3958 return if $self->is_empty;
3959
3960 $each_range_iterator{$addr} = -1
3961 if ! defined $each_range_iterator{$addr};
3962 $each_range_iterator{$addr}++;
3963 return $ranges{$addr}->[$each_range_iterator{$addr}]
3964 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3965 undef $each_range_iterator{$addr};
3966 return;
3967 }
3968
3969 sub count { # Returns count of code points in range list
3970 my $self = shift;
3971 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3972
ffe43484 3973 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3974
3975 my $count = 0;
3976 foreach my $range (@{$ranges{$addr}}) {
3977 $count += $range->end - $range->start + 1;
3978 }
3979 return $count;
3980 }
3981
3982 sub delete_range { # Delete a range
3983 my $self = shift;
3984 my $start = shift;
3985 my $end = shift;
3986
3987 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3988
3989 return $self->_add_delete('-', $start, $end, "");
3990 }
3991
3992 sub is_empty { # Returns boolean as to if a range list is empty
3993 my $self = shift;
3994 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3995
f998e60c 3996 no overloading;
051df77b 3997 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
3998 }
3999
4000 sub hash {
4001 # Quickly returns a scalar suitable for separating tables into
4002 # buckets, i.e. it is a hash function of the contents of a table, so
4003 # there are relatively few conflicts.
4004
4005 my $self = shift;
4006 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4007
ffe43484 4008 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4009
4010 # These are quickly computable. Return looks like 'min..max;count'
4011 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4012 }
4013} # End closure for _Range_List_Base
4014
4015package Range_List;
4016use base '_Range_List_Base';
4017
4018# A Range_List is a range list for match tables; i.e. the range values are
4019# not significant. Thus a number of operations can be safely added to it,
4020# such as inversion, intersection. Note that union is also an unsafe
4021# operation when range values are cared about, and that method is in the base
4022# class, not here. But things are set up so that that method is callable only
4023# during initialization. Only in this derived class, is there an operation
4024# that combines two tables. A Range_Map can thus be used to initialize a
4025# Range_List, and its mappings will be in the list, but are not significant to
4026# this class.
4027
4028sub trace { return main::trace(@_); }
4029
4030{ # Closure
4031
4032 use overload
4033 fallback => 0,
4034 '+' => sub { my $self = shift;
4035 my $other = shift;
4036
4037 return $self->_union($other)
4038 },
4039 '&' => sub { my $self = shift;
4040 my $other = shift;
4041
4042 return $self->_intersect($other, 0);
4043 },
4044 '~' => "_invert",
4045 '-' => "_subtract",
4046 ;
4047
4048 sub _invert {
4049 # Returns a new Range_List that gives all code points not in $self.
4050
4051 my $self = shift;
4052
4053 my $new = Range_List->new;
4054
4055 # Go through each range in the table, finding the gaps between them
4056 my $max = -1; # Set so no gap before range beginning at 0
4057 for my $range ($self->ranges) {
4058 my $start = $range->start;
4059 my $end = $range->end;
4060
4061 # If there is a gap before this range, the inverse will contain
4062 # that gap.
4063 if ($start > $max + 1) {
4064 $new->add_range($max + 1, $start - 1);
4065 }
4066 $max = $end;
4067 }
4068
4069 # And finally, add the gap from the end of the table to the max
4070 # possible code point
6189eadc
KW
4071 if ($max < $MAX_UNICODE_CODEPOINT) {
4072 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
99870f4d
KW
4073 }
4074 return $new;
4075 }
4076
4077 sub _subtract {
4078 # Returns a new Range_List with the argument deleted from it. The
4079 # argument can be a single code point, a range, or something that has
4080 # a range, with the _range_list() method on it returning them
4081
4082 my $self = shift;
4083 my $other = shift;
4084 my $reversed = shift;
4085 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4086
4087 if ($reversed) {
4088 Carp::my_carp_bug("Can't cope with a "
4089 . __PACKAGE__
4090 . " being the second parameter in a '-'. Subtraction ignored.");
4091 return $self;
4092 }
4093
4094 my $new = Range_List->new(Initialize => $self);
4095
4096 if (! ref $other) { # Single code point
4097 $new->delete_range($other, $other);
4098 }
4099 elsif ($other->isa('Range')) {
4100 $new->delete_range($other->start, $other->end);
4101 }
4102 elsif ($other->can('_range_list')) {
4103 foreach my $range ($other->_range_list->ranges) {
4104 $new->delete_range($range->start, $range->end);
4105 }
4106 }
4107 else {
4108 Carp::my_carp_bug("Can't cope with a "
4109 . ref($other)
4110 . " argument to '-'. Subtraction ignored."
4111 );
4112 return $self;
4113 }
4114
4115 return $new;
4116 }
4117
4118 sub _intersect {
4119 # Returns either a boolean giving whether the two inputs' range lists
4120 # intersect (overlap), or a new Range_List containing the intersection
4121 # of the two lists. The optional final parameter being true indicates
4122 # to do the check instead of the intersection.
4123
4124 my $a_object = shift;
4125 my $b_object = shift;
4126 my $check_if_overlapping = shift;
4127 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4128 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4129
4130 if (! defined $b_object) {
4131 my $message = "";
4132 $message .= $a_object->_owner_name_of if defined $a_object;
4133 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4134 return;
4135 }
4136
4137 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4138 # Thus the intersection could be much more simply be written:
4139 # return ~(~$a_object + ~$b_object);
4140 # But, this is slower, and when taking the inverse of a large
4141 # range_size_1 table, back when such tables were always stored that
4142 # way, it became prohibitively slow, hence the code was changed to the
4143 # below
4144
4145 if ($b_object->isa('Range')) {
4146 $b_object = Range_List->new(Initialize => $b_object,
4147 Owner => $a_object->_owner_name_of);
4148 }
4149 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4150
4151 my @a_ranges = $a_object->ranges;
4152 my @b_ranges = $b_object->ranges;
4153
4154 #local $to_trace = 1 if main::DEBUG;
4155 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4156
4157 # Start with the first range in each list
4158 my $a_i = 0;
4159 my $range_a = $a_ranges[$a_i];
4160 my $b_i = 0;
4161 my $range_b = $b_ranges[$b_i];
4162
4163 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4164 if ! $check_if_overlapping;
4165
4166 # If either list is empty, there is no intersection and no overlap
4167 if (! defined $range_a || ! defined $range_b) {
4168 return $check_if_overlapping ? 0 : $new;
4169 }
4170 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4171
4172 # Otherwise, must calculate the intersection/overlap. Start with the
4173 # very first code point in each list
4174 my $a = $range_a->start;
4175 my $b = $range_b->start;
4176
4177 # Loop through all the ranges of each list; in each iteration, $a and
4178 # $b are the current code points in their respective lists
4179 while (1) {
4180
4181 # If $a and $b are the same code point, ...
4182 if ($a == $b) {
4183
4184 # it means the lists overlap. If just checking for overlap
4185 # know the answer now,
4186 return 1 if $check_if_overlapping;
4187
4188 # The intersection includes this code point plus anything else
4189 # common to both current ranges.
4190 my $start = $a;
4191 my $end = main::min($range_a->end, $range_b->end);
4192 if (! $check_if_overlapping) {
4193 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4194 $new->add_range($start, $end);
4195 }
4196
4197 # Skip ahead to the end of the current intersect
4198 $a = $b = $end;
4199
4200 # If the current intersect ends at the end of either range (as
4201 # it must for at least one of them), the next possible one
4202 # will be the beginning code point in it's list's next range.
4203 if ($a == $range_a->end) {
4204 $range_a = $a_ranges[++$a_i];
4205 last unless defined $range_a;
4206 $a = $range_a->start;
4207 }
4208 if ($b == $range_b->end) {
4209 $range_b = $b_ranges[++$b_i];
4210 last unless defined $range_b;
4211 $b = $range_b->start;
4212 }
4213
4214 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4215 }
4216 elsif ($a < $b) {
4217
4218 # Not equal, but if the range containing $a encompasses $b,
4219 # change $a to be the middle of the range where it does equal
4220 # $b, so the next iteration will get the intersection
4221 if ($range_a->end >= $b) {
4222 $a = $b;
4223 }
4224 else {
4225
4226 # Here, the current range containing $a is entirely below
4227 # $b. Go try to find a range that could contain $b.
4228 $a_i = $a_object->_search_ranges($b);
4229
4230 # If no range found, quit.
4231 last unless defined $a_i;
4232
4233 # The search returns $a_i, such that
4234 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4235 # Set $a to the beginning of this new range, and repeat.
4236 $range_a = $a_ranges[$a_i];
4237 $a = $range_a->start;
4238 }
4239 }
4240 else { # Here, $b < $a.
4241
4242 # Mirror image code to the leg just above
4243 if ($range_b->end >= $a) {
4244 $b = $a;
4245 }
4246 else {
4247 $b_i = $b_object->_search_ranges($a);
4248 last unless defined $b_i;
4249 $range_b = $b_ranges[$b_i];
4250 $b = $range_b->start;
4251 }
4252 }
4253 } # End of looping through ranges.
4254
4255 # Intersection fully computed, or now know that there is no overlap
4256 return $check_if_overlapping ? 0 : $new;
4257 }
4258
4259 sub overlaps {
4260 # Returns boolean giving whether the two arguments overlap somewhere
4261
4262 my $self = shift;
4263 my $other = shift;
4264 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4265
4266 return $self->_intersect($other, 1);
4267 }
4268
4269 sub add_range {
4270 # Add a range to the list.
4271
4272 my $self = shift;
4273 my $start = shift;
4274 my $end = shift;
4275 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4276
4277 return $self->_add_delete('+', $start, $end, "");
4278 }
4279
09aba7e4
KW
4280 sub matches_identically_to {
4281 # Return a boolean as to whether or not two Range_Lists match identical
4282 # sets of code points.
4283
4284 my $self = shift;
4285 my $other = shift;
4286 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4287
4288 # These are ordered in increasing real time to figure out (at least
4289 # until a patch changes that and doesn't change this)
4290 return 0 if $self->max != $other->max;
4291 return 0 if $self->min != $other->min;
4292 return 0 if $self->range_count != $other->range_count;
4293 return 0 if $self->count != $other->count;
4294
4295 # Here they could be identical because all the tests above passed.
4296 # The loop below is somewhat simpler since we know they have the same
4297 # number of elements. Compare range by range, until reach the end or
4298 # find something that differs.
4299 my @a_ranges = $self->ranges;
4300 my @b_ranges = $other->ranges;
4301 for my $i (0 .. @a_ranges - 1) {
4302 my $a = $a_ranges[$i];
4303 my $b = $b_ranges[$i];
4304 trace "self $a; other $b" if main::DEBUG && $to_trace;
c1c2d9e8
KW
4305 return 0 if ! defined $b
4306 || $a->start != $b->start
4307 || $a->end != $b->end;
09aba7e4
KW
4308 }
4309 return 1;
4310 }
4311
99870f4d
KW
4312 sub is_code_point_usable {
4313 # This used only for making the test script. See if the input
4314 # proposed trial code point is one that Perl will handle. If second
4315 # parameter is 0, it won't select some code points for various
4316 # reasons, noted below.
4317
4318 my $code = shift;
4319 my $try_hard = shift;
4320 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4321
4322 return 0 if $code < 0; # Never use a negative
4323
99870f4d
KW
4324 # shun null. I'm (khw) not sure why this was done, but NULL would be
4325 # the character very frequently used.
4326 return $try_hard if $code == 0x0000;
4327
99870f4d
KW
4328 # shun non-character code points.
4329 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4330 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4331
6189eadc 4332 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
99870f4d
KW
4333 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4334
4335 return 1;
4336 }
4337
4338 sub get_valid_code_point {
4339 # Return a code point that's part of the range list. Returns nothing
4340 # if the table is empty or we can't find a suitable code point. This
4341 # used only for making the test script.
4342
4343 my $self = shift;
4344 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4345
ffe43484 4346 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4347
4348 # On first pass, don't choose less desirable code points; if no good
4349 # one is found, repeat, allowing a less desirable one to be selected.
4350 for my $try_hard (0, 1) {
4351
4352 # Look through all the ranges for a usable code point.
4353 for my $set ($self->ranges) {
4354
4355 # Try the edge cases first, starting with the end point of the
4356 # range.
4357 my $end = $set->end;
4358 return $end if is_code_point_usable($end, $try_hard);
4359
4360 # End point didn't, work. Start at the beginning and try
4361 # every one until find one that does work.
4362 for my $trial ($set->start .. $end - 1) {
4363 return $trial if is_code_point_usable($trial, $try_hard);
4364 }
4365 }
4366 }
4367 return (); # If none found, give up.
4368 }
4369
4370 sub get_invalid_code_point {
4371 # Return a code point that's not part of the table. Returns nothing
4372 # if the table covers all code points or a suitable code point can't
4373 # be found. This used only for making the test script.
4374
4375 my $self = shift;
4376 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4377
4378 # Just find a valid code point of the inverse, if any.
4379 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4380 }
4381} # end closure for Range_List
4382
4383package Range_Map;
4384use base '_Range_List_Base';
4385
4386# A Range_Map is a range list in which the range values (called maps) are
4387# significant, and hence shouldn't be manipulated by our other code, which
4388# could be ambiguous or lose things. For example, in taking the union of two
4389# lists, which share code points, but which have differing values, which one
4390# has precedence in the union?
4391# It turns out that these operations aren't really necessary for map tables,
4392# and so this class was created to make sure they aren't accidentally
4393# applied to them.
4394
4395{ # Closure
4396
4397 sub add_map {
4398 # Add a range containing a mapping value to the list
4399
4400 my $self = shift;
4401 # Rest of parameters passed on
4402
4403 return $self->_add_delete('+', @_);
4404 }
4405
4406 sub add_duplicate {
4407 # Adds entry to a range list which can duplicate an existing entry
4408
4409 my $self = shift;
4410 my $code_point = shift;
4411 my $value = shift;
4412 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4413
4414 return $self->add_map($code_point, $code_point,
4415 $value, Replace => $MULTIPLE);
4416 }
4417} # End of closure for package Range_Map
4418
4419package _Base_Table;
4420
4421# A table is the basic data structure that gets written out into a file for
4422# use by the Perl core. This is the abstract base class implementing the
4423# common elements from the derived ones. A list of the methods to be
4424# furnished by an implementing class is just after the constructor.
4425
4426sub standardize { return main::standardize($_[0]); }
4427sub trace { return main::trace(@_); }
4428
4429{ # Closure
4430
4431 main::setup_package();
4432
4433 my %range_list;
4434 # Object containing the ranges of the table.
4435 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4436
4437 my %full_name;
4438 # The full table name.
4439 main::set_access('full_name', \%full_name, 'r');
4440
4441 my %name;
4442 # The table name, almost always shorter
4443 main::set_access('name', \%name, 'r');
4444
4445 my %short_name;
4446 # The shortest of all the aliases for this table, with underscores removed
4447 main::set_access('short_name', \%short_name);
4448
4449 my %nominal_short_name_length;
4450 # The length of short_name before removing underscores
4451 main::set_access('nominal_short_name_length',
4452 \%nominal_short_name_length);
4453
23e33b60
KW
4454 my %complete_name;
4455 # The complete name, including property.
4456 main::set_access('complete_name', \%complete_name, 'r');
4457
99870f4d
KW
4458 my %property;
4459 # Parent property this table is attached to.
4460 main::set_access('property', \%property, 'r');
4461
4462 my %aliases;
c12f2655
KW
4463 # Ordered list of alias objects of the table's name. The first ones in
4464 # the list are output first in comments
99870f4d
KW
4465 main::set_access('aliases', \%aliases, 'readable_array');
4466
4467 my %comment;
4468 # A comment associated with the table for human readers of the files
4469 main::set_access('comment', \%comment, 's');
4470
4471 my %description;
4472 # A comment giving a short description of the table's meaning for human
4473 # readers of the files.
4474 main::set_access('description', \%description, 'readable_array');
4475
4476 my %note;
4477 # A comment giving a short note about the table for human readers of the
4478 # files.
4479 main::set_access('note', \%note, 'readable_array');
4480
301ba948
KW
4481 my %fate;
4482 # Enum; there are a number of possibilities for what happens to this
4483 # table: it could be normal, or suppressed, or not for external use. See
4484 # values at definition for $SUPPRESSED.
4485 main::set_access('fate', \%fate, 'r');
99870f4d
KW
4486
4487 my %find_table_from_alias;
4488 # The parent property passes this pointer to a hash which this class adds
4489 # all its aliases to, so that the parent can quickly take an alias and
4490 # find this table.
4491 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4492
4493 my %locked;
4494 # After this table is made equivalent to another one; we shouldn't go
4495 # changing the contents because that could mean it's no longer equivalent
4496 main::set_access('locked', \%locked, 'r');
4497
4498 my %file_path;
4499 # This gives the final path to the file containing the table. Each
4500 # directory in the path is an element in the array
4501 main::set_access('file_path', \%file_path, 'readable_array');
4502
4503 my %status;
4504 # What is the table's status, normal, $OBSOLETE, etc. Enum
4505 main::set_access('status', \%status, 'r');
4506
4507 my %status_info;
4508 # A comment about its being obsolete, or whatever non normal status it has
4509 main::set_access('status_info', \%status_info, 'r');
4510
d867ccfb
KW
4511 my %caseless_equivalent;
4512 # The table this is equivalent to under /i matching, if any.
4513 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4514
99870f4d
KW
4515 my %range_size_1;
4516 # Is the table to be output with each range only a single code point?
4517 # This is done to avoid breaking existing code that may have come to rely
4518 # on this behavior in previous versions of this program.)
4519 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4520
4521 my %perl_extension;
4522 # A boolean set iff this table is a Perl extension to the Unicode
4523 # standard.
4524 main::set_access('perl_extension', \%perl_extension, 'r');
4525
0c07e538
KW
4526 my %output_range_counts;
4527 # A boolean set iff this table is to have comments written in the
4528 # output file that contain the number of code points in the range.
4529 # The constructor can override the global flag of the same name.
4530 main::set_access('output_range_counts', \%output_range_counts, 'r');
4531
f5817e0a
KW
4532 my %format;
4533 # The format of the entries of the table. This is calculated from the
4534 # data in the table (or passed in the constructor). This is an enum e.g.,
4535 # $STRING_FORMAT
4536 main::set_access('format', \%format, 'r', 'p_s');
4537
99870f4d
KW
4538 sub new {
4539 # All arguments are key => value pairs, which you can see below, most
33e96e72 4540 # of which match fields documented above. Otherwise: Re_Pod_Entry,
99870f4d
KW
4541 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4542 # documented in the Alias package
4543
4544 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4545
4546 my $class = shift;
4547
4548 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4549 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4550
4551 my %args = @_;
4552
4553 $name{$addr} = delete $args{'Name'};
4554 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4555 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4556 my $complete_name = $complete_name{$addr}
4557 = delete $args{'Complete_Name'};
f5817e0a 4558 $format{$addr} = delete $args{'Format'};
0c07e538 4559 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4560 $property{$addr} = delete $args{'_Property'};
4561 $range_list{$addr} = delete $args{'_Range_List'};
4562 $status{$addr} = delete $args{'Status'} || $NORMAL;
4563 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4564 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
d867ccfb 4565 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
301ba948 4566 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
99870f4d
KW
4567
4568 my $description = delete $args{'Description'};
4569 my $externally_ok = delete $args{'Externally_Ok'};
4570 my $loose_match = delete $args{'Fuzzy'};
4571 my $note = delete $args{'Note'};
33e96e72 4572 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
37e2e78e 4573 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4574
4575 # Shouldn't have any left over
4576 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4577
4578 # Can't use || above because conceivably the name could be 0, and
4579 # can't use // operator in case this program gets used in Perl 5.8
4580 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4581 $output_range_counts{$addr} = $output_range_counts if
4582 ! defined $output_range_counts{$addr};
99870f4d
KW
4583
4584 $aliases{$addr} = [ ];
4585 $comment{$addr} = [ ];
4586 $description{$addr} = [ ];
4587 $note{$addr} = [ ];
4588 $file_path{$addr} = [ ];
4589 $locked{$addr} = "";
4590
4591 push @{$description{$addr}}, $description if $description;
4592 push @{$note{$addr}}, $note if $note;
4593
301ba948 4594 if ($fate{$addr} == $PLACEHOLDER) {
37e2e78e
KW
4595
4596 # A placeholder table doesn't get documented, is a perl extension,
4597 # and quite likely will be empty
33e96e72 4598 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
37e2e78e
KW
4599 $perl_extension = 1 if ! defined $perl_extension;
4600 push @tables_that_may_be_empty, $complete_name{$addr};
301ba948
KW
4601 $self->add_comment(<<END);
4602This is a placeholder because it is not in Version $string_version of Unicode,
4603but is needed by the Perl core to work gracefully. Because it is not in this
4604version of Unicode, it will not be listed in $pod_file.pod
4605END
37e2e78e 4606 }
301ba948 4607 elsif (exists $why_suppressed{$complete_name}
98dc9551 4608 # Don't suppress if overridden
ec11e5f4
KW
4609 && ! grep { $_ eq $complete_name{$addr} }
4610 @output_mapped_properties)
301ba948
KW
4611 {
4612 $fate{$addr} = $SUPPRESSED;
4613 }
4614 elsif ($fate{$addr} == $SUPPRESSED
4615 && ! exists $why_suppressed{$property{$addr}->complete_name})
4616 {
4617 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4618 # perhaps Fate => [ $SUPPRESSED, "reason" ]
4619 }
4620
4621 # If hasn't set its status already, see if it is on one of the
4622 # lists of properties or tables that have particular statuses; if
4623 # not, is normal. The lists are prioritized so the most serious
4624 # ones are checked first
4625 if (! $status{$addr}) {
4626 if (exists $why_deprecated{$complete_name}) {
99870f4d
KW
4627 $status{$addr} = $DEPRECATED;
4628 }
4629 elsif (exists $why_stabilized{$complete_name}) {
4630 $status{$addr} = $STABILIZED;
4631 }
4632 elsif (exists $why_obsolete{$complete_name}) {
4633 $status{$addr} = $OBSOLETE;
4634 }
4635
4636 # Existence above doesn't necessarily mean there is a message
4637 # associated with it. Use the most serious message.
4638 if ($status{$addr}) {
301ba948 4639 if ($why_deprecated{$complete_name}) {
99870f4d
KW
4640 $status_info{$addr}
4641 = $why_deprecated{$complete_name};
4642 }
4643 elsif ($why_stabilized{$complete_name}) {
4644 $status_info{$addr}
4645 = $why_stabilized{$complete_name};
4646 }
4647 elsif ($why_obsolete{$complete_name}) {
4648 $status_info{$addr}
4649 = $why_obsolete{$complete_name};
4650 }
4651 }
4652 }
4653
37e2e78e
KW
4654 $perl_extension{$addr} = $perl_extension || 0;
4655
8050d00f 4656 # Don't list a property by default that is internal only
301ba948
KW
4657 if ($fate{$addr} != $ORDINARY) {
4658 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4659 }
8050d00f 4660
99870f4d
KW
4661 # By convention what typically gets printed only or first is what's
4662 # first in the list, so put the full name there for good output
4663 # clarity. Other routines rely on the full name being first on the
4664 # list
4665 $self->add_alias($full_name{$addr},
4666 Externally_Ok => $externally_ok,
4667 Fuzzy => $loose_match,
33e96e72 4668 Re_Pod_Entry => $make_re_pod_entry,
99870f4d
KW
4669 Status => $status{$addr},
4670 );
4671
4672 # Then comes the other name, if meaningfully different.
4673 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4674 $self->add_alias($name{$addr},
4675 Externally_Ok => $externally_ok,
4676 Fuzzy => $loose_match,
33e96e72 4677 Re_Pod_Entry => $make_re_pod_entry,
99870f4d
KW
4678 Status => $status{$addr},
4679 );
4680 }
4681
4682 return $self;
4683 }
4684
4685 # Here are the methods that are required to be defined by any derived
4686 # class
ea25a9b2 4687 for my $sub (qw(
668b3bfc 4688 handle_special_range
99870f4d 4689 append_to_body
99870f4d 4690 pre_body
ea25a9b2 4691 ))
668b3bfc
KW
4692 # write() knows how to write out normal ranges, but it calls
4693 # handle_special_range() when it encounters a non-normal one.
4694 # append_to_body() is called by it after it has handled all
4695 # ranges to add anything after the main portion of the table.
4696 # And finally, pre_body() is called after all this to build up
4697 # anything that should appear before the main portion of the
4698 # table. Doing it this way allows things in the middle to
4699 # affect what should appear before the main portion of the
99870f4d 4700 # table.
99870f4d
KW
4701 {
4702 no strict "refs";
4703 *$sub = sub {
4704 Carp::my_carp_bug( __LINE__
4705 . ": Must create method '$sub()' for "
4706 . ref shift);
4707 return;
4708 }
4709 }
4710
4711 use overload
4712 fallback => 0,
4713 "." => \&main::_operator_dot,
4714 '!=' => \&main::_operator_not_equal,
4715 '==' => \&main::_operator_equal,
4716 ;
4717
4718 sub ranges {
4719 # Returns the array of ranges associated with this table.
4720
f998e60c 4721 no overloading;
051df77b 4722 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4723 }
4724
4725 sub add_alias {
4726 # Add a synonym for this table.
4727
4728 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4729
4730 my $self = shift;
4731 my $name = shift; # The name to add.
4732 my $pointer = shift; # What the alias hash should point to. For
4733 # map tables, this is the parent property;
4734 # for match tables, it is the table itself.
4735
4736 my %args = @_;
4737 my $loose_match = delete $args{'Fuzzy'};
4738
33e96e72
KW
4739 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4740 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
99870f4d
KW
4741
4742 my $externally_ok = delete $args{'Externally_Ok'};
4743 $externally_ok = 1 unless defined $externally_ok;
4744
4745 my $status = delete $args{'Status'};
4746 $status = $NORMAL unless defined $status;
4747
4748 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4749
4750 # Capitalize the first letter of the alias unless it is one of the CJK
4751 # ones which specifically begins with a lower 'k'. Do this because
4752 # Unicode has varied whether they capitalize first letters or not, and
4753 # have later changed their minds and capitalized them, but not the
4754 # other way around. So do it always and avoid changes from release to
4755 # release
4756 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4757
ffe43484 4758 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4759
4760 # Figure out if should be loosely matched if not already specified.
4761 if (! defined $loose_match) {
4762
4763 # Is a loose_match if isn't null, and doesn't begin with an
4764 # underscore and isn't just a number
4765 if ($name ne ""
4766 && substr($name, 0, 1) ne '_'
4767 && $name !~ qr{^[0-9_.+-/]+$})
4768 {
4769 $loose_match = 1;
4770 }
4771 else {
4772 $loose_match = 0;
4773 }
4774 }
4775
4776 # If this alias has already been defined, do nothing.
4777 return if defined $find_table_from_alias{$addr}->{$name};
4778
4779 # That includes if it is standardly equivalent to an existing alias,
4780 # in which case, add this name to the list, so won't have to search
4781 # for it again.
4782 my $standard_name = main::standardize($name);
4783 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4784 $find_table_from_alias{$addr}->{$name}
4785 = $find_table_from_alias{$addr}->{$standard_name};
4786 return;
4787 }
4788
4789 # Set the index hash for this alias for future quick reference.
4790 $find_table_from_alias{$addr}->{$name} = $pointer;
4791 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4792 local $to_trace = 0 if main::DEBUG;
4793 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4794 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4795
4796
4797 # Put the new alias at the end of the list of aliases unless the final
4798 # element begins with an underscore (meaning it is for internal perl
4799 # use) or is all numeric, in which case, put the new one before that
4800 # one. This floats any all-numeric or underscore-beginning aliases to
4801 # the end. This is done so that they are listed last in output lists,
4802 # to encourage the user to use a better name (either more descriptive
4803 # or not an internal-only one) instead. This ordering is relied on
4804 # implicitly elsewhere in this program, like in short_name()
4805 my $list = $aliases{$addr};
4806 my $insert_position = (@$list == 0
4807 || (substr($list->[-1]->name, 0, 1) ne '_'
4808 && $list->[-1]->name =~ /\D/))
4809 ? @$list
4810 : @$list - 1;
4811 splice @$list,
4812 $insert_position,
4813 0,
33e96e72 4814 Alias->new($name, $loose_match, $make_re_pod_entry,
99870f4d
KW
4815 $externally_ok, $status);
4816
4817 # This name may be shorter than any existing ones, so clear the cache
4818 # of the shortest, so will have to be recalculated.
f998e60c 4819 no overloading;
051df77b 4820 undef $short_name{pack 'J', $self};
99870f4d
KW
4821 return;
4822 }
4823
4824 sub short_name {
4825 # Returns a name suitable for use as the base part of a file name.
4826 # That is, shorter wins. It can return undef if there is no suitable
4827 # name. The name has all non-essential underscores removed.
4828
4829 # The optional second parameter is a reference to a scalar in which
4830 # this routine will store the length the returned name had before the
4831 # underscores were removed, or undef if the return is undef.
4832
4833 # The shortest name can change if new aliases are added. So using
4834 # this should be deferred until after all these are added. The code
4835 # that does that should clear this one's cache.
4836 # Any name with alphabetics is preferred over an all numeric one, even
4837 # if longer.
4838
4839 my $self = shift;
4840 my $nominal_length_ptr = shift;
4841 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4842
ffe43484 4843 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4844
4845 # For efficiency, don't recalculate, but this means that adding new
4846 # aliases could change what the shortest is, so the code that does
4847 # that needs to undef this.
4848 if (defined $short_name{$addr}) {
4849 if ($nominal_length_ptr) {
4850 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4851 }
4852 return $short_name{$addr};
4853 }
4854
4855 # Look at each alias
4856 foreach my $alias ($self->aliases()) {
4857
4858 # Don't use an alias that isn't ok to use for an external name.
4859 next if ! $alias->externally_ok;
4860
4861 my $name = main::Standardize($alias->name);
4862 trace $self, $name if main::DEBUG && $to_trace;
4863
4864 # Take the first one, or a shorter one that isn't numeric. This
4865 # relies on numeric aliases always being last in the array
4866 # returned by aliases(). Any alpha one will have precedence.
4867 if (! defined $short_name{$addr}
4868 || ($name =~ /\D/
4869 && length($name) < length($short_name{$addr})))
4870 {
4871 # Remove interior underscores.
4872 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4873
4874 $nominal_short_name_length{$addr} = length $name;
4875 }
4876 }
4877
ff485b9e
KW
4878 # If the short name isn't a nice one, perhaps an equivalent table has
4879 # a better one.
4880 if (! defined $short_name{$addr}
4881 || $short_name{$addr} eq ""
4882 || $short_name{$addr} eq "_")
4883 {
4884 my $return;
4885 foreach my $follower ($self->children) { # All equivalents
4886 my $follower_name = $follower->short_name;
4887 next unless defined $follower_name;
4888
4889 # Anything (except undefined) is better than underscore or
4890 # empty
4891 if (! defined $return || $return eq "_") {
4892 $return = $follower_name;
4893 next;
4894 }
4895
4896 # If the new follower name isn't "_" and is shorter than the
4897 # current best one, prefer the new one.
4898 next if $follower_name eq "_";
4899 next if length $follower_name > length $return;
4900 $return = $follower_name;
4901 }
4902 $short_name{$addr} = $return if defined $return;
4903 }
4904
99870f4d
KW
4905 # If no suitable external name return undef
4906 if (! defined $short_name{$addr}) {
4907 $$nominal_length_ptr = undef if $nominal_length_ptr;
4908 return;
4909 }
4910
c12f2655 4911 # Don't allow a null short name.
99870f4d
KW
4912 if ($short_name{$addr} eq "") {
4913 $short_name{$addr} = '_';
4914 $nominal_short_name_length{$addr} = 1;
4915 }
4916
4917 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4918
4919 if ($nominal_length_ptr) {
4920 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4921 }
4922 return $short_name{$addr};
4923 }
4924
4925 sub external_name {
4926 # Returns the external name that this table should be known by. This
c12f2655
KW
4927 # is usually the short_name, but not if the short_name is undefined,
4928 # in which case the external_name is arbitrarily set to the
4929 # underscore.
99870f4d
KW
4930
4931 my $self = shift;
4932 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4933
4934 my $short = $self->short_name;
4935 return $short if defined $short;
4936
4937 return '_';
4938 }
4939
4940 sub add_description { # Adds the parameter as a short description.
4941
4942 my $self = shift;
4943 my $description = shift;
4944 chomp $description;
4945 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4946
f998e60c 4947 no overloading;
051df77b 4948 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
4949
4950 return;
4951 }
4952
4953 sub add_note { # Adds the parameter as a short note.
4954
4955 my $self = shift;
4956 my $note = shift;
4957 chomp $note;
4958 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4959
f998e60c 4960 no overloading;
051df77b 4961 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
4962
4963 return;
4964 }
4965
4966 sub add_comment { # Adds the parameter as a comment.
4967
bd9ebcfd
KW
4968 return unless $debugging_build;
4969
99870f4d
KW
4970 my $self = shift;
4971 my $comment = shift;
4972 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4973
4974 chomp $comment;
f998e60c
KW
4975
4976 no overloading;
051df77b 4977 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
4978
4979 return;
4980 }
4981
4982 sub comment {
4983 # Return the current comment for this table. If called in list
4984 # context, returns the array of comments. In scalar, returns a string
4985 # of each element joined together with a period ending each.
4986
4987 my $self = shift;
4988 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4989
ffe43484 4990 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 4991 my @list = @{$comment{$addr}};
99870f4d
KW
4992 return @list if wantarray;
4993 my $return = "";
4994 foreach my $sentence (@list) {
4995 $return .= '. ' if $return;
4996 $return .= $sentence;
4997 $return =~ s/\.$//;
4998 }
4999 $return .= '.' if $return;
5000 return $return;
5001 }
5002
5003 sub initialize {
5004 # Initialize the table with the argument which is any valid
5005 # initialization for range lists.
5006
5007 my $self = shift;
ffe43484 5008 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5009 my $initialization = shift;
5010 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5011
5012 # Replace the current range list with a new one of the same exact
5013 # type.
f998e60c
KW
5014 my $class = ref $range_list{$addr};
5015 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
5016 Initialize => $initialization);
5017 return;
5018
5019 }
5020
5021 sub header {
5022 # The header that is output for the table in the file it is written
5023 # in.
5024
5025 my $self = shift;
5026 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5027
5028 my $return = "";
5029 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5030 $return .= $HEADER;
99870f4d
KW
5031 return $return;
5032 }
5033
5034 sub write {
668b3bfc
KW
5035 # Write a representation of the table to its file. It calls several
5036 # functions furnished by sub-classes of this abstract base class to
5037 # handle non-normal ranges, to add stuff before the table, and at its
5038 # end.
99870f4d
KW
5039
5040 my $self = shift;
5041 my $tab_stops = shift; # The number of tab stops over to put any
5042 # comment.
5043 my $suppress_value = shift; # Optional, if the value associated with
5044 # a range equals this one, don't write
5045 # the range
5046 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5047
ffe43484 5048 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5049
5050 # Start with the header
668b3bfc 5051 my @HEADER = $self->header;
99870f4d
KW
5052
5053 # Then the comments
668b3bfc 5054 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
99870f4d
KW
5055 if $comment{$addr};
5056
668b3bfc
KW
5057 # Things discovered processing the main body of the document may
5058 # affect what gets output before it, therefore pre_body() isn't called
5059 # until after all other processing of the table is done.
99870f4d 5060
c4019d52
KW
5061 # The main body looks like a 'here' document. If annotating, get rid
5062 # of the comments before passing to the caller, as some callers, such
5063 # as charnames.pm, can't cope with them. (Outputting range counts
5064 # also introduces comments, but these don't show up in the tables that
5065 # can't cope with comments, and there aren't that many of them that
5066 # it's worth the extra real time to get rid of them).
668b3bfc 5067 my @OUT;
558712cf 5068 if ($annotate) {
c4019d52
KW
5069 # Use the line below in Perls that don't have /r
5070 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5071 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5072 } else {
5073 push @OUT, "return <<'END';\n";
5074 }
99870f4d
KW
5075
5076 if ($range_list{$addr}->is_empty) {
5077
5078 # This is a kludge for empty tables to silence a warning in
5079 # utf8.c, which can't really deal with empty tables, but it can
5080 # deal with a table that matches nothing, as the inverse of 'Any'
5081 # does.
67a53d68 5082 push @OUT, "!utf8::Any\n";
99870f4d 5083 }
c69a9c68
KW
5084 elsif ($self->name eq 'N'
5085
5086 # To save disk space and table cache space, avoid putting out
5087 # binary N tables, but instead create a file which just inverts
5088 # the Y table. Since the file will still exist and occupy a
5089 # certain number of blocks, might as well output the whole
5090 # thing if it all will fit in one block. The number of
5091 # ranges below is an approximate number for that.
06f26c45
KW
5092 && ($self->property->type == $BINARY
5093 || $self->property->type == $FORCED_BINARY)
c69a9c68
KW
5094 # && $self->property->tables == 2 Can't do this because the
5095 # non-binary properties, like NFDQC aren't specifiable
5096 # by the notation
5097 && $range_list{$addr}->ranges > 15
5098 && ! $annotate) # Under --annotate, want to see everything
5099 {
5100 push @OUT, "!utf8::" . $self->property->name . "\n";
5101 }
99870f4d
KW
5102 else {
5103 my $range_size_1 = $range_size_1{$addr};
558712cf
KW
5104 my $format; # Used only in $annotate option
5105 my $include_name; # Used only in $annotate option
c4019d52 5106
558712cf 5107 if ($annotate) {
c4019d52
KW
5108
5109 # if annotating each code point, must print 1 per line.
5110 # The variable could point to a subroutine, and we don't want
5111 # to lose that fact, so only set if not set already
5112 $range_size_1 = 1 if ! $range_size_1;
5113
5114 $format = $self->format;
5115
5116 # The name of the character is output only for tables that
5117 # don't already include the name in the output.
5118 my $property = $self->property;
5119 $include_name =
5120 ! ($property == $perl_charname
5121 || $property == main::property_ref('Unicode_1_Name')
5122 || $property == main::property_ref('Name')
5123 || $property == main::property_ref('Name_Alias')
5124 );
5125 }
99870f4d
KW
5126
5127 # Output each range as part of the here document.
5a2b5ddb 5128 RANGE:
99870f4d 5129 for my $set ($range_list{$addr}->ranges) {
5a2b5ddb
KW
5130 if ($set->type != 0) {
5131 $self->handle_special_range($set);
5132 next RANGE;
5133 }
99870f4d
KW
5134 my $start = $set->start;
5135 my $end = $set->end;
5136 my $value = $set->value;
5137
5138 # Don't output ranges whose value is the one to suppress
c4019d52
KW
5139 next RANGE if defined $suppress_value
5140 && $value eq $suppress_value;
99870f4d 5141
c4019d52
KW
5142 # If there is a range and doesn't need a single point range
5143 # output
5144 if ($start != $end && ! $range_size_1) {
bd9ebcfd
KW
5145 push @OUT, sprintf "%04X\t%04X", $start, $end;
5146 $OUT[-1] .= "\t$value" if $value ne "";
99870f4d
KW
5147
5148 # Add a comment with the size of the range, if requested.
5149 # Expand Tabs to make sure they all start in the same
5150 # column, and then unexpand to use mostly tabs.
0c07e538 5151 if (! $output_range_counts{$addr}) {
99870f4d
KW
5152 $OUT[-1] .= "\n";
5153 }
5154 else {
5155 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5156 my $count = main::clarify_number($end - $start + 1);
5157 use integer;
5158
5159 my $width = $tab_stops * 8 - 1;
5160 $OUT[-1] = sprintf("%-*s # [%s]\n",
5161 $width,
5162 $OUT[-1],
5163 $count);
5164 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5165 }
c4019d52
KW
5166 next RANGE;
5167 }
5168
5169 # Here to output a single code point per line
5170
5171 # If not to annotate, use the simple formats
558712cf 5172 if (! $annotate) {
c4019d52
KW
5173
5174 # Use any passed in subroutine to output.
5175 if (ref $range_size_1 eq 'CODE') {
5176 for my $i ($start .. $end) {
5177 push @OUT, &{$range_size_1}($i, $value);
5178 }
5179 }
5180 else {
5181
5182 # Here, caller is ok with default output.
5183 for (my $i = $start; $i <= $end; $i++) {
5184 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5185 }
5186 }
5187 next RANGE;
5188 }
5189
5190 # Here, wants annotation.
5191 for (my $i = $start; $i <= $end; $i++) {
5192
5193 # Get character information if don't have it already
5194 main::populate_char_info($i)
5195 if ! defined $viacode[$i];
5196 my $type = $annotate_char_type[$i];
5197
5198 # Figure out if should output the next code points as part
5199 # of a range or not. If this is not in an annotation
5200 # range, then won't output as a range, so returns $i.
5201 # Otherwise use the end of the annotation range, but no
5202 # further than the maximum possible end point of the loop.
5203 my $range_end = main::min($annotate_ranges->value_of($i)
5204 || $i,
5205 $end);
5206
5207 # Use a range if it is a range, and either is one of the
5208 # special annotation ranges, or the range is at most 3
5209 # long. This last case causes the algorithmically named
5210 # code points to be output individually in spans of at
5211 # most 3, as they are the ones whose $type is > 0.
5212 if ($range_end != $i
5213 && ( $type < 0 || $range_end - $i > 2))
5214 {
5215 # Here is to output a range. We don't allow a
5216 # caller-specified output format--just use the
5217 # standard one.
5218 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5219 $range_end,
5220 $value;
5221 my $range_name = $viacode[$i];
5222
5223 # For the code points which end in their hex value, we
5224 # eliminate that from the output annotation, and
5225 # capitalize only the first letter of each word.
5226 if ($type == $CP_IN_NAME) {
5227 my $hex = sprintf "%04X", $i;
5228 $range_name =~ s/-$hex$//;
5229 my @words = split " ", $range_name;
5230 for my $word (@words) {
5231 $word = ucfirst(lc($word)) if $word ne 'CJK';
5232 }
5233 $range_name = join " ", @words;
5234 }
5235 elsif ($type == $HANGUL_SYLLABLE) {
5236 $range_name = "Hangul Syllable";
5237 }
5238
5239 $OUT[-1] .= " $range_name" if $range_name;
5240
5241 # Include the number of code points in the range
5242 my $count = main::clarify_number($range_end - $i + 1);
5243 $OUT[-1] .= " [$count]\n";
5244
5245 # Skip to the end of the range
5246 $i = $range_end;
5247 }
5248 else { # Not in a range.
5249 my $comment = "";
5250
5251 # When outputting the names of each character, use
5252 # the character itself if printable
5253 $comment .= "'" . chr($i) . "' " if $printable[$i];
5254
5255 # To make it more readable, use a minimum indentation
5256 my $comment_indent;
5257
5258 # Determine the annotation
5259 if ($format eq $DECOMP_STRING_FORMAT) {
5260
5261 # This is very specialized, with the type of
5262 # decomposition beginning the line enclosed in
5263 # <...>, and the code points that the code point
5264 # decomposes to separated by blanks. Create two
5265 # strings, one of the printable characters, and
5266 # one of their official names.
5267 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5268 my $tostr = "";
5269 my $to_name = "";
5270 my $to_chr = "";
5271 foreach my $to (split " ", $map) {
5272 $to = CORE::hex $to;
5273 $to_name .= " + " if $to_name;
5274 $to_chr .= chr($to);
5275 main::populate_char_info($to)
5276 if ! defined $viacode[$to];
5277 $to_name .= $viacode[$to];
5278 }
5279
5280 $comment .=
5281 "=> '$to_chr'; $viacode[$i] => $to_name";
5282 $comment_indent = 25; # Determined by experiment
5283 }
5284 else {
5285
5286 # Assume that any table that has hex format is a
5287 # mapping of one code point to another.
5288 if ($format eq $HEX_FORMAT) {
5289 my $decimal_value = CORE::hex $value;
5290 main::populate_char_info($decimal_value)
5291 if ! defined $viacode[$decimal_value];
5292 $comment .= "=> '"
5293 . chr($decimal_value)
5294 . "'; " if $printable[$decimal_value];
5295 }
5296 $comment .= $viacode[$i] if $include_name
5297 && $viacode[$i];
5298 if ($format eq $HEX_FORMAT) {
5299 my $decimal_value = CORE::hex $value;
5300 $comment .= " => $viacode[$decimal_value]"
5301 if $viacode[$decimal_value];
5302 }
5303
5304 # If including the name, no need to indent, as the
5305 # name will already be way across the line.
5306 $comment_indent = ($include_name) ? 0 : 60;
5307 }
5308
5309 # Use any passed in routine to output the base part of
5310 # the line.
5311 if (ref $range_size_1 eq 'CODE') {
5312 my $base_part = &{$range_size_1}($i, $value);
5313 chomp $base_part;
5314 push @OUT, $base_part;
5315 }
5316 else {
5317 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5318 }
5319
5320 # And add the annotation.
5321 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5322 $OUT[-1],
5323 $comment if $comment;
5324 $OUT[-1] .= "\n";
5325 }
99870f4d
KW
5326 }
5327 } # End of loop through all the table's ranges
5328 }
5329
5330 # Add anything that goes after the main body, but within the here
5331 # document,
5332 my $append_to_body = $self->append_to_body;
5333 push @OUT, $append_to_body if $append_to_body;
5334
5335 # And finish the here document.
5336 push @OUT, "END\n";
5337
668b3bfc
KW
5338 # Done with the main portion of the body. Can now figure out what
5339 # should appear before it in the file.
5340 my $pre_body = $self->pre_body;
5341 push @HEADER, $pre_body, "\n" if $pre_body;
668b3bfc 5342
6b0079b5
KW
5343 # All these files should have a .pl suffix added to them.
5344 my @file_with_pl = @{$file_path{$addr}};
5345 $file_with_pl[-1] .= '.pl';
99870f4d 5346
6b0079b5 5347 main::write(\@file_with_pl,
558712cf 5348 $annotate, # utf8 iff annotating
9218f1cf
KW
5349 \@HEADER,
5350 \@OUT);
99870f4d
KW
5351 return;
5352 }
5353
5354 sub set_status { # Set the table's status
5355 my $self = shift;
5356 my $status = shift; # The status enum value
5357 my $info = shift; # Any message associated with it.
5358 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5359
ffe43484 5360 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5361
5362 $status{$addr} = $status;
5363 $status_info{$addr} = $info;
5364 return;
5365 }
5366
301ba948
KW
5367 sub set_fate { # Set the fate of a table
5368 my $self = shift;
5369 my $fate = shift;
5370 my $reason = shift;
5371 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5372
5373 my $addr = do { no overloading; pack 'J', $self; };
5374
5375 return if $fate{$addr} == $fate; # If no-op
5376
5377 # Can only change the ordinary fate.
5378 if ($fate{$addr} != $ORDINARY) {
5379 return;
5380 }
5381
5382 $fate{$addr} = $fate;
5383
5384 # Save the reason for suppression for output
5385 if ($fate == $SUPPRESSED && defined $reason) {
5386 $why_suppressed{$complete_name{$addr}} = $reason;
5387 }
5388
5389 return;
5390 }
5391
99870f4d
KW
5392 sub lock {
5393 # Don't allow changes to the table from now on. This stores a stack
5394 # trace of where it was called, so that later attempts to modify it
5395 # can immediately show where it got locked.
5396
5397 my $self = shift;
5398 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5399
ffe43484 5400 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5401
5402 $locked{$addr} = "";
5403
5404 my $line = (caller(0))[2];
5405 my $i = 1;
5406
5407 # Accumulate the stack trace
5408 while (1) {
5409 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5410
5411 last unless defined $caller;
5412
5413 $locked{$addr} .= " called from $caller() at line $line\n";
5414 $line = $caller_line;
5415 }
5416 $locked{$addr} .= " called from main at line $line\n";
5417
5418 return;
5419 }
5420
5421 sub carp_if_locked {
5422 # Return whether a table is locked or not, and, by the way, complain
5423 # if is locked
5424
5425 my $self = shift;
5426 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5427
ffe43484 5428 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5429
5430 return 0 if ! $locked{$addr};
5431 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5432 return 1;
5433 }
5434
5435 sub set_file_path { # Set the final directory path for this table
5436 my $self = shift;
5437 # Rest of parameters passed on
5438
f998e60c 5439 no overloading;
051df77b 5440 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
5441 return
5442 }
5443
5444 # Accessors for the range list stored in this table. First for
5445 # unconditional
ea25a9b2 5446 for my $sub (qw(
2f7a8815 5447 containing_range
99870f4d
KW
5448 contains
5449 count
5450 each_range
5451 hash
5452 is_empty
09aba7e4 5453 matches_identically_to
99870f4d
KW
5454 max
5455 min
5456 range_count
5457 reset_each_range
0a9dbafc 5458 type_of
99870f4d 5459 value_of
ea25a9b2 5460 ))
99870f4d
KW
5461 {
5462 no strict "refs";
5463 *$sub = sub {
5464 use strict "refs";
5465 my $self = shift;
ec40ee88 5466 return $self->_range_list->$sub(@_);
99870f4d
KW
5467 }
5468 }
5469
5470 # Then for ones that should fail if locked
ea25a9b2 5471 for my $sub (qw(
99870f4d 5472 delete_range
ea25a9b2 5473 ))
99870f4d
KW
5474 {
5475 no strict "refs";
5476 *$sub = sub {
5477 use strict "refs";
5478 my $self = shift;
5479
5480 return if $self->carp_if_locked;
f998e60c 5481 no overloading;
ec40ee88 5482 return $self->_range_list->$sub(@_);
99870f4d
KW
5483 }
5484 }
5485
5486} # End closure
5487
5488package Map_Table;
5489use base '_Base_Table';
5490
5491# A Map Table is a table that contains the mappings from code points to
5492# values. There are two weird cases:
5493# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5494# are written in the table's file at the end of the table nonetheless. It
5495# requires specially constructed code to handle these; utf8.c can not read
5496# these in, so they should not go in $map_directory. As of this writing,
5497# the only case that these happen is for named sequences used in
5498# charnames.pm. But this code doesn't enforce any syntax on these, so
5499# something else could come along that uses it.
5500# 2) Specials are anything that doesn't fit syntactically into the body of the
5501# table. The ranges for these have a map type of non-zero. The code below
5502# knows about and handles each possible type. In most cases, these are
5503# written as part of the header.
5504#
5505# A map table deliberately can't be manipulated at will unlike match tables.
5506# This is because of the ambiguities having to do with what to do with
5507# overlapping code points. And there just isn't a need for those things;
5508# what one wants to do is just query, add, replace, or delete mappings, plus
5509# write the final result.
5510# However, there is a method to get the list of possible ranges that aren't in
5511# this table to use for defaulting missing code point mappings. And,
5512# map_add_or_replace_non_nulls() does allow one to add another table to this
5513# one, but it is clearly very specialized, and defined that the other's
5514# non-null values replace this one's if there is any overlap.
5515
5516sub trace { return main::trace(@_); }
5517
5518{ # Closure
5519
5520 main::setup_package();
5521
5522 my %default_map;
5523 # Many input files omit some entries; this gives what the mapping for the
5524 # missing entries should be
5525 main::set_access('default_map', \%default_map, 'r');
5526
5527 my %anomalous_entries;
5528 # Things that go in the body of the table which don't fit the normal
5529 # scheme of things, like having a range. Not much can be done with these
5530 # once there except to output them. This was created to handle named
5531 # sequences.
5532 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5533 main::set_access('anomalous_entries', # Append singular, read plural
5534 \%anomalous_entries,
5535 'readable_array');
5536
99870f4d
KW
5537 my %core_access;
5538 # This is a string, solely for documentation, indicating how one can get
5539 # access to this property via the Perl core.
5540 main::set_access('core_access', \%core_access, 'r', 's');
5541
99870f4d 5542 my %to_output_map;
8572ace0 5543 # Enum as to whether or not to write out this map table:
c12f2655 5544 # 0 don't output
8572ace0
KW
5545 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5546 # it should not be removed nor its format changed. This
5547 # is done for those files that have traditionally been
5548 # output.
5549 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5550 # with this file
99870f4d
KW
5551 main::set_access('to_output_map', \%to_output_map, 's');
5552
5553
5554 sub new {
5555 my $class = shift;
5556 my $name = shift;
5557
5558 my %args = @_;
5559
5560 # Optional initialization data for the table.
5561 my $initialize = delete $args{'Initialize'};
5562
5563 my $core_access = delete $args{'Core_Access'};
5564 my $default_map = delete $args{'Default_Map'};
99870f4d 5565 my $property = delete $args{'_Property'};
23e33b60 5566 my $full_name = delete $args{'Full_Name'};
20863809 5567
99870f4d
KW
5568 # Rest of parameters passed on
5569
5570 my $range_list = Range_Map->new(Owner => $property);
5571
5572 my $self = $class->SUPER::new(
5573 Name => $name,
23e33b60
KW
5574 Complete_Name => $full_name,
5575 Full_Name => $full_name,
99870f4d
KW
5576 _Property => $property,
5577 _Range_List => $range_list,
5578 %args);
5579
ffe43484 5580 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5581
5582 $anomalous_entries{$addr} = [];
5583 $core_access{$addr} = $core_access;
5584 $default_map{$addr} = $default_map;
99870f4d
KW
5585
5586 $self->initialize($initialize) if defined $initialize;
5587
5588 return $self;
5589 }
5590
5591 use overload
5592 fallback => 0,
5593 qw("") => "_operator_stringify",
5594 ;
5595
5596 sub _operator_stringify {
5597 my $self = shift;
5598
5599 my $name = $self->property->full_name;
5600 $name = '""' if $name eq "";
5601 return "Map table for Property '$name'";
5602 }
5603
99870f4d
KW
5604 sub add_alias {
5605 # Add a synonym for this table (which means the property itself)
5606 my $self = shift;
5607 my $name = shift;
5608 # Rest of parameters passed on.
5609
5610 $self->SUPER::add_alias($name, $self->property, @_);
5611 return;
5612 }
5613
5614 sub add_map {
5615 # Add a range of code points to the list of specially-handled code
5616 # points. $MULTI_CP is assumed if the type of special is not passed
5617 # in.
5618
5619 my $self = shift;
5620 my $lower = shift;
5621 my $upper = shift;
5622 my $string = shift;
5623 my %args = @_;
5624
5625 my $type = delete $args{'Type'} || 0;
5626 # Rest of parameters passed on
5627
5628 # Can't change the table if locked.
5629 return if $self->carp_if_locked;
5630
ffe43484 5631 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 5632
99870f4d
KW
5633 $self->_range_list->add_map($lower, $upper,
5634 $string,
5635 @_,
5636 Type => $type);
5637 return;
5638 }
5639
5640 sub append_to_body {
5641 # Adds to the written HERE document of the table's body any anomalous
5642 # entries in the table..
5643
5644 my $self = shift;
5645 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5646
ffe43484 5647 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5648
5649 return "" unless @{$anomalous_entries{$addr}};
5650 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5651 }
5652
5653 sub map_add_or_replace_non_nulls {
5654 # This adds the mappings in the table $other to $self. Non-null
5655 # mappings from $other override those in $self. It essentially merges
5656 # the two tables, with the second having priority except for null
5657 # mappings.
5658
5659 my $self = shift;
5660 my $other = shift;
5661 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5662
5663 return if $self->carp_if_locked;
5664
5665 if (! $other->isa(__PACKAGE__)) {
5666 Carp::my_carp_bug("$other should be a "
5667 . __PACKAGE__
5668 . ". Not a '"
5669 . ref($other)
5670 . "'. Not added;");
5671 return;
5672 }
5673
ffe43484
NC
5674 my $addr = do { no overloading; pack 'J', $self; };
5675 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5676
5677 local $to_trace = 0 if main::DEBUG;
5678
5679 my $self_range_list = $self->_range_list;
5680 my $other_range_list = $other->_range_list;
5681 foreach my $range ($other_range_list->ranges) {
5682 my $value = $range->value;
5683 next if $value eq "";
5684 $self_range_list->_add_delete('+',
5685 $range->start,
5686 $range->end,
5687 $value,
5688 Type => $range->type,
5689 Replace => $UNCONDITIONALLY);
5690 }
5691
99870f4d
KW
5692 return;
5693 }
5694
5695 sub set_default_map {
5696 # Define what code points that are missing from the input files should
5697 # map to
5698
5699 my $self = shift;
5700 my $map = shift;
5701 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5702
ffe43484 5703 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5704
5705 # Convert the input to the standard equivalent, if any (won't have any
5706 # for $STRING properties)
5707 my $standard = $self->_find_table_from_alias->{$map};
5708 $map = $standard->name if defined $standard;
5709
5710 # Warn if there already is a non-equivalent default map for this
5711 # property. Note that a default map can be a ref, which means that
5712 # what it actually means is delayed until later in the program, and it
5713 # IS permissible to override it here without a message.
5714 my $default_map = $default_map{$addr};
5715 if (defined $default_map
5716 && ! ref($default_map)
5717 && $default_map ne $map
5718 && main::Standardize($map) ne $default_map)
5719 {
5720 my $property = $self->property;
5721 my $map_table = $property->table($map);
5722 my $default_table = $property->table($default_map);
5723 if (defined $map_table
5724 && defined $default_table
5725 && $map_table != $default_table)
5726 {
5727 Carp::my_carp("Changing the default mapping for "
5728 . $property
5729 . " from $default_map to $map'");
5730 }
5731 }
5732
5733 $default_map{$addr} = $map;
5734
5735 # Don't also create any missing table for this map at this point,
5736 # because if we did, it could get done before the main table add is
5737 # done for PropValueAliases.txt; instead the caller will have to make
5738 # sure it exists, if desired.
5739 return;
5740 }
5741
5742 sub to_output_map {
5743 # Returns boolean: should we write this map table?
5744
5745 my $self = shift;
5746 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5747
ffe43484 5748 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5749
5750 # If overridden, use that
5751 return $to_output_map{$addr} if defined $to_output_map{$addr};
5752
5753 my $full_name = $self->full_name;
fcf1973c
KW
5754 return $global_to_output_map{$full_name}
5755 if defined $global_to_output_map{$full_name};
99870f4d 5756
20863809 5757 # If table says to output, do so; if says to suppress it, do so.
301ba948
KW
5758 my $fate = $self->fate;
5759 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
8572ace0 5760 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
301ba948 5761 return 0 if $fate == $SUPPRESSED;
99870f4d
KW
5762
5763 my $type = $self->property->type;
5764
5765 # Don't want to output binary map tables even for debugging.
5766 return 0 if $type == $BINARY;
5767
5768 # But do want to output string ones.
8572ace0 5769 return $EXTERNAL_MAP if $type == $STRING;
99870f4d 5770
8572ace0
KW
5771 # Otherwise is an $ENUM, do output it, for Perl's purposes
5772 return $INTERNAL_MAP;
99870f4d
KW
5773 }
5774
5775 sub inverse_list {
5776 # Returns a Range_List that is gaps of the current table. That is,
5777 # the inversion
5778
5779 my $self = shift;
5780 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5781
5782 my $current = Range_List->new(Initialize => $self->_range_list,
5783 Owner => $self->property);
5784 return ~ $current;
5785 }
5786
8572ace0
KW
5787 sub header {
5788 my $self = shift;
5789 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5790
5791 my $return = $self->SUPER::header();
5792
126c3d4e 5793 $return .= $INTERNAL_ONLY_HEADER if $self->to_output_map == $INTERNAL_MAP;
8572ace0
KW
5794 return $return;
5795 }
5796
99870f4d
KW
5797 sub set_final_comment {
5798 # Just before output, create the comment that heads the file
5799 # containing this table.
5800
bd9ebcfd
KW
5801 return unless $debugging_build;
5802
99870f4d
KW
5803 my $self = shift;
5804 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5805
5806 # No sense generating a comment if aren't going to write it out.
5807 return if ! $self->to_output_map;
5808
ffe43484 5809 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5810
5811 my $property = $self->property;
5812
5813 # Get all the possible names for this property. Don't use any that
5814 # aren't ok for use in a file name, etc. This is perhaps causing that
5815 # flag to do double duty, and may have to be changed in the future to
5816 # have our own flag for just this purpose; but it works now to exclude
5817 # Perl generated synonyms from the lists for properties, where the
5818 # name is always the proper Unicode one.
5819 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5820
5821 my $count = $self->count;
5822 my $default_map = $default_map{$addr};
5823
5824 # The ranges that map to the default aren't output, so subtract that
5825 # to get those actually output. A property with matching tables
5826 # already has the information calculated.
5827 if ($property->type != $STRING) {
5828 $count -= $property->table($default_map)->count;
5829 }
5830 elsif (defined $default_map) {
5831
5832 # But for $STRING properties, must calculate now. Subtract the
5833 # count from each range that maps to the default.
5834 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5835 if ($range->value eq $default_map) {
5836 $count -= $range->end +1 - $range->start;
5837 }
5838 }
5839
5840 }
5841
5842 # Get a string version of $count with underscores in large numbers,
5843 # for clarity.
5844 my $string_count = main::clarify_number($count);
5845
5846 my $code_points = ($count == 1)
5847 ? 'single code point'
5848 : "$string_count code points";
5849
5850 my $mapping;
5851 my $these_mappings;
5852 my $are;
5853 if (@property_aliases <= 1) {
5854 $mapping = 'mapping';
5855 $these_mappings = 'this mapping';
5856 $are = 'is'
5857 }
5858 else {
5859 $mapping = 'synonymous mappings';
5860 $these_mappings = 'these mappings';
5861 $are = 'are'
5862 }
5863 my $cp;
5864 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5865 $cp = "any code point in Unicode Version $string_version";
5866 }
5867 else {
5868 my $map_to;
5869 if ($default_map eq "") {
5870 $map_to = 'the null string';
5871 }
5872 elsif ($default_map eq $CODE_POINT) {
5873 $map_to = "itself";
5874 }
5875 else {
5876 $map_to = "'$default_map'";
5877 }
5878 if ($count == 1) {
5879 $cp = "the single code point";
5880 }
5881 else {
5882 $cp = "one of the $code_points";
5883 }
5884 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5885 }
5886
5887 my $comment = "";
5888
5889 my $status = $self->status;
5890 if ($status) {
5891 my $warn = uc $status_past_participles{$status};
5892 $comment .= <<END;
5893
5894!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5895 All property or property=value combinations contained in this file are $warn.
5896 See $unicode_reference_url for what this means.
5897
5898END
5899 }
5900 $comment .= "This file returns the $mapping:\n";
5901
5902 for my $i (0 .. @property_aliases - 1) {
5903 $comment .= sprintf("%-8s%s\n",
5904 " ",
5905 $property_aliases[$i]->name . '(cp)'
5906 );
5907 }
5908 $comment .=
5909 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5910
5911 my $access = $core_access{$addr};
5912 if ($access) {
5913 $comment .= "accessible through the Perl core via $access.";
5914 }
5915 else {
5916 $comment .= "not accessible through the Perl core directly.";
5917 }
5918
5919 # And append any commentary already set from the actual property.
5920 $comment .= "\n\n" . $self->comment if $self->comment;
5921 if ($self->description) {
5922 $comment .= "\n\n" . join " ", $self->description;
5923 }
5924 if ($self->note) {
5925 $comment .= "\n\n" . join " ", $self->note;
5926 }
5927 $comment .= "\n";
5928
5929 if (! $self->perl_extension) {
5930 $comment .= <<END;
5931
5932For information about what this property really means, see:
5933$unicode_reference_url
5934END
5935 }
5936
5937 if ($count) { # Format differs for empty table
5938 $comment.= "\nThe format of the ";
5939 if ($self->range_size_1) {
5940 $comment.= <<END;
5941main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5942is in hex; MAPPING is what CODE_POINT maps to.
5943END
5944 }
5945 else {
5946
5947 # There are tables which end up only having one element per
5948 # range, but it is not worth keeping track of for making just
5949 # this comment a little better.
5950 $comment.= <<END;
5951non-comment portions of the main body of lines of this file is:
5952START\\tSTOP\\tMAPPING where START is the starting code point of the
5953range, in hex; STOP is the ending point, or if omitted, the range has just one
5954code point; MAPPING is what each code point between START and STOP maps to.
5955END
0c07e538 5956 if ($self->output_range_counts) {
99870f4d
KW
5957 $comment .= <<END;
5958Numbers in comments in [brackets] indicate how many code points are in the
5959range (omitted when the range is a single code point or if the mapping is to
5960the null string).
5961END
5962 }
5963 }
5964 }
5965 $self->set_comment(main::join_lines($comment));
5966 return;
5967 }
5968
5969 my %swash_keys; # Makes sure don't duplicate swash names.
5970
668b3bfc
KW
5971 # The remaining variables are temporaries used while writing each table,
5972 # to output special ranges.
668b3bfc
KW
5973 my @multi_code_point_maps; # Map is to more than one code point.
5974
668b3bfc
KW
5975 sub handle_special_range {
5976 # Called in the middle of write when it finds a range it doesn't know
5977 # how to handle.
5978
5979 my $self = shift;
5980 my $range = shift;
5981 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5982
5983 my $addr = do { no overloading; pack 'J', $self; };
5984
5985 my $type = $range->type;
5986
5987 my $low = $range->start;
5988 my $high = $range->end;
5989 my $map = $range->value;
5990
5991 # No need to output the range if it maps to the default.
5992 return if $map eq $default_map{$addr};
5993
bb1dd3da
KW
5994 my $property = $self->property;
5995
668b3bfc
KW
5996 # Switch based on the map type...
5997 if ($type == $HANGUL_SYLLABLE) {
5998
5999 # These are entirely algorithmically determinable based on
6000 # some constants furnished by Unicode; for now, just set a
6001 # flag to indicate that have them. After everything is figured
bb1dd3da
KW
6002 # out, we will output the code that does the algorithm. (Don't
6003 # output them if not needed because we are suppressing this
6004 # property.)
6005 $has_hangul_syllables = 1 if $property->to_output_map;
668b3bfc
KW
6006 }
6007 elsif ($type == $CP_IN_NAME) {
6008
bb1dd3da 6009 # Code points whose name ends in their code point are also
668b3bfc
KW
6010 # algorithmically determinable, but need information about the map
6011 # to do so. Both the map and its inverse are stored in data
bb1dd3da
KW
6012 # structures output in the file. They are stored in the mean time
6013 # in global lists The lists will be written out later into Name.pm,
6014 # which is created only if needed. In order to prevent duplicates
6015 # in the list, only add to them for one property, should multiple
6016 # ones need them.
6017 if ($needing_code_points_ending_in_code_point == 0) {
6018 $needing_code_points_ending_in_code_point = $property;
6019 }
6020 if ($property == $needing_code_points_ending_in_code_point) {
6c1bafed
KW
6021 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6022 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6023
6024 my $squeezed = $map =~ s/[-\s]+//gr;
6025 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6026 $low;
6027 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6028 $high;
6029
6030 push @code_points_ending_in_code_point, { low => $low,
6031 high => $high,
6032 name => $map
6033 };
bb1dd3da 6034 }
668b3bfc
KW
6035 }
6036 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6037
6038 # Multi-code point maps and null string maps have an entry
6039 # for each code point in the range. They use the same
6040 # output format.
6041 for my $code_point ($low .. $high) {
6042
c12f2655
KW
6043 # The pack() below can't cope with surrogates. XXX This may
6044 # no longer be true
668b3bfc 6045 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
98dc9551 6046 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
668b3bfc
KW
6047 next;
6048 }
6049
6050 # Generate the hash entries for these in the form that
6051 # utf8.c understands.
6052 my $tostr = "";
6053 my $to_name = "";
6054 my $to_chr = "";
6055 foreach my $to (split " ", $map) {
6056 if ($to !~ /^$code_point_re$/) {
6057 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6058 next;
6059 }
6060 $tostr .= sprintf "\\x{%s}", $to;
6061 $to = CORE::hex $to;
558712cf 6062 if ($annotate) {
c4019d52
KW
6063 $to_name .= " + " if $to_name;
6064 $to_chr .= chr($to);
6065 main::populate_char_info($to)
6066 if ! defined $viacode[$to];
6067 $to_name .= $viacode[$to];
6068 }
668b3bfc
KW
6069 }
6070
6071 # I (khw) have never waded through this line to
6072 # understand it well enough to comment it.
6073 my $utf8 = sprintf(qq["%s" => "$tostr",],
6074 join("", map { sprintf "\\x%02X", $_ }
6075 unpack("U0C*", pack("U", $code_point))));
6076
6077 # Add a comment so that a human reader can more easily
6078 # see what's going on.
6079 push @multi_code_point_maps,
6080 sprintf("%-45s # U+%04X", $utf8, $code_point);
558712cf 6081 if (! $annotate) {
c4019d52
KW
6082 $multi_code_point_maps[-1] .= " => $map";
6083 }
6084 else {
6085 main::populate_char_info($code_point)
6086 if ! defined $viacode[$code_point];
6087 $multi_code_point_maps[-1] .= " '"
6088 . chr($code_point)
6089 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6090 }
668b3bfc
KW
6091 }
6092 }
6093 else {
6094 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
6095 }
6096
6097 return;
6098 }
6099
99870f4d
KW
6100 sub pre_body {
6101 # Returns the string that should be output in the file before the main
668b3bfc
KW
6102 # body of this table. It isn't called until the main body is
6103 # calculated, saving a pass. The string includes some hash entries
6104 # identifying the format of the body, and what the single value should
6105 # be for all ranges missing from it. It also includes any code points
6106 # which have map_types that don't go in the main table.
99870f4d
KW
6107
6108 my $self = shift;
6109 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6110
ffe43484 6111 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6112
6113 my $name = $self->property->swash_name;
6114
19f751d2
KW
6115 # Currently there is nothing in the pre_body unless a swash is being
6116 # generated.
6117 return unless defined $name;
6118
99870f4d
KW
6119 if (defined $swash_keys{$name}) {
6120 Carp::my_carp(join_lines(<<END
6121Already created a swash name '$name' for $swash_keys{$name}. This means that
6122the same name desired for $self shouldn't be used. Bad News. This must be
6123fixed before production use, but proceeding anyway
6124END
6125 ));
6126 }
6127 $swash_keys{$name} = "$self";
6128
99870f4d 6129 my $pre_body = "";
99870f4d 6130
668b3bfc
KW
6131 # Here we assume we were called after have gone through the whole
6132 # file. If we actually generated anything for each map type, add its
6133 # respective header and trailer
ec2f0128 6134 my $specials_name = "";
668b3bfc 6135 if (@multi_code_point_maps) {
ec2f0128 6136 $specials_name = "utf8::ToSpec$name";
668b3bfc 6137 $pre_body .= <<END;
99870f4d
KW
6138
6139# Some code points require special handling because their mappings are each to
6140# multiple code points. These do not appear in the main body, but are defined
6141# in the hash below.
6142
76591e2b
KW
6143# Each key is the string of N bytes that together make up the UTF-8 encoding
6144# for the code point. (i.e. the same as looking at the code point's UTF-8
6145# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
ec2f0128 6146\%$specials_name = (
99870f4d 6147END
668b3bfc
KW
6148 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6149 }
99870f4d 6150
668b3bfc
KW
6151 my $format = $self->format;
6152
6153 my $return = <<END;
6154# The name this swash is to be known by, with the format of the mappings in
6155# the main body of the table, and what all code points missing from this file
6156# map to.
6157\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6158END
ec2f0128
KW
6159 if ($specials_name) {
6160 $return .= <<END;
6161\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6162END
6163 }
668b3bfc
KW
6164 my $default_map = $default_map{$addr};
6165 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6166
6167 if ($default_map eq $CODE_POINT) {
6168 $return .= ' # code point maps to itself';
6169 }
6170 elsif ($default_map eq "") {
6171 $return .= ' # code point maps to the null string';
6172 }
6173 $return .= "\n";
6174
6175 $return .= $pre_body;
6176
6177 return $return;
6178 }
6179
6180 sub write {
6181 # Write the table to the file.
6182
6183 my $self = shift;
6184 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6185
6186 my $addr = do { no overloading; pack 'J', $self; };
6187
6188 # Clear the temporaries
668b3bfc 6189 undef @multi_code_point_maps;
99870f4d
KW
6190
6191 # Calculate the format of the table if not already done.
f5817e0a 6192 my $format = $self->format;
668b3bfc
KW
6193 my $type = $self->property->type;
6194 my $default_map = $self->default_map;
99870f4d
KW
6195 if (! defined $format) {
6196 if ($type == $BINARY) {
6197
6198 # Don't bother checking the values, because we elsewhere
6199 # verify that a binary table has only 2 values.
6200 $format = $BINARY_FORMAT;
6201 }
6202 else {
6203 my @ranges = $self->_range_list->ranges;
6204
6205 # default an empty table based on its type and default map
6206 if (! @ranges) {
6207
6208 # But it turns out that the only one we can say is a
6209 # non-string (besides binary, handled above) is when the
6210 # table is a string and the default map is to a code point
6211 if ($type == $STRING && $default_map eq $CODE_POINT) {
6212 $format = $HEX_FORMAT;
6213 }
6214 else {
6215 $format = $STRING_FORMAT;
6216 }
6217 }
6218 else {
6219
6220 # Start with the most restrictive format, and as we find
6221 # something that doesn't fit with that, change to the next
6222 # most restrictive, and so on.
6223 $format = $DECIMAL_FORMAT;
6224 foreach my $range (@ranges) {
668b3bfc
KW
6225 next if $range->type != 0; # Non-normal ranges don't
6226 # affect the main body
99870f4d
KW
6227 my $map = $range->value;
6228 if ($map ne $default_map) {
6229 last if $format eq $STRING_FORMAT; # already at
6230 # least
6231 # restrictive
6232 $format = $INTEGER_FORMAT
6233 if $format eq $DECIMAL_FORMAT
6234 && $map !~ / ^ [0-9] $ /x;
6235 $format = $FLOAT_FORMAT
6236 if $format eq $INTEGER_FORMAT
6237 && $map !~ / ^ -? [0-9]+ $ /x;
6238 $format = $RATIONAL_FORMAT
6239 if $format eq $FLOAT_FORMAT
6240 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6241 $format = $HEX_FORMAT
6242 if $format eq $RATIONAL_FORMAT
6243 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6244 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6245 && $map =~ /[^0-9A-F]/;
6246 }
6247 }
6248 }
6249 }
6250 } # end of calculating format
6251
668b3bfc 6252 if ($default_map eq $CODE_POINT
99870f4d 6253 && $format ne $HEX_FORMAT
668b3bfc
KW
6254 && ! defined $self->format) # manual settings are always
6255 # considered ok
99870f4d
KW
6256 {
6257 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6258 }
99870f4d 6259
668b3bfc 6260 $self->_set_format($format);
99870f4d 6261
0911a63d
KW
6262 # Core Perl has a different definition of mapping ranges than we do,
6263 # that is applicable mainly to mapping code points, so for tables
6264 # where it is possible that core Perl could be used to read it,
6265 # make it range size 1 to prevent possible confusion
6266 $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6267
99870f4d
KW
6268 return $self->SUPER::write(
6269 ($self->property == $block)
6270 ? 7 # block file needs more tab stops
6271 : 3,
668b3bfc 6272 $default_map); # don't write defaulteds
99870f4d
KW
6273 }
6274
6275 # Accessors for the underlying list that should fail if locked.
ea25a9b2 6276 for my $sub (qw(
99870f4d 6277 add_duplicate
ea25a9b2 6278 ))
99870f4d
KW
6279 {
6280 no strict "refs";
6281 *$sub = sub {
6282 use strict "refs";
6283 my $self = shift;
6284
6285 return if $self->carp_if_locked;
6286 return $self->_range_list->$sub(@_);
6287 }
6288 }
6289} # End closure for Map_Table
6290
6291package Match_Table;
6292use base '_Base_Table';
6293
6294# A Match table is one which is a list of all the code points that have
6295# the same property and property value, for use in \p{property=value}
6296# constructs in regular expressions. It adds very little data to the base
6297# structure, but many methods, as these lists can be combined in many ways to
6298# form new ones.
6299# There are only a few concepts added:
6300# 1) Equivalents and Relatedness.
6301# Two tables can match the identical code points, but have different names.
6302# This always happens when there is a perl single form extension
6303# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6304# tables are set to be related, with the Perl extension being a child, and
6305# the Unicode property being the parent.
6306#
6307# It may be that two tables match the identical code points and we don't
6308# know if they are related or not. This happens most frequently when the
6309# Block and Script properties have the exact range. But note that a
6310# revision to Unicode could add new code points to the script, which would
6311# now have to be in a different block (as the block was filled, or there
6312# would have been 'Unknown' script code points in it and they wouldn't have
6313# been identical). So we can't rely on any two properties from Unicode
6314# always matching the same code points from release to release, and thus
6315# these tables are considered coincidentally equivalent--not related. When
6316# two tables are unrelated but equivalent, one is arbitrarily chosen as the
6317# 'leader', and the others are 'equivalents'. This concept is useful
6318# to minimize the number of tables written out. Only one file is used for
6319# any identical set of code points, with entries in Heavy.pl mapping all
6320# the involved tables to it.
6321#
6322# Related tables will always be identical; we set them up to be so. Thus
6323# if the Unicode one is deprecated, the Perl one will be too. Not so for
6324# unrelated tables. Relatedness makes generating the documentation easier.
6325#
c12f2655
KW
6326# 2) Complement.
6327# Like equivalents, two tables may be the inverses of each other, the
6328# intersection between them is null, and the union is every Unicode code
6329# point. The two tables that occupy a binary property are necessarily like
6330# this. By specifying one table as the complement of another, we can avoid
6331# storing it on disk (using the other table and performing a fast
6332# transform), and some memory and calculations.
6333#
6334# 3) Conflicting. It may be that there will eventually be name clashes, with
99870f4d
KW
6335# the same name meaning different things. For a while, there actually were
6336# conflicts, but they have so far been resolved by changing Perl's or
6337# Unicode's definitions to match the other, but when this code was written,
6338# it wasn't clear that that was what was going to happen. (Unicode changed
6339# because of protests during their beta period.) Name clashes are warned
6340# about during compilation, and the documentation. The generated tables
6341# are sane, free of name clashes, because the code suppresses the Perl
6342# version. But manual intervention to decide what the actual behavior
6343# should be may be required should this happen. The introductory comments
6344# have more to say about this.
6345
6346sub standardize { return main::standardize($_[0]); }
6347sub trace { return main::trace(@_); }
6348
6349
6350{ # Closure
6351
6352 main::setup_package();
6353
6354 my %leader;
6355 # The leader table of this one; initially $self.
6356 main::set_access('leader', \%leader, 'r');
6357
6358 my %equivalents;
6359 # An array of any tables that have this one as their leader
6360 main::set_access('equivalents', \%equivalents, 'readable_array');
6361
6362 my %parent;
6363 # The parent table to this one, initially $self. This allows us to
c12f2655
KW
6364 # distinguish between equivalent tables that are related (for which this
6365 # is set to), and those which may not be, but share the same output file
6366 # because they match the exact same set of code points in the current
6367 # Unicode release.
99870f4d
KW
6368 main::set_access('parent', \%parent, 'r');
6369
6370 my %children;
6371 # An array of any tables that have this one as their parent
6372 main::set_access('children', \%children, 'readable_array');
6373
6374 my %conflicting;
6375 # Array of any tables that would have the same name as this one with
6376 # a different meaning. This is used for the generated documentation.
6377 main::set_access('conflicting', \%conflicting, 'readable_array');
6378
6379 my %matches_all;
6380 # Set in the constructor for tables that are expected to match all code
6381 # points.
6382 main::set_access('matches_all', \%matches_all, 'r');
6383
a92d5c2e
KW
6384 my %complement;
6385 # Points to the complement that this table is expressed in terms of; 0 if
6386 # none.
8ae00c8a 6387 main::set_access('complement', \%complement, 'r');
a92d5c2e 6388
99870f4d
KW
6389 sub new {
6390 my $class = shift;
6391
6392 my %args = @_;
6393
6394 # The property for which this table is a listing of property values.
6395 my $property = delete $args{'_Property'};
6396
23e33b60
KW
6397 my $name = delete $args{'Name'};
6398 my $full_name = delete $args{'Full_Name'};
6399 $full_name = $name if ! defined $full_name;
6400
99870f4d
KW
6401 # Optional
6402 my $initialize = delete $args{'Initialize'};
6403 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 6404 my $format = delete $args{'Format'};
99870f4d
KW
6405 # Rest of parameters passed on.
6406
6407 my $range_list = Range_List->new(Initialize => $initialize,
6408 Owner => $property);
6409
23e33b60
KW
6410 my $complete = $full_name;
6411 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6412 # but this helps debug if it
6413 # does
6414 # The complete name for a match table includes it's property in a
6415 # compound form 'property=table', except if the property is the
6416 # pseudo-property, perl, in which case it is just the single form,
6417 # 'table' (If you change the '=' must also change the ':' in lots of
6418 # places in this program that assume an equal sign)
6419 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 6420
99870f4d 6421 my $self = $class->SUPER::new(%args,
23e33b60
KW
6422 Name => $name,
6423 Complete_Name => $complete,
6424 Full_Name => $full_name,
99870f4d
KW
6425 _Property => $property,
6426 _Range_List => $range_list,
f5817e0a 6427 Format => $EMPTY_FORMAT,
99870f4d 6428 );
ffe43484 6429 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6430
6431 $conflicting{$addr} = [ ];
6432 $equivalents{$addr} = [ ];
6433 $children{$addr} = [ ];
6434 $matches_all{$addr} = $matches_all;
6435 $leader{$addr} = $self;
6436 $parent{$addr} = $self;
a92d5c2e 6437 $complement{$addr} = 0;
99870f4d 6438
f5817e0a
KW
6439 if (defined $format && $format ne $EMPTY_FORMAT) {
6440 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6441 }
6442
99870f4d
KW
6443 return $self;
6444 }
6445
6446 # See this program's beginning comment block about overloading these.
6447 use overload
6448 fallback => 0,
6449 qw("") => "_operator_stringify",
6450 '=' => sub {
6451 my $self = shift;
6452
6453 return if $self->carp_if_locked;
6454 return $self;
6455 },
6456
6457 '+' => sub {
6458 my $self = shift;
6459 my $other = shift;
6460
6461 return $self->_range_list + $other;
6462 },
6463 '&' => sub {
6464 my $self = shift;
6465 my $other = shift;
6466
6467 return $self->_range_list & $other;
6468 },
6469 '+=' => sub {
6470 my $self = shift;
6471 my $other = shift;
6472
6473 return if $self->carp_if_locked;
6474
ffe43484 6475 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6476
6477 if (ref $other) {
6478
6479 # Change the range list of this table to be the
6480 # union of the two.
6481 $self->_set_range_list($self->_range_list
6482 + $other);
6483 }
6484 else { # $other is just a simple value
6485 $self->add_range($other, $other);
6486 }
6487 return $self;
6488 },
6489 '-' => sub { my $self = shift;
6490 my $other = shift;
6491 my $reversed = shift;
6492
6493 if ($reversed) {
6494 Carp::my_carp_bug("Can't cope with a "
6495 . __PACKAGE__
6496 . " being the first parameter in a '-'. Subtraction ignored.");
6497 return;
6498 }
6499
6500 return $self->_range_list - $other;
6501 },
6502 '~' => sub { my $self = shift;
6503 return ~ $self->_range_list;
6504 },
6505 ;
6506
6507 sub _operator_stringify {
6508 my $self = shift;
6509
23e33b60 6510 my $name = $self->complete_name;
99870f4d
KW
6511 return "Table '$name'";
6512 }
6513
ec40ee88
KW
6514 sub _range_list {
6515 # Returns the range list associated with this table, which will be the
6516 # complement's if it has one.
6517
6518 my $self = shift;
6519 my $complement;
6520 if (($complement = $self->complement) != 0) {
6521 return ~ $complement->_range_list;
6522 }
6523 else {
6524 return $self->SUPER::_range_list;
6525 }
6526 }
6527
99870f4d
KW
6528 sub add_alias {
6529 # Add a synonym for this table. See the comments in the base class
6530
6531 my $self = shift;
6532 my $name = shift;
6533 # Rest of parameters passed on.
6534
6535 $self->SUPER::add_alias($name, $self, @_);
6536 return;
6537 }
6538
6539 sub add_conflicting {
6540 # Add the name of some other object to the list of ones that name
6541 # clash with this match table.
6542
6543 my $self = shift;
6544 my $conflicting_name = shift; # The name of the conflicting object
6545 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6546 my $conflicting_object = shift; # Optional, the conflicting object
6547 # itself. This is used to
6548 # disambiguate the text if the input
6549 # name is identical to any of the
6550 # aliases $self is known by.
6551 # Sometimes the conflicting object is
6552 # merely hypothetical, so this has to
6553 # be an optional parameter.
6554 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6555
ffe43484 6556 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6557
6558 # Check if the conflicting name is exactly the same as any existing
6559 # alias in this table (as long as there is a real object there to
6560 # disambiguate with).
6561 if (defined $conflicting_object) {
6562 foreach my $alias ($self->aliases) {
6563 if ($alias->name eq $conflicting_name) {
6564
6565 # Here, there is an exact match. This results in
6566 # ambiguous comments, so disambiguate by changing the
6567 # conflicting name to its object's complete equivalent.
6568 $conflicting_name = $conflicting_object->complete_name;
6569 last;
6570 }
6571 }
6572 }
6573
6574 # Convert to the \p{...} final name
6575 $conflicting_name = "\\$p" . "{$conflicting_name}";
6576
6577 # Only add once
6578 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6579
6580 push @{$conflicting{$addr}}, $conflicting_name;
6581
6582 return;
6583 }
6584
6505c6e2 6585 sub is_set_equivalent_to {
99870f4d
KW
6586 # Return boolean of whether or not the other object is a table of this
6587 # type and has been marked equivalent to this one.
6588
6589 my $self = shift;
6590 my $other = shift;
6591 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6592
6593 return 0 if ! defined $other; # Can happen for incomplete early
6594 # releases
6595 unless ($other->isa(__PACKAGE__)) {
6596 my $ref_other = ref $other;
6597 my $ref_self = ref $self;
6505c6e2 6598 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
99870f4d
KW
6599 return 0;
6600 }
6601
6602 # Two tables are equivalent if they have the same leader.
f998e60c 6603 no overloading;
051df77b 6604 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6605 return;
6606 }
6607
99870f4d
KW
6608 sub set_equivalent_to {
6609 # Set $self equivalent to the parameter table.
6610 # The required Related => 'x' parameter is a boolean indicating
6611 # whether these tables are related or not. If related, $other becomes
6612 # the 'parent' of $self; if unrelated it becomes the 'leader'
6613 #
6614 # Related tables share all characteristics except names; equivalents
6615 # not quite so many.
6616 # If they are related, one must be a perl extension. This is because
6617 # we can't guarantee that Unicode won't change one or the other in a
98dc9551 6618 # later release even if they are identical now.
99870f4d
KW
6619
6620 my $self = shift;
6621 my $other = shift;
6622
6623 my %args = @_;
6624 my $related = delete $args{'Related'};
6625
6626 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6627
6628 return if ! defined $other; # Keep on going; happens in some early
6629 # Unicode releases.
6630
6631 if (! defined $related) {
6632 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6633 $related = 0;
6634 }
6635
6636 # If already are equivalent, no need to re-do it; if subroutine
6637 # returns null, it found an error, also do nothing
6505c6e2 6638 my $are_equivalent = $self->is_set_equivalent_to($other);
99870f4d
KW
6639 return if ! defined $are_equivalent || $are_equivalent;
6640
ffe43484 6641 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6642 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d 6643
45e32b91
KW
6644 if ($related) {
6645 if ($current_leader->perl_extension) {
6646 if ($other->perl_extension) {
6647 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6648 return;
6649 }
7610e9e2
KW
6650 } elsif ($self->property != $other->property # Depending on
6651 # situation, might
6652 # be better to use
6653 # add_alias()
6654 # instead for same
6655 # property
6656 && ! $other->perl_extension)
6657 {
45e32b91
KW
6658 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6659 $related = 0;
6660 }
6661 }
6662
6663 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6664 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
6665 return;
99870f4d
KW
6666 }
6667
ffe43484
NC
6668 my $leader = do { no overloading; pack 'J', $current_leader; };
6669 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6670
6671 # Any tables that are equivalent to or children of this table must now
6672 # instead be equivalent to or (children) to the new leader (parent),
6673 # still equivalent. The equivalency includes their matches_all info,
301ba948 6674 # and for related tables, their fate and status.
99870f4d
KW
6675 # All related tables are of necessity equivalent, but the converse
6676 # isn't necessarily true
6677 my $status = $other->status;
6678 my $status_info = $other->status_info;
301ba948 6679 my $fate = $other->fate;
99870f4d 6680 my $matches_all = $matches_all{other_addr};
d867ccfb 6681 my $caseless_equivalent = $other->caseless_equivalent;
99870f4d
KW
6682 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6683 next if $table == $other;
6684 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6685
ffe43484 6686 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6687 $leader{$table_addr} = $other;
6688 $matches_all{$table_addr} = $matches_all;
6689 $self->_set_range_list($other->_range_list);
6690 push @{$equivalents{$other_addr}}, $table;
6691 if ($related) {
6692 $parent{$table_addr} = $other;
6693 push @{$children{$other_addr}}, $table;
6694 $table->set_status($status, $status_info);
301ba948
KW
6695
6696 # This reason currently doesn't get exposed outside; otherwise
6697 # would have to look up the parent's reason and use it instead.
6698 $table->set_fate($fate, "Parent's fate");
6699
d867ccfb 6700 $self->set_caseless_equivalent($caseless_equivalent);
99870f4d
KW
6701 }
6702 }
6703
6704 # Now that we've declared these to be equivalent, any changes to one
6705 # of the tables would invalidate that equivalency.
6706 $self->lock;
6707 $other->lock;
6708 return;
6709 }
6710
8ae00c8a
KW
6711 sub set_complement {
6712 # Set $self to be the complement of the parameter table. $self is
6713 # locked, as what it contains should all come from the other table.
6714
6715 my $self = shift;
6716 my $other = shift;
6717
6718 my %args = @_;
6719 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6720
6721 if ($other->complement != 0) {
6722 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
6723 return;
6724 }
6725 my $addr = do { no overloading; pack 'J', $self; };
6726 $complement{$addr} = $other;
6727 $self->lock;
6728 return;
6729 }
6730
99870f4d
KW
6731 sub add_range { # Add a range to the list for this table.
6732 my $self = shift;
6733 # Rest of parameters passed on
6734
6735 return if $self->carp_if_locked;
6736 return $self->_range_list->add_range(@_);
6737 }
6738
88c22f80
KW
6739 sub header {
6740 my $self = shift;
6741 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6742
6743 # All match tables are to be used only by the Perl core.
126c3d4e 6744 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
88c22f80
KW
6745 }
6746
99870f4d
KW
6747 sub pre_body { # Does nothing for match tables.
6748 return
6749 }
6750
6751 sub append_to_body { # Does nothing for match tables.
6752 return
6753 }
6754
301ba948
KW
6755 sub set_fate {
6756 my $self = shift;
6757 my $fate = shift;
6758 my $reason = shift;
6759 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6760
6761 $self->SUPER::set_fate($fate, $reason);
6762
6763 # All children share this fate
6764 foreach my $child ($self->children) {
6765 $child->set_fate($fate, $reason);
6766 }
6767 return;
6768 }
6769
99870f4d
KW
6770 sub write {
6771 my $self = shift;
6772 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6773
6774 return $self->SUPER::write(2); # 2 tab stops
6775 }
6776
6777 sub set_final_comment {
6778 # This creates a comment for the file that is to hold the match table
6779 # $self. It is somewhat convoluted to make the English read nicely,
6780 # but, heh, it's just a comment.
6781 # This should be called only with the leader match table of all the
6782 # ones that share the same file. It lists all such tables, ordered so
6783 # that related ones are together.
6784
bd9ebcfd
KW
6785 return unless $debugging_build;
6786
99870f4d
KW
6787 my $leader = shift; # Should only be called on the leader table of
6788 # an equivalent group
6789 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6790
ffe43484 6791 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6792
6793 if ($leader{$addr} != $leader) {
6794 Carp::my_carp_bug(<<END
6795set_final_comment() must be called on a leader table, which $leader is not.
6796It is equivalent to $leader{$addr}. No comment created
6797END
6798 );
6799 return;
6800 }
6801
6802 # Get the number of code points matched by each of the tables in this
6803 # file, and add underscores for clarity.
6804 my $count = $leader->count;
6805 my $string_count = main::clarify_number($count);
6806
6807 my $loose_count = 0; # how many aliases loosely matched
6808 my $compound_name = ""; # ? Are any names compound?, and if so, an
6809 # example
6810 my $properties_with_compound_names = 0; # count of these
6811
6812
6813 my %flags; # The status flags used in the file
6814 my $total_entries = 0; # number of entries written in the comment
6815 my $matches_comment = ""; # The portion of the comment about the
6816 # \p{}'s
6817 my @global_comments; # List of all the tables' comments that are
6818 # there before this routine was called.
6819
6820 # Get list of all the parent tables that are equivalent to this one
6821 # (including itself).
6822 my @parents = grep { $parent{main::objaddr $_} == $_ }
6823 main::uniques($leader, @{$equivalents{$addr}});
6824 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6825 # tables
6826
6827 for my $parent (@parents) {
6828
6829 my $property = $parent->property;
6830
6831 # Special case 'N' tables in properties with two match tables when
6832 # the other is a 'Y' one. These are likely to be binary tables,
6833 # but not necessarily. In either case, \P{} will match the
6834 # complement of \p{}, and so if something is a synonym of \p, the
6835 # complement of that something will be the synonym of \P. This
6836 # would be true of any property with just two match tables, not
6837 # just those whose values are Y and N; but that would require a
6838 # little extra work, and there are none such so far in Unicode.
6839 my $perl_p = 'p'; # which is it? \p{} or \P{}
6840 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6841
6842 if (scalar $property->tables == 2
6843 && $parent == $property->table('N')
6844 && defined (my $yes = $property->table('Y')))
6845 {
ffe43484 6846 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6847 @yes_perl_synonyms
6848 = grep { $_->property == $perl }
6849 main::uniques($yes,
6850 $parent{$yes_addr},
6851 $parent{$yes_addr}->children);
6852
6853 # But these synonyms are \P{} ,not \p{}
6854 $perl_p = 'P';
6855 }
6856
6857 my @description; # Will hold the table description
6858 my @note; # Will hold the table notes.
6859 my @conflicting; # Will hold the table conflicts.
6860
6861 # Look at the parent, any yes synonyms, and all the children
ffe43484 6862 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6863 for my $table ($parent,
6864 @yes_perl_synonyms,
f998e60c 6865 @{$children{$parent_addr}})
99870f4d 6866 {
ffe43484 6867 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6868 my $table_property = $table->property;
6869
6870 # Tables are separated by a blank line to create a grouping.
6871 $matches_comment .= "\n" if $matches_comment;
6872
6873 # The table is named based on the property and value
6874 # combination it is for, like script=greek. But there may be
6875 # a number of synonyms for each side, like 'sc' for 'script',
6876 # and 'grek' for 'greek'. Any combination of these is a valid
6877 # name for this table. In this case, there are three more,
6878 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6879 # listing all possible combinations in the comment, we make
6880 # sure that each synonym occurs at least once, and add
6881 # commentary that the other combinations are possible.
da912e1e
KW
6882 # Because regular expressions don't recognize things like
6883 # \p{jsn=}, only look at non-null right-hand-sides
99870f4d 6884 my @property_aliases = $table_property->aliases;
da912e1e 6885 my @table_aliases = grep { $_->name ne "" } $table->aliases;
99870f4d
KW
6886
6887 # The alias lists above are already ordered in the order we
6888 # want to output them. To ensure that each synonym is listed,
da912e1e
KW
6889 # we must use the max of the two numbers. But if there are no
6890 # legal synonyms (nothing in @table_aliases), then we don't
6891 # list anything.
6892 my $listed_combos = (@table_aliases)
6893 ? main::max(scalar @table_aliases,
6894 scalar @property_aliases)
6895 : 0;
99870f4d
KW
6896 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6897
da912e1e 6898
99870f4d
KW
6899 my $property_had_compound_name = 0;
6900
6901 for my $i (0 .. $listed_combos - 1) {
6902 $total_entries++;
6903
6904 # The current alias for the property is the next one on
6905 # the list, or if beyond the end, start over. Similarly
6906 # for the table (\p{prop=table})
6907 my $property_alias = $property_aliases
6908 [$i % @property_aliases]->name;
6909 my $table_alias_object = $table_aliases
6910 [$i % @table_aliases];
6911 my $table_alias = $table_alias_object->name;
6912 my $loose_match = $table_alias_object->loose_match;
6913
6914 if ($table_alias !~ /\D/) { # Clarify large numbers.
6915 $table_alias = main::clarify_number($table_alias)
6916 }
6917
6918 # Add a comment for this alias combination
6919 my $current_match_comment;
6920 if ($table_property == $perl) {
6921 $current_match_comment = "\\$perl_p"
6922 . "{$table_alias}";
6923 }
6924 else {
6925 $current_match_comment
6926 = "\\p{$property_alias=$table_alias}";
6927 $property_had_compound_name = 1;
6928 }
6929
6930 # Flag any abnormal status for this table.
6931 my $flag = $property->status
6932 || $table->status
6933 || $table_alias_object->status;
301ba948 6934 $flags{$flag} = $status_past_participles{$flag} if $flag;
99870f4d
KW
6935
6936 $loose_count++;
6937
6938 # Pretty up the comment. Note the \b; it says don't make
6939 # this line a continuation.
6940 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6941 $flag,
6942 " " x 7,
6943 $current_match_comment);
6944 } # End of generating the entries for this table.
6945
6946 # Save these for output after this group of related tables.
6947 push @description, $table->description;
6948 push @note, $table->note;
6949 push @conflicting, $table->conflicting;
6950
37e2e78e
KW
6951 # And this for output after all the tables.
6952 push @global_comments, $table->comment;
6953
99870f4d
KW
6954 # Compute an alternate compound name using the final property
6955 # synonym and the first table synonym with a colon instead of
6956 # the equal sign used elsewhere.
6957 if ($property_had_compound_name) {
6958 $properties_with_compound_names ++;
6959 if (! $compound_name || @property_aliases > 1) {
6960 $compound_name = $property_aliases[-1]->name
6961 . ': '
6962 . $table_aliases[0]->name;
6963 }
6964 }
6965 } # End of looping through all children of this table
6966
6967 # Here have assembled in $matches_comment all the related tables
6968 # to the current parent (preceded by the same info for all the
6969 # previous parents). Put out information that applies to all of
6970 # the current family.
6971 if (@conflicting) {
6972
6973 # But output the conflicting information now, as it applies to
6974 # just this table.
6975 my $conflicting = join ", ", @conflicting;
6976 if ($conflicting) {
6977 $matches_comment .= <<END;
6978
6979 Note that contrary to what you might expect, the above is NOT the same as
6980END
6981 $matches_comment .= "any of: " if @conflicting > 1;
6982 $matches_comment .= "$conflicting\n";
6983 }
6984 }
6985 if (@description) {
6986 $matches_comment .= "\n Meaning: "
6987 . join('; ', @description)
6988 . "\n";
6989 }
6990 if (@note) {
6991 $matches_comment .= "\n Note: "
6992 . join("\n ", @note)
6993 . "\n";
6994 }
6995 } # End of looping through all tables
6996
6997
6998 my $code_points;
6999 my $match;
7000 my $any_of_these;
7001 if ($count == 1) {
7002 $match = 'matches';
7003 $code_points = 'single code point';
7004 }
7005 else {
7006 $match = 'match';
7007 $code_points = "$string_count code points";
7008 }
7009
7010 my $synonyms;
7011 my $entries;
da912e1e 7012 if ($total_entries == 1) {
99870f4d
KW
7013 $synonyms = "";
7014 $entries = 'entry';
7015 $any_of_these = 'this'
7016 }
7017 else {
7018 $synonyms = " any of the following regular expression constructs";
7019 $entries = 'entries';
7020 $any_of_these = 'any of these'
7021 }
7022
7023 my $comment = "";
7024 if ($has_unrelated) {
7025 $comment .= <<END;
7026This file is for tables that are not necessarily related: To conserve
7027resources, every table that matches the identical set of code points in this
7028version of Unicode uses this file. Each one is listed in a separate group
7029below. It could be that the tables will match the same set of code points in
7030other Unicode releases, or it could be purely coincidence that they happen to
7031be the same in Unicode $string_version, and hence may not in other versions.
7032
7033END
7034 }
7035
7036 if (%flags) {
7037 foreach my $flag (sort keys %flags) {
7038 $comment .= <<END;
37e2e78e 7039'$flag' below means that this form is $flags{$flag}.
301ba948 7040Consult $pod_file.pod
99870f4d
KW
7041END
7042 }
7043 $comment .= "\n";
7044 }
7045
da912e1e
KW
7046 if ($total_entries == 0) {
7047 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7048 $comment .= <<END;
7049This file returns the $code_points in Unicode Version $string_version for
7050$leader, but it is inaccessible through Perl regular expressions, as
7051"\\p{prop=}" is not recognized.
7052END
7053
7054 } else {
7055 $comment .= <<END;
99870f4d
KW
7056This file returns the $code_points in Unicode Version $string_version that
7057$match$synonyms:
7058
7059$matches_comment
37e2e78e 7060$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
7061including if adding or subtracting white space, underscore, and hyphen
7062characters matters or doesn't matter, and other permissible syntactic
7063variants. Upper/lower case distinctions never matter.
7064END
7065
da912e1e 7066 }
99870f4d
KW
7067 if ($compound_name) {
7068 $comment .= <<END;
7069
7070A colon can be substituted for the equals sign, and
7071END
7072 if ($properties_with_compound_names > 1) {
7073 $comment .= <<END;
7074within each group above,
7075END
7076 }
7077 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7078
7079 # Note the \b below, it says don't make that line a continuation.
7080 $comment .= <<END;
7081anything to the left of the equals (or colon) can be combined with anything to
7082the right. Thus, for example,
7083$compound_name
7084\bis also valid.
7085END
7086 }
7087
7088 # And append any comment(s) from the actual tables. They are all
7089 # gathered here, so may not read all that well.
37e2e78e
KW
7090 if (@global_comments) {
7091 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7092 }
99870f4d
KW
7093
7094 if ($count) { # The format differs if no code points, and needs no
7095 # explanation in that case
7096 $comment.= <<END;
7097
7098The format of the lines of this file is:
7099END
7100 $comment.= <<END;
7101START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7102STOP is the ending point, or if omitted, the range has just one code point.
7103END
0c07e538 7104 if ($leader->output_range_counts) {
99870f4d
KW
7105 $comment .= <<END;
7106Numbers in comments in [brackets] indicate how many code points are in the
7107range.
7108END
7109 }
7110 }
7111
7112 $leader->set_comment(main::join_lines($comment));
7113 return;
7114 }
7115
7116 # Accessors for the underlying list
ea25a9b2 7117 for my $sub (qw(
99870f4d
KW
7118 get_valid_code_point
7119 get_invalid_code_point
ea25a9b2 7120 ))
99870f4d
KW
7121 {
7122 no strict "refs";
7123 *$sub = sub {
7124 use strict "refs";
7125 my $self = shift;
7126
7127 return $self->_range_list->$sub(@_);
7128 }
7129 }
7130} # End closure for Match_Table
7131
7132package Property;
7133
7134# The Property class represents a Unicode property, or the $perl
7135# pseudo-property. It contains a map table initialized empty at construction
7136# time, and for properties accessible through regular expressions, various
7137# match tables, created through the add_match_table() method, and referenced
7138# by the table('NAME') or tables() methods, the latter returning a list of all
7139# of the match tables. Otherwise table operations implicitly are for the map
7140# table.
7141#
7142# Most of the data in the property is actually about its map table, so it
7143# mostly just uses that table's accessors for most methods. The two could
7144# have been combined into one object, but for clarity because of their
7145# differing semantics, they have been kept separate. It could be argued that
7146# the 'file' and 'directory' fields should be kept with the map table.
7147#
7148# Each property has a type. This can be set in the constructor, or in the
7149# set_type accessor, but mostly it is figured out by the data. Every property
7150# starts with unknown type, overridden by a parameter to the constructor, or
7151# as match tables are added, or ranges added to the map table, the data is
7152# inspected, and the type changed. After the table is mostly or entirely
7153# filled, compute_type() should be called to finalize they analysis.
7154#
7155# There are very few operations defined. One can safely remove a range from
7156# the map table, and property_add_or_replace_non_nulls() adds the maps from another
7157# table to this one, replacing any in the intersection of the two.
7158
7159sub standardize { return main::standardize($_[0]); }
7160sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7161
7162{ # Closure
7163
7164 # This hash will contain as keys, all the aliases of all properties, and
7165 # as values, pointers to their respective property objects. This allows
7166 # quick look-up of a property from any of its names.
7167 my %alias_to_property_of;
7168
7169 sub dump_alias_to_property_of {
7170 # For debugging
7171
7172 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7173 return;
7174 }
7175
7176 sub property_ref {
7177 # This is a package subroutine, not called as a method.
7178 # If the single parameter is a literal '*' it returns a list of all
7179 # defined properties.
7180 # Otherwise, the single parameter is a name, and it returns a pointer
7181 # to the corresponding property object, or undef if none.
7182 #
7183 # Properties can have several different names. The 'standard' form of
7184 # each of them is stored in %alias_to_property_of as they are defined.
7185 # But it's possible that this subroutine will be called with some
7186 # variant, so if the initial lookup fails, it is repeated with the
98dc9551 7187 # standardized form of the input name. If found, besides returning the
99870f4d
KW
7188 # result, the input name is added to the list so future calls won't
7189 # have to do the conversion again.
7190
7191 my $name = shift;
7192
7193 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7194
7195 if (! defined $name) {
7196 Carp::my_carp_bug("Undefined input property. No action taken.");
7197 return;
7198 }
7199
7200 return main::uniques(values %alias_to_property_of) if $name eq '*';
7201
7202 # Return cached result if have it.
7203 my $result = $alias_to_property_of{$name};
7204 return $result if defined $result;
7205
7206 # Convert the input to standard form.
7207 my $standard_name = standardize($name);
7208
7209 $result = $alias_to_property_of{$standard_name};
7210 return unless defined $result; # Don't cache undefs
7211
7212 # Cache the result before returning it.
7213 $alias_to_property_of{$name} = $result;
7214 return $result;
7215 }
7216
7217
7218 main::setup_package();
7219
7220 my %map;
7221 # A pointer to the map table object for this property
7222 main::set_access('map', \%map);
7223
7224 my %full_name;
7225 # The property's full name. This is a duplicate of the copy kept in the
7226 # map table, but is needed because stringify needs it during
7227 # construction of the map table, and then would have a chicken before egg
7228 # problem.
7229 main::set_access('full_name', \%full_name, 'r');
7230
7231 my %table_ref;
7232 # This hash will contain as keys, all the aliases of any match tables
7233 # attached to this property, and as values, the pointers to their
7234 # respective tables. This allows quick look-up of a table from any of its
7235 # names.
7236 main::set_access('table_ref', \%table_ref);
7237
7238 my %type;
7239 # The type of the property, $ENUM, $BINARY, etc
7240 main::set_access('type', \%type, 'r');
7241
7242 my %file;
7243 # The filename where the map table will go (if actually written).
7244 # Normally defaulted, but can be overridden.
7245 main::set_access('file', \%file, 'r', 's');
7246
7247 my %directory;
7248 # The directory where the map table will go (if actually written).
7249 # Normally defaulted, but can be overridden.
7250 main::set_access('directory', \%directory, 's');
7251
7252 my %pseudo_map_type;
7253 # This is used to affect the calculation of the map types for all the
7254 # ranges in the table. It should be set to one of the values that signify
7255 # to alter the calculation.
7256 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7257
7258 my %has_only_code_point_maps;
7259 # A boolean used to help in computing the type of data in the map table.
7260 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7261
7262 my %unique_maps;
7263 # A list of the first few distinct mappings this property has. This is
7264 # used to disambiguate between binary and enum property types, so don't
7265 # have to keep more than three.
7266 main::set_access('unique_maps', \%unique_maps);
7267
56557540
KW
7268 my %pre_declared_maps;
7269 # A boolean that gives whether the input data should declare all the
7270 # tables used, or not. If the former, unknown ones raise a warning.
7271 main::set_access('pre_declared_maps',
047274f2 7272 \%pre_declared_maps, 'r', 's');
56557540 7273
99870f4d
KW
7274 sub new {
7275 # The only required parameter is the positionally first, name. All
7276 # other parameters are key => value pairs. See the documentation just
7277 # above for the meanings of the ones not passed directly on to the map
7278 # table constructor.
7279
7280 my $class = shift;
7281 my $name = shift || "";
7282
7283 my $self = property_ref($name);
7284 if (defined $self) {
7285 my $options_string = join ", ", @_;
7286 $options_string = ". Ignoring options $options_string" if $options_string;
7287 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7288 return $self;
7289 }
7290
7291 my %args = @_;
7292
7293 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 7294 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7295
7296 $directory{$addr} = delete $args{'Directory'};
7297 $file{$addr} = delete $args{'File'};
7298 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7299 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7300 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
56557540
KW
7301 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7302 # Starting in this release, property
7303 # values should be defined for all
7304 # properties, except those overriding this
7305 // $v_version ge v5.1.0;
c12f2655 7306
99870f4d
KW
7307 # Rest of parameters passed on.
7308
7309 $has_only_code_point_maps{$addr} = 1;
7310 $table_ref{$addr} = { };
7311 $unique_maps{$addr} = { };
7312
7313 $map{$addr} = Map_Table->new($name,
7314 Full_Name => $full_name{$addr},
7315 _Alias_Hash => \%alias_to_property_of,
7316 _Property => $self,
7317 %args);
7318 return $self;
7319 }
7320
7321 # See this program's beginning comment block about overloading the copy
7322 # constructor. Few operations are defined on properties, but a couple are
7323 # useful. It is safe to take the inverse of a property, and to remove a
7324 # single code point from it.
7325 use overload
7326 fallback => 0,
7327 qw("") => "_operator_stringify",
7328 "." => \&main::_operator_dot,
7329 '==' => \&main::_operator_equal,
7330 '!=' => \&main::_operator_not_equal,
7331 '=' => sub { return shift },
7332 '-=' => "_minus_and_equal",
7333 ;
7334
7335 sub _operator_stringify {
7336 return "Property '" . shift->full_name . "'";
7337 }
7338
7339 sub _minus_and_equal {
7340 # Remove a single code point from the map table of a property.
7341
7342 my $self = shift;
7343 my $other = shift;
7344 my $reversed = shift;
7345 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7346
7347 if (ref $other) {
7348 Carp::my_carp_bug("Can't cope with a "
7349 . ref($other)
7350 . " argument to '-='. Subtraction ignored.");
7351 return $self;
7352 }
98dc9551 7353 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
99870f4d
KW
7354 Carp::my_carp_bug("Can't cope with a "
7355 . __PACKAGE__
7356 . " being the first parameter in a '-='. Subtraction ignored.");
7357 return $self;
7358 }
7359 else {
f998e60c 7360 no overloading;
051df77b 7361 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
7362 }
7363 return $self;
7364 }
7365
7366 sub add_match_table {
7367 # Add a new match table for this property, with name given by the
7368 # parameter. It returns a pointer to the table.
7369
7370 my $self = shift;
7371 my $name = shift;
7372 my %args = @_;
7373
ffe43484 7374 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7375
7376 my $table = $table_ref{$addr}{$name};
7377 my $standard_name = main::standardize($name);
7378 if (defined $table
7379 || (defined ($table = $table_ref{$addr}{$standard_name})))
7380 {
7381 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7382 $table_ref{$addr}{$name} = $table;
7383 return $table;
7384 }
7385 else {
7386
7387 # See if this is a perl extension, if not passed in.
7388 my $perl_extension = delete $args{'Perl_Extension'};
7389 $perl_extension
7390 = $self->perl_extension if ! defined $perl_extension;
7391
7392 $table = Match_Table->new(
7393 Name => $name,
7394 Perl_Extension => $perl_extension,
7395 _Alias_Hash => $table_ref{$addr},
7396 _Property => $self,
7397
301ba948
KW
7398 # gets property's fate and status by default
7399 Fate => $self->fate,
99870f4d
KW
7400 Status => $self->status,
7401 _Status_Info => $self->status_info,
88c22f80 7402 %args);
99870f4d
KW
7403 return unless defined $table;
7404 }
7405
7406 # Save the names for quick look up
7407 $table_ref{$addr}{$standard_name} = $table;
7408 $table_ref{$addr}{$name} = $table;
7409
7410 # Perhaps we can figure out the type of this property based on the
7411 # fact of adding this match table. First, string properties don't
7412 # have match tables; second, a binary property can't have 3 match
7413 # tables
7414 if ($type{$addr} == $UNKNOWN) {
7415 $type{$addr} = $NON_STRING;
7416 }
7417 elsif ($type{$addr} == $STRING) {
7418 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7419 $type{$addr} = $NON_STRING;
7420 }
06f26c45 7421 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
99870f4d
KW
7422 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7423 && $type{$addr} == $BINARY)
7424 {
7425 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7426 $type{$addr} = $ENUM;
7427 }
7428 }
7429
7430 return $table;
7431 }
7432
4b9b0bc5
KW
7433 sub delete_match_table {
7434 # Delete the table referred to by $2 from the property $1.
7435
7436 my $self = shift;
7437 my $table_to_remove = shift;
7438 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7439
7440 my $addr = do { no overloading; pack 'J', $self; };
7441
7442 # Remove all names that refer to it.
7443 foreach my $key (keys %{$table_ref{$addr}}) {
7444 delete $table_ref{$addr}{$key}
7445 if $table_ref{$addr}{$key} == $table_to_remove;
7446 }
7447
7448 $table_to_remove->DESTROY;
7449 return;
7450 }
7451
99870f4d
KW
7452 sub table {
7453 # Return a pointer to the match table (with name given by the
7454 # parameter) associated with this property; undef if none.
7455
7456 my $self = shift;
7457 my $name = shift;
7458 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7459
ffe43484 7460 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7461
7462 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7463
7464 # If quick look-up failed, try again using the standard form of the
7465 # input name. If that succeeds, cache the result before returning so
7466 # won't have to standardize this input name again.
7467 my $standard_name = main::standardize($name);
7468 return unless defined $table_ref{$addr}{$standard_name};
7469
7470 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7471 return $table_ref{$addr}{$name};
7472 }
7473
7474 sub tables {
7475 # Return a list of pointers to all the match tables attached to this
7476 # property
7477
f998e60c 7478 no overloading;
051df77b 7479 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
7480 }
7481
7482 sub directory {
7483 # Returns the directory the map table for this property should be
7484 # output in. If a specific directory has been specified, that has
7485 # priority; 'undef' is returned if the type isn't defined;
7486 # or $map_directory for everything else.
7487
ffe43484 7488 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
7489
7490 return $directory{$addr} if defined $directory{$addr};
7491 return undef if $type{$addr} == $UNKNOWN;
7492 return $map_directory;
7493 }
7494
7495 sub swash_name {
7496 # Return the name that is used to both:
7497 # 1) Name the file that the map table is written to.
7498 # 2) The name of swash related stuff inside that file.
7499 # The reason for this is that the Perl core historically has used
7500 # certain names that aren't the same as the Unicode property names.
7501 # To continue using these, $file is hard-coded in this file for those,
7502 # but otherwise the standard name is used. This is different from the
7503 # external_name, so that the rest of the files, like in lib can use
7504 # the standard name always, without regard to historical precedent.
7505
7506 my $self = shift;
7507 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7508
ffe43484 7509 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 7510
19f751d2
KW
7511 # Swash names are used only on regular map tables; otherwise there
7512 # should be no access to the property map table from other parts of
7513 # Perl.
7514 return if $map{$addr}->fate != $ORDINARY;
7515
99870f4d
KW
7516 return $file{$addr} if defined $file{$addr};
7517 return $map{$addr}->external_name;
7518 }
7519
7520 sub to_create_match_tables {
7521 # Returns a boolean as to whether or not match tables should be
7522 # created for this property.
7523
7524 my $self = shift;
7525 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7526
7527 # The whole point of this pseudo property is match tables.
7528 return 1 if $self == $perl;
7529
ffe43484 7530 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7531
7532 # Don't generate tables of code points that match the property values
7533 # of a string property. Such a list would most likely have many
7534 # property values, each with just one or very few code points mapping
7535 # to it.
7536 return 0 if $type{$addr} == $STRING;
7537
7538 # Don't generate anything for unimplemented properties.
7539 return 0 if grep { $self->complete_name eq $_ }
7540 @unimplemented_properties;
7541 # Otherwise, do.
7542 return 1;
7543 }
7544
7545 sub property_add_or_replace_non_nulls {
7546 # This adds the mappings in the property $other to $self. Non-null
7547 # mappings from $other override those in $self. It essentially merges
7548 # the two properties, with the second having priority except for null
7549 # mappings.
7550
7551 my $self = shift;
7552 my $other = shift;
7553 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7554
7555 if (! $other->isa(__PACKAGE__)) {
7556 Carp::my_carp_bug("$other should be a "
7557 . __PACKAGE__
7558 . ". Not a '"
7559 . ref($other)
7560 . "'. Not added;");
7561 return;
7562 }
7563
f998e60c 7564 no overloading;
051df77b 7565 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
7566 }
7567
7568 sub set_type {
7569 # Set the type of the property. Mostly this is figured out by the
7570 # data in the table. But this is used to set it explicitly. The
7571 # reason it is not a standard accessor is that when setting a binary
7572 # property, we need to make sure that all the true/false aliases are
7573 # present, as they were omitted in early Unicode releases.
7574
7575 my $self = shift;
7576 my $type = shift;
7577 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7578
06f26c45
KW
7579 if ($type != $ENUM
7580 && $type != $BINARY
7581 && $type != $FORCED_BINARY
7582 && $type != $STRING)
7583 {
99870f4d
KW
7584 Carp::my_carp("Unrecognized type '$type'. Type not set");
7585 return;
7586 }
7587
051df77b 7588 { no overloading; $type{pack 'J', $self} = $type; }
06f26c45 7589 return if $type != $BINARY && $type != $FORCED_BINARY;
99870f4d
KW
7590
7591 my $yes = $self->table('Y');
7592 $yes = $self->table('Yes') if ! defined $yes;
01adf4be
KW
7593 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7594 if ! defined $yes;
7595
3c6bf941
KW
7596 # Add aliases in order wanted, duplicates will be ignored. We use a
7597 # binary property present in all releases for its ordered lists of
7598 # true/false aliases. Note, that could run into problems in
7599 # outputting things in that we don't distinguish between the name and
7600 # full name of these. Hopefully, if the table was already created
7601 # before this code is executed, it was done with these set properly.
7602 my $bm = property_ref("Bidi_Mirrored");
7603 foreach my $alias ($bm->table("Y")->aliases) {
7604 $yes->add_alias($alias->name);
7605 }
99870f4d
KW
7606 my $no = $self->table('N');
7607 $no = $self->table('No') if ! defined $no;
01adf4be 7608 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
3c6bf941
KW
7609 foreach my $alias ($bm->table("N")->aliases) {
7610 $no->add_alias($alias->name);
7611 }
c12f2655 7612
99870f4d
KW
7613 return;
7614 }
7615
7616 sub add_map {
7617 # Add a map to the property's map table. This also keeps
7618 # track of the maps so that the property type can be determined from
7619 # its data.
7620
7621 my $self = shift;
7622 my $start = shift; # First code point in range
7623 my $end = shift; # Final code point in range
7624 my $map = shift; # What the range maps to.
7625 # Rest of parameters passed on.
7626
ffe43484 7627 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7628
7629 # If haven't the type of the property, gather information to figure it
7630 # out.
7631 if ($type{$addr} == $UNKNOWN) {
7632
7633 # If the map contains an interior blank or dash, or most other
7634 # nonword characters, it will be a string property. This
7635 # heuristic may actually miss some string properties. If so, they
7636 # may need to have explicit set_types called for them. This
7637 # happens in the Unihan properties.
7638 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7639 || $map =~ / [^\w.\/\ -] /x)
7640 {
7641 $self->set_type($STRING);
7642
7643 # $unique_maps is used for disambiguating between ENUM and
7644 # BINARY later; since we know the property is not going to be
7645 # one of those, no point in keeping the data around
7646 undef $unique_maps{$addr};
7647 }
7648 else {
7649
7650 # Not necessarily a string. The final decision has to be
7651 # deferred until all the data are in. We keep track of if all
7652 # the values are code points for that eventual decision.
7653 $has_only_code_point_maps{$addr} &=
7654 $map =~ / ^ $code_point_re $/x;
7655
7656 # For the purposes of disambiguating between binary and other
7657 # enumerations at the end, we keep track of the first three
7658 # distinct property values. Once we get to three, we know
7659 # it's not going to be binary, so no need to track more.
7660 if (scalar keys %{$unique_maps{$addr}} < 3) {
7661 $unique_maps{$addr}{main::standardize($map)} = 1;
7662 }
7663 }
7664 }
7665
7666 # Add the mapping by calling our map table's method
7667 return $map{$addr}->add_map($start, $end, $map, @_);
7668 }
7669
7670 sub compute_type {
7671 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7672 # should be called after the property is mostly filled with its maps.
7673 # We have been keeping track of what the property values have been,
7674 # and now have the necessary information to figure out the type.
7675
7676 my $self = shift;
7677 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7678
ffe43484 7679 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7680
7681 my $type = $type{$addr};
7682
7683 # If already have figured these out, no need to do so again, but we do
7684 # a double check on ENUMS to make sure that a string property hasn't
7685 # improperly been classified as an ENUM, so continue on with those.
06f26c45
KW
7686 return if $type == $STRING
7687 || $type == $BINARY
7688 || $type == $FORCED_BINARY;
99870f4d
KW
7689
7690 # If every map is to a code point, is a string property.
7691 if ($type == $UNKNOWN
7692 && ($has_only_code_point_maps{$addr}
7693 || (defined $map{$addr}->default_map
7694 && $map{$addr}->default_map eq "")))
7695 {
7696 $self->set_type($STRING);
7697 }
7698 else {
7699
7700 # Otherwise, it is to some sort of enumeration. (The case where
7701 # it is a Unicode miscellaneous property, and treated like a
7702 # string in this program is handled in add_map()). Distinguish
7703 # between binary and some other enumeration type. Of course, if
7704 # there are more than two values, it's not binary. But more
7705 # subtle is the test that the default mapping is defined means it
7706 # isn't binary. This in fact may change in the future if Unicode
7707 # changes the way its data is structured. But so far, no binary
7708 # properties ever have @missing lines for them, so the default map
7709 # isn't defined for them. The few properties that are two-valued
7710 # and aren't considered binary have the default map defined
7711 # starting in Unicode 5.0, when the @missing lines appeared; and
7712 # this program has special code to put in a default map for them
7713 # for earlier than 5.0 releases.
7714 if ($type == $ENUM
7715 || scalar keys %{$unique_maps{$addr}} > 2
7716 || defined $self->default_map)
7717 {
7718 my $tables = $self->tables;
7719 my $count = $self->count;
7720 if ($verbosity && $count > 500 && $tables/$count > .1) {
7721 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7722 }
7723 $self->set_type($ENUM);
7724 }
7725 else {
7726 $self->set_type($BINARY);
7727 }
7728 }
7729 undef $unique_maps{$addr}; # Garbage collect
7730 return;
7731 }
7732
301ba948
KW
7733 sub set_fate {
7734 my $self = shift;
7735 my $fate = shift;
7736 my $reason = shift; # Ignored unless suppressing
7737 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7738
7739 my $addr = do { no overloading; pack 'J', $self; };
7740 if ($fate == $SUPPRESSED) {
7741 $why_suppressed{$self->complete_name} = $reason;
7742 }
7743
7744 # Each table shares the property's fate
7745 foreach my $table ($map{$addr}, $self->tables) {
7746 $table->set_fate($fate, $reason);
7747 }
7748 return;
7749 }
7750
7751
99870f4d
KW
7752 # Most of the accessors for a property actually apply to its map table.
7753 # Setup up accessor functions for those, referring to %map
ea25a9b2 7754 for my $sub (qw(
99870f4d
KW
7755 add_alias
7756 add_anomalous_entry
7757 add_comment
7758 add_conflicting
7759 add_description
7760 add_duplicate
7761 add_note
7762 aliases
7763 comment
7764 complete_name
2f7a8815 7765 containing_range
99870f4d
KW
7766 core_access
7767 count
7768 default_map
7769 delete_range
7770 description
7771 each_range
7772 external_name
301ba948 7773 fate
99870f4d
KW
7774 file_path
7775 format
7776 initialize
7777 inverse_list
7778 is_empty
7779 name
7780 note
7781 perl_extension
7782 property
7783 range_count
7784 ranges
7785 range_size_1
7786 reset_each_range
7787 set_comment
7788 set_core_access
7789 set_default_map
7790 set_file_path
7791 set_final_comment
7792 set_range_size_1
7793 set_status
7794 set_to_output_map
7795 short_name
7796 status
7797 status_info
7798 to_output_map
0a9dbafc 7799 type_of
99870f4d
KW
7800 value_of
7801 write
ea25a9b2 7802 ))
99870f4d
KW
7803 # 'property' above is for symmetry, so that one can take
7804 # the property of a property and get itself, and so don't
7805 # have to distinguish between properties and tables in
7806 # calling code
7807 {
7808 no strict "refs";
7809 *$sub = sub {
7810 use strict "refs";
7811 my $self = shift;
f998e60c 7812 no overloading;
051df77b 7813 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7814 }
7815 }
7816
7817
7818} # End closure
7819
7820package main;
7821
7822sub join_lines($) {
7823 # Returns lines of the input joined together, so that they can be folded
7824 # properly.
7825 # This causes continuation lines to be joined together into one long line
7826 # for folding. A continuation line is any line that doesn't begin with a
7827 # space or "\b" (the latter is stripped from the output). This is so
7828 # lines can be be in a HERE document so as to fit nicely in the terminal
7829 # width, but be joined together in one long line, and then folded with
7830 # indents, '#' prefixes, etc, properly handled.
7831 # A blank separates the joined lines except if there is a break; an extra
7832 # blank is inserted after a period ending a line.
7833
98dc9551 7834 # Initialize the return with the first line.
99870f4d
KW
7835 my ($return, @lines) = split "\n", shift;
7836
7837 # If the first line is null, it was an empty line, add the \n back in
7838 $return = "\n" if $return eq "";
7839
7840 # Now join the remainder of the physical lines.
7841 for my $line (@lines) {
7842
7843 # An empty line means wanted a blank line, so add two \n's to get that
7844 # effect, and go to the next line.
7845 if (length $line == 0) {
7846 $return .= "\n\n";
7847 next;
7848 }
7849
7850 # Look at the last character of what we have so far.
7851 my $previous_char = substr($return, -1, 1);
7852
7853 # And at the next char to be output.
7854 my $next_char = substr($line, 0, 1);
7855
7856 if ($previous_char ne "\n") {
7857
7858 # Here didn't end wth a nl. If the next char a blank or \b, it
7859 # means that here there is a break anyway. So add a nl to the
7860 # output.
7861 if ($next_char eq " " || $next_char eq "\b") {
7862 $previous_char = "\n";
7863 $return .= $previous_char;
7864 }
7865
7866 # Add an extra space after periods.
7867 $return .= " " if $previous_char eq '.';
7868 }
7869
7870 # Here $previous_char is still the latest character to be output. If
7871 # it isn't a nl, it means that the next line is to be a continuation
7872 # line, with a blank inserted between them.
7873 $return .= " " if $previous_char ne "\n";
7874
7875 # Get rid of any \b
7876 substr($line, 0, 1) = "" if $next_char eq "\b";
7877
7878 # And append this next line.
7879 $return .= $line;
7880 }
7881
7882 return $return;
7883}
7884
7885sub simple_fold($;$$$) {
7886 # Returns a string of the input (string or an array of strings) folded
7887 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7888 # a \n
7889 # This is tailored for the kind of text written by this program,
7890 # especially the pod file, which can have very long names with
7891 # underscores in the middle, or words like AbcDefgHij.... We allow
7892 # breaking in the middle of such constructs if the line won't fit
7893 # otherwise. The break in such cases will come either just after an
7894 # underscore, or just before one of the Capital letters.
7895
7896 local $to_trace = 0 if main::DEBUG;
7897
7898 my $line = shift;
7899 my $prefix = shift; # Optional string to prepend to each output
7900 # line
7901 $prefix = "" unless defined $prefix;
7902
7903 my $hanging_indent = shift; # Optional number of spaces to indent
7904 # continuation lines
7905 $hanging_indent = 0 unless $hanging_indent;
7906
7907 my $right_margin = shift; # Optional number of spaces to narrow the
7908 # total width by.
7909 $right_margin = 0 unless defined $right_margin;
7910
7911 # Call carp with the 'nofold' option to avoid it from trying to call us
7912 # recursively
7913 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7914
7915 # The space available doesn't include what's automatically prepended
7916 # to each line, or what's reserved on the right.
7917 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7918 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7919
7920 if (DEBUG && $hanging_indent >= $max) {
7921 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7922 $hanging_indent = 0;
7923 }
7924
7925 # First, split into the current physical lines.
7926 my @line;
7927 if (ref $line) { # Better be an array, because not bothering to
7928 # test
7929 foreach my $line (@{$line}) {
7930 push @line, split /\n/, $line;
7931 }
7932 }
7933 else {
7934 @line = split /\n/, $line;
7935 }
7936
7937 #local $to_trace = 1 if main::DEBUG;
7938 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7939
7940 # Look at each current physical line.
7941 for (my $i = 0; $i < @line; $i++) {
7942 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7943 #local $to_trace = 1 if main::DEBUG;
7944 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7945
7946 # Remove prefix, because will be added back anyway, don't want
7947 # doubled prefix
7948 $line[$i] =~ s/^$prefix//;
7949
7950 # Remove trailing space
7951 $line[$i] =~ s/\s+\Z//;
7952
7953 # If the line is too long, fold it.
7954 if (length $line[$i] > $max) {
7955 my $remainder;
7956
7957 # Here needs to fold. Save the leading space in the line for
7958 # later.
7959 $line[$i] =~ /^ ( \s* )/x;
7960 my $leading_space = $1;
7961 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7962
7963 # If character at final permissible position is white space,
7964 # fold there, which will delete that white space
7965 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7966 $remainder = substr($line[$i], $max);
7967 $line[$i] = substr($line[$i], 0, $max - 1);
7968 }
7969 else {
7970
7971 # Otherwise fold at an acceptable break char closest to
7972 # the max length. Look at just the maximal initial
7973 # segment of the line
7974 my $segment = substr($line[$i], 0, $max - 1);
7975 if ($segment =~
7976 /^ ( .{$hanging_indent} # Don't look before the
7977 # indent.
7978 \ * # Don't look in leading
7979 # blanks past the indent
7980 [^ ] .* # Find the right-most
7981 (?: # acceptable break:
7982 [ \s = ] # space or equal
7983 | - (?! [.0-9] ) # or non-unary minus.
7984 ) # $1 includes the character
7985 )/x)
7986 {
7987 # Split into the initial part that fits, and remaining
7988 # part of the input
7989 $remainder = substr($line[$i], length $1);
7990 $line[$i] = $1;
7991 trace $line[$i] if DEBUG && $to_trace;
7992 trace $remainder if DEBUG && $to_trace;
7993 }
7994
7995 # If didn't find a good breaking spot, see if there is a
7996 # not-so-good breaking spot. These are just after
7997 # underscores or where the case changes from lower to
7998 # upper. Use \a as a soft hyphen, but give up
7999 # and don't break the line if there is actually a \a
8000 # already in the input. We use an ascii character for the
8001 # soft-hyphen to avoid any attempt by miniperl to try to
8002 # access the files that this program is creating.
8003 elsif ($segment !~ /\a/
8004 && ($segment =~ s/_/_\a/g
8005 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8006 {
8007 # Here were able to find at least one place to insert
8008 # our substitute soft hyphen. Find the right-most one
8009 # and replace it by a real hyphen.
8010 trace $segment if DEBUG && $to_trace;
8011 substr($segment,
8012 rindex($segment, "\a"),
8013 1) = '-';
8014
8015 # Then remove the soft hyphen substitutes.
8016 $segment =~ s/\a//g;
8017 trace $segment if DEBUG && $to_trace;
8018
8019 # And split into the initial part that fits, and
8020 # remainder of the line
8021 my $pos = rindex($segment, '-');
8022 $remainder = substr($line[$i], $pos);
8023 trace $remainder if DEBUG && $to_trace;
8024 $line[$i] = substr($segment, 0, $pos + 1);
8025 }
8026 }
8027
8028 # Here we know if we can fold or not. If we can, $remainder
8029 # is what remains to be processed in the next iteration.
8030 if (defined $remainder) {
8031 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8032
8033 # Insert the folded remainder of the line as a new element
8034 # of the array. (It may still be too long, but we will
8035 # deal with that next time through the loop.) Omit any
8036 # leading space in the remainder.
8037 $remainder =~ s/^\s+//;
8038 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8039
8040 # But then indent by whichever is larger of:
8041 # 1) the leading space on the input line;
8042 # 2) the hanging indent.
8043 # This preserves indentation in the original line.
8044 my $lead = ($leading_space)
8045 ? length $leading_space
8046 : $hanging_indent;
8047 $lead = max($lead, $hanging_indent);
8048 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8049 }
8050 }
8051
8052 # Ready to output the line. Get rid of any trailing space
8053 # And prefix by the required $prefix passed in.
8054 $line[$i] =~ s/\s+$//;
8055 $line[$i] = "$prefix$line[$i]\n";
8056 } # End of looping through all the lines.
8057
8058 return join "", @line;
8059}
8060
8061sub property_ref { # Returns a reference to a property object.
8062 return Property::property_ref(@_);
8063}
8064
8065sub force_unlink ($) {
8066 my $filename = shift;
8067 return unless file_exists($filename);
8068 return if CORE::unlink($filename);
8069
8070 # We might need write permission
8071 chmod 0777, $filename;
8072 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8073 return;
8074}
8075
9218f1cf 8076sub write ($$@) {
9abe8df8
KW
8077 # Given a filename and references to arrays of lines, write the lines of
8078 # each array to the file
99870f4d
KW
8079 # Filename can be given as an arrayref of directory names
8080
9218f1cf 8081 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
99870f4d 8082
9abe8df8 8083 my $file = shift;
9218f1cf 8084 my $use_utf8 = shift;
99870f4d
KW
8085
8086 # Get into a single string if an array, and get rid of, in Unix terms, any
8087 # leading '.'
8088 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8089 $file = File::Spec->canonpath($file);
8090
8091 # If has directories, make sure that they all exist
8092 (undef, my $directories, undef) = File::Spec->splitpath($file);
8093 File::Path::mkpath($directories) if $directories && ! -d $directories;
8094
8095 push @files_actually_output, $file;
8096
99870f4d
KW
8097 force_unlink ($file);
8098
8099 my $OUT;
8100 if (not open $OUT, ">", $file) {
8101 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8102 return;
8103 }
430ada4c 8104
9218f1cf
KW
8105 binmode $OUT, ":utf8" if $use_utf8;
8106
9abe8df8
KW
8107 while (defined (my $lines_ref = shift)) {
8108 unless (@$lines_ref) {
8109 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8110 }
8111
8112 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8113 }
430ada4c
NC
8114 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8115
99870f4d
KW
8116 print "$file written.\n" if $verbosity >= $VERBOSE;
8117
99870f4d
KW
8118 return;
8119}
8120
8121
8122sub Standardize($) {
8123 # This converts the input name string into a standardized equivalent to
8124 # use internally.
8125
8126 my $name = shift;
8127 unless (defined $name) {
8128 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8129 return;
8130 }
8131
8132 # Remove any leading or trailing white space
8133 $name =~ s/^\s+//g;
8134 $name =~ s/\s+$//g;
8135
98dc9551 8136 # Convert interior white space and hyphens into underscores.
99870f4d
KW
8137 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8138
8139 # Capitalize the letter following an underscore, and convert a sequence of
8140 # multiple underscores to a single one
8141 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8142
8143 # And capitalize the first letter, but not for the special cjk ones.
8144 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8145 return $name;
8146}
8147
8148sub standardize ($) {
8149 # Returns a lower-cased standardized name, without underscores. This form
8150 # is chosen so that it can distinguish between any real versus superficial
8151 # Unicode name differences. It relies on the fact that Unicode doesn't
8152 # have interior underscores, white space, nor dashes in any
8153 # stricter-matched name. It should not be used on Unicode code point
8154 # names (the Name property), as they mostly, but not always follow these
8155 # rules.
8156
8157 my $name = Standardize(shift);
8158 return if !defined $name;
8159
8160 $name =~ s/ (?<= .) _ (?= . ) //xg;
8161 return lc $name;
8162}
8163
c85f591a
KW
8164sub utf8_heavy_name ($$) {
8165 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8166 # perhaps this function should be placed somewhere, like Heavy.pl so that
8167 # utf8_heavy can use it directly without duplicating code that can get
8168 # out-of sync.
8169
8170 my $table = shift;
8171 my $alias = shift;
8172 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8173
8174 my $property = $table->property;
8175 $property = ($property == $perl)
8176 ? "" # 'perl' is never explicitly stated
8177 : standardize($property->name) . '=';
8178 if ($alias->loose_match) {
8179 return $property . standardize($alias->name);
8180 }
8181 else {
8182 return lc ($property . $alias->name);
8183 }
8184
8185 return;
8186}
8187
99870f4d
KW
8188{ # Closure
8189
8190 my $indent_increment = " " x 2;
8191 my %already_output;
8192
8193 $main::simple_dumper_nesting = 0;
8194
8195 sub simple_dumper {
8196 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8197 # the real thing as we have to run under miniperl.
8198
8199 # It is designed so that on input it is at the beginning of a line,
8200 # and the final thing output in any call is a trailing ",\n".
8201
8202 my $item = shift;
8203 my $indent = shift;
8204 $indent = "" if ! defined $indent;
8205
8206 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8207
8208 # nesting level is localized, so that as the call stack pops, it goes
8209 # back to the prior value.
8210 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8211 undef %already_output if $main::simple_dumper_nesting == 0;
8212 $main::simple_dumper_nesting++;
8213 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8214
8215 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8216
8217 # Determine the indent for recursive calls.
8218 my $next_indent = $indent . $indent_increment;
8219
8220 my $output;
8221 if (! ref $item) {
8222
8223 # Dump of scalar: just output it in quotes if not a number. To do
8224 # so we must escape certain characters, and therefore need to
8225 # operate on a copy to avoid changing the original
8226 my $copy = $item;
8227 $copy = $UNDEF unless defined $copy;
8228
02cc6656
KW
8229 # Quote non-integers (integers also have optional leading '-')
8230 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
99870f4d
KW
8231
8232 # Escape apostrophe and backslash
8233 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8234 $copy = "'$copy'";
8235 }
8236 $output = "$indent$copy,\n";
8237 }
8238 else {
8239
8240 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 8241 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 8242 if (defined $already_output{$addr}) {
99870f4d
KW
8243 return "${indent}ALREADY OUTPUT: $item\n";
8244 }
f998e60c 8245 $already_output{$addr} = $item;
99870f4d
KW
8246
8247 if (ref $item eq 'ARRAY') {
8248 my $using_brackets;
8249 $output = $indent;
8250 if ($main::simple_dumper_nesting > 1) {
8251 $output .= '[';
8252 $using_brackets = 1;
8253 }
8254 else {
8255 $using_brackets = 0;
8256 }
8257
8258 # If the array is empty, put the closing bracket on the same
8259 # line. Otherwise, recursively add each array element
8260 if (@$item == 0) {
8261 $output .= " ";
8262 }
8263 else {
8264 $output .= "\n";
8265 for (my $i = 0; $i < @$item; $i++) {
8266
8267 # Indent array elements one level
8268 $output .= &simple_dumper($item->[$i], $next_indent);
c12f2655
KW
8269 $output =~ s/\n$//; # Remove any trailing nl so
8270 $output .= " # [$i]\n"; # as to add a comment giving
8271 # the array index
99870f4d
KW
8272 }
8273 $output .= $indent; # Indent closing ']' to orig level
8274 }
8275 $output .= ']' if $using_brackets;
8276 $output .= ",\n";
8277 }
8278 elsif (ref $item eq 'HASH') {
8279 my $is_first_line;
8280 my $using_braces;
8281 my $body_indent;
8282
8283 # No surrounding braces at top level
8284 $output .= $indent;
8285 if ($main::simple_dumper_nesting > 1) {
8286 $output .= "{\n";
8287 $is_first_line = 0;
8288 $body_indent = $next_indent;
8289 $next_indent .= $indent_increment;
8290 $using_braces = 1;
8291 }
8292 else {
8293 $is_first_line = 1;
8294 $body_indent = $indent;
8295 $using_braces = 0;
8296 }
8297
8298 # Output hashes sorted alphabetically instead of apparently
8299 # random. Use caseless alphabetic sort
8300 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8301 {
8302 if ($is_first_line) {
8303 $is_first_line = 0;
8304 }
8305 else {
8306 $output .= "$body_indent";
8307 }
8308
8309 # The key must be a scalar, but this recursive call quotes
8310 # it
8311 $output .= &simple_dumper($key);
8312
8313 # And change the trailing comma and nl to the hash fat
8314 # comma for clarity, and so the value can be on the same
8315 # line
8316 $output =~ s/,\n$/ => /;
8317
8318 # Recursively call to get the value's dump.
8319 my $next = &simple_dumper($item->{$key}, $next_indent);
8320
8321 # If the value is all on one line, remove its indent, so
8322 # will follow the => immediately. If it takes more than
8323 # one line, start it on a new line.
8324 if ($next !~ /\n.*\n/) {
8325 $next =~ s/^ *//;
8326 }
8327 else {
8328 $output .= "\n";
8329 }
8330 $output .= $next;
8331 }
8332
8333 $output .= "$indent},\n" if $using_braces;
8334 }
8335 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8336 $output = $indent . ref($item) . "\n";
8337 # XXX see if blessed
8338 }
8339 elsif ($item->can('dump')) {
8340
8341 # By convention in this program, objects furnish a 'dump'
8342 # method. Since not doing any output at this level, just pass
8343 # on the input indent
8344 $output = $item->dump($indent);
8345 }
8346 else {
8347 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8348 }
8349 }
8350 return $output;
8351 }
8352}
8353
8354sub dump_inside_out {
8355 # Dump inside-out hashes in an object's state by converting them to a
8356 # regular hash and then calling simple_dumper on that.
8357
8358 my $object = shift;
8359 my $fields_ref = shift;
8360 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8361
ffe43484 8362 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
8363
8364 my %hash;
8365 foreach my $key (keys %$fields_ref) {
8366 $hash{$key} = $fields_ref->{$key}{$addr};
8367 }
8368
8369 return simple_dumper(\%hash, @_);
8370}
8371
8372sub _operator_dot {
8373 # Overloaded '.' method that is common to all packages. It uses the
8374 # package's stringify method.
8375
8376 my $self = shift;
8377 my $other = shift;
8378 my $reversed = shift;
8379 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8380
8381 $other = "" unless defined $other;
8382
8383 foreach my $which (\$self, \$other) {
8384 next unless ref $$which;
8385 if ($$which->can('_operator_stringify')) {
8386 $$which = $$which->_operator_stringify;
8387 }
8388 else {
8389 my $ref = ref $$which;
ffe43484 8390 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
8391 $$which = "$ref ($addr)";
8392 }
8393 }
8394 return ($reversed)
8395 ? "$other$self"
8396 : "$self$other";
8397}
8398
8399sub _operator_equal {
8400 # Generic overloaded '==' routine. To be equal, they must be the exact
8401 # same object
8402
8403 my $self = shift;
8404 my $other = shift;
8405
8406 return 0 unless defined $other;
8407 return 0 unless ref $other;
f998e60c 8408 no overloading;
2100aa98 8409 return $self == $other;
99870f4d
KW
8410}
8411
8412sub _operator_not_equal {
8413 my $self = shift;
8414 my $other = shift;
8415
8416 return ! _operator_equal($self, $other);
8417}
8418
8419sub process_PropertyAliases($) {
8420 # This reads in the PropertyAliases.txt file, which contains almost all
8421 # the character properties in Unicode and their equivalent aliases:
8422 # scf ; Simple_Case_Folding ; sfc
8423 #
8424 # Field 0 is the preferred short name for the property.
8425 # Field 1 is the full name.
8426 # Any succeeding ones are other accepted names.
8427
8428 my $file= shift;
8429 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8430
8431 # This whole file was non-existent in early releases, so use our own
8432 # internal one.
8433 $file->insert_lines(get_old_property_aliases())
8434 if ! -e 'PropertyAliases.txt';
8435
8436 # Add any cjk properties that may have been defined.
8437 $file->insert_lines(@cjk_properties);
8438
8439 while ($file->next_line) {
8440
8441 my @data = split /\s*;\s*/;
8442
8443 my $full = $data[1];
8444
8445 my $this = Property->new($data[0], Full_Name => $full);
8446
8447 # Start looking for more aliases after these two.
8448 for my $i (2 .. @data - 1) {
8449 $this->add_alias($data[$i]);
8450 }
8451
8452 }
8453 return;
8454}
8455
8456sub finish_property_setup {
8457 # Finishes setting up after PropertyAliases.
8458
8459 my $file = shift;
8460 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8461
8462 # This entry was missing from this file in earlier Unicode versions
8463 if (-e 'Jamo.txt') {
8464 my $jsn = property_ref('JSN');
8465 if (! defined $jsn) {
8466 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8467 }
8468 }
8469
5f7264c7 8470 # This entry is still missing as of 6.0, perhaps because no short name for
99870f4d
KW
8471 # it.
8472 if (-e 'NameAliases.txt') {
8473 my $aliases = property_ref('Name_Alias');
8474 if (! defined $aliases) {
8475 $aliases = Property->new('Name_Alias');
8476 }
8477 }
8478
8479 # These are used so much, that we set globals for them.
8480 $gc = property_ref('General_Category');
8481 $block = property_ref('Block');
359523e2 8482 $script = property_ref('Script');
99870f4d
KW
8483
8484 # Perl adds this alias.
8485 $gc->add_alias('Category');
8486
8487 # For backwards compatibility, these property files have particular names.
8488 my $upper = property_ref('Uppercase_Mapping');
8489 $upper->set_core_access('uc()');
8490 $upper->set_file('Upper'); # This is what utf8.c calls it
8491
8492 my $lower = property_ref('Lowercase_Mapping');
8493 $lower->set_core_access('lc()');
8494 $lower->set_file('Lower');
8495
8496 my $title = property_ref('Titlecase_Mapping');
8497 $title->set_core_access('ucfirst()');
8498 $title->set_file('Title');
8499
8500 my $fold = property_ref('Case_Folding');
8501 $fold->set_file('Fold') if defined $fold;
8502
d3cbe105
KW
8503 # Unicode::Normalize expects this file with this name and directory.
8504 my $ccc = property_ref('Canonical_Combining_Class');
8505 if (defined $ccc) {
8506 $ccc->set_file('CombiningClass');
8507 $ccc->set_directory(File::Spec->curdir());
8508 }
8509
2cd56239
KW
8510 # utf8.c has a different meaning for non range-size-1 for map properties
8511 # that this program doesn't currently handle; and even if it were changed
8512 # to do so, some other code may be using them expecting range size 1.
99870f4d
KW
8513 foreach my $property (qw {
8514 Case_Folding
8515 Lowercase_Mapping
8516 Titlecase_Mapping
8517 Uppercase_Mapping
8518 })
8519 {
8520 property_ref($property)->set_range_size_1(1);
8521 }
8522
8523 # These two properties aren't actually used in the core, but unfortunately
8524 # the names just above that are in the core interfere with these, so
8525 # choose different names. These aren't a problem unless the map tables
8526 # for these files get written out.
8527 my $lowercase = property_ref('Lowercase');
8528 $lowercase->set_file('IsLower') if defined $lowercase;
8529 my $uppercase = property_ref('Uppercase');
8530 $uppercase->set_file('IsUpper') if defined $uppercase;
8531
8532 # Set up the hard-coded default mappings, but only on properties defined
8533 # for this release
8534 foreach my $property (keys %default_mapping) {
8535 my $property_object = property_ref($property);
8536 next if ! defined $property_object;
8537 my $default_map = $default_mapping{$property};
8538 $property_object->set_default_map($default_map);
8539
8540 # A map of <code point> implies the property is string.
8541 if ($property_object->type == $UNKNOWN
8542 && $default_map eq $CODE_POINT)
8543 {
8544 $property_object->set_type($STRING);
8545 }
8546 }
8547
8548 # The following use the Multi_Default class to create objects for
8549 # defaults.
8550
8551 # Bidi class has a complicated default, but the derived file takes care of
8552 # the complications, leaving just 'L'.
8553 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8554 property_ref('Bidi_Class')->set_default_map('L');
8555 }
8556 else {
8557 my $default;
8558
8559 # The derived file was introduced in 3.1.1. The values below are
8560 # taken from table 3-8, TUS 3.0
8561 my $default_R =
8562 'my $default = Range_List->new;
8563 $default->add_range(0x0590, 0x05FF);
8564 $default->add_range(0xFB1D, 0xFB4F);'
8565 ;
8566
8567 # The defaults apply only to unassigned characters
a67f160a 8568 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
8569
8570 if ($v_version lt v3.0.0) {
8571 $default = Multi_Default->new(R => $default_R, 'L');
8572 }
8573 else {
8574
8575 # AL apparently not introduced until 3.0: TUS 2.x references are
8576 # not on-line to check it out
8577 my $default_AL =
8578 'my $default = Range_List->new;
8579 $default->add_range(0x0600, 0x07BF);
8580 $default->add_range(0xFB50, 0xFDFF);
8581 $default->add_range(0xFE70, 0xFEFF);'
8582 ;
8583
8584 # Non-character code points introduced in this release; aren't AL
8585 if ($v_version ge 3.1.0) {
8586 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8587 }
a67f160a 8588 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
8589 $default = Multi_Default->new(AL => $default_AL,
8590 R => $default_R,
8591 'L');
8592 }
8593 property_ref('Bidi_Class')->set_default_map($default);
8594 }
8595
8596 # Joining type has a complicated default, but the derived file takes care
8597 # of the complications, leaving just 'U' (or Non_Joining), except the file
8598 # is bad in 3.1.0
8599 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8600 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8601 property_ref('Joining_Type')->set_default_map('Non_Joining');
8602 }
8603 else {
8604
8605 # Otherwise, there are not one, but two possibilities for the
8606 # missing defaults: T and U.
8607 # The missing defaults that evaluate to T are given by:
8608 # T = Mn + Cf - ZWNJ - ZWJ
8609 # where Mn and Cf are the general category values. In other words,
8610 # any non-spacing mark or any format control character, except
8611 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8612 # WIDTH JOINER (joining type C).
8613 my $default = Multi_Default->new(
8614 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8615 'Non_Joining');
8616 property_ref('Joining_Type')->set_default_map($default);
8617 }
8618 }
8619
8620 # Line break has a complicated default in early releases. It is 'Unknown'
8621 # for non-assigned code points; 'AL' for assigned.
8622 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8623 my $lb = property_ref('Line_Break');
8624 if ($v_version gt 3.2.0) {
8625 $lb->set_default_map('Unknown');
8626 }
8627 else {
8628 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8629 'AL');
8630 $lb->set_default_map($default);
8631 }
8632
8633 # If has the URS property, make sure that the standard aliases are in
8634 # it, since not in the input tables in some versions.
8635 my $urs = property_ref('Unicode_Radical_Stroke');
8636 if (defined $urs) {
8637 $urs->add_alias('cjkRSUnicode');
8638 $urs->add_alias('kRSUnicode');
8639 }
8640 }
8641 return;
8642}
8643
8644sub get_old_property_aliases() {
8645 # Returns what would be in PropertyAliases.txt if it existed in very old
8646 # versions of Unicode. It was derived from the one in 3.2, and pared
8647 # down based on the data that was actually in the older releases.
8648 # An attempt was made to use the existence of files to mean inclusion or
8649 # not of various aliases, but if this was not sufficient, using version
8650 # numbers was resorted to.
8651
8652 my @return;
8653
8654 # These are to be used in all versions (though some are constructed by
8655 # this program if missing)
8656 push @return, split /\n/, <<'END';
8657bc ; Bidi_Class
8658Bidi_M ; Bidi_Mirrored
8659cf ; Case_Folding
8660ccc ; Canonical_Combining_Class
8661dm ; Decomposition_Mapping
8662dt ; Decomposition_Type
8663gc ; General_Category
8664isc ; ISO_Comment
8665lc ; Lowercase_Mapping
8666na ; Name
8667na1 ; Unicode_1_Name
8668nt ; Numeric_Type
8669nv ; Numeric_Value
8670sfc ; Simple_Case_Folding
8671slc ; Simple_Lowercase_Mapping
8672stc ; Simple_Titlecase_Mapping
8673suc ; Simple_Uppercase_Mapping
8674tc ; Titlecase_Mapping
8675uc ; Uppercase_Mapping
8676END
8677
8678 if (-e 'Blocks.txt') {
8679 push @return, "blk ; Block\n";
8680 }
8681 if (-e 'ArabicShaping.txt') {
8682 push @return, split /\n/, <<'END';
8683jg ; Joining_Group
8684jt ; Joining_Type
8685END
8686 }
8687 if (-e 'PropList.txt') {
8688
8689 # This first set is in the original old-style proplist.
8690 push @return, split /\n/, <<'END';
8691Alpha ; Alphabetic
8692Bidi_C ; Bidi_Control
8693Dash ; Dash
8694Dia ; Diacritic
8695Ext ; Extender
8696Hex ; Hex_Digit
8697Hyphen ; Hyphen
8698IDC ; ID_Continue
8699Ideo ; Ideographic
8700Join_C ; Join_Control
8701Math ; Math
8702QMark ; Quotation_Mark
8703Term ; Terminal_Punctuation
8704WSpace ; White_Space
8705END
8706 # The next sets were added later
8707 if ($v_version ge v3.0.0) {
8708 push @return, split /\n/, <<'END';
8709Upper ; Uppercase
8710Lower ; Lowercase
8711END
8712 }
8713 if ($v_version ge v3.0.1) {
8714 push @return, split /\n/, <<'END';
8715NChar ; Noncharacter_Code_Point
8716END
8717 }
8718 # The next sets were added in the new-style
8719 if ($v_version ge v3.1.0) {
8720 push @return, split /\n/, <<'END';
8721OAlpha ; Other_Alphabetic
8722OLower ; Other_Lowercase
8723OMath ; Other_Math
8724OUpper ; Other_Uppercase
8725END
8726 }
8727 if ($v_version ge v3.1.1) {
8728 push @return, "AHex ; ASCII_Hex_Digit\n";
8729 }
8730 }
8731 if (-e 'EastAsianWidth.txt') {
8732 push @return, "ea ; East_Asian_Width\n";
8733 }
8734 if (-e 'CompositionExclusions.txt') {
8735 push @return, "CE ; Composition_Exclusion\n";
8736 }
8737 if (-e 'LineBreak.txt') {
8738 push @return, "lb ; Line_Break\n";
8739 }
8740 if (-e 'BidiMirroring.txt') {
8741 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8742 }
8743 if (-e 'Scripts.txt') {
8744 push @return, "sc ; Script\n";
8745 }
8746 if (-e 'DNormalizationProps.txt') {
8747 push @return, split /\n/, <<'END';
8748Comp_Ex ; Full_Composition_Exclusion
8749FC_NFKC ; FC_NFKC_Closure
8750NFC_QC ; NFC_Quick_Check
8751NFD_QC ; NFD_Quick_Check
8752NFKC_QC ; NFKC_Quick_Check
8753NFKD_QC ; NFKD_Quick_Check
8754XO_NFC ; Expands_On_NFC
8755XO_NFD ; Expands_On_NFD
8756XO_NFKC ; Expands_On_NFKC
8757XO_NFKD ; Expands_On_NFKD
8758END
8759 }
8760 if (-e 'DCoreProperties.txt') {
8761 push @return, split /\n/, <<'END';
8762IDS ; ID_Start
8763XIDC ; XID_Continue
8764XIDS ; XID_Start
8765END
8766 # These can also appear in some versions of PropList.txt
8767 push @return, "Lower ; Lowercase\n"
8768 unless grep { $_ =~ /^Lower\b/} @return;
8769 push @return, "Upper ; Uppercase\n"
8770 unless grep { $_ =~ /^Upper\b/} @return;
8771 }
8772
8773 # This flag requires the DAge.txt file to be copied into the directory.
8774 if (DEBUG && $compare_versions) {
8775 push @return, 'age ; Age';
8776 }
8777
8778 return @return;
8779}
8780
8781sub process_PropValueAliases {
8782 # This file contains values that properties look like:
8783 # bc ; AL ; Arabic_Letter
8784 # blk; n/a ; Greek_And_Coptic ; Greek
8785 #
8786 # Field 0 is the property.
8787 # Field 1 is the short name of a property value or 'n/a' if no
8788 # short name exists;
8789 # Field 2 is the full property value name;
8790 # Any other fields are more synonyms for the property value.
8791 # Purely numeric property values are omitted from the file; as are some
8792 # others, fewer and fewer in later releases
8793
8794 # Entries for the ccc property have an extra field before the
8795 # abbreviation:
8796 # ccc; 0; NR ; Not_Reordered
8797 # It is the numeric value that the names are synonyms for.
8798
8799 # There are comment entries for values missing from this file:
8800 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8801 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8802
8803 my $file= shift;
8804 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8805
8806 # This whole file was non-existent in early releases, so use our own
8807 # internal one if necessary.
8808 if (! -e 'PropValueAliases.txt') {
8809 $file->insert_lines(get_old_property_value_aliases());
8810 }
8811
8812 # Add any explicit cjk values
8813 $file->insert_lines(@cjk_property_values);
8814
8815 # This line is used only for testing the code that checks for name
8816 # conflicts. There is a script Inherited, and when this line is executed
8817 # it causes there to be a name conflict with the 'Inherited' that this
8818 # program generates for this block property value
8819 #$file->insert_lines('blk; n/a; Herited');
8820
8821
8822 # Process each line of the file ...
8823 while ($file->next_line) {
8824
8825 my ($property, @data) = split /\s*;\s*/;
8826
66b4eb0a
KW
8827 # The ccc property has an extra field at the beginning, which is the
8828 # numeric value. Move it to be after the other two, mnemonic, fields,
8829 # so that those will be used as the property value's names, and the
8830 # number will be an extra alias. (Rightmost splice removes field 1-2,
8831 # returning them in a slice; left splice inserts that before anything,
8832 # thus shifting the former field 0 to after them.)
8833 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8834
8835 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
8836 # there is no short name, use the full one in element 1
027866c1
KW
8837 if ($data[0] eq "n/a") {
8838 $data[0] = $data[1];
8839 }
8840 elsif ($data[0] ne $data[1]
8841 && standardize($data[0]) eq standardize($data[1])
8842 && $data[1] !~ /[[:upper:]]/)
8843 {
8844 # Also, there is a bug in the file in which "n/a" is omitted, and
8845 # the two fields are identical except for case, and the full name
8846 # is all lower case. Copy the "short" name unto the full one to
8847 # give it some upper case.
8848
8849 $data[1] = $data[0];
8850 }
99870f4d
KW
8851
8852 # Earlier releases had the pseudo property 'qc' that should expand to
8853 # the ones that replace it below.
8854 if ($property eq 'qc') {
8855 if (lc $data[0] eq 'y') {
8856 $file->insert_lines('NFC_QC; Y ; Yes',
8857 'NFD_QC; Y ; Yes',
8858 'NFKC_QC; Y ; Yes',
8859 'NFKD_QC; Y ; Yes',
8860 );
8861 }
8862 elsif (lc $data[0] eq 'n') {
8863 $file->insert_lines('NFC_QC; N ; No',
8864 'NFD_QC; N ; No',
8865 'NFKC_QC; N ; No',
8866 'NFKD_QC; N ; No',
8867 );
8868 }
8869 elsif (lc $data[0] eq 'm') {
8870 $file->insert_lines('NFC_QC; M ; Maybe',
8871 'NFKC_QC; M ; Maybe',
8872 );
8873 }
8874 else {
8875 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8876 }
8877 next;
8878 }
8879
8880 # The first field is the short name, 2nd is the full one.
8881 my $property_object = property_ref($property);
8882 my $table = $property_object->add_match_table($data[0],
8883 Full_Name => $data[1]);
8884
8885 # Start looking for more aliases after these two.
8886 for my $i (2 .. @data - 1) {
8887 $table->add_alias($data[$i]);
8888 }
8889 } # End of looping through the file
8890
8891 # As noted in the comments early in the program, it generates tables for
8892 # the default values for all releases, even those for which the concept
8893 # didn't exist at the time. Here we add those if missing.
8894 my $age = property_ref('age');
8895 if (defined $age && ! defined $age->table('Unassigned')) {
8896 $age->add_match_table('Unassigned');
8897 }
8898 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8899 && ! defined $block->table('No_Block');
8900
8901
8902 # Now set the default mappings of the properties from the file. This is
8903 # done after the loop because a number of properties have only @missings
8904 # entries in the file, and may not show up until the end.
8905 my @defaults = $file->get_missings;
8906 foreach my $default_ref (@defaults) {
8907 my $default = $default_ref->[0];
8908 my $property = property_ref($default_ref->[1]);
8909 $property->set_default_map($default);
8910 }
8911 return;
8912}
8913
8914sub get_old_property_value_aliases () {
8915 # Returns what would be in PropValueAliases.txt if it existed in very old
8916 # versions of Unicode. It was derived from the one in 3.2, and pared
8917 # down. An attempt was made to use the existence of files to mean
8918 # inclusion or not of various aliases, but if this was not sufficient,
8919 # using version numbers was resorted to.
8920
8921 my @return = split /\n/, <<'END';
8922bc ; AN ; Arabic_Number
8923bc ; B ; Paragraph_Separator
8924bc ; CS ; Common_Separator
8925bc ; EN ; European_Number
8926bc ; ES ; European_Separator
8927bc ; ET ; European_Terminator
8928bc ; L ; Left_To_Right
8929bc ; ON ; Other_Neutral
8930bc ; R ; Right_To_Left
8931bc ; WS ; White_Space
8932
8933# The standard combining classes are very much different in v1, so only use
8934# ones that look right (not checked thoroughly)
8935ccc; 0; NR ; Not_Reordered
8936ccc; 1; OV ; Overlay
8937ccc; 7; NK ; Nukta
8938ccc; 8; KV ; Kana_Voicing
8939ccc; 9; VR ; Virama
8940ccc; 202; ATBL ; Attached_Below_Left
8941ccc; 216; ATAR ; Attached_Above_Right
8942ccc; 218; BL ; Below_Left
8943ccc; 220; B ; Below
8944ccc; 222; BR ; Below_Right
8945ccc; 224; L ; Left
8946ccc; 228; AL ; Above_Left
8947ccc; 230; A ; Above
8948ccc; 232; AR ; Above_Right
8949ccc; 234; DA ; Double_Above
8950
8951dt ; can ; canonical
8952dt ; enc ; circle
8953dt ; fin ; final
8954dt ; font ; font
8955dt ; fra ; fraction
8956dt ; init ; initial
8957dt ; iso ; isolated
8958dt ; med ; medial
8959dt ; n/a ; none
8960dt ; nb ; noBreak
8961dt ; sqr ; square
8962dt ; sub ; sub
8963dt ; sup ; super
8964
8965gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8966gc ; Cc ; Control
8967gc ; Cn ; Unassigned
8968gc ; Co ; Private_Use
8969gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8970gc ; LC ; Cased_Letter # Ll | Lt | Lu
8971gc ; Ll ; Lowercase_Letter
8972gc ; Lm ; Modifier_Letter
8973gc ; Lo ; Other_Letter
8974gc ; Lu ; Uppercase_Letter
8975gc ; M ; Mark # Mc | Me | Mn
8976gc ; Mc ; Spacing_Mark
8977gc ; Mn ; Nonspacing_Mark
8978gc ; N ; Number # Nd | Nl | No
8979gc ; Nd ; Decimal_Number
8980gc ; No ; Other_Number
8981gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8982gc ; Pd ; Dash_Punctuation
8983gc ; Pe ; Close_Punctuation
8984gc ; Po ; Other_Punctuation
8985gc ; Ps ; Open_Punctuation
8986gc ; S ; Symbol # Sc | Sk | Sm | So
8987gc ; Sc ; Currency_Symbol
8988gc ; Sm ; Math_Symbol
8989gc ; So ; Other_Symbol
8990gc ; Z ; Separator # Zl | Zp | Zs
8991gc ; Zl ; Line_Separator
8992gc ; Zp ; Paragraph_Separator
8993gc ; Zs ; Space_Separator
8994
8995nt ; de ; Decimal
8996nt ; di ; Digit
8997nt ; n/a ; None
8998nt ; nu ; Numeric
8999END
9000
9001 if (-e 'ArabicShaping.txt') {
9002 push @return, split /\n/, <<'END';
9003jg ; n/a ; AIN
9004jg ; n/a ; ALEF
9005jg ; n/a ; DAL
9006jg ; n/a ; GAF
9007jg ; n/a ; LAM
9008jg ; n/a ; MEEM
9009jg ; n/a ; NO_JOINING_GROUP
9010jg ; n/a ; NOON
9011jg ; n/a ; QAF
9012jg ; n/a ; SAD
9013jg ; n/a ; SEEN
9014jg ; n/a ; TAH
9015jg ; n/a ; WAW
9016
9017jt ; C ; Join_Causing
9018jt ; D ; Dual_Joining
9019jt ; L ; Left_Joining
9020jt ; R ; Right_Joining
9021jt ; U ; Non_Joining
9022jt ; T ; Transparent
9023END
9024 if ($v_version ge v3.0.0) {
9025 push @return, split /\n/, <<'END';
9026jg ; n/a ; ALAPH
9027jg ; n/a ; BEH
9028jg ; n/a ; BETH
9029jg ; n/a ; DALATH_RISH
9030jg ; n/a ; E
9031jg ; n/a ; FEH
9032jg ; n/a ; FINAL_SEMKATH
9033jg ; n/a ; GAMAL
9034jg ; n/a ; HAH
9035jg ; n/a ; HAMZA_ON_HEH_GOAL
9036jg ; n/a ; HE
9037jg ; n/a ; HEH
9038jg ; n/a ; HEH_GOAL
9039jg ; n/a ; HETH
9040jg ; n/a ; KAF
9041jg ; n/a ; KAPH
9042jg ; n/a ; KNOTTED_HEH
9043jg ; n/a ; LAMADH
9044jg ; n/a ; MIM
9045jg ; n/a ; NUN
9046jg ; n/a ; PE
9047jg ; n/a ; QAPH
9048jg ; n/a ; REH
9049jg ; n/a ; REVERSED_PE
9050jg ; n/a ; SADHE
9051jg ; n/a ; SEMKATH
9052jg ; n/a ; SHIN
9053jg ; n/a ; SWASH_KAF
9054jg ; n/a ; TAW
9055jg ; n/a ; TEH_MARBUTA
9056jg ; n/a ; TETH
9057jg ; n/a ; YEH
9058jg ; n/a ; YEH_BARREE
9059jg ; n/a ; YEH_WITH_TAIL
9060jg ; n/a ; YUDH
9061jg ; n/a ; YUDH_HE
9062jg ; n/a ; ZAIN
9063END
9064 }
9065 }
9066
9067
9068 if (-e 'EastAsianWidth.txt') {
9069 push @return, split /\n/, <<'END';
9070ea ; A ; Ambiguous
9071ea ; F ; Fullwidth
9072ea ; H ; Halfwidth
9073ea ; N ; Neutral
9074ea ; Na ; Narrow
9075ea ; W ; Wide
9076END
9077 }
9078
9079 if (-e 'LineBreak.txt') {
9080 push @return, split /\n/, <<'END';
9081lb ; AI ; Ambiguous
9082lb ; AL ; Alphabetic
9083lb ; B2 ; Break_Both
9084lb ; BA ; Break_After
9085lb ; BB ; Break_Before
9086lb ; BK ; Mandatory_Break
9087lb ; CB ; Contingent_Break
9088lb ; CL ; Close_Punctuation
9089lb ; CM ; Combining_Mark
9090lb ; CR ; Carriage_Return
9091lb ; EX ; Exclamation
9092lb ; GL ; Glue
9093lb ; HY ; Hyphen
9094lb ; ID ; Ideographic
9095lb ; IN ; Inseperable
9096lb ; IS ; Infix_Numeric
9097lb ; LF ; Line_Feed
9098lb ; NS ; Nonstarter
9099lb ; NU ; Numeric
9100lb ; OP ; Open_Punctuation
9101lb ; PO ; Postfix_Numeric
9102lb ; PR ; Prefix_Numeric
9103lb ; QU ; Quotation
9104lb ; SA ; Complex_Context
9105lb ; SG ; Surrogate
9106lb ; SP ; Space
9107lb ; SY ; Break_Symbols
9108lb ; XX ; Unknown
9109lb ; ZW ; ZWSpace
9110END
9111 }
9112
9113 if (-e 'DNormalizationProps.txt') {
9114 push @return, split /\n/, <<'END';
9115qc ; M ; Maybe
9116qc ; N ; No
9117qc ; Y ; Yes
9118END
9119 }
9120
9121 if (-e 'Scripts.txt') {
9122 push @return, split /\n/, <<'END';
9123sc ; Arab ; Arabic
9124sc ; Armn ; Armenian
9125sc ; Beng ; Bengali
9126sc ; Bopo ; Bopomofo
9127sc ; Cans ; Canadian_Aboriginal
9128sc ; Cher ; Cherokee
9129sc ; Cyrl ; Cyrillic
9130sc ; Deva ; Devanagari
9131sc ; Dsrt ; Deseret
9132sc ; Ethi ; Ethiopic
9133sc ; Geor ; Georgian
9134sc ; Goth ; Gothic
9135sc ; Grek ; Greek
9136sc ; Gujr ; Gujarati
9137sc ; Guru ; Gurmukhi
9138sc ; Hang ; Hangul
9139sc ; Hani ; Han
9140sc ; Hebr ; Hebrew
9141sc ; Hira ; Hiragana
9142sc ; Ital ; Old_Italic
9143sc ; Kana ; Katakana
9144sc ; Khmr ; Khmer
9145sc ; Knda ; Kannada
9146sc ; Laoo ; Lao
9147sc ; Latn ; Latin
9148sc ; Mlym ; Malayalam
9149sc ; Mong ; Mongolian
9150sc ; Mymr ; Myanmar
9151sc ; Ogam ; Ogham
9152sc ; Orya ; Oriya
9153sc ; Qaai ; Inherited
9154sc ; Runr ; Runic
9155sc ; Sinh ; Sinhala
9156sc ; Syrc ; Syriac
9157sc ; Taml ; Tamil
9158sc ; Telu ; Telugu
9159sc ; Thaa ; Thaana
9160sc ; Thai ; Thai
9161sc ; Tibt ; Tibetan
9162sc ; Yiii ; Yi
9163sc ; Zyyy ; Common
9164END
9165 }
9166
9167 if ($v_version ge v2.0.0) {
9168 push @return, split /\n/, <<'END';
9169dt ; com ; compat
9170dt ; nar ; narrow
9171dt ; sml ; small
9172dt ; vert ; vertical
9173dt ; wide ; wide
9174
9175gc ; Cf ; Format
9176gc ; Cs ; Surrogate
9177gc ; Lt ; Titlecase_Letter
9178gc ; Me ; Enclosing_Mark
9179gc ; Nl ; Letter_Number
9180gc ; Pc ; Connector_Punctuation
9181gc ; Sk ; Modifier_Symbol
9182END
9183 }
9184 if ($v_version ge v2.1.2) {
9185 push @return, "bc ; S ; Segment_Separator\n";
9186 }
9187 if ($v_version ge v2.1.5) {
9188 push @return, split /\n/, <<'END';
9189gc ; Pf ; Final_Punctuation
9190gc ; Pi ; Initial_Punctuation
9191END
9192 }
9193 if ($v_version ge v2.1.8) {
9194 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9195 }
9196
9197 if ($v_version ge v3.0.0) {
9198 push @return, split /\n/, <<'END';
9199bc ; AL ; Arabic_Letter
9200bc ; BN ; Boundary_Neutral
9201bc ; LRE ; Left_To_Right_Embedding
9202bc ; LRO ; Left_To_Right_Override
9203bc ; NSM ; Nonspacing_Mark
9204bc ; PDF ; Pop_Directional_Format
9205bc ; RLE ; Right_To_Left_Embedding
9206bc ; RLO ; Right_To_Left_Override
9207
9208ccc; 233; DB ; Double_Below
9209END
9210 }
9211
9212 if ($v_version ge v3.1.0) {
9213 push @return, "ccc; 226; R ; Right\n";
9214 }
9215
9216 return @return;
9217}
9218
b1c167a3
KW
9219sub output_perl_charnames_line ($$) {
9220
9221 # Output the entries in Perl_charnames specially, using 5 digits instead
9222 # of four. This makes the entries a constant length, and simplifies
9223 # charnames.pm which this table is for. Unicode can have 6 digit
9224 # ordinals, but they are all private use or noncharacters which do not
9225 # have names, so won't be in this table.
9226
73d9566f 9227 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
9228}
9229
99870f4d
KW
9230{ # Closure
9231 # This is used to store the range list of all the code points usable when
9232 # the little used $compare_versions feature is enabled.
9233 my $compare_versions_range_list;
9234
96cfc54a
KW
9235 # These are constants to the $property_info hash in this subroutine, to
9236 # avoid using a quoted-string which might have a typo.
9237 my $TYPE = 'type';
9238 my $DEFAULT_MAP = 'default_map';
9239 my $DEFAULT_TABLE = 'default_table';
9240 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9241 my $MISSINGS = 'missings';
9242
99870f4d
KW
9243 sub process_generic_property_file {
9244 # This processes a file containing property mappings and puts them
9245 # into internal map tables. It should be used to handle any property
9246 # files that have mappings from a code point or range thereof to
9247 # something else. This means almost all the UCD .txt files.
9248 # each_line_handlers() should be set to adjust the lines of these
9249 # files, if necessary, to what this routine understands:
9250 #
9251 # 0374 ; NFD_QC; N
9252 # 003C..003E ; Math
9253 #
92f9d56c 9254 # the fields are: "codepoint-range ; property; map"
99870f4d
KW
9255 #
9256 # meaning the codepoints in the range all have the value 'map' under
9257 # 'property'.
98dc9551 9258 # Beginning and trailing white space in each field are not significant.
99870f4d
KW
9259 # Note there is not a trailing semi-colon in the above. A trailing
9260 # semi-colon means the map is a null-string. An omitted map, as
9261 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9262 # table syntax. (This could have been hidden from this routine by
9263 # doing it in the $file object, but that would require parsing of the
9264 # line there, so would have to parse it twice, or change the interface
9265 # to pass this an array. So not done.)
9266 #
9267 # The map field may begin with a sequence of commands that apply to
9268 # this range. Each such command begins and ends with $CMD_DELIM.
9269 # These are used to indicate, for example, that the mapping for a
9270 # range has a non-default type.
9271 #
9272 # This loops through the file, calling it's next_line() method, and
9273 # then taking the map and adding it to the property's table.
9274 # Complications arise because any number of properties can be in the
9275 # file, in any order, interspersed in any way. The first time a
9276 # property is seen, it gets information about that property and
f86864ac 9277 # caches it for quick retrieval later. It also normalizes the maps
5d7f7709
KW
9278 # so that only one of many synonyms is stored. The Unicode input
9279 # files do use some multiple synonyms.
99870f4d
KW
9280
9281 my $file = shift;
9282 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9283
9284 my %property_info; # To keep track of what properties
9285 # have already had entries in the
9286 # current file, and info about each,
9287 # so don't have to recompute.
9288 my $property_name; # property currently being worked on
9289 my $property_type; # and its type
9290 my $previous_property_name = ""; # name from last time through loop
9291 my $property_object; # pointer to the current property's
9292 # object
9293 my $property_addr; # the address of that object
9294 my $default_map; # the string that code points missing
9295 # from the file map to
9296 my $default_table; # For non-string properties, a
9297 # reference to the match table that
9298 # will contain the list of code
9299 # points that map to $default_map.
9300
9301 # Get the next real non-comment line
9302 LINE:
9303 while ($file->next_line) {
9304
9305 # Default replacement type; means that if parts of the range have
9306 # already been stored in our tables, the new map overrides them if
9307 # they differ more than cosmetically
9308 my $replace = $IF_NOT_EQUIVALENT;
9309 my $map_type; # Default type for the map of this range
9310
9311 #local $to_trace = 1 if main::DEBUG;
9312 trace $_ if main::DEBUG && $to_trace;
9313
9314 # Split the line into components
9315 my ($range, $property_name, $map, @remainder)
9316 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9317
9318 # If more or less on the line than we are expecting, warn and skip
9319 # the line
9320 if (@remainder) {
9321 $file->carp_bad_line('Extra fields');
9322 next LINE;
9323 }
9324 elsif ( ! defined $property_name) {
9325 $file->carp_bad_line('Missing property');
9326 next LINE;
9327 }
9328
9329 # Examine the range.
9330 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9331 {
9332 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9333 next LINE;
9334 }
9335 my $low = hex $1;
9336 my $high = (defined $2) ? hex $2 : $low;
9337
9338 # For the very specialized case of comparing two Unicode
9339 # versions...
9340 if (DEBUG && $compare_versions) {
9341 if ($property_name eq 'Age') {
9342
9343 # Only allow code points at least as old as the version
9344 # specified.
9345 my $age = pack "C*", split(/\./, $map); # v string
9346 next LINE if $age gt $compare_versions;
9347 }
9348 else {
9349
9350 # Again, we throw out code points younger than those of
9351 # the specified version. By now, the Age property is
9352 # populated. We use the intersection of each input range
9353 # with this property to find what code points in it are
9354 # valid. To do the intersection, we have to convert the
9355 # Age property map to a Range_list. We only have to do
9356 # this once.
9357 if (! defined $compare_versions_range_list) {
9358 my $age = property_ref('Age');
9359 if (! -e 'DAge.txt') {
9360 croak "Need to have 'DAge.txt' file to do version comparison";
9361 }
9362 elsif ($age->count == 0) {
9363 croak "The 'Age' table is empty, but its file exists";
9364 }
9365 $compare_versions_range_list
9366 = Range_List->new(Initialize => $age);
9367 }
9368
9369 # An undefined map is always 'Y'
9370 $map = 'Y' if ! defined $map;
9371
9372 # Calculate the intersection of the input range with the
9373 # code points that are known in the specified version
9374 my @ranges = ($compare_versions_range_list
9375 & Range->new($low, $high))->ranges;
9376
9377 # If the intersection is empty, throw away this range
9378 next LINE unless @ranges;
9379
9380 # Only examine the first range this time through the loop.
9381 my $this_range = shift @ranges;
9382
9383 # Put any remaining ranges in the queue to be processed
9384 # later. Note that there is unnecessary work here, as we
9385 # will do the intersection again for each of these ranges
9386 # during some future iteration of the LINE loop, but this
9387 # code is not used in production. The later intersections
9388 # are guaranteed to not splinter, so this will not become
9389 # an infinite loop.
9390 my $line = join ';', $property_name, $map;
9391 foreach my $range (@ranges) {
9392 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9393 $range->start,
9394 $range->end,
9395 $line));
9396 }
9397
9398 # And process the first range, like any other.
9399 $low = $this_range->start;
9400 $high = $this_range->end;
9401 }
9402 } # End of $compare_versions
9403
9404 # If changing to a new property, get the things constant per
9405 # property
9406 if ($previous_property_name ne $property_name) {
9407
9408 $property_object = property_ref($property_name);
9409 if (! defined $property_object) {
9410 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9411 next LINE;
9412 }
051df77b 9413 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
9414
9415 # Defer changing names until have a line that is acceptable
9416 # (the 'next' statement above means is unacceptable)
9417 $previous_property_name = $property_name;
9418
9419 # If not the first time for this property, retrieve info about
9420 # it from the cache
96cfc54a
KW
9421 if (defined ($property_info{$property_addr}{$TYPE})) {
9422 $property_type = $property_info{$property_addr}{$TYPE};
9423 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
99870f4d 9424 $map_type
96cfc54a 9425 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
99870f4d 9426 $default_table
96cfc54a 9427 = $property_info{$property_addr}{$DEFAULT_TABLE};
99870f4d
KW
9428 }
9429 else {
9430
9431 # Here, is the first time for this property. Set up the
9432 # cache.
96cfc54a 9433 $property_type = $property_info{$property_addr}{$TYPE}
99870f4d
KW
9434 = $property_object->type;
9435 $map_type
96cfc54a 9436 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
99870f4d
KW
9437 = $property_object->pseudo_map_type;
9438
9439 # The Unicode files are set up so that if the map is not
9440 # defined, it is a binary property
9441 if (! defined $map && $property_type != $BINARY) {
9442 if ($property_type != $UNKNOWN
9443 && $property_type != $NON_STRING)
9444 {
9445 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9446 }
9447 else {
9448 $property_object->set_type($BINARY);
9449 $property_type
96cfc54a 9450 = $property_info{$property_addr}{$TYPE}
99870f4d
KW
9451 = $BINARY;
9452 }
9453 }
9454
9455 # Get any @missings default for this property. This
9456 # should precede the first entry for the property in the
9457 # input file, and is located in a comment that has been
9458 # stored by the Input_file class until we access it here.
9459 # It's possible that there is more than one such line
9460 # waiting for us; collect them all, and parse
9461 my @missings_list = $file->get_missings
9462 if $file->has_missings_defaults;
9463 foreach my $default_ref (@missings_list) {
9464 my $default = $default_ref->[0];
ffe43484 9465 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
9466
9467 # For string properties, the default is just what the
9468 # file says, but non-string properties should already
9469 # have set up a table for the default property value;
9470 # use the table for these, so can resolve synonyms
9471 # later to a single standard one.
9472 if ($property_type == $STRING
9473 || $property_type == $UNKNOWN)
9474 {
96cfc54a 9475 $property_info{$addr}{$MISSINGS} = $default;
99870f4d
KW
9476 }
9477 else {
96cfc54a 9478 $property_info{$addr}{$MISSINGS}
99870f4d
KW
9479 = $property_object->table($default);
9480 }
9481 }
9482
9483 # Finished storing all the @missings defaults in the input
9484 # file so far. Get the one for the current property.
96cfc54a 9485 my $missings = $property_info{$property_addr}{$MISSINGS};
99870f4d
KW
9486
9487 # But we likely have separately stored what the default
9488 # should be. (This is to accommodate versions of the
9489 # standard where the @missings lines are absent or
9490 # incomplete.) Hopefully the two will match. But check
9491 # it out.
9492 $default_map = $property_object->default_map;
9493
9494 # If the map is a ref, it means that the default won't be
9495 # processed until later, so undef it, so next few lines
9496 # will redefine it to something that nothing will match
9497 undef $default_map if ref $default_map;
9498
9499 # Create a $default_map if don't have one; maybe a dummy
9500 # that won't match anything.
9501 if (! defined $default_map) {
9502
9503 # Use any @missings line in the file.
9504 if (defined $missings) {
9505 if (ref $missings) {
9506 $default_map = $missings->full_name;
9507 $default_table = $missings;
9508 }
9509 else {
9510 $default_map = $missings;
9511 }
678f13d5 9512
99870f4d
KW
9513 # And store it with the property for outside use.
9514 $property_object->set_default_map($default_map);
9515 }
9516 else {
9517
9518 # Neither an @missings nor a default map. Create
9519 # a dummy one, so won't have to test definedness
9520 # in the main loop.
9521 $default_map = '_Perl This will never be in a file
9522 from Unicode';
9523 }
9524 }
9525
9526 # Here, we have $default_map defined, possibly in terms of
9527 # $missings, but maybe not, and possibly is a dummy one.
9528 if (defined $missings) {
9529
9530 # Make sure there is no conflict between the two.
9531 # $missings has priority.
9532 if (ref $missings) {
23e33b60
KW
9533 $default_table
9534 = $property_object->table($default_map);
99870f4d
KW
9535 if (! defined $default_table
9536 || $default_table != $missings)
9537 {
9538 if (! defined $default_table) {
9539 $default_table = $UNDEF;
9540 }
9541 $file->carp_bad_line(<<END
9542The \@missings line for $property_name in $file says that missings default to
9543$missings, but we expect it to be $default_table. $missings used.
9544END
9545 );
9546 $default_table = $missings;
9547 $default_map = $missings->full_name;
9548 }
96cfc54a 9549 $property_info{$property_addr}{$DEFAULT_TABLE}
99870f4d
KW
9550 = $default_table;
9551 }
9552 elsif ($default_map ne $missings) {
9553 $file->carp_bad_line(<<END
9554The \@missings line for $property_name in $file says that missings default to
9555$missings, but we expect it to be $default_map. $missings used.
9556END
9557 );
9558 $default_map = $missings;
9559 }
9560 }
9561
96cfc54a 9562 $property_info{$property_addr}{$DEFAULT_MAP}
99870f4d
KW
9563 = $default_map;
9564
9565 # If haven't done so already, find the table corresponding
9566 # to this map for non-string properties.
9567 if (! defined $default_table
9568 && $property_type != $STRING
9569 && $property_type != $UNKNOWN)
9570 {
9571 $default_table = $property_info{$property_addr}
96cfc54a 9572 {$DEFAULT_TABLE}
99870f4d
KW
9573 = $property_object->table($default_map);
9574 }
9575 } # End of is first time for this property
9576 } # End of switching properties.
9577
9578 # Ready to process the line.
9579 # The Unicode files are set up so that if the map is not defined,
9580 # it is a binary property with value 'Y'
9581 if (! defined $map) {
9582 $map = 'Y';
9583 }
9584 else {
9585
9586 # If the map begins with a special command to us (enclosed in
9587 # delimiters), extract the command(s).
a35d7f90
KW
9588 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9589 my $command = $1;
9590 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9591 $replace = $1;
99870f4d 9592 }
a35d7f90
KW
9593 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9594 $map_type = $1;
9595 }
9596 else {
9597 $file->carp_bad_line("Unknown command line: '$1'");
9598 next LINE;
9599 }
9600 }
99870f4d
KW
9601 }
9602
9603 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9604 {
9605
9606 # Here, we have a map to a particular code point, and the
9607 # default map is to a code point itself. If the range
9608 # includes the particular code point, change that portion of
9609 # the range to the default. This makes sure that in the final
9610 # table only the non-defaults are listed.
9611 my $decimal_map = hex $map;
9612 if ($low <= $decimal_map && $decimal_map <= $high) {
9613
9614 # If the range includes stuff before or after the map
9615 # we're changing, split it and process the split-off parts
9616 # later.
9617 if ($low < $decimal_map) {
9618 $file->insert_adjusted_lines(
9619 sprintf("%04X..%04X; %s; %s",
9620 $low,
9621 $decimal_map - 1,
9622 $property_name,
9623 $map));
9624 }
9625 if ($high > $decimal_map) {
9626 $file->insert_adjusted_lines(
9627 sprintf("%04X..%04X; %s; %s",
9628 $decimal_map + 1,
9629 $high,
9630 $property_name,
9631 $map));
9632 }
9633 $low = $high = $decimal_map;
9634 $map = $CODE_POINT;
9635 }
9636 }
9637
9638 # If we can tell that this is a synonym for the default map, use
9639 # the default one instead.
9640 if ($property_type != $STRING
9641 && $property_type != $UNKNOWN)
9642 {
9643 my $table = $property_object->table($map);
9644 if (defined $table && $table == $default_table) {
9645 $map = $default_map;
9646 }
9647 }
9648
9649 # And figure out the map type if not known.
9650 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9651 if ($map eq "") { # Nulls are always $NULL map type
9652 $map_type = $NULL;
9653 } # Otherwise, non-strings, and those that don't allow
9654 # $MULTI_CP, and those that aren't multiple code points are
9655 # 0
9656 elsif
9657 (($property_type != $STRING && $property_type != $UNKNOWN)
9658 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9659 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9660 {
9661 $map_type = 0;
9662 }
9663 else {
9664 $map_type = $MULTI_CP;
9665 }
9666 }
9667
9668 $property_object->add_map($low, $high,
9669 $map,
9670 Type => $map_type,
9671 Replace => $replace);
9672 } # End of loop through file's lines
9673
9674 return;
9675 }
9676}
9677
99870f4d
KW
9678{ # Closure for UnicodeData.txt handling
9679
9680 # This file was the first one in the UCD; its design leads to some
9681 # awkwardness in processing. Here is a sample line:
9682 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9683 # The fields in order are:
9684 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9685 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9686 my $CATEGORY = $i++; # category (e.g. "Lu")
9687 my $CCC = $i++; # Canonical combining class (e.g. "230")
9688 my $BIDI = $i++; # directional class (e.g. "L")
9689 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9690 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9691 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9692 # Dual-use in this program; see below
9693 my $NUMERIC = $i++; # numeric value
9694 my $MIRRORED = $i++; # ? mirrored
9695 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9696 my $COMMENT = $i++; # iso comment
9697 my $UPPER = $i++; # simple uppercase mapping
9698 my $LOWER = $i++; # simple lowercase mapping
9699 my $TITLE = $i++; # simple titlecase mapping
9700 my $input_field_count = $i;
9701
9702 # This routine in addition outputs these extra fields:
9703 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9704
9705 # These fields are modifications of ones above, and are usually
9706 # suppressed; they must come last, as for speed, the loop upper bound is
9707 # normally set to ignore them
9708 my $NAME = $i++; # This is the strict name field, not the one that
9709 # charnames uses.
9710 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9711 # by Unicode::Normalize
99870f4d
KW
9712 my $last_field = $i - 1;
9713
9714 # All these are read into an array for each line, with the indices defined
9715 # above. The empty fields in the example line above indicate that the
9716 # value is defaulted. The handler called for each line of the input
9717 # changes these to their defaults.
9718
9719 # Here are the official names of the properties, in a parallel array:
9720 my @field_names;
9721 $field_names[$BIDI] = 'Bidi_Class';
9722 $field_names[$CATEGORY] = 'General_Category';
9723 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9724 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9725 $field_names[$COMMENT] = 'ISO_Comment';
9726 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9727 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9728 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9729 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9730 $field_names[$NAME] = 'Name';
9731 $field_names[$NUMERIC] = 'Numeric_Value';
9732 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9733 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9734 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9735 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9736 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9737 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9738
28093d0e
KW
9739 # Some of these need a little more explanation:
9740 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9741 # property, but is used in calculating the Numeric_Type. Perl however,
9742 # creates a file from this field, so a Perl property is created from it.
9743 # Similarly, the Other_Digit field is used only for calculating the
9744 # Numeric_Type, and so it can be safely re-used as the place to store
9745 # the value for Numeric_Type; hence it is referred to as
9746 # $NUMERIC_TYPE_OTHER_DIGIT.
9747 # The input field named $PERL_DECOMPOSITION is a combination of both the
9748 # decomposition mapping and its type. Perl creates a file containing
9749 # exactly this field, so it is used for that. The two properties are
9750 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9751 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9752 # output it), as Perl doesn't use it directly.
9753 # The input field named here $CHARNAME is used to construct the
9754 # Perl_Charnames property, which is a combination of the Name property
9755 # (which the input field contains), and the Unicode_1_Name property, and
9756 # others from other files. Since, the strict Name property is not used
9757 # by Perl, this field is used for the table that Perl does use. The
9758 # strict Name property table is usually suppressed (unless the lists are
9759 # changed to output it), so it is accumulated in a separate field,
9760 # $NAME, which to save time is discarded unless the table is actually to
9761 # be output
99870f4d
KW
9762
9763 # This file is processed like most in this program. Control is passed to
9764 # process_generic_property_file() which calls filter_UnicodeData_line()
9765 # for each input line. This filter converts the input into line(s) that
9766 # process_generic_property_file() understands. There is also a setup
9767 # routine called before any of the file is processed, and a handler for
9768 # EOF processing, all in this closure.
9769
9770 # A huge speed-up occurred at the cost of some added complexity when these
9771 # routines were altered to buffer the outputs into ranges. Almost all the
9772 # lines of the input file apply to just one code point, and for most
9773 # properties, the map for the next code point up is the same as the
9774 # current one. So instead of creating a line for each property for each
9775 # input line, filter_UnicodeData_line() remembers what the previous map
9776 # of a property was, and doesn't generate a line to pass on until it has
9777 # to, as when the map changes; and that passed-on line encompasses the
9778 # whole contiguous range of code points that have the same map for that
9779 # property. This means a slight amount of extra setup, and having to
9780 # flush these buffers on EOF, testing if the maps have changed, plus
9781 # remembering state information in the closure. But it means a lot less
9782 # real time in not having to change the data base for each property on
9783 # each line.
9784
9785 # Another complication is that there are already a few ranges designated
9786 # in the input. There are two lines for each, with the same maps except
9787 # the code point and name on each line. This was actually the hardest
9788 # thing to design around. The code points in those ranges may actually
9789 # have real maps not given by these two lines. These maps will either
98dc9551 9790 # be algorithmically determinable, or in the extracted files furnished
99870f4d
KW
9791 # with the UCD. In the event of conflicts between these extracted files,
9792 # and this one, Unicode says that this one prevails. But it shouldn't
9793 # prevail for conflicts that occur in these ranges. The data from the
9794 # extracted files prevails in those cases. So, this program is structured
9795 # so that those files are processed first, storing maps. Then the other
9796 # files are processed, generally overwriting what the extracted files
9797 # stored. But just the range lines in this input file are processed
9798 # without overwriting. This is accomplished by adding a special string to
9799 # the lines output to tell process_generic_property_file() to turn off the
9800 # overwriting for just this one line.
9801 # A similar mechanism is used to tell it that the map is of a non-default
9802 # type.
9803
9804 sub setup_UnicodeData { # Called before any lines of the input are read
9805 my $file = shift;
9806 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9807
28093d0e
KW
9808 # Create a new property specially located that is a combination of the
9809 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9810 # Name_Alias properties. (The final duplicates elements of the
9811 # first.) A comment for it will later be constructed based on the
9812 # actual properties present and used
3e20195b 9813 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9814 Core_Access => '\N{...} and "use charnames"',
9815 Default_Map => "",
9816 Directory => File::Spec->curdir(),
9817 File => 'Name',
301ba948 9818 Fate => $INTERNAL_ONLY,
28093d0e 9819 Perl_Extension => 1,
b1c167a3 9820 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9821 Type => $STRING,
9822 );
9823
99870f4d 9824 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9825 Directory => File::Spec->curdir(),
99870f4d 9826 File => 'Decomposition',
a14f3cb1 9827 Format => $DECOMP_STRING_FORMAT,
301ba948 9828 Fate => $INTERNAL_ONLY,
99870f4d
KW
9829 Perl_Extension => 1,
9830 Default_Map => $CODE_POINT,
9831
0c07e538
KW
9832 # normalize.pm can't cope with these
9833 Output_Range_Counts => 0,
9834
99870f4d
KW
9835 # This is a specially formatted table
9836 # explicitly for normalize.pm, which
9837 # is expecting a particular format,
9838 # which means that mappings containing
9839 # multiple code points are in the main
9840 # body of the table
9841 Map_Type => $COMPUTE_NO_MULTI_CP,
9842 Type => $STRING,
9843 );
9844 $Perl_decomp->add_comment(join_lines(<<END
9845This mapping is a combination of the Unicode 'Decomposition_Type' and
9846'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
8d6427a5 9847identical to the official Unicode 'Decomposition_Mapping' property except for
99870f4d
KW
9848two things:
9849 1) It omits the algorithmically determinable Hangul syllable decompositions,
9850which normalize.pm handles algorithmically.
9851 2) It contains the decomposition type as well. Non-canonical decompositions
9852begin with a word in angle brackets, like <super>, which denotes the
9853compatible decomposition type. If the map does not begin with the <angle
9854brackets>, the decomposition is canonical.
9855END
9856 ));
9857
9858 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9859 Default_Map => "",
9860 Perl_Extension => 1,
9861 File => 'Digit', # Trad. location
9862 Directory => $map_directory,
9863 Type => $STRING,
9864 Range_Size_1 => 1,
9865 );
9866 $Decimal_Digit->add_comment(join_lines(<<END
9867This file gives the mapping of all code points which represent a single
9868decimal digit [0-9] to their respective digits. For example, the code point
9869U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9870that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9871numerals.
9872END
9873 ));
9874
28093d0e
KW
9875 # These properties are not used for generating anything else, and are
9876 # usually not output. By making them last in the list, we can just
99870f4d 9877 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
9878 # generating a table(s) that is/are just going to get thrown away.
9879 if (! property_ref('Decomposition_Mapping')->to_output_map
9880 && ! property_ref('Name')->to_output_map)
9881 {
9882 $last_field = min($NAME, $DECOMP_MAP) - 1;
9883 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9884 $last_field = $DECOMP_MAP;
9885 } elsif (property_ref('Name')->to_output_map) {
9886 $last_field = $NAME;
99870f4d
KW
9887 }
9888 return;
9889 }
9890
9891 my $first_time = 1; # ? Is this the first line of the file
9892 my $in_range = 0; # ? Are we in one of the file's ranges
9893 my $previous_cp; # hex code point of previous line
9894 my $decimal_previous_cp = -1; # And its decimal equivalent
9895 my @start; # For each field, the current starting
9896 # code point in hex for the range
9897 # being accumulated.
9898 my @fields; # The input fields;
9899 my @previous_fields; # And those from the previous call
9900
9901 sub filter_UnicodeData_line {
9902 # Handle a single input line from UnicodeData.txt; see comments above
9903 # Conceptually this takes a single line from the file containing N
9904 # properties, and converts it into N lines with one property per line,
9905 # which is what the final handler expects. But there are
9906 # complications due to the quirkiness of the input file, and to save
9907 # time, it accumulates ranges where the property values don't change
9908 # and only emits lines when necessary. This is about an order of
9909 # magnitude fewer lines emitted.
9910
9911 my $file = shift;
9912 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9913
9914 # $_ contains the input line.
9915 # -1 in split means retain trailing null fields
9916 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9917
9918 #local $to_trace = 1 if main::DEBUG;
9919 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9920 if (@fields > $input_field_count) {
9921 $file->carp_bad_line('Extra fields');
9922 $_ = "";
9923 return;
9924 }
9925
9926 my $decimal_cp = hex $cp;
9927
9928 # We have to output all the buffered ranges when the next code point
9929 # is not exactly one after the previous one, which means there is a
9930 # gap in the ranges.
9931 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9932
9933 # The decomposition mapping field requires special handling. It looks
9934 # like either:
9935 #
9936 # <compat> 0032 0020
9937 # 0041 0300
9938 #
9939 # The decomposition type is enclosed in <brackets>; if missing, it
9940 # means the type is canonical. There are two decomposition mapping
9941 # tables: the one for use by Perl's normalize.pm has a special format
9942 # which is this field intact; the other, for general use is of
9943 # standard format. In either case we have to find the decomposition
9944 # type. Empty fields have None as their type, and map to the code
9945 # point itself
9946 if ($fields[$PERL_DECOMPOSITION] eq "") {
9947 $fields[$DECOMP_TYPE] = 'None';
9948 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9949 }
9950 else {
9951 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9952 =~ / < ( .+? ) > \s* ( .+ ) /x;
9953 if (! defined $fields[$DECOMP_TYPE]) {
9954 $fields[$DECOMP_TYPE] = 'Canonical';
9955 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9956 }
9957 else {
9958 $fields[$DECOMP_MAP] = $map;
9959 }
9960 }
9961
9962 # The 3 numeric fields also require special handling. The 2 digit
9963 # fields must be either empty or match the number field. This means
9964 # that if it is empty, they must be as well, and the numeric type is
9965 # None, and the numeric value is 'Nan'.
9966 # The decimal digit field must be empty or match the other digit
9967 # field. If the decimal digit field is non-empty, the code point is
9968 # a decimal digit, and the other two fields will have the same value.
9969 # If it is empty, but the other digit field is non-empty, the code
9970 # point is an 'other digit', and the number field will have the same
9971 # value as the other digit field. If the other digit field is empty,
9972 # but the number field is non-empty, the code point is a generic
9973 # numeric type.
9974 if ($fields[$NUMERIC] eq "") {
9975 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9976 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9977 ) {
9978 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9979 }
9980 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9981 $fields[$NUMERIC] = 'NaN';
9982 }
9983 else {
9984 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9985 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9986 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9987 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9988 }
9989 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9990 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9991 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9992 }
9993 else {
9994 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9995
9996 # Rationals require extra effort.
9997 register_fraction($fields[$NUMERIC])
9998 if $fields[$NUMERIC] =~ qr{/};
9999 }
10000 }
10001
10002 # For the properties that have empty fields in the file, and which
10003 # mean something different from empty, change them to that default.
10004 # Certain fields just haven't been empty so far in any Unicode
10005 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10006 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 10007 # the defaults; which are very unlikely to ever change.
99870f4d
KW
10008 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10009 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10010
10011 # UAX44 says that if title is empty, it is the same as whatever upper
10012 # is,
10013 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10014
10015 # There are a few pairs of lines like:
10016 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10017 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10018 # that define ranges. These should be processed after the fields are
10019 # adjusted above, as they may override some of them; but mostly what
28093d0e 10020 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
10021 # paired lines start with a '<', but this is also true of '<control>,
10022 # which isn't one of these special ones.
28093d0e 10023 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
10024
10025 # Some code points in this file have the pseudo-name
10026 # '<control>', but the official name for such ones is the null
28093d0e 10027 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 10028 $fields[$NAME] = "";
28093d0e 10029 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
10030
10031 # We had better not be in between range lines.
10032 if ($in_range) {
28093d0e 10033 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10034 $in_range = 0;
10035 }
10036 }
28093d0e 10037 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
10038
10039 # Here is a non-range line. We had better not be in between range
10040 # lines.
10041 if ($in_range) {
28093d0e 10042 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10043 $in_range = 0;
10044 }
edb80b88
KW
10045 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10046
10047 # These are code points whose names end in their code points,
10048 # which means the names are algorithmically derivable from the
10049 # code points. To shorten the output Name file, the algorithm
10050 # for deriving these is placed in the file instead of each
10051 # code point, so they have map type $CP_IN_NAME
10052 $fields[$CHARNAME] = $CMD_DELIM
10053 . $MAP_TYPE_CMD
10054 . '='
10055 . $CP_IN_NAME
10056 . $CMD_DELIM
10057 . $fields[$CHARNAME];
10058 }
28093d0e 10059 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 10060 }
28093d0e
KW
10061 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10062 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
10063
10064 # Here we are at the beginning of a range pair.
10065 if ($in_range) {
28093d0e 10066 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10067 }
10068 $in_range = 1;
10069
10070 # Because the properties in the range do not overwrite any already
10071 # in the db, we must flush the buffers of what's already there, so
10072 # they get handled in the normal scheme.
10073 $force_output = 1;
10074
10075 }
28093d0e
KW
10076 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10077 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
10078 $_ = "";
10079 return;
10080 }
10081 else { # Here, we are at the last line of a range pair.
10082
10083 if (! $in_range) {
28093d0e 10084 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
10085 $_ = "";
10086 return;
10087 }
10088 $in_range = 0;
10089
28093d0e
KW
10090 $fields[$NAME] = $fields[$CHARNAME];
10091
99870f4d
KW
10092 # Check that the input is valid: that the closing of the range is
10093 # the same as the beginning.
10094 foreach my $i (0 .. $last_field) {
10095 next if $fields[$i] eq $previous_fields[$i];
10096 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10097 }
10098
10099 # The processing differs depending on the type of range,
28093d0e
KW
10100 # determined by its $CHARNAME
10101 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
10102
10103 # Check that the data looks right.
10104 if ($decimal_previous_cp != $SBase) {
10105 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10106 }
10107 if ($decimal_cp != $SBase + $SCount - 1) {
10108 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10109 }
10110
10111 # The Hangul syllable range has a somewhat complicated name
10112 # generation algorithm. Each code point in it has a canonical
10113 # decomposition also computable by an algorithm. The
10114 # perl decomposition map table built from these is used only
10115 # by normalize.pm, which has the algorithm built in it, so the
10116 # decomposition maps are not needed, and are large, so are
10117 # omitted from it. If the full decomposition map table is to
10118 # be output, the decompositions are generated for it, in the
10119 # EOF handling code for this input file.
10120
10121 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10122
10123 # This range is stored in our internal structure with its
10124 # own map type, different from all others.
28093d0e
KW
10125 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10126 = $CMD_DELIM
99870f4d
KW
10127 . $MAP_TYPE_CMD
10128 . '='
10129 . $HANGUL_SYLLABLE
10130 . $CMD_DELIM
28093d0e 10131 . $fields[$CHARNAME];
99870f4d 10132 }
28093d0e 10133 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
10134
10135 # The name for these contains the code point itself, and all
10136 # are defined to have the same base name, regardless of what
10137 # is in the file. They are stored in our internal structure
10138 # with a map type of $CP_IN_NAME
28093d0e
KW
10139 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10140 = $CMD_DELIM
99870f4d
KW
10141 . $MAP_TYPE_CMD
10142 . '='
10143 . $CP_IN_NAME
10144 . $CMD_DELIM
10145 . 'CJK UNIFIED IDEOGRAPH';
10146
10147 }
10148 elsif ($fields[$CATEGORY] eq 'Co'
10149 || $fields[$CATEGORY] eq 'Cs')
10150 {
10151 # The names of all the code points in these ranges are set to
10152 # null, as there are no names for the private use and
10153 # surrogate code points.
10154
28093d0e 10155 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
10156 }
10157 else {
28093d0e 10158 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
10159 }
10160
10161 # The first line of the range caused everything else to be output,
10162 # and then its values were stored as the beginning values for the
10163 # next set of ranges, which this one ends. Now, for each value,
10164 # add a command to tell the handler that these values should not
10165 # replace any existing ones in our database.
10166 foreach my $i (0 .. $last_field) {
10167 $previous_fields[$i] = $CMD_DELIM
10168 . $REPLACE_CMD
10169 . '='
10170 . $NO
10171 . $CMD_DELIM
10172 . $previous_fields[$i];
10173 }
10174
10175 # And change things so it looks like the entire range has been
10176 # gone through with this being the final part of it. Adding the
10177 # command above to each field will cause this range to be flushed
10178 # during the next iteration, as it guaranteed that the stored
10179 # field won't match whatever value the next one has.
10180 $previous_cp = $cp;
10181 $decimal_previous_cp = $decimal_cp;
10182
10183 # We are now set up for the next iteration; so skip the remaining
10184 # code in this subroutine that does the same thing, but doesn't
10185 # know about these ranges.
10186 $_ = "";
c1739a4a 10187
99870f4d
KW
10188 return;
10189 }
10190
10191 # On the very first line, we fake it so the code below thinks there is
10192 # nothing to output, and initialize so that when it does get output it
10193 # uses the first line's values for the lowest part of the range.
10194 # (One could avoid this by using peek(), but then one would need to
10195 # know the adjustments done above and do the same ones in the setup
10196 # routine; not worth it)
10197 if ($first_time) {
10198 $first_time = 0;
10199 @previous_fields = @fields;
10200 @start = ($cp) x scalar @fields;
10201 $decimal_previous_cp = $decimal_cp - 1;
10202 }
10203
10204 # For each field, output the stored up ranges that this code point
10205 # doesn't fit in. Earlier we figured out if all ranges should be
10206 # terminated because of changing the replace or map type styles, or if
10207 # there is a gap between this new code point and the previous one, and
10208 # that is stored in $force_output. But even if those aren't true, we
10209 # need to output the range if this new code point's value for the
10210 # given property doesn't match the stored range's.
10211 #local $to_trace = 1 if main::DEBUG;
10212 foreach my $i (0 .. $last_field) {
10213 my $field = $fields[$i];
10214 if ($force_output || $field ne $previous_fields[$i]) {
10215
10216 # Flush the buffer of stored values.
10217 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10218
10219 # Start a new range with this code point and its value
10220 $start[$i] = $cp;
10221 $previous_fields[$i] = $field;
10222 }
10223 }
10224
10225 # Set the values for the next time.
10226 $previous_cp = $cp;
10227 $decimal_previous_cp = $decimal_cp;
10228
10229 # The input line has generated whatever adjusted lines are needed, and
10230 # should not be looked at further.
10231 $_ = "";
10232 return;
10233 }
10234
10235 sub EOF_UnicodeData {
10236 # Called upon EOF to flush the buffers, and create the Hangul
10237 # decomposition mappings if needed.
10238
10239 my $file = shift;
10240 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10241
10242 # Flush the buffers.
10243 foreach my $i (1 .. $last_field) {
10244 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10245 }
10246
10247 if (-e 'Jamo.txt') {
10248
10249 # The algorithm is published by Unicode, based on values in
10250 # Jamo.txt, (which should have been processed before this
10251 # subroutine), and the results left in %Jamo
10252 unless (%Jamo) {
10253 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10254 return;
10255 }
10256
10257 # If the full decomposition map table is being output, insert
10258 # into it the Hangul syllable mappings. This is to avoid having
10259 # to publish a subroutine in it to compute them. (which would
10260 # essentially be this code.) This uses the algorithm published by
10261 # Unicode.
10262 if (property_ref('Decomposition_Mapping')->to_output_map) {
10263 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10264 use integer;
10265 my $SIndex = $S - $SBase;
10266 my $L = $LBase + $SIndex / $NCount;
10267 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10268 my $T = $TBase + $SIndex % $TCount;
10269
10270 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10271 my $decomposition = sprintf("%04X %04X", $L, $V);
10272 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10273 $file->insert_adjusted_lines(
10274 sprintf("%04X; Decomposition_Mapping; %s",
10275 $S,
10276 $decomposition));
10277 }
10278 }
10279 }
10280
10281 return;
10282 }
10283
10284 sub filter_v1_ucd {
10285 # Fix UCD lines in version 1. This is probably overkill, but this
10286 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10287 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10288 # removed. This program retains them
10289 # 2) didn't include ranges, which it should have, and which are now
10290 # added in @corrected_lines below. It was hand populated by
10291 # taking the data from Version 2, verified by analyzing
10292 # DAge.txt.
10293 # 3) There is a syntax error in the entry for U+09F8 which could
10294 # cause problems for utf8_heavy, and so is changed. It's
10295 # numeric value was simply a minus sign, without any number.
10296 # (Eventually Unicode changed the code point to non-numeric.)
10297 # 4) The decomposition types often don't match later versions
10298 # exactly, and the whole syntax of that field is different; so
10299 # the syntax is changed as well as the types to their later
10300 # terminology. Otherwise normalize.pm would be very unhappy
10301 # 5) Many ccc classes are different. These are left intact.
10302 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10303 # fields. These are unchanged because it doesn't really cause
10304 # problems for Perl.
10305 # 7) A number of code points, such as controls, don't have their
10306 # Unicode Version 1 Names in this file. These are unchanged.
10307
10308 my @corrected_lines = split /\n/, <<'END';
103094E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
103109FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10311E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10312F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10313F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10314FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10315END
10316
10317 my $file = shift;
10318 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10319
10320 #local $to_trace = 1 if main::DEBUG;
10321 trace $_ if main::DEBUG && $to_trace;
10322
10323 # -1 => retain trailing null fields
10324 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10325
10326 # At the first place that is wrong in the input, insert all the
10327 # corrections, replacing the wrong line.
10328 if ($code_point eq '4E00') {
10329 my @copy = @corrected_lines;
10330 $_ = shift @copy;
10331 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10332
10333 $file->insert_lines(@copy);
10334 }
10335
10336
10337 if ($fields[$NUMERIC] eq '-') {
10338 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10339 }
10340
10341 if ($fields[$PERL_DECOMPOSITION] ne "") {
10342
10343 # Several entries have this change to superscript 2 or 3 in the
10344 # middle. Convert these to the modern version, which is to use
10345 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10346 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10347 # 'HHHH HHHH 00B3 HHHH'.
10348 # It turns out that all of these that don't have another
10349 # decomposition defined at the beginning of the line have the
10350 # <square> decomposition in later releases.
10351 if ($code_point ne '00B2' && $code_point ne '00B3') {
10352 if ($fields[$PERL_DECOMPOSITION]
10353 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10354 {
10355 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10356 $fields[$PERL_DECOMPOSITION] = '<square> '
10357 . $fields[$PERL_DECOMPOSITION];
10358 }
10359 }
10360 }
10361
10362 # If is like '<+circled> 0052 <-circled>', convert to
10363 # '<circled> 0052'
10364 $fields[$PERL_DECOMPOSITION] =~
10365 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10366
10367 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10368 $fields[$PERL_DECOMPOSITION] =~
10369 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10370 or $fields[$PERL_DECOMPOSITION] =~
10371 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10372 or $fields[$PERL_DECOMPOSITION] =~
10373 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10374 or $fields[$PERL_DECOMPOSITION] =~
10375 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10376
10377 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10378 $fields[$PERL_DECOMPOSITION] =~
10379 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10380
10381 # Change names to modern form.
10382 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10383 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10384 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10385 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10386
10387 # One entry has weird braces
10388 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10389 }
10390
10391 $_ = join ';', $code_point, @fields;
10392 trace $_ if main::DEBUG && $to_trace;
10393 return;
10394 }
10395
10396 sub filter_v2_1_5_ucd {
10397 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10398 # columns swapped; These all had mirrored be 'N'. So if the numeric
10399 # column appears to be N, swap it back.
10400
10401 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10402 if ($fields[$NUMERIC] eq 'N') {
10403 $fields[$NUMERIC] = $fields[$MIRRORED];
10404 $fields[$MIRRORED] = 'N';
10405 $_ = join ';', $code_point, @fields;
10406 }
10407 return;
10408 }
3ffed8c2
KW
10409
10410 sub filter_v6_ucd {
10411
c12f2655
KW
10412 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10413 # accepted that yet to allow for some deprecation cycles.
3ffed8c2 10414
484741e1 10415 return if $_ !~ /^(?:0007|1F514|070F);/;
3ffed8c2
KW
10416
10417 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10418 if ($code_point eq '0007') {
dcd72625 10419 $fields[$CHARNAME] = "";
3ffed8c2 10420 }
484741e1
KW
10421 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10422 # http://www.unicode.org/versions/corrigendum8.html
10423 $fields[$BIDI] = "AL";
10424 }
10914c78 10425 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
3ffed8c2
KW
10426 $fields[$CHARNAME] = "";
10427 }
10428
10429 $_ = join ';', $code_point, @fields;
10430
10431 return;
10432 }
99870f4d
KW
10433} # End closure for UnicodeData
10434
37e2e78e
KW
10435sub process_GCB_test {
10436
10437 my $file = shift;
10438 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10439
10440 while ($file->next_line) {
10441 push @backslash_X_tests, $_;
10442 }
678f13d5 10443
37e2e78e
KW
10444 return;
10445}
10446
99870f4d
KW
10447sub process_NamedSequences {
10448 # NamedSequences.txt entries are just added to an array. Because these
10449 # don't look like the other tables, they have their own handler.
10450 # An example:
10451 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10452 #
10453 # This just adds the sequence to an array for later handling
10454
99870f4d
KW
10455 my $file = shift;
10456 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10457
10458 while ($file->next_line) {
10459 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10460 if (@remainder) {
10461 $file->carp_bad_line(
10462 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10463 next;
10464 }
fb121860
KW
10465
10466 # Note single \t in keeping with special output format of
10467 # Perl_charnames. But it turns out that the code points don't have to
10468 # be 5 digits long, like the rest, based on the internal workings of
10469 # charnames.pm. This could be easily changed for consistency.
10470 push @named_sequences, "$sequence\t$name";
99870f4d
KW
10471 }
10472 return;
10473}
10474
10475{ # Closure
10476
10477 my $first_range;
10478
10479 sub filter_early_ea_lb {
10480 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10481 # third field be the name of the code point, which can be ignored in
10482 # most cases. But it can be meaningful if it marks a range:
10483 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10484 # 3400;W;<CJK Ideograph Extension A, First>
10485 #
10486 # We need to see the First in the example above to know it's a range.
10487 # They did not use the later range syntaxes. This routine changes it
10488 # to use the modern syntax.
10489 # $1 is the Input_file object.
10490
10491 my @fields = split /\s*;\s*/;
10492 if ($fields[2] =~ /^<.*, First>/) {
10493 $first_range = $fields[0];
10494 $_ = "";
10495 }
10496 elsif ($fields[2] =~ /^<.*, Last>/) {
10497 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10498 }
10499 else {
10500 undef $first_range;
10501 $_ = "$fields[0]; $fields[1]";
10502 }
10503
10504 return;
10505 }
10506}
10507
10508sub filter_old_style_arabic_shaping {
10509 # Early versions used a different term for the later one.
10510
10511 my @fields = split /\s*;\s*/;
10512 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10513 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10514 $_ = join ';', @fields;
10515 return;
10516}
10517
10518sub filter_arabic_shaping_line {
10519 # ArabicShaping.txt has entries that look like:
10520 # 062A; TEH; D; BEH
10521 # The field containing 'TEH' is not used. The next field is Joining_Type
10522 # and the last is Joining_Group
10523 # This generates two lines to pass on, one for each property on the input
10524 # line.
10525
10526 my $file = shift;
10527 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10528
10529 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10530
10531 if (@fields > 4) {
10532 $file->carp_bad_line('Extra fields');
10533 $_ = "";
10534 return;
10535 }
10536
10537 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10538 $_ = "$fields[0]; Joining_Type; $fields[2]";
10539
10540 return;
10541}
10542
d3fed3dd
KW
10543{ # Closure
10544 my $lc; # Table for lowercase mapping
10545 my $tc;
10546 my $uc;
10547
6c0259ad
KW
10548 sub setup_special_casing {
10549 # SpecialCasing.txt contains the non-simple case change mappings. The
10550 # simple ones are in UnicodeData.txt, which should already have been
10551 # read in to the full property data structures, so as to initialize
10552 # these with the simple ones. Then the SpecialCasing.txt entries
10553 # overwrite the ones which have different full mappings.
10554
10555 # This routine sees if the simple mappings are to be output, and if
10556 # so, copies what has already been put into the full mapping tables,
10557 # while they still contain only the simple mappings.
10558
10559 # The reason it is done this way is that the simple mappings are
10560 # probably not going to be output, so it saves work to initialize the
10561 # full tables with the simple mappings, and then overwrite those
10562 # relatively few entries in them that have different full mappings,
10563 # and thus skip the simple mapping tables altogether.
10564
c12f2655
KW
10565 # New tables with just the simple mappings that are overridden by the
10566 # full ones are constructed. These are for Unicode::UCD, which
10567 # requires the simple mappings. The Case_Folding table is a combined
10568 # table of both the simple and full mappings, with the full ones being
10569 # in the hash, and the simple ones, even those overridden by the hash,
10570 # being in the base table. That same mechanism could have been
10571 # employed here, except that the docs have said that the generated
10572 # files are usuable directly by programs, so we dare not change the
10573 # format in any way.
10574
6c0259ad
KW
10575 my $file= shift;
10576 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10577
6c0259ad
KW
10578 $lc = property_ref('lc');
10579 $tc = property_ref('tc');
10580 $uc = property_ref('uc');
10581
10582 # For each of the case change mappings...
10583 foreach my $case_table ($lc, $tc, $uc) {
10584 my $case = $case_table->name;
10585 my $full = property_ref($case);
10586 unless (defined $full && ! $full->is_empty) {
10587 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10588 }
10589
10590 # The simple version's name in each mapping merely has an 's' in
10591 # front of the full one's
301ba948
KW
10592 my $simple_name = 's' . $case;
10593 my $simple = property_ref($simple_name);
6c0259ad
KW
10594 $simple->initialize($full) if $simple->to_output_map();
10595
10596 my $simple_only = Property->new("_s$case",
10597 Type => $STRING,
10598 Default_Map => $CODE_POINT,
10599 Perl_Extension => 1,
301ba948 10600 Fate => $INTERNAL_ONLY,
6c0259ad
KW
10601 Description => "The simple mappings for $case for code points that have full mappings as well");
10602 $simple_only->set_to_output_map($INTERNAL_MAP);
10603 $simple_only->add_comment(join_lines( <<END
d3fed3dd
KW
10604This file is for UCD.pm so that it can construct simple mappings that would
10605otherwise be lost because they are overridden by full mappings.
10606END
6c0259ad
KW
10607 ));
10608 }
99870f4d 10609
6c0259ad
KW
10610 return;
10611 }
99870f4d 10612
6c0259ad
KW
10613 sub filter_special_casing_line {
10614 # Change the format of $_ from SpecialCasing.txt into something that
10615 # the generic handler understands. Each input line contains three
10616 # case mappings. This will generate three lines to pass to the
10617 # generic handler for each of those.
99870f4d 10618
6c0259ad
KW
10619 # The input syntax (after stripping comments and trailing white space
10620 # is like one of the following (with the final two being entries that
10621 # we ignore):
10622 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10623 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10624 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10625 # Note the trailing semi-colon, unlike many of the input files. That
10626 # means that there will be an extra null field generated by the split
99870f4d 10627
6c0259ad
KW
10628 my $file = shift;
10629 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10630
6c0259ad
KW
10631 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10632 # fields
10633
10634 # field #4 is when this mapping is conditional. If any of these get
10635 # implemented, it would be by hard-coding in the casing functions in
10636 # the Perl core, not through tables. But if there is a new condition
10637 # we don't know about, output a warning. We know about all the
10638 # conditions through 6.0
10639 if ($fields[4] ne "") {
10640 my @conditions = split ' ', $fields[4];
10641 if ($conditions[0] ne 'tr' # We know that these languages have
10642 # conditions, and some are multiple
10643 && $conditions[0] ne 'az'
10644 && $conditions[0] ne 'lt'
10645
10646 # And, we know about a single condition Final_Sigma, but
10647 # nothing else.
10648 && ($v_version gt v5.2.0
10649 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10650 {
10651 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10652 }
10653 elsif ($conditions[0] ne 'Final_Sigma') {
99870f4d 10654
6c0259ad
KW
10655 # Don't print out a message for Final_Sigma, because we
10656 # have hard-coded handling for it. (But the standard
10657 # could change what the rule should be, but it wouldn't
10658 # show up here anyway.
99870f4d 10659
6c0259ad 10660 print "# SKIPPING Special Casing: $_\n"
99870f4d 10661 if $verbosity >= $VERBOSE;
6c0259ad
KW
10662 }
10663 $_ = "";
10664 return;
10665 }
10666 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10667 $file->carp_bad_line('Extra fields');
10668 $_ = "";
10669 return;
99870f4d 10670 }
99870f4d 10671
6c0259ad
KW
10672 $_ = "$fields[0]; lc; $fields[1]";
10673 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10674 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
99870f4d 10675
6c0259ad
KW
10676 # Copy any simple case change to the special tables constructed if
10677 # being overridden by a multi-character case change.
10678 if ($fields[1] ne $fields[0]
10679 && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10680 {
10681 $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10682 }
10683 if ($fields[2] ne $fields[0]
10684 && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10685 {
10686 $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10687 }
10688 if ($fields[3] ne $fields[0]
10689 && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10690 {
10691 $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10692 }
d3fed3dd 10693
6c0259ad
KW
10694 return;
10695 }
d3fed3dd 10696}
99870f4d
KW
10697
10698sub filter_old_style_case_folding {
10699 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 10700 # and later style. Different letters were used in the earlier.
99870f4d
KW
10701
10702 my $file = shift;
10703 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10704
10705 my @fields = split /\s*;\s*/;
10706 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10707 $fields[1] = 'I';
10708 }
10709 elsif ($fields[1] eq 'L') {
10710 $fields[1] = 'C'; # L => C always
10711 }
10712 elsif ($fields[1] eq 'E') {
10713 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10714 $fields[1] = 'F'
10715 }
10716 else {
10717 $fields[1] = 'C'
10718 }
10719 }
10720 else {
10721 $file->carp_bad_line("Expecting L or E in second field");
10722 $_ = "";
10723 return;
10724 }
10725 $_ = join("; ", @fields) . ';';
10726 return;
10727}
10728
10729{ # Closure for case folding
10730
10731 # Create the map for simple only if are going to output it, for otherwise
10732 # it takes no part in anything we do.
10733 my $to_output_simple;
10734
99870f4d
KW
10735 sub setup_case_folding($) {
10736 # Read in the case foldings in CaseFolding.txt. This handles both
10737 # simple and full case folding.
10738
10739 $to_output_simple
10740 = property_ref('Simple_Case_Folding')->to_output_map;
10741
6f2a3287
KW
10742 # If we ever wanted to show that these tables were combined, a new
10743 # property method could be created, like set_combined_props()
10744 property_ref('Case_Folding')->add_comment(join_lines( <<END
10745This file includes both the simple and full case folding maps. The simple
10746ones are in the main body of the table below, and the full ones adding to or
10747overriding them are in the hash.
10748END
10749 ));
99870f4d
KW
10750 return;
10751 }
10752
10753 sub filter_case_folding_line {
10754 # Called for each line in CaseFolding.txt
10755 # Input lines look like:
10756 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10757 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10758 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10759 #
10760 # 'C' means that folding is the same for both simple and full
10761 # 'F' that it is only for full folding
10762 # 'S' that it is only for simple folding
10763 # 'T' is locale-dependent, and ignored
10764 # 'I' is a type of 'F' used in some early releases.
10765 # Note the trailing semi-colon, unlike many of the input files. That
10766 # means that there will be an extra null field generated by the split
10767 # below, which we ignore and hence is not an error.
10768
10769 my $file = shift;
10770 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10771
10772 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10773 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10774 $file->carp_bad_line('Extra fields');
10775 $_ = "";
10776 return;
10777 }
10778
10779 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10780 $_ = "";
10781 return;
10782 }
10783
10784 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
3c099872
KW
10785 # I are all full foldings; S is single-char. For S, there is always
10786 # an F entry, so we must allow multiple values for the same code
10787 # point. Fortunately this table doesn't need further manipulation
10788 # which would preclude using multiple-values. The S is now included
10789 # so that _swash_inversion_hash() is able to construct closures
10790 # without having to worry about F mappings.
10791 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10792 $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
99870f4d
KW
10793 }
10794 else {
10795 $_ = "";
3c099872 10796 $file->carp_bad_line('Expecting C F I S or T in second field');
99870f4d
KW
10797 }
10798
10799 # C and S are simple foldings, but simple case folding is not needed
10800 # unless we explicitly want its map table output.
10801 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10802 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10803 }
10804
99870f4d
KW
10805 return;
10806 }
10807
99870f4d
KW
10808} # End case fold closure
10809
10810sub filter_jamo_line {
10811 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10812 # from this file that is used in generating the Name property for Jamo
10813 # code points. But, it also is used to convert early versions' syntax
10814 # into the modern form. Here are two examples:
10815 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10816 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10817 #
10818 # The input is $_, the output is $_ filtered.
10819
10820 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10821
10822 # Let the caller handle unexpected input. In earlier versions, there was
10823 # a third field which is supposed to be a comment, but did not have a '#'
10824 # before it.
10825 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10826
10827 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10828 # beginning.
10829
10830 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10831 $fields[1] = 'R' if $fields[0] eq '1105';
10832
10833 # Add to structure so can generate Names from it.
10834 my $cp = hex $fields[0];
10835 my $short_name = $fields[1];
10836 $Jamo{$cp} = $short_name;
10837 if ($cp <= $LBase + $LCount) {
10838 $Jamo_L{$short_name} = $cp - $LBase;
10839 }
10840 elsif ($cp <= $VBase + $VCount) {
10841 $Jamo_V{$short_name} = $cp - $VBase;
10842 }
10843 elsif ($cp <= $TBase + $TCount) {
10844 $Jamo_T{$short_name} = $cp - $TBase;
10845 }
10846 else {
10847 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10848 }
10849
10850
10851 # Reassemble using just the first two fields to look like a typical
10852 # property file line
10853 $_ = "$fields[0]; $fields[1]";
10854
10855 return;
10856}
10857
99870f4d
KW
10858sub register_fraction($) {
10859 # This registers the input rational number so that it can be passed on to
10860 # utf8_heavy.pl, both in rational and floating forms.
10861
10862 my $rational = shift;
10863
10864 my $float = eval $rational;
10865 $nv_floating_to_rational{$float} = $rational;
10866 return;
10867}
10868
10869sub filter_numeric_value_line {
10870 # DNumValues contains lines of a different syntax than the typical
10871 # property file:
10872 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10873 #
10874 # This routine transforms $_ containing the anomalous syntax to the
10875 # typical, by filtering out the extra columns, and convert early version
10876 # decimal numbers to strings that look like rational numbers.
10877
10878 my $file = shift;
10879 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10880
10881 # Starting in 5.1, there is a rational field. Just use that, omitting the
10882 # extra columns. Otherwise convert the decimal number in the second field
10883 # to a rational, and omit extraneous columns.
10884 my @fields = split /\s*;\s*/, $_, -1;
10885 my $rational;
10886
10887 if ($v_version ge v5.1.0) {
10888 if (@fields != 4) {
10889 $file->carp_bad_line('Not 4 semi-colon separated fields');
10890 $_ = "";
10891 return;
10892 }
10893 $rational = $fields[3];
10894 $_ = join '; ', @fields[ 0, 3 ];
10895 }
10896 else {
10897
10898 # Here, is an older Unicode file, which has decimal numbers instead of
10899 # rationals in it. Use the fraction to calculate the denominator and
10900 # convert to rational.
10901
10902 if (@fields != 2 && @fields != 3) {
10903 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10904 $_ = "";
10905 return;
10906 }
10907
10908 my $codepoints = $fields[0];
10909 my $decimal = $fields[1];
10910 if ($decimal =~ s/\.0+$//) {
10911
10912 # Anything ending with a decimal followed by nothing but 0's is an
10913 # integer
10914 $_ = "$codepoints; $decimal";
10915 $rational = $decimal;
10916 }
10917 else {
10918
10919 my $denominator;
10920 if ($decimal =~ /\.50*$/) {
10921 $denominator = 2;
10922 }
10923
10924 # Here have the hardcoded repeating decimals in the fraction, and
10925 # the denominator they imply. There were only a few denominators
10926 # in the older Unicode versions of this file which this code
10927 # handles, so it is easy to convert them.
10928
10929 # The 4 is because of a round-off error in the Unicode 3.2 files
10930 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10931 $denominator = 3;
10932 }
10933 elsif ($decimal =~ /\.[27]50*$/) {
10934 $denominator = 4;
10935 }
10936 elsif ($decimal =~ /\.[2468]0*$/) {
10937 $denominator = 5;
10938 }
10939 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10940 $denominator = 6;
10941 }
10942 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10943 $denominator = 8;
10944 }
10945 if ($denominator) {
10946 my $sign = ($decimal < 0) ? "-" : "";
10947 my $numerator = int((abs($decimal) * $denominator) + .5);
10948 $rational = "$sign$numerator/$denominator";
10949 $_ = "$codepoints; $rational";
10950 }
10951 else {
10952 $file->carp_bad_line("Can't cope with number '$decimal'.");
10953 $_ = "";
10954 return;
10955 }
10956 }
10957 }
10958
10959 register_fraction($rational) if $rational =~ qr{/};
10960 return;
10961}
10962
10963{ # Closure
10964 my %unihan_properties;
99870f4d
KW
10965
10966 sub setup_unihan {
10967 # Do any special setup for Unihan properties.
10968
10969 # This property gives the wrong computed type, so override.
10970 my $usource = property_ref('kIRG_USource');
10971 $usource->set_type($STRING) if defined $usource;
10972
b2abbe5b
KW
10973 # This property is to be considered binary (it says so in
10974 # http://www.unicode.org/reports/tr38/)
46b2142f 10975 my $iicore = property_ref('kIICore');
99870f4d 10976 if (defined $iicore) {
46b2142f
KW
10977 $iicore->set_type($FORCED_BINARY);
10978 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
10979
10980 # Unicode doesn't include the maps for this property, so don't
10981 # warn that they are missing.
10982 $iicore->set_pre_declared_maps(0);
10983 $iicore->add_comment(join_lines( <<END
10984This property contains enum values, but Unicode UAX #38 says it should be
10985interpreted as binary, so Perl creates tables for both 1) its enum values,
10986plus 2) true/false tables in which it is considered true for all code points
10987that have a non-null value
10988END
10989 ));
99870f4d
KW
10990 }
10991
10992 return;
10993 }
10994
10995 sub filter_unihan_line {
10996 # Change unihan db lines to look like the others in the db. Here is
10997 # an input sample:
10998 # U+341C kCangjie IEKN
10999
11000 # Tabs are used instead of semi-colons to separate fields; therefore
11001 # they may have semi-colons embedded in them. Change these to periods
11002 # so won't screw up the rest of the code.
11003 s/;/./g;
11004
11005 # Remove lines that don't look like ones we accept.
11006 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11007 $_ = "";
11008 return;
11009 }
11010
11011 # Extract the property, and save a reference to its object.
11012 my $property = $1;
11013 if (! exists $unihan_properties{$property}) {
11014 $unihan_properties{$property} = property_ref($property);
11015 }
11016
11017 # Don't do anything unless the property is one we're handling, which
11018 # we determine by seeing if there is an object defined for it or not
11019 if (! defined $unihan_properties{$property}) {
11020 $_ = "";
11021 return;
11022 }
11023
99870f4d
KW
11024 # Convert the tab separators to our standard semi-colons, and convert
11025 # the U+HHHH notation to the rest of the standard's HHHH
11026 s/\t/;/g;
11027 s/\b U \+ (?= $code_point_re )//xg;
11028
11029 #local $to_trace = 1 if main::DEBUG;
11030 trace $_ if main::DEBUG && $to_trace;
11031
11032 return;
11033 }
11034}
11035
11036sub filter_blocks_lines {
11037 # In the Blocks.txt file, the names of the blocks don't quite match the
11038 # names given in PropertyValueAliases.txt, so this changes them so they
11039 # do match: Blanks and hyphens are changed into underscores. Also makes
11040 # early release versions look like later ones
11041 #
11042 # $_ is transformed to the correct value.
11043
11044 my $file = shift;
11045 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11046
11047 if ($v_version lt v3.2.0) {
11048 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11049 $_ = "";
11050 return;
11051 }
11052
11053 # Old versions used a different syntax to mark the range.
11054 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11055 }
11056
11057 my @fields = split /\s*;\s*/, $_, -1;
11058 if (@fields != 2) {
11059 $file->carp_bad_line("Expecting exactly two fields");
11060 $_ = "";
11061 return;
11062 }
11063
11064 # Change hyphens and blanks in the block name field only
11065 $fields[1] =~ s/[ -]/_/g;
11066 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11067
11068 $_ = join("; ", @fields);
11069 return;
11070}
11071
11072{ # Closure
11073 my $current_property;
11074
11075 sub filter_old_style_proplist {
11076 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11077 # was in a completely different syntax. Ken Whistler of Unicode says
11078 # that it was something he used as an aid for his own purposes, but
11079 # was never an official part of the standard. However, comments in
11080 # DAge.txt indicate that non-character code points were available in
11081 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11082 # there except through this file (but on the other hand, they first
11083 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11084 # not. But the claim is that it was published as an aid to others who
11085 # might want some more information than was given in the official UCD
11086 # of the time. Many of the properties in it were incorporated into
11087 # the later PropList.txt, but some were not. This program uses this
11088 # early file to generate property tables that are otherwise not
11089 # accessible in the early UCD's, and most were probably not really
11090 # official at that time, so one could argue that it should be ignored,
11091 # and you can easily modify things to skip this. And there are bugs
11092 # in this file in various versions. (For example, the 2.1.9 version
11093 # removes from Alphabetic the CJK range starting at 4E00, and they
11094 # weren't added back in until 3.1.0.) Many of this file's properties
11095 # were later sanctioned, so this code generates tables for those
11096 # properties that aren't otherwise in the UCD of the time but
11097 # eventually did become official, and throws away the rest. Here is a
11098 # list of all the ones that are thrown away:
11099 # Bidi=* duplicates UnicodeData.txt
11100 # Combining never made into official property;
11101 # is \P{ccc=0}
11102 # Composite never made into official property.
11103 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11104 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11105 # Delimiter never made into official property;
11106 # removed in 3.0.1
11107 # Format Control never made into official property;
11108 # similar to gc=cf
11109 # High Surrogate duplicates Blocks.txt
11110 # Ignorable Control never made into official property;
11111 # similar to di=y
11112 # ISO Control duplicates UnicodeData.txt: gc=cc
11113 # Left of Pair never made into official property;
11114 # Line Separator duplicates UnicodeData.txt: gc=zl
11115 # Low Surrogate duplicates Blocks.txt
11116 # Non-break was actually listed as a property
11117 # in 3.2, but without any code
11118 # points. Unicode denies that this
11119 # was ever an official property
11120 # Non-spacing duplicate UnicodeData.txt: gc=mn
11121 # Numeric duplicates UnicodeData.txt: gc=cc
11122 # Paired Punctuation never made into official property;
11123 # appears to be gc=ps + gc=pe
11124 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11125 # Private Use duplicates UnicodeData.txt: gc=co
11126 # Private Use High Surrogate duplicates Blocks.txt
11127 # Punctuation duplicates UnicodeData.txt: gc=p
11128 # Space different definition than eventual
11129 # one.
11130 # Titlecase duplicates UnicodeData.txt: gc=lt
11131 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
98dc9551 11132 # Zero-width never made into official property;
99870f4d
KW
11133 # subset of gc=cf
11134 # Most of the properties have the same names in this file as in later
11135 # versions, but a couple do not.
11136 #
11137 # This subroutine filters $_, converting it from the old style into
11138 # the new style. Here's a sample of the old-style
11139 #
11140 # *******************************************
11141 #
11142 # Property dump for: 0x100000A0 (Join Control)
11143 #
11144 # 200C..200D (2 chars)
11145 #
11146 # In the example, the property is "Join Control". It is kept in this
11147 # closure between calls to the subroutine. The numbers beginning with
11148 # 0x were internal to Ken's program that generated this file.
11149
11150 # If this line contains the property name, extract it.
11151 if (/^Property dump for: [^(]*\((.*)\)/) {
11152 $_ = $1;
11153
11154 # Convert white space to underscores.
11155 s/ /_/g;
11156
11157 # Convert the few properties that don't have the same name as
11158 # their modern counterparts
11159 s/Identifier_Part/ID_Continue/
11160 or s/Not_a_Character/NChar/;
11161
11162 # If the name matches an existing property, use it.
11163 if (defined property_ref($_)) {
11164 trace "new property=", $_ if main::DEBUG && $to_trace;
11165 $current_property = $_;
11166 }
11167 else { # Otherwise discard it
11168 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11169 undef $current_property;
11170 }
11171 $_ = ""; # The property is saved for the next lines of the
11172 # file, but this defining line is of no further use,
11173 # so clear it so that the caller won't process it
11174 # further.
11175 }
11176 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11177
11178 # Here, the input line isn't a header defining a property for the
11179 # following section, and either we aren't in such a section, or
11180 # the line doesn't look like one that defines the code points in
11181 # such a section. Ignore this line.
11182 $_ = "";
11183 }
11184 else {
11185
11186 # Here, we have a line defining the code points for the current
11187 # stashed property. Anything starting with the first blank is
11188 # extraneous. Otherwise, it should look like a normal range to
11189 # the caller. Append the property name so that it looks just like
11190 # a modern PropList entry.
11191
11192 $_ =~ s/\s.*//;
11193 $_ .= "; $current_property";
11194 }
11195 trace $_ if main::DEBUG && $to_trace;
11196 return;
11197 }
11198} # End closure for old style proplist
11199
11200sub filter_old_style_normalization_lines {
11201 # For early releases of Unicode, the lines were like:
11202 # 74..2A76 ; NFKD_NO
11203 # For later releases this became:
11204 # 74..2A76 ; NFKD_QC; N
11205 # Filter $_ to look like those in later releases.
11206 # Similarly for MAYBEs
11207
11208 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11209
11210 # Also, the property FC_NFKC was abbreviated to FNC
11211 s/FNC/FC_NFKC/;
11212 return;
11213}
11214
82aed44a
KW
11215sub setup_script_extensions {
11216 # The Script_Extensions property starts out with a clone of the Script
11217 # property.
11218
11219 my $sc = property_ref("Script");
11220 my $scx = Property->new("scx", Full_Name => "Script_Extensions",
11221 Initialize => $sc,
11222 Default_Map => $sc->default_map,
11223 Pre_Declared_Maps => 0,
c3ff2976 11224 Format => $STRING_WHITE_SPACE_LIST,
82aed44a
KW
11225 );
11226 $scx->add_comment(join_lines( <<END
11227The values for code points that appear in one script are just the same as for
11228the 'Script' property. Likewise the values for those that appear in many
11229scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11230values of code points that appear in a few scripts are a space separated list
11231of those scripts.
11232END
11233 ));
11234
11235 # Make the scx's tables and aliases for them the same as sc's
11236 foreach my $table ($sc->tables) {
11237 my $scx_table = $scx->add_match_table($table->name,
11238 Full_Name => $table->full_name);
11239 foreach my $alias ($table->aliases) {
11240 $scx_table->add_alias($alias->name);
11241 }
11242 }
11243}
11244
fbe1e607
KW
11245sub filter_script_extensions_line {
11246 # The Scripts file comes with the full name for the scripts; the
11247 # ScriptExtensions, with the short name. The final mapping file is a
11248 # combination of these, and without adjustment, would have inconsistent
11249 # entries. This filters the latter file to convert to full names.
11250 # Entries look like this:
11251 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11252
11253 my @fields = split /\s*;\s*/;
11254 my @full_names;
11255 foreach my $short_name (split " ", $fields[1]) {
11256 push @full_names, $script->table($short_name)->full_name;
11257 }
11258 $fields[1] = join " ", @full_names;
11259 $_ = join "; ", @fields;
11260
11261 return;
11262}
11263
dcd72625
KW
11264sub setup_v6_name_alias {
11265 property_ref('Name_Alias')->add_map(7, 7, "ALERT");
11266}
11267
99870f4d
KW
11268sub finish_Unicode() {
11269 # This routine should be called after all the Unicode files have been read
11270 # in. It:
11271 # 1) Adds the mappings for code points missing from the files which have
11272 # defaults specified for them.
11273 # 2) At this this point all mappings are known, so it computes the type of
11274 # each property whose type hasn't been determined yet.
11275 # 3) Calculates all the regular expression match tables based on the
11276 # mappings.
11277 # 3) Calculates and adds the tables which are defined by Unicode, but
11278 # which aren't derived by them
11279
11280 # For each property, fill in any missing mappings, and calculate the re
11281 # match tables. If a property has more than one missing mapping, the
11282 # default is a reference to a data structure, and requires data from other
11283 # properties to resolve. The sort is used to cause these to be processed
11284 # last, after all the other properties have been calculated.
11285 # (Fortunately, the missing properties so far don't depend on each other.)
11286 foreach my $property
11287 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11288 property_ref('*'))
11289 {
11290 # $perl has been defined, but isn't one of the Unicode properties that
11291 # need to be finished up.
11292 next if $property == $perl;
11293
11294 # Handle the properties that have more than one possible default
11295 if (ref $property->default_map) {
11296 my $default_map = $property->default_map;
11297
11298 # These properties have stored in the default_map:
11299 # One or more of:
11300 # 1) A default map which applies to all code points in a
11301 # certain class
11302 # 2) an expression which will evaluate to the list of code
11303 # points in that class
11304 # And
11305 # 3) the default map which applies to every other missing code
11306 # point.
11307 #
11308 # Go through each list.
11309 while (my ($default, $eval) = $default_map->get_next_defaults) {
11310
11311 # Get the class list, and intersect it with all the so-far
11312 # unspecified code points yielding all the code points
11313 # in the class that haven't been specified.
11314 my $list = eval $eval;
11315 if ($@) {
11316 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11317 last;
11318 }
11319
11320 # Narrow down the list to just those code points we don't have
11321 # maps for yet.
11322 $list = $list & $property->inverse_list;
11323
11324 # Add mappings to the property for each code point in the list
11325 foreach my $range ($list->ranges) {
56343c78
KW
11326 $property->add_map($range->start, $range->end, $default,
11327 Replace => $CROAK);
99870f4d
KW
11328 }
11329 }
11330
11331 # All remaining code points have the other mapping. Set that up
11332 # so the normal single-default mapping code will work on them
11333 $property->set_default_map($default_map->other_default);
11334
11335 # And fall through to do that
11336 }
11337
11338 # We should have enough data now to compute the type of the property.
11339 $property->compute_type;
11340 my $property_type = $property->type;
11341
11342 next if ! $property->to_create_match_tables;
11343
11344 # Here want to create match tables for this property
11345
11346 # The Unicode db always (so far, and they claim into the future) have
11347 # the default for missing entries in binary properties be 'N' (unless
11348 # there is a '@missing' line that specifies otherwise)
11349 if ($property_type == $BINARY && ! defined $property->default_map) {
11350 $property->set_default_map('N');
11351 }
11352
11353 # Add any remaining code points to the mapping, using the default for
5d7f7709 11354 # missing code points.
d8fb1cc3 11355 my $default_table;
99870f4d 11356 if (defined (my $default_map = $property->default_map)) {
1520492f 11357
f4c2a127 11358 # Make sure there is a match table for the default
f4c2a127
KW
11359 if (! defined ($default_table = $property->table($default_map))) {
11360 $default_table = $property->add_match_table($default_map);
11361 }
11362
a92d5c2e
KW
11363 # And, if the property is binary, the default table will just
11364 # be the complement of the other table.
11365 if ($property_type == $BINARY) {
11366 my $non_default_table;
11367
11368 # Find the non-default table.
11369 for my $table ($property->tables) {
11370 next if $table == $default_table;
11371 $non_default_table = $table;
11372 }
11373 $default_table->set_complement($non_default_table);
11374 }
862fd107 11375 else {
a92d5c2e 11376
3981d009
KW
11377 # This fills in any missing values with the default. It's not
11378 # necessary to do this with binary properties, as the default
11379 # is defined completely in terms of the Y table.
6189eadc 11380 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
3981d009 11381 $default_map, Replace => $NO);
862fd107 11382 }
99870f4d
KW
11383 }
11384
11385 # Have all we need to populate the match tables.
11386 my $property_name = $property->name;
56557540 11387 my $maps_should_be_defined = $property->pre_declared_maps;
99870f4d
KW
11388 foreach my $range ($property->ranges) {
11389 my $map = $range->value;
f5e9a6ca 11390 my $table = $property->table($map);
99870f4d
KW
11391 if (! defined $table) {
11392
11393 # Integral and rational property values are not necessarily
56557540
KW
11394 # defined in PropValueAliases, but whether all the other ones
11395 # should be depends on the property.
11396 if ($maps_should_be_defined
99870f4d
KW
11397 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11398 {
11399 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
11400 }
f5e9a6ca 11401 $table = $property->add_match_table($map);
99870f4d
KW
11402 }
11403
862fd107 11404 next if $table->complement != 0; # Don't need to populate these
99870f4d
KW
11405 $table->add_range($range->start, $range->end);
11406 }
11407
06f26c45
KW
11408 # A forced binary property has additional true/false tables which
11409 # should have been set up when it was forced into binary. The false
11410 # table matches exactly the same set as the property's default table.
11411 # The true table matches the complement of that. The false table is
11412 # not the same as an additional set of aliases on top of the default
11413 # table, so use 'set_equivalent_to'. If it were implemented as
11414 # additional aliases, various things would have to be adjusted, but
11415 # especially, if the user wants to get a list of names for the table
11416 # using Unicode::UCD::prop_value_aliases(), s/he should get a
11417 # different set depending on whether they want the default table or
11418 # the false table.
11419 if ($property_type == $FORCED_BINARY) {
11420 $property->table('N')->set_equivalent_to($default_table,
11421 Related => 1);
11422 $property->table('Y')->set_complement($default_table);
11423 }
11424
807807b7
KW
11425 # For Perl 5.6 compatibility, all properties matchable in regexes can
11426 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
11427 # But warn if this creates a conflict with a (new) Unicode property
11428 # name, although it appears that Unicode has made a decision never to
11429 # begin a property name with 'Is_', so this shouldn't happen.
99870f4d
KW
11430 foreach my $alias ($property->aliases) {
11431 my $Is_name = 'Is_' . $alias->name;
807807b7 11432 if (defined (my $pre_existing = property_ref($Is_name))) {
99870f4d 11433 Carp::my_carp(<<END
807807b7
KW
11434There is already an alias named $Is_name (from " . $pre_existing . "), so
11435creating one for $property won't work. This is bad news. If it is not too
11436late, get Unicode to back off. Otherwise go back to the old scheme (findable
11437from the git blame log for this area of the code that suppressed individual
11438aliases that conflict with the new Unicode names. Proceeding anyway.
99870f4d
KW
11439END
11440 );
99870f4d
KW
11441 }
11442 } # End of loop through aliases for this property
11443 } # End of loop through all Unicode properties.
11444
11445 # Fill in the mappings that Unicode doesn't completely furnish. First the
11446 # single letter major general categories. If Unicode were to start
11447 # delivering the values, this would be redundant, but better that than to
11448 # try to figure out if should skip and not get it right. Ths could happen
11449 # if a new major category were to be introduced, and the hard-coded test
11450 # wouldn't know about it.
11451 # This routine depends on the standard names for the general categories
11452 # being what it thinks they are, like 'Cn'. The major categories are the
11453 # union of all the general category tables which have the same first
11454 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11455 foreach my $minor_table ($gc->tables) {
11456 my $minor_name = $minor_table->name;
11457 next if length $minor_name == 1;
11458 if (length $minor_name != 2) {
11459 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11460 next;
11461 }
11462
11463 my $major_name = uc(substr($minor_name, 0, 1));
11464 my $major_table = $gc->table($major_name);
11465 $major_table += $minor_table;
11466 }
11467
11468 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11469 # defines it as LC)
11470 my $LC = $gc->table('LC');
11471 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11472 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11473
11474
11475 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11476 # deliver the correct values in it
11477 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11478
11479 # Lt not in release 1.
a5c376b7
KW
11480 if (defined $gc->table('Lt')) {
11481 $LC += $gc->table('Lt');
11482 $gc->table('Lt')->set_caseless_equivalent($LC);
11483 }
99870f4d
KW
11484 }
11485 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11486
a5c376b7
KW
11487 $gc->table('Ll')->set_caseless_equivalent($LC);
11488 $gc->table('Lu')->set_caseless_equivalent($LC);
11489
99870f4d 11490 my $Cs = $gc->table('Cs');
99870f4d
KW
11491
11492
11493 # Folding information was introduced later into Unicode data. To get
11494 # Perl's case ignore (/i) to work at all in releases that don't have
11495 # folding, use the best available alternative, which is lower casing.
11496 my $fold = property_ref('Simple_Case_Folding');
11497 if ($fold->is_empty) {
11498 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11499 $fold->add_note(join_lines(<<END
11500WARNING: This table uses lower case as a substitute for missing fold
11501information
11502END
11503 ));
11504 }
11505
11506 # Multiple-character mapping was introduced later into Unicode data. If
11507 # missing, use the single-characters maps as best available alternative
11508 foreach my $map (qw { Uppercase_Mapping
11509 Lowercase_Mapping
11510 Titlecase_Mapping
11511 Case_Folding
11512 } ) {
11513 my $full = property_ref($map);
11514 if ($full->is_empty) {
11515 my $simple = property_ref('Simple_' . $map);
11516 $full->initialize($simple);
11517 $full->add_comment($simple->comment) if ($simple->comment);
11518 $full->add_note(join_lines(<<END
11519WARNING: This table uses simple mapping (single-character only) as a
11520substitute for missing multiple-character information
11521END
11522 ));
11523 }
11524 }
82aed44a
KW
11525
11526 # The Script_Extensions property started out as a clone of the Script
11527 # property. But processing its data file caused some elements to be
11528 # replaced with different data. (These elements were for the Common and
11529 # Inherited properties.) This data is a qw() list of all the scripts that
11530 # the code points in the given range are in. An example line is:
11531 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
11532 #
11533 # The code above has created a new match table named "Arab Syrc Thaa"
11534 # which contains 060C. (The cloned table started out with this code point
11535 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
11536 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
11537 # match table. This is repeated for all these tables and ranges. The map
11538 # data is retained in the map table for reference, but the spurious match
11539 # tables are deleted.
11540
11541 my $scx = property_ref("Script_Extensions");
d53a7e7d 11542 if (defined $scx) {
c3a37f64
KW
11543 foreach my $table ($scx->tables) {
11544 next unless $table->name =~ /\s/; # All the new and only the new
11545 # tables have a space in their
11546 # names
11547 my @scripts = split /\s+/, $table->name;
11548 foreach my $script (@scripts) {
11549 my $script_table = $scx->table($script);
11550 $script_table += $table;
11551 }
11552 $scx->delete_match_table($table);
82aed44a 11553 }
d53a7e7d 11554 }
82aed44a
KW
11555
11556 return;
99870f4d
KW
11557}
11558
11559sub compile_perl() {
11560 # Create perl-defined tables. Almost all are part of the pseudo-property
11561 # named 'perl' internally to this program. Many of these are recommended
11562 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11563 # on those found there.
11564 # Almost all of these are equivalent to some Unicode property.
11565 # A number of these properties have equivalents restricted to the ASCII
11566 # range, with their names prefaced by 'Posix', to signify that these match
11567 # what the Posix standard says they should match. A couple are
11568 # effectively this, but the name doesn't have 'Posix' in it because there
cbc24f92
KW
11569 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
11570 # to the full Unicode range, by our guesses as to what is appropriate.
99870f4d
KW
11571
11572 # 'Any' is all code points. As an error check, instead of just setting it
11573 # to be that, construct it to be the union of all the major categories
7fc6cb55 11574 $Any = $perl->add_match_table('Any',
6189eadc 11575 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
99870f4d
KW
11576 Matches_All => 1);
11577
11578 foreach my $major_table ($gc->tables) {
11579
11580 # Major categories are the ones with single letter names.
11581 next if length($major_table->name) != 1;
11582
11583 $Any += $major_table;
11584 }
11585
6189eadc 11586 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
99870f4d
KW
11587 Carp::my_carp_bug("Generated highest code point ("
11588 . sprintf("%X", $Any->max)
6189eadc 11589 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
99870f4d
KW
11590 }
11591 if ($Any->range_count != 1 || $Any->min != 0) {
11592 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11593 }
11594
11595 $Any->add_alias('All');
11596
11597 # Assigned is the opposite of gc=unassigned
11598 my $Assigned = $perl->add_match_table('Assigned',
11599 Description => "All assigned code points",
11600 Initialize => ~ $gc->table('Unassigned'),
11601 );
11602
11603 # Our internal-only property should be treated as more than just a
8050d00f 11604 # synonym; grandfather it in to the pod.
b15a0a3b
KW
11605 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
11606 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
99870f4d
KW
11607 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11608 Related => 1);
11609
11610 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11611 if (defined $block) { # This is equivalent to the block if have it.
11612 my $Unicode_ASCII = $block->table('Basic_Latin');
11613 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11614 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11615 }
11616 }
11617
11618 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11619 # necessary
11620 if ($ASCII->is_empty) {
11621 $ASCII->initialize([ 0..127 ]);
11622 }
11623
99870f4d
KW
11624 # Get the best available case definitions. Early Unicode versions didn't
11625 # have Uppercase and Lowercase defined, so use the general category
11626 # instead for them.
11627 my $Lower = $perl->add_match_table('Lower');
11628 my $Unicode_Lower = property_ref('Lowercase');
11629 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11630 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
a5c376b7
KW
11631 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11632 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11633 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11634
99870f4d
KW
11635 }
11636 else {
11637 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11638 Related => 1);
11639 }
cbc24f92 11640 $Lower->add_alias('XPosixLower');
a5c376b7 11641 my $Posix_Lower = $perl->add_match_table("PosixLower",
ad5e8af1
KW
11642 Description => "[a-z]",
11643 Initialize => $Lower & $ASCII,
11644 );
99870f4d
KW
11645
11646 my $Upper = $perl->add_match_table('Upper');
11647 my $Unicode_Upper = property_ref('Uppercase');
11648 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11649 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
a5c376b7
KW
11650 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11651 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11652 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
99870f4d
KW
11653 }
11654 else {
11655 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11656 Related => 1);
11657 }
cbc24f92 11658 $Upper->add_alias('XPosixUpper');
a5c376b7 11659 my $Posix_Upper = $perl->add_match_table("PosixUpper",
ad5e8af1
KW
11660 Description => "[A-Z]",
11661 Initialize => $Upper & $ASCII,
11662 );
99870f4d
KW
11663
11664 # Earliest releases didn't have title case. Initialize it to empty if not
11665 # otherwise present
4364919a
KW
11666 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11667 Description => '(= \p{Gc=Lt})');
99870f4d 11668 my $lt = $gc->table('Lt');
a5c376b7
KW
11669
11670 # Earlier versions of mktables had this related to $lt since they have
c12f2655
KW
11671 # identical code points, but their caseless equivalents are not the same,
11672 # one being 'Cased' and the other being 'LC', and so now must be kept as
11673 # separate entities.
a5c376b7 11674 $Title += $lt if defined $lt;
99870f4d
KW
11675
11676 # If this Unicode version doesn't have Cased, set up our own. From
11677 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11678 # and only if C has the Lowercase or Uppercase property or has a
11679 # General_Category value of Titlecase_Letter.
a5c376b7
KW
11680 my $Unicode_Cased = property_ref('Cased');
11681 unless (defined $Unicode_Cased) {
99870f4d
KW
11682 my $cased = $perl->add_match_table('Cased',
11683 Initialize => $Lower + $Upper + $Title,
11684 Description => 'Uppercase or Lowercase or Titlecase',
11685 );
a5c376b7 11686 $Unicode_Cased = $cased;
99870f4d 11687 }
a5c376b7 11688 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
99870f4d
KW
11689
11690 # Similarly, set up our own Case_Ignorable property if this Unicode
11691 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11692 # C is defined to be case-ignorable if C has the value MidLetter or the
11693 # value MidNumLet for the Word_Break property or its General_Category is
11694 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11695 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11696
8050d00f
KW
11697 # Perl has long had an internal-only alias for this property; grandfather
11698 # it in to the pod, but discourage its use.
11699 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
b15a0a3b
KW
11700 Re_Pod_Entry => 1,
11701 Fate => $INTERNAL_ONLY,
11702 Status => $DISCOURAGED);
99870f4d
KW
11703 my $case_ignorable = property_ref('Case_Ignorable');
11704 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11705 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11706 Related => 1);
11707 }
11708 else {
11709
11710 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11711
11712 # The following three properties are not in early releases
11713 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11714 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11715 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11716
11717 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11718 # correspondingly the case-ignorable definition lacks that one. For
11719 # 4.0, it appears that it was meant to be the same definition, but was
11720 # inadvertently omitted from the standard's text, so add it if the
11721 # property actually is there
11722 my $wb = property_ref('Word_Break');
11723 if (defined $wb) {
11724 my $midlet = $wb->table('MidLetter');
11725 $perl_case_ignorable += $midlet if defined $midlet;
11726 my $midnumlet = $wb->table('MidNumLet');
11727 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11728 }
11729 else {
11730
11731 # In earlier versions of the standard, instead of the above two
11732 # properties , just the following characters were used:
11733 $perl_case_ignorable += 0x0027 # APOSTROPHE
11734 + 0x00AD # SOFT HYPHEN (SHY)
11735 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11736 }
11737 }
11738
11739 # The remaining perl defined tables are mostly based on Unicode TR 18,
11740 # "Annex C: Compatibility Properties". All of these have two versions,
11741 # one whose name generally begins with Posix that is posix-compliant, and
11742 # one that matches Unicode characters beyond the Posix, ASCII range
11743
ad5e8af1 11744 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
11745
11746 # Alphabetic was not present in early releases
11747 my $Alphabetic = property_ref('Alphabetic');
11748 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11749 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11750 }
11751 else {
11752
11753 # For early releases, we don't get it exactly right. The below
11754 # includes more than it should, which in 5.2 terms is: L + Nl +
11755 # Other_Alphabetic. Other_Alphabetic contains many characters from
11756 # Mn and Mc. It's better to match more than we should, than less than
11757 # we should.
11758 $Alpha->initialize($gc->table('Letter')
11759 + $gc->table('Mn')
11760 + $gc->table('Mc'));
11761 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 11762 $Alpha->add_description('Alphabetic');
99870f4d 11763 }
cbc24f92 11764 $Alpha->add_alias('XPosixAlpha');
a5c376b7 11765 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
ad5e8af1
KW
11766 Description => "[A-Za-z]",
11767 Initialize => $Alpha & $ASCII,
11768 );
a5c376b7
KW
11769 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11770 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
99870f4d
KW
11771
11772 my $Alnum = $perl->add_match_table('Alnum',
ad5e8af1 11773 Description => 'Alphabetic and (Decimal) Numeric',
99870f4d
KW
11774 Initialize => $Alpha + $gc->table('Decimal_Number'),
11775 );
cbc24f92 11776 $Alnum->add_alias('XPosixAlnum');
ad5e8af1
KW
11777 $perl->add_match_table("PosixAlnum",
11778 Description => "[A-Za-z0-9]",
11779 Initialize => $Alnum & $ASCII,
11780 );
99870f4d
KW
11781
11782 my $Word = $perl->add_match_table('Word',
d35dd6c6
KW
11783 Description => '\w, including beyond ASCII;'
11784 . ' = \p{Alnum} + \pM + \p{Pc}',
99870f4d
KW
11785 Initialize => $Alnum + $gc->table('Mark'),
11786 );
cbc24f92 11787 $Word->add_alias('XPosixWord');
99870f4d
KW
11788 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11789 $Word += $Pc if defined $Pc;
11790
f38f76ae 11791 # This is a Perl extension, so the name doesn't begin with Posix.
cbc24f92 11792 my $PerlWord = $perl->add_match_table('PerlWord',
99870f4d
KW
11793 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11794 Initialize => $Word & $ASCII,
11795 );
cbc24f92 11796 $PerlWord->add_alias('PosixWord');
99870f4d
KW
11797
11798 my $Blank = $perl->add_match_table('Blank',
11799 Description => '\h, Horizontal white space',
11800
11801 # 200B is Zero Width Space which is for line
11802 # break control, and was listed as
11803 # Space_Separator in early releases
11804 Initialize => $gc->table('Space_Separator')
11805 + 0x0009 # TAB
11806 - 0x200B, # ZWSP
11807 );
11808 $Blank->add_alias('HorizSpace'); # Another name for it.
cbc24f92 11809 $Blank->add_alias('XPosixBlank');
ad5e8af1
KW
11810 $perl->add_match_table("PosixBlank",
11811 Description => "\\t and ' '",
11812 Initialize => $Blank & $ASCII,
11813 );
99870f4d
KW
11814
11815 my $VertSpace = $perl->add_match_table('VertSpace',
11816 Description => '\v',
11817 Initialize => $gc->table('Line_Separator')
11818 + $gc->table('Paragraph_Separator')
11819 + 0x000A # LINE FEED
11820 + 0x000B # VERTICAL TAB
11821 + 0x000C # FORM FEED
11822 + 0x000D # CARRIAGE RETURN
11823 + 0x0085, # NEL
11824 );
11825 # No Posix equivalent for vertical space
11826
11827 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
11828 Description => '\s including beyond ASCII plus vertical tab',
11829 Initialize => $Blank + $VertSpace,
99870f4d 11830 );
cbc24f92 11831 $Space->add_alias('XPosixSpace');
ad5e8af1 11832 $perl->add_match_table("PosixSpace",
f38f76ae 11833 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
11834 Initialize => $Space & $ASCII,
11835 );
99870f4d
KW
11836
11837 # Perl's traditional space doesn't include Vertical Tab
cbc24f92 11838 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
99870f4d
KW
11839 Description => '\s, including beyond ASCII',
11840 Initialize => $Space - 0x000B,
11841 );
cbc24f92
KW
11842 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
11843 my $PerlSpace = $perl->add_match_table('PerlSpace',
de25ec47
KW
11844 Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11845 Initialize => $XPerlSpace & $ASCII,
99870f4d
KW
11846 );
11847
cbc24f92 11848
99870f4d 11849 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 11850 Description => 'Control characters');
99870f4d 11851 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
cbc24f92 11852 $Cntrl->add_alias('XPosixCntrl');
ad5e8af1 11853 $perl->add_match_table("PosixCntrl",
f38f76ae 11854 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
11855 Initialize => $Cntrl & $ASCII,
11856 );
99870f4d
KW
11857
11858 # $controls is a temporary used to construct Graph.
11859 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11860 + $gc->table('Control'));
11861 # Cs not in release 1
11862 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11863
11864 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11865 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 11866 Description => 'Characters that are graphical',
99870f4d
KW
11867 Initialize => ~ ($Space + $controls),
11868 );
cbc24f92 11869 $Graph->add_alias('XPosixGraph');
ad5e8af1 11870 $perl->add_match_table("PosixGraph",
f38f76ae
KW
11871 Description =>
11872 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
11873 Initialize => $Graph & $ASCII,
11874 );
99870f4d 11875
3e20195b 11876 $print = $perl->add_match_table('Print',
ad5e8af1 11877 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 11878 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 11879 );
cbc24f92 11880 $print->add_alias('XPosixPrint');
ad5e8af1 11881 $perl->add_match_table("PosixPrint",
66fd7fd0 11882 Description =>
f38f76ae 11883 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 11884 Initialize => $print & $ASCII,
ad5e8af1 11885 );
99870f4d
KW
11886
11887 my $Punct = $perl->add_match_table('Punct');
11888 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11889
11890 # \p{punct} doesn't include the symbols, which posix does
cbc24f92
KW
11891 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
11892 Description => '\p{Punct} + ASCII-range \p{Symbol}',
11893 Initialize => $gc->table('Punctuation')
11894 + ($ASCII & $gc->table('Symbol')),
bb080638 11895 Perl_Extension => 1
cbc24f92 11896 );
bb080638 11897 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
f38f76ae 11898 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
cbc24f92 11899 Initialize => $ASCII & $XPosixPunct,
ad5e8af1 11900 );
99870f4d
KW
11901
11902 my $Digit = $perl->add_match_table('Digit',
f3a73f6e 11903 Description => '[0-9] + all other decimal digits');
99870f4d 11904 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
cbc24f92 11905 $Digit->add_alias('XPosixDigit');
ad5e8af1
KW
11906 my $PosixDigit = $perl->add_match_table("PosixDigit",
11907 Description => '[0-9]',
11908 Initialize => $Digit & $ASCII,
11909 );
99870f4d 11910
eadadd41
KW
11911 # Hex_Digit was not present in first release
11912 my $Xdigit = $perl->add_match_table('XDigit');
cbc24f92 11913 $Xdigit->add_alias('XPosixXDigit');
eadadd41
KW
11914 my $Hex = property_ref('Hex_Digit');
11915 if (defined $Hex && ! $Hex->is_empty) {
11916 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
11917 }
11918 else {
eadadd41
KW
11919 # (Have to use hex instead of e.g. '0', because could be running on an
11920 # non-ASCII machine, and we want the Unicode (ASCII) values)
11921 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11922 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11923 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d 11924 }
4efcc33b
KW
11925
11926 # AHex was not present in early releases
11927 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
11928 my $AHex = property_ref('ASCII_Hex_Digit');
11929 if (defined $AHex && ! $AHex->is_empty) {
11930 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
11931 }
11932 else {
11933 $PosixXDigit->initialize($Xdigit & $ASCII);
11934 }
11935 $PosixXDigit->add_description('[0-9A-Fa-f]');
99870f4d 11936
99870f4d
KW
11937 my $dt = property_ref('Decomposition_Type');
11938 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11939 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11940 Perl_Extension => 1,
d57ccc9a 11941 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
11942 );
11943
11944 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11945 # than SD appeared, construct it ourselves, based on the first release SD
8050d00f 11946 # was in. A pod entry is grandfathered in for it
33e96e72 11947 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
301ba948
KW
11948 Perl_Extension => 1,
11949 Fate => $INTERNAL_ONLY,
11950 Status => $DISCOURAGED);
99870f4d
KW
11951 my $soft_dotted = property_ref('Soft_Dotted');
11952 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11953 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11954 }
11955 else {
11956
11957 # This list came from 3.2 Soft_Dotted.
11958 $CanonDCIJ->initialize([ 0x0069,
11959 0x006A,
11960 0x012F,
11961 0x0268,
11962 0x0456,
11963 0x0458,
11964 0x1E2D,
11965 0x1ECB,
11966 ]);
11967 $CanonDCIJ = $CanonDCIJ & $Assigned;
11968 }
11969
f86864ac 11970 # These are used in Unicode's definition of \X
6ba2d696 11971 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1,
301ba948 11972 Fate => $INTERNAL_ONLY);
6ba2d696 11973 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
301ba948 11974 Fate => $INTERNAL_ONLY);
37e2e78e 11975
ee24a51c
KW
11976 # For backward compatibility, Perl has its own definition for IDStart
11977 # First, we include the underscore, and then the regular XID_Start also
11978 # have to be Words
11979 $perl->add_match_table('_Perl_IDStart',
11980 Perl_Extension => 1,
301ba948 11981 Fate => $INTERNAL_ONLY,
ee24a51c
KW
11982 Initialize =>
11983 ord('_')
11984 + (property_ref('XID_Start')->table('Y') & $Word)
11985 );
11986
99870f4d 11987 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 11988
678f13d5 11989 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
11990 # definition differs too much from the traditional Perl one to use.
11991 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11992
11993 # Note that assumes HST is defined; it came in an earlier release than
11994 # GCB. In the line below, two negatives means: yes hangul
11995 $begin += ~ property_ref('Hangul_Syllable_Type')
11996 ->table('Not_Applicable')
11997 + ~ ($gcb->table('Control')
11998 + $gcb->table('CR')
11999 + $gcb->table('LF'));
12000 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12001
12002 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12003 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
12004 }
12005 else { # Old definition, used on early releases.
f86864ac 12006 $extend += $gc->table('Mark')
37e2e78e
KW
12007 + 0x200C # ZWNJ
12008 + 0x200D; # ZWJ
12009 $begin += ~ $extend;
12010
12011 # Here we may have a release that has the regular grapheme cluster
12012 # defined, or a release that doesn't have anything defined.
12013 # We set things up so the Perl core degrades gracefully, possibly with
12014 # placeholders that match nothing.
12015
12016 if (! defined $gcb) {
12017 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
12018 }
12019 my $hst = property_ref('HST');
12020 if (!defined $hst) {
12021 $hst = Property->new('HST', Status => $PLACEHOLDER);
12022 $hst->add_match_table('Not_Applicable',
12023 Initialize => $Any,
12024 Matches_All => 1);
12025 }
12026
12027 # On some releases, here we may not have the needed tables for the
12028 # perl core, in some releases we may.
12029 foreach my $name (qw{ L LV LVT T V prepend }) {
12030 my $table = $gcb->table($name);
12031 if (! defined $table) {
12032 $table = $gcb->add_match_table($name);
12033 push @tables_that_may_be_empty, $table->complete_name;
12034 }
12035
12036 # The HST property predates the GCB one, and has identical tables
12037 # for some of them, so use it if we can.
12038 if ($table->is_empty
12039 && defined $hst
12040 && defined $hst->table($name))
12041 {
12042 $table += $hst->table($name);
12043 }
12044 }
12045 }
12046
12047 # More GCB. If we found some hangul syllables, populate a combined
12048 # table.
301ba948
KW
12049 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
12050 Perl_Extension => 1,
12051 Fate => $INTERNAL_ONLY);
37e2e78e
KW
12052 my $LV = $gcb->table('LV');
12053 if ($LV->is_empty) {
12054 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12055 } else {
12056 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12057 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
12058 }
12059
28093d0e 12060 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
12061 my @composition = ('Name', 'Unicode_1_Name');
12062
12063 if (@named_sequences) {
12064 push @composition, 'Named_Sequence';
12065 foreach my $sequence (@named_sequences) {
12066 $perl_charname->add_anomalous_entry($sequence);
12067 }
12068 }
12069
12070 my $alias_sentence = "";
12071 my $alias = property_ref('Name_Alias');
12072 if (defined $alias) {
12073 push @composition, 'Name_Alias';
12074 $alias->reset_each_range;
12075 while (my ($range) = $alias->each_range) {
12076 next if $range->value eq "";
12077 if ($range->start != $range->end) {
12078 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
12079 }
12080 $perl_charname->add_duplicate($range->start, $range->value);
12081 }
12082 $alias_sentence = <<END;
12083The Name_Alias property adds duplicate code point entries with a corrected
12084name. The original (less correct, but still valid) name will be physically
53d84487 12085last.
99870f4d
KW
12086END
12087 }
12088 my $comment;
12089 if (@composition <= 2) { # Always at least 2
12090 $comment = join " and ", @composition;
12091 }
12092 else {
12093 $comment = join ", ", @composition[0 .. scalar @composition - 2];
12094 $comment .= ", and $composition[-1]";
12095 }
12096
99870f4d
KW
12097 $perl_charname->add_comment(join_lines( <<END
12098This file is for charnames.pm. It is the union of the $comment properties.
12099Unicode_1_Name entries are used only for otherwise nameless code
12100points.
12101$alias_sentence
a03f0b9f
KW
12102This file doesn't include the algorithmically determinable names. For those,
12103use 'unicore/Name.pm'
12104END
12105 ));
12106 property_ref('Name')->add_comment(join_lines( <<END
12107This file doesn't include the algorithmically determinable names. For those,
12108use 'unicore/Name.pm'
99870f4d
KW
12109END
12110 ));
12111
99870f4d
KW
12112 # Construct the Present_In property from the Age property.
12113 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12114 my $default_map = $age->default_map;
12115 my $in = Property->new('In',
12116 Default_Map => $default_map,
12117 Full_Name => "Present_In",
99870f4d
KW
12118 Perl_Extension => 1,
12119 Type => $ENUM,
12120 Initialize => $age,
12121 );
12122 $in->add_comment(join_lines(<<END
c12f2655 12123THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
99870f4d
KW
12124same as for $age, and not for what $in really means. This is because anything
12125defined in a given release should have multiple values: that release and all
12126higher ones. But only one value per code point can be represented in a table
12127like this.
12128END
12129 ));
12130
12131 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
12132 # lowest numbered (earliest) come first, with the non-numeric one
12133 # last.
12134 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12135 ? 1
12136 : ($b->name !~ /^[\d.]*$/)
12137 ? -1
12138 : $a->name <=> $b->name
12139 } $age->tables;
12140
12141 # The Present_In property is the cumulative age properties. The first
12142 # one hence is identical to the first age one.
12143 my $previous_in = $in->add_match_table($first_age->name);
12144 $previous_in->set_equivalent_to($first_age, Related => 1);
12145
12146 my $description_start = "Code point's usage introduced in version ";
12147 $first_age->add_description($description_start . $first_age->name);
12148
98dc9551 12149 # To construct the accumulated values, for each of the age tables
99870f4d
KW
12150 # starting with the 2nd earliest, merge the earliest with it, to get
12151 # all those code points existing in the 2nd earliest. Repeat merging
12152 # the new 2nd earliest with the 3rd earliest to get all those existing
12153 # in the 3rd earliest, and so on.
12154 foreach my $current_age (@rest_ages) {
12155 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
12156
12157 my $current_in = $in->add_match_table(
12158 $current_age->name,
12159 Initialize => $current_age + $previous_in,
12160 Description => $description_start
12161 . $current_age->name
12162 . ' or earlier',
12163 );
12164 $previous_in = $current_in;
12165
12166 # Add clarifying material for the corresponding age file. This is
12167 # in part because of the confusing and contradictory information
12168 # given in the Standard's documentation itself, as of 5.2.
12169 $current_age->add_description(
12170 "Code point's usage was introduced in version "
12171 . $current_age->name);
12172 $current_age->add_note("See also $in");
12173
12174 }
12175
12176 # And finally the code points whose usages have yet to be decided are
12177 # the same in both properties. Note that permanently unassigned code
12178 # points actually have their usage assigned (as being permanently
12179 # unassigned), so that these tables are not the same as gc=cn.
12180 my $unassigned = $in->add_match_table($default_map);
12181 my $age_default = $age->table($default_map);
12182 $age_default->add_description(<<END
12183Code point's usage has not been assigned in any Unicode release thus far.
12184END
12185 );
12186 $unassigned->set_equivalent_to($age_default, Related => 1);
12187 }
12188
12189
12190 # Finished creating all the perl properties. All non-internal non-string
12191 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
12192 # an underscore.) These do not get a separate entry in the pod file
12193 foreach my $table ($perl->tables) {
12194 foreach my $alias ($table->aliases) {
12195 next if $alias->name =~ /^_/;
12196 $table->add_alias('Is_' . $alias->name,
33e96e72 12197 Re_Pod_Entry => 0,
99870f4d
KW
12198 Status => $alias->status,
12199 Externally_Ok => 0);
12200 }
12201 }
12202
c4019d52
KW
12203 # Here done with all the basic stuff. Ready to populate the information
12204 # about each character if annotating them.
558712cf 12205 if ($annotate) {
c4019d52
KW
12206
12207 # See comments at its declaration
12208 $annotate_ranges = Range_Map->new;
12209
12210 # This separates out the non-characters from the other unassigneds, so
12211 # can give different annotations for each.
12212 $unassigned_sans_noncharacters = Range_List->new(
12213 Initialize => $gc->table('Unassigned')
12214 & property_ref('Noncharacter_Code_Point')->table('N'));
12215
6189eadc 12216 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
c4019d52
KW
12217 $i = populate_char_info($i); # Note sets $i so may cause skips
12218 }
12219 }
12220
99870f4d
KW
12221 return;
12222}
12223
12224sub add_perl_synonyms() {
12225 # A number of Unicode tables have Perl synonyms that are expressed in
12226 # the single-form, \p{name}. These are:
12227 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12228 # \p{Is_Name} as synonyms
12229 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12230 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12231 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12232 # conflict, \p{Value} and \p{Is_Value} as well
12233 #
12234 # This routine generates these synonyms, warning of any unexpected
12235 # conflicts.
12236
12237 # Construct the list of tables to get synonyms for. Start with all the
12238 # binary and the General_Category ones.
06f26c45
KW
12239 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
12240 property_ref('*');
99870f4d
KW
12241 push @tables, $gc->tables;
12242
12243 # If the version of Unicode includes the Script property, add its tables
359523e2 12244 push @tables, $script->tables if defined $script;
99870f4d
KW
12245
12246 # The Block tables are kept separate because they are treated differently.
12247 # And the earliest versions of Unicode didn't include them, so add only if
12248 # there are some.
12249 my @blocks;
12250 push @blocks, $block->tables if defined $block;
12251
12252 # Here, have the lists of tables constructed. Process blocks last so that
12253 # if there are name collisions with them, blocks have lowest priority.
12254 # Should there ever be other collisions, manual intervention would be
12255 # required. See the comments at the beginning of the program for a
12256 # possible way to handle those semi-automatically.
12257 foreach my $table (@tables, @blocks) {
12258
12259 # For non-binary properties, the synonym is just the name of the
12260 # table, like Greek, but for binary properties the synonym is the name
12261 # of the property, and means the code points in its 'Y' table.
12262 my $nominal = $table;
12263 my $nominal_property = $nominal->property;
12264 my $actual;
12265 if (! $nominal->isa('Property')) {
12266 $actual = $table;
12267 }
12268 else {
12269
12270 # Here is a binary property. Use the 'Y' table. Verify that is
12271 # there
12272 my $yes = $nominal->table('Y');
12273 unless (defined $yes) { # Must be defined, but is permissible to
12274 # be empty.
12275 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
12276 next;
12277 }
12278 $actual = $yes;
12279 }
12280
12281 foreach my $alias ($nominal->aliases) {
12282
12283 # Attempt to create a table in the perl directory for the
12284 # candidate table, using whatever aliases in it that don't
12285 # conflict. Also add non-conflicting aliases for all these
12286 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12287 PREFIX:
12288 foreach my $prefix ("", 'Is_', 'In_') {
12289
12290 # Only Block properties can have added 'In_' aliases.
12291 next if $prefix eq 'In_' and $nominal_property != $block;
12292
12293 my $proposed_name = $prefix . $alias->name;
12294
12295 # No Is_Is, In_In, nor combinations thereof
12296 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12297 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12298
12299 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12300
12301 # Get a reference to any existing table in the perl
12302 # directory with the desired name.
12303 my $pre_existing = $perl->table($proposed_name);
12304
12305 if (! defined $pre_existing) {
12306
12307 # No name collision, so ok to add the perl synonym.
12308
33e96e72 12309 my $make_re_pod_entry;
99870f4d 12310 my $externally_ok;
4cd1260a 12311 my $status = $alias->status;
99870f4d
KW
12312 if ($nominal_property == $block) {
12313
12314 # For block properties, the 'In' form is preferred for
12315 # external use; the pod file contains wild cards for
12316 # this and the 'Is' form so no entries for those; and
12317 # we don't want people using the name without the
12318 # 'In', so discourage that.
12319 if ($prefix eq "") {
33e96e72 12320 $make_re_pod_entry = 1;
99870f4d
KW
12321 $status = $status || $DISCOURAGED;
12322 $externally_ok = 0;
12323 }
12324 elsif ($prefix eq 'In_') {
33e96e72 12325 $make_re_pod_entry = 0;
99870f4d
KW
12326 $status = $status || $NORMAL;
12327 $externally_ok = 1;
12328 }
12329 else {
33e96e72 12330 $make_re_pod_entry = 0;
99870f4d
KW
12331 $status = $status || $DISCOURAGED;
12332 $externally_ok = 0;
12333 }
12334 }
12335 elsif ($prefix ne "") {
12336
12337 # The 'Is' prefix is handled in the pod by a wild
12338 # card, and we won't use it for an external name
33e96e72 12339 $make_re_pod_entry = 0;
99870f4d
KW
12340 $status = $status || $NORMAL;
12341 $externally_ok = 0;
12342 }
12343 else {
12344
12345 # Here, is an empty prefix, non block. This gets its
12346 # own pod entry and can be used for an external name.
33e96e72 12347 $make_re_pod_entry = 1;
99870f4d
KW
12348 $status = $status || $NORMAL;
12349 $externally_ok = 1;
12350 }
12351
12352 # Here, there isn't a perl pre-existing table with the
12353 # name. Look through the list of equivalents of this
12354 # table to see if one is a perl table.
12355 foreach my $equivalent ($actual->leader->equivalents) {
12356 next if $equivalent->property != $perl;
12357
12358 # Here, have found a table for $perl. Add this alias
12359 # to it, and are done with this prefix.
12360 $equivalent->add_alias($proposed_name,
33e96e72 12361 Re_Pod_Entry => $make_re_pod_entry,
99870f4d
KW
12362 Status => $status,
12363 Externally_Ok => $externally_ok);
12364 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12365 next PREFIX;
12366 }
12367
12368 # Here, $perl doesn't already have a table that is a
12369 # synonym for this property, add one.
12370 my $added_table = $perl->add_match_table($proposed_name,
33e96e72 12371 Re_Pod_Entry => $make_re_pod_entry,
99870f4d
KW
12372 Status => $status,
12373 Externally_Ok => $externally_ok);
12374 # And it will be related to the actual table, since it is
12375 # based on it.
12376 $added_table->set_equivalent_to($actual, Related => 1);
12377 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12378 next;
12379 } # End of no pre-existing.
12380
12381 # Here, there is a pre-existing table that has the proposed
12382 # name. We could be in trouble, but not if this is just a
12383 # synonym for another table that we have already made a child
12384 # of the pre-existing one.
6505c6e2 12385 if ($pre_existing->is_set_equivalent_to($actual)) {
99870f4d
KW
12386 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12387 $pre_existing->add_alias($proposed_name);
12388 next;
12389 }
12390
12391 # Here, there is a name collision, but it still could be ok if
12392 # the tables match the identical set of code points, in which
12393 # case, we can combine the names. Compare each table's code
12394 # point list to see if they are identical.
12395 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12396 if ($pre_existing->matches_identically_to($actual)) {
12397
12398 # Here, they do match identically. Not a real conflict.
12399 # Make the perl version a child of the Unicode one, except
12400 # in the non-obvious case of where the perl name is
12401 # already a synonym of another Unicode property. (This is
12402 # excluded by the test for it being its own parent.) The
12403 # reason for this exclusion is that then the two Unicode
12404 # properties become related; and we don't really know if
12405 # they are or not. We generate documentation based on
12406 # relatedness, and this would be misleading. Code
12407 # later executed in the process will cause the tables to
12408 # be represented by a single file anyway, without making
12409 # it look in the pod like they are necessarily related.
12410 if ($pre_existing->parent == $pre_existing
12411 && ($pre_existing->property == $perl
12412 || $actual->property == $perl))
12413 {
12414 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12415 $pre_existing->set_equivalent_to($actual, Related => 1);
12416 }
12417 elsif (main::DEBUG && $to_trace) {
12418 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12419 trace $pre_existing->parent;
12420 }
12421 next PREFIX;
12422 }
12423
12424 # Here they didn't match identically, there is a real conflict
12425 # between our new name and a pre-existing property.
12426 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12427 $pre_existing->add_conflicting($nominal->full_name,
12428 'p',
12429 $actual);
12430
12431 # Don't output a warning for aliases for the block
12432 # properties (unless they start with 'In_') as it is
12433 # expected that there will be conflicts and the block
12434 # form loses.
12435 if ($verbosity >= $NORMAL_VERBOSITY
12436 && ($actual->property != $block || $prefix eq 'In_'))
12437 {
12438 print simple_fold(join_lines(<<END
12439There is already an alias named $proposed_name (from " . $pre_existing . "),
12440so not creating this alias for " . $actual
12441END
12442 ), "", 4);
12443 }
12444
12445 # Keep track for documentation purposes.
12446 $has_In_conflicts++ if $prefix eq 'In_';
12447 $has_Is_conflicts++ if $prefix eq 'Is_';
12448 }
12449 }
12450 }
12451
12452 # There are some properties which have No and Yes (and N and Y) as
12453 # property values, but aren't binary, and could possibly be confused with
12454 # binary ones. So create caveats for them. There are tables that are
12455 # named 'No', and tables that are named 'N', but confusion is not likely
12456 # unless they are the same table. For example, N meaning Number or
12457 # Neutral is not likely to cause confusion, so don't add caveats to things
12458 # like them.
06f26c45
KW
12459 foreach my $property (grep { $_->type != $BINARY
12460 && $_->type != $FORCED_BINARY }
12461 property_ref('*'))
12462 {
99870f4d
KW
12463 my $yes = $property->table('Yes');
12464 if (defined $yes) {
12465 my $y = $property->table('Y');
12466 if (defined $y && $yes == $y) {
12467 foreach my $alias ($property->aliases) {
12468 $yes->add_conflicting($alias->name);
12469 }
12470 }
12471 }
12472 my $no = $property->table('No');
12473 if (defined $no) {
12474 my $n = $property->table('N');
12475 if (defined $n && $no == $n) {
12476 foreach my $alias ($property->aliases) {
12477 $no->add_conflicting($alias->name, 'P');
12478 }
12479 }
12480 }
12481 }
12482
12483 return;
12484}
12485
12486sub register_file_for_name($$$) {
12487 # Given info about a table and a datafile that it should be associated
98dc9551 12488 # with, register that association
99870f4d
KW
12489
12490 my $table = shift;
12491 my $directory_ref = shift; # Array of the directory path for the file
e6ebc4c0 12492 my $file = shift; # The file name in the final directory.
99870f4d
KW
12493 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12494
12495 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12496
12497 if ($table->isa('Property')) {
12498 $table->set_file_path(@$directory_ref, $file);
12499 push @map_properties, $table
12500 if $directory_ref->[0] eq $map_directory;
12501 return;
12502 }
12503
12504 # Do all of the work for all equivalent tables when called with the leader
12505 # table, so skip if isn't the leader.
12506 return if $table->leader != $table;
12507
a92d5c2e
KW
12508 # If this is a complement of another file, use that other file instead,
12509 # with a ! prepended to it.
12510 my $complement;
12511 if (($complement = $table->complement) != 0) {
12512 my @directories = $complement->file_path;
12513
12514 # This assumes that the 0th element is something like 'lib',
12515 # the 1th element the property name (in its own directory), like
12516 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12517 # appended to it later.
12518 $directories[1] =~ s/^/!/;
12519 $file = pop @directories;
12520 $directory_ref =\@directories;
12521 }
12522
99870f4d
KW
12523 # Join all the file path components together, using slashes.
12524 my $full_filename = join('/', @$directory_ref, $file);
12525
12526 # All go in the same subdirectory of unicore
12527 if ($directory_ref->[0] ne $matches_directory) {
12528 Carp::my_carp("Unexpected directory in "
12529 . join('/', @{$directory_ref}, $file));
12530 }
12531
12532 # For this table and all its equivalents ...
12533 foreach my $table ($table, $table->equivalents) {
12534
12535 # Associate it with its file internally. Don't include the
12536 # $matches_directory first component
12537 $table->set_file_path(@$directory_ref, $file);
12538 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12539
12540 my $property = $table->property;
12541 $property = ($property == $perl)
12542 ? "" # 'perl' is never explicitly stated
12543 : standardize($property->name) . '=';
12544
12545 my $deprecated = ($table->status eq $DEPRECATED)
12546 ? $table->status_info
12547 : "";
d867ccfb 12548 my $caseless_equivalent = $table->caseless_equivalent;
99870f4d
KW
12549
12550 # And for each of the table's aliases... This inner loop eventually
12551 # goes through all aliases in the UCD that we generate regex match
12552 # files for
12553 foreach my $alias ($table->aliases) {
c85f591a 12554 my $standard = utf8_heavy_name($table, $alias);
99870f4d
KW
12555
12556 # Generate an entry in either the loose or strict hashes, which
12557 # will translate the property and alias names combination into the
12558 # file where the table for them is stored.
99870f4d 12559 if ($alias->loose_match) {
99870f4d
KW
12560 if (exists $loose_to_file_of{$standard}) {
12561 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12562 }
12563 else {
12564 $loose_to_file_of{$standard} = $sub_filename;
12565 }
12566 }
12567 else {
99870f4d
KW
12568 if (exists $stricter_to_file_of{$standard}) {
12569 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12570 }
12571 else {
12572 $stricter_to_file_of{$standard} = $sub_filename;
12573
12574 # Tightly coupled with how utf8_heavy.pl works, for a
12575 # floating point number that is a whole number, get rid of
12576 # the trailing decimal point and 0's, so that utf8_heavy
12577 # will work. Also note that this assumes that such a
12578 # number is matched strictly; so if that were to change,
12579 # this would be wrong.
c85f591a 12580 if ((my $integer_name = $alias->name)
99870f4d
KW
12581 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12582 {
12583 $stricter_to_file_of{$property . $integer_name}
c12f2655 12584 = $sub_filename;
99870f4d
KW
12585 }
12586 }
12587 }
12588
12589 # Keep a list of the deprecated properties and their filenames
a92d5c2e 12590 if ($deprecated && $complement == 0) {
99870f4d
KW
12591 $utf8::why_deprecated{$sub_filename} = $deprecated;
12592 }
d867ccfb
KW
12593
12594 # And a substitute table, if any, for case-insensitive matching
12595 if ($caseless_equivalent != 0) {
12596 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12597 }
99870f4d
KW
12598 }
12599 }
12600
12601 return;
12602}
12603
12604{ # Closure
12605 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12606 # conflicts
12607 my %full_dir_name_of; # Full length names of directories used.
12608
12609 sub construct_filename($$$) {
12610 # Return a file name for a table, based on the table name, but perhaps
12611 # changed to get rid of non-portable characters in it, and to make
12612 # sure that it is unique on a file system that allows the names before
12613 # any period to be at most 8 characters (DOS). While we're at it
12614 # check and complain if there are any directory conflicts.
12615
12616 my $name = shift; # The name to start with
12617 my $mutable = shift; # Boolean: can it be changed? If no, but
12618 # yet it must be to work properly, a warning
12619 # is given
12620 my $directories_ref = shift; # A reference to an array containing the
12621 # path to the file, with each element one path
12622 # component. This is used because the same
12623 # name can be used in different directories.
12624 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12625
12626 my $warn = ! defined wantarray; # If true, then if the name is
12627 # changed, a warning is issued as well.
12628
12629 if (! defined $name) {
12630 Carp::my_carp("Undefined name in directory "
12631 . File::Spec->join(@$directories_ref)
12632 . ". '_' used");
12633 return '_';
12634 }
12635
12636 # Make sure that no directory names conflict with each other. Look at
12637 # each directory in the input file's path. If it is already in use,
12638 # assume it is correct, and is merely being re-used, but if we
12639 # truncate it to 8 characters, and find that there are two directories
12640 # that are the same for the first 8 characters, but differ after that,
12641 # then that is a problem.
12642 foreach my $directory (@$directories_ref) {
12643 my $short_dir = substr($directory, 0, 8);
12644 if (defined $full_dir_name_of{$short_dir}) {
12645 next if $full_dir_name_of{$short_dir} eq $directory;
12646 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12647 }
12648 else {
12649 $full_dir_name_of{$short_dir} = $directory;
12650 }
12651 }
12652
12653 my $path = join '/', @$directories_ref;
12654 $path .= '/' if $path;
12655
12656 # Remove interior underscores.
12657 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12658
12659 # Change any non-word character into an underscore, and truncate to 8.
12660 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12661 substr($filename, 8) = "" if length($filename) > 8;
12662
12663 # Make sure the basename doesn't conflict with something we
12664 # might have already written. If we have, say,
12665 # InGreekExtended1
12666 # InGreekExtended2
12667 # they become
12668 # InGreekE
12669 # InGreek2
12670 my $warned = 0;
12671 while (my $num = $base_names{$path}{lc $filename}++) {
12672 $num++; # so basenames with numbers start with '2', which
12673 # just looks more natural.
12674
12675 # Want to append $num, but if it'll make the basename longer
12676 # than 8 characters, pre-truncate $filename so that the result
12677 # is acceptable.
12678 my $delta = length($filename) + length($num) - 8;
12679 if ($delta > 0) {
12680 substr($filename, -$delta) = $num;
12681 }
12682 else {
12683 $filename .= $num;
12684 }
12685 if ($warn && ! $warned) {
12686 $warned = 1;
12687 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12688 }
12689 }
12690
12691 return $filename if $mutable;
12692
12693 # If not changeable, must return the input name, but warn if needed to
12694 # change it beyond shortening it.
12695 if ($name ne $filename
12696 && substr($name, 0, length($filename)) ne $filename) {
12697 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12698 }
12699 return $name;
12700 }
12701}
12702
12703# The pod file contains a very large table. Many of the lines in that table
12704# would exceed a typical output window's size, and so need to be wrapped with
12705# a hanging indent to make them look good. The pod language is really
12706# insufficient here. There is no general construct to do that in pod, so it
12707# is done here by beginning each such line with a space to cause the result to
12708# be output without formatting, and doing all the formatting here. This leads
12709# to the result that if the eventual display window is too narrow it won't
12710# look good, and if the window is too wide, no advantage is taken of that
12711# extra width. A further complication is that the output may be indented by
12712# the formatter so that there is less space than expected. What I (khw) have
12713# done is to assume that that indent is a particular number of spaces based on
12714# what it is in my Linux system; people can always resize their windows if
12715# necessary, but this is obviously less than desirable, but the best that can
12716# be expected.
12717my $automatic_pod_indent = 8;
12718
12719# Try to format so that uses fewest lines, but few long left column entries
12720# slide into the right column. An experiment on 5.1 data yielded the
12721# following percentages that didn't cut into the other side along with the
12722# associated first-column widths
12723# 69% = 24
12724# 80% not too bad except for a few blocks
12725# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12726# 95% = 37;
12727my $indent_info_column = 27; # 75% of lines didn't have overlap
12728
12729my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12730 # The 3 is because of:
12731 # 1 for the leading space to tell the pod formatter to
12732 # output as-is
12733 # 1 for the flag
12734 # 1 for the space between the flag and the main data
12735
12736sub format_pod_line ($$$;$$) {
12737 # Take a pod line and return it, formatted properly
12738
12739 my $first_column_width = shift;
12740 my $entry = shift; # Contents of left column
12741 my $info = shift; # Contents of right column
12742
12743 my $status = shift || ""; # Any flag
12744
12745 my $loose_match = shift; # Boolean.
12746 $loose_match = 1 unless defined $loose_match;
12747
12748 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12749
12750 my $flags = "";
12751 $flags .= $STRICTER if ! $loose_match;
12752
12753 $flags .= $status if $status;
12754
12755 # There is a blank in the left column to cause the pod formatter to
12756 # output the line as-is.
12757 return sprintf " %-*s%-*s %s\n",
12758 # The first * in the format is replaced by this, the -1 is
12759 # to account for the leading blank. There isn't a
12760 # hard-coded blank after this to separate the flags from
12761 # the rest of the line, so that in the unlikely event that
12762 # multiple flags are shown on the same line, they both
12763 # will get displayed at the expense of that separation,
12764 # but since they are left justified, a blank will be
12765 # inserted in the normal case.
12766 $FILLER - 1,
12767 $flags,
12768
12769 # The other * in the format is replaced by this number to
12770 # cause the first main column to right fill with blanks.
12771 # The -1 is for the guaranteed blank following it.
12772 $first_column_width - $FILLER - 1,
12773 $entry,
12774 $info;
12775}
12776
12777my @zero_match_tables; # List of tables that have no matches in this release
12778
12779sub make_table_pod_entries($) {
12780 # This generates the entries for the pod file for a given table.
12781 # Also done at this time are any children tables. The output looks like:
12782 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
12783
12784 my $input_table = shift; # Table the entry is for
12785 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12786
12787 # Generate parent and all its children at the same time.
12788 return if $input_table->parent != $input_table;
12789
12790 my $property = $input_table->property;
12791 my $type = $property->type;
12792 my $full_name = $property->full_name;
12793
12794 my $count = $input_table->count;
12795 my $string_count = clarify_number($count);
12796 my $status = $input_table->status;
12797 my $status_info = $input_table->status_info;
56ca34ca 12798 my $caseless_equivalent = $input_table->caseless_equivalent;
99870f4d
KW
12799
12800 my $entry_for_first_table; # The entry for the first table output.
12801 # Almost certainly, it is the parent.
12802
12803 # For each related table (including itself), we will generate a pod entry
12804 # for each name each table goes by
12805 foreach my $table ($input_table, $input_table->children) {
12806
d4da3f74
KW
12807 # utf8_heavy.pl cannot deal with null string property values, so skip
12808 # any tables that have no non-null names.
12809 next if ! grep { $_->name ne "" } $table->aliases;
99870f4d
KW
12810
12811 # First, gather all the info that applies to this table as a whole.
12812
12813 push @zero_match_tables, $table if $count == 0;
12814
12815 my $table_property = $table->property;
12816
12817 # The short name has all the underscores removed, while the full name
12818 # retains them. Later, we decide whether to output a short synonym
12819 # for the full one, we need to compare apples to apples, so we use the
12820 # short name's length including underscores.
12821 my $table_property_short_name_length;
12822 my $table_property_short_name
12823 = $table_property->short_name(\$table_property_short_name_length);
12824 my $table_property_full_name = $table_property->full_name;
12825
12826 # Get how much savings there is in the short name over the full one
12827 # (delta will always be <= 0)
12828 my $table_property_short_delta = $table_property_short_name_length
12829 - length($table_property_full_name);
12830 my @table_description = $table->description;
12831 my @table_note = $table->note;
12832
12833 # Generate an entry for each alias in this table.
12834 my $entry_for_first_alias; # saves the first one encountered.
12835 foreach my $alias ($table->aliases) {
12836
12837 # Skip if not to go in pod.
33e96e72 12838 next unless $alias->make_re_pod_entry;
99870f4d
KW
12839
12840 # Start gathering all the components for the entry
12841 my $name = $alias->name;
12842
d4da3f74
KW
12843 # Skip if name is empty, as can't be accessed by regexes.
12844 next if $name eq "";
12845
99870f4d
KW
12846 my $entry; # Holds the left column, may include extras
12847 my $entry_ref; # To refer to the left column's contents from
12848 # another entry; has no extras
12849
12850 # First the left column of the pod entry. Tables for the $perl
12851 # property always use the single form.
12852 if ($table_property == $perl) {
12853 $entry = "\\p{$name}";
12854 $entry_ref = "\\p{$name}";
12855 }
12856 else { # Compound form.
12857
12858 # Only generate one entry for all the aliases that mean true
12859 # or false in binary properties. Append a '*' to indicate
12860 # some are missing. (The heading comment notes this.)
60e471b3 12861 my $rhs;
99870f4d
KW
12862 if ($type == $BINARY) {
12863 next if $name ne 'N' && $name ne 'Y';
60e471b3 12864 $rhs = "$name*";
99870f4d 12865 }
06f26c45 12866 elsif ($type != $FORCED_BINARY) {
60e471b3 12867 $rhs = $name;
99870f4d 12868 }
06f26c45
KW
12869 else {
12870
12871 # Forced binary properties require special handling. It
12872 # has two sets of tables, one set is true/false; and the
12873 # other set is everything else. Entries are generated for
12874 # each set. Use the Bidi_Mirrored property (which appears
12875 # in all Unicode versions) to get a list of the aliases
12876 # for the true/false tables. Of these, only output the N
12877 # and Y ones, the same as, a regular binary property. And
12878 # output all the rest, same as a non-binary property.
12879 my $bm = property_ref("Bidi_Mirrored");
12880 if ($name eq 'N' || $name eq 'Y') {
12881 $rhs = "$name*";
12882 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
12883 $bm->table("N")->aliases)
12884 {
12885 next;
12886 }
12887 else {
12888 $rhs = $name;
12889 }
12890 }
99870f4d
KW
12891
12892 # Colon-space is used to give a little more space to be easier
12893 # to read;
12894 $entry = "\\p{"
12895 . $table_property_full_name
60e471b3 12896 . ": $rhs}";
99870f4d
KW
12897
12898 # But for the reference to this entry, which will go in the
12899 # right column, where space is at a premium, use equals
12900 # without a space
12901 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
12902 }
12903
12904 # Then the right (info) column. This is stored as components of
12905 # an array for the moment, then joined into a string later. For
12906 # non-internal only properties, begin the info with the entry for
12907 # the first table we encountered (if any), as things are ordered
12908 # so that that one is the most descriptive. This leads to the
12909 # info column of an entry being a more descriptive version of the
12910 # name column
12911 my @info;
12912 if ($name =~ /^_/) {
12913 push @info,
12914 '(For internal use by Perl, not necessarily stable)';
12915 }
12916 elsif ($entry_for_first_alias) {
12917 push @info, $entry_for_first_alias;
12918 }
12919
12920 # If this entry is equivalent to another, add that to the info,
12921 # using the first such table we encountered
12922 if ($entry_for_first_table) {
12923 if (@info) {
12924 push @info, "(= $entry_for_first_table)";
12925 }
12926 else {
12927 push @info, $entry_for_first_table;
12928 }
12929 }
12930
12931 # If the name is a large integer, add an equivalent with an
12932 # exponent for better readability
12933 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12934 push @info, sprintf "(= %.1e)", $name
12935 }
12936
12937 my $parenthesized = "";
12938 if (! $entry_for_first_alias) {
12939
12940 # This is the first alias for the current table. The alias
12941 # array is ordered so that this is the fullest, most
12942 # descriptive alias, so it gets the fullest info. The other
12943 # aliases are mostly merely pointers to this one, using the
12944 # information already added above.
12945
12946 # Display any status message, but only on the parent table
12947 if ($status && ! $entry_for_first_table) {
12948 push @info, $status_info;
12949 }
12950
12951 # Put out any descriptive info
12952 if (@table_description || @table_note) {
12953 push @info, join "; ", @table_description, @table_note;
12954 }
12955
12956 # Look to see if there is a shorter name we can point people
12957 # at
12958 my $standard_name = standardize($name);
12959 my $short_name;
12960 my $proposed_short = $table->short_name;
12961 if (defined $proposed_short) {
12962 my $standard_short = standardize($proposed_short);
12963
12964 # If the short name is shorter than the standard one, or
12965 # even it it's not, but the combination of it and its
12966 # short property name (as in \p{prop=short} ($perl doesn't
12967 # have this form)) saves at least two characters, then,
12968 # cause it to be listed as a shorter synonym.
12969 if (length $standard_short < length $standard_name
12970 || ($table_property != $perl
12971 && (length($standard_short)
12972 - length($standard_name)
12973 + $table_property_short_delta) # (<= 0)
12974 < -2))
12975 {
12976 $short_name = $proposed_short;
12977 if ($table_property != $perl) {
12978 $short_name = $table_property_short_name
12979 . "=$short_name";
12980 }
12981 $short_name = "\\p{$short_name}";
12982 }
12983 }
12984
12985 # And if this is a compound form name, see if there is a
12986 # single form equivalent
12987 my $single_form;
12988 if ($table_property != $perl) {
12989
12990 # Special case the binary N tables, so that will print
12991 # \P{single}, but use the Y table values to populate
c12f2655 12992 # 'single', as we haven't likewise populated the N table.
06f26c45
KW
12993 # For forced binary tables, we can't just look at the N
12994 # table, but must see if this table is equivalent to the N
12995 # one, as there are two equivalent beasts in these
12996 # properties.
99870f4d
KW
12997 my $test_table;
12998 my $p;
06f26c45
KW
12999 if ( ($type == $BINARY
13000 && $input_table == $property->table('No'))
13001 || ($type == $FORCED_BINARY
13002 && $property->table('No')->
13003 is_set_equivalent_to($input_table)))
99870f4d
KW
13004 {
13005 $test_table = $property->table('Yes');
13006 $p = 'P';
13007 }
13008 else {
13009 $test_table = $input_table;
13010 $p = 'p';
13011 }
13012
13013 # Look for a single form amongst all the children.
13014 foreach my $table ($test_table->children) {
13015 next if $table->property != $perl;
13016 my $proposed_name = $table->short_name;
13017 next if ! defined $proposed_name;
13018
13019 # Don't mention internal-only properties as a possible
13020 # single form synonym
13021 next if substr($proposed_name, 0, 1) eq '_';
13022
13023 $proposed_name = "\\$p\{$proposed_name}";
13024 if (! defined $single_form
13025 || length($proposed_name) < length $single_form)
13026 {
13027 $single_form = $proposed_name;
13028
13029 # The goal here is to find a single form; not the
13030 # shortest possible one. We've already found a
13031 # short name. So, stop at the first single form
13032 # found, which is likely to be closer to the
13033 # original.
13034 last;
13035 }
13036 }
13037 }
13038
13039 # Ouput both short and single in the same parenthesized
13040 # expression, but with only one of 'Single', 'Short' if there
13041 # are both items.
13042 if ($short_name || $single_form || $table->conflicting) {
99870f4d
KW
13043 $parenthesized .= "Short: $short_name" if $short_name;
13044 if ($short_name && $single_form) {
13045 $parenthesized .= ', ';
13046 }
13047 elsif ($single_form) {
13048 $parenthesized .= 'Single: ';
13049 }
13050 $parenthesized .= $single_form if $single_form;
13051 }
13052 }
13053
56ca34ca
KW
13054 if ($caseless_equivalent != 0) {
13055 $parenthesized .= '; ' if $parenthesized ne "";
13056 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
13057 }
13058
99870f4d
KW
13059
13060 # Warn if this property isn't the same as one that a
13061 # semi-casual user might expect. The other components of this
13062 # parenthesized structure are calculated only for the first entry
13063 # for this table, but the conflicting is deemed important enough
13064 # to go on every entry.
13065 my $conflicting = join " NOR ", $table->conflicting;
13066 if ($conflicting) {
e5228720 13067 $parenthesized .= '; ' if $parenthesized ne "";
99870f4d
KW
13068 $parenthesized .= "NOT $conflicting";
13069 }
99870f4d 13070
e5228720 13071 push @info, "($parenthesized)" if $parenthesized;
d57ccc9a 13072
0f88d393
KW
13073 if ($name =~ /_$/ && $alias->loose_match) {
13074 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13075 }
13076
d57ccc9a
KW
13077 if ($table_property != $perl && $table->perl_extension) {
13078 push @info, '(Perl extension)';
13079 }
2cf724d4 13080 push @info, "($string_count)";
99870f4d
KW
13081
13082 # Now, we have both the entry and info so add them to the
13083 # list of all the properties.
13084 push @match_properties,
13085 format_pod_line($indent_info_column,
13086 $entry,
13087 join( " ", @info),
13088 $alias->status,
13089 $alias->loose_match);
13090
13091 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13092 } # End of looping through the aliases for this table.
13093
13094 if (! $entry_for_first_table) {
13095 $entry_for_first_table = $entry_for_first_alias;
13096 }
13097 } # End of looping through all the related tables
13098 return;
13099}
13100
13101sub pod_alphanumeric_sort {
13102 # Sort pod entries alphanumerically.
13103
99f78760
KW
13104 # The first few character columns are filler, plus the '\p{'; and get rid
13105 # of all the trailing stuff, starting with the trailing '}', so as to sort
13106 # on just 'Name=Value'
13107 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 13108 $a =~ s/}.*//;
99f78760 13109 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
13110 $b =~ s/}.*//;
13111
99f78760
KW
13112 # Determine if the two operands are both internal only or both not.
13113 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13114 # should be the underscore that begins internal only
13115 my $a_is_internal = (substr($a, 0, 1) eq '_');
13116 my $b_is_internal = (substr($b, 0, 1) eq '_');
13117
13118 # Sort so the internals come last in the table instead of first (which the
13119 # leading underscore would otherwise indicate).
13120 if ($a_is_internal != $b_is_internal) {
13121 return 1 if $a_is_internal;
13122 return -1
13123 }
13124
99870f4d 13125 # Determine if the two operands are numeric property values or not.
99f78760 13126 # A numeric property will look like xyz: 3. But the number
99870f4d 13127 # can begin with an optional minus sign, and may have a
99f78760 13128 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
13129 # isn't numeric, use alphabetic sort.
13130 my ($a_initial, $a_number) =
99f78760 13131 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13132 return $a cmp $b unless defined $a_number;
13133 my ($b_initial, $b_number) =
99f78760 13134 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13135 return $a cmp $b unless defined $b_number;
13136
13137 # Here they are both numeric, but use alphabetic sort if the
13138 # initial parts don't match
13139 return $a cmp $b if $a_initial ne $b_initial;
13140
13141 # Convert rationals to floating for the comparison.
13142 $a_number = eval $a_number if $a_number =~ qr{/};
13143 $b_number = eval $b_number if $b_number =~ qr{/};
13144
13145 return $a_number <=> $b_number;
13146}
13147
13148sub make_pod () {
13149 # Create the .pod file. This generates the various subsections and then
13150 # combines them in one big HERE document.
13151
07c070a8
KW
13152 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
13153
99870f4d
KW
13154 return unless defined $pod_directory;
13155 print "Making pod file\n" if $verbosity >= $PROGRESS;
13156
13157 my $exception_message =
13158 '(Any exceptions are individually noted beginning with the word NOT.)';
13159 my @block_warning;
13160 if (-e 'Blocks.txt') {
13161
13162 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
13163 # if the global $has_In_conflicts indicates we have them.
13164 push @match_properties, format_pod_line($indent_info_column,
13165 '\p{In_*}',
13166 '\p{Block: *}'
13167 . (($has_In_conflicts)
13168 ? " $exception_message"
13169 : ""));
13170 @block_warning = << "END";
13171
77173124
KW
13172Matches in the Block property have shortcuts that begin with "In_". For
13173example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
13174backward compatibility, if there is no conflict with another shortcut, these
13175may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
13176are numerous such conflicting shortcuts. Use of these forms for Block is
13177discouraged, and are flagged as such, not only because of the potential
13178confusion as to what is meant, but also because a later release of Unicode may
13179preempt the shortcut, and your program would no longer be correct. Use the
13180"In_" form instead to avoid this, or even more clearly, use the compound form,
13181e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
13182about this.
99870f4d
KW
13183END
13184 }
07c070a8 13185 my $text = $Is_flags_text;
99870f4d
KW
13186 $text = "$exception_message $text" if $has_Is_conflicts;
13187
13188 # And the 'Is_ line';
13189 push @match_properties, format_pod_line($indent_info_column,
13190 '\p{Is_*}',
13191 "\\p{*} $text");
13192
13193 # Sort the properties array for output. It is sorted alphabetically
13194 # except numerically for numeric properties, and only output unique lines.
13195 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13196
13197 my $formatted_properties = simple_fold(\@match_properties,
13198 "",
13199 # indent succeeding lines by two extra
13200 # which looks better
13201 $indent_info_column + 2,
13202
13203 # shorten the line length by how much
13204 # the formatter indents, so the folded
13205 # line will fit in the space
13206 # presumably available
13207 $automatic_pod_indent);
13208 # Add column headings, indented to be a little more centered, but not
13209 # exactly
13210 $formatted_properties = format_pod_line($indent_info_column,
13211 ' NAME',
13212 ' INFO')
13213 . "\n"
13214 . $formatted_properties;
13215
13216 # Generate pod documentation lines for the tables that match nothing
0090c5d1 13217 my $zero_matches = "";
99870f4d
KW
13218 if (@zero_match_tables) {
13219 @zero_match_tables = uniques(@zero_match_tables);
13220 $zero_matches = join "\n\n",
13221 map { $_ = '=item \p{' . $_->complete_name . "}" }
13222 sort { $a->complete_name cmp $b->complete_name }
c0de960f 13223 @zero_match_tables;
99870f4d
KW
13224
13225 $zero_matches = <<END;
13226
77173124 13227=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
99870f4d
KW
13228
13229Unicode has some property-value pairs that currently don't match anything.
c12f2655
KW
13230This happens generally either because they are obsolete, or they exist for
13231symmetry with other forms, but no language has yet been encoded that uses
13232them. In this version of Unicode, the following match zero code points:
99870f4d
KW
13233
13234=over 4
13235
13236$zero_matches
13237
13238=back
13239
13240END
13241 }
13242
13243 # Generate list of properties that we don't accept, grouped by the reasons
13244 # why. This is so only put out the 'why' once, and then list all the
13245 # properties that have that reason under it.
13246
13247 my %why_list; # The keys are the reasons; the values are lists of
13248 # properties that have the key as their reason
13249
13250 # For each property, add it to the list that are suppressed for its reason
13251 # The sort will cause the alphabetically first properties to be added to
13252 # each list first, so each list will be sorted.
13253 foreach my $property (sort keys %why_suppressed) {
13254 push @{$why_list{$why_suppressed{$property}}}, $property;
13255 }
13256
13257 # For each reason (sorted by the first property that has that reason)...
13258 my @bad_re_properties;
13259 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13260 keys %why_list)
13261 {
54ce19c9 13262 # Add to the output, all the properties that have that reason.
99870f4d
KW
13263 my $has_item = 0; # Flag if actually output anything.
13264 foreach my $name (@{$why_list{$why}}) {
13265
13266 # Split compound names into $property and $table components
13267 my $property = $name;
13268 my $table;
13269 if ($property =~ / (.*) = (.*) /x) {
13270 $property = $1;
13271 $table = $2;
13272 }
13273
13274 # This release of Unicode may not have a property that is
13275 # suppressed, so don't reference a non-existent one.
13276 $property = property_ref($property);
13277 next if ! defined $property;
13278
13279 # And since this list is only for match tables, don't list the
13280 # ones that don't have match tables.
13281 next if ! $property->to_create_match_tables;
13282
13283 # Find any abbreviation, and turn it into a compound name if this
13284 # is a property=value pair.
13285 my $short_name = $property->name;
13286 $short_name .= '=' . $property->table($table)->name if $table;
13287
54ce19c9
KW
13288 # Start with an empty line.
13289 push @bad_re_properties, "\n\n" unless $has_item;
13290
99870f4d
KW
13291 # And add the property as an item for the reason.
13292 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13293 $has_item = 1;
13294 }
13295
13296 # And add the reason under the list of properties, if such a list
13297 # actually got generated. Note that the header got added
13298 # unconditionally before. But pod ignores extra blank lines, so no
13299 # harm.
13300 push @bad_re_properties, "\n$why\n" if $has_item;
13301
13302 } # End of looping through each reason.
13303
54ce19c9
KW
13304 if (! @bad_re_properties) {
13305 push @bad_re_properties,
13306 "*** This installation accepts ALL non-Unihan properties ***";
13307 }
13308 else {
13309 # Add =over only if non-empty to avoid an empty =over/=back section,
13310 # which is considered bad form.
13311 unshift @bad_re_properties, "\n=over 4\n";
13312 push @bad_re_properties, "\n=back\n";
13313 }
13314
8d099389
KW
13315 # Similiarly, generate a list of files that we don't use, grouped by the
13316 # reasons why. First, create a hash whose keys are the reasons, and whose
13317 # values are anonymous arrays of all the files that share that reason.
13318 my %grouped_by_reason;
13319 foreach my $file (keys %ignored_files) {
13320 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
13321 }
13322
13323 # Then, sort each group.
13324 foreach my $group (keys %grouped_by_reason) {
13325 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
13326 @{$grouped_by_reason{$group}} ;
13327 }
13328
13329 # Finally, create the output text. For each reason (sorted by the
13330 # alphabetically first file that has that reason)...
13331 my @unused_files;
13332 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
13333 cmp lc $grouped_by_reason{$b}->[0]
13334 }
13335 keys %grouped_by_reason)
13336 {
13337 # Add all the files that have that reason to the output. Start
13338 # with an empty line.
13339 push @unused_files, "\n\n";
13340 push @unused_files, map { "\n=item F<$_> \n" }
13341 @{$grouped_by_reason{$reason}};
13342 # And add the reason under the list of files
13343 push @unused_files, "\n$reason\n";
13344 }
13345
99870f4d
KW
13346 # Generate a list of the properties whose map table we output, from the
13347 # global @map_properties.
13348 my @map_tables_actually_output;
13349 my $info_indent = 20; # Left column is narrower than \p{} table.
13350 foreach my $property (@map_properties) {
13351
13352 # Get the path to the file; don't output any not in the standard
13353 # directory.
13354 my @path = $property->file_path;
13355 next if $path[0] ne $map_directory;
8572ace0
KW
13356
13357 # Don't mention map tables that are for internal-use only
13358 next if $property->to_output_map == $INTERNAL_MAP;
13359
99870f4d
KW
13360 shift @path; # Remove the standard name
13361
13362 my $file = join '/', @path; # In case is in sub directory
13363 my $info = $property->full_name;
13364 my $short_name = $property->name;
13365 if ($info ne $short_name) {
13366 $info .= " ($short_name)";
13367 }
13368 foreach my $more_info ($property->description,
13369 $property->note,
13370 $property->status_info)
13371 {
13372 next unless $more_info;
13373 $info =~ s/\.\Z//;
13374 $info .= ". $more_info";
13375 }
13376 push @map_tables_actually_output, format_pod_line($info_indent,
13377 $file,
13378 $info,
13379 $property->status);
13380 }
13381
13382 # Sort alphabetically, and fold for output
13383 @map_tables_actually_output = sort
13384 pod_alphanumeric_sort @map_tables_actually_output;
13385 @map_tables_actually_output
13386 = simple_fold(\@map_tables_actually_output,
13387 ' ',
13388 $info_indent,
13389 $automatic_pod_indent);
13390
13391 # Generate a list of the formats that can appear in the map tables.
13392 my @map_table_formats;
13393 foreach my $format (sort keys %map_table_formats) {
78632ea9
KW
13394 push @map_table_formats,
13395 Text::Tabs::expand("$format\t$map_table_formats{$format}\n");
99870f4d 13396 }
78632ea9
KW
13397 @map_table_formats = simple_fold(\@map_table_formats,
13398 ' ',
13399 8,
13400 $automatic_pod_indent);
12916dad
MS
13401 local $" = "";
13402
99870f4d
KW
13403 # Everything is ready to assemble.
13404 my @OUT = << "END";
13405=begin comment
13406
13407$HEADER
13408
13409To change this file, edit $0 instead.
13410
13411=end comment
13412
13413=head1 NAME
13414
8d099389 13415$pod_file - Index of Unicode Version $string_version character properties in Perl
99870f4d
KW
13416
13417=head1 DESCRIPTION
13418
8d099389
KW
13419This document provides information about the portion of the Unicode database
13420that deals with character properties, that is the portion that is defined on
13421single code points. (L</Other information in the Unicode data base>
13422below briefly mentions other data that Unicode provides.)
99870f4d 13423
8d099389
KW
13424Perl can provide access to all non-provisional Unicode character properties,
13425though not all are enabled by default. The omitted ones are the Unihan
13426properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
13427deprecated or Unicode-internal properties. (An installation may choose to
13428recompile Perl's tables to change this. See L<Unicode regular expression
13429properties that are NOT accepted by Perl>.)
13430
13431Perl also provides some additional extensions and short-cut synonyms
13432for Unicode properties.
99870f4d
KW
13433
13434This document merely lists all available properties and does not attempt to
13435explain what each property really means. There is a brief description of each
043f3b3f
KW
13436Perl extension; see L<perlunicode/Other Properties> for more information on
13437these. There is some detail about Blocks, Scripts, General_Category,
99870f4d 13438and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
043f3b3f
KW
13439official Unicode properties, refer to the Unicode standard. A good starting
13440place is L<$unicode_reference_url>.
99870f4d
KW
13441
13442Note that you can define your own properties; see
13443L<perlunicode/"User-Defined Character Properties">.
13444
77173124 13445=head1 Properties accessible through C<\\p{}> and C<\\P{}>
99870f4d 13446
77173124
KW
13447The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13448most of the Unicode character properties. The table below shows all these
13449constructs, both single and compound forms.
99870f4d
KW
13450
13451B<Compound forms> consist of two components, separated by an equals sign or a
13452colon. The first component is the property name, and the second component is
13453the particular value of the property to match against, for example,
77173124 13454C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
99870f4d
KW
13455whose Script property is Greek.
13456
77173124 13457B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
99870f4d 13458their equivalent compound forms. The table shows these equivalences. (In our
77173124 13459example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
99870f4d 13460There are also a few Perl-defined single forms that are not shortcuts for a
77173124 13461compound form. One such is C<\\p{Word}>. These are also listed in the table.
99870f4d
KW
13462
13463In parsing these constructs, Perl always ignores Upper/lower case differences
77173124
KW
13464everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
13465C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
13466the left brace completely changes the meaning of the construct, from "match"
13467(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
13468for improved legibility.
99870f4d
KW
13469
13470Also, white space, hyphens, and underscores are also normally ignored
13471everywhere between the {braces}, and hence can be freely added or removed
13472even if the C</x> modifier hasn't been specified on the regular expression.
13473But $a_bold_stricter at the beginning of an entry in the table below
13474means that tighter (stricter) rules are used for that entry:
13475
13476=over 4
13477
77173124 13478=item Single form (C<\\p{name}>) tighter rules:
99870f4d
KW
13479
13480White space, hyphens, and underscores ARE significant
13481except for:
13482
13483=over 4
13484
13485=item * white space adjacent to a non-word character
13486
13487=item * underscores separating digits in numbers
13488
13489=back
13490
13491That means, for example, that you can freely add or remove white space
13492adjacent to (but within) the braces without affecting the meaning.
13493
77173124 13494=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
99870f4d
KW
13495
13496The tighter rules given above for the single form apply to everything to the
13497right of the colon or equals; the looser rules still apply to everything to
13498the left.
13499
13500That means, for example, that you can freely add or remove white space
13501adjacent to (but within) the braces and the colon or equal sign.
13502
13503=back
13504
78bb419c
KW
13505Some properties are considered obsolete by Unicode, but still available.
13506There are several varieties of obsolescence:
99870f4d
KW
13507
13508=over 4
13509
99870f4d
KW
13510=item Stabilized
13511
f8c38b14 13512A property may be stabilized. Such a determination does not indicate
5f7264c7
KW
13513that the property should or should not be used; instead it is a declaration
13514that the property will not be maintained nor extended for newly encoded
13515characters. Such properties are marked with $a_bold_stabilized in the
13516table.
99870f4d
KW
13517
13518=item Deprecated
13519
f8c38b14 13520A property may be deprecated, perhaps because its original intent
78bb419c
KW
13521has been replaced by another property, or because its specification was
13522somehow defective. This means that its use is strongly
99870f4d
KW
13523discouraged, so much so that a warning will be issued if used, unless the
13524regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13525statement. $A_bold_deprecated flags each such entry in the table, and
13526the entry there for the longest, most descriptive version of the property will
13527give the reason it is deprecated, and perhaps advice. Perl may issue such a
13528warning, even for properties that aren't officially deprecated by Unicode,
13529when there used to be characters or code points that were matched by them, but
13530no longer. This is to warn you that your program may not work like it did on
13531earlier Unicode releases.
13532
13533A deprecated property may be made unavailable in a future Perl version, so it
13534is best to move away from them.
13535
c12f2655
KW
13536A deprecated property may also be stabilized, but this fact is not shown.
13537
13538=item Obsolete
13539
13540Properties marked with $a_bold_obsolete in the table are considered (plain)
13541obsolete. Generally this designation is given to properties that Unicode once
13542used for internal purposes (but not any longer).
13543
99870f4d
KW
13544=back
13545
13546Some Perl extensions are present for backwards compatibility and are
c12f2655
KW
13547discouraged from being used, but are not obsolete. $A_bold_discouraged
13548flags each such entry in the table. Future Unicode versions may force
13549some of these extensions to be removed without warning, replaced by another
13550property with the same name that means something different. Use the
13551equivalent shown instead.
99870f4d
KW
13552
13553@block_warning
13554
77173124 13555The table below has two columns. The left column contains the C<\\p{}>
98dc9551 13556constructs to look up, possibly preceded by the flags mentioned above; and
99870f4d
KW
13557the right column contains information about them, like a description, or
13558synonyms. It shows both the single and compound forms for each property that
13559has them. If the left column is a short name for a property, the right column
13560will give its longer, more descriptive name; and if the left column is the
13561longest name, the right column will show any equivalent shortest name, in both
13562single and compound forms if applicable.
13563
13564The right column will also caution you if a property means something different
13565than what might normally be expected.
13566
d57ccc9a
KW
13567All single forms are Perl extensions; a few compound forms are as well, and
13568are noted as such.
13569
99870f4d
KW
13570Numbers in (parentheses) indicate the total number of code points matched by
13571the property. For emphasis, those properties that match no code points at all
13572are listed as well in a separate section following the table.
13573
56ca34ca
KW
13574Most properties match the same code points regardless of whether C<"/i">
13575case-insensitive matching is specified or not. But a few properties are
13576affected. These are shown with the notation
13577
13578 (/i= other_property)
13579
13580in the second column. Under case-insensitive matching they match the
13581same code pode points as the property "other_property".
13582
99870f4d 13583There is no description given for most non-Perl defined properties (See
77173124 13584L<$unicode_reference_url> for that).
d73e5302 13585
99870f4d
KW
13586For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13587combinations. For example, entries like:
d73e5302 13588
99870f4d 13589 \\p{Gc: *} \\p{General_Category: *}
5beb625e 13590
99870f4d
KW
13591mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13592for the latter is also valid for the former. Similarly,
5beb625e 13593
99870f4d 13594 \\p{Is_*} \\p{*}
5beb625e 13595
77173124
KW
13596means that if and only if, for example, C<\\p{Foo}> exists, then
13597C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13598And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13599C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
13600underscore.
5beb625e 13601
99870f4d
KW
13602Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13603And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
13604'N*' to indicate this, and doesn't have separate entries for the other
13605possibilities. Note that not all properties which have values 'Yes' and 'No'
13606are binary, and they have all their values spelled out without using this wild
13607card, and a C<NOT> clause in their description that highlights their not being
13608binary. These also require the compound form to match them, whereas true
13609binary properties have both single and compound forms available.
5beb625e 13610
99870f4d
KW
13611Note that all non-essential underscores are removed in the display of the
13612short names below.
5beb625e 13613
c12f2655 13614B<Legend summary:>
5beb625e 13615
99870f4d 13616=over 4
cf25bb62 13617
21405004 13618=item Z<>B<*> is a wild-card
cf25bb62 13619
99870f4d
KW
13620=item B<(\\d+)> in the info column gives the number of code points matched by
13621this property.
cf25bb62 13622
99870f4d 13623=item B<$DEPRECATED> means this is deprecated.
cf25bb62 13624
99870f4d 13625=item B<$OBSOLETE> means this is obsolete.
cf25bb62 13626
99870f4d 13627=item B<$STABILIZED> means this is stabilized.
cf25bb62 13628
99870f4d 13629=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 13630
c12f2655
KW
13631=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13632stable.
5beb625e 13633
99870f4d 13634=back
da7fcca4 13635
99870f4d 13636$formatted_properties
cf25bb62 13637
99870f4d 13638$zero_matches
cf25bb62 13639
99870f4d 13640=head1 Properties not accessible through \\p{} and \\P{}
cf25bb62 13641
99870f4d
KW
13642A few properties are accessible in Perl via various function calls only.
13643These are:
78bb419c 13644
99870f4d
KW
13645 Lowercase_Mapping lc() and lcfirst()
13646 Titlecase_Mapping ucfirst()
13647 Uppercase_Mapping uc()
12ac2576 13648
043f3b3f
KW
13649Also, Case_Folding is accessible through the C</i> modifier in regular
13650expressions.
cf25bb62 13651
043f3b3f
KW
13652And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
13653interpolation in double-quoted strings and regular expressions, but both
13654usages require a L<use charnames;|charnames> to be specified, which also
13655contains related functions viacode(), vianame(), and string_vianame().
cf25bb62 13656
99870f4d 13657=head1 Unicode regular expression properties that are NOT accepted by Perl
d2d499f5 13658
99870f4d
KW
13659Perl will generate an error for a few character properties in Unicode when
13660used in a regular expression. The non-Unihan ones are listed below, with the
13661reasons they are not accepted, perhaps with work-arounds. The short names for
13662the properties are listed enclosed in (parentheses).
c12f2655
KW
13663As described after the list, an installation can change the defaults and choose
13664to accept any of these. The list is machine generated based on the
13665choices made for the installation that generated this document.
ae6979a8 13666
99870f4d 13667@bad_re_properties
a3a8c5f0 13668
b7986f4f
KW
13669An installation can choose to allow any of these to be matched by downloading
13670the Unicode database from L<http://www.unicode.org/Public/> to
f3514a2f
KW
13671C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
13672controlling lists contained in the program
13673C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
13674(C<\%Config> is available from the Config module).
d73e5302 13675
99870f4d 13676=head1 Files in the I<To> directory (for serious hackers only)
12ac2576 13677
99870f4d
KW
13678All Unicode properties are really mappings (in the mathematical sense) from
13679code points to their respective values. As part of its build process,
13680Perl constructs tables containing these mappings for all properties that it
50b27e73 13681deals with. Some, but not all, of these are written out into files.
99870f4d 13682Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
77173124 13683(C<%Config> is available from the C<Config> module).
7ebf06b3 13684
50b27e73
KW
13685Perl reserves the right to change the format and even the existence of any of
13686those files without notice, except the ones that were in existence prior to
c6d31e50 13687release 5.14. If those change, a deprecation cycle will be done first. These
50b27e73 13688are:
12ac2576 13689
99870f4d 13690@map_tables_actually_output
12ac2576 13691
ec2f0128
KW
13692Each of the files in this directory defines several hash entries to help
13693reading programs decipher it. One of them looks like this:
12ac2576 13694
99870f4d 13695 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
d73e5302 13696
77173124
KW
13697where "NAME" is a name to indicate the property. For backwards compatibility,
13698this is not necessarily the property's official Unicode name. (The "To" is
99870f4d
KW
13699also for backwards compatibility.) The hash entry gives the format of the
13700mapping fields of the table, currently one of the following:
d73e5302 13701
12916dad 13702@map_table_formats
d73e5302 13703
99870f4d
KW
13704This format applies only to the entries in the main body of the table.
13705Entries defined in hashes or ones that are missing from the list can have a
13706different format.
d73e5302 13707
ec2f0128 13708The value that the missing entries have is given by another SwashInfo hash
99870f4d 13709entry line; it looks like this:
d73e5302 13710
99870f4d 13711 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
d73e5302 13712
99870f4d 13713This example line says that any Unicode code points not explicitly listed in
77173124 13714the file have the value "NaN" under the property indicated by NAME. If the
99870f4d
KW
13715value is the special string C<< <code point> >>, it means that the value for
13716any missing code point is the code point itself. This happens, for example,
13717in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
77173124 13718character "A", are missing because the uppercase of "A" is itself.
d73e5302 13719
ec2f0128
KW
13720Finally, if the file contains a hash for special case entries, its name is
13721specified by an entry that looks like this:
13722
13723 \$utf8::SwashInfo{'ToNAME'}{'specials_name'} = 'utf8::ToSpecNAME';
13724
8d099389
KW
13725
13726=head1 Other information in the Unicode data base
13727
13728The Unicode data base is delivered in two different formats. The XML version
13729is valid for more modern Unicode releases. The other version is a collection
13730of files. The two are intended to give equivalent information. Perl uses the
13731older form; this allows you to recompile Perl to use early Unicode releases.
13732
13733The only non-character property that Perl currently supports is Named
13734Sequences, in which a sequence of code points
13735is given a name and generally treated as a single entity. (Perl supports
13736these via the C<\\N{...}> double-quotish construct,
13737L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
13738
13739Below is a list of the files in the Unicode data base that Perl doesn't
13740currently use, along with very brief descriptions of their purposes.
13741Some of the names of the files have been shortened from those that Unicode
13742uses, in order to allow them to be distinguishable from similarly named files
13743on file systems for which only the first 8 characters of a name are
13744significant.
13745
13746=over 4
13747
13748@unused_files
13749
13750=back
13751
99870f4d 13752=head1 SEE ALSO
d73e5302 13753
99870f4d 13754L<$unicode_reference_url>
12ac2576 13755
99870f4d 13756L<perlrecharclass>
12ac2576 13757
99870f4d 13758L<perlunicode>
d73e5302 13759
99870f4d 13760END
d73e5302 13761
9218f1cf
KW
13762 # And write it. The 0 means no utf8.
13763 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
99870f4d
KW
13764 return;
13765}
d73e5302 13766
99870f4d
KW
13767sub make_Heavy () {
13768 # Create and write Heavy.pl, which passes info about the tables to
13769 # utf8_heavy.pl
12ac2576 13770
99870f4d
KW
13771 my @heavy = <<END;
13772$HEADER
126c3d4e 13773$INTERNAL_ONLY_HEADER
d73e5302 13774
99870f4d 13775# This file is for the use of utf8_heavy.pl
12ac2576 13776
c12f2655
KW
13777# Maps Unicode (not Perl single-form extensions) property names in loose
13778# standard form to their corresponding standard names
99870f4d
KW
13779\%utf8::loose_property_name_of = (
13780END
cf25bb62 13781
99870f4d
KW
13782 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
13783 push @heavy, <<END;
13784);
12ac2576 13785
99870f4d
KW
13786# Maps property, table to file for those using stricter matching
13787\%utf8::stricter_to_file_of = (
13788END
13789 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
13790 push @heavy, <<END;
13791);
12ac2576 13792
99870f4d
KW
13793# Maps property, table to file for those using loose matching
13794\%utf8::loose_to_file_of = (
13795END
13796 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
13797 push @heavy, <<END;
13798);
12ac2576 13799
99870f4d
KW
13800# Maps floating point to fractional form
13801\%utf8::nv_floating_to_rational = (
13802END
13803 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
13804 push @heavy, <<END;
13805);
12ac2576 13806
99870f4d
KW
13807# If a floating point number doesn't have enough digits in it to get this
13808# close to a fraction, it isn't considered to be that fraction even if all the
13809# digits it does have match.
13810\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 13811
99870f4d
KW
13812# Deprecated tables to generate a warning for. The key is the file containing
13813# the table, so as to avoid duplication, as many property names can map to the
13814# file, but we only need one entry for all of them.
13815\%utf8::why_deprecated = (
13816END
12ac2576 13817
99870f4d
KW
13818 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
13819 push @heavy, <<END;
13820);
12ac2576 13821
d867ccfb
KW
13822# A few properties have different behavior under /i matching. This maps the
13823# those to substitute files to use under /i.
13824\%utf8::caseless_equivalent = (
13825END
13826
d867ccfb
KW
13827 # We set the key to the file when we associated files with tables, but we
13828 # couldn't do the same for the value then, as we might not have the file
13829 # for the alternate table figured out at that time.
13830 foreach my $cased (keys %caseless_equivalent_to) {
13831 my @path = $caseless_equivalent_to{$cased}->file_path;
13832 my $path = join '/', @path[1, -1];
d867ccfb
KW
13833 $utf8::caseless_equivalent_to{$cased} = $path;
13834 }
13835 push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
13836 push @heavy, <<END;
13837);
13838
99870f4d
KW
138391;
13840END
12ac2576 13841
9218f1cf 13842 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
99870f4d 13843 return;
12ac2576
JP
13844}
13845
52dc8b5d 13846sub make_Name_pm () {
6f424f62 13847 # Create and write Name.pm, which contains subroutines and data to use in
52dc8b5d
KW
13848 # conjunction with Name.pl
13849
bb1dd3da
KW
13850 # Maybe there's nothing to do.
13851 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
13852
52dc8b5d
KW
13853 my @name = <<END;
13854$HEADER
126c3d4e 13855$INTERNAL_ONLY_HEADER
52dc8b5d 13856END
0f6f7bc2 13857
fb848dce
KW
13858 # Convert these structures to output format.
13859 my $code_points_ending_in_code_point =
13860 main::simple_dumper(\@code_points_ending_in_code_point,
13861 ' ' x 8);
13862 my $names = main::simple_dumper(\%names_ending_in_code_point,
13863 ' ' x 8);
13864 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
0f6f7bc2 13865 ' ' x 8);
0f6f7bc2 13866
fb848dce
KW
13867 # Do the same with the Hangul names,
13868 my $jamo;
13869 my $jamo_l;
13870 my $jamo_v;
13871 my $jamo_t;
13872 my $jamo_re;
13873 if ($has_hangul_syllables) {
0f6f7bc2 13874
fb848dce
KW
13875 # Construct a regular expression of all the possible
13876 # combinations of the Hangul syllables.
13877 my @L_re; # Leading consonants
13878 for my $i ($LBase .. $LBase + $LCount - 1) {
13879 push @L_re, $Jamo{$i}
13880 }
13881 my @V_re; # Middle vowels
13882 for my $i ($VBase .. $VBase + $VCount - 1) {
13883 push @V_re, $Jamo{$i}
13884 }
13885 my @T_re; # Trailing consonants
13886 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
13887 push @T_re, $Jamo{$i}
13888 }
0f6f7bc2 13889
fb848dce
KW
13890 # The whole re is made up of the L V T combination.
13891 $jamo_re = '('
13892 . join ('|', sort @L_re)
13893 . ')('
13894 . join ('|', sort @V_re)
13895 . ')('
13896 . join ('|', sort @T_re)
13897 . ')?';
0f6f7bc2 13898
fb848dce
KW
13899 # These hashes needed by the algorithm were generated
13900 # during reading of the Jamo.txt file
13901 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
13902 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
13903 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
13904 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
13905 }
0f6f7bc2 13906
6f424f62 13907 push @name, <<END;
0f6f7bc2 13908
6f424f62
KW
13909# This module contains machine-generated tables and code for the
13910# algorithmically-determinable Unicode character names. The following
13911# routines can be used to translate between name and code point and vice versa
0f6f7bc2
KW
13912
13913{ # Closure
13914
92199589
KW
13915 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
13916 # two must be 10; if there are 5, the first must not be a 0. Written this
13917 # way to decrease backtracking. The first regex allows the code point to
13918 # be at the end of a word, but to work properly, the word shouldn't end
13919 # with a valid hex character. The second one won't match a code point at
13920 # the end of a word, and doesn't have the run-on issue
0f6f7bc2
KW
13921 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
13922 my \$code_point_re = qr/$code_point_re/;
13923
13924 # In the following hash, the keys are the bases of names which includes
13925 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
13926 # of each key is another hash which is used to get the low and high ends
13927 # for each range of code points that apply to the name.
13928 my %names_ending_in_code_point = (
13929$names
13930 );
13931
13932 # The following hash is a copy of the previous one, except is for loose
13933 # matching, so each name has blanks and dashes squeezed out
13934 my %loose_names_ending_in_code_point = (
13935$loose_names
13936 );
13937
13938 # And the following array gives the inverse mapping from code points to
13939 # names. Lowest code points are first
13940 my \@code_points_ending_in_code_point = (
13941$code_points_ending_in_code_point
13942 );
13943END
fb848dce
KW
13944 # Earlier releases didn't have Jamos. No sense outputting
13945 # them unless will be used.
13946 if ($has_hangul_syllables) {
6f424f62 13947 push @name, <<END;
0f6f7bc2
KW
13948
13949 # Convert from code point to Jamo short name for use in composing Hangul
13950 # syllable names
13951 my %Jamo = (
13952$jamo
13953 );
13954
13955 # Leading consonant (can be null)
13956 my %Jamo_L = (
13957$jamo_l
13958 );
13959
13960 # Vowel
13961 my %Jamo_V = (
13962$jamo_v
13963 );
13964
13965 # Optional trailing consonant
13966 my %Jamo_T = (
13967$jamo_t
13968 );
13969
13970 # Computed re that splits up a Hangul name into LVT or LV syllables
13971 my \$syllable_re = qr/$jamo_re/;
13972
13973 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
13974 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
13975
13976 # These constants names and values were taken from the Unicode standard,
13977 # version 5.1, section 3.12. They are used in conjunction with Hangul
13978 # syllables
13979 my \$SBase = $SBase_string;
13980 my \$LBase = $LBase_string;
13981 my \$VBase = $VBase_string;
13982 my \$TBase = $TBase_string;
13983 my \$SCount = $SCount;
13984 my \$LCount = $LCount;
13985 my \$VCount = $VCount;
13986 my \$TCount = $TCount;
13987 my \$NCount = \$VCount * \$TCount;
13988END
fb848dce 13989 } # End of has Jamos
0f6f7bc2 13990
6f424f62 13991 push @name, << 'END';
0f6f7bc2
KW
13992
13993 sub name_to_code_point_special {
13994 my ($name, $loose) = @_;
13995
13996 # Returns undef if not one of the specially handled names; otherwise
13997 # returns the code point equivalent to the input name
13998 # $loose is non-zero if to use loose matching, 'name' in that case
13999 # must be input as upper case with all blanks and dashes squeezed out.
14000END
fb848dce 14001 if ($has_hangul_syllables) {
6f424f62 14002 push @name, << 'END';
0f6f7bc2
KW
14003
14004 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
14005 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
14006 {
14007 return if $name !~ qr/^$syllable_re$/;
14008 my $L = $Jamo_L{$1};
14009 my $V = $Jamo_V{$2};
14010 my $T = (defined $3) ? $Jamo_T{$3} : 0;
14011 return ($L * $VCount + $V) * $TCount + $T + $SBase;
14012 }
14013END
fb848dce 14014 }
6f424f62 14015 push @name, << 'END';
0f6f7bc2
KW
14016
14017 # Name must end in 'code_point' for this to handle.
14018 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
14019 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
14020
14021 my $base = $1;
14022 my $code_point = CORE::hex $2;
14023 my $names_ref;
14024
14025 if ($loose) {
14026 $names_ref = \%loose_names_ending_in_code_point;
14027 }
14028 else {
14029 return if $base !~ s/-$//;
14030 $names_ref = \%names_ending_in_code_point;
14031 }
14032
14033 # Name must be one of the ones which has the code point in it.
14034 return if ! $names_ref->{$base};
14035
14036 # Look through the list of ranges that apply to this name to see if
14037 # the code point is in one of them.
14038 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
14039 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
14040 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
14041
14042 # Here, the code point is in the range.
14043 return $code_point;
14044 }
14045
14046 # Here, looked like the name had a code point number in it, but
14047 # did not match one of the valid ones.
14048 return;
14049 }
14050
14051 sub code_point_to_name_special {
14052 my $code_point = shift;
14053
14054 # Returns the name of a code point if algorithmically determinable;
14055 # undef if not
14056END
fb848dce 14057 if ($has_hangul_syllables) {
6f424f62 14058 push @name, << 'END';
0f6f7bc2
KW
14059
14060 # If in the Hangul range, calculate the name based on Unicode's
14061 # algorithm
14062 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
14063 use integer;
14064 my $SIndex = $code_point - $SBase;
14065 my $L = $LBase + $SIndex / $NCount;
14066 my $V = $VBase + ($SIndex % $NCount) / $TCount;
14067 my $T = $TBase + $SIndex % $TCount;
14068 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
14069 $name .= $Jamo{$T} if $T != $TBase;
14070 return $name;
14071 }
14072END
fb848dce 14073 }
6f424f62 14074 push @name, << 'END';
0f6f7bc2
KW
14075
14076 # Look through list of these code points for one in range.
14077 foreach my $hash (@code_points_ending_in_code_point) {
14078 return if $code_point < $hash->{'low'};
14079 if ($code_point <= $hash->{'high'}) {
14080 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
14081 }
14082 }
14083 return; # None found
14084 }
14085} # End closure
14086
6f424f62 140871;
0f6f7bc2 14088END
52dc8b5d
KW
14089
14090 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
14091 return;
14092}
14093
14094
99870f4d
KW
14095sub write_all_tables() {
14096 # Write out all the tables generated by this program to files, as well as
14097 # the supporting data structures, pod file, and .t file.
14098
14099 my @writables; # List of tables that actually get written
14100 my %match_tables_to_write; # Used to collapse identical match tables
14101 # into one file. Each key is a hash function
14102 # result to partition tables into buckets.
14103 # Each value is an array of the tables that
14104 # fit in the bucket.
14105
14106 # For each property ...
14107 # (sort so that if there is an immutable file name, it has precedence, so
14108 # some other property can't come in and take over its file name. If b's
14109 # file name is defined, will return 1, meaning to take it first; don't
7fc6cb55
KW
14110 # care if both defined, as they had better be different anyway. And the
14111 # property named 'Perl' needs to be first (it doesn't have any immutable
14112 # file name) because empty properties are defined in terms of it's table
14113 # named 'Any'.)
99870f4d 14114 PROPERTY:
7fc6cb55
KW
14115 foreach my $property (sort { return -1 if $a == $perl;
14116 return 1 if $b == $perl;
14117 return defined $b->file
14118 } property_ref('*'))
14119 {
99870f4d
KW
14120 my $type = $property->type;
14121
14122 # And for each table for that property, starting with the mapping
14123 # table for it ...
14124 TABLE:
14125 foreach my $table($property,
14126
14127 # and all the match tables for it (if any), sorted so
14128 # the ones with the shortest associated file name come
14129 # first. The length sorting prevents problems of a
14130 # longer file taking a name that might have to be used
14131 # by a shorter one. The alphabetic sorting prevents
14132 # differences between releases
14133 sort { my $ext_a = $a->external_name;
14134 return 1 if ! defined $ext_a;
14135 my $ext_b = $b->external_name;
14136 return -1 if ! defined $ext_b;
a92d5c2e
KW
14137
14138 # But return the non-complement table before
14139 # the complement one, as the latter is defined
14140 # in terms of the former, and needs to have
14141 # the information for the former available.
14142 return 1 if $a->complement != 0;
14143 return -1 if $b->complement != 0;
14144
0a695432
KW
14145 # Similarly, return a subservient table after
14146 # a leader
14147 return 1 if $a->leader != $a;
14148 return -1 if $b->leader != $b;
14149
99870f4d
KW
14150 my $cmp = length $ext_a <=> length $ext_b;
14151
14152 # Return result if lengths not equal
14153 return $cmp if $cmp;
14154
14155 # Alphabetic if lengths equal
14156 return $ext_a cmp $ext_b
14157 } $property->tables
14158 )
14159 {
12ac2576 14160
99870f4d
KW
14161 # Here we have a table associated with a property. It could be
14162 # the map table (done first for each property), or one of the
14163 # other tables. Determine which type.
14164 my $is_property = $table->isa('Property');
14165
14166 my $name = $table->name;
14167 my $complete_name = $table->complete_name;
14168
14169 # See if should suppress the table if is empty, but warn if it
14170 # contains something.
0332277c
KW
14171 my $suppress_if_empty_warn_if_not
14172 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
99870f4d
KW
14173
14174 # Calculate if this table should have any code points associated
14175 # with it or not.
14176 my $expected_empty =
14177
14178 # $perl should be empty, as well as properties that we just
14179 # don't do anything with
14180 ($is_property
14181 && ($table == $perl
14182 || grep { $complete_name eq $_ }
14183 @unimplemented_properties
14184 )
14185 )
14186
14187 # Match tables in properties we skipped populating should be
14188 # empty
14189 || (! $is_property && ! $property->to_create_match_tables)
14190
14191 # Tables and properties that are expected to have no code
14192 # points should be empty
14193 || $suppress_if_empty_warn_if_not
14194 ;
14195
14196 # Set a boolean if this table is the complement of an empty binary
14197 # table
14198 my $is_complement_of_empty_binary =
14199 $type == $BINARY &&
14200 (($table == $property->table('Y')
14201 && $property->table('N')->is_empty)
14202 || ($table == $property->table('N')
14203 && $property->table('Y')->is_empty));
14204
14205
14206 # Some tables should match everything
14207 my $expected_full =
14208 ($is_property)
14209 ? # All these types of map tables will be full because
14210 # they will have been populated with defaults
06f26c45 14211 ($type == $ENUM || $type == $FORCED_BINARY)
99870f4d
KW
14212
14213 : # A match table should match everything if its method
14214 # shows it should
14215 ($table->matches_all
14216
14217 # The complement of an empty binary table will match
14218 # everything
14219 || $is_complement_of_empty_binary
14220 )
14221 ;
14222
14223 if ($table->is_empty) {
14224
99870f4d 14225 if ($suppress_if_empty_warn_if_not) {
301ba948
KW
14226 $table->set_fate($SUPPRESSED,
14227 $suppress_if_empty_warn_if_not);
99870f4d 14228 }
12ac2576 14229
c12f2655 14230 # Suppress (by skipping them) expected empty tables.
99870f4d
KW
14231 next TABLE if $expected_empty;
14232
14233 # And setup to later output a warning for those that aren't
14234 # known to be allowed to be empty. Don't do the warning if
14235 # this table is a child of another one to avoid duplicating
14236 # the warning that should come from the parent one.
14237 if (($table == $property || $table->parent == $table)
301ba948 14238 && $table->fate != $SUPPRESSED
99870f4d
KW
14239 && ! grep { $complete_name =~ /^$_$/ }
14240 @tables_that_may_be_empty)
14241 {
14242 push @unhandled_properties, "$table";
14243 }
7fc6cb55
KW
14244
14245 # An empty table is just the complement of everything.
14246 $table->set_complement($Any) if $table != $property;
99870f4d
KW
14247 }
14248 elsif ($expected_empty) {
14249 my $because = "";
14250 if ($suppress_if_empty_warn_if_not) {
0332277c 14251 $because = " because $suppress_if_empty_warn_if_not";
99870f4d 14252 }
12ac2576 14253
99870f4d
KW
14254 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
14255 }
12ac2576 14256
99870f4d
KW
14257 my $count = $table->count;
14258 if ($expected_full) {
14259 if ($count != $MAX_UNICODE_CODEPOINTS) {
14260 Carp::my_carp("$table matches only "
14261 . clarify_number($count)
14262 . " Unicode code points but should match "
14263 . clarify_number($MAX_UNICODE_CODEPOINTS)
14264 . " (off by "
14265 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
14266 . "). Proceeding anyway.");
14267 }
12ac2576 14268
99870f4d
KW
14269 # Here is expected to be full. If it is because it is the
14270 # complement of an (empty) binary table that is to be
14271 # suppressed, then suppress this one as well.
14272 if ($is_complement_of_empty_binary) {
14273 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
14274 my $opposing = $property->table($opposing_name);
14275 my $opposing_status = $opposing->status;
14276 if ($opposing_status) {
14277 $table->set_status($opposing_status,
14278 $opposing->status_info);
14279 }
14280 }
14281 }
14282 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
14283 if ($table == $property || $table->leader == $table) {
14284 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
14285 }
14286 }
d73e5302 14287
301ba948 14288 if ($table->fate == $SUPPRESSED) {
99870f4d
KW
14289 if (! $is_property) {
14290 my @children = $table->children;
14291 foreach my $child (@children) {
301ba948 14292 if ($child->fate != $SUPPRESSED) {
99870f4d
KW
14293 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
14294 }
14295 }
14296 }
14297 next TABLE;
d73e5302 14298
99870f4d
KW
14299 }
14300 if (! $is_property) {
14301
14302 # Several things need to be done just once for each related
14303 # group of match tables. Do them on the parent.
14304 if ($table->parent == $table) {
14305
14306 # Add an entry in the pod file for the table; it also does
14307 # the children.
23e33b60 14308 make_table_pod_entries($table) if defined $pod_directory;
99870f4d
KW
14309
14310 # See if the the table matches identical code points with
14311 # something that has already been output. In that case,
14312 # no need to have two files with the same code points in
14313 # them. We use the table's hash() method to store these
14314 # in buckets, so that it is quite likely that if two
14315 # tables are in the same bucket they will be identical, so
14316 # don't have to compare tables frequently. The tables
14317 # have to have the same status to share a file, so add
14318 # this to the bucket hash. (The reason for this latter is
14319 # that Heavy.pl associates a status with a file.)
06671cbc
KW
14320 # We don't check tables that are inverses of others, as it
14321 # would lead to some coding complications, and checking
14322 # all the regular ones should find everything.
14323 if ($table->complement == 0) {
21be712a 14324 my $hash = $table->hash . ';' . $table->status;
99870f4d 14325
21be712a
KW
14326 # Look at each table that is in the same bucket as
14327 # this one would be.
14328 foreach my $comparison
14329 (@{$match_tables_to_write{$hash}})
14330 {
14331 if ($table->matches_identically_to($comparison)) {
14332 $table->set_equivalent_to($comparison,
99870f4d 14333 Related => 0);
21be712a
KW
14334 next TABLE;
14335 }
99870f4d 14336 }
d73e5302 14337
21be712a
KW
14338 # Here, not equivalent, add this table to the bucket.
14339 push @{$match_tables_to_write{$hash}}, $table;
06671cbc 14340 }
99870f4d
KW
14341 }
14342 }
14343 else {
14344
14345 # Here is the property itself.
14346 # Don't write out or make references to the $perl property
14347 next if $table == $perl;
14348
14349 if ($type != $STRING) {
14350
14351 # There is a mapping stored of the various synonyms to the
14352 # standardized name of the property for utf8_heavy.pl.
14353 # Also, the pod file contains entries of the form:
14354 # \p{alias: *} \p{full: *}
14355 # rather than show every possible combination of things.
14356
14357 my @property_aliases = $property->aliases;
14358
14359 # The full name of this property is stored by convention
14360 # first in the alias array
14361 my $full_property_name =
14362 '\p{' . $property_aliases[0]->name . ': *}';
14363 my $standard_property_name = standardize($table->name);
14364
14365 # For each synonym ...
14366 for my $i (0 .. @property_aliases - 1) {
14367 my $alias = $property_aliases[$i];
14368 my $alias_name = $alias->name;
14369 my $alias_standard = standardize($alias_name);
14370
c12f2655 14371 # For utf8_heavy, set the mapping of the alias to the
99870f4d
KW
14372 # property
14373 if (exists ($loose_property_name_of{$alias_standard}))
14374 {
14375 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
14376 }
14377 else {
14378 $loose_property_name_of{$alias_standard}
14379 = $standard_property_name;
14380 }
14381
23e33b60
KW
14382 # Now for the pod entry for this alias. Skip if not
14383 # outputting a pod; skip the first one, which is the
14384 # full name so won't have an entry like: '\p{full: *}
14385 # \p{full: *}', and skip if don't want an entry for
14386 # this one.
14387 next if $i == 0
14388 || ! defined $pod_directory
33e96e72 14389 || ! $alias->make_re_pod_entry;
99870f4d 14390
d57ccc9a
KW
14391 my $rhs = $full_property_name;
14392 if ($property != $perl && $table->perl_extension) {
14393 $rhs .= ' (Perl extension)';
14394 }
99870f4d
KW
14395 push @match_properties,
14396 format_pod_line($indent_info_column,
14397 '\p{' . $alias->name . ': *}',
d57ccc9a 14398 $rhs,
99870f4d
KW
14399 $alias->status);
14400 }
14401 } # End of non-string-like property code
d73e5302 14402
d73e5302 14403
c12f2655 14404 # Don't write out a mapping file if not desired.
99870f4d
KW
14405 next if ! $property->to_output_map;
14406 }
d73e5302 14407
99870f4d
KW
14408 # Here, we know we want to write out the table, but don't do it
14409 # yet because there may be other tables that come along and will
14410 # want to share the file, and the file's comments will change to
14411 # mention them. So save for later.
14412 push @writables, $table;
14413
14414 } # End of looping through the property and all its tables.
14415 } # End of looping through all properties.
14416
14417 # Now have all the tables that will have files written for them. Do it.
14418 foreach my $table (@writables) {
14419 my @directory;
14420 my $filename;
14421 my $property = $table->property;
14422 my $is_property = ($table == $property);
14423 if (! $is_property) {
14424
14425 # Match tables for the property go in lib/$subdirectory, which is
14426 # the property's name. Don't use the standard file name for this,
14427 # as may get an unfamiliar alias
14428 @directory = ($matches_directory, $property->external_name);
14429 }
14430 else {
d73e5302 14431
99870f4d
KW
14432 @directory = $table->directory;
14433 $filename = $table->file;
14434 }
d73e5302 14435
98dc9551 14436 # Use specified filename if available, or default to property's
99870f4d
KW
14437 # shortest name. We need an 8.3 safe filename (which means "an 8
14438 # safe" filename, since after the dot is only 'pl', which is < 3)
14439 # The 2nd parameter is if the filename shouldn't be changed, and
14440 # it shouldn't iff there is a hard-coded name for this table.
14441 $filename = construct_filename(
14442 $filename || $table->external_name,
14443 ! $filename, # mutable if no filename
14444 \@directory);
d73e5302 14445
99870f4d 14446 register_file_for_name($table, \@directory, $filename);
d73e5302 14447
99870f4d
KW
14448 # Only need to write one file when shared by more than one
14449 # property
a92d5c2e
KW
14450 next if ! $is_property
14451 && ($table->leader != $table || $table->complement != 0);
d73e5302 14452
99870f4d
KW
14453 # Construct a nice comment to add to the file
14454 $table->set_final_comment;
14455
14456 $table->write;
cf25bb62 14457 }
d73e5302 14458
d73e5302 14459
99870f4d
KW
14460 # Write out the pod file
14461 make_pod;
14462
52dc8b5d 14463 # And Heavy.pl, Name.pm
99870f4d 14464 make_Heavy;
52dc8b5d 14465 make_Name_pm;
d73e5302 14466
99870f4d
KW
14467 make_property_test_script() if $make_test_script;
14468 return;
cf25bb62 14469}
d73e5302 14470
99870f4d
KW
14471my @white_space_separators = ( # This used only for making the test script.
14472 "",
14473 ' ',
14474 "\t",
14475 ' '
14476 );
d73e5302 14477
99870f4d
KW
14478sub generate_separator($) {
14479 # This used only for making the test script. It generates the colon or
14480 # equal separator between the property and property value, with random
14481 # white space surrounding the separator
d73e5302 14482
99870f4d 14483 my $lhs = shift;
d73e5302 14484
99870f4d 14485 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 14486
99870f4d
KW
14487 # Choose space before and after randomly
14488 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
14489 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 14490
99870f4d
KW
14491 # And return the whole complex, half the time using a colon, half the
14492 # equals
14493 return $spaces_before
14494 . (rand() < 0.5) ? '=' : ':'
14495 . $spaces_after;
14496}
76ccdbe2 14497
430ada4c 14498sub generate_tests($$$$$) {
99870f4d
KW
14499 # This used only for making the test script. It generates test cases that
14500 # are expected to compile successfully in perl. Note that the lhs and
14501 # rhs are assumed to already be as randomized as the caller wants.
14502
99870f4d
KW
14503 my $lhs = shift; # The property: what's to the left of the colon
14504 # or equals separator
14505 my $rhs = shift; # The property value; what's to the right
14506 my $valid_code = shift; # A code point that's known to be in the
14507 # table given by lhs=rhs; undef if table is
14508 # empty
14509 my $invalid_code = shift; # A code point known to not be in the table;
14510 # undef if the table is all code points
14511 my $warning = shift;
14512
14513 # Get the colon or equal
14514 my $separator = generate_separator($lhs);
14515
14516 # The whole 'property=value'
14517 my $name = "$lhs$separator$rhs";
14518
430ada4c 14519 my @output;
99870f4d
KW
14520 # Create a complete set of tests, with complements.
14521 if (defined $valid_code) {
430ada4c
NC
14522 push @output, <<"EOC"
14523Expect(1, $valid_code, '\\p{$name}', $warning);
14524Expect(0, $valid_code, '\\p{^$name}', $warning);
14525Expect(0, $valid_code, '\\P{$name}', $warning);
14526Expect(1, $valid_code, '\\P{^$name}', $warning);
14527EOC
99870f4d
KW
14528 }
14529 if (defined $invalid_code) {
430ada4c
NC
14530 push @output, <<"EOC"
14531Expect(0, $invalid_code, '\\p{$name}', $warning);
14532Expect(1, $invalid_code, '\\p{^$name}', $warning);
14533Expect(1, $invalid_code, '\\P{$name}', $warning);
14534Expect(0, $invalid_code, '\\P{^$name}', $warning);
14535EOC
14536 }
14537 return @output;
99870f4d 14538}
cf25bb62 14539
430ada4c 14540sub generate_error($$$) {
99870f4d
KW
14541 # This used only for making the test script. It generates test cases that
14542 # are expected to not only not match, but to be syntax or similar errors
14543
99870f4d
KW
14544 my $lhs = shift; # The property: what's to the left of the
14545 # colon or equals separator
14546 my $rhs = shift; # The property value; what's to the right
14547 my $already_in_error = shift; # Boolean; if true it's known that the
14548 # unmodified lhs and rhs will cause an error.
14549 # This routine should not force another one
14550 # Get the colon or equal
14551 my $separator = generate_separator($lhs);
14552
14553 # Since this is an error only, don't bother to randomly decide whether to
14554 # put the error on the left or right side; and assume that the rhs is
14555 # loosely matched, again for convenience rather than rigor.
14556 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
14557
14558 my $property = $lhs . $separator . $rhs;
14559
430ada4c
NC
14560 return <<"EOC";
14561Error('\\p{$property}');
14562Error('\\P{$property}');
14563EOC
d73e5302
JH
14564}
14565
99870f4d
KW
14566# These are used only for making the test script
14567# XXX Maybe should also have a bad strict seps, which includes underscore.
14568
14569my @good_loose_seps = (
14570 " ",
14571 "-",
14572 "\t",
14573 "",
14574 "_",
14575 );
14576my @bad_loose_seps = (
14577 "/a/",
14578 ':=',
14579 );
14580
14581sub randomize_stricter_name {
14582 # This used only for making the test script. Take the input name and
14583 # return a randomized, but valid version of it under the stricter matching
14584 # rules.
14585
14586 my $name = shift;
14587 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14588
14589 # If the name looks like a number (integer, floating, or rational), do
14590 # some extra work
14591 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
14592 my $sign = $1;
14593 my $number = $2;
14594 my $separator = $3;
14595
14596 # If there isn't a sign, part of the time add a plus
14597 # Note: Not testing having any denominator having a minus sign
14598 if (! $sign) {
14599 $sign = '+' if rand() <= .3;
14600 }
14601
14602 # And add 0 or more leading zeros.
14603 $name = $sign . ('0' x int rand(10)) . $number;
14604
14605 if (defined $separator) {
14606 my $extra_zeros = '0' x int rand(10);
cf25bb62 14607
99870f4d
KW
14608 if ($separator eq '.') {
14609
14610 # Similarly, add 0 or more trailing zeros after a decimal
14611 # point
14612 $name .= $extra_zeros;
14613 }
14614 else {
14615
14616 # Or, leading zeros before the denominator
14617 $name =~ s,/,/$extra_zeros,;
14618 }
14619 }
cf25bb62 14620 }
d73e5302 14621
99870f4d
KW
14622 # For legibility of the test, only change the case of whole sections at a
14623 # time. To do this, first split into sections. The split returns the
14624 # delimiters
14625 my @sections;
14626 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
14627 trace $section if main::DEBUG && $to_trace;
14628
14629 if (length $section > 1 && $section !~ /\D/) {
14630
14631 # If the section is a sequence of digits, about half the time
14632 # randomly add underscores between some of them.
14633 if (rand() > .5) {
14634
14635 # Figure out how many underscores to add. max is 1 less than
14636 # the number of digits. (But add 1 at the end to make sure
14637 # result isn't 0, and compensate earlier by subtracting 2
14638 # instead of 1)
14639 my $num_underscores = int rand(length($section) - 2) + 1;
14640
14641 # And add them evenly throughout, for convenience, not rigor
14642 use integer;
14643 my $spacing = (length($section) - 1)/ $num_underscores;
14644 my $temp = $section;
14645 $section = "";
14646 for my $i (1 .. $num_underscores) {
14647 $section .= substr($temp, 0, $spacing, "") . '_';
14648 }
14649 $section .= $temp;
14650 }
14651 push @sections, $section;
14652 }
14653 else {
d73e5302 14654
99870f4d
KW
14655 # Here not a sequence of digits. Change the case of the section
14656 # randomly
14657 my $switch = int rand(4);
14658 if ($switch == 0) {
14659 push @sections, uc $section;
14660 }
14661 elsif ($switch == 1) {
14662 push @sections, lc $section;
14663 }
14664 elsif ($switch == 2) {
14665 push @sections, ucfirst $section;
14666 }
14667 else {
14668 push @sections, $section;
14669 }
14670 }
cf25bb62 14671 }
99870f4d
KW
14672 trace "returning", join "", @sections if main::DEBUG && $to_trace;
14673 return join "", @sections;
14674}
71d929cb 14675
99870f4d
KW
14676sub randomize_loose_name($;$) {
14677 # This used only for making the test script
71d929cb 14678
99870f4d
KW
14679 my $name = shift;
14680 my $want_error = shift; # if true, make an error
14681 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14682
14683 $name = randomize_stricter_name($name);
5beb625e
JH
14684
14685 my @parts;
99870f4d 14686 push @parts, $good_loose_seps[rand(@good_loose_seps)];
45376db6
KW
14687
14688 # Preserve trailing ones for the sake of not stripping the underscore from
14689 # 'L_'
14690 for my $part (split /[-\s_]+ (?= . )/, $name) {
5beb625e 14691 if (@parts) {
99870f4d
KW
14692 if ($want_error and rand() < 0.3) {
14693 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
14694 $want_error = 0;
14695 }
14696 else {
14697 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
14698 }
14699 }
99870f4d 14700 push @parts, $part;
5beb625e 14701 }
99870f4d
KW
14702 my $new = join("", @parts);
14703 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 14704
99870f4d 14705 if ($want_error) {
5beb625e 14706 if (rand() >= 0.5) {
99870f4d
KW
14707 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
14708 }
14709 else {
14710 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
14711 }
14712 }
14713 return $new;
14714}
14715
99870f4d
KW
14716# Used to make sure don't generate duplicate test cases.
14717my %test_generated;
5beb625e 14718
99870f4d
KW
14719sub make_property_test_script() {
14720 # This used only for making the test script
14721 # this written directly -- it's huge.
5beb625e 14722
99870f4d 14723 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 14724
99870f4d
KW
14725 # This uses randomness to test different possibilities without testing all
14726 # possibilities. To ensure repeatability, set the seed to 0. But if
14727 # tests are added, it will perturb all later ones in the .t file
14728 srand 0;
5beb625e 14729
3df51b85
KW
14730 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
14731
99870f4d
KW
14732 # Keep going down an order of magnitude
14733 # until find that adding this quantity to
14734 # 1 remains 1; but put an upper limit on
14735 # this so in case this algorithm doesn't
14736 # work properly on some platform, that we
14737 # won't loop forever.
14738 my $digits = 0;
14739 my $min_floating_slop = 1;
14740 while (1+ $min_floating_slop != 1
14741 && $digits++ < 50)
5beb625e 14742 {
99870f4d
KW
14743 my $next = $min_floating_slop / 10;
14744 last if $next == 0; # If underflows,
14745 # use previous one
14746 $min_floating_slop = $next;
5beb625e 14747 }
430ada4c
NC
14748
14749 # It doesn't matter whether the elements of this array contain single lines
14750 # or multiple lines. main::write doesn't count the lines.
14751 my @output;
99870f4d
KW
14752
14753 foreach my $property (property_ref('*')) {
14754 foreach my $table ($property->tables) {
14755
14756 # Find code points that match, and don't match this table.
14757 my $valid = $table->get_valid_code_point;
14758 my $invalid = $table->get_invalid_code_point;
14759 my $warning = ($table->status eq $DEPRECATED)
14760 ? "'deprecated'"
14761 : '""';
14762
14763 # Test each possible combination of the property's aliases with
14764 # the table's. If this gets to be too many, could do what is done
14765 # in the set_final_comment() for Tables
14766 my @table_aliases = $table->aliases;
14767 my @property_aliases = $table->property->aliases;
807807b7
KW
14768
14769 # Every property can be optionally be prefixed by 'Is_', so test
14770 # that those work, by creating such a new alias for each
14771 # pre-existing one.
14772 push @property_aliases, map { Alias->new("Is_" . $_->name,
14773 $_->loose_match,
33e96e72 14774 $_->make_re_pod_entry,
807807b7
KW
14775 $_->externally_ok,
14776 $_->status)
14777 } @property_aliases;
99870f4d
KW
14778 my $max = max(scalar @table_aliases, scalar @property_aliases);
14779 for my $j (0 .. $max - 1) {
14780
14781 # The current alias for property is the next one on the list,
14782 # or if beyond the end, start over. Similarly for table
14783 my $property_name
14784 = $property_aliases[$j % @property_aliases]->name;
14785
14786 $property_name = "" if $table->property == $perl;
14787 my $table_alias = $table_aliases[$j % @table_aliases];
14788 my $table_name = $table_alias->name;
14789 my $loose_match = $table_alias->loose_match;
14790
14791 # If the table doesn't have a file, any test for it is
14792 # already guaranteed to be in error
14793 my $already_error = ! $table->file_path;
14794
14795 # Generate error cases for this alias.
430ada4c
NC
14796 push @output, generate_error($property_name,
14797 $table_name,
14798 $already_error);
99870f4d
KW
14799
14800 # If the table is guaranteed to always generate an error,
14801 # quit now without generating success cases.
14802 next if $already_error;
14803
14804 # Now for the success cases.
14805 my $random;
14806 if ($loose_match) {
14807
14808 # For loose matching, create an extra test case for the
14809 # standard name.
14810 my $standard = standardize($table_name);
14811
14812 # $test_name should be a unique combination for each test
14813 # case; used just to avoid duplicate tests
14814 my $test_name = "$property_name=$standard";
14815
14816 # Don't output duplicate test cases.
14817 if (! exists $test_generated{$test_name}) {
14818 $test_generated{$test_name} = 1;
430ada4c
NC
14819 push @output, generate_tests($property_name,
14820 $standard,
14821 $valid,
14822 $invalid,
14823 $warning,
14824 );
5beb625e 14825 }
99870f4d
KW
14826 $random = randomize_loose_name($table_name)
14827 }
14828 else { # Stricter match
14829 $random = randomize_stricter_name($table_name);
99598c8c 14830 }
99598c8c 14831
99870f4d
KW
14832 # Now for the main test case for this alias.
14833 my $test_name = "$property_name=$random";
14834 if (! exists $test_generated{$test_name}) {
14835 $test_generated{$test_name} = 1;
430ada4c
NC
14836 push @output, generate_tests($property_name,
14837 $random,
14838 $valid,
14839 $invalid,
14840 $warning,
14841 );
99870f4d
KW
14842
14843 # If the name is a rational number, add tests for the
14844 # floating point equivalent.
14845 if ($table_name =~ qr{/}) {
14846
14847 # Calculate the float, and find just the fraction.
14848 my $float = eval $table_name;
14849 my ($whole, $fraction)
14850 = $float =~ / (.*) \. (.*) /x;
14851
14852 # Starting with one digit after the decimal point,
14853 # create a test for each possible precision (number of
14854 # digits past the decimal point) until well beyond the
14855 # native number found on this machine. (If we started
14856 # with 0 digits, it would be an integer, which could
14857 # well match an unrelated table)
14858 PLACE:
14859 for my $i (1 .. $min_floating_slop + 3) {
14860 my $table_name = sprintf("%.*f", $i, $float);
14861 if ($i < $MIN_FRACTION_LENGTH) {
14862
14863 # If the test case has fewer digits than the
14864 # minimum acceptable precision, it shouldn't
14865 # succeed, so we expect an error for it.
14866 # E.g., 2/3 = .7 at one decimal point, and we
14867 # shouldn't say it matches .7. We should make
14868 # it be .667 at least before agreeing that the
14869 # intent was to match 2/3. But at the
14870 # less-than- acceptable level of precision, it
14871 # might actually match an unrelated number.
14872 # So don't generate a test case if this
14873 # conflating is possible. In our example, we
14874 # don't want 2/3 matching 7/10, if there is
14875 # a 7/10 code point.
14876 for my $existing
14877 (keys %nv_floating_to_rational)
14878 {
14879 next PLACE
14880 if abs($table_name - $existing)
14881 < $MAX_FLOATING_SLOP;
14882 }
430ada4c
NC
14883 push @output, generate_error($property_name,
14884 $table_name,
14885 1 # 1 => already an error
14886 );
99870f4d
KW
14887 }
14888 else {
14889
14890 # Here the number of digits exceeds the
14891 # minimum we think is needed. So generate a
14892 # success test case for it.
430ada4c
NC
14893 push @output, generate_tests($property_name,
14894 $table_name,
14895 $valid,
14896 $invalid,
14897 $warning,
14898 );
99870f4d
KW
14899 }
14900 }
99598c8c
JH
14901 }
14902 }
99870f4d
KW
14903 }
14904 }
14905 }
37e2e78e 14906
9218f1cf
KW
14907 &write($t_path,
14908 0, # Not utf8;
14909 [<DATA>,
14910 @output,
14911 (map {"Test_X('$_');\n"} @backslash_X_tests),
14912 "Finished();\n"]);
99870f4d
KW
14913 return;
14914}
99598c8c 14915
99870f4d
KW
14916# This is a list of the input files and how to handle them. The files are
14917# processed in their order in this list. Some reordering is possible if
14918# desired, but the v0 files should be first, and the extracted before the
14919# others except DAge.txt (as data in an extracted file can be over-ridden by
14920# the non-extracted. Some other files depend on data derived from an earlier
14921# file, like UnicodeData requires data from Jamo, and the case changing and
14922# folding requires data from Unicode. Mostly, it safest to order by first
14923# version releases in (except the Jamo). DAge.txt is read before the
14924# extracted ones because of the rarely used feature $compare_versions. In the
14925# unlikely event that there were ever an extracted file that contained the Age
14926# property information, it would have to go in front of DAge.
14927#
14928# The version strings allow the program to know whether to expect a file or
14929# not, but if a file exists in the directory, it will be processed, even if it
14930# is in a version earlier than expected, so you can copy files from a later
14931# release into an earlier release's directory.
14932my @input_file_objects = (
14933 Input_file->new('PropertyAliases.txt', v0,
14934 Handler => \&process_PropertyAliases,
14935 ),
14936 Input_file->new(undef, v0, # No file associated with this
3df51b85 14937 Progress_Message => 'Finishing property setup',
99870f4d
KW
14938 Handler => \&finish_property_setup,
14939 ),
14940 Input_file->new('PropValueAliases.txt', v0,
14941 Handler => \&process_PropValueAliases,
14942 Has_Missings_Defaults => $NOT_IGNORED,
14943 ),
14944 Input_file->new('DAge.txt', v3.2.0,
14945 Has_Missings_Defaults => $NOT_IGNORED,
14946 Property => 'Age'
14947 ),
14948 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
14949 Property => 'General_Category',
14950 ),
14951 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
14952 Property => 'Canonical_Combining_Class',
14953 Has_Missings_Defaults => $NOT_IGNORED,
14954 ),
14955 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
14956 Property => 'Numeric_Type',
14957 Has_Missings_Defaults => $NOT_IGNORED,
14958 ),
14959 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
14960 Property => 'East_Asian_Width',
14961 Has_Missings_Defaults => $NOT_IGNORED,
14962 ),
14963 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
14964 Property => 'Line_Break',
14965 Has_Missings_Defaults => $NOT_IGNORED,
14966 ),
14967 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
14968 Property => 'Bidi_Class',
14969 Has_Missings_Defaults => $NOT_IGNORED,
14970 ),
14971 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
14972 Property => 'Decomposition_Type',
14973 Has_Missings_Defaults => $NOT_IGNORED,
14974 ),
14975 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
14976 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
14977 Property => 'Numeric_Value',
14978 Each_Line_Handler => \&filter_numeric_value_line,
14979 Has_Missings_Defaults => $NOT_IGNORED,
14980 ),
14981 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
14982 Property => 'Joining_Group',
14983 Has_Missings_Defaults => $NOT_IGNORED,
14984 ),
14985
14986 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
14987 Property => 'Joining_Type',
14988 Has_Missings_Defaults => $NOT_IGNORED,
14989 ),
14990 Input_file->new('Jamo.txt', v2.0.0,
14991 Property => 'Jamo_Short_Name',
14992 Each_Line_Handler => \&filter_jamo_line,
14993 ),
14994 Input_file->new('UnicodeData.txt', v1.1.5,
14995 Pre_Handler => \&setup_UnicodeData,
14996
14997 # We clean up this file for some early versions.
14998 Each_Line_Handler => [ (($v_version lt v2.0.0 )
14999 ? \&filter_v1_ucd
15000 : ($v_version eq v2.1.5)
15001 ? \&filter_v2_1_5_ucd
3ffed8c2
KW
15002
15003 # And for 5.14 Perls with 6.0,
15004 # have to also make changes
15005 : ($v_version ge v6.0.0)
15006 ? \&filter_v6_ucd
15007 : undef),
99870f4d
KW
15008
15009 # And the main filter
15010 \&filter_UnicodeData_line,
15011 ],
15012 EOF_Handler => \&EOF_UnicodeData,
15013 ),
15014 Input_file->new('ArabicShaping.txt', v2.0.0,
15015 Each_Line_Handler =>
15016 [ ($v_version lt 4.1.0)
15017 ? \&filter_old_style_arabic_shaping
15018 : undef,
15019 \&filter_arabic_shaping_line,
15020 ],
15021 Has_Missings_Defaults => $NOT_IGNORED,
15022 ),
15023 Input_file->new('Blocks.txt', v2.0.0,
15024 Property => 'Block',
15025 Has_Missings_Defaults => $NOT_IGNORED,
15026 Each_Line_Handler => \&filter_blocks_lines
15027 ),
15028 Input_file->new('PropList.txt', v2.0.0,
15029 Each_Line_Handler => (($v_version lt v3.1.0)
15030 ? \&filter_old_style_proplist
15031 : undef),
15032 ),
15033 Input_file->new('Unihan.txt', v2.0.0,
15034 Pre_Handler => \&setup_unihan,
15035 Optional => 1,
15036 Each_Line_Handler => \&filter_unihan_line,
15037 ),
15038 Input_file->new('SpecialCasing.txt', v2.1.8,
15039 Each_Line_Handler => \&filter_special_casing_line,
15040 Pre_Handler => \&setup_special_casing,
15041 ),
15042 Input_file->new(
15043 'LineBreak.txt', v3.0.0,
15044 Has_Missings_Defaults => $NOT_IGNORED,
15045 Property => 'Line_Break',
15046 # Early versions had problematic syntax
15047 Each_Line_Handler => (($v_version lt v3.1.0)
15048 ? \&filter_early_ea_lb
15049 : undef),
15050 ),
15051 Input_file->new('EastAsianWidth.txt', v3.0.0,
15052 Property => 'East_Asian_Width',
15053 Has_Missings_Defaults => $NOT_IGNORED,
15054 # Early versions had problematic syntax
15055 Each_Line_Handler => (($v_version lt v3.1.0)
15056 ? \&filter_early_ea_lb
15057 : undef),
15058 ),
15059 Input_file->new('CompositionExclusions.txt', v3.0.0,
15060 Property => 'Composition_Exclusion',
15061 ),
15062 Input_file->new('BidiMirroring.txt', v3.0.1,
15063 Property => 'Bidi_Mirroring_Glyph',
15064 ),
37e2e78e 15065 Input_file->new("NormalizationTest.txt", v3.0.1,
09ca89ce 15066 Skip => 'Validation Tests',
37e2e78e 15067 ),
99870f4d
KW
15068 Input_file->new('CaseFolding.txt', v3.0.1,
15069 Pre_Handler => \&setup_case_folding,
15070 Each_Line_Handler =>
15071 [ ($v_version lt v3.1.0)
15072 ? \&filter_old_style_case_folding
15073 : undef,
15074 \&filter_case_folding_line
15075 ],
99870f4d
KW
15076 ),
15077 Input_file->new('DCoreProperties.txt', v3.1.0,
15078 # 5.2 changed this file
15079 Has_Missings_Defaults => (($v_version ge v5.2.0)
15080 ? $NOT_IGNORED
15081 : $NO_DEFAULTS),
15082 ),
15083 Input_file->new('Scripts.txt', v3.1.0,
15084 Property => 'Script',
15085 Has_Missings_Defaults => $NOT_IGNORED,
15086 ),
15087 Input_file->new('DNormalizationProps.txt', v3.1.0,
15088 Has_Missings_Defaults => $NOT_IGNORED,
15089 Each_Line_Handler => (($v_version lt v4.0.1)
15090 ? \&filter_old_style_normalization_lines
15091 : undef),
15092 ),
15093 Input_file->new('HangulSyllableType.txt', v4.0.0,
15094 Has_Missings_Defaults => $NOT_IGNORED,
15095 Property => 'Hangul_Syllable_Type'),
15096 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
15097 Property => 'Word_Break',
15098 Has_Missings_Defaults => $NOT_IGNORED,
15099 ),
15100 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
15101 Property => 'Grapheme_Cluster_Break',
15102 Has_Missings_Defaults => $NOT_IGNORED,
15103 ),
37e2e78e
KW
15104 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
15105 Handler => \&process_GCB_test,
15106 ),
15107 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
09ca89ce 15108 Skip => 'Validation Tests',
37e2e78e
KW
15109 ),
15110 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
09ca89ce 15111 Skip => 'Validation Tests',
37e2e78e
KW
15112 ),
15113 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
09ca89ce 15114 Skip => 'Validation Tests',
37e2e78e 15115 ),
99870f4d
KW
15116 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
15117 Property => 'Sentence_Break',
15118 Has_Missings_Defaults => $NOT_IGNORED,
15119 ),
15120 Input_file->new('NamedSequences.txt', v4.1.0,
15121 Handler => \&process_NamedSequences
15122 ),
15123 Input_file->new('NameAliases.txt', v5.0.0,
15124 Property => 'Name_Alias',
dcd72625
KW
15125 Pre_Handler => ($v_version ge v6.0.0)
15126 ? \&setup_v6_name_alias
15127 : undef,
99870f4d 15128 ),
37e2e78e 15129 Input_file->new("BidiTest.txt", v5.2.0,
09ca89ce 15130 Skip => 'Validation Tests',
37e2e78e 15131 ),
99870f4d
KW
15132 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
15133 Optional => 1,
15134 Each_Line_Handler => \&filter_unihan_line,
15135 ),
15136 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
15137 Optional => 1,
15138 Each_Line_Handler => \&filter_unihan_line,
15139 ),
15140 Input_file->new('UnihanIRGSources.txt', v5.2.0,
15141 Optional => 1,
15142 Pre_Handler => \&setup_unihan,
15143 Each_Line_Handler => \&filter_unihan_line,
15144 ),
15145 Input_file->new('UnihanNumericValues.txt', v5.2.0,
15146 Optional => 1,
15147 Each_Line_Handler => \&filter_unihan_line,
15148 ),
15149 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
15150 Optional => 1,
15151 Each_Line_Handler => \&filter_unihan_line,
15152 ),
15153 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
15154 Optional => 1,
15155 Each_Line_Handler => \&filter_unihan_line,
15156 ),
15157 Input_file->new('UnihanReadings.txt', v5.2.0,
15158 Optional => 1,
15159 Each_Line_Handler => \&filter_unihan_line,
15160 ),
15161 Input_file->new('UnihanVariants.txt', v5.2.0,
15162 Optional => 1,
15163 Each_Line_Handler => \&filter_unihan_line,
15164 ),
82aed44a
KW
15165 Input_file->new('ScriptExtensions.txt', v6.0.0,
15166 Property => 'Script_Extensions',
15167 Pre_Handler => \&setup_script_extensions,
fbe1e607 15168 Each_Line_Handler => \&filter_script_extensions_line,
82aed44a 15169 ),
99870f4d 15170);
99598c8c 15171
99870f4d
KW
15172# End of all the preliminaries.
15173# Do it...
99598c8c 15174
99870f4d
KW
15175if ($compare_versions) {
15176 Carp::my_carp(<<END
15177Warning. \$compare_versions is set. Output is not suitable for production
15178END
15179 );
15180}
99598c8c 15181
99870f4d
KW
15182# Put into %potential_files a list of all the files in the directory structure
15183# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 15184# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
15185my @ignored_files_full_names = map { File::Spec->rel2abs(
15186 internal_file_to_platform($_))
15187 } keys %ignored_files;
15188File::Find::find({
15189 wanted=>sub {
37e2e78e 15190 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 15191 my $full = lc(File::Spec->rel2abs($_));
99870f4d 15192 $potential_files{$full} = 1
37e2e78e 15193 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
15194 return;
15195 }
15196}, File::Spec->curdir());
99598c8c 15197
99870f4d 15198my @mktables_list_output_files;
cdcef19a 15199my $old_start_time = 0;
cf25bb62 15200
3644ba60
KW
15201if (! -e $file_list) {
15202 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
15203 $write_unchanged_files = 1;
15204} elsif ($write_unchanged_files) {
99870f4d
KW
15205 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
15206}
15207else {
15208 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
15209 my $file_handle;
23e33b60 15210 if (! open $file_handle, "<", $file_list) {
3644ba60 15211 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
15212 $glob_list = 1;
15213 }
15214 else {
15215 my @input;
15216
15217 # Read and parse mktables.lst, placing the results from the first part
15218 # into @input, and the second part into @mktables_list_output_files
15219 for my $list ( \@input, \@mktables_list_output_files ) {
15220 while (<$file_handle>) {
15221 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
15222 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
15223 $old_start_time = $1;
15224 }
99870f4d
KW
15225 next if /^ \s* (?: \# .* )? $/x;
15226 last if /^ =+ $/x;
15227 my ( $file ) = split /\t/;
15228 push @$list, $file;
cf25bb62 15229 }
99870f4d
KW
15230 @$list = uniques(@$list);
15231 next;
cf25bb62
JH
15232 }
15233
99870f4d
KW
15234 # Look through all the input files
15235 foreach my $input (@input) {
15236 next if $input eq 'version'; # Already have checked this.
cf25bb62 15237
99870f4d
KW
15238 # Ignore if doesn't exist. The checking about whether we care or
15239 # not is done via the Input_file object.
15240 next if ! file_exists($input);
5beb625e 15241
99870f4d
KW
15242 # The paths are stored with relative names, and with '/' as the
15243 # delimiter; convert to absolute on this machine
517956bf 15244 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
99870f4d 15245 $potential_files{$full} = 1
517956bf 15246 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 15247 }
5beb625e 15248 }
cf25bb62 15249
99870f4d
KW
15250 close $file_handle;
15251}
15252
15253if ($glob_list) {
15254
15255 # Here wants to process all .txt files in the directory structure.
15256 # Convert them to full path names. They are stored in the platform's
15257 # relative style
f86864ac
KW
15258 my @known_files;
15259 foreach my $object (@input_file_objects) {
15260 my $file = $object->file;
15261 next unless defined $file;
15262 push @known_files, File::Spec->rel2abs($file);
15263 }
99870f4d
KW
15264
15265 my @unknown_input_files;
15266 foreach my $file (keys %potential_files) {
517956bf 15267 next if grep { lc($file) eq lc($_) } @known_files;
99870f4d
KW
15268
15269 # Here, the file is unknown to us. Get relative path name
15270 $file = File::Spec->abs2rel($file);
15271 push @unknown_input_files, $file;
15272
15273 # What will happen is we create a data structure for it, and add it to
15274 # the list of input files to process. First get the subdirectories
15275 # into an array
15276 my (undef, $directories, undef) = File::Spec->splitpath($file);
15277 $directories =~ s;/$;;; # Can have extraneous trailing '/'
15278 my @directories = File::Spec->splitdir($directories);
15279
15280 # If the file isn't extracted (meaning none of the directories is the
15281 # extracted one), just add it to the end of the list of inputs.
15282 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 15283 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
15284 }
15285 else {
15286
15287 # Here, the file is extracted. It needs to go ahead of most other
15288 # processing. Search for the first input file that isn't a
15289 # special required property (that is, find one whose first_release
15290 # is non-0), and isn't extracted. Also, the Age property file is
15291 # processed before the extracted ones, just in case
15292 # $compare_versions is set.
15293 for (my $i = 0; $i < @input_file_objects; $i++) {
15294 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
15295 && lc($input_file_objects[$i]->file) ne 'dage.txt'
15296 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 15297 {
99f78760 15298 splice @input_file_objects, $i, 0,
37e2e78e 15299 Input_file->new($file, v0);
99870f4d
KW
15300 last;
15301 }
cf25bb62 15302 }
99870f4d 15303
cf25bb62 15304 }
d2d499f5 15305 }
99870f4d 15306 if (@unknown_input_files) {
23e33b60 15307 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
15308
15309The following files are unknown as to how to handle. Assuming they are
15310typical property files. You'll know by later error messages if it worked or
15311not:
15312END
99f78760 15313 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
15314 }
15315} # End of looking through directory structure for more .txt files.
5beb625e 15316
99870f4d
KW
15317# Create the list of input files from the objects we have defined, plus
15318# version
15319my @input_files = 'version';
15320foreach my $object (@input_file_objects) {
15321 my $file = $object->file;
15322 next if ! defined $file; # Not all objects have files
15323 next if $object->optional && ! -e $file;
15324 push @input_files, $file;
15325}
5beb625e 15326
99870f4d
KW
15327if ( $verbosity >= $VERBOSE ) {
15328 print "Expecting ".scalar( @input_files )." input files. ",
15329 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
15330}
cf25bb62 15331
aeab6150
KW
15332# We set $most_recent to be the most recently changed input file, including
15333# this program itself (done much earlier in this file)
99870f4d 15334foreach my $in (@input_files) {
cdcef19a
KW
15335 next unless -e $in; # Keep going even if missing a file
15336 my $mod_time = (stat $in)[9];
aeab6150 15337 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
15338
15339 # See that the input files have distinct names, to warn someone if they
15340 # are adding a new one
15341 if ($make_list) {
15342 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
15343 $directories =~ s;/$;;; # Can have extraneous trailing '/'
15344 my @directories = File::Spec->splitdir($directories);
15345 my $base = $file =~ s/\.txt$//;
15346 construct_filename($file, 'mutable', \@directories);
cf25bb62 15347 }
99870f4d 15348}
cf25bb62 15349
dff6c046 15350my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 15351 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 15352 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 15353
99870f4d
KW
15354# Now we check to see if any output files are older than youngest, if
15355# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 15356if (! $rebuild) {
99870f4d
KW
15357 foreach my $out (@mktables_list_output_files) {
15358 if ( ! file_exists($out)) {
15359 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 15360 $rebuild = 1;
99870f4d
KW
15361 last;
15362 }
15363 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
15364 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
15365 if ( (stat $out)[9] <= $most_recent ) {
15366 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 15367 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 15368 $rebuild = 1;
99870f4d 15369 last;
cf25bb62 15370 }
cf25bb62 15371 }
99870f4d 15372}
d1d1cd7a 15373if (! $rebuild) {
1265e11f 15374 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
15375 exit(0);
15376}
15377print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 15378
99870f4d
KW
15379# Ready to do the major processing. First create the perl pseudo-property.
15380$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 15381
99870f4d
KW
15382# Process each input file
15383foreach my $file (@input_file_objects) {
15384 $file->run;
d2d499f5
JH
15385}
15386
99870f4d 15387# Finish the table generation.
c4051cc5 15388
99870f4d
KW
15389print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
15390finish_Unicode();
c4051cc5 15391
99870f4d
KW
15392print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
15393compile_perl();
c4051cc5 15394
99870f4d
KW
15395print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
15396add_perl_synonyms();
c4051cc5 15397
99870f4d
KW
15398print "Writing tables\n" if $verbosity >= $PROGRESS;
15399write_all_tables();
c4051cc5 15400
99870f4d
KW
15401# Write mktables.lst
15402if ( $file_list and $make_list ) {
c4051cc5 15403
99870f4d
KW
15404 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
15405 foreach my $file (@input_files, @files_actually_output) {
15406 my (undef, $directories, $file) = File::Spec->splitpath($file);
15407 my @directories = File::Spec->splitdir($directories);
15408 $file = join '/', @directories, $file;
15409 }
15410
15411 my $ofh;
15412 if (! open $ofh,">",$file_list) {
15413 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
15414 return
15415 }
15416 else {
cdcef19a 15417 my $localtime = localtime $start_time;
99870f4d
KW
15418 print $ofh <<"END";
15419#
15420# $file_list -- File list for $0.
97050450 15421#
cdcef19a 15422# Autogenerated starting on $start_time ($localtime)
97050450
YO
15423#
15424# - First section is input files
99870f4d 15425# ($0 itself is not listed but is automatically considered an input)
98dc9551 15426# - Section separator is /^=+\$/
97050450
YO
15427# - Second section is a list of output files.
15428# - Lines matching /^\\s*#/ are treated as comments
15429# which along with blank lines are ignored.
15430#
15431
15432# Input files:
15433
99870f4d
KW
15434END
15435 print $ofh "$_\n" for sort(@input_files);
15436 print $ofh "\n=================================\n# Output files:\n\n";
15437 print $ofh "$_\n" for sort @files_actually_output;
15438 print $ofh "\n# ",scalar(@input_files)," input files\n",
15439 "# ",scalar(@files_actually_output)+1," output files\n\n",
15440 "# End list\n";
15441 close $ofh
15442 or Carp::my_carp("Failed to close $ofh: $!");
15443
15444 print "Filelist has ",scalar(@input_files)," input files and ",
15445 scalar(@files_actually_output)+1," output files\n"
15446 if $verbosity >= $VERBOSE;
15447 }
15448}
15449
15450# Output these warnings unless -q explicitly specified.
c83dffeb 15451if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
99870f4d
KW
15452 if (@unhandled_properties) {
15453 print "\nProperties and tables that unexpectedly have no code points\n";
15454 foreach my $property (sort @unhandled_properties) {
15455 print $property, "\n";
15456 }
15457 }
15458
15459 if (%potential_files) {
15460 print "\nInput files that are not considered:\n";
15461 foreach my $file (sort keys %potential_files) {
15462 print File::Spec->abs2rel($file), "\n";
15463 }
15464 }
15465 print "\nAll done\n" if $verbosity >= $VERBOSE;
15466}
5beb625e 15467exit(0);
cf25bb62 15468
99870f4d 15469# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 15470__DATA__
99870f4d 15471
5beb625e
JH
15472use strict;
15473use warnings;
15474
66fd7fd0
KW
15475# If run outside the normal test suite on an ASCII platform, you can
15476# just create a latin1_to_native() function that just returns its
15477# inputs, because that's the only function used from test.pl
15478require "test.pl";
15479
37e2e78e
KW
15480# Test qr/\X/ and the \p{} regular expression constructs. This file is
15481# constructed by mktables from the tables it generates, so if mktables is
15482# buggy, this won't necessarily catch those bugs. Tests are generated for all
15483# feasible properties; a few aren't currently feasible; see
15484# is_code_point_usable() in mktables for details.
99870f4d
KW
15485
15486# Standard test packages are not used because this manipulates SIG_WARN. It
15487# exits 0 if every non-skipped test succeeded; -1 if any failed.
15488
5beb625e
JH
15489my $Tests = 0;
15490my $Fails = 0;
99870f4d 15491
99870f4d
KW
15492sub Expect($$$$) {
15493 my $expected = shift;
15494 my $ord = shift;
15495 my $regex = shift;
15496 my $warning_type = shift; # Type of warning message, like 'deprecated'
15497 # or empty if none
15498 my $line = (caller)[2];
66fd7fd0 15499 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 15500
99870f4d 15501 # Convert the code point to hex form
23e33b60 15502 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 15503
99870f4d 15504 my @tests = "";
5beb625e 15505
37e2e78e
KW
15506 # The first time through, use all warnings. If the input should generate
15507 # a warning, add another time through with them turned off
99870f4d
KW
15508 push @tests, "no warnings '$warning_type';" if $warning_type;
15509
15510 foreach my $no_warnings (@tests) {
15511
15512 # Store any warning messages instead of outputting them
15513 local $SIG{__WARN__} = $SIG{__WARN__};
15514 my $warning_message;
15515 $SIG{__WARN__} = sub { $warning_message = $_[0] };
15516
15517 $Tests++;
15518
15519 # A string eval is needed because of the 'no warnings'.
15520 # Assumes no parens in the regular expression
15521 my $result = eval "$no_warnings
15522 my \$RegObj = qr($regex);
15523 $string =~ \$RegObj ? 1 : 0";
15524 if (not defined $result) {
15525 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
15526 $Fails++;
15527 }
15528 elsif ($result ^ $expected) {
15529 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
15530 $Fails++;
15531 }
15532 elsif ($warning_message) {
15533 if (! $warning_type || ($warning_type && $no_warnings)) {
15534 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
15535 $Fails++;
15536 }
15537 else {
15538 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
15539 }
15540 }
15541 elsif ($warning_type && ! $no_warnings) {
15542 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
15543 $Fails++;
15544 }
15545 else {
15546 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
15547 }
5beb625e 15548 }
99870f4d 15549 return;
5beb625e 15550}
d73e5302 15551
99870f4d
KW
15552sub Error($) {
15553 my $regex = shift;
5beb625e 15554 $Tests++;
99870f4d 15555 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 15556 $Fails++;
99870f4d
KW
15557 my $line = (caller)[2];
15558 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 15559 }
99870f4d
KW
15560 else {
15561 my $line = (caller)[2];
15562 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
15563 }
15564 return;
5beb625e
JH
15565}
15566
37e2e78e
KW
15567# GCBTest.txt character that separates grapheme clusters
15568my $breakable_utf8 = my $breakable = chr(0xF7);
15569utf8::upgrade($breakable_utf8);
15570
15571# GCBTest.txt character that indicates that the adjoining code points are part
15572# of the same grapheme cluster
15573my $nobreak_utf8 = my $nobreak = chr(0xD7);
15574utf8::upgrade($nobreak_utf8);
15575
15576sub Test_X($) {
15577 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
15578 # Each such line is a sequence of code points given by their hex numbers,
15579 # separated by the two characters defined just before this subroutine that
15580 # indicate that either there can or cannot be a break between the adjacent
15581 # code points. If there isn't a break, that means the sequence forms an
15582 # extended grapheme cluster, which means that \X should match the whole
15583 # thing. If there is a break, \X should stop there. This is all
15584 # converted by this routine into a match:
15585 # $string =~ /(\X)/,
15586 # Each \X should match the next cluster; and that is what is checked.
15587
15588 my $template = shift;
15589
15590 my $line = (caller)[2];
15591
15592 # The line contains characters above the ASCII range, but in Latin1. It
15593 # may or may not be in utf8, and if it is, it may or may not know it. So,
15594 # convert these characters to 8 bits. If knows is in utf8, simply
15595 # downgrade.
15596 if (utf8::is_utf8($template)) {
15597 utf8::downgrade($template);
15598 } else {
15599
15600 # Otherwise, if it is in utf8, but doesn't know it, the next lines
15601 # convert the two problematic characters to their 8-bit equivalents.
15602 # If it isn't in utf8, they don't harm anything.
15603 use bytes;
15604 $template =~ s/$nobreak_utf8/$nobreak/g;
15605 $template =~ s/$breakable_utf8/$breakable/g;
15606 }
15607
15608 # Get rid of the leading and trailing breakables
15609 $template =~ s/^ \s* $breakable \s* //x;
15610 $template =~ s/ \s* $breakable \s* $ //x;
15611
15612 # And no-breaks become just a space.
15613 $template =~ s/ \s* $nobreak \s* / /xg;
15614
15615 # Split the input into segments that are breakable between them.
15616 my @segments = split /\s*$breakable\s*/, $template;
15617
15618 my $string = "";
15619 my $display_string = "";
15620 my @should_match;
15621 my @should_display;
15622
15623 # Convert the code point sequence in each segment into a Perl string of
15624 # characters
15625 foreach my $segment (@segments) {
15626 my @code_points = split /\s+/, $segment;
15627 my $this_string = "";
15628 my $this_display = "";
15629 foreach my $code_point (@code_points) {
66fd7fd0 15630 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
15631 $this_display .= "\\x{$code_point}";
15632 }
15633
15634 # The next cluster should match the string in this segment.
15635 push @should_match, $this_string;
15636 push @should_display, $this_display;
15637 $string .= $this_string;
15638 $display_string .= $this_display;
15639 }
15640
15641 # If a string can be represented in both non-ut8 and utf8, test both cases
15642 UPGRADE:
15643 for my $to_upgrade (0 .. 1) {
678f13d5 15644
37e2e78e
KW
15645 if ($to_upgrade) {
15646
15647 # If already in utf8, would just be a repeat
15648 next UPGRADE if utf8::is_utf8($string);
15649
15650 utf8::upgrade($string);
15651 }
15652
15653 # Finally, do the \X match.
15654 my @matches = $string =~ /(\X)/g;
15655
15656 # Look through each matched cluster to verify that it matches what we
15657 # expect.
15658 my $min = (@matches < @should_match) ? @matches : @should_match;
15659 for my $i (0 .. $min - 1) {
15660 $Tests++;
15661 if ($matches[$i] eq $should_match[$i]) {
15662 print "ok $Tests - ";
15663 if ($i == 0) {
15664 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
15665 } else {
15666 print "And \\X #", $i + 1,
15667 }
15668 print " correctly matched $should_display[$i]; line $line\n";
15669 } else {
15670 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
15671 unpack("U*", $matches[$i]));
15672 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
15673 $i + 1,
15674 " should have matched $should_display[$i]",
15675 " but instead matched $matches[$i]",
15676 ". Abandoning rest of line $line\n";
15677 next UPGRADE;
15678 }
15679 }
15680
15681 # And the number of matches should equal the number of expected matches.
15682 $Tests++;
15683 if (@matches == @should_match) {
15684 print "ok $Tests - Nothing was left over; line $line\n";
15685 } else {
15686 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
15687 }
15688 }
15689
15690 return;
15691}
15692
99870f4d 15693sub Finished() {
f86864ac 15694 print "1..$Tests\n";
99870f4d 15695 exit($Fails ? -1 : 0);
5beb625e 15696}
99870f4d
KW
15697
15698Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 15699Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
15700Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
15701Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 15702Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726