This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: perluniprops change prop-val generation
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a 19my $start_time;
98dc9551 20BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
21 # get it as close as possible
22 $start_time= time;
23}
24
23e33b60 25require 5.010_001;
d73e5302 26use strict;
99870f4d 27use warnings;
cf25bb62 28use Carp;
bd9ebcfd 29use Config;
99870f4d
KW
30use File::Find;
31use File::Path;
d07a55ed 32use File::Spec;
99870f4d
KW
33use Text::Tabs;
34
35sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 36my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d
KW
37
38##########################################################################
39#
40# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
41# from the Unicode database files (lib/unicore/.../*.txt), It also generates
42# a pod file and a .t file
43#
44# The structure of this file is:
45# First these introductory comments; then
46# code needed for everywhere, such as debugging stuff; then
47# code to handle input parameters; then
48# data structures likely to be of external interest (some of which depend on
49# the input parameters, so follows them; then
50# more data structures and subroutine and package (class) definitions; then
51# the small actual loop to process the input files and finish up; then
52# a __DATA__ section, for the .t tests
53#
5f7264c7 54# This program works on all releases of Unicode through at least 6.0. The
99870f4d
KW
55# outputs have been scrutinized most intently for release 5.1. The others
56# have been checked for somewhat more than just sanity. It can handle all
57# existing Unicode character properties in those releases.
58#
99870f4d
KW
59# This program is mostly about Unicode character (or code point) properties.
60# A property describes some attribute or quality of a code point, like if it
61# is lowercase or not, its name, what version of Unicode it was first defined
62# in, or what its uppercase equivalent is. Unicode deals with these disparate
63# possibilities by making all properties into mappings from each code point
64# into some corresponding value. In the case of it being lowercase or not,
65# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
66# property maps each Unicode code point to a single value, called a "property
67# value". (Hence each Unicode property is a true mathematical function with
68# exactly one value per code point.)
69#
70# When using a property in a regular expression, what is desired isn't the
71# mapping of the code point to its property's value, but the reverse (or the
72# mathematical "inverse relation"): starting with the property value, "Does a
73# code point map to it?" These are written in a "compound" form:
74# \p{property=value}, e.g., \p{category=punctuation}. This program generates
75# files containing the lists of code points that map to each such regular
76# expression property value, one file per list
77#
78# There is also a single form shortcut that Perl adds for many of the commonly
79# used properties. This happens for all binary properties, plus script,
80# general_category, and block properties.
81#
82# Thus the outputs of this program are files. There are map files, mostly in
83# the 'To' directory; and there are list files for use in regular expression
84# matching, all in subdirectories of the 'lib' directory, with each
85# subdirectory being named for the property that the lists in it are for.
86# Bookkeeping, test, and documentation files are also generated.
87
88my $matches_directory = 'lib'; # Where match (\p{}) files go.
89my $map_directory = 'To'; # Where map files go.
90
91# DATA STRUCTURES
92#
93# The major data structures of this program are Property, of course, but also
94# Table. There are two kinds of tables, very similar to each other.
95# "Match_Table" is the data structure giving the list of code points that have
96# a particular property value, mentioned above. There is also a "Map_Table"
97# data structure which gives the property's mapping from code point to value.
98# There are two structures because the match tables need to be combined in
99# various ways, such as constructing unions, intersections, complements, etc.,
100# and the map ones don't. And there would be problems, perhaps subtle, if
101# a map table were inadvertently operated on in some of those ways.
102# The use of separate classes with operations defined on one but not the other
103# prevents accidentally confusing the two.
104#
105# At the heart of each table's data structure is a "Range_List", which is just
106# an ordered list of "Ranges", plus ancillary information, and methods to
107# operate on them. A Range is a compact way to store property information.
108# Each range has a starting code point, an ending code point, and a value that
109# is meant to apply to all the code points between the two end points,
110# inclusive. For a map table, this value is the property value for those
111# code points. Two such ranges could be written like this:
112# 0x41 .. 0x5A, 'Upper',
113# 0x61 .. 0x7A, 'Lower'
114#
115# Each range also has a type used as a convenience to classify the values.
116# Most ranges in this program will be Type 0, or normal, but there are some
117# ranges that have a non-zero type. These are used only in map tables, and
118# are for mappings that don't fit into the normal scheme of things. Mappings
119# that require a hash entry to communicate with utf8.c are one example;
120# another example is mappings for charnames.pm to use which indicate a name
121# that is algorithmically determinable from its code point (and vice-versa).
122# These are used to significantly compact these tables, instead of listing
123# each one of the tens of thousands individually.
124#
125# In a match table, the value of a range is irrelevant (and hence the type as
126# well, which will always be 0), and arbitrarily set to the null string.
127# Using the example above, there would be two match tables for those two
128# entries, one named Upper would contain the 0x41..0x5A range, and the other
129# named Lower would contain 0x61..0x7A.
130#
131# Actually, there are two types of range lists, "Range_Map" is the one
132# associated with map tables, and "Range_List" with match tables.
133# Again, this is so that methods can be defined on one and not the other so as
134# to prevent operating on them in incorrect ways.
135#
136# Eventually, most tables are written out to files to be read by utf8_heavy.pl
137# in the perl core. All tables could in theory be written, but some are
138# suppressed because there is no current practical use for them. It is easy
139# to change which get written by changing various lists that are near the top
140# of the actual code in this file. The table data structures contain enough
141# ancillary information to allow them to be treated as separate entities for
142# writing, such as the path to each one's file. There is a heading in each
143# map table that gives the format of its entries, and what the map is for all
144# the code points missing from it. (This allows tables to be more compact.)
678f13d5 145#
99870f4d
KW
146# The Property data structure contains one or more tables. All properties
147# contain a map table (except the $perl property which is a
148# pseudo-property containing only match tables), and any properties that
149# are usable in regular expression matches also contain various matching
150# tables, one for each value the property can have. A binary property can
151# have two values, True and False (or Y and N, which are preferred by Unicode
152# terminology). Thus each of these properties will have a map table that
153# takes every code point and maps it to Y or N (but having ranges cuts the
154# number of entries in that table way down), and two match tables, one
155# which has a list of all the code points that map to Y, and one for all the
156# code points that map to N. (For each of these, a third table is also
157# generated for the pseudo Perl property. It contains the identical code
158# points as the Y table, but can be written, not in the compound form, but in
159# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
160# properties have several possible values, some have many, and properties like
161# Name have a different value for every named code point. Those will not,
162# unless the controlling lists are changed, have their match tables written
163# out. But all the ones which can be used in regular expression \p{} and \P{}
c12f2655
KW
164# constructs will. Prior to 5.14, generally a property would have either its
165# map table or its match tables written but not both. Again, what gets
166# written is controlled by lists which can easily be changed. Starting in
167# 5.14, advantage was taken of this, and all the map tables needed to
168# reconstruct the Unicode db are now written out, while suppressing the
169# Unicode .txt files that contain the data. Our tables are much more compact
170# than the .txt files, so a significant space savings was achieved.
171
172# Properties have a 'Type', like binary, or string, or enum depending on how
173# many match tables there are and the content of the maps. This 'Type' is
174# different than a range 'Type', so don't get confused by the two concepts
175# having the same name.
678f13d5 176#
99870f4d
KW
177# For information about the Unicode properties, see Unicode's UAX44 document:
178
179my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
180
181# As stated earlier, this program will work on any release of Unicode so far.
182# Most obvious problems in earlier data have NOT been corrected except when
183# necessary to make Perl or this program work reasonably. For example, no
c12f2655
KW
184# folding information was given in early releases, so this program substitutes
185# lower case instead, just so that a regular expression with the /i option
186# will do something that actually gives the right results in many cases.
187# There are also a couple other corrections for version 1.1.5, commented at
188# the point they are made. As an example of corrections that weren't made
189# (but could be) is this statement from DerivedAge.txt: "The supplementary
190# private use code points and the non-character code points were assigned in
191# version 2.0, but not specifically listed in the UCD until versions 3.0 and
192# 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
193# on Unicode version glitches is further down in these introductory comments.
99870f4d 194#
5f7264c7
KW
195# This program works on all non-provisional properties as of 6.0, though the
196# files for some are suppressed from apparent lack of demand for them. You
197# can change which are output by changing lists in this program.
678f13d5 198#
dc85bd38 199# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
200# loose matchings rules (from Unicode TR18):
201#
202# The recommended names for UCD properties and property values are in
203# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
204# [PropValue]. There are both abbreviated names and longer, more
205# descriptive names. It is strongly recommended that both names be
206# recognized, and that loose matching of property names be used,
207# whereby the case distinctions, whitespace, hyphens, and underbar
208# are ignored.
209# The program still allows Fuzzy to override its determination of if loose
210# matching should be used, but it isn't currently used, as it is no longer
211# needed; the calculations it makes are good enough.
678f13d5 212#
99870f4d
KW
213# SUMMARY OF HOW IT WORKS:
214#
215# Process arguments
216#
217# A list is constructed containing each input file that is to be processed
218#
219# Each file on the list is processed in a loop, using the associated handler
220# code for each:
221# The PropertyAliases.txt and PropValueAliases.txt files are processed
222# first. These files name the properties and property values.
223# Objects are created of all the property and property value names
224# that the rest of the input should expect, including all synonyms.
225# The other input files give mappings from properties to property
226# values. That is, they list code points and say what the mapping
227# is under the given property. Some files give the mappings for
228# just one property; and some for many. This program goes through
229# each file and populates the properties from them. Some properties
230# are listed in more than one file, and Unicode has set up a
231# precedence as to which has priority if there is a conflict. Thus
232# the order of processing matters, and this program handles the
233# conflict possibility by processing the overriding input files
234# last, so that if necessary they replace earlier values.
235# After this is all done, the program creates the property mappings not
236# furnished by Unicode, but derivable from what it does give.
237# The tables of code points that match each property value in each
238# property that is accessible by regular expressions are created.
239# The Perl-defined properties are created and populated. Many of these
240# require data determined from the earlier steps
241# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 242# and Unicode are reconciled and warned about.
99870f4d
KW
243# All the properties are written to files
244# Any other files are written, and final warnings issued.
678f13d5 245#
99870f4d
KW
246# For clarity, a number of operators have been overloaded to work on tables:
247# ~ means invert (take all characters not in the set). The more
248# conventional '!' is not used because of the possibility of confusing
249# it with the actual boolean operation.
250# + means union
251# - means subtraction
252# & means intersection
253# The precedence of these is the order listed. Parentheses should be
254# copiously used. These are not a general scheme. The operations aren't
255# defined for a number of things, deliberately, to avoid getting into trouble.
256# Operations are done on references and affect the underlying structures, so
257# that the copy constructors for them have been overloaded to not return a new
258# clone, but the input object itself.
678f13d5 259#
99870f4d
KW
260# The bool operator is deliberately not overloaded to avoid confusion with
261# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
262#
263# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
264#
265# This program needs to be able to run under miniperl. Therefore, it uses a
266# minimum of other modules, and hence implements some things itself that could
267# be gotten from CPAN
268#
269# This program uses inputs published by the Unicode Consortium. These can
270# change incompatibly between releases without the Perl maintainers realizing
271# it. Therefore this program is now designed to try to flag these. It looks
272# at the directories where the inputs are, and flags any unrecognized files.
273# It keeps track of all the properties in the files it handles, and flags any
274# that it doesn't know how to handle. It also flags any input lines that
275# don't match the expected syntax, among other checks.
276#
277# It is also designed so if a new input file matches one of the known
278# templates, one hopefully just needs to add it to a list to have it
279# processed.
280#
281# As mentioned earlier, some properties are given in more than one file. In
282# particular, the files in the extracted directory are supposedly just
283# reformattings of the others. But they contain information not easily
284# derivable from the other files, including results for Unihan, which this
285# program doesn't ordinarily look at, and for unassigned code points. They
286# also have historically had errors or been incomplete. In an attempt to
287# create the best possible data, this program thus processes them first to
288# glean information missing from the other files; then processes those other
289# files to override any errors in the extracted ones. Much of the design was
290# driven by this need to store things and then possibly override them.
291#
292# It tries to keep fatal errors to a minimum, to generate something usable for
293# testing purposes. It always looks for files that could be inputs, and will
294# warn about any that it doesn't know how to handle (the -q option suppresses
295# the warning).
99870f4d 296#
678f13d5
KW
297# Why is there more than one type of range?
298# This simplified things. There are some very specialized code points that
299# have to be handled specially for output, such as Hangul syllable names.
300# By creating a range type (done late in the development process), it
301# allowed this to be stored with the range, and overridden by other input.
302# Originally these were stored in another data structure, and it became a
303# mess trying to decide if a second file that was for the same property was
304# overriding the earlier one or not.
305#
306# Why are there two kinds of tables, match and map?
307# (And there is a base class shared by the two as well.) As stated above,
308# they actually are for different things. Development proceeded much more
309# smoothly when I (khw) realized the distinction. Map tables are used to
310# give the property value for every code point (actually every code point
311# that doesn't map to a default value). Match tables are used for regular
312# expression matches, and are essentially the inverse mapping. Separating
313# the two allows more specialized methods, and error checks so that one
314# can't just take the intersection of two map tables, for example, as that
315# is nonsensical.
99870f4d 316#
23e33b60
KW
317# DEBUGGING
318#
678f13d5
KW
319# This program is written so it will run under miniperl. Occasionally changes
320# will cause an error where the backtrace doesn't work well under miniperl.
321# To diagnose the problem, you can instead run it under regular perl, if you
322# have one compiled.
323#
324# There is a good trace facility. To enable it, first sub DEBUG must be set
325# to return true. Then a line like
326#
327# local $to_trace = 1 if main::DEBUG;
328#
329# can be added to enable tracing in its lexical scope or until you insert
330# another line:
331#
332# local $to_trace = 0 if main::DEBUG;
333#
334# then use a line like "trace $a, @b, %c, ...;
335#
336# Some of the more complex subroutines already have trace statements in them.
337# Permanent trace statements should be like:
338#
339# trace ... if main::DEBUG && $to_trace;
340#
341# If there is just one or a few files that you're debugging, you can easily
342# cause most everything else to be skipped. Change the line
343#
344# my $debug_skip = 0;
345#
346# to 1, and every file whose object is in @input_file_objects and doesn't have
347# a, 'non_skip => 1,' in its constructor will be skipped.
348#
b4a0206c 349# To compare the output tables, it may be useful to specify the -annotate
c4019d52
KW
350# flag. This causes the tables to expand so there is one entry for each
351# non-algorithmically named code point giving, currently its name, and its
352# graphic representation if printable (and you have a font that knows about
353# it). This makes it easier to see what the particular code points are in
354# each output table. The tables are usable, but because they don't have
355# ranges (for the most part), a Perl using them will run slower. Non-named
356# code points are annotated with a description of their status, and contiguous
357# ones with the same description will be output as a range rather than
358# individually. Algorithmically named characters are also output as ranges,
359# except when there are just a few contiguous ones.
360#
99870f4d
KW
361# FUTURE ISSUES
362#
363# The program would break if Unicode were to change its names so that
364# interior white space, underscores, or dashes differences were significant
365# within property and property value names.
366#
367# It might be easier to use the xml versions of the UCD if this program ever
368# would need heavy revision, and the ability to handle old versions was not
369# required.
370#
371# There is the potential for name collisions, in that Perl has chosen names
372# that Unicode could decide it also likes. There have been such collisions in
373# the past, with mostly Perl deciding to adopt the Unicode definition of the
374# name. However in the 5.2 Unicode beta testing, there were a number of such
375# collisions, which were withdrawn before the final release, because of Perl's
376# and other's protests. These all involved new properties which began with
377# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
378# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
379# Unicode document, so they are unlikely to be used by Unicode for another
380# purpose. However, they might try something beginning with 'In', or use any
381# of the other Perl-defined properties. This program will warn you of name
382# collisions, and refuse to generate tables with them, but manual intervention
383# will be required in this event. One scheme that could be implemented, if
384# necessary, would be to have this program generate another file, or add a
385# field to mktables.lst that gives the date of first definition of a property.
386# Each new release of Unicode would use that file as a basis for the next
387# iteration. And the Perl synonym addition code could sort based on the age
388# of the property, so older properties get priority, and newer ones that clash
389# would be refused; hence existing code would not be impacted, and some other
390# synonym would have to be used for the new property. This is ugly, and
391# manual intervention would certainly be easier to do in the short run; lets
392# hope it never comes to this.
678f13d5 393#
99870f4d
KW
394# A NOTE ON UNIHAN
395#
396# This program can generate tables from the Unihan database. But it doesn't
397# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
398# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
399# database was split into 8 different files, all beginning with the letters
400# 'Unihan'. This program will read those file(s) if present, but it needs to
401# know which of the many properties in the file(s) should have tables created
402# for them. It will create tables for any properties listed in
403# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
404# @cjk_properties array and the @cjk_property_values array. Thus, if a
405# property you want is not in those files of the release you are building
406# against, you must add it to those two arrays. Starting in 4.0, the
407# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
408# is present in the directory, a table will be generated for that property.
409# In 5.2, several more properties were added. For your convenience, the two
5f7264c7 410# arrays are initialized with all the 6.0 listed properties that are also in
99870f4d
KW
411# earlier releases. But these are commented out. You can just uncomment the
412# ones you want, or use them as a template for adding entries for other
413# properties.
414#
415# You may need to adjust the entries to suit your purposes. setup_unihan(),
416# and filter_unihan_line() are the functions where this is done. This program
417# already does some adjusting to make the lines look more like the rest of the
418# Unicode DB; You can see what that is in filter_unihan_line()
419#
420# There is a bug in the 3.2 data file in which some values for the
421# kPrimaryNumeric property have commas and an unexpected comment. A filter
422# could be added for these; or for a particular installation, the Unihan.txt
423# file could be edited to fix them.
99870f4d 424#
678f13d5
KW
425# HOW TO ADD A FILE TO BE PROCESSED
426#
427# A new file from Unicode needs to have an object constructed for it in
428# @input_file_objects, probably at the end or at the end of the extracted
429# ones. The program should warn you if its name will clash with others on
430# restrictive file systems, like DOS. If so, figure out a better name, and
431# add lines to the README.perl file giving that. If the file is a character
432# property, it should be in the format that Unicode has by default
433# standardized for such files for the more recently introduced ones.
434# If so, the Input_file constructor for @input_file_objects can just be the
435# file name and release it first appeared in. If not, then it should be
436# possible to construct an each_line_handler() to massage the line into the
437# standardized form.
438#
439# For non-character properties, more code will be needed. You can look at
440# the existing entries for clues.
441#
442# UNICODE VERSIONS NOTES
443#
444# The Unicode UCD has had a number of errors in it over the versions. And
445# these remain, by policy, in the standard for that version. Therefore it is
446# risky to correct them, because code may be expecting the error. So this
447# program doesn't generally make changes, unless the error breaks the Perl
448# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
449# for U+1105, which causes real problems for the algorithms for Jamo
450# calculations, so it is changed here.
451#
452# But it isn't so clear cut as to what to do about concepts that are
453# introduced in a later release; should they extend back to earlier releases
454# where the concept just didn't exist? It was easier to do this than to not,
455# so that's what was done. For example, the default value for code points not
456# in the files for various properties was probably undefined until changed by
457# some version. No_Block for blocks is such an example. This program will
458# assign No_Block even in Unicode versions that didn't have it. This has the
459# benefit that code being written doesn't have to special case earlier
460# versions; and the detriment that it doesn't match the Standard precisely for
461# the affected versions.
462#
463# Here are some observations about some of the issues in early versions:
464#
6426c51b 465# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
678f13d5
KW
466# the reason is that the CJK block starting at 4E00 was removed from PropList,
467# and was not put back in until 3.1.0
468#
469# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
470# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
471# reason is that 3.2 introduced U+205F=medium math space, which was not
472# classed as white space, but Perl figured out that it should have been. 4.0
473# reclassified it correctly.
474#
475# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
476# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
477# was left with no code points, as all the ones that mapped to 202 stayed
478# mapped to 202. Thus if your program used the numeric name for the class,
479# it would not have been affected, but if it used the mnemonic, it would have
480# been.
481#
482# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
483# points which eventually came to have this script property value, instead
484# mapped to "Unknown". But in the next release all these code points were
485# moved to \p{sc=common} instead.
99870f4d
KW
486#
487# The default for missing code points for BidiClass is complicated. Starting
488# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
489# tries to do the best it can for earlier releases. It is done in
490# process_PropertyAliases()
491#
492##############################################################################
493
494my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
495 # and errors
496my $MAX_LINE_WIDTH = 78;
497
498# Debugging aid to skip most files so as to not be distracted by them when
499# concentrating on the ones being debugged. Add
500# non_skip => 1,
501# to the constructor for those files you want processed when you set this.
502# Files with a first version number of 0 are special: they are always
c12f2655
KW
503# processed regardless of the state of this flag. Generally, Jamo.txt and
504# UnicodeData.txt must not be skipped if you want this program to not die
505# before normal completion.
99870f4d
KW
506my $debug_skip = 0;
507
508# Set to 1 to enable tracing.
509our $to_trace = 0;
510
511{ # Closure for trace: debugging aid
512 my $print_caller = 1; # ? Include calling subroutine name
513 my $main_with_colon = 'main::';
514 my $main_colon_length = length($main_with_colon);
515
516 sub trace {
517 return unless $to_trace; # Do nothing if global flag not set
518
519 my @input = @_;
520
521 local $DB::trace = 0;
522 $DB::trace = 0; # Quiet 'used only once' message
523
524 my $line_number;
525
526 # Loop looking up the stack to get the first non-trace caller
527 my $caller_line;
528 my $caller_name;
529 my $i = 0;
530 do {
531 $line_number = $caller_line;
532 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
533 $caller = $main_with_colon unless defined $caller;
534
535 $caller_name = $caller;
536
537 # get rid of pkg
538 $caller_name =~ s/.*:://;
539 if (substr($caller_name, 0, $main_colon_length)
540 eq $main_with_colon)
541 {
542 $caller_name = substr($caller_name, $main_colon_length);
543 }
544
545 } until ($caller_name ne 'trace');
546
547 # If the stack was empty, we were called from the top level
548 $caller_name = 'main' if ($caller_name eq ""
549 || $caller_name eq 'trace');
550
551 my $output = "";
552 foreach my $string (@input) {
553 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
554 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
555 $output .= simple_dumper($string);
556 }
557 else {
558 $string = "$string" if ref $string;
559 $string = $UNDEF unless defined $string;
560 chomp $string;
561 $string = '""' if $string eq "";
562 $output .= " " if $output ne ""
563 && $string ne ""
564 && substr($output, -1, 1) ne " "
565 && substr($string, 0, 1) ne " ";
566 $output .= $string;
567 }
568 }
569
99f78760
KW
570 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
571 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
572 print STDERR $output, "\n";
573 return;
574 }
575}
576
577# This is for a rarely used development feature that allows you to compare two
578# versions of the Unicode standard without having to deal with changes caused
c12f2655
KW
579# by the code points introduced in the later version. Change the 0 to a
580# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
581# code points introduced in that release and earlier will be used; later ones
582# are thrown away. You use the version number of the earliest one you want to
583# compare; then run this program on directory structures containing each
584# release, and compare the outputs. These outputs will therefore include only
585# the code points common to both releases, and you can see the changes caused
586# just by the underlying release semantic changes. For versions earlier than
587# 3.2, you must copy a version of DAge.txt into the directory.
588my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
99870f4d
KW
589my $compare_versions = DEBUG
590 && $string_compare_versions
591 && pack "C*", split /\./, $string_compare_versions;
592
593sub uniques {
594 # Returns non-duplicated input values. From "Perl Best Practices:
595 # Encapsulated Cleverness". p. 455 in first edition.
596
597 my %seen;
0e407844
NC
598 # Arguably this breaks encapsulation, if the goal is to permit multiple
599 # distinct objects to stringify to the same value, and be interchangeable.
600 # However, for this program, no two objects stringify identically, and all
601 # lists passed to this function are either objects or strings. So this
602 # doesn't affect correctness, but it does give a couple of percent speedup.
603 no overloading;
99870f4d
KW
604 return grep { ! $seen{$_}++ } @_;
605}
606
607$0 = File::Spec->canonpath($0);
608
609my $make_test_script = 0; # ? Should we output a test script
610my $write_unchanged_files = 0; # ? Should we update the output files even if
611 # we don't think they have changed
612my $use_directory = ""; # ? Should we chdir somewhere.
613my $pod_directory; # input directory to store the pod file.
614my $pod_file = 'perluniprops';
615my $t_path; # Path to the .t test file
616my $file_list = 'mktables.lst'; # File to store input and output file names.
617 # This is used to speed up the build, by not
618 # executing the main body of the program if
619 # nothing on the list has changed since the
620 # previous build
621my $make_list = 1; # ? Should we write $file_list. Set to always
622 # make a list so that when the pumpking is
623 # preparing a release, s/he won't have to do
624 # special things
625my $glob_list = 0; # ? Should we try to include unknown .txt files
626 # in the input.
bd9ebcfd
KW
627my $output_range_counts = $debugging_build; # ? Should we include the number
628 # of code points in ranges in
629 # the output
558712cf 630my $annotate = 0; # ? Should character names be in the output
9ef2b94f 631
99870f4d
KW
632# Verbosity levels; 0 is quiet
633my $NORMAL_VERBOSITY = 1;
634my $PROGRESS = 2;
635my $VERBOSE = 3;
636
637my $verbosity = $NORMAL_VERBOSITY;
638
639# Process arguments
640while (@ARGV) {
cf25bb62
JH
641 my $arg = shift @ARGV;
642 if ($arg eq '-v') {
99870f4d
KW
643 $verbosity = $VERBOSE;
644 }
645 elsif ($arg eq '-p') {
646 $verbosity = $PROGRESS;
647 $| = 1; # Flush buffers as we go.
648 }
649 elsif ($arg eq '-q') {
650 $verbosity = 0;
651 }
652 elsif ($arg eq '-w') {
653 $write_unchanged_files = 1; # update the files even if havent changed
654 }
655 elsif ($arg eq '-check') {
6ae7e459
YO
656 my $this = shift @ARGV;
657 my $ok = shift @ARGV;
658 if ($this ne $ok) {
659 print "Skipping as check params are not the same.\n";
660 exit(0);
661 }
00a8df5c 662 }
99870f4d
KW
663 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
664 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
665 }
3df51b85
KW
666 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
667 {
99870f4d 668 $make_test_script = 1;
99870f4d
KW
669 }
670 elsif ($arg eq '-makelist') {
671 $make_list = 1;
672 }
673 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
674 -d $use_directory or croak "Unknown directory '$use_directory'";
675 }
676 elsif ($arg eq '-L') {
677
678 # Existence not tested until have chdir'd
679 $file_list = shift;
680 }
681 elsif ($arg eq '-globlist') {
682 $glob_list = 1;
683 }
684 elsif ($arg eq '-c') {
685 $output_range_counts = ! $output_range_counts
686 }
b4a0206c 687 elsif ($arg eq '-annotate') {
558712cf 688 $annotate = 1;
bd9ebcfd
KW
689 $debugging_build = 1;
690 $output_range_counts = 1;
9ef2b94f 691 }
99870f4d
KW
692 else {
693 my $with_c = 'with';
694 $with_c .= 'out' if $output_range_counts; # Complements the state
695 croak <<END;
696usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
697 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
698 [-check A B ]
699 -c : Output comments $with_c number of code points in ranges
700 -q : Quiet Mode: Only output serious warnings.
701 -p : Set verbosity level to normal plus show progress.
702 -v : Set Verbosity level high: Show progress and non-serious
703 warnings
704 -w : Write files regardless
705 -C dir : Change to this directory before proceeding. All relative paths
706 except those specified by the -P and -T options will be done
707 with respect to this directory.
708 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 709 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
710 -L filelist : Use alternate 'filelist' instead of standard one
711 -globlist : Take as input all non-Test *.txt files in current and sub
712 directories
3df51b85
KW
713 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
714 overrides -T
99870f4d 715 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 716 -annotate : Output an annotation for each character in the table files;
c4019d52
KW
717 useful for debugging mktables, looking at diffs; but is slow,
718 memory intensive; resulting tables are usable but slow and
719 very large.
99870f4d
KW
720 -check A B : Executes $0 only if A and B are the same
721END
722 }
723}
724
725# Stores the most-recently changed file. If none have changed, can skip the
726# build
aeab6150 727my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
728
729# Change directories now, because need to read 'version' early.
730if ($use_directory) {
3df51b85 731 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
732 $pod_directory = File::Spec->rel2abs($pod_directory);
733 }
3df51b85 734 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 735 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 736 }
99870f4d 737 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 738 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 739 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 740 }
3df51b85 741 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 742 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 743 }
00a8df5c
YO
744}
745
99870f4d
KW
746# Get Unicode version into regular and v-string. This is done now because
747# various tables below get populated based on it. These tables are populated
748# here to be near the top of the file, and so easily seeable by those needing
749# to modify things.
750open my $VERSION, "<", "version"
751 or croak "$0: can't open required file 'version': $!\n";
752my $string_version = <$VERSION>;
753close $VERSION;
754chomp $string_version;
755my $v_version = pack "C*", split /\./, $string_version; # v string
756
757# The following are the complete names of properties with property values that
758# are known to not match any code points in some versions of Unicode, but that
759# may change in the future so they should be matchable, hence an empty file is
760# generated for them.
761my @tables_that_may_be_empty = (
762 'Joining_Type=Left_Joining',
763 );
764push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
765push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
766push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
767 if $v_version ge v4.1.0;
82aed44a
KW
768push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
769 if $v_version ge v6.0.0;
99870f4d
KW
770
771# The lists below are hashes, so the key is the item in the list, and the
772# value is the reason why it is in the list. This makes generation of
773# documentation easier.
774
775my %why_suppressed; # No file generated for these.
776
777# Files aren't generated for empty extraneous properties. This is arguable.
778# Extraneous properties generally come about because a property is no longer
779# used in a newer version of Unicode. If we generated a file without code
780# points, programs that used to work on that property will still execute
781# without errors. It just won't ever match (or will always match, with \P{}).
782# This means that the logic is now likely wrong. I (khw) think its better to
783# find this out by getting an error message. Just move them to the table
784# above to change this behavior
785my %why_suppress_if_empty_warn_if_not = (
786
787 # It is the only property that has ever officially been removed from the
788 # Standard. The database never contained any code points for it.
789 'Special_Case_Condition' => 'Obsolete',
790
791 # Apparently never official, but there were code points in some versions of
792 # old-style PropList.txt
793 'Non_Break' => 'Obsolete',
794);
795
796# These would normally go in the warn table just above, but they were changed
797# a long time before this program was written, so warnings about them are
798# moot.
799if ($v_version gt v3.2.0) {
800 push @tables_that_may_be_empty,
801 'Canonical_Combining_Class=Attached_Below_Left'
802}
803
5f7264c7 804# These are listed in the Property aliases file in 6.0, but Unihan is ignored
99870f4d
KW
805# unless explicitly added.
806if ($v_version ge v5.2.0) {
807 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 808 foreach my $table (qw (
99870f4d
KW
809 kAccountingNumeric
810 kOtherNumeric
811 kPrimaryNumeric
812 kCompatibilityVariant
813 kIICore
814 kIRG_GSource
815 kIRG_HSource
816 kIRG_JSource
817 kIRG_KPSource
818 kIRG_MSource
819 kIRG_KSource
820 kIRG_TSource
821 kIRG_USource
822 kIRG_VSource
823 kRSUnicode
ea25a9b2 824 ))
99870f4d
KW
825 {
826 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
827 }
ca12659b
NC
828}
829
272501f6
KW
830# Enum values for to_output_map() method in the Map_Table package.
831my $EXTERNAL_MAP = 1;
832my $INTERNAL_MAP = 2;
833
fcf1973c
KW
834# To override computed values for writing the map tables for these properties.
835# The default for enum map tables is to write them out, so that the Unicode
836# .txt files can be removed, but all the data to compute any property value
837# for any code point is available in a more compact form.
838my %global_to_output_map = (
839 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
c12f2655
KW
840 # get stuck supporting it if things change. Since it is a STRING
841 # property, it normally would be listed in the pod, but INTERNAL_MAP
842 # suppresses that.
fcf1973c
KW
843 Unicode_1_Name => $INTERNAL_MAP,
844
845 Present_In => 0, # Suppress, as easily computed from Age
fcf1973c
KW
846 Block => 0, # Suppress, as Blocks.txt is retained.
847);
848
99870f4d
KW
849# Properties that this program ignores.
850my @unimplemented_properties = (
851'Unicode_Radical_Stroke' # Remove if changing to handle this one.
852);
d73e5302 853
99870f4d
KW
854# There are several types of obsolete properties defined by Unicode. These
855# must be hand-edited for every new Unicode release.
856my %why_deprecated; # Generates a deprecated warning message if used.
857my %why_stabilized; # Documentation only
858my %why_obsolete; # Documentation only
859
860{ # Closure
861 my $simple = 'Perl uses the more complete version of this property';
862 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
863
864 my $other_properties = 'other properties';
865 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 866 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
867
868 %why_deprecated = (
5f7264c7 869 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
870 'Jamo_Short_Name' => $contributory,
871 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
872 'Other_Alphabetic' => $contributory,
873 'Other_Default_Ignorable_Code_Point' => $contributory,
874 'Other_Grapheme_Extend' => $contributory,
875 'Other_ID_Continue' => $contributory,
876 'Other_ID_Start' => $contributory,
877 'Other_Lowercase' => $contributory,
878 'Other_Math' => $contributory,
879 'Other_Uppercase' => $contributory,
880 );
881
882 %why_suppressed = (
5f7264c7 883 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
884 # contains the same information, but without the algorithmically
885 # determinable Hangul syllables'. This file is not published, so it's
886 # existence is not noted in the comment.
887 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
888
889 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
99870f4d
KW
890
891 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
892 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
893 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
894 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
895
896 'Name' => "Accessible via 'use charnames;'",
897 'Name_Alias' => "Accessible via 'use charnames;'",
898
5f7264c7 899 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
900 Expands_On_NFC => $why_no_expand,
901 Expands_On_NFD => $why_no_expand,
902 Expands_On_NFKC => $why_no_expand,
903 Expands_On_NFKD => $why_no_expand,
904 );
905
906 # The following are suppressed because they were made contributory or
907 # deprecated by Unicode before Perl ever thought about supporting them.
908 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
909 $why_suppressed{$property} = $why_deprecated{$property};
910 }
cf25bb62 911
99870f4d
KW
912 # Customize the message for all the 'Other_' properties
913 foreach my $property (keys %why_deprecated) {
914 next if (my $main_property = $property) !~ s/^Other_//;
915 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
916 }
917}
918
919if ($v_version ge 4.0.0) {
920 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
921 if ($v_version ge 6.0.0) {
922 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
923 }
99870f4d 924}
5f7264c7 925if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 926 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7
KW
927 if ($v_version ge 6.0.0) {
928 $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed';
929 }
99870f4d
KW
930}
931
932# Probably obsolete forever
933if ($v_version ge v4.1.0) {
82aed44a
KW
934 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
935}
936if ($v_version ge v6.0.0) {
2b352efd
KW
937 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
938 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
99870f4d
KW
939}
940
941# This program can create files for enumerated-like properties, such as
942# 'Numeric_Type'. This file would be the same format as for a string
943# property, with a mapping from code point to its value, so you could look up,
944# for example, the script a code point is in. But no one so far wants this
945# mapping, or they have found another way to get it since this is a new
946# feature. So no file is generated except if it is in this list.
947my @output_mapped_properties = split "\n", <<END;
948END
949
c12f2655
KW
950# If you are using the Unihan database in a Unicode version before 5.2, you
951# need to add the properties that you want to extract from it to this table.
952# For your convenience, the properties in the 6.0 PropertyAliases.txt file are
953# listed, commented out
99870f4d
KW
954my @cjk_properties = split "\n", <<'END';
955#cjkAccountingNumeric; kAccountingNumeric
956#cjkOtherNumeric; kOtherNumeric
957#cjkPrimaryNumeric; kPrimaryNumeric
958#cjkCompatibilityVariant; kCompatibilityVariant
959#cjkIICore ; kIICore
960#cjkIRG_GSource; kIRG_GSource
961#cjkIRG_HSource; kIRG_HSource
962#cjkIRG_JSource; kIRG_JSource
963#cjkIRG_KPSource; kIRG_KPSource
964#cjkIRG_KSource; kIRG_KSource
965#cjkIRG_TSource; kIRG_TSource
966#cjkIRG_USource; kIRG_USource
967#cjkIRG_VSource; kIRG_VSource
968#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
969END
970
971# Similarly for the property values. For your convenience, the lines in the
5f7264c7 972# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
c12f2655 973# '#' marks (for Unicode versions before 5.2)
99870f4d
KW
974my @cjk_property_values = split "\n", <<'END';
975## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
976## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
977## @missing: 0000..10FFFF; cjkIICore; <none>
978## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
979## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
980## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
981## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
982## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
983## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
984## @missing: 0000..10FFFF; cjkIRG_USource; <none>
985## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
986## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
987## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
988## @missing: 0000..10FFFF; cjkRSUnicode; <none>
989END
990
991# The input files don't list every code point. Those not listed are to be
992# defaulted to some value. Below are hard-coded what those values are for
993# non-binary properties as of 5.1. Starting in 5.0, there are
994# machine-parsable comment lines in the files the give the defaults; so this
995# list shouldn't have to be extended. The claim is that all missing entries
996# for binary properties will default to 'N'. Unicode tried to change that in
997# 5.2, but the beta period produced enough protest that they backed off.
998#
999# The defaults for the fields that appear in UnicodeData.txt in this hash must
1000# be in the form that it expects. The others may be synonyms.
1001my $CODE_POINT = '<code point>';
1002my %default_mapping = (
1003 Age => "Unassigned",
1004 # Bidi_Class => Complicated; set in code
1005 Bidi_Mirroring_Glyph => "",
1006 Block => 'No_Block',
1007 Canonical_Combining_Class => 0,
1008 Case_Folding => $CODE_POINT,
1009 Decomposition_Mapping => $CODE_POINT,
1010 Decomposition_Type => 'None',
1011 East_Asian_Width => "Neutral",
1012 FC_NFKC_Closure => $CODE_POINT,
1013 General_Category => 'Cn',
1014 Grapheme_Cluster_Break => 'Other',
1015 Hangul_Syllable_Type => 'NA',
1016 ISO_Comment => "",
1017 Jamo_Short_Name => "",
1018 Joining_Group => "No_Joining_Group",
1019 # Joining_Type => Complicated; set in code
1020 kIICore => 'N', # Is converted to binary
1021 #Line_Break => Complicated; set in code
1022 Lowercase_Mapping => $CODE_POINT,
1023 Name => "",
1024 Name_Alias => "",
1025 NFC_QC => 'Yes',
1026 NFD_QC => 'Yes',
1027 NFKC_QC => 'Yes',
1028 NFKD_QC => 'Yes',
1029 Numeric_Type => 'None',
1030 Numeric_Value => 'NaN',
1031 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1032 Sentence_Break => 'Other',
1033 Simple_Case_Folding => $CODE_POINT,
1034 Simple_Lowercase_Mapping => $CODE_POINT,
1035 Simple_Titlecase_Mapping => $CODE_POINT,
1036 Simple_Uppercase_Mapping => $CODE_POINT,
1037 Titlecase_Mapping => $CODE_POINT,
1038 Unicode_1_Name => "",
1039 Unicode_Radical_Stroke => "",
1040 Uppercase_Mapping => $CODE_POINT,
1041 Word_Break => 'Other',
1042);
1043
1044# Below are files that Unicode furnishes, but this program ignores, and why
1045my %ignored_files = (
1046 'CJKRadicals.txt' => 'Unihan data',
1047 'Index.txt' => 'An index, not actual data',
1048 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1049 'NamesList.txt' => 'Just adds commentary',
1050 'NormalizationCorrections.txt' => 'Data is already in other files.',
1051 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1052 'ReadMe.txt' => 'Just comments',
1053 'README.TXT' => 'Just comments',
1054 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
5f7264c7
KW
1055 'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications',
1056 'IndicMatraCategory.txt' => 'Provisional',
1057 'IndicSyllabicCategory.txt' => 'Provisional',
99870f4d
KW
1058);
1059
678f13d5 1060### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1061
1062my $HEADER=<<"EOF";
1063# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1064# This file is machine-generated by $0 from the Unicode
1065# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1066EOF
1067
b6922eda 1068my $INTERNAL_ONLY=<<"EOF";
99870f4d
KW
1069
1070# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
fac53429
KW
1071# This file is for internal use by core Perl only. The format and even the
1072# name or existence of this file are subject to change without notice. Don't
1073# use it directly.
99870f4d
KW
1074EOF
1075
1076my $DEVELOPMENT_ONLY=<<"EOF";
1077# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1078# This file contains information artificially constrained to code points
1079# present in Unicode release $string_compare_versions.
1080# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1081# not be used for production.
b6922eda
KW
1082
1083EOF
1084
99870f4d
KW
1085my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1086my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1087my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1088
1089# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1090# two must be 10; if there are 5, the first must not be a 0. Written this way
8c32d378
KW
1091# to decrease backtracking. The first one allows the code point to be at the
1092# end of a word, but to work properly, the word shouldn't end with a valid hex
1093# character. The second one won't match a code point at the end of a word,
1094# and doesn't have the run-on issue
1095my $run_on_code_point_re =
1096 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1097my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1098
1099# This matches the beginning of the line in the Unicode db files that give the
1100# defaults for code points not listed (i.e., missing) in the file. The code
1101# depends on this ending with a semi-colon, so it can assume it is a valid
1102# field when the line is split() by semi-colons
1103my $missing_defaults_prefix =
1104 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1105
1106# Property types. Unicode has more types, but these are sufficient for our
1107# purposes.
1108my $UNKNOWN = -1; # initialized to illegal value
1109my $NON_STRING = 1; # Either binary or enum
1110my $BINARY = 2;
1111my $ENUM = 3; # Include catalog
1112my $STRING = 4; # Anything else: string or misc
1113
1114# Some input files have lines that give default values for code points not
1115# contained in the file. Sometimes these should be ignored.
1116my $NO_DEFAULTS = 0; # Must evaluate to false
1117my $NOT_IGNORED = 1;
1118my $IGNORED = 2;
1119
1120# Range types. Each range has a type. Most ranges are type 0, for normal,
1121# and will appear in the main body of the tables in the output files, but
1122# there are other types of ranges as well, listed below, that are specially
1123# handled. There are pseudo-types as well that will never be stored as a
1124# type, but will affect the calculation of the type.
1125
1126# 0 is for normal, non-specials
1127my $MULTI_CP = 1; # Sequence of more than code point
1128my $HANGUL_SYLLABLE = 2;
1129my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1130my $NULL = 4; # The map is to the null string; utf8.c can't
1131 # handle these, nor is there an accepted syntax
1132 # for them in \p{} constructs
f86864ac 1133my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1134 # otherwise be $MULTI_CP type are instead type 0
1135
1136# process_generic_property_file() can accept certain overrides in its input.
1137# Each of these must begin AND end with $CMD_DELIM.
1138my $CMD_DELIM = "\a";
1139my $REPLACE_CMD = 'replace'; # Override the Replace
1140my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1141
1142my $NO = 0;
1143my $YES = 1;
1144
1145# Values for the Replace argument to add_range.
1146# $NO # Don't replace; add only the code points not
1147 # already present.
1148my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1149 # the comments at the subroutine definition.
1150my $UNCONDITIONALLY = 2; # Replace without conditions.
1151my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1152 # already there
56343c78 1153my $CROAK = 5; # Die with an error if is already there
99870f4d
KW
1154
1155# Flags to give property statuses. The phrases are to remind maintainers that
1156# if the flag is changed, the indefinite article referring to it in the
1157# documentation may need to be as well.
1158my $NORMAL = "";
1159my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1160 # it is suppressed
c12f2655
KW
1161my $PLACEHOLDER = 'P'; # A property that is defined as a placeholder in a
1162 # Unicode version that doesn't have it, but we need it
1163 # to be defined, if empty, to have things work.
1164 # Implies no pod entry generated
99870f4d
KW
1165my $DEPRECATED = 'D';
1166my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1167my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1168my $DISCOURAGED = 'X';
1169my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1170my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1171my $STRICTER = 'T';
1172my $a_bold_stricter = "a 'B<$STRICTER>'";
1173my $A_bold_stricter = "A 'B<$STRICTER>'";
1174my $STABILIZED = 'S';
1175my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1176my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1177my $OBSOLETE = 'O';
1178my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1179my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1180
1181my %status_past_participles = (
1182 $DISCOURAGED => 'discouraged',
1183 $SUPPRESSED => 'should never be generated',
1184 $STABILIZED => 'stabilized',
1185 $OBSOLETE => 'obsolete',
37e2e78e 1186 $DEPRECATED => 'deprecated',
99870f4d
KW
1187);
1188
f5817e0a
KW
1189# The format of the values of the tables:
1190my $EMPTY_FORMAT = "";
99870f4d
KW
1191my $BINARY_FORMAT = 'b';
1192my $DECIMAL_FORMAT = 'd';
1193my $FLOAT_FORMAT = 'f';
1194my $INTEGER_FORMAT = 'i';
1195my $HEX_FORMAT = 'x';
1196my $RATIONAL_FORMAT = 'r';
1197my $STRING_FORMAT = 's';
a14f3cb1 1198my $DECOMP_STRING_FORMAT = 'c';
99870f4d
KW
1199
1200my %map_table_formats = (
1201 $BINARY_FORMAT => 'binary',
1202 $DECIMAL_FORMAT => 'single decimal digit',
1203 $FLOAT_FORMAT => 'floating point number',
1204 $INTEGER_FORMAT => 'integer',
add63c13 1205 $HEX_FORMAT => 'non-negative hex whole number; a code point',
99870f4d 1206 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1207 $STRING_FORMAT => 'string',
92f9d56c 1208 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
99870f4d
KW
1209);
1210
1211# Unicode didn't put such derived files in a separate directory at first.
1212my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1213my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1214my $AUXILIARY = 'auxiliary';
1215
1216# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1217my %loose_to_file_of; # loosely maps table names to their respective
1218 # files
1219my %stricter_to_file_of; # same; but for stricter mapping.
1220my %nv_floating_to_rational; # maps numeric values floating point numbers to
1221 # their rational equivalent
c12f2655
KW
1222my %loose_property_name_of; # Loosely maps (non_string) property names to
1223 # standard form
99870f4d 1224
d867ccfb
KW
1225# Most properties are immune to caseless matching, otherwise you would get
1226# nonsensical results, as properties are a function of a code point, not
1227# everything that is caselessly equivalent to that code point. For example,
1228# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1229# be true because 's' and 'S' are equivalent caselessly. However,
1230# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1231# extend that concept to those very few properties that are like this. Each
1232# such property will match the full range caselessly. They are hard-coded in
1233# the program; it's not worth trying to make it general as it's extremely
1234# unlikely that they will ever change.
1235my %caseless_equivalent_to;
1236
99870f4d
KW
1237# These constants names and values were taken from the Unicode standard,
1238# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1239# syllables. The '_string' versions are so generated tables can retain the
1240# hex format, which is the more familiar value
1241my $SBase_string = "0xAC00";
1242my $SBase = CORE::hex $SBase_string;
1243my $LBase_string = "0x1100";
1244my $LBase = CORE::hex $LBase_string;
1245my $VBase_string = "0x1161";
1246my $VBase = CORE::hex $VBase_string;
1247my $TBase_string = "0x11A7";
1248my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1249my $SCount = 11172;
1250my $LCount = 19;
1251my $VCount = 21;
1252my $TCount = 28;
1253my $NCount = $VCount * $TCount;
1254
1255# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1256# with the above published constants.
1257my %Jamo;
1258my %Jamo_L; # Leading consonants
1259my %Jamo_V; # Vowels
1260my %Jamo_T; # Trailing consonants
1261
37e2e78e 1262my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1263my @unhandled_properties; # Will contain a list of properties found in
1264 # the input that we didn't process.
f86864ac 1265my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1266 # listed in the pod
1267my @map_properties; # Properties that get map files written
1268my @named_sequences; # NamedSequences.txt contents.
1269my %potential_files; # Generated list of all .txt files in the directory
1270 # structure so we can warn if something is being
1271 # ignored.
1272my @files_actually_output; # List of files we generated.
1273my @more_Names; # Some code point names are compound; this is used
1274 # to store the extra components of them.
1275my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1276 # the minimum before we consider it equivalent to a
1277 # candidate rational
1278my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1279
1280# These store references to certain commonly used property objects
1281my $gc;
1282my $perl;
1283my $block;
3e20195b
KW
1284my $perl_charname;
1285my $print;
7fc6cb55 1286my $Any;
359523e2 1287my $script;
99870f4d
KW
1288
1289# Are there conflicting names because of beginning with 'In_', or 'Is_'
1290my $has_In_conflicts = 0;
1291my $has_Is_conflicts = 0;
1292
1293sub internal_file_to_platform ($) {
1294 # Convert our file paths which have '/' separators to those of the
1295 # platform.
1296
1297 my $file = shift;
1298 return undef unless defined $file;
1299
1300 return File::Spec->join(split '/', $file);
d07a55ed 1301}
5beb625e 1302
99870f4d
KW
1303sub file_exists ($) { # platform independent '-e'. This program internally
1304 # uses slash as a path separator.
1305 my $file = shift;
1306 return 0 if ! defined $file;
1307 return -e internal_file_to_platform($file);
1308}
5beb625e 1309
99870f4d 1310sub objaddr($) {
23e33b60
KW
1311 # Returns the address of the blessed input object.
1312 # It doesn't check for blessedness because that would do a string eval
1313 # every call, and the program is structured so that this is never called
1314 # for a non-blessed object.
99870f4d 1315
23e33b60 1316 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1317
1318 # Numifying a ref gives its address.
051df77b 1319 return pack 'J', $_[0];
99870f4d
KW
1320}
1321
558712cf 1322# These are used only if $annotate is true.
c4019d52
KW
1323# The entire range of Unicode characters is examined to populate these
1324# after all the input has been processed. But most can be skipped, as they
1325# have the same descriptive phrases, such as being unassigned
1326my @viacode; # Contains the 1 million character names
1327my @printable; # boolean: And are those characters printable?
1328my @annotate_char_type; # Contains a type of those characters, specifically
1329 # for the purposes of annotation.
1330my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1331 # name for the purposes of annotation. They map to the
c4019d52
KW
1332 # upper edge of the range, so that the end point can
1333 # be immediately found. This is used to skip ahead to
1334 # the end of a range, and avoid processing each
1335 # individual code point in it.
1336my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1337 # characters, but excluding those which are
1338 # also noncharacter code points
1339
1340# The annotation types are an extension of the regular range types, though
1341# some of the latter are folded into one. Make the new types negative to
1342# avoid conflicting with the regular types
1343my $SURROGATE_TYPE = -1;
1344my $UNASSIGNED_TYPE = -2;
1345my $PRIVATE_USE_TYPE = -3;
1346my $NONCHARACTER_TYPE = -4;
1347my $CONTROL_TYPE = -5;
1348my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1349
1350sub populate_char_info ($) {
558712cf 1351 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1352 # input code point's info that are needed for outputting more detailed
1353 # comments. If calling context wants a return, it is the end point of
1354 # any contiguous range of characters that share essentially the same info
1355
1356 my $i = shift;
1357 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1358
1359 $viacode[$i] = $perl_charname->value_of($i) || "";
1360
1361 # A character is generally printable if Unicode says it is,
1362 # but below we make sure that most Unicode general category 'C' types
1363 # aren't.
1364 $printable[$i] = $print->contains($i);
1365
1366 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1367
1368 # Only these two regular types are treated specially for annotations
1369 # purposes
1370 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1371 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1372
1373 # Give a generic name to all code points that don't have a real name.
1374 # We output ranges, if applicable, for these. Also calculate the end
1375 # point of the range.
1376 my $end;
1377 if (! $viacode[$i]) {
1378 if ($gc-> table('Surrogate')->contains($i)) {
1379 $viacode[$i] = 'Surrogate';
1380 $annotate_char_type[$i] = $SURROGATE_TYPE;
1381 $printable[$i] = 0;
1382 $end = $gc->table('Surrogate')->containing_range($i)->end;
1383 }
1384 elsif ($gc-> table('Private_use')->contains($i)) {
1385 $viacode[$i] = 'Private Use';
1386 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1387 $printable[$i] = 0;
1388 $end = $gc->table('Private_Use')->containing_range($i)->end;
1389 }
1390 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1391 contains($i))
1392 {
1393 $viacode[$i] = 'Noncharacter';
1394 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1395 $printable[$i] = 0;
1396 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1397 containing_range($i)->end;
1398 }
1399 elsif ($gc-> table('Control')->contains($i)) {
1400 $viacode[$i] = 'Control';
1401 $annotate_char_type[$i] = $CONTROL_TYPE;
1402 $printable[$i] = 0;
1403 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1404 }
1405 elsif ($gc-> table('Unassigned')->contains($i)) {
1406 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1407 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1408 $printable[$i] = 0;
1409
1410 # Because we name the unassigned by the blocks they are in, it
1411 # can't go past the end of that block, and it also can't go past
1412 # the unassigned range it is in. The special table makes sure
1413 # that the non-characters, which are unassigned, are separated
1414 # out.
1415 $end = min($block->containing_range($i)->end,
1416 $unassigned_sans_noncharacters-> containing_range($i)->
1417 end);
13ca76ff
KW
1418 }
1419 else {
1420 Carp::my_carp_bug("Can't figure out how to annotate "
1421 . sprintf("U+%04X", $i)
1422 . ". Proceeding anyway.");
c4019d52
KW
1423 $viacode[$i] = 'UNKNOWN';
1424 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1425 $printable[$i] = 0;
1426 }
1427 }
1428
1429 # Here, has a name, but if it's one in which the code point number is
1430 # appended to the name, do that.
1431 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1432 $viacode[$i] .= sprintf("-%04X", $i);
1433 $end = $perl_charname->containing_range($i)->end;
1434 }
1435
1436 # And here, has a name, but if it's a hangul syllable one, replace it with
1437 # the correct name from the Unicode algorithm
1438 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1439 use integer;
1440 my $SIndex = $i - $SBase;
1441 my $L = $LBase + $SIndex / $NCount;
1442 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1443 my $T = $TBase + $SIndex % $TCount;
1444 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1445 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1446 $end = $perl_charname->containing_range($i)->end;
1447 }
1448
1449 return if ! defined wantarray;
1450 return $i if ! defined $end; # If not a range, return the input
1451
1452 # Save this whole range so can find the end point quickly
1453 $annotate_ranges->add_map($i, $end, $end);
1454
1455 return $end;
1456}
1457
23e33b60
KW
1458# Commented code below should work on Perl 5.8.
1459## This 'require' doesn't necessarily work in miniperl, and even if it does,
1460## the native perl version of it (which is what would operate under miniperl)
1461## is extremely slow, as it does a string eval every call.
1462#my $has_fast_scalar_util = $\18 !~ /miniperl/
1463# && defined eval "require Scalar::Util";
1464#
1465#sub objaddr($) {
1466# # Returns the address of the blessed input object. Uses the XS version if
1467# # available. It doesn't check for blessedness because that would do a
1468# # string eval every call, and the program is structured so that this is
1469# # never called for a non-blessed object.
1470#
1471# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1472#
1473# # Check at least that is a ref.
1474# my $pkg = ref($_[0]) or return undef;
1475#
1476# # Change to a fake package to defeat any overloaded stringify
1477# bless $_[0], 'main::Fake';
1478#
1479# # Numifying a ref gives its address.
051df77b 1480# my $addr = pack 'J', $_[0];
23e33b60
KW
1481#
1482# # Return to original class
1483# bless $_[0], $pkg;
1484# return $addr;
1485#}
1486
99870f4d
KW
1487sub max ($$) {
1488 my $a = shift;
1489 my $b = shift;
1490 return $a if $a >= $b;
1491 return $b;
1492}
1493
1494sub min ($$) {
1495 my $a = shift;
1496 my $b = shift;
1497 return $a if $a <= $b;
1498 return $b;
1499}
1500
1501sub clarify_number ($) {
1502 # This returns the input number with underscores inserted every 3 digits
1503 # in large (5 digits or more) numbers. Input must be entirely digits, not
1504 # checked.
1505
1506 my $number = shift;
1507 my $pos = length($number) - 3;
1508 return $number if $pos <= 1;
1509 while ($pos > 0) {
1510 substr($number, $pos, 0) = '_';
1511 $pos -= 3;
5beb625e 1512 }
99870f4d 1513 return $number;
99598c8c
JH
1514}
1515
12ac2576 1516
99870f4d 1517package Carp;
7ebf06b3 1518
99870f4d
KW
1519# These routines give a uniform treatment of messages in this program. They
1520# are placed in the Carp package to cause the stack trace to not include them,
1521# although an alternative would be to use another package and set @CARP_NOT
1522# for it.
12ac2576 1523
99870f4d 1524our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1525
99f78760
KW
1526# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1527# and overload trying to load Scalar:Util under miniperl. See
1528# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1529undef $overload::VERSION;
1530
99870f4d
KW
1531sub my_carp {
1532 my $message = shift || "";
1533 my $nofold = shift || 0;
7ebf06b3 1534
99870f4d
KW
1535 if ($message) {
1536 $message = main::join_lines($message);
1537 $message =~ s/^$0: *//; # Remove initial program name
1538 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1539 $message = "\n$0: $message;";
12ac2576 1540
99870f4d
KW
1541 # Fold the message with program name, semi-colon end punctuation
1542 # (which looks good with the message that carp appends to it), and a
1543 # hanging indent for continuation lines.
1544 $message = main::simple_fold($message, "", 4) unless $nofold;
1545 $message =~ s/\n$//; # Remove the trailing nl so what carp
1546 # appends is to the same line
1547 }
12ac2576 1548
99870f4d 1549 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1550
99870f4d
KW
1551 carp $message;
1552 return;
1553}
7ebf06b3 1554
99870f4d
KW
1555sub my_carp_bug {
1556 # This is called when it is clear that the problem is caused by a bug in
1557 # this program.
7ebf06b3 1558
99870f4d
KW
1559 my $message = shift;
1560 $message =~ s/^$0: *//;
1561 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1562 carp $message;
1563 return;
1564}
7ebf06b3 1565
99870f4d
KW
1566sub carp_too_few_args {
1567 if (@_ != 2) {
1568 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1569 return;
12ac2576 1570 }
7ebf06b3 1571
99870f4d
KW
1572 my $args_ref = shift;
1573 my $count = shift;
7ebf06b3 1574
99870f4d
KW
1575 my_carp_bug("Need at least $count arguments to "
1576 . (caller 1)[3]
1577 . ". Instead got: '"
1578 . join ', ', @$args_ref
1579 . "'. No action taken.");
1580 return;
12ac2576
JP
1581}
1582
99870f4d
KW
1583sub carp_extra_args {
1584 my $args_ref = shift;
1585 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1586
99870f4d
KW
1587 unless (ref $args_ref) {
1588 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1589 return;
1590 }
1591 my ($package, $file, $line) = caller;
1592 my $subroutine = (caller 1)[3];
cf25bb62 1593
99870f4d
KW
1594 my $list;
1595 if (ref $args_ref eq 'HASH') {
1596 foreach my $key (keys %$args_ref) {
1597 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1598 }
99870f4d 1599 $list = join ', ', each %{$args_ref};
cf25bb62 1600 }
99870f4d
KW
1601 elsif (ref $args_ref eq 'ARRAY') {
1602 foreach my $arg (@$args_ref) {
1603 $arg = $UNDEF unless defined $arg;
1604 }
1605 $list = join ', ', @$args_ref;
1606 }
1607 else {
1608 my_carp_bug("Can't cope with ref "
1609 . ref($args_ref)
1610 . " . argument to 'carp_extra_args'. Not checking arguments.");
1611 return;
1612 }
1613
1614 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1615 return;
d73e5302
JH
1616}
1617
99870f4d
KW
1618package main;
1619
1620{ # Closure
1621
1622 # This program uses the inside-out method for objects, as recommended in
1623 # "Perl Best Practices". This closure aids in generating those. There
1624 # are two routines. setup_package() is called once per package to set
1625 # things up, and then set_access() is called for each hash representing a
1626 # field in the object. These routines arrange for the object to be
1627 # properly destroyed when no longer used, and for standard accessor
1628 # functions to be generated. If you need more complex accessors, just
1629 # write your own and leave those accesses out of the call to set_access().
1630 # More details below.
1631
1632 my %constructor_fields; # fields that are to be used in constructors; see
1633 # below
1634
1635 # The values of this hash will be the package names as keys to other
1636 # hashes containing the name of each field in the package as keys, and
1637 # references to their respective hashes as values.
1638 my %package_fields;
1639
1640 sub setup_package {
1641 # Sets up the package, creating standard DESTROY and dump methods
1642 # (unless already defined). The dump method is used in debugging by
1643 # simple_dumper().
1644 # The optional parameters are:
1645 # a) a reference to a hash, that gets populated by later
1646 # set_access() calls with one of the accesses being
1647 # 'constructor'. The caller can then refer to this, but it is
1648 # not otherwise used by these two routines.
1649 # b) a reference to a callback routine to call during destruction
1650 # of the object, before any fields are actually destroyed
1651
1652 my %args = @_;
1653 my $constructor_ref = delete $args{'Constructor_Fields'};
1654 my $destroy_callback = delete $args{'Destroy_Callback'};
1655 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1656
1657 my %fields;
1658 my $package = (caller)[0];
1659
1660 $package_fields{$package} = \%fields;
1661 $constructor_fields{$package} = $constructor_ref;
1662
1663 unless ($package->can('DESTROY')) {
1664 my $destroy_name = "${package}::DESTROY";
1665 no strict "refs";
1666
1667 # Use typeglob to give the anonymous subroutine the name we want
1668 *$destroy_name = sub {
1669 my $self = shift;
ffe43484 1670 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1671
1672 $self->$destroy_callback if $destroy_callback;
1673 foreach my $field (keys %{$package_fields{$package}}) {
1674 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1675 delete $package_fields{$package}{$field}{$addr};
1676 }
1677 return;
1678 }
1679 }
1680
1681 unless ($package->can('dump')) {
1682 my $dump_name = "${package}::dump";
1683 no strict "refs";
1684 *$dump_name = sub {
1685 my $self = shift;
1686 return dump_inside_out($self, $package_fields{$package}, @_);
1687 }
1688 }
1689 return;
1690 }
1691
1692 sub set_access {
1693 # Arrange for the input field to be garbage collected when no longer
1694 # needed. Also, creates standard accessor functions for the field
1695 # based on the optional parameters-- none if none of these parameters:
1696 # 'addable' creates an 'add_NAME()' accessor function.
1697 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1698 # function.
1699 # 'settable' creates a 'set_NAME()' accessor function.
1700 # 'constructor' doesn't create an accessor function, but adds the
1701 # field to the hash that was previously passed to
1702 # setup_package();
1703 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1704 # 'add' etc. all mean 'addable'.
1705 # The read accessor function will work on both array and scalar
1706 # values. If another accessor in the parameter list is 'a', the read
1707 # access assumes an array. You can also force it to be array access
1708 # by specifying 'readable_array' instead of 'readable'
1709 #
1710 # A sort-of 'protected' access can be set-up by preceding the addable,
1711 # readable or settable with some initial portion of 'protected_' (but,
1712 # the underscore is required), like 'p_a', 'pro_set', etc. The
1713 # "protection" is only by convention. All that happens is that the
1714 # accessor functions' names begin with an underscore. So instead of
1715 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1716 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1717 # end of each package, and then storing the __LINE__ ranges and
1718 # checking them on every accessor. But that is way overkill.)
1719
1720 # We create anonymous subroutines as the accessors and then use
1721 # typeglobs to assign them to the proper package and name
1722
1723 my $name = shift; # Name of the field
1724 my $field = shift; # Reference to the inside-out hash containing the
1725 # field
1726
1727 my $package = (caller)[0];
1728
1729 if (! exists $package_fields{$package}) {
1730 croak "$0: Must call 'setup_package' before 'set_access'";
1731 }
d73e5302 1732
99870f4d
KW
1733 # Stash the field so DESTROY can get it.
1734 $package_fields{$package}{$name} = $field;
cf25bb62 1735
99870f4d
KW
1736 # Remaining arguments are the accessors. For each...
1737 foreach my $access (@_) {
1738 my $access = lc $access;
cf25bb62 1739
99870f4d 1740 my $protected = "";
cf25bb62 1741
99870f4d
KW
1742 # Match the input as far as it goes.
1743 if ($access =~ /^(p[^_]*)_/) {
1744 $protected = $1;
1745 if (substr('protected_', 0, length $protected)
1746 eq $protected)
1747 {
1748
1749 # Add 1 for the underscore not included in $protected
1750 $access = substr($access, length($protected) + 1);
1751 $protected = '_';
1752 }
1753 else {
1754 $protected = "";
1755 }
1756 }
1757
1758 if (substr('addable', 0, length $access) eq $access) {
1759 my $subname = "${package}::${protected}add_$name";
1760 no strict "refs";
1761
1762 # add_ accessor. Don't add if already there, which we
1763 # determine using 'eq' for scalars and '==' otherwise.
1764 *$subname = sub {
1765 use strict "refs";
1766 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1767 my $self = shift;
1768 my $value = shift;
ffe43484 1769 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1770 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1771 if (ref $value) {
f998e60c 1772 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1773 }
1774 else {
f998e60c 1775 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1776 }
f998e60c 1777 push @{$field->{$addr}}, $value;
99870f4d
KW
1778 return;
1779 }
1780 }
1781 elsif (substr('constructor', 0, length $access) eq $access) {
1782 if ($protected) {
1783 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1784 }
1785 else {
1786 $constructor_fields{$package}{$name} = $field;
1787 }
1788 }
1789 elsif (substr('readable_array', 0, length $access) eq $access) {
1790
1791 # Here has read access. If one of the other parameters for
1792 # access is array, or this one specifies array (by being more
1793 # than just 'readable_'), then create a subroutine that
1794 # assumes the data is an array. Otherwise just a scalar
1795 my $subname = "${package}::${protected}$name";
1796 if (grep { /^a/i } @_
1797 or length($access) > length('readable_'))
1798 {
1799 no strict "refs";
1800 *$subname = sub {
1801 use strict "refs";
23e33b60 1802 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1803 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1804 if (ref $field->{$addr} ne 'ARRAY') {
1805 my $type = ref $field->{$addr};
1806 $type = 'scalar' unless $type;
1807 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1808 return;
1809 }
1810 return scalar @{$field->{$addr}} unless wantarray;
1811
1812 # Make a copy; had problems with caller modifying the
1813 # original otherwise
1814 my @return = @{$field->{$addr}};
1815 return @return;
1816 }
1817 }
1818 else {
1819
1820 # Here not an array value, a simpler function.
1821 no strict "refs";
1822 *$subname = sub {
1823 use strict "refs";
23e33b60 1824 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1825 no overloading;
051df77b 1826 return $field->{pack 'J', $_[0]};
99870f4d
KW
1827 }
1828 }
1829 }
1830 elsif (substr('settable', 0, length $access) eq $access) {
1831 my $subname = "${package}::${protected}set_$name";
1832 no strict "refs";
1833 *$subname = sub {
1834 use strict "refs";
23e33b60
KW
1835 if (main::DEBUG) {
1836 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1837 Carp::carp_extra_args(\@_) if @_ > 2;
1838 }
1839 # $self is $_[0]; $value is $_[1]
f998e60c 1840 no overloading;
051df77b 1841 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1842 return;
1843 }
1844 }
1845 else {
1846 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1847 }
cf25bb62 1848 }
99870f4d 1849 return;
cf25bb62 1850 }
99870f4d
KW
1851}
1852
1853package Input_file;
1854
1855# All input files use this object, which stores various attributes about them,
1856# and provides for convenient, uniform handling. The run method wraps the
1857# processing. It handles all the bookkeeping of opening, reading, and closing
1858# the file, returning only significant input lines.
1859#
1860# Each object gets a handler which processes the body of the file, and is
1861# called by run(). Most should use the generic, default handler, which has
1862# code scrubbed to handle things you might not expect. A handler should
1863# basically be a while(next_line()) {...} loop.
1864#
1865# You can also set up handlers to
1866# 1) call before the first line is read for pre processing
1867# 2) call to adjust each line of the input before the main handler gets them
1868# 3) call upon EOF before the main handler exits its loop
1869# 4) call at the end for post processing
1870#
1871# $_ is used to store the input line, and is to be filtered by the
1872# each_line_handler()s. So, if the format of the line is not in the desired
1873# format for the main handler, these are used to do that adjusting. They can
1874# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1875# so the $_ output of one is used as the input to the next. None of the other
1876# handlers are stackable, but could easily be changed to be so.
1877#
1878# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1879# which insert the parameters as lines to be processed before the next input
1880# file line is read. This allows the EOF handler to flush buffers, for
1881# example. The difference between the two routines is that the lines inserted
1882# by insert_lines() are subjected to the each_line_handler()s. (So if you
1883# called it from such a handler, you would get infinite recursion.) Lines
1884# inserted by insert_adjusted_lines() go directly to the main handler without
1885# any adjustments. If the post-processing handler calls any of these, there
1886# will be no effect. Some error checking for these conditions could be added,
1887# but it hasn't been done.
1888#
1889# carp_bad_line() should be called to warn of bad input lines, which clears $_
1890# to prevent further processing of the line. This routine will output the
1891# message as a warning once, and then keep a count of the lines that have the
1892# same message, and output that count at the end of the file's processing.
1893# This keeps the number of messages down to a manageable amount.
1894#
1895# get_missings() should be called to retrieve any @missing input lines.
1896# Messages will be raised if this isn't done if the options aren't to ignore
1897# missings.
1898
1899sub trace { return main::trace(@_); }
1900
99870f4d
KW
1901{ # Closure
1902 # Keep track of fields that are to be put into the constructor.
1903 my %constructor_fields;
1904
1905 main::setup_package(Constructor_Fields => \%constructor_fields);
1906
1907 my %file; # Input file name, required
1908 main::set_access('file', \%file, qw{ c r });
1909
1910 my %first_released; # Unicode version file was first released in, required
1911 main::set_access('first_released', \%first_released, qw{ c r });
1912
1913 my %handler; # Subroutine to process the input file, defaults to
1914 # 'process_generic_property_file'
1915 main::set_access('handler', \%handler, qw{ c });
1916
1917 my %property;
1918 # name of property this file is for. defaults to none, meaning not
1919 # applicable, or is otherwise determinable, for example, from each line.
1920 main::set_access('property', \%property, qw{ c });
1921
1922 my %optional;
1923 # If this is true, the file is optional. If not present, no warning is
1924 # output. If it is present, the string given by this parameter is
1925 # evaluated, and if false the file is not processed.
1926 main::set_access('optional', \%optional, 'c', 'r');
1927
1928 my %non_skip;
1929 # This is used for debugging, to skip processing of all but a few input
1930 # files. Add 'non_skip => 1' to the constructor for those files you want
1931 # processed when you set the $debug_skip global.
1932 main::set_access('non_skip', \%non_skip, 'c');
1933
37e2e78e
KW
1934 my %skip;
1935 # This is used to skip processing of this input file semi-permanently.
1936 # It is used for files that we aren't planning to process anytime soon,
1937 # but want to allow to be in the directory and not raise a message that we
1938 # are not handling. Mostly for test files. This is in contrast to the
1939 # non_skip element, which is supposed to be used very temporarily for
1940 # debugging. Sets 'optional' to 1
1941 main::set_access('skip', \%skip, 'c');
1942
99870f4d
KW
1943 my %each_line_handler;
1944 # list of subroutines to look at and filter each non-comment line in the
1945 # file. defaults to none. The subroutines are called in order, each is
1946 # to adjust $_ for the next one, and the final one adjusts it for
1947 # 'handler'
1948 main::set_access('each_line_handler', \%each_line_handler, 'c');
1949
1950 my %has_missings_defaults;
1951 # ? Are there lines in the file giving default values for code points
1952 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1953 # the norm, but IGNORED means it has such lines, but the handler doesn't
1954 # use them. Having these three states allows us to catch changes to the
1955 # UCD that this program should track
1956 main::set_access('has_missings_defaults',
1957 \%has_missings_defaults, qw{ c r });
1958
1959 my %pre_handler;
1960 # Subroutine to call before doing anything else in the file. If undef, no
1961 # such handler is called.
1962 main::set_access('pre_handler', \%pre_handler, qw{ c });
1963
1964 my %eof_handler;
1965 # Subroutine to call upon getting an EOF on the input file, but before
1966 # that is returned to the main handler. This is to allow buffers to be
1967 # flushed. The handler is expected to call insert_lines() or
1968 # insert_adjusted() with the buffered material
1969 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1970
1971 my %post_handler;
1972 # Subroutine to call after all the lines of the file are read in and
1973 # processed. If undef, no such handler is called.
1974 main::set_access('post_handler', \%post_handler, qw{ c });
1975
1976 my %progress_message;
1977 # Message to print to display progress in lieu of the standard one
1978 main::set_access('progress_message', \%progress_message, qw{ c });
1979
1980 my %handle;
1981 # cache open file handle, internal. Is undef if file hasn't been
1982 # processed at all, empty if has;
1983 main::set_access('handle', \%handle);
1984
1985 my %added_lines;
1986 # cache of lines added virtually to the file, internal
1987 main::set_access('added_lines', \%added_lines);
1988
1989 my %errors;
1990 # cache of errors found, internal
1991 main::set_access('errors', \%errors);
1992
1993 my %missings;
1994 # storage of '@missing' defaults lines
1995 main::set_access('missings', \%missings);
1996
1997 sub new {
1998 my $class = shift;
1999
2000 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2001 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2002
2003 # Set defaults
2004 $handler{$addr} = \&main::process_generic_property_file;
2005 $non_skip{$addr} = 0;
37e2e78e 2006 $skip{$addr} = 0;
99870f4d
KW
2007 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2008 $handle{$addr} = undef;
2009 $added_lines{$addr} = [ ];
2010 $each_line_handler{$addr} = [ ];
2011 $errors{$addr} = { };
2012 $missings{$addr} = [ ];
2013
2014 # Two positional parameters.
99f78760 2015 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2016 $file{$addr} = main::internal_file_to_platform(shift);
2017 $first_released{$addr} = shift;
2018
2019 # The rest of the arguments are key => value pairs
2020 # %constructor_fields has been set up earlier to list all possible
2021 # ones. Either set or push, depending on how the default has been set
2022 # up just above.
2023 my %args = @_;
2024 foreach my $key (keys %args) {
2025 my $argument = $args{$key};
2026
2027 # Note that the fields are the lower case of the constructor keys
2028 my $hash = $constructor_fields{lc $key};
2029 if (! defined $hash) {
2030 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2031 next;
2032 }
2033 if (ref $hash->{$addr} eq 'ARRAY') {
2034 if (ref $argument eq 'ARRAY') {
2035 foreach my $argument (@{$argument}) {
2036 next if ! defined $argument;
2037 push @{$hash->{$addr}}, $argument;
2038 }
2039 }
2040 else {
2041 push @{$hash->{$addr}}, $argument if defined $argument;
2042 }
2043 }
2044 else {
2045 $hash->{$addr} = $argument;
2046 }
2047 delete $args{$key};
2048 };
2049
2050 # If the file has a property for it, it means that the property is not
2051 # listed in the file's entries. So add a handler to the list of line
2052 # handlers to insert the property name into the lines, to provide a
2053 # uniform interface to the final processing subroutine.
2054 # the final code doesn't have to worry about that.
2055 if ($property{$addr}) {
2056 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2057 }
2058
2059 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2060 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 2061 }
99870f4d 2062
37e2e78e
KW
2063 $optional{$addr} = 1 if $skip{$addr};
2064
99870f4d 2065 return $self;
d73e5302
JH
2066 }
2067
cf25bb62 2068
99870f4d
KW
2069 use overload
2070 fallback => 0,
2071 qw("") => "_operator_stringify",
2072 "." => \&main::_operator_dot,
2073 ;
cf25bb62 2074
99870f4d
KW
2075 sub _operator_stringify {
2076 my $self = shift;
cf25bb62 2077
99870f4d 2078 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2079 }
d73e5302 2080
99870f4d
KW
2081 # flag to make sure extracted files are processed early
2082 my $seen_non_extracted_non_age = 0;
d73e5302 2083
99870f4d
KW
2084 sub run {
2085 # Process the input object $self. This opens and closes the file and
2086 # calls all the handlers for it. Currently, this can only be called
2087 # once per file, as it destroy's the EOF handler
d73e5302 2088
99870f4d
KW
2089 my $self = shift;
2090 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2091
ffe43484 2092 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2093
99870f4d 2094 my $file = $file{$addr};
d73e5302 2095
99870f4d
KW
2096 # Don't process if not expecting this file (because released later
2097 # than this Unicode version), and isn't there. This means if someone
2098 # copies it into an earlier version's directory, we will go ahead and
2099 # process it.
2100 return if $first_released{$addr} gt $v_version && ! -e $file;
2101
2102 # If in debugging mode and this file doesn't have the non-skip
2103 # flag set, and isn't one of the critical files, skip it.
2104 if ($debug_skip
2105 && $first_released{$addr} ne v0
2106 && ! $non_skip{$addr})
2107 {
2108 print "Skipping $file in debugging\n" if $verbosity;
2109 return;
2110 }
2111
2112 # File could be optional
37e2e78e 2113 if ($optional{$addr}) {
99870f4d
KW
2114 return unless -e $file;
2115 my $result = eval $optional{$addr};
2116 if (! defined $result) {
2117 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2118 return;
2119 }
2120 if (! $result) {
2121 if ($verbosity) {
2122 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2123 }
2124 return;
2125 }
2126 }
2127
2128 if (! defined $file || ! -e $file) {
2129
2130 # If the file doesn't exist, see if have internal data for it
2131 # (based on first_released being 0).
2132 if ($first_released{$addr} eq v0) {
2133 $handle{$addr} = 'pretend_is_open';
2134 }
2135 else {
2136 if (! $optional{$addr} # File could be optional
2137 && $v_version ge $first_released{$addr})
2138 {
2139 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2140 }
2141 return;
2142 }
2143 }
2144 else {
2145
37e2e78e
KW
2146 # Here, the file exists. Some platforms may change the case of
2147 # its name
99870f4d 2148 if ($seen_non_extracted_non_age) {
517956bf 2149 if ($file =~ /$EXTRACTED/i) {
99870f4d 2150 Carp::my_carp_bug(join_lines(<<END
99f78760 2151$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2152anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2153have subtle problems
2154END
2155 ));
2156 }
2157 }
2158 elsif ($EXTRACTED_DIR
2159 && $first_released{$addr} ne v0
517956bf
CB
2160 && $file !~ /$EXTRACTED/i
2161 && lc($file) ne 'dage.txt')
99870f4d
KW
2162 {
2163 # We don't set this (by the 'if' above) if we have no
2164 # extracted directory, so if running on an early version,
2165 # this test won't work. Not worth worrying about.
2166 $seen_non_extracted_non_age = 1;
2167 }
2168
2169 # And mark the file as having being processed, and warn if it
2170 # isn't a file we are expecting. As we process the files,
2171 # they are deleted from the hash, so any that remain at the
2172 # end of the program are files that we didn't process.
517956bf
CB
2173 my $fkey = File::Spec->rel2abs($file);
2174 my $expecting = delete $potential_files{$fkey};
2175 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
678f13d5
KW
2176 Carp::my_carp("Was not expecting '$file'.") if
2177 ! $expecting
99870f4d
KW
2178 && ! defined $handle{$addr};
2179
37e2e78e
KW
2180 # Having deleted from expected files, we can quit if not to do
2181 # anything. Don't print progress unless really want verbosity
2182 if ($skip{$addr}) {
2183 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2184 return;
2185 }
2186
99870f4d
KW
2187 # Open the file, converting the slashes used in this program
2188 # into the proper form for the OS
2189 my $file_handle;
2190 if (not open $file_handle, "<", $file) {
2191 Carp::my_carp("Can't open $file. Skipping: $!");
2192 return 0;
2193 }
2194 $handle{$addr} = $file_handle; # Cache the open file handle
2195 }
2196
2197 if ($verbosity >= $PROGRESS) {
2198 if ($progress_message{$addr}) {
2199 print "$progress_message{$addr}\n";
2200 }
2201 else {
2202 # If using a virtual file, say so.
2203 print "Processing ", (-e $file)
2204 ? $file
2205 : "substitute $file",
2206 "\n";
2207 }
2208 }
2209
2210
2211 # Call any special handler for before the file.
2212 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2213
2214 # Then the main handler
2215 &{$handler{$addr}}($self);
2216
2217 # Then any special post-file handler.
2218 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2219
2220 # If any errors have been accumulated, output the counts (as the first
2221 # error message in each class was output when it was encountered).
2222 if ($errors{$addr}) {
2223 my $total = 0;
2224 my $types = 0;
2225 foreach my $error (keys %{$errors{$addr}}) {
2226 $total += $errors{$addr}->{$error};
2227 delete $errors{$addr}->{$error};
2228 $types++;
2229 }
2230 if ($total > 1) {
2231 my $message
2232 = "A total of $total lines had errors in $file. ";
2233
2234 $message .= ($types == 1)
2235 ? '(Only the first one was displayed.)'
2236 : '(Only the first of each type was displayed.)';
2237 Carp::my_carp($message);
2238 }
2239 }
2240
2241 if (@{$missings{$addr}}) {
2242 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2243 }
2244
2245 # If a real file handle, close it.
2246 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2247 ref $handle{$addr};
2248 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2249 # the file, as opposed to undef
2250 return;
2251 }
2252
2253 sub next_line {
2254 # Sets $_ to be the next logical input line, if any. Returns non-zero
2255 # if such a line exists. 'logical' means that any lines that have
2256 # been added via insert_lines() will be returned in $_ before the file
2257 # is read again.
2258
2259 my $self = shift;
2260 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2261
ffe43484 2262 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2263
2264 # Here the file is open (or if the handle is not a ref, is an open
2265 # 'virtual' file). Get the next line; any inserted lines get priority
2266 # over the file itself.
2267 my $adjusted;
2268
2269 LINE:
2270 while (1) { # Loop until find non-comment, non-empty line
2271 #local $to_trace = 1 if main::DEBUG;
2272 my $inserted_ref = shift @{$added_lines{$addr}};
2273 if (defined $inserted_ref) {
2274 ($adjusted, $_) = @{$inserted_ref};
2275 trace $adjusted, $_ if main::DEBUG && $to_trace;
2276 return 1 if $adjusted;
2277 }
2278 else {
2279 last if ! ref $handle{$addr}; # Don't read unless is real file
2280 last if ! defined ($_ = readline $handle{$addr});
2281 }
2282 chomp;
2283 trace $_ if main::DEBUG && $to_trace;
2284
2285 # See if this line is the comment line that defines what property
2286 # value that code points that are not listed in the file should
2287 # have. The format or existence of these lines is not guaranteed
2288 # by Unicode since they are comments, but the documentation says
2289 # that this was added for machine-readability, so probably won't
2290 # change. This works starting in Unicode Version 5.0. They look
2291 # like:
2292 #
2293 # @missing: 0000..10FFFF; Not_Reordered
2294 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2295 # @missing: 0000..10FFFF; ; NaN
2296 #
2297 # Save the line for a later get_missings() call.
2298 if (/$missing_defaults_prefix/) {
2299 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2300 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2301 }
2302 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2303 my @defaults = split /\s* ; \s*/x, $_;
2304
2305 # The first field is the @missing, which ends in a
2306 # semi-colon, so can safely shift.
2307 shift @defaults;
2308
2309 # Some of these lines may have empty field placeholders
2310 # which get in the way. An example is:
2311 # @missing: 0000..10FFFF; ; NaN
2312 # Remove them. Process starting from the top so the
2313 # splice doesn't affect things still to be looked at.
2314 for (my $i = @defaults - 1; $i >= 0; $i--) {
2315 next if $defaults[$i] ne "";
2316 splice @defaults, $i, 1;
2317 }
2318
2319 # What's left should be just the property (maybe) and the
2320 # default. Having only one element means it doesn't have
2321 # the property.
2322 my $default;
2323 my $property;
2324 if (@defaults >= 1) {
2325 if (@defaults == 1) {
2326 $default = $defaults[0];
2327 }
2328 else {
2329 $property = $defaults[0];
2330 $default = $defaults[1];
2331 }
2332 }
2333
2334 if (@defaults < 1
2335 || @defaults > 2
2336 || ($default =~ /^</
2337 && $default !~ /^<code *point>$/i
2338 && $default !~ /^<none>$/i))
2339 {
2340 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2341 }
2342 else {
2343
2344 # If the property is missing from the line, it should
2345 # be the one for the whole file
2346 $property = $property{$addr} if ! defined $property;
2347
2348 # Change <none> to the null string, which is what it
2349 # really means. If the default is the code point
2350 # itself, set it to <code point>, which is what
2351 # Unicode uses (but sometimes they've forgotten the
2352 # space)
2353 if ($default =~ /^<none>$/i) {
2354 $default = "";
2355 }
2356 elsif ($default =~ /^<code *point>$/i) {
2357 $default = $CODE_POINT;
2358 }
2359
2360 # Store them as a sub-arrays with both components.
2361 push @{$missings{$addr}}, [ $default, $property ];
2362 }
2363 }
2364
2365 # There is nothing for the caller to process on this comment
2366 # line.
2367 next;
2368 }
2369
2370 # Remove comments and trailing space, and skip this line if the
2371 # result is empty
2372 s/#.*//;
2373 s/\s+$//;
2374 next if /^$/;
2375
2376 # Call any handlers for this line, and skip further processing of
2377 # the line if the handler sets the line to null.
2378 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2379 &{$sub_ref}($self);
2380 next LINE if /^$/;
2381 }
2382
2383 # Here the line is ok. return success.
2384 return 1;
2385 } # End of looping through lines.
2386
2387 # If there is an EOF handler, call it (only once) and if it generates
2388 # more lines to process go back in the loop to handle them.
2389 if ($eof_handler{$addr}) {
2390 &{$eof_handler{$addr}}($self);
2391 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2392 goto LINE if $added_lines{$addr};
2393 }
2394
2395 # Return failure -- no more lines.
2396 return 0;
2397
2398 }
2399
2400# Not currently used, not fully tested.
2401# sub peek {
2402# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2403# # record. Not callable from an each_line_handler(), nor does it call
2404# # an each_line_handler() on the line.
2405#
2406# my $self = shift;
ffe43484 2407# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2408#
2409# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2410# my ($adjusted, $line) = @{$inserted_ref};
2411# next if $adjusted;
2412#
2413# # Remove comments and trailing space, and return a non-empty
2414# # resulting line
2415# $line =~ s/#.*//;
2416# $line =~ s/\s+$//;
2417# return $line if $line ne "";
2418# }
2419#
2420# return if ! ref $handle{$addr}; # Don't read unless is real file
2421# while (1) { # Loop until find non-comment, non-empty line
2422# local $to_trace = 1 if main::DEBUG;
2423# trace $_ if main::DEBUG && $to_trace;
2424# return if ! defined (my $line = readline $handle{$addr});
2425# chomp $line;
2426# push @{$added_lines{$addr}}, [ 0, $line ];
2427#
2428# $line =~ s/#.*//;
2429# $line =~ s/\s+$//;
2430# return $line if $line ne "";
2431# }
2432#
2433# return;
2434# }
2435
2436
2437 sub insert_lines {
2438 # Lines can be inserted so that it looks like they were in the input
2439 # file at the place it was when this routine is called. See also
2440 # insert_adjusted_lines(). Lines inserted via this routine go through
2441 # any each_line_handler()
2442
2443 my $self = shift;
2444
2445 # Each inserted line is an array, with the first element being 0 to
2446 # indicate that this line hasn't been adjusted, and needs to be
2447 # processed.
f998e60c 2448 no overloading;
051df77b 2449 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2450 return;
2451 }
2452
2453 sub insert_adjusted_lines {
2454 # Lines can be inserted so that it looks like they were in the input
2455 # file at the place it was when this routine is called. See also
2456 # insert_lines(). Lines inserted via this routine are already fully
2457 # adjusted, ready to be processed; each_line_handler()s handlers will
2458 # not be called. This means this is not a completely general
2459 # facility, as only the last each_line_handler on the stack should
2460 # call this. It could be made more general, by passing to each of the
2461 # line_handlers their position on the stack, which they would pass on
2462 # to this routine, and that would replace the boolean first element in
2463 # the anonymous array pushed here, so that the next_line routine could
2464 # use that to call only those handlers whose index is after it on the
2465 # stack. But this is overkill for what is needed now.
2466
2467 my $self = shift;
2468 trace $_[0] if main::DEBUG && $to_trace;
2469
2470 # Each inserted line is an array, with the first element being 1 to
2471 # indicate that this line has been adjusted
f998e60c 2472 no overloading;
051df77b 2473 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2474 return;
2475 }
2476
2477 sub get_missings {
2478 # Returns the stored up @missings lines' values, and clears the list.
2479 # The values are in an array, consisting of the default in the first
2480 # element, and the property in the 2nd. However, since these lines
2481 # can be stacked up, the return is an array of all these arrays.
2482
2483 my $self = shift;
2484 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2485
ffe43484 2486 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2487
2488 # If not accepting a list return, just return the first one.
2489 return shift @{$missings{$addr}} unless wantarray;
2490
2491 my @return = @{$missings{$addr}};
2492 undef @{$missings{$addr}};
2493 return @return;
2494 }
2495
2496 sub _insert_property_into_line {
2497 # Add a property field to $_, if this file requires it.
2498
f998e60c 2499 my $self = shift;
ffe43484 2500 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2501 my $property = $property{$addr};
99870f4d
KW
2502 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2503
2504 $_ =~ s/(;|$)/; $property$1/;
2505 return;
2506 }
2507
2508 sub carp_bad_line {
2509 # Output consistent error messages, using either a generic one, or the
2510 # one given by the optional parameter. To avoid gazillions of the
2511 # same message in case the syntax of a file is way off, this routine
2512 # only outputs the first instance of each message, incrementing a
2513 # count so the totals can be output at the end of the file.
2514
2515 my $self = shift;
2516 my $message = shift;
2517 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2518
ffe43484 2519 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2520
2521 $message = 'Unexpected line' unless $message;
2522
2523 # No trailing punctuation so as to fit with our addenda.
2524 $message =~ s/[.:;,]$//;
2525
2526 # If haven't seen this exact message before, output it now. Otherwise
2527 # increment the count of how many times it has occurred
2528 unless ($errors{$addr}->{$message}) {
2529 Carp::my_carp("$message in '$_' in "
f998e60c 2530 . $file{$addr}
99870f4d
KW
2531 . " at line $.. Skipping this line;");
2532 $errors{$addr}->{$message} = 1;
2533 }
2534 else {
2535 $errors{$addr}->{$message}++;
2536 }
2537
2538 # Clear the line to prevent any further (meaningful) processing of it.
2539 $_ = "";
2540
2541 return;
2542 }
2543} # End closure
2544
2545package Multi_Default;
2546
2547# Certain properties in early versions of Unicode had more than one possible
2548# default for code points missing from the files. In these cases, one
2549# default applies to everything left over after all the others are applied,
2550# and for each of the others, there is a description of which class of code
2551# points applies to it. This object helps implement this by storing the
2552# defaults, and for all but that final default, an eval string that generates
2553# the class that it applies to.
2554
2555
2556{ # Closure
2557
2558 main::setup_package();
2559
2560 my %class_defaults;
2561 # The defaults structure for the classes
2562 main::set_access('class_defaults', \%class_defaults);
2563
2564 my %other_default;
2565 # The default that applies to everything left over.
2566 main::set_access('other_default', \%other_default, 'r');
2567
2568
2569 sub new {
2570 # The constructor is called with default => eval pairs, terminated by
2571 # the left-over default. e.g.
2572 # Multi_Default->new(
2573 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2574 # - 0x200D',
2575 # 'R' => 'some other expression that evaluates to code points',
2576 # .
2577 # .
2578 # .
2579 # 'U'));
2580
2581 my $class = shift;
2582
2583 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2584 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2585
2586 while (@_ > 1) {
2587 my $default = shift;
2588 my $eval = shift;
2589 $class_defaults{$addr}->{$default} = $eval;
2590 }
2591
2592 $other_default{$addr} = shift;
2593
2594 return $self;
2595 }
2596
2597 sub get_next_defaults {
2598 # Iterates and returns the next class of defaults.
2599 my $self = shift;
2600 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2601
ffe43484 2602 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2603
2604 return each %{$class_defaults{$addr}};
2605 }
2606}
2607
2608package Alias;
2609
2610# An alias is one of the names that a table goes by. This class defines them
2611# including some attributes. Everything is currently setup in the
2612# constructor.
2613
2614
2615{ # Closure
2616
2617 main::setup_package();
2618
2619 my %name;
2620 main::set_access('name', \%name, 'r');
2621
2622 my %loose_match;
c12f2655 2623 # Should this name match loosely or not.
99870f4d
KW
2624 main::set_access('loose_match', \%loose_match, 'r');
2625
2626 my %make_pod_entry;
2627 # Some aliases should not get their own entries because they are covered
2628 # by a wild-card, and some we want to discourage use of. Binary
2629 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2630
2631 my %status;
2632 # Aliases have a status, like deprecated, or even suppressed (which means
2633 # they don't appear in documentation). Enum
2634 main::set_access('status', \%status, 'r');
2635
2636 my %externally_ok;
2637 # Similarly, some aliases should not be considered as usable ones for
2638 # external use, such as file names, or we don't want documentation to
2639 # recommend them. Boolean
2640 main::set_access('externally_ok', \%externally_ok, 'r');
2641
2642 sub new {
2643 my $class = shift;
2644
2645 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2646 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2647
2648 $name{$addr} = shift;
2649 $loose_match{$addr} = shift;
2650 $make_pod_entry{$addr} = shift;
2651 $externally_ok{$addr} = shift;
2652 $status{$addr} = shift;
2653
2654 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2655
2656 # Null names are never ok externally
2657 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2658
2659 return $self;
2660 }
2661}
2662
2663package Range;
2664
2665# A range is the basic unit for storing code points, and is described in the
2666# comments at the beginning of the program. Each range has a starting code
2667# point; an ending code point (not less than the starting one); a value
2668# that applies to every code point in between the two end-points, inclusive;
2669# and an enum type that applies to the value. The type is for the user's
2670# convenience, and has no meaning here, except that a non-zero type is
2671# considered to not obey the normal Unicode rules for having standard forms.
2672#
2673# The same structure is used for both map and match tables, even though in the
2674# latter, the value (and hence type) is irrelevant and could be used as a
2675# comment. In map tables, the value is what all the code points in the range
2676# map to. Type 0 values have the standardized version of the value stored as
2677# well, so as to not have to recalculate it a lot.
2678
2679sub trace { return main::trace(@_); }
2680
2681{ # Closure
2682
2683 main::setup_package();
2684
2685 my %start;
2686 main::set_access('start', \%start, 'r', 's');
2687
2688 my %end;
2689 main::set_access('end', \%end, 'r', 's');
2690
2691 my %value;
2692 main::set_access('value', \%value, 'r');
2693
2694 my %type;
2695 main::set_access('type', \%type, 'r');
2696
2697 my %standard_form;
2698 # The value in internal standard form. Defined only if the type is 0.
2699 main::set_access('standard_form', \%standard_form);
2700
2701 # Note that if these fields change, the dump() method should as well
2702
2703 sub new {
2704 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2705 my $class = shift;
2706
2707 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2708 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2709
2710 $start{$addr} = shift;
2711 $end{$addr} = shift;
2712
2713 my %args = @_;
2714
2715 my $value = delete $args{'Value'}; # Can be 0
2716 $value = "" unless defined $value;
2717 $value{$addr} = $value;
2718
2719 $type{$addr} = delete $args{'Type'} || 0;
2720
2721 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2722
2723 if (! $type{$addr}) {
2724 $standard_form{$addr} = main::standardize($value);
2725 }
2726
2727 return $self;
2728 }
2729
2730 use overload
2731 fallback => 0,
2732 qw("") => "_operator_stringify",
2733 "." => \&main::_operator_dot,
2734 ;
2735
2736 sub _operator_stringify {
2737 my $self = shift;
ffe43484 2738 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2739
2740 # Output it like '0041..0065 (value)'
2741 my $return = sprintf("%04X", $start{$addr})
2742 . '..'
2743 . sprintf("%04X", $end{$addr});
2744 my $value = $value{$addr};
2745 my $type = $type{$addr};
2746 $return .= ' (';
2747 $return .= "$value";
2748 $return .= ", Type=$type" if $type != 0;
2749 $return .= ')';
2750
2751 return $return;
2752 }
2753
2754 sub standard_form {
2755 # The standard form is the value itself if the standard form is
2756 # undefined (that is if the value is special)
2757
2758 my $self = shift;
2759 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2760
ffe43484 2761 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2762
2763 return $standard_form{$addr} if defined $standard_form{$addr};
2764 return $value{$addr};
2765 }
2766
2767 sub dump {
2768 # Human, not machine readable. For machine readable, comment out this
2769 # entire routine and let the standard one take effect.
2770 my $self = shift;
2771 my $indent = shift;
2772 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2773
ffe43484 2774 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2775
2776 my $return = $indent
2777 . sprintf("%04X", $start{$addr})
2778 . '..'
2779 . sprintf("%04X", $end{$addr})
2780 . " '$value{$addr}';";
2781 if (! defined $standard_form{$addr}) {
2782 $return .= "(type=$type{$addr})";
2783 }
2784 elsif ($standard_form{$addr} ne $value{$addr}) {
2785 $return .= "(standard '$standard_form{$addr}')";
2786 }
2787 return $return;
2788 }
2789} # End closure
2790
2791package _Range_List_Base;
2792
2793# Base class for range lists. A range list is simply an ordered list of
2794# ranges, so that the ranges with the lowest starting numbers are first in it.
2795#
2796# When a new range is added that is adjacent to an existing range that has the
2797# same value and type, it merges with it to form a larger range.
2798#
2799# Ranges generally do not overlap, except that there can be multiple entries
2800# of single code point ranges. This is because of NameAliases.txt.
2801#
2802# In this program, there is a standard value such that if two different
2803# values, have the same standard value, they are considered equivalent. This
2804# value was chosen so that it gives correct results on Unicode data
2805
2806# There are a number of methods to manipulate range lists, and some operators
2807# are overloaded to handle them.
2808
99870f4d
KW
2809sub trace { return main::trace(@_); }
2810
2811{ # Closure
2812
2813 our $addr;
2814
2815 main::setup_package();
2816
2817 my %ranges;
2818 # The list of ranges
2819 main::set_access('ranges', \%ranges, 'readable_array');
2820
2821 my %max;
2822 # The highest code point in the list. This was originally a method, but
2823 # actual measurements said it was used a lot.
2824 main::set_access('max', \%max, 'r');
2825
2826 my %each_range_iterator;
2827 # Iterator position for each_range()
2828 main::set_access('each_range_iterator', \%each_range_iterator);
2829
2830 my %owner_name_of;
2831 # Name of parent this is attached to, if any. Solely for better error
2832 # messages.
2833 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2834
2835 my %_search_ranges_cache;
2836 # A cache of the previous result from _search_ranges(), for better
2837 # performance
2838 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2839
2840 sub new {
2841 my $class = shift;
2842 my %args = @_;
2843
2844 # Optional initialization data for the range list.
2845 my $initialize = delete $args{'Initialize'};
2846
2847 my $self;
2848
2849 # Use _union() to initialize. _union() returns an object of this
2850 # class, which means that it will call this constructor recursively.
2851 # But it won't have this $initialize parameter so that it won't
2852 # infinitely loop on this.
2853 return _union($class, $initialize, %args) if defined $initialize;
2854
2855 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2856 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2857
2858 # Optional parent object, only for debug info.
2859 $owner_name_of{$addr} = delete $args{'Owner'};
2860 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2861
2862 # Stringify, in case it is an object.
2863 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2864
2865 # This is used only for error messages, and so a colon is added
2866 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2867
2868 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2869
2870 # Max is initialized to a negative value that isn't adjacent to 0,
2871 # for simpler tests
2872 $max{$addr} = -2;
2873
2874 $_search_ranges_cache{$addr} = 0;
2875 $ranges{$addr} = [];
2876
2877 return $self;
2878 }
2879
2880 use overload
2881 fallback => 0,
2882 qw("") => "_operator_stringify",
2883 "." => \&main::_operator_dot,
2884 ;
2885
2886 sub _operator_stringify {
2887 my $self = shift;
ffe43484 2888 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2889
2890 return "Range_List attached to '$owner_name_of{$addr}'"
2891 if $owner_name_of{$addr};
2892 return "anonymous Range_List " . \$self;
2893 }
2894
2895 sub _union {
2896 # Returns the union of the input code points. It can be called as
2897 # either a constructor or a method. If called as a method, the result
2898 # will be a new() instance of the calling object, containing the union
2899 # of that object with the other parameter's code points; if called as
2900 # a constructor, the first parameter gives the class the new object
2901 # should be, and the second parameter gives the code points to go into
2902 # it.
2903 # In either case, there are two parameters looked at by this routine;
2904 # any additional parameters are passed to the new() constructor.
2905 #
2906 # The code points can come in the form of some object that contains
2907 # ranges, and has a conventionally named method to access them; or
2908 # they can be an array of individual code points (as integers); or
2909 # just a single code point.
2910 #
2911 # If they are ranges, this routine doesn't make any effort to preserve
2912 # the range values of one input over the other. Therefore this base
2913 # class should not allow _union to be called from other than
2914 # initialization code, so as to prevent two tables from being added
2915 # together where the range values matter. The general form of this
2916 # routine therefore belongs in a derived class, but it was moved here
2917 # to avoid duplication of code. The failure to overload this in this
2918 # class keeps it safe.
2919 #
2920
2921 my $self;
2922 my @args; # Arguments to pass to the constructor
2923
2924 my $class = shift;
2925
2926 # If a method call, will start the union with the object itself, and
2927 # the class of the new object will be the same as self.
2928 if (ref $class) {
2929 $self = $class;
2930 $class = ref $self;
2931 push @args, $self;
2932 }
2933
2934 # Add the other required parameter.
2935 push @args, shift;
2936 # Rest of parameters are passed on to the constructor
2937
2938 # Accumulate all records from both lists.
2939 my @records;
2940 for my $arg (@args) {
2941 #local $to_trace = 0 if main::DEBUG;
2942 trace "argument = $arg" if main::DEBUG && $to_trace;
2943 if (! defined $arg) {
2944 my $message = "";
2945 if (defined $self) {
f998e60c 2946 no overloading;
051df77b 2947 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2948 }
2949 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2950 return;
2951 }
2952 $arg = [ $arg ] if ! ref $arg;
2953 my $type = ref $arg;
2954 if ($type eq 'ARRAY') {
2955 foreach my $element (@$arg) {
2956 push @records, Range->new($element, $element);
2957 }
2958 }
2959 elsif ($arg->isa('Range')) {
2960 push @records, $arg;
2961 }
2962 elsif ($arg->can('ranges')) {
2963 push @records, $arg->ranges;
2964 }
2965 else {
2966 my $message = "";
2967 if (defined $self) {
f998e60c 2968 no overloading;
051df77b 2969 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2970 }
2971 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2972 return;
2973 }
2974 }
2975
2976 # Sort with the range containing the lowest ordinal first, but if
2977 # two ranges start at the same code point, sort with the bigger range
2978 # of the two first, because it takes fewer cycles.
2979 @records = sort { ($a->start <=> $b->start)
2980 or
2981 # if b is shorter than a, b->end will be
2982 # less than a->end, and we want to select
2983 # a, so want to return -1
2984 ($b->end <=> $a->end)
2985 } @records;
2986
2987 my $new = $class->new(@_);
2988
2989 # Fold in records so long as they add new information.
2990 for my $set (@records) {
2991 my $start = $set->start;
2992 my $end = $set->end;
2993 my $value = $set->value;
2994 if ($start > $new->max) {
2995 $new->_add_delete('+', $start, $end, $value);
2996 }
2997 elsif ($end > $new->max) {
2998 $new->_add_delete('+', $new->max +1, $end, $value);
2999 }
3000 }
3001
3002 return $new;
3003 }
3004
3005 sub range_count { # Return the number of ranges in the range list
3006 my $self = shift;
3007 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3008
f998e60c 3009 no overloading;
051df77b 3010 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
3011 }
3012
3013 sub min {
3014 # Returns the minimum code point currently in the range list, or if
3015 # the range list is empty, 2 beyond the max possible. This is a
3016 # method because used so rarely, that not worth saving between calls,
3017 # and having to worry about changing it as ranges are added and
3018 # deleted.
3019
3020 my $self = shift;
3021 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3022
ffe43484 3023 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3024
3025 # If the range list is empty, return a large value that isn't adjacent
3026 # to any that could be in the range list, for simpler tests
3027 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3028 return $ranges{$addr}->[0]->start;
3029 }
3030
3031 sub contains {
3032 # Boolean: Is argument in the range list? If so returns $i such that:
3033 # range[$i]->end < $codepoint <= range[$i+1]->end
3034 # which is one beyond what you want; this is so that the 0th range
3035 # doesn't return false
3036 my $self = shift;
3037 my $codepoint = shift;
3038 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3039
99870f4d
KW
3040 my $i = $self->_search_ranges($codepoint);
3041 return 0 unless defined $i;
3042
3043 # The search returns $i, such that
3044 # range[$i-1]->end < $codepoint <= range[$i]->end
3045 # So is in the table if and only iff it is at least the start position
3046 # of range $i.
f998e60c 3047 no overloading;
051df77b 3048 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
3049 return $i + 1;
3050 }
3051
2f7a8815
KW
3052 sub containing_range {
3053 # Returns the range object that contains the code point, undef if none
3054
3055 my $self = shift;
3056 my $codepoint = shift;
3057 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3058
3059 my $i = $self->contains($codepoint);
3060 return unless $i;
3061
3062 # contains() returns 1 beyond where we should look
3063 no overloading;
3064 return $ranges{pack 'J', $self}->[$i-1];
3065 }
3066
99870f4d
KW
3067 sub value_of {
3068 # Returns the value associated with the code point, undef if none
3069
3070 my $self = shift;
3071 my $codepoint = shift;
3072 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3073
d69c231b
KW
3074 my $range = $self->containing_range($codepoint);
3075 return unless defined $range;
99870f4d 3076
d69c231b 3077 return $range->value;
99870f4d
KW
3078 }
3079
0a9dbafc
KW
3080 sub type_of {
3081 # Returns the type of the range containing the code point, undef if
3082 # the code point is not in the table
3083
3084 my $self = shift;
3085 my $codepoint = shift;
3086 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3087
3088 my $range = $self->containing_range($codepoint);
3089 return unless defined $range;
3090
3091 return $range->type;
3092 }
3093
99870f4d
KW
3094 sub _search_ranges {
3095 # Find the range in the list which contains a code point, or where it
3096 # should go if were to add it. That is, it returns $i, such that:
3097 # range[$i-1]->end < $codepoint <= range[$i]->end
3098 # Returns undef if no such $i is possible (e.g. at end of table), or
3099 # if there is an error.
3100
3101 my $self = shift;
3102 my $code_point = shift;
3103 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3104
ffe43484 3105 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3106
3107 return if $code_point > $max{$addr};
3108 my $r = $ranges{$addr}; # The current list of ranges
3109 my $range_list_size = scalar @$r;
3110 my $i;
3111
3112 use integer; # want integer division
3113
3114 # Use the cached result as the starting guess for this one, because,
3115 # an experiment on 5.1 showed that 90% of the time the cache was the
3116 # same as the result on the next call (and 7% it was one less).
3117 $i = $_search_ranges_cache{$addr};
3118 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3119 # from an intervening deletion
3120 #local $to_trace = 1 if main::DEBUG;
3121 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3122 return $i if $code_point <= $r->[$i]->end
3123 && ($i == 0 || $r->[$i-1]->end < $code_point);
3124
3125 # Here the cache doesn't yield the correct $i. Try adding 1.
3126 if ($i < $range_list_size - 1
3127 && $r->[$i]->end < $code_point &&
3128 $code_point <= $r->[$i+1]->end)
3129 {
3130 $i++;
3131 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3132 $_search_ranges_cache{$addr} = $i;
3133 return $i;
3134 }
3135
3136 # Here, adding 1 also didn't work. We do a binary search to
3137 # find the correct position, starting with current $i
3138 my $lower = 0;
3139 my $upper = $range_list_size - 1;
3140 while (1) {
3141 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3142
3143 if ($code_point <= $r->[$i]->end) {
3144
3145 # Here we have met the upper constraint. We can quit if we
3146 # also meet the lower one.
3147 last if $i == 0 || $r->[$i-1]->end < $code_point;
3148
3149 $upper = $i; # Still too high.
3150
3151 }
3152 else {
3153
3154 # Here, $r[$i]->end < $code_point, so look higher up.
3155 $lower = $i;
3156 }
3157
3158 # Split search domain in half to try again.
3159 my $temp = ($upper + $lower) / 2;
3160
3161 # No point in continuing unless $i changes for next time
3162 # in the loop.
3163 if ($temp == $i) {
3164
3165 # We can't reach the highest element because of the averaging.
3166 # So if one below the upper edge, force it there and try one
3167 # more time.
3168 if ($i == $range_list_size - 2) {
3169
3170 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3171 $i = $range_list_size - 1;
3172
3173 # Change $lower as well so if fails next time through,
3174 # taking the average will yield the same $i, and we will
3175 # quit with the error message just below.
3176 $lower = $i;
3177 next;
3178 }
3179 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3180 return;
3181 }
3182 $i = $temp;
3183 } # End of while loop
3184
3185 if (main::DEBUG && $to_trace) {
3186 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3187 trace "i= [ $i ]", $r->[$i];
3188 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3189 }
3190
3191 # Here we have found the offset. Cache it as a starting point for the
3192 # next call.
3193 $_search_ranges_cache{$addr} = $i;
3194 return $i;
3195 }
3196
3197 sub _add_delete {
3198 # Add, replace or delete ranges to or from a list. The $type
3199 # parameter gives which:
3200 # '+' => insert or replace a range, returning a list of any changed
3201 # ranges.
3202 # '-' => delete a range, returning a list of any deleted ranges.
3203 #
3204 # The next three parameters give respectively the start, end, and
3205 # value associated with the range. 'value' should be null unless the
3206 # operation is '+';
3207 #
3208 # The range list is kept sorted so that the range with the lowest
3209 # starting position is first in the list, and generally, adjacent
c1739a4a 3210 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3211 # exceptions below).
3212 #
c1739a4a 3213 # There are more parameters; all are key => value pairs:
99870f4d
KW
3214 # Type gives the type of the value. It is only valid for '+'.
3215 # All ranges have types; if this parameter is omitted, 0 is
3216 # assumed. Ranges with type 0 are assumed to obey the
3217 # Unicode rules for casing, etc; ranges with other types are
3218 # not. Otherwise, the type is arbitrary, for the caller's
3219 # convenience, and looked at only by this routine to keep
3220 # adjacent ranges of different types from being merged into
3221 # a single larger range, and when Replace =>
3222 # $IF_NOT_EQUIVALENT is specified (see just below).
3223 # Replace determines what to do if the range list already contains
3224 # ranges which coincide with all or portions of the input
3225 # range. It is only valid for '+':
3226 # => $NO means that the new value is not to replace
3227 # any existing ones, but any empty gaps of the
3228 # range list coinciding with the input range
3229 # will be filled in with the new value.
3230 # => $UNCONDITIONALLY means to replace the existing values with
3231 # this one unconditionally. However, if the
3232 # new and old values are identical, the
3233 # replacement is skipped to save cycles
3234 # => $IF_NOT_EQUIVALENT means to replace the existing values
3235 # with this one if they are not equivalent.
3236 # Ranges are equivalent if their types are the
c1739a4a 3237 # same, and they are the same string; or if
99870f4d
KW
3238 # both are type 0 ranges, if their Unicode
3239 # standard forms are identical. In this last
3240 # case, the routine chooses the more "modern"
3241 # one to use. This is because some of the
3242 # older files are formatted with values that
3243 # are, for example, ALL CAPs, whereas the
3244 # derived files have a more modern style,
3245 # which looks better. By looking for this
3246 # style when the pre-existing and replacement
3247 # standard forms are the same, we can move to
3248 # the modern style
3249 # => $MULTIPLE means that if this range duplicates an
3250 # existing one, but has a different value,
3251 # don't replace the existing one, but insert
3252 # this, one so that the same range can occur
53d84487
KW
3253 # multiple times. They are stored LIFO, so
3254 # that the final one inserted is the first one
3255 # returned in an ordered search of the table.
99870f4d
KW
3256 # => anything else is the same as => $IF_NOT_EQUIVALENT
3257 #
c1739a4a
KW
3258 # "same value" means identical for non-type-0 ranges, and it means
3259 # having the same standard forms for type-0 ranges.
99870f4d
KW
3260
3261 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3262
3263 my $self = shift;
3264 my $operation = shift; # '+' for add/replace; '-' for delete;
3265 my $start = shift;
3266 my $end = shift;
3267 my $value = shift;
3268
3269 my %args = @_;
3270
3271 $value = "" if not defined $value; # warning: $value can be "0"
3272
3273 my $replace = delete $args{'Replace'};
3274 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3275
3276 my $type = delete $args{'Type'};
3277 $type = 0 unless defined $type;
3278
3279 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3280
ffe43484 3281 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3282
3283 if ($operation ne '+' && $operation ne '-') {
3284 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3285 return;
3286 }
3287 unless (defined $start && defined $end) {
3288 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3289 return;
3290 }
3291 unless ($end >= $start) {
3292 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3293 return;
3294 }
3295 #local $to_trace = 1 if main::DEBUG;
3296
3297 if ($operation eq '-') {
3298 if ($replace != $IF_NOT_EQUIVALENT) {
3299 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3300 $replace = $IF_NOT_EQUIVALENT;
3301 }
3302 if ($type) {
3303 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3304 $type = 0;
3305 }
3306 if ($value ne "") {
3307 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3308 $value = "";
3309 }
3310 }
3311
3312 my $r = $ranges{$addr}; # The current list of ranges
3313 my $range_list_size = scalar @$r; # And its size
3314 my $max = $max{$addr}; # The current high code point in
3315 # the list of ranges
3316
3317 # Do a special case requiring fewer machine cycles when the new range
3318 # starts after the current highest point. The Unicode input data is
3319 # structured so this is common.
3320 if ($start > $max) {
3321
3322 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3323 return if $operation eq '-'; # Deleting a non-existing range is a
3324 # no-op
3325
3326 # If the new range doesn't logically extend the current final one
3327 # in the range list, create a new range at the end of the range
3328 # list. (max cleverly is initialized to a negative number not
3329 # adjacent to 0 if the range list is empty, so even adding a range
3330 # to an empty range list starting at 0 will have this 'if'
3331 # succeed.)
3332 if ($start > $max + 1 # non-adjacent means can't extend.
3333 || @{$r}[-1]->value ne $value # values differ, can't extend.
3334 || @{$r}[-1]->type != $type # types differ, can't extend.
3335 ) {
3336 push @$r, Range->new($start, $end,
3337 Value => $value,
3338 Type => $type);
3339 }
3340 else {
3341
3342 # Here, the new range starts just after the current highest in
3343 # the range list, and they have the same type and value.
3344 # Extend the current range to incorporate the new one.
3345 @{$r}[-1]->set_end($end);
3346 }
3347
3348 # This becomes the new maximum.
3349 $max{$addr} = $end;
3350
3351 return;
3352 }
3353 #local $to_trace = 0 if main::DEBUG;
3354
3355 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3356
3357 # Here, the input range isn't after the whole rest of the range list.
3358 # Most likely 'splice' will be needed. The rest of the routine finds
3359 # the needed splice parameters, and if necessary, does the splice.
3360 # First, find the offset parameter needed by the splice function for
3361 # the input range. Note that the input range may span multiple
3362 # existing ones, but we'll worry about that later. For now, just find
3363 # the beginning. If the input range is to be inserted starting in a
3364 # position not currently in the range list, it must (obviously) come
3365 # just after the range below it, and just before the range above it.
3366 # Slightly less obviously, it will occupy the position currently
3367 # occupied by the range that is to come after it. More formally, we
3368 # are looking for the position, $i, in the array of ranges, such that:
3369 #
3370 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3371 #
3372 # (The ordered relationships within existing ranges are also shown in
3373 # the equation above). However, if the start of the input range is
3374 # within an existing range, the splice offset should point to that
3375 # existing range's position in the list; that is $i satisfies a
3376 # somewhat different equation, namely:
3377 #
3378 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3379 #
3380 # More briefly, $start can come before or after r[$i]->start, and at
3381 # this point, we don't know which it will be. However, these
3382 # two equations share these constraints:
3383 #
3384 # r[$i-1]->end < $start <= r[$i]->end
3385 #
3386 # And that is good enough to find $i.
3387
3388 my $i = $self->_search_ranges($start);
3389 if (! defined $i) {
3390 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3391 return;
3392 }
3393
3394 # The search function returns $i such that:
3395 #
3396 # r[$i-1]->end < $start <= r[$i]->end
3397 #
3398 # That means that $i points to the first range in the range list
3399 # that could possibly be affected by this operation. We still don't
3400 # know if the start of the input range is within r[$i], or if it
3401 # points to empty space between r[$i-1] and r[$i].
3402 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3403
3404 # Special case the insertion of data that is not to replace any
3405 # existing data.
3406 if ($replace == $NO) { # If $NO, has to be operation '+'
3407 #local $to_trace = 1 if main::DEBUG;
3408 trace "Doesn't replace" if main::DEBUG && $to_trace;
3409
3410 # Here, the new range is to take effect only on those code points
3411 # that aren't already in an existing range. This can be done by
3412 # looking through the existing range list and finding the gaps in
3413 # the ranges that this new range affects, and then calling this
3414 # function recursively on each of those gaps, leaving untouched
3415 # anything already in the list. Gather up a list of the changed
3416 # gaps first so that changes to the internal state as new ranges
3417 # are added won't be a problem.
3418 my @gap_list;
3419
3420 # First, if the starting point of the input range is outside an
3421 # existing one, there is a gap from there to the beginning of the
3422 # existing range -- add a span to fill the part that this new
3423 # range occupies
3424 if ($start < $r->[$i]->start) {
3425 push @gap_list, Range->new($start,
3426 main::min($end,
3427 $r->[$i]->start - 1),
3428 Type => $type);
3429 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3430 }
3431
3432 # Then look through the range list for other gaps until we reach
3433 # the highest range affected by the input one.
3434 my $j;
3435 for ($j = $i+1; $j < $range_list_size; $j++) {
3436 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3437 last if $end < $r->[$j]->start;
3438
3439 # If there is a gap between when this range starts and the
3440 # previous one ends, add a span to fill it. Note that just
3441 # because there are two ranges doesn't mean there is a
3442 # non-zero gap between them. It could be that they have
3443 # different values or types
3444 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3445 push @gap_list,
3446 Range->new($r->[$j-1]->end + 1,
3447 $r->[$j]->start - 1,
3448 Type => $type);
3449 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3450 }
3451 }
3452
3453 # Here, we have either found an existing range in the range list,
3454 # beyond the area affected by the input one, or we fell off the
3455 # end of the loop because the input range affects the whole rest
3456 # of the range list. In either case, $j is 1 higher than the
3457 # highest affected range. If $j == $i, it means that there are no
3458 # affected ranges, that the entire insertion is in the gap between
3459 # r[$i-1], and r[$i], which we already have taken care of before
3460 # the loop.
3461 # On the other hand, if there are affected ranges, it might be
3462 # that there is a gap that needs filling after the final such
3463 # range to the end of the input range
3464 if ($r->[$j-1]->end < $end) {
3465 push @gap_list, Range->new(main::max($start,
3466 $r->[$j-1]->end + 1),
3467 $end,
3468 Type => $type);
3469 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3470 }
3471
3472 # Call recursively to fill in all the gaps.
3473 foreach my $gap (@gap_list) {
3474 $self->_add_delete($operation,
3475 $gap->start,
3476 $gap->end,
3477 $value,
3478 Type => $type);
3479 }
3480
3481 return;
3482 }
3483
53d84487
KW
3484 # Here, we have taken care of the case where $replace is $NO.
3485 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3486 # If inserting a multiple record, this is where it goes, before the
3487 # first (if any) existing one. This implies an insertion, and no
3488 # change to any existing ranges. Note that $i can be -1 if this new
3489 # range doesn't actually duplicate any existing, and comes at the
3490 # beginning of the list.
3491 if ($replace == $MULTIPLE) {
3492
3493 if ($start != $end) {
3494 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3495 return;
3496 }
3497
3498 # Don't add an exact duplicate, as it isn't really a multiple
3499 if ($end >= $r->[$i]->start) {
1f6798c4
KW
3500 my $existing_value = $r->[$i]->value;
3501 my $existing_type = $r->[$i]->type;
3502 return if $value eq $existing_value && $type eq $existing_type;
3503
3504 # If the multiple value is part of an existing range, we want
3505 # to split up that range, so that only the single code point
3506 # is affected. To do this, we first call ourselves
3507 # recursively to delete that code point from the table, having
3508 # preserved its current data above. Then we call ourselves
3509 # recursively again to add the new multiple, which we know by
3510 # the test just above is different than the current code
3511 # point's value, so it will become a range containing a single
3512 # code point: just itself. Finally, we add back in the
3513 # pre-existing code point, which will again be a single code
3514 # point range. Because 'i' likely will have changed as a
3515 # result of these operations, we can't just continue on, but
3516 # do this operation recursively as well.
53d84487 3517 if ($r->[$i]->start != $r->[$i]->end) {
1f6798c4
KW
3518 $self->_add_delete('-', $start, $end, "");
3519 $self->_add_delete('+', $start, $end, $value, Type => $type);
3520 return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
53d84487 3521 }
53d84487
KW
3522 }
3523
3524 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3525 my @return = splice @$r,
3526 $i,
3527 0,
3528 Range->new($start,
3529 $end,
3530 Value => $value,
3531 Type => $type);
3532 if (main::DEBUG && $to_trace) {
3533 trace "After splice:";
3534 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3535 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3536 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3537 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3538 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3539 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3540 }
3541 return @return;
3542 }
3543
3544 # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
3545 # delete, insert, and replace either unconditionally or if not
3546 # equivalent. $i still points to the first potential affected range.
3547 # Now find the highest range affected, which will determine the length
3548 # parameter to splice. (The input range can span multiple existing
3549 # ones.) If this isn't a deletion, while we are looking through the
3550 # range list, see also if this is a replacement rather than a clean
3551 # insertion; that is if it will change the values of at least one
3552 # existing range. Start off assuming it is an insert, until find it
3553 # isn't.
3554 my $clean_insert = $operation eq '+';
99870f4d
KW
3555 my $j; # This will point to the highest affected range
3556
3557 # For non-zero types, the standard form is the value itself;
3558 my $standard_form = ($type) ? $value : main::standardize($value);
3559
3560 for ($j = $i; $j < $range_list_size; $j++) {
3561 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3562
3563 # If find a range that it doesn't overlap into, we can stop
3564 # searching
3565 last if $end < $r->[$j]->start;
3566
969a34cc
KW
3567 # Here, overlaps the range at $j. If the values don't match,
3568 # and so far we think this is a clean insertion, it becomes a
3569 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3570 if ($clean_insert) {
99870f4d 3571 if ($r->[$j]->standard_form ne $standard_form) {
969a34cc 3572 $clean_insert = 0;
56343c78
KW
3573 if ($replace == $CROAK) {
3574 main::croak("The range to add "
3575 . sprintf("%04X", $start)
3576 . '-'
3577 . sprintf("%04X", $end)
3578 . " with value '$value' overlaps an existing range $r->[$j]");
3579 }
99870f4d
KW
3580 }
3581 else {
3582
3583 # Here, the two values are essentially the same. If the
3584 # two are actually identical, replacing wouldn't change
3585 # anything so skip it.
3586 my $pre_existing = $r->[$j]->value;
3587 if ($pre_existing ne $value) {
3588
3589 # Here the new and old standardized values are the
3590 # same, but the non-standardized values aren't. If
3591 # replacing unconditionally, then replace
3592 if( $replace == $UNCONDITIONALLY) {
969a34cc 3593 $clean_insert = 0;
99870f4d
KW
3594 }
3595 else {
3596
3597 # Here, are replacing conditionally. Decide to
3598 # replace or not based on which appears to look
3599 # the "nicest". If one is mixed case and the
3600 # other isn't, choose the mixed case one.
3601 my $new_mixed = $value =~ /[A-Z]/
3602 && $value =~ /[a-z]/;
3603 my $old_mixed = $pre_existing =~ /[A-Z]/
3604 && $pre_existing =~ /[a-z]/;
3605
3606 if ($old_mixed != $new_mixed) {
969a34cc 3607 $clean_insert = 0 if $new_mixed;
99870f4d 3608 if (main::DEBUG && $to_trace) {
969a34cc
KW
3609 if ($clean_insert) {
3610 trace "Retaining $pre_existing over $value";
99870f4d
KW
3611 }
3612 else {
969a34cc 3613 trace "Replacing $pre_existing with $value";
99870f4d
KW
3614 }
3615 }
3616 }
3617 else {
3618
3619 # Here casing wasn't different between the two.
3620 # If one has hyphens or underscores and the
3621 # other doesn't, choose the one with the
3622 # punctuation.
3623 my $new_punct = $value =~ /[-_]/;
3624 my $old_punct = $pre_existing =~ /[-_]/;
3625
3626 if ($old_punct != $new_punct) {
969a34cc 3627 $clean_insert = 0 if $new_punct;
99870f4d 3628 if (main::DEBUG && $to_trace) {
969a34cc
KW
3629 if ($clean_insert) {
3630 trace "Retaining $pre_existing over $value";
99870f4d
KW
3631 }
3632 else {
969a34cc 3633 trace "Replacing $pre_existing with $value";
99870f4d
KW
3634 }
3635 }
3636 } # else existing one is just as "good";
3637 # retain it to save cycles.
3638 }
3639 }
3640 }
3641 }
3642 }
3643 } # End of loop looking for highest affected range.
3644
3645 # Here, $j points to one beyond the highest range that this insertion
3646 # affects (hence to beyond the range list if that range is the final
3647 # one in the range list).
3648
3649 # The splice length is all the affected ranges. Get it before
3650 # subtracting, for efficiency, so we don't have to later add 1.
3651 my $length = $j - $i;
3652
3653 $j--; # $j now points to the highest affected range.
3654 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3655
99870f4d
KW
3656 # Here, have taken care of $NO and $MULTIPLE replaces.
3657 # $j points to the highest affected range. But it can be < $i or even
3658 # -1. These happen only if the insertion is entirely in the gap
3659 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3660 # above exited first time through with $end < $r->[$i]->start. (And
3661 # then we subtracted one from j) This implies also that $start <
3662 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3663 # $start, so the entire input range is in the gap.
3664 if ($j < $i) {
3665
3666 # Here the entire input range is in the gap before $i.
3667
3668 if (main::DEBUG && $to_trace) {
3669 if ($i) {
3670 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3671 }
3672 else {
3673 trace "Entire range is before $r->[$i]";
3674 }
3675 }
3676 return if $operation ne '+'; # Deletion of a non-existent range is
3677 # a no-op
3678 }
3679 else {
3680
969a34cc
KW
3681 # Here part of the input range is not in the gap before $i. Thus,
3682 # there is at least one affected one, and $j points to the highest
3683 # such one.
99870f4d
KW
3684
3685 # At this point, here is the situation:
3686 # This is not an insertion of a multiple, nor of tentative ($NO)
3687 # data.
3688 # $i points to the first element in the current range list that
3689 # may be affected by this operation. In fact, we know
3690 # that the range at $i is affected because we are in
3691 # the else branch of this 'if'
3692 # $j points to the highest affected range.
3693 # In other words,
3694 # r[$i-1]->end < $start <= r[$i]->end
3695 # And:
3696 # r[$i-1]->end < $start <= $end <= r[$j]->end
3697 #
3698 # Also:
969a34cc
KW
3699 # $clean_insert is a boolean which is set true if and only if
3700 # this is a "clean insertion", i.e., not a change nor a
3701 # deletion (multiple was handled above).
99870f4d
KW
3702
3703 # We now have enough information to decide if this call is a no-op
969a34cc
KW
3704 # or not. It is a no-op if this is an insertion of already
3705 # existing data.
99870f4d 3706
969a34cc 3707 if (main::DEBUG && $to_trace && $clean_insert
99870f4d
KW
3708 && $i == $j
3709 && $start >= $r->[$i]->start)
3710 {
3711 trace "no-op";
3712 }
969a34cc 3713 return if $clean_insert
99870f4d
KW
3714 && $i == $j # more than one affected range => not no-op
3715
3716 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3717 # Further, $start and/or $end is >= r[$i]->start
3718 # The test below hence guarantees that
3719 # r[$i]->start < $start <= $end <= r[$i]->end
3720 # This means the input range is contained entirely in
3721 # the one at $i, so is a no-op
3722 && $start >= $r->[$i]->start;
3723 }
3724
3725 # Here, we know that some action will have to be taken. We have
3726 # calculated the offset and length (though adjustments may be needed)
3727 # for the splice. Now start constructing the replacement list.
3728 my @replacement;
3729 my $splice_start = $i;
3730
3731 my $extends_below;
3732 my $extends_above;
3733
3734 # See if should extend any adjacent ranges.
3735 if ($operation eq '-') { # Don't extend deletions
3736 $extends_below = $extends_above = 0;
3737 }
3738 else { # Here, should extend any adjacent ranges. See if there are
3739 # any.
3740 $extends_below = ($i > 0
3741 # can't extend unless adjacent
3742 && $r->[$i-1]->end == $start -1
3743 # can't extend unless are same standard value
3744 && $r->[$i-1]->standard_form eq $standard_form
3745 # can't extend unless share type
3746 && $r->[$i-1]->type == $type);
3747 $extends_above = ($j+1 < $range_list_size
3748 && $r->[$j+1]->start == $end +1
3749 && $r->[$j+1]->standard_form eq $standard_form
23822bda 3750 && $r->[$j+1]->type == $type);
99870f4d
KW
3751 }
3752 if ($extends_below && $extends_above) { # Adds to both
3753 $splice_start--; # start replace at element below
3754 $length += 2; # will replace on both sides
3755 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3756
3757 # The result will fill in any gap, replacing both sides, and
3758 # create one large range.
3759 @replacement = Range->new($r->[$i-1]->start,
3760 $r->[$j+1]->end,
3761 Value => $value,
3762 Type => $type);
3763 }
3764 else {
3765
3766 # Here we know that the result won't just be the conglomeration of
3767 # a new range with both its adjacent neighbors. But it could
3768 # extend one of them.
3769
3770 if ($extends_below) {
3771
3772 # Here the new element adds to the one below, but not to the
3773 # one above. If inserting, and only to that one range, can
3774 # just change its ending to include the new one.
969a34cc 3775 if ($length == 0 && $clean_insert) {
99870f4d
KW
3776 $r->[$i-1]->set_end($end);
3777 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3778 return;
3779 }
3780 else {
3781 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3782 $splice_start--; # start replace at element below
3783 $length++; # will replace the element below
3784 $start = $r->[$i-1]->start;
3785 }
3786 }
3787 elsif ($extends_above) {
3788
3789 # Here the new element adds to the one above, but not below.
3790 # Mirror the code above
969a34cc 3791 if ($length == 0 && $clean_insert) {
99870f4d
KW
3792 $r->[$j+1]->set_start($start);
3793 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3794 return;
3795 }
3796 else {
3797 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3798 $length++; # will replace the element above
3799 $end = $r->[$j+1]->end;
3800 }
3801 }
3802
3803 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3804
3805 # Finally, here we know there will have to be a splice.
3806 # If the change or delete affects only the highest portion of the
3807 # first affected range, the range will have to be split. The
3808 # splice will remove the whole range, but will replace it by a new
3809 # range containing just the unaffected part. So, in this case,
3810 # add to the replacement list just this unaffected portion.
3811 if (! $extends_below
3812 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3813 {
3814 push @replacement,
3815 Range->new($r->[$i]->start,
3816 $start - 1,
3817 Value => $r->[$i]->value,
3818 Type => $r->[$i]->type);
3819 }
3820
3821 # In the case of an insert or change, but not a delete, we have to
3822 # put in the new stuff; this comes next.
3823 if ($operation eq '+') {
3824 push @replacement, Range->new($start,
3825 $end,
3826 Value => $value,
3827 Type => $type);
3828 }
3829
3830 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3831 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3832
3833 # And finally, if we're changing or deleting only a portion of the
3834 # highest affected range, it must be split, as the lowest one was.
3835 if (! $extends_above
3836 && $j >= 0 # Remember that j can be -1 if before first
3837 # current element
3838 && $end >= $r->[$j]->start
3839 && $end < $r->[$j]->end)
3840 {
3841 push @replacement,
3842 Range->new($end + 1,
3843 $r->[$j]->end,
3844 Value => $r->[$j]->value,
3845 Type => $r->[$j]->type);
3846 }
3847 }
3848
3849 # And do the splice, as calculated above
3850 if (main::DEBUG && $to_trace) {
3851 trace "replacing $length element(s) at $i with ";
3852 foreach my $replacement (@replacement) {
3853 trace " $replacement";
3854 }
3855 trace "Before splice:";
3856 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3857 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3858 trace "i =[", $i, "]", $r->[$i];
3859 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3860 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3861 }
3862
3863 my @return = splice @$r, $splice_start, $length, @replacement;
3864
3865 if (main::DEBUG && $to_trace) {
3866 trace "After splice:";
3867 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3868 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3869 trace "i =[", $i, "]", $r->[$i];
3870 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3871 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
e6451557 3872 trace "removed ", @return if @return;
99870f4d
KW
3873 }
3874
3875 # An actual deletion could have changed the maximum in the list.
3876 # There was no deletion if the splice didn't return something, but
3877 # otherwise recalculate it. This is done too rarely to worry about
3878 # performance.
3879 if ($operation eq '-' && @return) {
3880 $max{$addr} = $r->[-1]->end;
3881 }
3882 return @return;
3883 }
3884
3885 sub reset_each_range { # reset the iterator for each_range();
3886 my $self = shift;
3887 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3888
f998e60c 3889 no overloading;
051df77b 3890 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
3891 return;
3892 }
3893
3894 sub each_range {
3895 # Iterate over each range in a range list. Results are undefined if
3896 # the range list is changed during the iteration.
3897
3898 my $self = shift;
3899 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3900
ffe43484 3901 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3902
3903 return if $self->is_empty;
3904
3905 $each_range_iterator{$addr} = -1
3906 if ! defined $each_range_iterator{$addr};
3907 $each_range_iterator{$addr}++;
3908 return $ranges{$addr}->[$each_range_iterator{$addr}]
3909 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3910 undef $each_range_iterator{$addr};
3911 return;
3912 }
3913
3914 sub count { # Returns count of code points in range list
3915 my $self = shift;
3916 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3917
ffe43484 3918 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3919
3920 my $count = 0;
3921 foreach my $range (@{$ranges{$addr}}) {
3922 $count += $range->end - $range->start + 1;
3923 }
3924 return $count;
3925 }
3926
3927 sub delete_range { # Delete a range
3928 my $self = shift;
3929 my $start = shift;
3930 my $end = shift;
3931
3932 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3933
3934 return $self->_add_delete('-', $start, $end, "");
3935 }
3936
3937 sub is_empty { # Returns boolean as to if a range list is empty
3938 my $self = shift;
3939 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3940
f998e60c 3941 no overloading;
051df77b 3942 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
3943 }
3944
3945 sub hash {
3946 # Quickly returns a scalar suitable for separating tables into
3947 # buckets, i.e. it is a hash function of the contents of a table, so
3948 # there are relatively few conflicts.
3949
3950 my $self = shift;
3951 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3952
ffe43484 3953 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3954
3955 # These are quickly computable. Return looks like 'min..max;count'
3956 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3957 }
3958} # End closure for _Range_List_Base
3959
3960package Range_List;
3961use base '_Range_List_Base';
3962
3963# A Range_List is a range list for match tables; i.e. the range values are
3964# not significant. Thus a number of operations can be safely added to it,
3965# such as inversion, intersection. Note that union is also an unsafe
3966# operation when range values are cared about, and that method is in the base
3967# class, not here. But things are set up so that that method is callable only
3968# during initialization. Only in this derived class, is there an operation
3969# that combines two tables. A Range_Map can thus be used to initialize a
3970# Range_List, and its mappings will be in the list, but are not significant to
3971# this class.
3972
3973sub trace { return main::trace(@_); }
3974
3975{ # Closure
3976
3977 use overload
3978 fallback => 0,
3979 '+' => sub { my $self = shift;
3980 my $other = shift;
3981
3982 return $self->_union($other)
3983 },
3984 '&' => sub { my $self = shift;
3985 my $other = shift;
3986
3987 return $self->_intersect($other, 0);
3988 },
3989 '~' => "_invert",
3990 '-' => "_subtract",
3991 ;
3992
3993 sub _invert {
3994 # Returns a new Range_List that gives all code points not in $self.
3995
3996 my $self = shift;
3997
3998 my $new = Range_List->new;
3999
4000 # Go through each range in the table, finding the gaps between them
4001 my $max = -1; # Set so no gap before range beginning at 0
4002 for my $range ($self->ranges) {
4003 my $start = $range->start;
4004 my $end = $range->end;
4005
4006 # If there is a gap before this range, the inverse will contain
4007 # that gap.
4008 if ($start > $max + 1) {
4009 $new->add_range($max + 1, $start - 1);
4010 }
4011 $max = $end;
4012 }
4013
4014 # And finally, add the gap from the end of the table to the max
4015 # possible code point
4016 if ($max < $LAST_UNICODE_CODEPOINT) {
4017 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
4018 }
4019 return $new;
4020 }
4021
4022 sub _subtract {
4023 # Returns a new Range_List with the argument deleted from it. The
4024 # argument can be a single code point, a range, or something that has
4025 # a range, with the _range_list() method on it returning them
4026
4027 my $self = shift;
4028 my $other = shift;
4029 my $reversed = shift;
4030 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4031
4032 if ($reversed) {
4033 Carp::my_carp_bug("Can't cope with a "
4034 . __PACKAGE__
4035 . " being the second parameter in a '-'. Subtraction ignored.");
4036 return $self;
4037 }
4038
4039 my $new = Range_List->new(Initialize => $self);
4040
4041 if (! ref $other) { # Single code point
4042 $new->delete_range($other, $other);
4043 }
4044 elsif ($other->isa('Range')) {
4045 $new->delete_range($other->start, $other->end);
4046 }
4047 elsif ($other->can('_range_list')) {
4048 foreach my $range ($other->_range_list->ranges) {
4049 $new->delete_range($range->start, $range->end);
4050 }
4051 }
4052 else {
4053 Carp::my_carp_bug("Can't cope with a "
4054 . ref($other)
4055 . " argument to '-'. Subtraction ignored."
4056 );
4057 return $self;
4058 }
4059
4060 return $new;
4061 }
4062
4063 sub _intersect {
4064 # Returns either a boolean giving whether the two inputs' range lists
4065 # intersect (overlap), or a new Range_List containing the intersection
4066 # of the two lists. The optional final parameter being true indicates
4067 # to do the check instead of the intersection.
4068
4069 my $a_object = shift;
4070 my $b_object = shift;
4071 my $check_if_overlapping = shift;
4072 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4073 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4074
4075 if (! defined $b_object) {
4076 my $message = "";
4077 $message .= $a_object->_owner_name_of if defined $a_object;
4078 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4079 return;
4080 }
4081
4082 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4083 # Thus the intersection could be much more simply be written:
4084 # return ~(~$a_object + ~$b_object);
4085 # But, this is slower, and when taking the inverse of a large
4086 # range_size_1 table, back when such tables were always stored that
4087 # way, it became prohibitively slow, hence the code was changed to the
4088 # below
4089
4090 if ($b_object->isa('Range')) {
4091 $b_object = Range_List->new(Initialize => $b_object,
4092 Owner => $a_object->_owner_name_of);
4093 }
4094 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4095
4096 my @a_ranges = $a_object->ranges;
4097 my @b_ranges = $b_object->ranges;
4098
4099 #local $to_trace = 1 if main::DEBUG;
4100 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4101
4102 # Start with the first range in each list
4103 my $a_i = 0;
4104 my $range_a = $a_ranges[$a_i];
4105 my $b_i = 0;
4106 my $range_b = $b_ranges[$b_i];
4107
4108 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4109 if ! $check_if_overlapping;
4110
4111 # If either list is empty, there is no intersection and no overlap
4112 if (! defined $range_a || ! defined $range_b) {
4113 return $check_if_overlapping ? 0 : $new;
4114 }
4115 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4116
4117 # Otherwise, must calculate the intersection/overlap. Start with the
4118 # very first code point in each list
4119 my $a = $range_a->start;
4120 my $b = $range_b->start;
4121
4122 # Loop through all the ranges of each list; in each iteration, $a and
4123 # $b are the current code points in their respective lists
4124 while (1) {
4125
4126 # If $a and $b are the same code point, ...
4127 if ($a == $b) {
4128
4129 # it means the lists overlap. If just checking for overlap
4130 # know the answer now,
4131 return 1 if $check_if_overlapping;
4132
4133 # The intersection includes this code point plus anything else
4134 # common to both current ranges.
4135 my $start = $a;
4136 my $end = main::min($range_a->end, $range_b->end);
4137 if (! $check_if_overlapping) {
4138 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4139 $new->add_range($start, $end);
4140 }
4141
4142 # Skip ahead to the end of the current intersect
4143 $a = $b = $end;
4144
4145 # If the current intersect ends at the end of either range (as
4146 # it must for at least one of them), the next possible one
4147 # will be the beginning code point in it's list's next range.
4148 if ($a == $range_a->end) {
4149 $range_a = $a_ranges[++$a_i];
4150 last unless defined $range_a;
4151 $a = $range_a->start;
4152 }
4153 if ($b == $range_b->end) {
4154 $range_b = $b_ranges[++$b_i];
4155 last unless defined $range_b;
4156 $b = $range_b->start;
4157 }
4158
4159 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4160 }
4161 elsif ($a < $b) {
4162
4163 # Not equal, but if the range containing $a encompasses $b,
4164 # change $a to be the middle of the range where it does equal
4165 # $b, so the next iteration will get the intersection
4166 if ($range_a->end >= $b) {
4167 $a = $b;
4168 }
4169 else {
4170
4171 # Here, the current range containing $a is entirely below
4172 # $b. Go try to find a range that could contain $b.
4173 $a_i = $a_object->_search_ranges($b);
4174
4175 # If no range found, quit.
4176 last unless defined $a_i;
4177
4178 # The search returns $a_i, such that
4179 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4180 # Set $a to the beginning of this new range, and repeat.
4181 $range_a = $a_ranges[$a_i];
4182 $a = $range_a->start;
4183 }
4184 }
4185 else { # Here, $b < $a.
4186
4187 # Mirror image code to the leg just above
4188 if ($range_b->end >= $a) {
4189 $b = $a;
4190 }
4191 else {
4192 $b_i = $b_object->_search_ranges($a);
4193 last unless defined $b_i;
4194 $range_b = $b_ranges[$b_i];
4195 $b = $range_b->start;
4196 }
4197 }
4198 } # End of looping through ranges.
4199
4200 # Intersection fully computed, or now know that there is no overlap
4201 return $check_if_overlapping ? 0 : $new;
4202 }
4203
4204 sub overlaps {
4205 # Returns boolean giving whether the two arguments overlap somewhere
4206
4207 my $self = shift;
4208 my $other = shift;
4209 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4210
4211 return $self->_intersect($other, 1);
4212 }
4213
4214 sub add_range {
4215 # Add a range to the list.
4216
4217 my $self = shift;
4218 my $start = shift;
4219 my $end = shift;
4220 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4221
4222 return $self->_add_delete('+', $start, $end, "");
4223 }
4224
09aba7e4
KW
4225 sub matches_identically_to {
4226 # Return a boolean as to whether or not two Range_Lists match identical
4227 # sets of code points.
4228
4229 my $self = shift;
4230 my $other = shift;
4231 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4232
4233 # These are ordered in increasing real time to figure out (at least
4234 # until a patch changes that and doesn't change this)
4235 return 0 if $self->max != $other->max;
4236 return 0 if $self->min != $other->min;
4237 return 0 if $self->range_count != $other->range_count;
4238 return 0 if $self->count != $other->count;
4239
4240 # Here they could be identical because all the tests above passed.
4241 # The loop below is somewhat simpler since we know they have the same
4242 # number of elements. Compare range by range, until reach the end or
4243 # find something that differs.
4244 my @a_ranges = $self->ranges;
4245 my @b_ranges = $other->ranges;
4246 for my $i (0 .. @a_ranges - 1) {
4247 my $a = $a_ranges[$i];
4248 my $b = $b_ranges[$i];
4249 trace "self $a; other $b" if main::DEBUG && $to_trace;
c1c2d9e8
KW
4250 return 0 if ! defined $b
4251 || $a->start != $b->start
4252 || $a->end != $b->end;
09aba7e4
KW
4253 }
4254 return 1;
4255 }
4256
99870f4d
KW
4257 sub is_code_point_usable {
4258 # This used only for making the test script. See if the input
4259 # proposed trial code point is one that Perl will handle. If second
4260 # parameter is 0, it won't select some code points for various
4261 # reasons, noted below.
4262
4263 my $code = shift;
4264 my $try_hard = shift;
4265 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4266
4267 return 0 if $code < 0; # Never use a negative
4268
99870f4d
KW
4269 # shun null. I'm (khw) not sure why this was done, but NULL would be
4270 # the character very frequently used.
4271 return $try_hard if $code == 0x0000;
4272
99870f4d
KW
4273 # shun non-character code points.
4274 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4275 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4276
4277 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
4278 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4279
4280 return 1;
4281 }
4282
4283 sub get_valid_code_point {
4284 # Return a code point that's part of the range list. Returns nothing
4285 # if the table is empty or we can't find a suitable code point. This
4286 # used only for making the test script.
4287
4288 my $self = shift;
4289 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4290
ffe43484 4291 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4292
4293 # On first pass, don't choose less desirable code points; if no good
4294 # one is found, repeat, allowing a less desirable one to be selected.
4295 for my $try_hard (0, 1) {
4296
4297 # Look through all the ranges for a usable code point.
4298 for my $set ($self->ranges) {
4299
4300 # Try the edge cases first, starting with the end point of the
4301 # range.
4302 my $end = $set->end;
4303 return $end if is_code_point_usable($end, $try_hard);
4304
4305 # End point didn't, work. Start at the beginning and try
4306 # every one until find one that does work.
4307 for my $trial ($set->start .. $end - 1) {
4308 return $trial if is_code_point_usable($trial, $try_hard);
4309 }
4310 }
4311 }
4312 return (); # If none found, give up.
4313 }
4314
4315 sub get_invalid_code_point {
4316 # Return a code point that's not part of the table. Returns nothing
4317 # if the table covers all code points or a suitable code point can't
4318 # be found. This used only for making the test script.
4319
4320 my $self = shift;
4321 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4322
4323 # Just find a valid code point of the inverse, if any.
4324 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4325 }
4326} # end closure for Range_List
4327
4328package Range_Map;
4329use base '_Range_List_Base';
4330
4331# A Range_Map is a range list in which the range values (called maps) are
4332# significant, and hence shouldn't be manipulated by our other code, which
4333# could be ambiguous or lose things. For example, in taking the union of two
4334# lists, which share code points, but which have differing values, which one
4335# has precedence in the union?
4336# It turns out that these operations aren't really necessary for map tables,
4337# and so this class was created to make sure they aren't accidentally
4338# applied to them.
4339
4340{ # Closure
4341
4342 sub add_map {
4343 # Add a range containing a mapping value to the list
4344
4345 my $self = shift;
4346 # Rest of parameters passed on
4347
4348 return $self->_add_delete('+', @_);
4349 }
4350
4351 sub add_duplicate {
4352 # Adds entry to a range list which can duplicate an existing entry
4353
4354 my $self = shift;
4355 my $code_point = shift;
4356 my $value = shift;
4357 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4358
4359 return $self->add_map($code_point, $code_point,
4360 $value, Replace => $MULTIPLE);
4361 }
4362} # End of closure for package Range_Map
4363
4364package _Base_Table;
4365
4366# A table is the basic data structure that gets written out into a file for
4367# use by the Perl core. This is the abstract base class implementing the
4368# common elements from the derived ones. A list of the methods to be
4369# furnished by an implementing class is just after the constructor.
4370
4371sub standardize { return main::standardize($_[0]); }
4372sub trace { return main::trace(@_); }
4373
4374{ # Closure
4375
4376 main::setup_package();
4377
4378 my %range_list;
4379 # Object containing the ranges of the table.
4380 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4381
4382 my %full_name;
4383 # The full table name.
4384 main::set_access('full_name', \%full_name, 'r');
4385
4386 my %name;
4387 # The table name, almost always shorter
4388 main::set_access('name', \%name, 'r');
4389
4390 my %short_name;
4391 # The shortest of all the aliases for this table, with underscores removed
4392 main::set_access('short_name', \%short_name);
4393
4394 my %nominal_short_name_length;
4395 # The length of short_name before removing underscores
4396 main::set_access('nominal_short_name_length',
4397 \%nominal_short_name_length);
4398
23e33b60
KW
4399 my %complete_name;
4400 # The complete name, including property.
4401 main::set_access('complete_name', \%complete_name, 'r');
4402
99870f4d
KW
4403 my %property;
4404 # Parent property this table is attached to.
4405 main::set_access('property', \%property, 'r');
4406
4407 my %aliases;
c12f2655
KW
4408 # Ordered list of alias objects of the table's name. The first ones in
4409 # the list are output first in comments
99870f4d
KW
4410 main::set_access('aliases', \%aliases, 'readable_array');
4411
4412 my %comment;
4413 # A comment associated with the table for human readers of the files
4414 main::set_access('comment', \%comment, 's');
4415
4416 my %description;
4417 # A comment giving a short description of the table's meaning for human
4418 # readers of the files.
4419 main::set_access('description', \%description, 'readable_array');
4420
4421 my %note;
4422 # A comment giving a short note about the table for human readers of the
4423 # files.
4424 main::set_access('note', \%note, 'readable_array');
4425
4426 my %internal_only;
c12f2655 4427 # Boolean; if set this table is for internal core Perl only use.
99870f4d
KW
4428 main::set_access('internal_only', \%internal_only);
4429
4430 my %find_table_from_alias;
4431 # The parent property passes this pointer to a hash which this class adds
4432 # all its aliases to, so that the parent can quickly take an alias and
4433 # find this table.
4434 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4435
4436 my %locked;
4437 # After this table is made equivalent to another one; we shouldn't go
4438 # changing the contents because that could mean it's no longer equivalent
4439 main::set_access('locked', \%locked, 'r');
4440
4441 my %file_path;
4442 # This gives the final path to the file containing the table. Each
4443 # directory in the path is an element in the array
4444 main::set_access('file_path', \%file_path, 'readable_array');
4445
4446 my %status;
4447 # What is the table's status, normal, $OBSOLETE, etc. Enum
4448 main::set_access('status', \%status, 'r');
4449
4450 my %status_info;
4451 # A comment about its being obsolete, or whatever non normal status it has
4452 main::set_access('status_info', \%status_info, 'r');
4453
d867ccfb
KW
4454 my %caseless_equivalent;
4455 # The table this is equivalent to under /i matching, if any.
4456 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4457
99870f4d
KW
4458 my %range_size_1;
4459 # Is the table to be output with each range only a single code point?
4460 # This is done to avoid breaking existing code that may have come to rely
4461 # on this behavior in previous versions of this program.)
4462 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4463
4464 my %perl_extension;
4465 # A boolean set iff this table is a Perl extension to the Unicode
4466 # standard.
4467 main::set_access('perl_extension', \%perl_extension, 'r');
4468
0c07e538
KW
4469 my %output_range_counts;
4470 # A boolean set iff this table is to have comments written in the
4471 # output file that contain the number of code points in the range.
4472 # The constructor can override the global flag of the same name.
4473 main::set_access('output_range_counts', \%output_range_counts, 'r');
4474
f5817e0a
KW
4475 my %format;
4476 # The format of the entries of the table. This is calculated from the
4477 # data in the table (or passed in the constructor). This is an enum e.g.,
4478 # $STRING_FORMAT
4479 main::set_access('format', \%format, 'r', 'p_s');
4480
99870f4d
KW
4481 sub new {
4482 # All arguments are key => value pairs, which you can see below, most
4483 # of which match fields documented above. Otherwise: Pod_Entry,
4484 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4485 # documented in the Alias package
4486
4487 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4488
4489 my $class = shift;
4490
4491 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4492 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4493
4494 my %args = @_;
4495
4496 $name{$addr} = delete $args{'Name'};
4497 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4498 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4499 my $complete_name = $complete_name{$addr}
4500 = delete $args{'Complete_Name'};
f5817e0a 4501 $format{$addr} = delete $args{'Format'};
99870f4d 4502 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
0c07e538 4503 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4504 $property{$addr} = delete $args{'_Property'};
4505 $range_list{$addr} = delete $args{'_Range_List'};
4506 $status{$addr} = delete $args{'Status'} || $NORMAL;
4507 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4508 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
d867ccfb 4509 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
99870f4d
KW
4510
4511 my $description = delete $args{'Description'};
4512 my $externally_ok = delete $args{'Externally_Ok'};
4513 my $loose_match = delete $args{'Fuzzy'};
4514 my $note = delete $args{'Note'};
4515 my $make_pod_entry = delete $args{'Pod_Entry'};
37e2e78e 4516 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4517
4518 # Shouldn't have any left over
4519 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4520
4521 # Can't use || above because conceivably the name could be 0, and
4522 # can't use // operator in case this program gets used in Perl 5.8
4523 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4524 $output_range_counts{$addr} = $output_range_counts if
4525 ! defined $output_range_counts{$addr};
99870f4d
KW
4526
4527 $aliases{$addr} = [ ];
4528 $comment{$addr} = [ ];
4529 $description{$addr} = [ ];
4530 $note{$addr} = [ ];
4531 $file_path{$addr} = [ ];
4532 $locked{$addr} = "";
4533
4534 push @{$description{$addr}}, $description if $description;
4535 push @{$note{$addr}}, $note if $note;
4536
37e2e78e
KW
4537 if ($status{$addr} eq $PLACEHOLDER) {
4538
4539 # A placeholder table doesn't get documented, is a perl extension,
4540 # and quite likely will be empty
4541 $make_pod_entry = 0 if ! defined $make_pod_entry;
4542 $perl_extension = 1 if ! defined $perl_extension;
4543 push @tables_that_may_be_empty, $complete_name{$addr};
4544 }
4545 elsif (! $status{$addr}) {
4546
4547 # If hasn't set its status already, see if it is on one of the
4548 # lists of properties or tables that have particular statuses; if
4549 # not, is normal. The lists are prioritized so the most serious
4550 # ones are checked first
ec11e5f4 4551 if (exists $why_suppressed{$complete_name}
98dc9551 4552 # Don't suppress if overridden
ec11e5f4
KW
4553 && ! grep { $_ eq $complete_name{$addr} }
4554 @output_mapped_properties)
4555 {
99870f4d
KW
4556 $status{$addr} = $SUPPRESSED;
4557 }
4558 elsif (exists $why_deprecated{$complete_name}) {
4559 $status{$addr} = $DEPRECATED;
4560 }
4561 elsif (exists $why_stabilized{$complete_name}) {
4562 $status{$addr} = $STABILIZED;
4563 }
4564 elsif (exists $why_obsolete{$complete_name}) {
4565 $status{$addr} = $OBSOLETE;
4566 }
4567
4568 # Existence above doesn't necessarily mean there is a message
4569 # associated with it. Use the most serious message.
4570 if ($status{$addr}) {
4571 if ($why_suppressed{$complete_name}) {
4572 $status_info{$addr}
4573 = $why_suppressed{$complete_name};
4574 }
4575 elsif ($why_deprecated{$complete_name}) {
4576 $status_info{$addr}
4577 = $why_deprecated{$complete_name};
4578 }
4579 elsif ($why_stabilized{$complete_name}) {
4580 $status_info{$addr}
4581 = $why_stabilized{$complete_name};
4582 }
4583 elsif ($why_obsolete{$complete_name}) {
4584 $status_info{$addr}
4585 = $why_obsolete{$complete_name};
4586 }
4587 }
4588 }
4589
37e2e78e
KW
4590 $perl_extension{$addr} = $perl_extension || 0;
4591
99870f4d
KW
4592 # By convention what typically gets printed only or first is what's
4593 # first in the list, so put the full name there for good output
4594 # clarity. Other routines rely on the full name being first on the
4595 # list
4596 $self->add_alias($full_name{$addr},
4597 Externally_Ok => $externally_ok,
4598 Fuzzy => $loose_match,
4599 Pod_Entry => $make_pod_entry,
4600 Status => $status{$addr},
4601 );
4602
4603 # Then comes the other name, if meaningfully different.
4604 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4605 $self->add_alias($name{$addr},
4606 Externally_Ok => $externally_ok,
4607 Fuzzy => $loose_match,
4608 Pod_Entry => $make_pod_entry,
4609 Status => $status{$addr},
4610 );
4611 }
4612
4613 return $self;
4614 }
4615
4616 # Here are the methods that are required to be defined by any derived
4617 # class
ea25a9b2 4618 for my $sub (qw(
668b3bfc 4619 handle_special_range
99870f4d 4620 append_to_body
99870f4d 4621 pre_body
ea25a9b2 4622 ))
668b3bfc
KW
4623 # write() knows how to write out normal ranges, but it calls
4624 # handle_special_range() when it encounters a non-normal one.
4625 # append_to_body() is called by it after it has handled all
4626 # ranges to add anything after the main portion of the table.
4627 # And finally, pre_body() is called after all this to build up
4628 # anything that should appear before the main portion of the
4629 # table. Doing it this way allows things in the middle to
4630 # affect what should appear before the main portion of the
99870f4d 4631 # table.
99870f4d
KW
4632 {
4633 no strict "refs";
4634 *$sub = sub {
4635 Carp::my_carp_bug( __LINE__
4636 . ": Must create method '$sub()' for "
4637 . ref shift);
4638 return;
4639 }
4640 }
4641
4642 use overload
4643 fallback => 0,
4644 "." => \&main::_operator_dot,
4645 '!=' => \&main::_operator_not_equal,
4646 '==' => \&main::_operator_equal,
4647 ;
4648
4649 sub ranges {
4650 # Returns the array of ranges associated with this table.
4651
f998e60c 4652 no overloading;
051df77b 4653 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4654 }
4655
4656 sub add_alias {
4657 # Add a synonym for this table.
4658
4659 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4660
4661 my $self = shift;
4662 my $name = shift; # The name to add.
4663 my $pointer = shift; # What the alias hash should point to. For
4664 # map tables, this is the parent property;
4665 # for match tables, it is the table itself.
4666
4667 my %args = @_;
4668 my $loose_match = delete $args{'Fuzzy'};
4669
4670 my $make_pod_entry = delete $args{'Pod_Entry'};
4671 $make_pod_entry = $YES unless defined $make_pod_entry;
4672
4673 my $externally_ok = delete $args{'Externally_Ok'};
4674 $externally_ok = 1 unless defined $externally_ok;
4675
4676 my $status = delete $args{'Status'};
4677 $status = $NORMAL unless defined $status;
4678
4679 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4680
4681 # Capitalize the first letter of the alias unless it is one of the CJK
4682 # ones which specifically begins with a lower 'k'. Do this because
4683 # Unicode has varied whether they capitalize first letters or not, and
4684 # have later changed their minds and capitalized them, but not the
4685 # other way around. So do it always and avoid changes from release to
4686 # release
4687 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4688
ffe43484 4689 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4690
4691 # Figure out if should be loosely matched if not already specified.
4692 if (! defined $loose_match) {
4693
4694 # Is a loose_match if isn't null, and doesn't begin with an
4695 # underscore and isn't just a number
4696 if ($name ne ""
4697 && substr($name, 0, 1) ne '_'
4698 && $name !~ qr{^[0-9_.+-/]+$})
4699 {
4700 $loose_match = 1;
4701 }
4702 else {
4703 $loose_match = 0;
4704 }
4705 }
4706
4707 # If this alias has already been defined, do nothing.
4708 return if defined $find_table_from_alias{$addr}->{$name};
4709
4710 # That includes if it is standardly equivalent to an existing alias,
4711 # in which case, add this name to the list, so won't have to search
4712 # for it again.
4713 my $standard_name = main::standardize($name);
4714 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4715 $find_table_from_alias{$addr}->{$name}
4716 = $find_table_from_alias{$addr}->{$standard_name};
4717 return;
4718 }
4719
4720 # Set the index hash for this alias for future quick reference.
4721 $find_table_from_alias{$addr}->{$name} = $pointer;
4722 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4723 local $to_trace = 0 if main::DEBUG;
4724 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4725 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4726
4727
4728 # Put the new alias at the end of the list of aliases unless the final
4729 # element begins with an underscore (meaning it is for internal perl
4730 # use) or is all numeric, in which case, put the new one before that
4731 # one. This floats any all-numeric or underscore-beginning aliases to
4732 # the end. This is done so that they are listed last in output lists,
4733 # to encourage the user to use a better name (either more descriptive
4734 # or not an internal-only one) instead. This ordering is relied on
4735 # implicitly elsewhere in this program, like in short_name()
4736 my $list = $aliases{$addr};
4737 my $insert_position = (@$list == 0
4738 || (substr($list->[-1]->name, 0, 1) ne '_'
4739 && $list->[-1]->name =~ /\D/))
4740 ? @$list
4741 : @$list - 1;
4742 splice @$list,
4743 $insert_position,
4744 0,
4745 Alias->new($name, $loose_match, $make_pod_entry,
4746 $externally_ok, $status);
4747
4748 # This name may be shorter than any existing ones, so clear the cache
4749 # of the shortest, so will have to be recalculated.
f998e60c 4750 no overloading;
051df77b 4751 undef $short_name{pack 'J', $self};
99870f4d
KW
4752 return;
4753 }
4754
4755 sub short_name {
4756 # Returns a name suitable for use as the base part of a file name.
4757 # That is, shorter wins. It can return undef if there is no suitable
4758 # name. The name has all non-essential underscores removed.
4759
4760 # The optional second parameter is a reference to a scalar in which
4761 # this routine will store the length the returned name had before the
4762 # underscores were removed, or undef if the return is undef.
4763
4764 # The shortest name can change if new aliases are added. So using
4765 # this should be deferred until after all these are added. The code
4766 # that does that should clear this one's cache.
4767 # Any name with alphabetics is preferred over an all numeric one, even
4768 # if longer.
4769
4770 my $self = shift;
4771 my $nominal_length_ptr = shift;
4772 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4773
ffe43484 4774 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4775
4776 # For efficiency, don't recalculate, but this means that adding new
4777 # aliases could change what the shortest is, so the code that does
4778 # that needs to undef this.
4779 if (defined $short_name{$addr}) {
4780 if ($nominal_length_ptr) {
4781 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4782 }
4783 return $short_name{$addr};
4784 }
4785
4786 # Look at each alias
4787 foreach my $alias ($self->aliases()) {
4788
4789 # Don't use an alias that isn't ok to use for an external name.
4790 next if ! $alias->externally_ok;
4791
4792 my $name = main::Standardize($alias->name);
4793 trace $self, $name if main::DEBUG && $to_trace;
4794
4795 # Take the first one, or a shorter one that isn't numeric. This
4796 # relies on numeric aliases always being last in the array
4797 # returned by aliases(). Any alpha one will have precedence.
4798 if (! defined $short_name{$addr}
4799 || ($name =~ /\D/
4800 && length($name) < length($short_name{$addr})))
4801 {
4802 # Remove interior underscores.
4803 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4804
4805 $nominal_short_name_length{$addr} = length $name;
4806 }
4807 }
4808
ff485b9e
KW
4809 # If the short name isn't a nice one, perhaps an equivalent table has
4810 # a better one.
4811 if (! defined $short_name{$addr}
4812 || $short_name{$addr} eq ""
4813 || $short_name{$addr} eq "_")
4814 {
4815 my $return;
4816 foreach my $follower ($self->children) { # All equivalents
4817 my $follower_name = $follower->short_name;
4818 next unless defined $follower_name;
4819
4820 # Anything (except undefined) is better than underscore or
4821 # empty
4822 if (! defined $return || $return eq "_") {
4823 $return = $follower_name;
4824 next;
4825 }
4826
4827 # If the new follower name isn't "_" and is shorter than the
4828 # current best one, prefer the new one.
4829 next if $follower_name eq "_";
4830 next if length $follower_name > length $return;
4831 $return = $follower_name;
4832 }
4833 $short_name{$addr} = $return if defined $return;
4834 }
4835
99870f4d
KW
4836 # If no suitable external name return undef
4837 if (! defined $short_name{$addr}) {
4838 $$nominal_length_ptr = undef if $nominal_length_ptr;
4839 return;
4840 }
4841
c12f2655 4842 # Don't allow a null short name.
99870f4d
KW
4843 if ($short_name{$addr} eq "") {
4844 $short_name{$addr} = '_';
4845 $nominal_short_name_length{$addr} = 1;
4846 }
4847
4848 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4849
4850 if ($nominal_length_ptr) {
4851 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4852 }
4853 return $short_name{$addr};
4854 }
4855
4856 sub external_name {
4857 # Returns the external name that this table should be known by. This
c12f2655
KW
4858 # is usually the short_name, but not if the short_name is undefined,
4859 # in which case the external_name is arbitrarily set to the
4860 # underscore.
99870f4d
KW
4861
4862 my $self = shift;
4863 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4864
4865 my $short = $self->short_name;
4866 return $short if defined $short;
4867
4868 return '_';
4869 }
4870
4871 sub add_description { # Adds the parameter as a short description.
4872
4873 my $self = shift;
4874 my $description = shift;
4875 chomp $description;
4876 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4877
f998e60c 4878 no overloading;
051df77b 4879 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
4880
4881 return;
4882 }
4883
4884 sub add_note { # Adds the parameter as a short note.
4885
4886 my $self = shift;
4887 my $note = shift;
4888 chomp $note;
4889 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4890
f998e60c 4891 no overloading;
051df77b 4892 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
4893
4894 return;
4895 }
4896
4897 sub add_comment { # Adds the parameter as a comment.
4898
bd9ebcfd
KW
4899 return unless $debugging_build;
4900
99870f4d
KW
4901 my $self = shift;
4902 my $comment = shift;
4903 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4904
4905 chomp $comment;
f998e60c
KW
4906
4907 no overloading;
051df77b 4908 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
4909
4910 return;
4911 }
4912
4913 sub comment {
4914 # Return the current comment for this table. If called in list
4915 # context, returns the array of comments. In scalar, returns a string
4916 # of each element joined together with a period ending each.
4917
4918 my $self = shift;
4919 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4920
ffe43484 4921 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 4922 my @list = @{$comment{$addr}};
99870f4d
KW
4923 return @list if wantarray;
4924 my $return = "";
4925 foreach my $sentence (@list) {
4926 $return .= '. ' if $return;
4927 $return .= $sentence;
4928 $return =~ s/\.$//;
4929 }
4930 $return .= '.' if $return;
4931 return $return;
4932 }
4933
4934 sub initialize {
4935 # Initialize the table with the argument which is any valid
4936 # initialization for range lists.
4937
4938 my $self = shift;
ffe43484 4939 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4940 my $initialization = shift;
4941 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4942
4943 # Replace the current range list with a new one of the same exact
4944 # type.
f998e60c
KW
4945 my $class = ref $range_list{$addr};
4946 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
4947 Initialize => $initialization);
4948 return;
4949
4950 }
4951
4952 sub header {
4953 # The header that is output for the table in the file it is written
4954 # in.
4955
4956 my $self = shift;
4957 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4958
4959 my $return = "";
4960 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4961 $return .= $HEADER;
f998e60c 4962 no overloading;
051df77b 4963 $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
99870f4d
KW
4964 return $return;
4965 }
4966
4967 sub write {
668b3bfc
KW
4968 # Write a representation of the table to its file. It calls several
4969 # functions furnished by sub-classes of this abstract base class to
4970 # handle non-normal ranges, to add stuff before the table, and at its
4971 # end.
99870f4d
KW
4972
4973 my $self = shift;
4974 my $tab_stops = shift; # The number of tab stops over to put any
4975 # comment.
4976 my $suppress_value = shift; # Optional, if the value associated with
4977 # a range equals this one, don't write
4978 # the range
4979 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4980
ffe43484 4981 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4982
4983 # Start with the header
668b3bfc 4984 my @HEADER = $self->header;
99870f4d
KW
4985
4986 # Then the comments
668b3bfc 4987 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
99870f4d
KW
4988 if $comment{$addr};
4989
668b3bfc
KW
4990 # Things discovered processing the main body of the document may
4991 # affect what gets output before it, therefore pre_body() isn't called
4992 # until after all other processing of the table is done.
99870f4d 4993
c4019d52
KW
4994 # The main body looks like a 'here' document. If annotating, get rid
4995 # of the comments before passing to the caller, as some callers, such
4996 # as charnames.pm, can't cope with them. (Outputting range counts
4997 # also introduces comments, but these don't show up in the tables that
4998 # can't cope with comments, and there aren't that many of them that
4999 # it's worth the extra real time to get rid of them).
668b3bfc 5000 my @OUT;
558712cf 5001 if ($annotate) {
c4019d52
KW
5002 # Use the line below in Perls that don't have /r
5003 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5004 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5005 } else {
5006 push @OUT, "return <<'END';\n";
5007 }
99870f4d
KW
5008
5009 if ($range_list{$addr}->is_empty) {
5010
5011 # This is a kludge for empty tables to silence a warning in
5012 # utf8.c, which can't really deal with empty tables, but it can
5013 # deal with a table that matches nothing, as the inverse of 'Any'
5014 # does.
67a53d68 5015 push @OUT, "!utf8::Any\n";
99870f4d 5016 }
c69a9c68
KW
5017 elsif ($self->name eq 'N'
5018
5019 # To save disk space and table cache space, avoid putting out
5020 # binary N tables, but instead create a file which just inverts
5021 # the Y table. Since the file will still exist and occupy a
5022 # certain number of blocks, might as well output the whole
5023 # thing if it all will fit in one block. The number of
5024 # ranges below is an approximate number for that.
5025 && $self->property->type == $BINARY
5026 # && $self->property->tables == 2 Can't do this because the
5027 # non-binary properties, like NFDQC aren't specifiable
5028 # by the notation
5029 && $range_list{$addr}->ranges > 15
5030 && ! $annotate) # Under --annotate, want to see everything
5031 {
5032 push @OUT, "!utf8::" . $self->property->name . "\n";
5033 }
99870f4d
KW
5034 else {
5035 my $range_size_1 = $range_size_1{$addr};
558712cf
KW
5036 my $format; # Used only in $annotate option
5037 my $include_name; # Used only in $annotate option
c4019d52 5038
558712cf 5039 if ($annotate) {
c4019d52
KW
5040
5041 # if annotating each code point, must print 1 per line.
5042 # The variable could point to a subroutine, and we don't want
5043 # to lose that fact, so only set if not set already
5044 $range_size_1 = 1 if ! $range_size_1;
5045
5046 $format = $self->format;
5047
5048 # The name of the character is output only for tables that
5049 # don't already include the name in the output.
5050 my $property = $self->property;
5051 $include_name =
5052 ! ($property == $perl_charname
5053 || $property == main::property_ref('Unicode_1_Name')
5054 || $property == main::property_ref('Name')
5055 || $property == main::property_ref('Name_Alias')
5056 );
5057 }
99870f4d
KW
5058
5059 # Output each range as part of the here document.
5a2b5ddb 5060 RANGE:
99870f4d 5061 for my $set ($range_list{$addr}->ranges) {
5a2b5ddb
KW
5062 if ($set->type != 0) {
5063 $self->handle_special_range($set);
5064 next RANGE;
5065 }
99870f4d
KW
5066 my $start = $set->start;
5067 my $end = $set->end;
5068 my $value = $set->value;
5069
5070 # Don't output ranges whose value is the one to suppress
c4019d52
KW
5071 next RANGE if defined $suppress_value
5072 && $value eq $suppress_value;
99870f4d 5073
c4019d52
KW
5074 # If there is a range and doesn't need a single point range
5075 # output
5076 if ($start != $end && ! $range_size_1) {
bd9ebcfd
KW
5077 push @OUT, sprintf "%04X\t%04X", $start, $end;
5078 $OUT[-1] .= "\t$value" if $value ne "";
99870f4d
KW
5079
5080 # Add a comment with the size of the range, if requested.
5081 # Expand Tabs to make sure they all start in the same
5082 # column, and then unexpand to use mostly tabs.
0c07e538 5083 if (! $output_range_counts{$addr}) {
99870f4d
KW
5084 $OUT[-1] .= "\n";
5085 }
5086 else {
5087 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5088 my $count = main::clarify_number($end - $start + 1);
5089 use integer;
5090
5091 my $width = $tab_stops * 8 - 1;
5092 $OUT[-1] = sprintf("%-*s # [%s]\n",
5093 $width,
5094 $OUT[-1],
5095 $count);
5096 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5097 }
c4019d52
KW
5098 next RANGE;
5099 }
5100
5101 # Here to output a single code point per line
5102
5103 # If not to annotate, use the simple formats
558712cf 5104 if (! $annotate) {
c4019d52
KW
5105
5106 # Use any passed in subroutine to output.
5107 if (ref $range_size_1 eq 'CODE') {
5108 for my $i ($start .. $end) {
5109 push @OUT, &{$range_size_1}($i, $value);
5110 }
5111 }
5112 else {
5113
5114 # Here, caller is ok with default output.
5115 for (my $i = $start; $i <= $end; $i++) {
5116 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5117 }
5118 }
5119 next RANGE;
5120 }
5121
5122 # Here, wants annotation.
5123 for (my $i = $start; $i <= $end; $i++) {
5124
5125 # Get character information if don't have it already
5126 main::populate_char_info($i)
5127 if ! defined $viacode[$i];
5128 my $type = $annotate_char_type[$i];
5129
5130 # Figure out if should output the next code points as part
5131 # of a range or not. If this is not in an annotation
5132 # range, then won't output as a range, so returns $i.
5133 # Otherwise use the end of the annotation range, but no
5134 # further than the maximum possible end point of the loop.
5135 my $range_end = main::min($annotate_ranges->value_of($i)
5136 || $i,
5137 $end);
5138
5139 # Use a range if it is a range, and either is one of the
5140 # special annotation ranges, or the range is at most 3
5141 # long. This last case causes the algorithmically named
5142 # code points to be output individually in spans of at
5143 # most 3, as they are the ones whose $type is > 0.
5144 if ($range_end != $i
5145 && ( $type < 0 || $range_end - $i > 2))
5146 {
5147 # Here is to output a range. We don't allow a
5148 # caller-specified output format--just use the
5149 # standard one.
5150 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5151 $range_end,
5152 $value;
5153 my $range_name = $viacode[$i];
5154
5155 # For the code points which end in their hex value, we
5156 # eliminate that from the output annotation, and
5157 # capitalize only the first letter of each word.
5158 if ($type == $CP_IN_NAME) {
5159 my $hex = sprintf "%04X", $i;
5160 $range_name =~ s/-$hex$//;
5161 my @words = split " ", $range_name;
5162 for my $word (@words) {
5163 $word = ucfirst(lc($word)) if $word ne 'CJK';
5164 }
5165 $range_name = join " ", @words;
5166 }
5167 elsif ($type == $HANGUL_SYLLABLE) {
5168 $range_name = "Hangul Syllable";
5169 }
5170
5171 $OUT[-1] .= " $range_name" if $range_name;
5172
5173 # Include the number of code points in the range
5174 my $count = main::clarify_number($range_end - $i + 1);
5175 $OUT[-1] .= " [$count]\n";
5176
5177 # Skip to the end of the range
5178 $i = $range_end;
5179 }
5180 else { # Not in a range.
5181 my $comment = "";
5182
5183 # When outputting the names of each character, use
5184 # the character itself if printable
5185 $comment .= "'" . chr($i) . "' " if $printable[$i];
5186
5187 # To make it more readable, use a minimum indentation
5188 my $comment_indent;
5189
5190 # Determine the annotation
5191 if ($format eq $DECOMP_STRING_FORMAT) {
5192
5193 # This is very specialized, with the type of
5194 # decomposition beginning the line enclosed in
5195 # <...>, and the code points that the code point
5196 # decomposes to separated by blanks. Create two
5197 # strings, one of the printable characters, and
5198 # one of their official names.
5199 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5200 my $tostr = "";
5201 my $to_name = "";
5202 my $to_chr = "";
5203 foreach my $to (split " ", $map) {
5204 $to = CORE::hex $to;
5205 $to_name .= " + " if $to_name;
5206 $to_chr .= chr($to);
5207 main::populate_char_info($to)
5208 if ! defined $viacode[$to];
5209 $to_name .= $viacode[$to];
5210 }
5211
5212 $comment .=
5213 "=> '$to_chr'; $viacode[$i] => $to_name";
5214 $comment_indent = 25; # Determined by experiment
5215 }
5216 else {
5217
5218 # Assume that any table that has hex format is a
5219 # mapping of one code point to another.
5220 if ($format eq $HEX_FORMAT) {
5221 my $decimal_value = CORE::hex $value;
5222 main::populate_char_info($decimal_value)
5223 if ! defined $viacode[$decimal_value];
5224 $comment .= "=> '"
5225 . chr($decimal_value)
5226 . "'; " if $printable[$decimal_value];
5227 }
5228 $comment .= $viacode[$i] if $include_name
5229 && $viacode[$i];
5230 if ($format eq $HEX_FORMAT) {
5231 my $decimal_value = CORE::hex $value;
5232 $comment .= " => $viacode[$decimal_value]"
5233 if $viacode[$decimal_value];
5234 }
5235
5236 # If including the name, no need to indent, as the
5237 # name will already be way across the line.
5238 $comment_indent = ($include_name) ? 0 : 60;
5239 }
5240
5241 # Use any passed in routine to output the base part of
5242 # the line.
5243 if (ref $range_size_1 eq 'CODE') {
5244 my $base_part = &{$range_size_1}($i, $value);
5245 chomp $base_part;
5246 push @OUT, $base_part;
5247 }
5248 else {
5249 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5250 }
5251
5252 # And add the annotation.
5253 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5254 $OUT[-1],
5255 $comment if $comment;
5256 $OUT[-1] .= "\n";
5257 }
99870f4d
KW
5258 }
5259 } # End of loop through all the table's ranges
5260 }
5261
5262 # Add anything that goes after the main body, but within the here
5263 # document,
5264 my $append_to_body = $self->append_to_body;
5265 push @OUT, $append_to_body if $append_to_body;
5266
5267 # And finish the here document.
5268 push @OUT, "END\n";
5269
668b3bfc
KW
5270 # Done with the main portion of the body. Can now figure out what
5271 # should appear before it in the file.
5272 my $pre_body = $self->pre_body;
5273 push @HEADER, $pre_body, "\n" if $pre_body;
668b3bfc 5274
6b0079b5
KW
5275 # All these files should have a .pl suffix added to them.
5276 my @file_with_pl = @{$file_path{$addr}};
5277 $file_with_pl[-1] .= '.pl';
99870f4d 5278
6b0079b5 5279 main::write(\@file_with_pl,
558712cf 5280 $annotate, # utf8 iff annotating
9218f1cf
KW
5281 \@HEADER,
5282 \@OUT);
99870f4d
KW
5283 return;
5284 }
5285
5286 sub set_status { # Set the table's status
5287 my $self = shift;
5288 my $status = shift; # The status enum value
5289 my $info = shift; # Any message associated with it.
5290 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5291
ffe43484 5292 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5293
5294 $status{$addr} = $status;
5295 $status_info{$addr} = $info;
5296 return;
5297 }
5298
5299 sub lock {
5300 # Don't allow changes to the table from now on. This stores a stack
5301 # trace of where it was called, so that later attempts to modify it
5302 # can immediately show where it got locked.
5303
5304 my $self = shift;
5305 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5306
ffe43484 5307 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5308
5309 $locked{$addr} = "";
5310
5311 my $line = (caller(0))[2];
5312 my $i = 1;
5313
5314 # Accumulate the stack trace
5315 while (1) {
5316 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5317
5318 last unless defined $caller;
5319
5320 $locked{$addr} .= " called from $caller() at line $line\n";
5321 $line = $caller_line;
5322 }
5323 $locked{$addr} .= " called from main at line $line\n";
5324
5325 return;
5326 }
5327
5328 sub carp_if_locked {
5329 # Return whether a table is locked or not, and, by the way, complain
5330 # if is locked
5331
5332 my $self = shift;
5333 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5334
ffe43484 5335 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5336
5337 return 0 if ! $locked{$addr};
5338 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5339 return 1;
5340 }
5341
5342 sub set_file_path { # Set the final directory path for this table
5343 my $self = shift;
5344 # Rest of parameters passed on
5345
f998e60c 5346 no overloading;
051df77b 5347 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
5348 return
5349 }
5350
5351 # Accessors for the range list stored in this table. First for
5352 # unconditional
ea25a9b2 5353 for my $sub (qw(
2f7a8815 5354 containing_range
99870f4d
KW
5355 contains
5356 count
5357 each_range
5358 hash
5359 is_empty
09aba7e4 5360 matches_identically_to
99870f4d
KW
5361 max
5362 min
5363 range_count
5364 reset_each_range
0a9dbafc 5365 type_of
99870f4d 5366 value_of
ea25a9b2 5367 ))
99870f4d
KW
5368 {
5369 no strict "refs";
5370 *$sub = sub {
5371 use strict "refs";
5372 my $self = shift;
f998e60c 5373 no overloading;
051df77b 5374 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
5375 }
5376 }
5377
5378 # Then for ones that should fail if locked
ea25a9b2 5379 for my $sub (qw(
99870f4d 5380 delete_range
ea25a9b2 5381 ))
99870f4d
KW
5382 {
5383 no strict "refs";
5384 *$sub = sub {
5385 use strict "refs";
5386 my $self = shift;
5387
5388 return if $self->carp_if_locked;
f998e60c 5389 no overloading;
051df77b 5390 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
5391 }
5392 }
5393
5394} # End closure
5395
5396package Map_Table;
5397use base '_Base_Table';
5398
5399# A Map Table is a table that contains the mappings from code points to
5400# values. There are two weird cases:
5401# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5402# are written in the table's file at the end of the table nonetheless. It
5403# requires specially constructed code to handle these; utf8.c can not read
5404# these in, so they should not go in $map_directory. As of this writing,
5405# the only case that these happen is for named sequences used in
5406# charnames.pm. But this code doesn't enforce any syntax on these, so
5407# something else could come along that uses it.
5408# 2) Specials are anything that doesn't fit syntactically into the body of the
5409# table. The ranges for these have a map type of non-zero. The code below
5410# knows about and handles each possible type. In most cases, these are
5411# written as part of the header.
5412#
5413# A map table deliberately can't be manipulated at will unlike match tables.
5414# This is because of the ambiguities having to do with what to do with
5415# overlapping code points. And there just isn't a need for those things;
5416# what one wants to do is just query, add, replace, or delete mappings, plus
5417# write the final result.
5418# However, there is a method to get the list of possible ranges that aren't in
5419# this table to use for defaulting missing code point mappings. And,
5420# map_add_or_replace_non_nulls() does allow one to add another table to this
5421# one, but it is clearly very specialized, and defined that the other's
5422# non-null values replace this one's if there is any overlap.
5423
5424sub trace { return main::trace(@_); }
5425
5426{ # Closure
5427
5428 main::setup_package();
5429
5430 my %default_map;
5431 # Many input files omit some entries; this gives what the mapping for the
5432 # missing entries should be
5433 main::set_access('default_map', \%default_map, 'r');
5434
5435 my %anomalous_entries;
5436 # Things that go in the body of the table which don't fit the normal
5437 # scheme of things, like having a range. Not much can be done with these
5438 # once there except to output them. This was created to handle named
5439 # sequences.
5440 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5441 main::set_access('anomalous_entries', # Append singular, read plural
5442 \%anomalous_entries,
5443 'readable_array');
5444
99870f4d
KW
5445 my %core_access;
5446 # This is a string, solely for documentation, indicating how one can get
5447 # access to this property via the Perl core.
5448 main::set_access('core_access', \%core_access, 'r', 's');
5449
99870f4d 5450 my %to_output_map;
8572ace0 5451 # Enum as to whether or not to write out this map table:
c12f2655 5452 # 0 don't output
8572ace0
KW
5453 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5454 # it should not be removed nor its format changed. This
5455 # is done for those files that have traditionally been
5456 # output.
5457 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5458 # with this file
99870f4d
KW
5459 main::set_access('to_output_map', \%to_output_map, 's');
5460
5461
5462 sub new {
5463 my $class = shift;
5464 my $name = shift;
5465
5466 my %args = @_;
5467
5468 # Optional initialization data for the table.
5469 my $initialize = delete $args{'Initialize'};
5470
5471 my $core_access = delete $args{'Core_Access'};
5472 my $default_map = delete $args{'Default_Map'};
99870f4d 5473 my $property = delete $args{'_Property'};
23e33b60 5474 my $full_name = delete $args{'Full_Name'};
20863809 5475
99870f4d
KW
5476 # Rest of parameters passed on
5477
5478 my $range_list = Range_Map->new(Owner => $property);
5479
5480 my $self = $class->SUPER::new(
5481 Name => $name,
23e33b60
KW
5482 Complete_Name => $full_name,
5483 Full_Name => $full_name,
99870f4d
KW
5484 _Property => $property,
5485 _Range_List => $range_list,
5486 %args);
5487
ffe43484 5488 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5489
5490 $anomalous_entries{$addr} = [];
5491 $core_access{$addr} = $core_access;
5492 $default_map{$addr} = $default_map;
99870f4d
KW
5493
5494 $self->initialize($initialize) if defined $initialize;
5495
5496 return $self;
5497 }
5498
5499 use overload
5500 fallback => 0,
5501 qw("") => "_operator_stringify",
5502 ;
5503
5504 sub _operator_stringify {
5505 my $self = shift;
5506
5507 my $name = $self->property->full_name;
5508 $name = '""' if $name eq "";
5509 return "Map table for Property '$name'";
5510 }
5511
99870f4d
KW
5512 sub add_alias {
5513 # Add a synonym for this table (which means the property itself)
5514 my $self = shift;
5515 my $name = shift;
5516 # Rest of parameters passed on.
5517
5518 $self->SUPER::add_alias($name, $self->property, @_);
5519 return;
5520 }
5521
5522 sub add_map {
5523 # Add a range of code points to the list of specially-handled code
5524 # points. $MULTI_CP is assumed if the type of special is not passed
5525 # in.
5526
5527 my $self = shift;
5528 my $lower = shift;
5529 my $upper = shift;
5530 my $string = shift;
5531 my %args = @_;
5532
5533 my $type = delete $args{'Type'} || 0;
5534 # Rest of parameters passed on
5535
5536 # Can't change the table if locked.
5537 return if $self->carp_if_locked;
5538
ffe43484 5539 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 5540
99870f4d
KW
5541 $self->_range_list->add_map($lower, $upper,
5542 $string,
5543 @_,
5544 Type => $type);
5545 return;
5546 }
5547
5548 sub append_to_body {
5549 # Adds to the written HERE document of the table's body any anomalous
5550 # entries in the table..
5551
5552 my $self = shift;
5553 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5554
ffe43484 5555 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5556
5557 return "" unless @{$anomalous_entries{$addr}};
5558 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5559 }
5560
5561 sub map_add_or_replace_non_nulls {
5562 # This adds the mappings in the table $other to $self. Non-null
5563 # mappings from $other override those in $self. It essentially merges
5564 # the two tables, with the second having priority except for null
5565 # mappings.
5566
5567 my $self = shift;
5568 my $other = shift;
5569 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5570
5571 return if $self->carp_if_locked;
5572
5573 if (! $other->isa(__PACKAGE__)) {
5574 Carp::my_carp_bug("$other should be a "
5575 . __PACKAGE__
5576 . ". Not a '"
5577 . ref($other)
5578 . "'. Not added;");
5579 return;
5580 }
5581
ffe43484
NC
5582 my $addr = do { no overloading; pack 'J', $self; };
5583 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5584
5585 local $to_trace = 0 if main::DEBUG;
5586
5587 my $self_range_list = $self->_range_list;
5588 my $other_range_list = $other->_range_list;
5589 foreach my $range ($other_range_list->ranges) {
5590 my $value = $range->value;
5591 next if $value eq "";
5592 $self_range_list->_add_delete('+',
5593 $range->start,
5594 $range->end,
5595 $value,
5596 Type => $range->type,
5597 Replace => $UNCONDITIONALLY);
5598 }
5599
99870f4d
KW
5600 return;
5601 }
5602
5603 sub set_default_map {
5604 # Define what code points that are missing from the input files should
5605 # map to
5606
5607 my $self = shift;
5608 my $map = shift;
5609 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5610
ffe43484 5611 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5612
5613 # Convert the input to the standard equivalent, if any (won't have any
5614 # for $STRING properties)
5615 my $standard = $self->_find_table_from_alias->{$map};
5616 $map = $standard->name if defined $standard;
5617
5618 # Warn if there already is a non-equivalent default map for this
5619 # property. Note that a default map can be a ref, which means that
5620 # what it actually means is delayed until later in the program, and it
5621 # IS permissible to override it here without a message.
5622 my $default_map = $default_map{$addr};
5623 if (defined $default_map
5624 && ! ref($default_map)
5625 && $default_map ne $map
5626 && main::Standardize($map) ne $default_map)
5627 {
5628 my $property = $self->property;
5629 my $map_table = $property->table($map);
5630 my $default_table = $property->table($default_map);
5631 if (defined $map_table
5632 && defined $default_table
5633 && $map_table != $default_table)
5634 {
5635 Carp::my_carp("Changing the default mapping for "
5636 . $property
5637 . " from $default_map to $map'");
5638 }
5639 }
5640
5641 $default_map{$addr} = $map;
5642
5643 # Don't also create any missing table for this map at this point,
5644 # because if we did, it could get done before the main table add is
5645 # done for PropValueAliases.txt; instead the caller will have to make
5646 # sure it exists, if desired.
5647 return;
5648 }
5649
5650 sub to_output_map {
5651 # Returns boolean: should we write this map table?
5652
5653 my $self = shift;
5654 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5655
ffe43484 5656 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5657
5658 # If overridden, use that
5659 return $to_output_map{$addr} if defined $to_output_map{$addr};
5660
5661 my $full_name = $self->full_name;
fcf1973c
KW
5662 return $global_to_output_map{$full_name}
5663 if defined $global_to_output_map{$full_name};
99870f4d 5664
20863809 5665 # If table says to output, do so; if says to suppress it, do so.
8572ace0 5666 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
99870f4d
KW
5667 return 0 if $self->status eq $SUPPRESSED;
5668
5669 my $type = $self->property->type;
5670
5671 # Don't want to output binary map tables even for debugging.
5672 return 0 if $type == $BINARY;
5673
5674 # But do want to output string ones.
8572ace0 5675 return $EXTERNAL_MAP if $type == $STRING;
99870f4d 5676
8572ace0
KW
5677 # Otherwise is an $ENUM, do output it, for Perl's purposes
5678 return $INTERNAL_MAP;
99870f4d
KW
5679 }
5680
5681 sub inverse_list {
5682 # Returns a Range_List that is gaps of the current table. That is,
5683 # the inversion
5684
5685 my $self = shift;
5686 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5687
5688 my $current = Range_List->new(Initialize => $self->_range_list,
5689 Owner => $self->property);
5690 return ~ $current;
5691 }
5692
8572ace0
KW
5693 sub header {
5694 my $self = shift;
5695 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5696
5697 my $return = $self->SUPER::header();
5698
5699 $return .= $INTERNAL_ONLY if $self->to_output_map == $INTERNAL_MAP;
5700 return $return;
5701 }
5702
99870f4d
KW
5703 sub set_final_comment {
5704 # Just before output, create the comment that heads the file
5705 # containing this table.
5706
bd9ebcfd
KW
5707 return unless $debugging_build;
5708
99870f4d
KW
5709 my $self = shift;
5710 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5711
5712 # No sense generating a comment if aren't going to write it out.
5713 return if ! $self->to_output_map;
5714
ffe43484 5715 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5716
5717 my $property = $self->property;
5718
5719 # Get all the possible names for this property. Don't use any that
5720 # aren't ok for use in a file name, etc. This is perhaps causing that
5721 # flag to do double duty, and may have to be changed in the future to
5722 # have our own flag for just this purpose; but it works now to exclude
5723 # Perl generated synonyms from the lists for properties, where the
5724 # name is always the proper Unicode one.
5725 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5726
5727 my $count = $self->count;
5728 my $default_map = $default_map{$addr};
5729
5730 # The ranges that map to the default aren't output, so subtract that
5731 # to get those actually output. A property with matching tables
5732 # already has the information calculated.
5733 if ($property->type != $STRING) {
5734 $count -= $property->table($default_map)->count;
5735 }
5736 elsif (defined $default_map) {
5737
5738 # But for $STRING properties, must calculate now. Subtract the
5739 # count from each range that maps to the default.
5740 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5741 if ($range->value eq $default_map) {
5742 $count -= $range->end +1 - $range->start;
5743 }
5744 }
5745
5746 }
5747
5748 # Get a string version of $count with underscores in large numbers,
5749 # for clarity.
5750 my $string_count = main::clarify_number($count);
5751
5752 my $code_points = ($count == 1)
5753 ? 'single code point'
5754 : "$string_count code points";
5755
5756 my $mapping;
5757 my $these_mappings;
5758 my $are;
5759 if (@property_aliases <= 1) {
5760 $mapping = 'mapping';
5761 $these_mappings = 'this mapping';
5762 $are = 'is'
5763 }
5764 else {
5765 $mapping = 'synonymous mappings';
5766 $these_mappings = 'these mappings';
5767 $are = 'are'
5768 }
5769 my $cp;
5770 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5771 $cp = "any code point in Unicode Version $string_version";
5772 }
5773 else {
5774 my $map_to;
5775 if ($default_map eq "") {
5776 $map_to = 'the null string';
5777 }
5778 elsif ($default_map eq $CODE_POINT) {
5779 $map_to = "itself";
5780 }
5781 else {
5782 $map_to = "'$default_map'";
5783 }
5784 if ($count == 1) {
5785 $cp = "the single code point";
5786 }
5787 else {
5788 $cp = "one of the $code_points";
5789 }
5790 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5791 }
5792
5793 my $comment = "";
5794
5795 my $status = $self->status;
5796 if ($status) {
5797 my $warn = uc $status_past_participles{$status};
5798 $comment .= <<END;
5799
5800!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5801 All property or property=value combinations contained in this file are $warn.
5802 See $unicode_reference_url for what this means.
5803
5804END
5805 }
5806 $comment .= "This file returns the $mapping:\n";
5807
5808 for my $i (0 .. @property_aliases - 1) {
5809 $comment .= sprintf("%-8s%s\n",
5810 " ",
5811 $property_aliases[$i]->name . '(cp)'
5812 );
5813 }
5814 $comment .=
5815 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5816
5817 my $access = $core_access{$addr};
5818 if ($access) {
5819 $comment .= "accessible through the Perl core via $access.";
5820 }
5821 else {
5822 $comment .= "not accessible through the Perl core directly.";
5823 }
5824
5825 # And append any commentary already set from the actual property.
5826 $comment .= "\n\n" . $self->comment if $self->comment;
5827 if ($self->description) {
5828 $comment .= "\n\n" . join " ", $self->description;
5829 }
5830 if ($self->note) {
5831 $comment .= "\n\n" . join " ", $self->note;
5832 }
5833 $comment .= "\n";
5834
5835 if (! $self->perl_extension) {
5836 $comment .= <<END;
5837
5838For information about what this property really means, see:
5839$unicode_reference_url
5840END
5841 }
5842
5843 if ($count) { # Format differs for empty table
5844 $comment.= "\nThe format of the ";
5845 if ($self->range_size_1) {
5846 $comment.= <<END;
5847main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5848is in hex; MAPPING is what CODE_POINT maps to.
5849END
5850 }
5851 else {
5852
5853 # There are tables which end up only having one element per
5854 # range, but it is not worth keeping track of for making just
5855 # this comment a little better.
5856 $comment.= <<END;
5857non-comment portions of the main body of lines of this file is:
5858START\\tSTOP\\tMAPPING where START is the starting code point of the
5859range, in hex; STOP is the ending point, or if omitted, the range has just one
5860code point; MAPPING is what each code point between START and STOP maps to.
5861END
0c07e538 5862 if ($self->output_range_counts) {
99870f4d
KW
5863 $comment .= <<END;
5864Numbers in comments in [brackets] indicate how many code points are in the
5865range (omitted when the range is a single code point or if the mapping is to
5866the null string).
5867END
5868 }
5869 }
5870 }
5871 $self->set_comment(main::join_lines($comment));
5872 return;
5873 }
5874
5875 my %swash_keys; # Makes sure don't duplicate swash names.
5876
668b3bfc
KW
5877 # The remaining variables are temporaries used while writing each table,
5878 # to output special ranges.
5879 my $has_hangul_syllables;
5880 my @multi_code_point_maps; # Map is to more than one code point.
5881
5882 # The key is the base name of the code point, and the value is an
5883 # array giving all the ranges that use this base name. Each range
5884 # is actually a hash giving the 'low' and 'high' values of it.
5885 my %names_ending_in_code_point;
8c32d378 5886 my %loose_names_ending_in_code_point;
668b3bfc
KW
5887
5888 # Inverse mapping. The list of ranges that have these kinds of
c12f2655
KW
5889 # names. Each element contains the low, high, and base names in an
5890 # anonymous hash.
668b3bfc
KW
5891 my @code_points_ending_in_code_point;
5892
5893 sub handle_special_range {
5894 # Called in the middle of write when it finds a range it doesn't know
5895 # how to handle.
5896
5897 my $self = shift;
5898 my $range = shift;
5899 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5900
5901 my $addr = do { no overloading; pack 'J', $self; };
5902
5903 my $type = $range->type;
5904
5905 my $low = $range->start;
5906 my $high = $range->end;
5907 my $map = $range->value;
5908
5909 # No need to output the range if it maps to the default.
5910 return if $map eq $default_map{$addr};
5911
5912 # Switch based on the map type...
5913 if ($type == $HANGUL_SYLLABLE) {
5914
5915 # These are entirely algorithmically determinable based on
5916 # some constants furnished by Unicode; for now, just set a
5917 # flag to indicate that have them. After everything is figured
5918 # out, we will output the code that does the algorithm.
5919 $has_hangul_syllables = 1;
5920 }
5921 elsif ($type == $CP_IN_NAME) {
5922
5923 # Code points whose the name ends in their code point are also
5924 # algorithmically determinable, but need information about the map
5925 # to do so. Both the map and its inverse are stored in data
5926 # structures output in the file.
5927 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5928 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5929
8c32d378
KW
5930 my $squeezed = $map =~ s/[-\s]+//gr;
5931 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, $low;
5932 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, $high;
5933
668b3bfc 5934 push @code_points_ending_in_code_point, { low => $low,
c12f2655
KW
5935 high => $high,
5936 name => $map
668b3bfc
KW
5937 };
5938 }
5939 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5940
5941 # Multi-code point maps and null string maps have an entry
5942 # for each code point in the range. They use the same
5943 # output format.
5944 for my $code_point ($low .. $high) {
5945
c12f2655
KW
5946 # The pack() below can't cope with surrogates. XXX This may
5947 # no longer be true
668b3bfc 5948 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
98dc9551 5949 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
668b3bfc
KW
5950 next;
5951 }
5952
5953 # Generate the hash entries for these in the form that
5954 # utf8.c understands.
5955 my $tostr = "";
5956 my $to_name = "";
5957 my $to_chr = "";
5958 foreach my $to (split " ", $map) {
5959 if ($to !~ /^$code_point_re$/) {
5960 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5961 next;
5962 }
5963 $tostr .= sprintf "\\x{%s}", $to;
5964 $to = CORE::hex $to;
558712cf 5965 if ($annotate) {
c4019d52
KW
5966 $to_name .= " + " if $to_name;
5967 $to_chr .= chr($to);
5968 main::populate_char_info($to)
5969 if ! defined $viacode[$to];
5970 $to_name .= $viacode[$to];
5971 }
668b3bfc
KW
5972 }
5973
5974 # I (khw) have never waded through this line to
5975 # understand it well enough to comment it.
5976 my $utf8 = sprintf(qq["%s" => "$tostr",],
5977 join("", map { sprintf "\\x%02X", $_ }
5978 unpack("U0C*", pack("U", $code_point))));
5979
5980 # Add a comment so that a human reader can more easily
5981 # see what's going on.
5982 push @multi_code_point_maps,
5983 sprintf("%-45s # U+%04X", $utf8, $code_point);
558712cf 5984 if (! $annotate) {
c4019d52
KW
5985 $multi_code_point_maps[-1] .= " => $map";
5986 }
5987 else {
5988 main::populate_char_info($code_point)
5989 if ! defined $viacode[$code_point];
5990 $multi_code_point_maps[-1] .= " '"
5991 . chr($code_point)
5992 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
5993 }
668b3bfc
KW
5994 }
5995 }
5996 else {
5997 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
5998 }
5999
6000 return;
6001 }
6002
99870f4d
KW
6003 sub pre_body {
6004 # Returns the string that should be output in the file before the main
668b3bfc
KW
6005 # body of this table. It isn't called until the main body is
6006 # calculated, saving a pass. The string includes some hash entries
6007 # identifying the format of the body, and what the single value should
6008 # be for all ranges missing from it. It also includes any code points
6009 # which have map_types that don't go in the main table.
99870f4d
KW
6010
6011 my $self = shift;
6012 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6013
ffe43484 6014 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6015
6016 my $name = $self->property->swash_name;
6017
6018 if (defined $swash_keys{$name}) {
6019 Carp::my_carp(join_lines(<<END
6020Already created a swash name '$name' for $swash_keys{$name}. This means that
6021the same name desired for $self shouldn't be used. Bad News. This must be
6022fixed before production use, but proceeding anyway
6023END
6024 ));
6025 }
6026 $swash_keys{$name} = "$self";
6027
99870f4d 6028 my $pre_body = "";
99870f4d 6029
668b3bfc
KW
6030 # Here we assume we were called after have gone through the whole
6031 # file. If we actually generated anything for each map type, add its
6032 # respective header and trailer
ec2f0128 6033 my $specials_name = "";
668b3bfc 6034 if (@multi_code_point_maps) {
ec2f0128 6035 $specials_name = "utf8::ToSpec$name";
668b3bfc 6036 $pre_body .= <<END;
99870f4d
KW
6037
6038# Some code points require special handling because their mappings are each to
6039# multiple code points. These do not appear in the main body, but are defined
6040# in the hash below.
6041
76591e2b
KW
6042# Each key is the string of N bytes that together make up the UTF-8 encoding
6043# for the code point. (i.e. the same as looking at the code point's UTF-8
6044# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
ec2f0128 6045\%$specials_name = (
99870f4d 6046END
668b3bfc
KW
6047 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6048 }
99870f4d 6049
668b3bfc
KW
6050 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
6051
6052 # Convert these structures to output format.
6053 my $code_points_ending_in_code_point =
6054 main::simple_dumper(\@code_points_ending_in_code_point,
6055 ' ' x 8);
6056 my $names = main::simple_dumper(\%names_ending_in_code_point,
6057 ' ' x 8);
8c32d378
KW
6058 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
6059 ' ' x 8);
668b3bfc
KW
6060
6061 # Do the same with the Hangul names,
6062 my $jamo;
6063 my $jamo_l;
6064 my $jamo_v;
6065 my $jamo_t;
6066 my $jamo_re;
6067 if ($has_hangul_syllables) {
6068
6069 # Construct a regular expression of all the possible
6070 # combinations of the Hangul syllables.
6071 my @L_re; # Leading consonants
6072 for my $i ($LBase .. $LBase + $LCount - 1) {
6073 push @L_re, $Jamo{$i}
6074 }
6075 my @V_re; # Middle vowels
6076 for my $i ($VBase .. $VBase + $VCount - 1) {
6077 push @V_re, $Jamo{$i}
6078 }
6079 my @T_re; # Trailing consonants
6080 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
6081 push @T_re, $Jamo{$i}
99870f4d
KW
6082 }
6083
668b3bfc
KW
6084 # The whole re is made up of the L V T combination.
6085 $jamo_re = '('
6086 . join ('|', sort @L_re)
6087 . ')('
6088 . join ('|', sort @V_re)
6089 . ')('
6090 . join ('|', sort @T_re)
6091 . ')?';
6092
6093 # These hashes needed by the algorithm were generated
6094 # during reading of the Jamo.txt file
6095 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
6096 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
6097 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
6098 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
6099 }
6100
6101 $pre_body .= <<END;
99870f4d
KW
6102
6103# To achieve significant memory savings when this file is read in,
6104# algorithmically derivable code points are omitted from the main body below.
6105# Instead, the following routines can be used to translate between name and
6106# code point and vice versa
6107
6108{ # Closure
6109
6110 # Matches legal code point. 4-6 hex numbers, If there are 6, the
6111 # first two must be '10'; if there are 5, the first must not be a '0'.
8c32d378
KW
6112 # First can match at the end of a word provided that the end of the
6113 # word doesn't look like a hex number.
6114 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
99870f4d
KW
6115 my \$code_point_re = qr/$code_point_re/;
6116
6117 # In the following hash, the keys are the bases of names which includes
6118 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
6119 # of each key is another hash which is used to get the low and high ends
8c32d378 6120 # for each range of code points that apply to the name.
99870f4d
KW
6121 my %names_ending_in_code_point = (
6122$names
6123 );
6124
8c32d378
KW
6125 # The following hash is a copy of the previous one, except is for loose
6126 # matching, so each name has blanks and dashes squeezed out
6127 my %loose_names_ending_in_code_point = (
6128$loose_names
6129 );
6130
99870f4d
KW
6131 # And the following array gives the inverse mapping from code points to
6132 # names. Lowest code points are first
6133 my \@code_points_ending_in_code_point = (
6134$code_points_ending_in_code_point
6135 );
6136END
668b3bfc
KW
6137 # Earlier releases didn't have Jamos. No sense outputting
6138 # them unless will be used.
6139 if ($has_hangul_syllables) {
6140 $pre_body .= <<END;
99870f4d
KW
6141
6142 # Convert from code point to Jamo short name for use in composing Hangul
6143 # syllable names
6144 my %Jamo = (
6145$jamo
6146 );
6147
6148 # Leading consonant (can be null)
6149 my %Jamo_L = (
6150$jamo_l
6151 );
6152
6153 # Vowel
6154 my %Jamo_V = (
6155$jamo_v
6156 );
6157
6158 # Optional trailing consonant
6159 my %Jamo_T = (
6160$jamo_t
6161 );
6162
6163 # Computed re that splits up a Hangul name into LVT or LV syllables
6164 my \$syllable_re = qr/$jamo_re/;
6165
6166 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
8c32d378 6167 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
99870f4d
KW
6168
6169 # These constants names and values were taken from the Unicode standard,
6170 # version 5.1, section 3.12. They are used in conjunction with Hangul
6171 # syllables
6e5a209b
KW
6172 my \$SBase = $SBase_string;
6173 my \$LBase = $LBase_string;
6174 my \$VBase = $VBase_string;
6175 my \$TBase = $TBase_string;
6176 my \$SCount = $SCount;
6177 my \$LCount = $LCount;
6178 my \$VCount = $VCount;
6179 my \$TCount = $TCount;
99870f4d
KW
6180 my \$NCount = \$VCount * \$TCount;
6181END
668b3bfc 6182 } # End of has Jamos
99870f4d 6183
668b3bfc 6184 $pre_body .= << 'END';
99870f4d
KW
6185
6186 sub name_to_code_point_special {
8c32d378 6187 my ($name, $loose) = @_;
99870f4d
KW
6188
6189 # Returns undef if not one of the specially handled names; otherwise
6190 # returns the code point equivalent to the input name
8c32d378
KW
6191 # $loose is non-zero if to use loose matching, 'name' in that case
6192 # must be input as upper case with all blanks and dashes squeezed out.
99870f4d 6193END
668b3bfc
KW
6194 if ($has_hangul_syllables) {
6195 $pre_body .= << 'END';
99870f4d 6196
8c32d378
KW
6197 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
6198 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
6199 {
99870f4d
KW
6200 return if $name !~ qr/^$syllable_re$/;
6201 my $L = $Jamo_L{$1};
6202 my $V = $Jamo_V{$2};
6203 my $T = (defined $3) ? $Jamo_T{$3} : 0;
6204 return ($L * $VCount + $V) * $TCount + $T + $SBase;
6205 }
6206END
668b3bfc
KW
6207 }
6208 $pre_body .= << 'END';
99870f4d 6209
8c32d378
KW
6210 # Name must end in 'code_point' for this to handle.
6211 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
6212 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
99870f4d
KW
6213
6214 my $base = $1;
6215 my $code_point = CORE::hex $2;
8c32d378
KW
6216 my $names_ref;
6217
6218 if ($loose) {
6219 $names_ref = \%loose_names_ending_in_code_point;
6220 }
6221 else {
6222 return if $base !~ s/-$//;
6223 $names_ref = \%names_ending_in_code_point;
6224 }
99870f4d
KW
6225
6226 # Name must be one of the ones which has the code point in it.
8c32d378 6227 return if ! $names_ref->{$base};
99870f4d
KW
6228
6229 # Look through the list of ranges that apply to this name to see if
6230 # the code point is in one of them.
8c32d378
KW
6231 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
6232 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
6233 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
99870f4d
KW
6234
6235 # Here, the code point is in the range.
6236 return $code_point;
6237 }
6238
6239 # Here, looked like the name had a code point number in it, but
6240 # did not match one of the valid ones.
6241 return;
6242 }
6243
6244 sub code_point_to_name_special {
6245 my $code_point = shift;
6246
6247 # Returns the name of a code point if algorithmically determinable;
6248 # undef if not
6249END
668b3bfc
KW
6250 if ($has_hangul_syllables) {
6251 $pre_body .= << 'END';
99870f4d
KW
6252
6253 # If in the Hangul range, calculate the name based on Unicode's
6254 # algorithm
6255 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
6256 use integer;
6257 my $SIndex = $code_point - $SBase;
6258 my $L = $LBase + $SIndex / $NCount;
6259 my $V = $VBase + ($SIndex % $NCount) / $TCount;
6260 my $T = $TBase + $SIndex % $TCount;
03e1aa51 6261 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
99870f4d
KW
6262 $name .= $Jamo{$T} if $T != $TBase;
6263 return $name;
6264 }
6265END
668b3bfc
KW
6266 }
6267 $pre_body .= << 'END';
99870f4d
KW
6268
6269 # Look through list of these code points for one in range.
6270 foreach my $hash (@code_points_ending_in_code_point) {
6271 return if $code_point < $hash->{'low'};
6272 if ($code_point <= $hash->{'high'}) {
6273 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
6274 }
6275 }
6276 return; # None found
6277 }
6278} # End closure
6279
6280END
668b3bfc
KW
6281 } # End of has hangul or code point in name maps.
6282
6283 my $format = $self->format;
6284
6285 my $return = <<END;
6286# The name this swash is to be known by, with the format of the mappings in
6287# the main body of the table, and what all code points missing from this file
6288# map to.
6289\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6290END
ec2f0128
KW
6291 if ($specials_name) {
6292 $return .= <<END;
6293\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6294END
6295 }
668b3bfc
KW
6296 my $default_map = $default_map{$addr};
6297 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6298
6299 if ($default_map eq $CODE_POINT) {
6300 $return .= ' # code point maps to itself';
6301 }
6302 elsif ($default_map eq "") {
6303 $return .= ' # code point maps to the null string';
6304 }
6305 $return .= "\n";
6306
6307 $return .= $pre_body;
6308
6309 return $return;
6310 }
6311
6312 sub write {
6313 # Write the table to the file.
6314
6315 my $self = shift;
6316 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6317
6318 my $addr = do { no overloading; pack 'J', $self; };
6319
6320 # Clear the temporaries
6321 $has_hangul_syllables = 0;
6322 undef @multi_code_point_maps;
6323 undef %names_ending_in_code_point;
8c32d378 6324 undef %loose_names_ending_in_code_point;
668b3bfc 6325 undef @code_points_ending_in_code_point;
99870f4d
KW
6326
6327 # Calculate the format of the table if not already done.
f5817e0a 6328 my $format = $self->format;
668b3bfc
KW
6329 my $type = $self->property->type;
6330 my $default_map = $self->default_map;
99870f4d
KW
6331 if (! defined $format) {
6332 if ($type == $BINARY) {
6333
6334 # Don't bother checking the values, because we elsewhere
6335 # verify that a binary table has only 2 values.
6336 $format = $BINARY_FORMAT;
6337 }
6338 else {
6339 my @ranges = $self->_range_list->ranges;
6340
6341 # default an empty table based on its type and default map
6342 if (! @ranges) {
6343
6344 # But it turns out that the only one we can say is a
6345 # non-string (besides binary, handled above) is when the
6346 # table is a string and the default map is to a code point
6347 if ($type == $STRING && $default_map eq $CODE_POINT) {
6348 $format = $HEX_FORMAT;
6349 }
6350 else {
6351 $format = $STRING_FORMAT;
6352 }
6353 }
6354 else {
6355
6356 # Start with the most restrictive format, and as we find
6357 # something that doesn't fit with that, change to the next
6358 # most restrictive, and so on.
6359 $format = $DECIMAL_FORMAT;
6360 foreach my $range (@ranges) {
668b3bfc
KW
6361 next if $range->type != 0; # Non-normal ranges don't
6362 # affect the main body
99870f4d
KW
6363 my $map = $range->value;
6364 if ($map ne $default_map) {
6365 last if $format eq $STRING_FORMAT; # already at
6366 # least
6367 # restrictive
6368 $format = $INTEGER_FORMAT
6369 if $format eq $DECIMAL_FORMAT
6370 && $map !~ / ^ [0-9] $ /x;
6371 $format = $FLOAT_FORMAT
6372 if $format eq $INTEGER_FORMAT
6373 && $map !~ / ^ -? [0-9]+ $ /x;
6374 $format = $RATIONAL_FORMAT
6375 if $format eq $FLOAT_FORMAT
6376 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6377 $format = $HEX_FORMAT
6378 if $format eq $RATIONAL_FORMAT
6379 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6380 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6381 && $map =~ /[^0-9A-F]/;
6382 }
6383 }
6384 }
6385 }
6386 } # end of calculating format
6387
668b3bfc 6388 if ($default_map eq $CODE_POINT
99870f4d 6389 && $format ne $HEX_FORMAT
668b3bfc
KW
6390 && ! defined $self->format) # manual settings are always
6391 # considered ok
99870f4d
KW
6392 {
6393 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6394 }
99870f4d 6395
668b3bfc 6396 $self->_set_format($format);
99870f4d 6397
0911a63d
KW
6398 # Core Perl has a different definition of mapping ranges than we do,
6399 # that is applicable mainly to mapping code points, so for tables
6400 # where it is possible that core Perl could be used to read it,
6401 # make it range size 1 to prevent possible confusion
6402 $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6403
99870f4d
KW
6404 return $self->SUPER::write(
6405 ($self->property == $block)
6406 ? 7 # block file needs more tab stops
6407 : 3,
668b3bfc 6408 $default_map); # don't write defaulteds
99870f4d
KW
6409 }
6410
6411 # Accessors for the underlying list that should fail if locked.
ea25a9b2 6412 for my $sub (qw(
99870f4d 6413 add_duplicate
ea25a9b2 6414 ))
99870f4d
KW
6415 {
6416 no strict "refs";
6417 *$sub = sub {
6418 use strict "refs";
6419 my $self = shift;
6420
6421 return if $self->carp_if_locked;
6422 return $self->_range_list->$sub(@_);
6423 }
6424 }
6425} # End closure for Map_Table
6426
6427package Match_Table;
6428use base '_Base_Table';
6429
6430# A Match table is one which is a list of all the code points that have
6431# the same property and property value, for use in \p{property=value}
6432# constructs in regular expressions. It adds very little data to the base
6433# structure, but many methods, as these lists can be combined in many ways to
6434# form new ones.
6435# There are only a few concepts added:
6436# 1) Equivalents and Relatedness.
6437# Two tables can match the identical code points, but have different names.
6438# This always happens when there is a perl single form extension
6439# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6440# tables are set to be related, with the Perl extension being a child, and
6441# the Unicode property being the parent.
6442#
6443# It may be that two tables match the identical code points and we don't
6444# know if they are related or not. This happens most frequently when the
6445# Block and Script properties have the exact range. But note that a
6446# revision to Unicode could add new code points to the script, which would
6447# now have to be in a different block (as the block was filled, or there
6448# would have been 'Unknown' script code points in it and they wouldn't have
6449# been identical). So we can't rely on any two properties from Unicode
6450# always matching the same code points from release to release, and thus
6451# these tables are considered coincidentally equivalent--not related. When
6452# two tables are unrelated but equivalent, one is arbitrarily chosen as the
6453# 'leader', and the others are 'equivalents'. This concept is useful
6454# to minimize the number of tables written out. Only one file is used for
6455# any identical set of code points, with entries in Heavy.pl mapping all
6456# the involved tables to it.
6457#
6458# Related tables will always be identical; we set them up to be so. Thus
6459# if the Unicode one is deprecated, the Perl one will be too. Not so for
6460# unrelated tables. Relatedness makes generating the documentation easier.
6461#
c12f2655
KW
6462# 2) Complement.
6463# Like equivalents, two tables may be the inverses of each other, the
6464# intersection between them is null, and the union is every Unicode code
6465# point. The two tables that occupy a binary property are necessarily like
6466# this. By specifying one table as the complement of another, we can avoid
6467# storing it on disk (using the other table and performing a fast
6468# transform), and some memory and calculations.
6469#
6470# 3) Conflicting. It may be that there will eventually be name clashes, with
99870f4d
KW
6471# the same name meaning different things. For a while, there actually were
6472# conflicts, but they have so far been resolved by changing Perl's or
6473# Unicode's definitions to match the other, but when this code was written,
6474# it wasn't clear that that was what was going to happen. (Unicode changed
6475# because of protests during their beta period.) Name clashes are warned
6476# about during compilation, and the documentation. The generated tables
6477# are sane, free of name clashes, because the code suppresses the Perl
6478# version. But manual intervention to decide what the actual behavior
6479# should be may be required should this happen. The introductory comments
6480# have more to say about this.
6481
6482sub standardize { return main::standardize($_[0]); }
6483sub trace { return main::trace(@_); }
6484
6485
6486{ # Closure
6487
6488 main::setup_package();
6489
6490 my %leader;
6491 # The leader table of this one; initially $self.
6492 main::set_access('leader', \%leader, 'r');
6493
6494 my %equivalents;
6495 # An array of any tables that have this one as their leader
6496 main::set_access('equivalents', \%equivalents, 'readable_array');
6497
6498 my %parent;
6499 # The parent table to this one, initially $self. This allows us to
c12f2655
KW
6500 # distinguish between equivalent tables that are related (for which this
6501 # is set to), and those which may not be, but share the same output file
6502 # because they match the exact same set of code points in the current
6503 # Unicode release.
99870f4d
KW
6504 main::set_access('parent', \%parent, 'r');
6505
6506 my %children;
6507 # An array of any tables that have this one as their parent
6508 main::set_access('children', \%children, 'readable_array');
6509
6510 my %conflicting;
6511 # Array of any tables that would have the same name as this one with
6512 # a different meaning. This is used for the generated documentation.
6513 main::set_access('conflicting', \%conflicting, 'readable_array');
6514
6515 my %matches_all;
6516 # Set in the constructor for tables that are expected to match all code
6517 # points.
6518 main::set_access('matches_all', \%matches_all, 'r');
6519
a92d5c2e
KW
6520 my %complement;
6521 # Points to the complement that this table is expressed in terms of; 0 if
6522 # none.
6523 main::set_access('complement', \%complement, 'r', 's' );
6524
99870f4d
KW
6525 sub new {
6526 my $class = shift;
6527
6528 my %args = @_;
6529
6530 # The property for which this table is a listing of property values.
6531 my $property = delete $args{'_Property'};
6532
23e33b60
KW
6533 my $name = delete $args{'Name'};
6534 my $full_name = delete $args{'Full_Name'};
6535 $full_name = $name if ! defined $full_name;
6536
99870f4d
KW
6537 # Optional
6538 my $initialize = delete $args{'Initialize'};
6539 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 6540 my $format = delete $args{'Format'};
99870f4d
KW
6541 # Rest of parameters passed on.
6542
6543 my $range_list = Range_List->new(Initialize => $initialize,
6544 Owner => $property);
6545
23e33b60
KW
6546 my $complete = $full_name;
6547 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6548 # but this helps debug if it
6549 # does
6550 # The complete name for a match table includes it's property in a
6551 # compound form 'property=table', except if the property is the
6552 # pseudo-property, perl, in which case it is just the single form,
6553 # 'table' (If you change the '=' must also change the ':' in lots of
6554 # places in this program that assume an equal sign)
6555 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 6556
99870f4d 6557 my $self = $class->SUPER::new(%args,
23e33b60
KW
6558 Name => $name,
6559 Complete_Name => $complete,
6560 Full_Name => $full_name,
99870f4d
KW
6561 _Property => $property,
6562 _Range_List => $range_list,
f5817e0a 6563 Format => $EMPTY_FORMAT,
99870f4d 6564 );
ffe43484 6565 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6566
6567 $conflicting{$addr} = [ ];
6568 $equivalents{$addr} = [ ];
6569 $children{$addr} = [ ];
6570 $matches_all{$addr} = $matches_all;
6571 $leader{$addr} = $self;
6572 $parent{$addr} = $self;
a92d5c2e 6573 $complement{$addr} = 0;
99870f4d 6574
f5817e0a
KW
6575 if (defined $format && $format ne $EMPTY_FORMAT) {
6576 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6577 }
6578
99870f4d
KW
6579 return $self;
6580 }
6581
6582 # See this program's beginning comment block about overloading these.
6583 use overload
6584 fallback => 0,
6585 qw("") => "_operator_stringify",
6586 '=' => sub {
6587 my $self = shift;
6588
6589 return if $self->carp_if_locked;
6590 return $self;
6591 },
6592
6593 '+' => sub {
6594 my $self = shift;
6595 my $other = shift;
6596
6597 return $self->_range_list + $other;
6598 },
6599 '&' => sub {
6600 my $self = shift;
6601 my $other = shift;
6602
6603 return $self->_range_list & $other;
6604 },
6605 '+=' => sub {
6606 my $self = shift;
6607 my $other = shift;
6608
6609 return if $self->carp_if_locked;
6610
ffe43484 6611 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6612
6613 if (ref $other) {
6614
6615 # Change the range list of this table to be the
6616 # union of the two.
6617 $self->_set_range_list($self->_range_list
6618 + $other);
6619 }
6620 else { # $other is just a simple value
6621 $self->add_range($other, $other);
6622 }
6623 return $self;
6624 },
6625 '-' => sub { my $self = shift;
6626 my $other = shift;
6627 my $reversed = shift;
6628
6629 if ($reversed) {
6630 Carp::my_carp_bug("Can't cope with a "
6631 . __PACKAGE__
6632 . " being the first parameter in a '-'. Subtraction ignored.");
6633 return;
6634 }
6635
6636 return $self->_range_list - $other;
6637 },
6638 '~' => sub { my $self = shift;
6639 return ~ $self->_range_list;
6640 },
6641 ;
6642
6643 sub _operator_stringify {
6644 my $self = shift;
6645
23e33b60 6646 my $name = $self->complete_name;
99870f4d
KW
6647 return "Table '$name'";
6648 }
6649
6650 sub add_alias {
6651 # Add a synonym for this table. See the comments in the base class
6652
6653 my $self = shift;
6654 my $name = shift;
6655 # Rest of parameters passed on.
6656
6657 $self->SUPER::add_alias($name, $self, @_);
6658 return;
6659 }
6660
6661 sub add_conflicting {
6662 # Add the name of some other object to the list of ones that name
6663 # clash with this match table.
6664
6665 my $self = shift;
6666 my $conflicting_name = shift; # The name of the conflicting object
6667 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6668 my $conflicting_object = shift; # Optional, the conflicting object
6669 # itself. This is used to
6670 # disambiguate the text if the input
6671 # name is identical to any of the
6672 # aliases $self is known by.
6673 # Sometimes the conflicting object is
6674 # merely hypothetical, so this has to
6675 # be an optional parameter.
6676 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6677
ffe43484 6678 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6679
6680 # Check if the conflicting name is exactly the same as any existing
6681 # alias in this table (as long as there is a real object there to
6682 # disambiguate with).
6683 if (defined $conflicting_object) {
6684 foreach my $alias ($self->aliases) {
6685 if ($alias->name eq $conflicting_name) {
6686
6687 # Here, there is an exact match. This results in
6688 # ambiguous comments, so disambiguate by changing the
6689 # conflicting name to its object's complete equivalent.
6690 $conflicting_name = $conflicting_object->complete_name;
6691 last;
6692 }
6693 }
6694 }
6695
6696 # Convert to the \p{...} final name
6697 $conflicting_name = "\\$p" . "{$conflicting_name}";
6698
6699 # Only add once
6700 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6701
6702 push @{$conflicting{$addr}}, $conflicting_name;
6703
6704 return;
6705 }
6706
6505c6e2 6707 sub is_set_equivalent_to {
99870f4d
KW
6708 # Return boolean of whether or not the other object is a table of this
6709 # type and has been marked equivalent to this one.
6710
6711 my $self = shift;
6712 my $other = shift;
6713 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6714
6715 return 0 if ! defined $other; # Can happen for incomplete early
6716 # releases
6717 unless ($other->isa(__PACKAGE__)) {
6718 my $ref_other = ref $other;
6719 my $ref_self = ref $self;
6505c6e2 6720 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
99870f4d
KW
6721 return 0;
6722 }
6723
6724 # Two tables are equivalent if they have the same leader.
f998e60c 6725 no overloading;
051df77b 6726 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6727 return;
6728 }
6729
99870f4d
KW
6730 sub set_equivalent_to {
6731 # Set $self equivalent to the parameter table.
6732 # The required Related => 'x' parameter is a boolean indicating
6733 # whether these tables are related or not. If related, $other becomes
6734 # the 'parent' of $self; if unrelated it becomes the 'leader'
6735 #
6736 # Related tables share all characteristics except names; equivalents
6737 # not quite so many.
6738 # If they are related, one must be a perl extension. This is because
6739 # we can't guarantee that Unicode won't change one or the other in a
98dc9551 6740 # later release even if they are identical now.
99870f4d
KW
6741
6742 my $self = shift;
6743 my $other = shift;
6744
6745 my %args = @_;
6746 my $related = delete $args{'Related'};
6747
6748 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6749
6750 return if ! defined $other; # Keep on going; happens in some early
6751 # Unicode releases.
6752
6753 if (! defined $related) {
6754 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6755 $related = 0;
6756 }
6757
6758 # If already are equivalent, no need to re-do it; if subroutine
6759 # returns null, it found an error, also do nothing
6505c6e2 6760 my $are_equivalent = $self->is_set_equivalent_to($other);
99870f4d
KW
6761 return if ! defined $are_equivalent || $are_equivalent;
6762
ffe43484 6763 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6764 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d 6765
45e32b91
KW
6766 if ($related) {
6767 if ($current_leader->perl_extension) {
6768 if ($other->perl_extension) {
6769 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6770 return;
6771 }
7610e9e2
KW
6772 } elsif ($self->property != $other->property # Depending on
6773 # situation, might
6774 # be better to use
6775 # add_alias()
6776 # instead for same
6777 # property
6778 && ! $other->perl_extension)
6779 {
45e32b91
KW
6780 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6781 $related = 0;
6782 }
6783 }
6784
6785 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6786 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
6787 return;
99870f4d
KW
6788 }
6789
ffe43484
NC
6790 my $leader = do { no overloading; pack 'J', $current_leader; };
6791 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6792
6793 # Any tables that are equivalent to or children of this table must now
6794 # instead be equivalent to or (children) to the new leader (parent),
6795 # still equivalent. The equivalency includes their matches_all info,
6796 # and for related tables, their status
6797 # All related tables are of necessity equivalent, but the converse
6798 # isn't necessarily true
6799 my $status = $other->status;
6800 my $status_info = $other->status_info;
6801 my $matches_all = $matches_all{other_addr};
d867ccfb 6802 my $caseless_equivalent = $other->caseless_equivalent;
99870f4d
KW
6803 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6804 next if $table == $other;
6805 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6806
ffe43484 6807 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6808 $leader{$table_addr} = $other;
6809 $matches_all{$table_addr} = $matches_all;
6810 $self->_set_range_list($other->_range_list);
6811 push @{$equivalents{$other_addr}}, $table;
6812 if ($related) {
6813 $parent{$table_addr} = $other;
6814 push @{$children{$other_addr}}, $table;
6815 $table->set_status($status, $status_info);
d867ccfb 6816 $self->set_caseless_equivalent($caseless_equivalent);
99870f4d
KW
6817 }
6818 }
6819
6820 # Now that we've declared these to be equivalent, any changes to one
6821 # of the tables would invalidate that equivalency.
6822 $self->lock;
6823 $other->lock;
6824 return;
6825 }
6826
6827 sub add_range { # Add a range to the list for this table.
6828 my $self = shift;
6829 # Rest of parameters passed on
6830
6831 return if $self->carp_if_locked;
6832 return $self->_range_list->add_range(@_);
6833 }
6834
99870f4d
KW
6835 sub pre_body { # Does nothing for match tables.
6836 return
6837 }
6838
6839 sub append_to_body { # Does nothing for match tables.
6840 return
6841 }
6842
6843 sub write {
6844 my $self = shift;
6845 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6846
6847 return $self->SUPER::write(2); # 2 tab stops
6848 }
6849
6850 sub set_final_comment {
6851 # This creates a comment for the file that is to hold the match table
6852 # $self. It is somewhat convoluted to make the English read nicely,
6853 # but, heh, it's just a comment.
6854 # This should be called only with the leader match table of all the
6855 # ones that share the same file. It lists all such tables, ordered so
6856 # that related ones are together.
6857
bd9ebcfd
KW
6858 return unless $debugging_build;
6859
99870f4d
KW
6860 my $leader = shift; # Should only be called on the leader table of
6861 # an equivalent group
6862 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6863
ffe43484 6864 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6865
6866 if ($leader{$addr} != $leader) {
6867 Carp::my_carp_bug(<<END
6868set_final_comment() must be called on a leader table, which $leader is not.
6869It is equivalent to $leader{$addr}. No comment created
6870END
6871 );
6872 return;
6873 }
6874
6875 # Get the number of code points matched by each of the tables in this
6876 # file, and add underscores for clarity.
6877 my $count = $leader->count;
6878 my $string_count = main::clarify_number($count);
6879
6880 my $loose_count = 0; # how many aliases loosely matched
6881 my $compound_name = ""; # ? Are any names compound?, and if so, an
6882 # example
6883 my $properties_with_compound_names = 0; # count of these
6884
6885
6886 my %flags; # The status flags used in the file
6887 my $total_entries = 0; # number of entries written in the comment
6888 my $matches_comment = ""; # The portion of the comment about the
6889 # \p{}'s
6890 my @global_comments; # List of all the tables' comments that are
6891 # there before this routine was called.
6892
6893 # Get list of all the parent tables that are equivalent to this one
6894 # (including itself).
6895 my @parents = grep { $parent{main::objaddr $_} == $_ }
6896 main::uniques($leader, @{$equivalents{$addr}});
6897 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6898 # tables
6899
6900 for my $parent (@parents) {
6901
6902 my $property = $parent->property;
6903
6904 # Special case 'N' tables in properties with two match tables when
6905 # the other is a 'Y' one. These are likely to be binary tables,
6906 # but not necessarily. In either case, \P{} will match the
6907 # complement of \p{}, and so if something is a synonym of \p, the
6908 # complement of that something will be the synonym of \P. This
6909 # would be true of any property with just two match tables, not
6910 # just those whose values are Y and N; but that would require a
6911 # little extra work, and there are none such so far in Unicode.
6912 my $perl_p = 'p'; # which is it? \p{} or \P{}
6913 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6914
6915 if (scalar $property->tables == 2
6916 && $parent == $property->table('N')
6917 && defined (my $yes = $property->table('Y')))
6918 {
ffe43484 6919 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6920 @yes_perl_synonyms
6921 = grep { $_->property == $perl }
6922 main::uniques($yes,
6923 $parent{$yes_addr},
6924 $parent{$yes_addr}->children);
6925
6926 # But these synonyms are \P{} ,not \p{}
6927 $perl_p = 'P';
6928 }
6929
6930 my @description; # Will hold the table description
6931 my @note; # Will hold the table notes.
6932 my @conflicting; # Will hold the table conflicts.
6933
6934 # Look at the parent, any yes synonyms, and all the children
ffe43484 6935 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6936 for my $table ($parent,
6937 @yes_perl_synonyms,
f998e60c 6938 @{$children{$parent_addr}})
99870f4d 6939 {
ffe43484 6940 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6941 my $table_property = $table->property;
6942
6943 # Tables are separated by a blank line to create a grouping.
6944 $matches_comment .= "\n" if $matches_comment;
6945
6946 # The table is named based on the property and value
6947 # combination it is for, like script=greek. But there may be
6948 # a number of synonyms for each side, like 'sc' for 'script',
6949 # and 'grek' for 'greek'. Any combination of these is a valid
6950 # name for this table. In this case, there are three more,
6951 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6952 # listing all possible combinations in the comment, we make
6953 # sure that each synonym occurs at least once, and add
6954 # commentary that the other combinations are possible.
da912e1e
KW
6955 # Because regular expressions don't recognize things like
6956 # \p{jsn=}, only look at non-null right-hand-sides
99870f4d 6957 my @property_aliases = $table_property->aliases;
da912e1e 6958 my @table_aliases = grep { $_->name ne "" } $table->aliases;
99870f4d
KW
6959
6960 # The alias lists above are already ordered in the order we
6961 # want to output them. To ensure that each synonym is listed,
da912e1e
KW
6962 # we must use the max of the two numbers. But if there are no
6963 # legal synonyms (nothing in @table_aliases), then we don't
6964 # list anything.
6965 my $listed_combos = (@table_aliases)
6966 ? main::max(scalar @table_aliases,
6967 scalar @property_aliases)
6968 : 0;
99870f4d
KW
6969 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6970
da912e1e 6971
99870f4d
KW
6972 my $property_had_compound_name = 0;
6973
6974 for my $i (0 .. $listed_combos - 1) {
6975 $total_entries++;
6976
6977 # The current alias for the property is the next one on
6978 # the list, or if beyond the end, start over. Similarly
6979 # for the table (\p{prop=table})
6980 my $property_alias = $property_aliases
6981 [$i % @property_aliases]->name;
6982 my $table_alias_object = $table_aliases
6983 [$i % @table_aliases];
6984 my $table_alias = $table_alias_object->name;
6985 my $loose_match = $table_alias_object->loose_match;
6986
6987 if ($table_alias !~ /\D/) { # Clarify large numbers.
6988 $table_alias = main::clarify_number($table_alias)
6989 }
6990
6991 # Add a comment for this alias combination
6992 my $current_match_comment;
6993 if ($table_property == $perl) {
6994 $current_match_comment = "\\$perl_p"
6995 . "{$table_alias}";
6996 }
6997 else {
6998 $current_match_comment
6999 = "\\p{$property_alias=$table_alias}";
7000 $property_had_compound_name = 1;
7001 }
7002
7003 # Flag any abnormal status for this table.
7004 my $flag = $property->status
7005 || $table->status
7006 || $table_alias_object->status;
37e2e78e
KW
7007 if ($flag) {
7008 if ($flag ne $PLACEHOLDER) {
7009 $flags{$flag} = $status_past_participles{$flag};
7010 } else {
7011 $flags{$flag} = <<END;
7012a placeholder because it is not in Version $string_version of Unicode, but is
7013needed by the Perl core to work gracefully. Because it is not in this version
7014of Unicode, it will not be listed in $pod_file.pod
7015END
7016 }
7017 }
99870f4d
KW
7018
7019 $loose_count++;
7020
7021 # Pretty up the comment. Note the \b; it says don't make
7022 # this line a continuation.
7023 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7024 $flag,
7025 " " x 7,
7026 $current_match_comment);
7027 } # End of generating the entries for this table.
7028
7029 # Save these for output after this group of related tables.
7030 push @description, $table->description;
7031 push @note, $table->note;
7032 push @conflicting, $table->conflicting;
7033
37e2e78e
KW
7034 # And this for output after all the tables.
7035 push @global_comments, $table->comment;
7036
99870f4d
KW
7037 # Compute an alternate compound name using the final property
7038 # synonym and the first table synonym with a colon instead of
7039 # the equal sign used elsewhere.
7040 if ($property_had_compound_name) {
7041 $properties_with_compound_names ++;
7042 if (! $compound_name || @property_aliases > 1) {
7043 $compound_name = $property_aliases[-1]->name
7044 . ': '
7045 . $table_aliases[0]->name;
7046 }
7047 }
7048 } # End of looping through all children of this table
7049
7050 # Here have assembled in $matches_comment all the related tables
7051 # to the current parent (preceded by the same info for all the
7052 # previous parents). Put out information that applies to all of
7053 # the current family.
7054 if (@conflicting) {
7055
7056 # But output the conflicting information now, as it applies to
7057 # just this table.
7058 my $conflicting = join ", ", @conflicting;
7059 if ($conflicting) {
7060 $matches_comment .= <<END;
7061
7062 Note that contrary to what you might expect, the above is NOT the same as
7063END
7064 $matches_comment .= "any of: " if @conflicting > 1;
7065 $matches_comment .= "$conflicting\n";
7066 }
7067 }
7068 if (@description) {
7069 $matches_comment .= "\n Meaning: "
7070 . join('; ', @description)
7071 . "\n";
7072 }
7073 if (@note) {
7074 $matches_comment .= "\n Note: "
7075 . join("\n ", @note)
7076 . "\n";
7077 }
7078 } # End of looping through all tables
7079
7080
7081 my $code_points;
7082 my $match;
7083 my $any_of_these;
7084 if ($count == 1) {
7085 $match = 'matches';
7086 $code_points = 'single code point';
7087 }
7088 else {
7089 $match = 'match';
7090 $code_points = "$string_count code points";
7091 }
7092
7093 my $synonyms;
7094 my $entries;
da912e1e 7095 if ($total_entries == 1) {
99870f4d
KW
7096 $synonyms = "";
7097 $entries = 'entry';
7098 $any_of_these = 'this'
7099 }
7100 else {
7101 $synonyms = " any of the following regular expression constructs";
7102 $entries = 'entries';
7103 $any_of_these = 'any of these'
7104 }
7105
7106 my $comment = "";
7107 if ($has_unrelated) {
7108 $comment .= <<END;
7109This file is for tables that are not necessarily related: To conserve
7110resources, every table that matches the identical set of code points in this
7111version of Unicode uses this file. Each one is listed in a separate group
7112below. It could be that the tables will match the same set of code points in
7113other Unicode releases, or it could be purely coincidence that they happen to
7114be the same in Unicode $string_version, and hence may not in other versions.
7115
7116END
7117 }
7118
7119 if (%flags) {
7120 foreach my $flag (sort keys %flags) {
7121 $comment .= <<END;
37e2e78e 7122'$flag' below means that this form is $flags{$flag}.
99870f4d 7123END
37e2e78e
KW
7124 next if $flag eq $PLACEHOLDER;
7125 $comment .= "Consult $pod_file.pod\n";
99870f4d
KW
7126 }
7127 $comment .= "\n";
7128 }
7129
da912e1e
KW
7130 if ($total_entries == 0) {
7131 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7132 $comment .= <<END;
7133This file returns the $code_points in Unicode Version $string_version for
7134$leader, but it is inaccessible through Perl regular expressions, as
7135"\\p{prop=}" is not recognized.
7136END
7137
7138 } else {
7139 $comment .= <<END;
99870f4d
KW
7140This file returns the $code_points in Unicode Version $string_version that
7141$match$synonyms:
7142
7143$matches_comment
37e2e78e 7144$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
7145including if adding or subtracting white space, underscore, and hyphen
7146characters matters or doesn't matter, and other permissible syntactic
7147variants. Upper/lower case distinctions never matter.
7148END
7149
da912e1e 7150 }
99870f4d
KW
7151 if ($compound_name) {
7152 $comment .= <<END;
7153
7154A colon can be substituted for the equals sign, and
7155END
7156 if ($properties_with_compound_names > 1) {
7157 $comment .= <<END;
7158within each group above,
7159END
7160 }
7161 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7162
7163 # Note the \b below, it says don't make that line a continuation.
7164 $comment .= <<END;
7165anything to the left of the equals (or colon) can be combined with anything to
7166the right. Thus, for example,
7167$compound_name
7168\bis also valid.
7169END
7170 }
7171
7172 # And append any comment(s) from the actual tables. They are all
7173 # gathered here, so may not read all that well.
37e2e78e
KW
7174 if (@global_comments) {
7175 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7176 }
99870f4d
KW
7177
7178 if ($count) { # The format differs if no code points, and needs no
7179 # explanation in that case
7180 $comment.= <<END;
7181
7182The format of the lines of this file is:
7183END
7184 $comment.= <<END;
7185START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7186STOP is the ending point, or if omitted, the range has just one code point.
7187END
0c07e538 7188 if ($leader->output_range_counts) {
99870f4d
KW
7189 $comment .= <<END;
7190Numbers in comments in [brackets] indicate how many code points are in the
7191range.
7192END
7193 }
7194 }
7195
7196 $leader->set_comment(main::join_lines($comment));
7197 return;
7198 }
7199
7200 # Accessors for the underlying list
ea25a9b2 7201 for my $sub (qw(
99870f4d
KW
7202 get_valid_code_point
7203 get_invalid_code_point
ea25a9b2 7204 ))
99870f4d
KW
7205 {
7206 no strict "refs";
7207 *$sub = sub {
7208 use strict "refs";
7209 my $self = shift;
7210
7211 return $self->_range_list->$sub(@_);
7212 }
7213 }
7214} # End closure for Match_Table
7215
7216package Property;
7217
7218# The Property class represents a Unicode property, or the $perl
7219# pseudo-property. It contains a map table initialized empty at construction
7220# time, and for properties accessible through regular expressions, various
7221# match tables, created through the add_match_table() method, and referenced
7222# by the table('NAME') or tables() methods, the latter returning a list of all
7223# of the match tables. Otherwise table operations implicitly are for the map
7224# table.
7225#
7226# Most of the data in the property is actually about its map table, so it
7227# mostly just uses that table's accessors for most methods. The two could
7228# have been combined into one object, but for clarity because of their
7229# differing semantics, they have been kept separate. It could be argued that
7230# the 'file' and 'directory' fields should be kept with the map table.
7231#
7232# Each property has a type. This can be set in the constructor, or in the
7233# set_type accessor, but mostly it is figured out by the data. Every property
7234# starts with unknown type, overridden by a parameter to the constructor, or
7235# as match tables are added, or ranges added to the map table, the data is
7236# inspected, and the type changed. After the table is mostly or entirely
7237# filled, compute_type() should be called to finalize they analysis.
7238#
7239# There are very few operations defined. One can safely remove a range from
7240# the map table, and property_add_or_replace_non_nulls() adds the maps from another
7241# table to this one, replacing any in the intersection of the two.
7242
7243sub standardize { return main::standardize($_[0]); }
7244sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7245
7246{ # Closure
7247
7248 # This hash will contain as keys, all the aliases of all properties, and
7249 # as values, pointers to their respective property objects. This allows
7250 # quick look-up of a property from any of its names.
7251 my %alias_to_property_of;
7252
7253 sub dump_alias_to_property_of {
7254 # For debugging
7255
7256 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7257 return;
7258 }
7259
7260 sub property_ref {
7261 # This is a package subroutine, not called as a method.
7262 # If the single parameter is a literal '*' it returns a list of all
7263 # defined properties.
7264 # Otherwise, the single parameter is a name, and it returns a pointer
7265 # to the corresponding property object, or undef if none.
7266 #
7267 # Properties can have several different names. The 'standard' form of
7268 # each of them is stored in %alias_to_property_of as they are defined.
7269 # But it's possible that this subroutine will be called with some
7270 # variant, so if the initial lookup fails, it is repeated with the
98dc9551 7271 # standardized form of the input name. If found, besides returning the
99870f4d
KW
7272 # result, the input name is added to the list so future calls won't
7273 # have to do the conversion again.
7274
7275 my $name = shift;
7276
7277 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7278
7279 if (! defined $name) {
7280 Carp::my_carp_bug("Undefined input property. No action taken.");
7281 return;
7282 }
7283
7284 return main::uniques(values %alias_to_property_of) if $name eq '*';
7285
7286 # Return cached result if have it.
7287 my $result = $alias_to_property_of{$name};
7288 return $result if defined $result;
7289
7290 # Convert the input to standard form.
7291 my $standard_name = standardize($name);
7292
7293 $result = $alias_to_property_of{$standard_name};
7294 return unless defined $result; # Don't cache undefs
7295
7296 # Cache the result before returning it.
7297 $alias_to_property_of{$name} = $result;
7298 return $result;
7299 }
7300
7301
7302 main::setup_package();
7303
7304 my %map;
7305 # A pointer to the map table object for this property
7306 main::set_access('map', \%map);
7307
7308 my %full_name;
7309 # The property's full name. This is a duplicate of the copy kept in the
7310 # map table, but is needed because stringify needs it during
7311 # construction of the map table, and then would have a chicken before egg
7312 # problem.
7313 main::set_access('full_name', \%full_name, 'r');
7314
7315 my %table_ref;
7316 # This hash will contain as keys, all the aliases of any match tables
7317 # attached to this property, and as values, the pointers to their
7318 # respective tables. This allows quick look-up of a table from any of its
7319 # names.
7320 main::set_access('table_ref', \%table_ref);
7321
7322 my %type;
7323 # The type of the property, $ENUM, $BINARY, etc
7324 main::set_access('type', \%type, 'r');
7325
7326 my %file;
7327 # The filename where the map table will go (if actually written).
7328 # Normally defaulted, but can be overridden.
7329 main::set_access('file', \%file, 'r', 's');
7330
7331 my %directory;
7332 # The directory where the map table will go (if actually written).
7333 # Normally defaulted, but can be overridden.
7334 main::set_access('directory', \%directory, 's');
7335
7336 my %pseudo_map_type;
7337 # This is used to affect the calculation of the map types for all the
7338 # ranges in the table. It should be set to one of the values that signify
7339 # to alter the calculation.
7340 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7341
7342 my %has_only_code_point_maps;
7343 # A boolean used to help in computing the type of data in the map table.
7344 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7345
7346 my %unique_maps;
7347 # A list of the first few distinct mappings this property has. This is
7348 # used to disambiguate between binary and enum property types, so don't
7349 # have to keep more than three.
7350 main::set_access('unique_maps', \%unique_maps);
7351
56557540
KW
7352 my %pre_declared_maps;
7353 # A boolean that gives whether the input data should declare all the
7354 # tables used, or not. If the former, unknown ones raise a warning.
7355 main::set_access('pre_declared_maps',
047274f2 7356 \%pre_declared_maps, 'r', 's');
56557540 7357
99870f4d
KW
7358 sub new {
7359 # The only required parameter is the positionally first, name. All
7360 # other parameters are key => value pairs. See the documentation just
7361 # above for the meanings of the ones not passed directly on to the map
7362 # table constructor.
7363
7364 my $class = shift;
7365 my $name = shift || "";
7366
7367 my $self = property_ref($name);
7368 if (defined $self) {
7369 my $options_string = join ", ", @_;
7370 $options_string = ". Ignoring options $options_string" if $options_string;
7371 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7372 return $self;
7373 }
7374
7375 my %args = @_;
7376
7377 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 7378 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7379
7380 $directory{$addr} = delete $args{'Directory'};
7381 $file{$addr} = delete $args{'File'};
7382 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7383 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7384 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
56557540
KW
7385 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7386 # Starting in this release, property
7387 # values should be defined for all
7388 # properties, except those overriding this
7389 // $v_version ge v5.1.0;
c12f2655 7390
99870f4d
KW
7391 # Rest of parameters passed on.
7392
7393 $has_only_code_point_maps{$addr} = 1;
7394 $table_ref{$addr} = { };
7395 $unique_maps{$addr} = { };
7396
7397 $map{$addr} = Map_Table->new($name,
7398 Full_Name => $full_name{$addr},
7399 _Alias_Hash => \%alias_to_property_of,
7400 _Property => $self,
7401 %args);
7402 return $self;
7403 }
7404
7405 # See this program's beginning comment block about overloading the copy
7406 # constructor. Few operations are defined on properties, but a couple are
7407 # useful. It is safe to take the inverse of a property, and to remove a
7408 # single code point from it.
7409 use overload
7410 fallback => 0,
7411 qw("") => "_operator_stringify",
7412 "." => \&main::_operator_dot,
7413 '==' => \&main::_operator_equal,
7414 '!=' => \&main::_operator_not_equal,
7415 '=' => sub { return shift },
7416 '-=' => "_minus_and_equal",
7417 ;
7418
7419 sub _operator_stringify {
7420 return "Property '" . shift->full_name . "'";
7421 }
7422
7423 sub _minus_and_equal {
7424 # Remove a single code point from the map table of a property.
7425
7426 my $self = shift;
7427 my $other = shift;
7428 my $reversed = shift;
7429 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7430
7431 if (ref $other) {
7432 Carp::my_carp_bug("Can't cope with a "
7433 . ref($other)
7434 . " argument to '-='. Subtraction ignored.");
7435 return $self;
7436 }
98dc9551 7437 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
99870f4d
KW
7438 Carp::my_carp_bug("Can't cope with a "
7439 . __PACKAGE__
7440 . " being the first parameter in a '-='. Subtraction ignored.");
7441 return $self;
7442 }
7443 else {
f998e60c 7444 no overloading;
051df77b 7445 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
7446 }
7447 return $self;
7448 }
7449
7450 sub add_match_table {
7451 # Add a new match table for this property, with name given by the
7452 # parameter. It returns a pointer to the table.
7453
7454 my $self = shift;
7455 my $name = shift;
7456 my %args = @_;
7457
ffe43484 7458 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7459
7460 my $table = $table_ref{$addr}{$name};
7461 my $standard_name = main::standardize($name);
7462 if (defined $table
7463 || (defined ($table = $table_ref{$addr}{$standard_name})))
7464 {
7465 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7466 $table_ref{$addr}{$name} = $table;
7467 return $table;
7468 }
7469 else {
7470
7471 # See if this is a perl extension, if not passed in.
7472 my $perl_extension = delete $args{'Perl_Extension'};
7473 $perl_extension
7474 = $self->perl_extension if ! defined $perl_extension;
7475
7476 $table = Match_Table->new(
7477 Name => $name,
7478 Perl_Extension => $perl_extension,
7479 _Alias_Hash => $table_ref{$addr},
7480 _Property => $self,
7481
7482 # gets property's status by default
7483 Status => $self->status,
7484 _Status_Info => $self->status_info,
7485 %args,
7486 Internal_Only_Warning => 1); # Override any
7487 # input param
7488 return unless defined $table;
7489 }
7490
7491 # Save the names for quick look up
7492 $table_ref{$addr}{$standard_name} = $table;
7493 $table_ref{$addr}{$name} = $table;
7494
7495 # Perhaps we can figure out the type of this property based on the
7496 # fact of adding this match table. First, string properties don't
7497 # have match tables; second, a binary property can't have 3 match
7498 # tables
7499 if ($type{$addr} == $UNKNOWN) {
7500 $type{$addr} = $NON_STRING;
7501 }
7502 elsif ($type{$addr} == $STRING) {
7503 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7504 $type{$addr} = $NON_STRING;
7505 }
7506 elsif ($type{$addr} != $ENUM) {
7507 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7508 && $type{$addr} == $BINARY)
7509 {
7510 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7511 $type{$addr} = $ENUM;
7512 }
7513 }
7514
7515 return $table;
7516 }
7517
4b9b0bc5
KW
7518 sub delete_match_table {
7519 # Delete the table referred to by $2 from the property $1.
7520
7521 my $self = shift;
7522 my $table_to_remove = shift;
7523 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7524
7525 my $addr = do { no overloading; pack 'J', $self; };
7526
7527 # Remove all names that refer to it.
7528 foreach my $key (keys %{$table_ref{$addr}}) {
7529 delete $table_ref{$addr}{$key}
7530 if $table_ref{$addr}{$key} == $table_to_remove;
7531 }
7532
7533 $table_to_remove->DESTROY;
7534 return;
7535 }
7536
99870f4d
KW
7537 sub table {
7538 # Return a pointer to the match table (with name given by the
7539 # parameter) associated with this property; undef if none.
7540
7541 my $self = shift;
7542 my $name = shift;
7543 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7544
ffe43484 7545 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7546
7547 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7548
7549 # If quick look-up failed, try again using the standard form of the
7550 # input name. If that succeeds, cache the result before returning so
7551 # won't have to standardize this input name again.
7552 my $standard_name = main::standardize($name);
7553 return unless defined $table_ref{$addr}{$standard_name};
7554
7555 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7556 return $table_ref{$addr}{$name};
7557 }
7558
7559 sub tables {
7560 # Return a list of pointers to all the match tables attached to this
7561 # property
7562
f998e60c 7563 no overloading;
051df77b 7564 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
7565 }
7566
7567 sub directory {
7568 # Returns the directory the map table for this property should be
7569 # output in. If a specific directory has been specified, that has
7570 # priority; 'undef' is returned if the type isn't defined;
7571 # or $map_directory for everything else.
7572
ffe43484 7573 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
7574
7575 return $directory{$addr} if defined $directory{$addr};
7576 return undef if $type{$addr} == $UNKNOWN;
7577 return $map_directory;
7578 }
7579
7580 sub swash_name {
7581 # Return the name that is used to both:
7582 # 1) Name the file that the map table is written to.
7583 # 2) The name of swash related stuff inside that file.
7584 # The reason for this is that the Perl core historically has used
7585 # certain names that aren't the same as the Unicode property names.
7586 # To continue using these, $file is hard-coded in this file for those,
7587 # but otherwise the standard name is used. This is different from the
7588 # external_name, so that the rest of the files, like in lib can use
7589 # the standard name always, without regard to historical precedent.
7590
7591 my $self = shift;
7592 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7593
ffe43484 7594 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7595
7596 return $file{$addr} if defined $file{$addr};
7597 return $map{$addr}->external_name;
7598 }
7599
7600 sub to_create_match_tables {
7601 # Returns a boolean as to whether or not match tables should be
7602 # created for this property.
7603
7604 my $self = shift;
7605 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7606
7607 # The whole point of this pseudo property is match tables.
7608 return 1 if $self == $perl;
7609
ffe43484 7610 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7611
7612 # Don't generate tables of code points that match the property values
7613 # of a string property. Such a list would most likely have many
7614 # property values, each with just one or very few code points mapping
7615 # to it.
7616 return 0 if $type{$addr} == $STRING;
7617
7618 # Don't generate anything for unimplemented properties.
7619 return 0 if grep { $self->complete_name eq $_ }
7620 @unimplemented_properties;
7621 # Otherwise, do.
7622 return 1;
7623 }
7624
7625 sub property_add_or_replace_non_nulls {
7626 # This adds the mappings in the property $other to $self. Non-null
7627 # mappings from $other override those in $self. It essentially merges
7628 # the two properties, with the second having priority except for null
7629 # mappings.
7630
7631 my $self = shift;
7632 my $other = shift;
7633 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7634
7635 if (! $other->isa(__PACKAGE__)) {
7636 Carp::my_carp_bug("$other should be a "
7637 . __PACKAGE__
7638 . ". Not a '"
7639 . ref($other)
7640 . "'. Not added;");
7641 return;
7642 }
7643
f998e60c 7644 no overloading;
051df77b 7645 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
7646 }
7647
7648 sub set_type {
7649 # Set the type of the property. Mostly this is figured out by the
7650 # data in the table. But this is used to set it explicitly. The
7651 # reason it is not a standard accessor is that when setting a binary
7652 # property, we need to make sure that all the true/false aliases are
7653 # present, as they were omitted in early Unicode releases.
7654
7655 my $self = shift;
7656 my $type = shift;
7657 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7658
7659 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
7660 Carp::my_carp("Unrecognized type '$type'. Type not set");
7661 return;
7662 }
7663
051df77b 7664 { no overloading; $type{pack 'J', $self} = $type; }
99870f4d
KW
7665 return if $type != $BINARY;
7666
7667 my $yes = $self->table('Y');
7668 $yes = $self->table('Yes') if ! defined $yes;
01adf4be
KW
7669 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7670 if ! defined $yes;
7671
7672 # Add aliases in order wanted, duplicates will be ignored. Note, that
7673 # could run into problems in outputting things in that we don't
7674 # distinguish between the name and full name of these. Hopefully, if
7675 # the table was already created before this code is executed, it was
7676 # done with these set properly.
7677 $yes->add_alias('Y');
99870f4d
KW
7678 $yes->add_alias('Yes');
7679 $yes->add_alias('T');
7680 $yes->add_alias('True');
7681
7682 my $no = $self->table('N');
7683 $no = $self->table('No') if ! defined $no;
01adf4be
KW
7684 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
7685 $no->add_alias('N');
99870f4d
KW
7686 $no->add_alias('No');
7687 $no->add_alias('F');
7688 $no->add_alias('False');
c12f2655 7689
99870f4d
KW
7690 return;
7691 }
7692
7693 sub add_map {
7694 # Add a map to the property's map table. This also keeps
7695 # track of the maps so that the property type can be determined from
7696 # its data.
7697
7698 my $self = shift;
7699 my $start = shift; # First code point in range
7700 my $end = shift; # Final code point in range
7701 my $map = shift; # What the range maps to.
7702 # Rest of parameters passed on.
7703
ffe43484 7704 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7705
7706 # If haven't the type of the property, gather information to figure it
7707 # out.
7708 if ($type{$addr} == $UNKNOWN) {
7709
7710 # If the map contains an interior blank or dash, or most other
7711 # nonword characters, it will be a string property. This
7712 # heuristic may actually miss some string properties. If so, they
7713 # may need to have explicit set_types called for them. This
7714 # happens in the Unihan properties.
7715 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7716 || $map =~ / [^\w.\/\ -] /x)
7717 {
7718 $self->set_type($STRING);
7719
7720 # $unique_maps is used for disambiguating between ENUM and
7721 # BINARY later; since we know the property is not going to be
7722 # one of those, no point in keeping the data around
7723 undef $unique_maps{$addr};
7724 }
7725 else {
7726
7727 # Not necessarily a string. The final decision has to be
7728 # deferred until all the data are in. We keep track of if all
7729 # the values are code points for that eventual decision.
7730 $has_only_code_point_maps{$addr} &=
7731 $map =~ / ^ $code_point_re $/x;
7732
7733 # For the purposes of disambiguating between binary and other
7734 # enumerations at the end, we keep track of the first three
7735 # distinct property values. Once we get to three, we know
7736 # it's not going to be binary, so no need to track more.
7737 if (scalar keys %{$unique_maps{$addr}} < 3) {
7738 $unique_maps{$addr}{main::standardize($map)} = 1;
7739 }
7740 }
7741 }
7742
7743 # Add the mapping by calling our map table's method
7744 return $map{$addr}->add_map($start, $end, $map, @_);
7745 }
7746
7747 sub compute_type {
7748 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7749 # should be called after the property is mostly filled with its maps.
7750 # We have been keeping track of what the property values have been,
7751 # and now have the necessary information to figure out the type.
7752
7753 my $self = shift;
7754 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7755
ffe43484 7756 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7757
7758 my $type = $type{$addr};
7759
7760 # If already have figured these out, no need to do so again, but we do
7761 # a double check on ENUMS to make sure that a string property hasn't
7762 # improperly been classified as an ENUM, so continue on with those.
7763 return if $type == $STRING || $type == $BINARY;
7764
7765 # If every map is to a code point, is a string property.
7766 if ($type == $UNKNOWN
7767 && ($has_only_code_point_maps{$addr}
7768 || (defined $map{$addr}->default_map
7769 && $map{$addr}->default_map eq "")))
7770 {
7771 $self->set_type($STRING);
7772 }
7773 else {
7774
7775 # Otherwise, it is to some sort of enumeration. (The case where
7776 # it is a Unicode miscellaneous property, and treated like a
7777 # string in this program is handled in add_map()). Distinguish
7778 # between binary and some other enumeration type. Of course, if
7779 # there are more than two values, it's not binary. But more
7780 # subtle is the test that the default mapping is defined means it
7781 # isn't binary. This in fact may change in the future if Unicode
7782 # changes the way its data is structured. But so far, no binary
7783 # properties ever have @missing lines for them, so the default map
7784 # isn't defined for them. The few properties that are two-valued
7785 # and aren't considered binary have the default map defined
7786 # starting in Unicode 5.0, when the @missing lines appeared; and
7787 # this program has special code to put in a default map for them
7788 # for earlier than 5.0 releases.
7789 if ($type == $ENUM
7790 || scalar keys %{$unique_maps{$addr}} > 2
7791 || defined $self->default_map)
7792 {
7793 my $tables = $self->tables;
7794 my $count = $self->count;
7795 if ($verbosity && $count > 500 && $tables/$count > .1) {
7796 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7797 }
7798 $self->set_type($ENUM);
7799 }
7800 else {
7801 $self->set_type($BINARY);
7802 }
7803 }
7804 undef $unique_maps{$addr}; # Garbage collect
7805 return;
7806 }
7807
7808 # Most of the accessors for a property actually apply to its map table.
7809 # Setup up accessor functions for those, referring to %map
ea25a9b2 7810 for my $sub (qw(
99870f4d
KW
7811 add_alias
7812 add_anomalous_entry
7813 add_comment
7814 add_conflicting
7815 add_description
7816 add_duplicate
7817 add_note
7818 aliases
7819 comment
7820 complete_name
2f7a8815 7821 containing_range
99870f4d
KW
7822 core_access
7823 count
7824 default_map
7825 delete_range
7826 description
7827 each_range
7828 external_name
7829 file_path
7830 format
7831 initialize
7832 inverse_list
7833 is_empty
7834 name
7835 note
7836 perl_extension
7837 property
7838 range_count
7839 ranges
7840 range_size_1
7841 reset_each_range
7842 set_comment
7843 set_core_access
7844 set_default_map
7845 set_file_path
7846 set_final_comment
7847 set_range_size_1
7848 set_status
7849 set_to_output_map
7850 short_name
7851 status
7852 status_info
7853 to_output_map
0a9dbafc 7854 type_of
99870f4d
KW
7855 value_of
7856 write
ea25a9b2 7857 ))
99870f4d
KW
7858 # 'property' above is for symmetry, so that one can take
7859 # the property of a property and get itself, and so don't
7860 # have to distinguish between properties and tables in
7861 # calling code
7862 {
7863 no strict "refs";
7864 *$sub = sub {
7865 use strict "refs";
7866 my $self = shift;
f998e60c 7867 no overloading;
051df77b 7868 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7869 }
7870 }
7871
7872
7873} # End closure
7874
7875package main;
7876
7877sub join_lines($) {
7878 # Returns lines of the input joined together, so that they can be folded
7879 # properly.
7880 # This causes continuation lines to be joined together into one long line
7881 # for folding. A continuation line is any line that doesn't begin with a
7882 # space or "\b" (the latter is stripped from the output). This is so
7883 # lines can be be in a HERE document so as to fit nicely in the terminal
7884 # width, but be joined together in one long line, and then folded with
7885 # indents, '#' prefixes, etc, properly handled.
7886 # A blank separates the joined lines except if there is a break; an extra
7887 # blank is inserted after a period ending a line.
7888
98dc9551 7889 # Initialize the return with the first line.
99870f4d
KW
7890 my ($return, @lines) = split "\n", shift;
7891
7892 # If the first line is null, it was an empty line, add the \n back in
7893 $return = "\n" if $return eq "";
7894
7895 # Now join the remainder of the physical lines.
7896 for my $line (@lines) {
7897
7898 # An empty line means wanted a blank line, so add two \n's to get that
7899 # effect, and go to the next line.
7900 if (length $line == 0) {
7901 $return .= "\n\n";
7902 next;
7903 }
7904
7905 # Look at the last character of what we have so far.
7906 my $previous_char = substr($return, -1, 1);
7907
7908 # And at the next char to be output.
7909 my $next_char = substr($line, 0, 1);
7910
7911 if ($previous_char ne "\n") {
7912
7913 # Here didn't end wth a nl. If the next char a blank or \b, it
7914 # means that here there is a break anyway. So add a nl to the
7915 # output.
7916 if ($next_char eq " " || $next_char eq "\b") {
7917 $previous_char = "\n";
7918 $return .= $previous_char;
7919 }
7920
7921 # Add an extra space after periods.
7922 $return .= " " if $previous_char eq '.';
7923 }
7924
7925 # Here $previous_char is still the latest character to be output. If
7926 # it isn't a nl, it means that the next line is to be a continuation
7927 # line, with a blank inserted between them.
7928 $return .= " " if $previous_char ne "\n";
7929
7930 # Get rid of any \b
7931 substr($line, 0, 1) = "" if $next_char eq "\b";
7932
7933 # And append this next line.
7934 $return .= $line;
7935 }
7936
7937 return $return;
7938}
7939
7940sub simple_fold($;$$$) {
7941 # Returns a string of the input (string or an array of strings) folded
7942 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7943 # a \n
7944 # This is tailored for the kind of text written by this program,
7945 # especially the pod file, which can have very long names with
7946 # underscores in the middle, or words like AbcDefgHij.... We allow
7947 # breaking in the middle of such constructs if the line won't fit
7948 # otherwise. The break in such cases will come either just after an
7949 # underscore, or just before one of the Capital letters.
7950
7951 local $to_trace = 0 if main::DEBUG;
7952
7953 my $line = shift;
7954 my $prefix = shift; # Optional string to prepend to each output
7955 # line
7956 $prefix = "" unless defined $prefix;
7957
7958 my $hanging_indent = shift; # Optional number of spaces to indent
7959 # continuation lines
7960 $hanging_indent = 0 unless $hanging_indent;
7961
7962 my $right_margin = shift; # Optional number of spaces to narrow the
7963 # total width by.
7964 $right_margin = 0 unless defined $right_margin;
7965
7966 # Call carp with the 'nofold' option to avoid it from trying to call us
7967 # recursively
7968 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7969
7970 # The space available doesn't include what's automatically prepended
7971 # to each line, or what's reserved on the right.
7972 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7973 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7974
7975 if (DEBUG && $hanging_indent >= $max) {
7976 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7977 $hanging_indent = 0;
7978 }
7979
7980 # First, split into the current physical lines.
7981 my @line;
7982 if (ref $line) { # Better be an array, because not bothering to
7983 # test
7984 foreach my $line (@{$line}) {
7985 push @line, split /\n/, $line;
7986 }
7987 }
7988 else {
7989 @line = split /\n/, $line;
7990 }
7991
7992 #local $to_trace = 1 if main::DEBUG;
7993 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7994
7995 # Look at each current physical line.
7996 for (my $i = 0; $i < @line; $i++) {
7997 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7998 #local $to_trace = 1 if main::DEBUG;
7999 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8000
8001 # Remove prefix, because will be added back anyway, don't want
8002 # doubled prefix
8003 $line[$i] =~ s/^$prefix//;
8004
8005 # Remove trailing space
8006 $line[$i] =~ s/\s+\Z//;
8007
8008 # If the line is too long, fold it.
8009 if (length $line[$i] > $max) {
8010 my $remainder;
8011
8012 # Here needs to fold. Save the leading space in the line for
8013 # later.
8014 $line[$i] =~ /^ ( \s* )/x;
8015 my $leading_space = $1;
8016 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8017
8018 # If character at final permissible position is white space,
8019 # fold there, which will delete that white space
8020 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8021 $remainder = substr($line[$i], $max);
8022 $line[$i] = substr($line[$i], 0, $max - 1);
8023 }
8024 else {
8025
8026 # Otherwise fold at an acceptable break char closest to
8027 # the max length. Look at just the maximal initial
8028 # segment of the line
8029 my $segment = substr($line[$i], 0, $max - 1);
8030 if ($segment =~
8031 /^ ( .{$hanging_indent} # Don't look before the
8032 # indent.
8033 \ * # Don't look in leading
8034 # blanks past the indent
8035 [^ ] .* # Find the right-most
8036 (?: # acceptable break:
8037 [ \s = ] # space or equal
8038 | - (?! [.0-9] ) # or non-unary minus.
8039 ) # $1 includes the character
8040 )/x)
8041 {
8042 # Split into the initial part that fits, and remaining
8043 # part of the input
8044 $remainder = substr($line[$i], length $1);
8045 $line[$i] = $1;
8046 trace $line[$i] if DEBUG && $to_trace;
8047 trace $remainder if DEBUG && $to_trace;
8048 }
8049
8050 # If didn't find a good breaking spot, see if there is a
8051 # not-so-good breaking spot. These are just after
8052 # underscores or where the case changes from lower to
8053 # upper. Use \a as a soft hyphen, but give up
8054 # and don't break the line if there is actually a \a
8055 # already in the input. We use an ascii character for the
8056 # soft-hyphen to avoid any attempt by miniperl to try to
8057 # access the files that this program is creating.
8058 elsif ($segment !~ /\a/
8059 && ($segment =~ s/_/_\a/g
8060 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8061 {
8062 # Here were able to find at least one place to insert
8063 # our substitute soft hyphen. Find the right-most one
8064 # and replace it by a real hyphen.
8065 trace $segment if DEBUG && $to_trace;
8066 substr($segment,
8067 rindex($segment, "\a"),
8068 1) = '-';
8069
8070 # Then remove the soft hyphen substitutes.
8071 $segment =~ s/\a//g;
8072 trace $segment if DEBUG && $to_trace;
8073
8074 # And split into the initial part that fits, and
8075 # remainder of the line
8076 my $pos = rindex($segment, '-');
8077 $remainder = substr($line[$i], $pos);
8078 trace $remainder if DEBUG && $to_trace;
8079 $line[$i] = substr($segment, 0, $pos + 1);
8080 }
8081 }
8082
8083 # Here we know if we can fold or not. If we can, $remainder
8084 # is what remains to be processed in the next iteration.
8085 if (defined $remainder) {
8086 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8087
8088 # Insert the folded remainder of the line as a new element
8089 # of the array. (It may still be too long, but we will
8090 # deal with that next time through the loop.) Omit any
8091 # leading space in the remainder.
8092 $remainder =~ s/^\s+//;
8093 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8094
8095 # But then indent by whichever is larger of:
8096 # 1) the leading space on the input line;
8097 # 2) the hanging indent.
8098 # This preserves indentation in the original line.
8099 my $lead = ($leading_space)
8100 ? length $leading_space
8101 : $hanging_indent;
8102 $lead = max($lead, $hanging_indent);
8103 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8104 }
8105 }
8106
8107 # Ready to output the line. Get rid of any trailing space
8108 # And prefix by the required $prefix passed in.
8109 $line[$i] =~ s/\s+$//;
8110 $line[$i] = "$prefix$line[$i]\n";
8111 } # End of looping through all the lines.
8112
8113 return join "", @line;
8114}
8115
8116sub property_ref { # Returns a reference to a property object.
8117 return Property::property_ref(@_);
8118}
8119
8120sub force_unlink ($) {
8121 my $filename = shift;
8122 return unless file_exists($filename);
8123 return if CORE::unlink($filename);
8124
8125 # We might need write permission
8126 chmod 0777, $filename;
8127 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8128 return;
8129}
8130
9218f1cf 8131sub write ($$@) {
9abe8df8
KW
8132 # Given a filename and references to arrays of lines, write the lines of
8133 # each array to the file
99870f4d
KW
8134 # Filename can be given as an arrayref of directory names
8135
9218f1cf 8136 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
99870f4d 8137
9abe8df8 8138 my $file = shift;
9218f1cf 8139 my $use_utf8 = shift;
99870f4d
KW
8140
8141 # Get into a single string if an array, and get rid of, in Unix terms, any
8142 # leading '.'
8143 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8144 $file = File::Spec->canonpath($file);
8145
8146 # If has directories, make sure that they all exist
8147 (undef, my $directories, undef) = File::Spec->splitpath($file);
8148 File::Path::mkpath($directories) if $directories && ! -d $directories;
8149
8150 push @files_actually_output, $file;
8151
99870f4d
KW
8152 force_unlink ($file);
8153
8154 my $OUT;
8155 if (not open $OUT, ">", $file) {
8156 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8157 return;
8158 }
430ada4c 8159
9218f1cf
KW
8160 binmode $OUT, ":utf8" if $use_utf8;
8161
9abe8df8
KW
8162 while (defined (my $lines_ref = shift)) {
8163 unless (@$lines_ref) {
8164 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8165 }
8166
8167 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8168 }
430ada4c
NC
8169 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8170
99870f4d
KW
8171 print "$file written.\n" if $verbosity >= $VERBOSE;
8172
99870f4d
KW
8173 return;
8174}
8175
8176
8177sub Standardize($) {
8178 # This converts the input name string into a standardized equivalent to
8179 # use internally.
8180
8181 my $name = shift;
8182 unless (defined $name) {
8183 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8184 return;
8185 }
8186
8187 # Remove any leading or trailing white space
8188 $name =~ s/^\s+//g;
8189 $name =~ s/\s+$//g;
8190
98dc9551 8191 # Convert interior white space and hyphens into underscores.
99870f4d
KW
8192 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8193
8194 # Capitalize the letter following an underscore, and convert a sequence of
8195 # multiple underscores to a single one
8196 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8197
8198 # And capitalize the first letter, but not for the special cjk ones.
8199 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8200 return $name;
8201}
8202
8203sub standardize ($) {
8204 # Returns a lower-cased standardized name, without underscores. This form
8205 # is chosen so that it can distinguish between any real versus superficial
8206 # Unicode name differences. It relies on the fact that Unicode doesn't
8207 # have interior underscores, white space, nor dashes in any
8208 # stricter-matched name. It should not be used on Unicode code point
8209 # names (the Name property), as they mostly, but not always follow these
8210 # rules.
8211
8212 my $name = Standardize(shift);
8213 return if !defined $name;
8214
8215 $name =~ s/ (?<= .) _ (?= . ) //xg;
8216 return lc $name;
8217}
8218
c85f591a
KW
8219sub utf8_heavy_name ($$) {
8220 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8221 # perhaps this function should be placed somewhere, like Heavy.pl so that
8222 # utf8_heavy can use it directly without duplicating code that can get
8223 # out-of sync.
8224
8225 my $table = shift;
8226 my $alias = shift;
8227 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8228
8229 my $property = $table->property;
8230 $property = ($property == $perl)
8231 ? "" # 'perl' is never explicitly stated
8232 : standardize($property->name) . '=';
8233 if ($alias->loose_match) {
8234 return $property . standardize($alias->name);
8235 }
8236 else {
8237 return lc ($property . $alias->name);
8238 }
8239
8240 return;
8241}
8242
99870f4d
KW
8243{ # Closure
8244
8245 my $indent_increment = " " x 2;
8246 my %already_output;
8247
8248 $main::simple_dumper_nesting = 0;
8249
8250 sub simple_dumper {
8251 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8252 # the real thing as we have to run under miniperl.
8253
8254 # It is designed so that on input it is at the beginning of a line,
8255 # and the final thing output in any call is a trailing ",\n".
8256
8257 my $item = shift;
8258 my $indent = shift;
8259 $indent = "" if ! defined $indent;
8260
8261 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8262
8263 # nesting level is localized, so that as the call stack pops, it goes
8264 # back to the prior value.
8265 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8266 undef %already_output if $main::simple_dumper_nesting == 0;
8267 $main::simple_dumper_nesting++;
8268 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8269
8270 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8271
8272 # Determine the indent for recursive calls.
8273 my $next_indent = $indent . $indent_increment;
8274
8275 my $output;
8276 if (! ref $item) {
8277
8278 # Dump of scalar: just output it in quotes if not a number. To do
8279 # so we must escape certain characters, and therefore need to
8280 # operate on a copy to avoid changing the original
8281 my $copy = $item;
8282 $copy = $UNDEF unless defined $copy;
8283
8284 # Quote non-numbers (numbers also have optional leading '-' and
8285 # fractions)
8286 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
8287
8288 # Escape apostrophe and backslash
8289 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8290 $copy = "'$copy'";
8291 }
8292 $output = "$indent$copy,\n";
8293 }
8294 else {
8295
8296 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 8297 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 8298 if (defined $already_output{$addr}) {
99870f4d
KW
8299 return "${indent}ALREADY OUTPUT: $item\n";
8300 }
f998e60c 8301 $already_output{$addr} = $item;
99870f4d
KW
8302
8303 if (ref $item eq 'ARRAY') {
8304 my $using_brackets;
8305 $output = $indent;
8306 if ($main::simple_dumper_nesting > 1) {
8307 $output .= '[';
8308 $using_brackets = 1;
8309 }
8310 else {
8311 $using_brackets = 0;
8312 }
8313
8314 # If the array is empty, put the closing bracket on the same
8315 # line. Otherwise, recursively add each array element
8316 if (@$item == 0) {
8317 $output .= " ";
8318 }
8319 else {
8320 $output .= "\n";
8321 for (my $i = 0; $i < @$item; $i++) {
8322
8323 # Indent array elements one level
8324 $output .= &simple_dumper($item->[$i], $next_indent);
c12f2655
KW
8325 $output =~ s/\n$//; # Remove any trailing nl so
8326 $output .= " # [$i]\n"; # as to add a comment giving
8327 # the array index
99870f4d
KW
8328 }
8329 $output .= $indent; # Indent closing ']' to orig level
8330 }
8331 $output .= ']' if $using_brackets;
8332 $output .= ",\n";
8333 }
8334 elsif (ref $item eq 'HASH') {
8335 my $is_first_line;
8336 my $using_braces;
8337 my $body_indent;
8338
8339 # No surrounding braces at top level
8340 $output .= $indent;
8341 if ($main::simple_dumper_nesting > 1) {
8342 $output .= "{\n";
8343 $is_first_line = 0;
8344 $body_indent = $next_indent;
8345 $next_indent .= $indent_increment;
8346 $using_braces = 1;
8347 }
8348 else {
8349 $is_first_line = 1;
8350 $body_indent = $indent;
8351 $using_braces = 0;
8352 }
8353
8354 # Output hashes sorted alphabetically instead of apparently
8355 # random. Use caseless alphabetic sort
8356 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8357 {
8358 if ($is_first_line) {
8359 $is_first_line = 0;
8360 }
8361 else {
8362 $output .= "$body_indent";
8363 }
8364
8365 # The key must be a scalar, but this recursive call quotes
8366 # it
8367 $output .= &simple_dumper($key);
8368
8369 # And change the trailing comma and nl to the hash fat
8370 # comma for clarity, and so the value can be on the same
8371 # line
8372 $output =~ s/,\n$/ => /;
8373
8374 # Recursively call to get the value's dump.
8375 my $next = &simple_dumper($item->{$key}, $next_indent);
8376
8377 # If the value is all on one line, remove its indent, so
8378 # will follow the => immediately. If it takes more than
8379 # one line, start it on a new line.
8380 if ($next !~ /\n.*\n/) {
8381 $next =~ s/^ *//;
8382 }
8383 else {
8384 $output .= "\n";
8385 }
8386 $output .= $next;
8387 }
8388
8389 $output .= "$indent},\n" if $using_braces;
8390 }
8391 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8392 $output = $indent . ref($item) . "\n";
8393 # XXX see if blessed
8394 }
8395 elsif ($item->can('dump')) {
8396
8397 # By convention in this program, objects furnish a 'dump'
8398 # method. Since not doing any output at this level, just pass
8399 # on the input indent
8400 $output = $item->dump($indent);
8401 }
8402 else {
8403 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8404 }
8405 }
8406 return $output;
8407 }
8408}
8409
8410sub dump_inside_out {
8411 # Dump inside-out hashes in an object's state by converting them to a
8412 # regular hash and then calling simple_dumper on that.
8413
8414 my $object = shift;
8415 my $fields_ref = shift;
8416 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8417
ffe43484 8418 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
8419
8420 my %hash;
8421 foreach my $key (keys %$fields_ref) {
8422 $hash{$key} = $fields_ref->{$key}{$addr};
8423 }
8424
8425 return simple_dumper(\%hash, @_);
8426}
8427
8428sub _operator_dot {
8429 # Overloaded '.' method that is common to all packages. It uses the
8430 # package's stringify method.
8431
8432 my $self = shift;
8433 my $other = shift;
8434 my $reversed = shift;
8435 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8436
8437 $other = "" unless defined $other;
8438
8439 foreach my $which (\$self, \$other) {
8440 next unless ref $$which;
8441 if ($$which->can('_operator_stringify')) {
8442 $$which = $$which->_operator_stringify;
8443 }
8444 else {
8445 my $ref = ref $$which;
ffe43484 8446 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
8447 $$which = "$ref ($addr)";
8448 }
8449 }
8450 return ($reversed)
8451 ? "$other$self"
8452 : "$self$other";
8453}
8454
8455sub _operator_equal {
8456 # Generic overloaded '==' routine. To be equal, they must be the exact
8457 # same object
8458
8459 my $self = shift;
8460 my $other = shift;
8461
8462 return 0 unless defined $other;
8463 return 0 unless ref $other;
f998e60c 8464 no overloading;
2100aa98 8465 return $self == $other;
99870f4d
KW
8466}
8467
8468sub _operator_not_equal {
8469 my $self = shift;
8470 my $other = shift;
8471
8472 return ! _operator_equal($self, $other);
8473}
8474
8475sub process_PropertyAliases($) {
8476 # This reads in the PropertyAliases.txt file, which contains almost all
8477 # the character properties in Unicode and their equivalent aliases:
8478 # scf ; Simple_Case_Folding ; sfc
8479 #
8480 # Field 0 is the preferred short name for the property.
8481 # Field 1 is the full name.
8482 # Any succeeding ones are other accepted names.
8483
8484 my $file= shift;
8485 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8486
8487 # This whole file was non-existent in early releases, so use our own
8488 # internal one.
8489 $file->insert_lines(get_old_property_aliases())
8490 if ! -e 'PropertyAliases.txt';
8491
8492 # Add any cjk properties that may have been defined.
8493 $file->insert_lines(@cjk_properties);
8494
8495 while ($file->next_line) {
8496
8497 my @data = split /\s*;\s*/;
8498
8499 my $full = $data[1];
8500
8501 my $this = Property->new($data[0], Full_Name => $full);
8502
8503 # Start looking for more aliases after these two.
8504 for my $i (2 .. @data - 1) {
8505 $this->add_alias($data[$i]);
8506 }
8507
8508 }
8509 return;
8510}
8511
8512sub finish_property_setup {
8513 # Finishes setting up after PropertyAliases.
8514
8515 my $file = shift;
8516 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8517
8518 # This entry was missing from this file in earlier Unicode versions
8519 if (-e 'Jamo.txt') {
8520 my $jsn = property_ref('JSN');
8521 if (! defined $jsn) {
8522 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8523 }
8524 }
8525
5f7264c7 8526 # This entry is still missing as of 6.0, perhaps because no short name for
99870f4d
KW
8527 # it.
8528 if (-e 'NameAliases.txt') {
8529 my $aliases = property_ref('Name_Alias');
8530 if (! defined $aliases) {
8531 $aliases = Property->new('Name_Alias');
8532 }
8533 }
8534
8535 # These are used so much, that we set globals for them.
8536 $gc = property_ref('General_Category');
8537 $block = property_ref('Block');
359523e2 8538 $script = property_ref('Script');
99870f4d
KW
8539
8540 # Perl adds this alias.
8541 $gc->add_alias('Category');
8542
8543 # For backwards compatibility, these property files have particular names.
8544 my $upper = property_ref('Uppercase_Mapping');
8545 $upper->set_core_access('uc()');
8546 $upper->set_file('Upper'); # This is what utf8.c calls it
8547
8548 my $lower = property_ref('Lowercase_Mapping');
8549 $lower->set_core_access('lc()');
8550 $lower->set_file('Lower');
8551
8552 my $title = property_ref('Titlecase_Mapping');
8553 $title->set_core_access('ucfirst()');
8554 $title->set_file('Title');
8555
8556 my $fold = property_ref('Case_Folding');
8557 $fold->set_file('Fold') if defined $fold;
8558
d3cbe105
KW
8559 # Unicode::Normalize expects this file with this name and directory.
8560 my $ccc = property_ref('Canonical_Combining_Class');
8561 if (defined $ccc) {
8562 $ccc->set_file('CombiningClass');
8563 $ccc->set_directory(File::Spec->curdir());
8564 }
8565
2cd56239
KW
8566 # utf8.c has a different meaning for non range-size-1 for map properties
8567 # that this program doesn't currently handle; and even if it were changed
8568 # to do so, some other code may be using them expecting range size 1.
99870f4d
KW
8569 foreach my $property (qw {
8570 Case_Folding
8571 Lowercase_Mapping
8572 Titlecase_Mapping
8573 Uppercase_Mapping
8574 })
8575 {
8576 property_ref($property)->set_range_size_1(1);
8577 }
8578
8579 # These two properties aren't actually used in the core, but unfortunately
8580 # the names just above that are in the core interfere with these, so
8581 # choose different names. These aren't a problem unless the map tables
8582 # for these files get written out.
8583 my $lowercase = property_ref('Lowercase');
8584 $lowercase->set_file('IsLower') if defined $lowercase;
8585 my $uppercase = property_ref('Uppercase');
8586 $uppercase->set_file('IsUpper') if defined $uppercase;
8587
8588 # Set up the hard-coded default mappings, but only on properties defined
8589 # for this release
8590 foreach my $property (keys %default_mapping) {
8591 my $property_object = property_ref($property);
8592 next if ! defined $property_object;
8593 my $default_map = $default_mapping{$property};
8594 $property_object->set_default_map($default_map);
8595
8596 # A map of <code point> implies the property is string.
8597 if ($property_object->type == $UNKNOWN
8598 && $default_map eq $CODE_POINT)
8599 {
8600 $property_object->set_type($STRING);
8601 }
8602 }
8603
8604 # The following use the Multi_Default class to create objects for
8605 # defaults.
8606
8607 # Bidi class has a complicated default, but the derived file takes care of
8608 # the complications, leaving just 'L'.
8609 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8610 property_ref('Bidi_Class')->set_default_map('L');
8611 }
8612 else {
8613 my $default;
8614
8615 # The derived file was introduced in 3.1.1. The values below are
8616 # taken from table 3-8, TUS 3.0
8617 my $default_R =
8618 'my $default = Range_List->new;
8619 $default->add_range(0x0590, 0x05FF);
8620 $default->add_range(0xFB1D, 0xFB4F);'
8621 ;
8622
8623 # The defaults apply only to unassigned characters
a67f160a 8624 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
8625
8626 if ($v_version lt v3.0.0) {
8627 $default = Multi_Default->new(R => $default_R, 'L');
8628 }
8629 else {
8630
8631 # AL apparently not introduced until 3.0: TUS 2.x references are
8632 # not on-line to check it out
8633 my $default_AL =
8634 'my $default = Range_List->new;
8635 $default->add_range(0x0600, 0x07BF);
8636 $default->add_range(0xFB50, 0xFDFF);
8637 $default->add_range(0xFE70, 0xFEFF);'
8638 ;
8639
8640 # Non-character code points introduced in this release; aren't AL
8641 if ($v_version ge 3.1.0) {
8642 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8643 }
a67f160a 8644 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
8645 $default = Multi_Default->new(AL => $default_AL,
8646 R => $default_R,
8647 'L');
8648 }
8649 property_ref('Bidi_Class')->set_default_map($default);
8650 }
8651
8652 # Joining type has a complicated default, but the derived file takes care
8653 # of the complications, leaving just 'U' (or Non_Joining), except the file
8654 # is bad in 3.1.0
8655 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8656 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8657 property_ref('Joining_Type')->set_default_map('Non_Joining');
8658 }
8659 else {
8660
8661 # Otherwise, there are not one, but two possibilities for the
8662 # missing defaults: T and U.
8663 # The missing defaults that evaluate to T are given by:
8664 # T = Mn + Cf - ZWNJ - ZWJ
8665 # where Mn and Cf are the general category values. In other words,
8666 # any non-spacing mark or any format control character, except
8667 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8668 # WIDTH JOINER (joining type C).
8669 my $default = Multi_Default->new(
8670 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8671 'Non_Joining');
8672 property_ref('Joining_Type')->set_default_map($default);
8673 }
8674 }
8675
8676 # Line break has a complicated default in early releases. It is 'Unknown'
8677 # for non-assigned code points; 'AL' for assigned.
8678 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8679 my $lb = property_ref('Line_Break');
8680 if ($v_version gt 3.2.0) {
8681 $lb->set_default_map('Unknown');
8682 }
8683 else {
8684 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8685 'AL');
8686 $lb->set_default_map($default);
8687 }
8688
8689 # If has the URS property, make sure that the standard aliases are in
8690 # it, since not in the input tables in some versions.
8691 my $urs = property_ref('Unicode_Radical_Stroke');
8692 if (defined $urs) {
8693 $urs->add_alias('cjkRSUnicode');
8694 $urs->add_alias('kRSUnicode');
8695 }
8696 }
8697 return;
8698}
8699
8700sub get_old_property_aliases() {
8701 # Returns what would be in PropertyAliases.txt if it existed in very old
8702 # versions of Unicode. It was derived from the one in 3.2, and pared
8703 # down based on the data that was actually in the older releases.
8704 # An attempt was made to use the existence of files to mean inclusion or
8705 # not of various aliases, but if this was not sufficient, using version
8706 # numbers was resorted to.
8707
8708 my @return;
8709
8710 # These are to be used in all versions (though some are constructed by
8711 # this program if missing)
8712 push @return, split /\n/, <<'END';
8713bc ; Bidi_Class
8714Bidi_M ; Bidi_Mirrored
8715cf ; Case_Folding
8716ccc ; Canonical_Combining_Class
8717dm ; Decomposition_Mapping
8718dt ; Decomposition_Type
8719gc ; General_Category
8720isc ; ISO_Comment
8721lc ; Lowercase_Mapping
8722na ; Name
8723na1 ; Unicode_1_Name
8724nt ; Numeric_Type
8725nv ; Numeric_Value
8726sfc ; Simple_Case_Folding
8727slc ; Simple_Lowercase_Mapping
8728stc ; Simple_Titlecase_Mapping
8729suc ; Simple_Uppercase_Mapping
8730tc ; Titlecase_Mapping
8731uc ; Uppercase_Mapping
8732END
8733
8734 if (-e 'Blocks.txt') {
8735 push @return, "blk ; Block\n";
8736 }
8737 if (-e 'ArabicShaping.txt') {
8738 push @return, split /\n/, <<'END';
8739jg ; Joining_Group
8740jt ; Joining_Type
8741END
8742 }
8743 if (-e 'PropList.txt') {
8744
8745 # This first set is in the original old-style proplist.
8746 push @return, split /\n/, <<'END';
8747Alpha ; Alphabetic
8748Bidi_C ; Bidi_Control
8749Dash ; Dash
8750Dia ; Diacritic
8751Ext ; Extender
8752Hex ; Hex_Digit
8753Hyphen ; Hyphen
8754IDC ; ID_Continue
8755Ideo ; Ideographic
8756Join_C ; Join_Control
8757Math ; Math
8758QMark ; Quotation_Mark
8759Term ; Terminal_Punctuation
8760WSpace ; White_Space
8761END
8762 # The next sets were added later
8763 if ($v_version ge v3.0.0) {
8764 push @return, split /\n/, <<'END';
8765Upper ; Uppercase
8766Lower ; Lowercase
8767END
8768 }
8769 if ($v_version ge v3.0.1) {
8770 push @return, split /\n/, <<'END';
8771NChar ; Noncharacter_Code_Point
8772END
8773 }
8774 # The next sets were added in the new-style
8775 if ($v_version ge v3.1.0) {
8776 push @return, split /\n/, <<'END';
8777OAlpha ; Other_Alphabetic
8778OLower ; Other_Lowercase
8779OMath ; Other_Math
8780OUpper ; Other_Uppercase
8781END
8782 }
8783 if ($v_version ge v3.1.1) {
8784 push @return, "AHex ; ASCII_Hex_Digit\n";
8785 }
8786 }
8787 if (-e 'EastAsianWidth.txt') {
8788 push @return, "ea ; East_Asian_Width\n";
8789 }
8790 if (-e 'CompositionExclusions.txt') {
8791 push @return, "CE ; Composition_Exclusion\n";
8792 }
8793 if (-e 'LineBreak.txt') {
8794 push @return, "lb ; Line_Break\n";
8795 }
8796 if (-e 'BidiMirroring.txt') {
8797 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8798 }
8799 if (-e 'Scripts.txt') {
8800 push @return, "sc ; Script\n";
8801 }
8802 if (-e 'DNormalizationProps.txt') {
8803 push @return, split /\n/, <<'END';
8804Comp_Ex ; Full_Composition_Exclusion
8805FC_NFKC ; FC_NFKC_Closure
8806NFC_QC ; NFC_Quick_Check
8807NFD_QC ; NFD_Quick_Check
8808NFKC_QC ; NFKC_Quick_Check
8809NFKD_QC ; NFKD_Quick_Check
8810XO_NFC ; Expands_On_NFC
8811XO_NFD ; Expands_On_NFD
8812XO_NFKC ; Expands_On_NFKC
8813XO_NFKD ; Expands_On_NFKD
8814END
8815 }
8816 if (-e 'DCoreProperties.txt') {
8817 push @return, split /\n/, <<'END';
8818IDS ; ID_Start
8819XIDC ; XID_Continue
8820XIDS ; XID_Start
8821END
8822 # These can also appear in some versions of PropList.txt
8823 push @return, "Lower ; Lowercase\n"
8824 unless grep { $_ =~ /^Lower\b/} @return;
8825 push @return, "Upper ; Uppercase\n"
8826 unless grep { $_ =~ /^Upper\b/} @return;
8827 }
8828
8829 # This flag requires the DAge.txt file to be copied into the directory.
8830 if (DEBUG && $compare_versions) {
8831 push @return, 'age ; Age';
8832 }
8833
8834 return @return;
8835}
8836
8837sub process_PropValueAliases {
8838 # This file contains values that properties look like:
8839 # bc ; AL ; Arabic_Letter
8840 # blk; n/a ; Greek_And_Coptic ; Greek
8841 #
8842 # Field 0 is the property.
8843 # Field 1 is the short name of a property value or 'n/a' if no
8844 # short name exists;
8845 # Field 2 is the full property value name;
8846 # Any other fields are more synonyms for the property value.
8847 # Purely numeric property values are omitted from the file; as are some
8848 # others, fewer and fewer in later releases
8849
8850 # Entries for the ccc property have an extra field before the
8851 # abbreviation:
8852 # ccc; 0; NR ; Not_Reordered
8853 # It is the numeric value that the names are synonyms for.
8854
8855 # There are comment entries for values missing from this file:
8856 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8857 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8858
8859 my $file= shift;
8860 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8861
8862 # This whole file was non-existent in early releases, so use our own
8863 # internal one if necessary.
8864 if (! -e 'PropValueAliases.txt') {
8865 $file->insert_lines(get_old_property_value_aliases());
8866 }
8867
8868 # Add any explicit cjk values
8869 $file->insert_lines(@cjk_property_values);
8870
8871 # This line is used only for testing the code that checks for name
8872 # conflicts. There is a script Inherited, and when this line is executed
8873 # it causes there to be a name conflict with the 'Inherited' that this
8874 # program generates for this block property value
8875 #$file->insert_lines('blk; n/a; Herited');
8876
8877
8878 # Process each line of the file ...
8879 while ($file->next_line) {
8880
8881 my ($property, @data) = split /\s*;\s*/;
8882
66b4eb0a
KW
8883 # The ccc property has an extra field at the beginning, which is the
8884 # numeric value. Move it to be after the other two, mnemonic, fields,
8885 # so that those will be used as the property value's names, and the
8886 # number will be an extra alias. (Rightmost splice removes field 1-2,
8887 # returning them in a slice; left splice inserts that before anything,
8888 # thus shifting the former field 0 to after them.)
8889 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8890
8891 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
8892 # there is no short name, use the full one in element 1
027866c1
KW
8893 if ($data[0] eq "n/a") {
8894 $data[0] = $data[1];
8895 }
8896 elsif ($data[0] ne $data[1]
8897 && standardize($data[0]) eq standardize($data[1])
8898 && $data[1] !~ /[[:upper:]]/)
8899 {
8900 # Also, there is a bug in the file in which "n/a" is omitted, and
8901 # the two fields are identical except for case, and the full name
8902 # is all lower case. Copy the "short" name unto the full one to
8903 # give it some upper case.
8904
8905 $data[1] = $data[0];
8906 }
99870f4d
KW
8907
8908 # Earlier releases had the pseudo property 'qc' that should expand to
8909 # the ones that replace it below.
8910 if ($property eq 'qc') {
8911 if (lc $data[0] eq 'y') {
8912 $file->insert_lines('NFC_QC; Y ; Yes',
8913 'NFD_QC; Y ; Yes',
8914 'NFKC_QC; Y ; Yes',
8915 'NFKD_QC; Y ; Yes',
8916 );
8917 }
8918 elsif (lc $data[0] eq 'n') {
8919 $file->insert_lines('NFC_QC; N ; No',
8920 'NFD_QC; N ; No',
8921 'NFKC_QC; N ; No',
8922 'NFKD_QC; N ; No',
8923 );
8924 }
8925 elsif (lc $data[0] eq 'm') {
8926 $file->insert_lines('NFC_QC; M ; Maybe',
8927 'NFKC_QC; M ; Maybe',
8928 );
8929 }
8930 else {
8931 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8932 }
8933 next;
8934 }
8935
8936 # The first field is the short name, 2nd is the full one.
8937 my $property_object = property_ref($property);
8938 my $table = $property_object->add_match_table($data[0],
8939 Full_Name => $data[1]);
8940
8941 # Start looking for more aliases after these two.
8942 for my $i (2 .. @data - 1) {
8943 $table->add_alias($data[$i]);
8944 }
8945 } # End of looping through the file
8946
8947 # As noted in the comments early in the program, it generates tables for
8948 # the default values for all releases, even those for which the concept
8949 # didn't exist at the time. Here we add those if missing.
8950 my $age = property_ref('age');
8951 if (defined $age && ! defined $age->table('Unassigned')) {
8952 $age->add_match_table('Unassigned');
8953 }
8954 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8955 && ! defined $block->table('No_Block');
8956
8957
8958 # Now set the default mappings of the properties from the file. This is
8959 # done after the loop because a number of properties have only @missings
8960 # entries in the file, and may not show up until the end.
8961 my @defaults = $file->get_missings;
8962 foreach my $default_ref (@defaults) {
8963 my $default = $default_ref->[0];
8964 my $property = property_ref($default_ref->[1]);
8965 $property->set_default_map($default);
8966 }
8967 return;
8968}
8969
8970sub get_old_property_value_aliases () {
8971 # Returns what would be in PropValueAliases.txt if it existed in very old
8972 # versions of Unicode. It was derived from the one in 3.2, and pared
8973 # down. An attempt was made to use the existence of files to mean
8974 # inclusion or not of various aliases, but if this was not sufficient,
8975 # using version numbers was resorted to.
8976
8977 my @return = split /\n/, <<'END';
8978bc ; AN ; Arabic_Number
8979bc ; B ; Paragraph_Separator
8980bc ; CS ; Common_Separator
8981bc ; EN ; European_Number
8982bc ; ES ; European_Separator
8983bc ; ET ; European_Terminator
8984bc ; L ; Left_To_Right
8985bc ; ON ; Other_Neutral
8986bc ; R ; Right_To_Left
8987bc ; WS ; White_Space
8988
8989# The standard combining classes are very much different in v1, so only use
8990# ones that look right (not checked thoroughly)
8991ccc; 0; NR ; Not_Reordered
8992ccc; 1; OV ; Overlay
8993ccc; 7; NK ; Nukta
8994ccc; 8; KV ; Kana_Voicing
8995ccc; 9; VR ; Virama
8996ccc; 202; ATBL ; Attached_Below_Left
8997ccc; 216; ATAR ; Attached_Above_Right
8998ccc; 218; BL ; Below_Left
8999ccc; 220; B ; Below
9000ccc; 222; BR ; Below_Right
9001ccc; 224; L ; Left
9002ccc; 228; AL ; Above_Left
9003ccc; 230; A ; Above
9004ccc; 232; AR ; Above_Right
9005ccc; 234; DA ; Double_Above
9006
9007dt ; can ; canonical
9008dt ; enc ; circle
9009dt ; fin ; final
9010dt ; font ; font
9011dt ; fra ; fraction
9012dt ; init ; initial
9013dt ; iso ; isolated
9014dt ; med ; medial
9015dt ; n/a ; none
9016dt ; nb ; noBreak
9017dt ; sqr ; square
9018dt ; sub ; sub
9019dt ; sup ; super
9020
9021gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9022gc ; Cc ; Control
9023gc ; Cn ; Unassigned
9024gc ; Co ; Private_Use
9025gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9026gc ; LC ; Cased_Letter # Ll | Lt | Lu
9027gc ; Ll ; Lowercase_Letter
9028gc ; Lm ; Modifier_Letter
9029gc ; Lo ; Other_Letter
9030gc ; Lu ; Uppercase_Letter
9031gc ; M ; Mark # Mc | Me | Mn
9032gc ; Mc ; Spacing_Mark
9033gc ; Mn ; Nonspacing_Mark
9034gc ; N ; Number # Nd | Nl | No
9035gc ; Nd ; Decimal_Number
9036gc ; No ; Other_Number
9037gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9038gc ; Pd ; Dash_Punctuation
9039gc ; Pe ; Close_Punctuation
9040gc ; Po ; Other_Punctuation
9041gc ; Ps ; Open_Punctuation
9042gc ; S ; Symbol # Sc | Sk | Sm | So
9043gc ; Sc ; Currency_Symbol
9044gc ; Sm ; Math_Symbol
9045gc ; So ; Other_Symbol
9046gc ; Z ; Separator # Zl | Zp | Zs
9047gc ; Zl ; Line_Separator
9048gc ; Zp ; Paragraph_Separator
9049gc ; Zs ; Space_Separator
9050
9051nt ; de ; Decimal
9052nt ; di ; Digit
9053nt ; n/a ; None
9054nt ; nu ; Numeric
9055END
9056
9057 if (-e 'ArabicShaping.txt') {
9058 push @return, split /\n/, <<'END';
9059jg ; n/a ; AIN
9060jg ; n/a ; ALEF
9061jg ; n/a ; DAL
9062jg ; n/a ; GAF
9063jg ; n/a ; LAM
9064jg ; n/a ; MEEM
9065jg ; n/a ; NO_JOINING_GROUP
9066jg ; n/a ; NOON
9067jg ; n/a ; QAF
9068jg ; n/a ; SAD
9069jg ; n/a ; SEEN
9070jg ; n/a ; TAH
9071jg ; n/a ; WAW
9072
9073jt ; C ; Join_Causing
9074jt ; D ; Dual_Joining
9075jt ; L ; Left_Joining
9076jt ; R ; Right_Joining
9077jt ; U ; Non_Joining
9078jt ; T ; Transparent
9079END
9080 if ($v_version ge v3.0.0) {
9081 push @return, split /\n/, <<'END';
9082jg ; n/a ; ALAPH
9083jg ; n/a ; BEH
9084jg ; n/a ; BETH
9085jg ; n/a ; DALATH_RISH
9086jg ; n/a ; E
9087jg ; n/a ; FEH
9088jg ; n/a ; FINAL_SEMKATH
9089jg ; n/a ; GAMAL
9090jg ; n/a ; HAH
9091jg ; n/a ; HAMZA_ON_HEH_GOAL
9092jg ; n/a ; HE
9093jg ; n/a ; HEH
9094jg ; n/a ; HEH_GOAL
9095jg ; n/a ; HETH
9096jg ; n/a ; KAF
9097jg ; n/a ; KAPH
9098jg ; n/a ; KNOTTED_HEH
9099jg ; n/a ; LAMADH
9100jg ; n/a ; MIM
9101jg ; n/a ; NUN
9102jg ; n/a ; PE
9103jg ; n/a ; QAPH
9104jg ; n/a ; REH
9105jg ; n/a ; REVERSED_PE
9106jg ; n/a ; SADHE
9107jg ; n/a ; SEMKATH
9108jg ; n/a ; SHIN
9109jg ; n/a ; SWASH_KAF
9110jg ; n/a ; TAW
9111jg ; n/a ; TEH_MARBUTA
9112jg ; n/a ; TETH
9113jg ; n/a ; YEH
9114jg ; n/a ; YEH_BARREE
9115jg ; n/a ; YEH_WITH_TAIL
9116jg ; n/a ; YUDH
9117jg ; n/a ; YUDH_HE
9118jg ; n/a ; ZAIN
9119END
9120 }
9121 }
9122
9123
9124 if (-e 'EastAsianWidth.txt') {
9125 push @return, split /\n/, <<'END';
9126ea ; A ; Ambiguous
9127ea ; F ; Fullwidth
9128ea ; H ; Halfwidth
9129ea ; N ; Neutral
9130ea ; Na ; Narrow
9131ea ; W ; Wide
9132END
9133 }
9134
9135 if (-e 'LineBreak.txt') {
9136 push @return, split /\n/, <<'END';
9137lb ; AI ; Ambiguous
9138lb ; AL ; Alphabetic
9139lb ; B2 ; Break_Both
9140lb ; BA ; Break_After
9141lb ; BB ; Break_Before
9142lb ; BK ; Mandatory_Break
9143lb ; CB ; Contingent_Break
9144lb ; CL ; Close_Punctuation
9145lb ; CM ; Combining_Mark
9146lb ; CR ; Carriage_Return
9147lb ; EX ; Exclamation
9148lb ; GL ; Glue
9149lb ; HY ; Hyphen
9150lb ; ID ; Ideographic
9151lb ; IN ; Inseperable
9152lb ; IS ; Infix_Numeric
9153lb ; LF ; Line_Feed
9154lb ; NS ; Nonstarter
9155lb ; NU ; Numeric
9156lb ; OP ; Open_Punctuation
9157lb ; PO ; Postfix_Numeric
9158lb ; PR ; Prefix_Numeric
9159lb ; QU ; Quotation
9160lb ; SA ; Complex_Context
9161lb ; SG ; Surrogate
9162lb ; SP ; Space
9163lb ; SY ; Break_Symbols
9164lb ; XX ; Unknown
9165lb ; ZW ; ZWSpace
9166END
9167 }
9168
9169 if (-e 'DNormalizationProps.txt') {
9170 push @return, split /\n/, <<'END';
9171qc ; M ; Maybe
9172qc ; N ; No
9173qc ; Y ; Yes
9174END
9175 }
9176
9177 if (-e 'Scripts.txt') {
9178 push @return, split /\n/, <<'END';
9179sc ; Arab ; Arabic
9180sc ; Armn ; Armenian
9181sc ; Beng ; Bengali
9182sc ; Bopo ; Bopomofo
9183sc ; Cans ; Canadian_Aboriginal
9184sc ; Cher ; Cherokee
9185sc ; Cyrl ; Cyrillic
9186sc ; Deva ; Devanagari
9187sc ; Dsrt ; Deseret
9188sc ; Ethi ; Ethiopic
9189sc ; Geor ; Georgian
9190sc ; Goth ; Gothic
9191sc ; Grek ; Greek
9192sc ; Gujr ; Gujarati
9193sc ; Guru ; Gurmukhi
9194sc ; Hang ; Hangul
9195sc ; Hani ; Han
9196sc ; Hebr ; Hebrew
9197sc ; Hira ; Hiragana
9198sc ; Ital ; Old_Italic
9199sc ; Kana ; Katakana
9200sc ; Khmr ; Khmer
9201sc ; Knda ; Kannada
9202sc ; Laoo ; Lao
9203sc ; Latn ; Latin
9204sc ; Mlym ; Malayalam
9205sc ; Mong ; Mongolian
9206sc ; Mymr ; Myanmar
9207sc ; Ogam ; Ogham
9208sc ; Orya ; Oriya
9209sc ; Qaai ; Inherited
9210sc ; Runr ; Runic
9211sc ; Sinh ; Sinhala
9212sc ; Syrc ; Syriac
9213sc ; Taml ; Tamil
9214sc ; Telu ; Telugu
9215sc ; Thaa ; Thaana
9216sc ; Thai ; Thai
9217sc ; Tibt ; Tibetan
9218sc ; Yiii ; Yi
9219sc ; Zyyy ; Common
9220END
9221 }
9222
9223 if ($v_version ge v2.0.0) {
9224 push @return, split /\n/, <<'END';
9225dt ; com ; compat
9226dt ; nar ; narrow
9227dt ; sml ; small
9228dt ; vert ; vertical
9229dt ; wide ; wide
9230
9231gc ; Cf ; Format
9232gc ; Cs ; Surrogate
9233gc ; Lt ; Titlecase_Letter
9234gc ; Me ; Enclosing_Mark
9235gc ; Nl ; Letter_Number
9236gc ; Pc ; Connector_Punctuation
9237gc ; Sk ; Modifier_Symbol
9238END
9239 }
9240 if ($v_version ge v2.1.2) {
9241 push @return, "bc ; S ; Segment_Separator\n";
9242 }
9243 if ($v_version ge v2.1.5) {
9244 push @return, split /\n/, <<'END';
9245gc ; Pf ; Final_Punctuation
9246gc ; Pi ; Initial_Punctuation
9247END
9248 }
9249 if ($v_version ge v2.1.8) {
9250 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9251 }
9252
9253 if ($v_version ge v3.0.0) {
9254 push @return, split /\n/, <<'END';
9255bc ; AL ; Arabic_Letter
9256bc ; BN ; Boundary_Neutral
9257bc ; LRE ; Left_To_Right_Embedding
9258bc ; LRO ; Left_To_Right_Override
9259bc ; NSM ; Nonspacing_Mark
9260bc ; PDF ; Pop_Directional_Format
9261bc ; RLE ; Right_To_Left_Embedding
9262bc ; RLO ; Right_To_Left_Override
9263
9264ccc; 233; DB ; Double_Below
9265END
9266 }
9267
9268 if ($v_version ge v3.1.0) {
9269 push @return, "ccc; 226; R ; Right\n";
9270 }
9271
9272 return @return;
9273}
9274
b1c167a3
KW
9275sub output_perl_charnames_line ($$) {
9276
9277 # Output the entries in Perl_charnames specially, using 5 digits instead
9278 # of four. This makes the entries a constant length, and simplifies
9279 # charnames.pm which this table is for. Unicode can have 6 digit
9280 # ordinals, but they are all private use or noncharacters which do not
9281 # have names, so won't be in this table.
9282
73d9566f 9283 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
9284}
9285
99870f4d
KW
9286{ # Closure
9287 # This is used to store the range list of all the code points usable when
9288 # the little used $compare_versions feature is enabled.
9289 my $compare_versions_range_list;
9290
9291 sub process_generic_property_file {
9292 # This processes a file containing property mappings and puts them
9293 # into internal map tables. It should be used to handle any property
9294 # files that have mappings from a code point or range thereof to
9295 # something else. This means almost all the UCD .txt files.
9296 # each_line_handlers() should be set to adjust the lines of these
9297 # files, if necessary, to what this routine understands:
9298 #
9299 # 0374 ; NFD_QC; N
9300 # 003C..003E ; Math
9301 #
92f9d56c 9302 # the fields are: "codepoint-range ; property; map"
99870f4d
KW
9303 #
9304 # meaning the codepoints in the range all have the value 'map' under
9305 # 'property'.
98dc9551 9306 # Beginning and trailing white space in each field are not significant.
99870f4d
KW
9307 # Note there is not a trailing semi-colon in the above. A trailing
9308 # semi-colon means the map is a null-string. An omitted map, as
9309 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9310 # table syntax. (This could have been hidden from this routine by
9311 # doing it in the $file object, but that would require parsing of the
9312 # line there, so would have to parse it twice, or change the interface
9313 # to pass this an array. So not done.)
9314 #
9315 # The map field may begin with a sequence of commands that apply to
9316 # this range. Each such command begins and ends with $CMD_DELIM.
9317 # These are used to indicate, for example, that the mapping for a
9318 # range has a non-default type.
9319 #
9320 # This loops through the file, calling it's next_line() method, and
9321 # then taking the map and adding it to the property's table.
9322 # Complications arise because any number of properties can be in the
9323 # file, in any order, interspersed in any way. The first time a
9324 # property is seen, it gets information about that property and
f86864ac 9325 # caches it for quick retrieval later. It also normalizes the maps
5d7f7709
KW
9326 # so that only one of many synonyms is stored. The Unicode input
9327 # files do use some multiple synonyms.
99870f4d
KW
9328
9329 my $file = shift;
9330 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9331
9332 my %property_info; # To keep track of what properties
9333 # have already had entries in the
9334 # current file, and info about each,
9335 # so don't have to recompute.
9336 my $property_name; # property currently being worked on
9337 my $property_type; # and its type
9338 my $previous_property_name = ""; # name from last time through loop
9339 my $property_object; # pointer to the current property's
9340 # object
9341 my $property_addr; # the address of that object
9342 my $default_map; # the string that code points missing
9343 # from the file map to
9344 my $default_table; # For non-string properties, a
9345 # reference to the match table that
9346 # will contain the list of code
9347 # points that map to $default_map.
9348
9349 # Get the next real non-comment line
9350 LINE:
9351 while ($file->next_line) {
9352
9353 # Default replacement type; means that if parts of the range have
9354 # already been stored in our tables, the new map overrides them if
9355 # they differ more than cosmetically
9356 my $replace = $IF_NOT_EQUIVALENT;
9357 my $map_type; # Default type for the map of this range
9358
9359 #local $to_trace = 1 if main::DEBUG;
9360 trace $_ if main::DEBUG && $to_trace;
9361
9362 # Split the line into components
9363 my ($range, $property_name, $map, @remainder)
9364 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9365
9366 # If more or less on the line than we are expecting, warn and skip
9367 # the line
9368 if (@remainder) {
9369 $file->carp_bad_line('Extra fields');
9370 next LINE;
9371 }
9372 elsif ( ! defined $property_name) {
9373 $file->carp_bad_line('Missing property');
9374 next LINE;
9375 }
9376
9377 # Examine the range.
9378 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9379 {
9380 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9381 next LINE;
9382 }
9383 my $low = hex $1;
9384 my $high = (defined $2) ? hex $2 : $low;
9385
9386 # For the very specialized case of comparing two Unicode
9387 # versions...
9388 if (DEBUG && $compare_versions) {
9389 if ($property_name eq 'Age') {
9390
9391 # Only allow code points at least as old as the version
9392 # specified.
9393 my $age = pack "C*", split(/\./, $map); # v string
9394 next LINE if $age gt $compare_versions;
9395 }
9396 else {
9397
9398 # Again, we throw out code points younger than those of
9399 # the specified version. By now, the Age property is
9400 # populated. We use the intersection of each input range
9401 # with this property to find what code points in it are
9402 # valid. To do the intersection, we have to convert the
9403 # Age property map to a Range_list. We only have to do
9404 # this once.
9405 if (! defined $compare_versions_range_list) {
9406 my $age = property_ref('Age');
9407 if (! -e 'DAge.txt') {
9408 croak "Need to have 'DAge.txt' file to do version comparison";
9409 }
9410 elsif ($age->count == 0) {
9411 croak "The 'Age' table is empty, but its file exists";
9412 }
9413 $compare_versions_range_list
9414 = Range_List->new(Initialize => $age);
9415 }
9416
9417 # An undefined map is always 'Y'
9418 $map = 'Y' if ! defined $map;
9419
9420 # Calculate the intersection of the input range with the
9421 # code points that are known in the specified version
9422 my @ranges = ($compare_versions_range_list
9423 & Range->new($low, $high))->ranges;
9424
9425 # If the intersection is empty, throw away this range
9426 next LINE unless @ranges;
9427
9428 # Only examine the first range this time through the loop.
9429 my $this_range = shift @ranges;
9430
9431 # Put any remaining ranges in the queue to be processed
9432 # later. Note that there is unnecessary work here, as we
9433 # will do the intersection again for each of these ranges
9434 # during some future iteration of the LINE loop, but this
9435 # code is not used in production. The later intersections
9436 # are guaranteed to not splinter, so this will not become
9437 # an infinite loop.
9438 my $line = join ';', $property_name, $map;
9439 foreach my $range (@ranges) {
9440 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9441 $range->start,
9442 $range->end,
9443 $line));
9444 }
9445
9446 # And process the first range, like any other.
9447 $low = $this_range->start;
9448 $high = $this_range->end;
9449 }
9450 } # End of $compare_versions
9451
9452 # If changing to a new property, get the things constant per
9453 # property
9454 if ($previous_property_name ne $property_name) {
9455
9456 $property_object = property_ref($property_name);
9457 if (! defined $property_object) {
9458 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9459 next LINE;
9460 }
051df77b 9461 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
9462
9463 # Defer changing names until have a line that is acceptable
9464 # (the 'next' statement above means is unacceptable)
9465 $previous_property_name = $property_name;
9466
9467 # If not the first time for this property, retrieve info about
9468 # it from the cache
9469 if (defined ($property_info{$property_addr}{'type'})) {
9470 $property_type = $property_info{$property_addr}{'type'};
9471 $default_map = $property_info{$property_addr}{'default'};
9472 $map_type
9473 = $property_info{$property_addr}{'pseudo_map_type'};
9474 $default_table
9475 = $property_info{$property_addr}{'default_table'};
9476 }
9477 else {
9478
9479 # Here, is the first time for this property. Set up the
9480 # cache.
9481 $property_type = $property_info{$property_addr}{'type'}
9482 = $property_object->type;
9483 $map_type
9484 = $property_info{$property_addr}{'pseudo_map_type'}
9485 = $property_object->pseudo_map_type;
9486
9487 # The Unicode files are set up so that if the map is not
9488 # defined, it is a binary property
9489 if (! defined $map && $property_type != $BINARY) {
9490 if ($property_type != $UNKNOWN
9491 && $property_type != $NON_STRING)
9492 {
9493 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9494 }
9495 else {
9496 $property_object->set_type($BINARY);
9497 $property_type
9498 = $property_info{$property_addr}{'type'}
9499 = $BINARY;
9500 }
9501 }
9502
9503 # Get any @missings default for this property. This
9504 # should precede the first entry for the property in the
9505 # input file, and is located in a comment that has been
9506 # stored by the Input_file class until we access it here.
9507 # It's possible that there is more than one such line
9508 # waiting for us; collect them all, and parse
9509 my @missings_list = $file->get_missings
9510 if $file->has_missings_defaults;
9511 foreach my $default_ref (@missings_list) {
9512 my $default = $default_ref->[0];
ffe43484 9513 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
9514
9515 # For string properties, the default is just what the
9516 # file says, but non-string properties should already
9517 # have set up a table for the default property value;
9518 # use the table for these, so can resolve synonyms
9519 # later to a single standard one.
9520 if ($property_type == $STRING
9521 || $property_type == $UNKNOWN)
9522 {
9523 $property_info{$addr}{'missings'} = $default;
9524 }
9525 else {
9526 $property_info{$addr}{'missings'}
9527 = $property_object->table($default);
9528 }
9529 }
9530
9531 # Finished storing all the @missings defaults in the input
9532 # file so far. Get the one for the current property.
9533 my $missings = $property_info{$property_addr}{'missings'};
9534
9535 # But we likely have separately stored what the default
9536 # should be. (This is to accommodate versions of the
9537 # standard where the @missings lines are absent or
9538 # incomplete.) Hopefully the two will match. But check
9539 # it out.
9540 $default_map = $property_object->default_map;
9541
9542 # If the map is a ref, it means that the default won't be
9543 # processed until later, so undef it, so next few lines
9544 # will redefine it to something that nothing will match
9545 undef $default_map if ref $default_map;
9546
9547 # Create a $default_map if don't have one; maybe a dummy
9548 # that won't match anything.
9549 if (! defined $default_map) {
9550
9551 # Use any @missings line in the file.
9552 if (defined $missings) {
9553 if (ref $missings) {
9554 $default_map = $missings->full_name;
9555 $default_table = $missings;
9556 }
9557 else {
9558 $default_map = $missings;
9559 }
678f13d5 9560
99870f4d
KW
9561 # And store it with the property for outside use.
9562 $property_object->set_default_map($default_map);
9563 }
9564 else {
9565
9566 # Neither an @missings nor a default map. Create
9567 # a dummy one, so won't have to test definedness
9568 # in the main loop.
9569 $default_map = '_Perl This will never be in a file
9570 from Unicode';
9571 }
9572 }
9573
9574 # Here, we have $default_map defined, possibly in terms of
9575 # $missings, but maybe not, and possibly is a dummy one.
9576 if (defined $missings) {
9577
9578 # Make sure there is no conflict between the two.
9579 # $missings has priority.
9580 if (ref $missings) {
23e33b60
KW
9581 $default_table
9582 = $property_object->table($default_map);
99870f4d
KW
9583 if (! defined $default_table
9584 || $default_table != $missings)
9585 {
9586 if (! defined $default_table) {
9587 $default_table = $UNDEF;
9588 }
9589 $file->carp_bad_line(<<END
9590The \@missings line for $property_name in $file says that missings default to
9591$missings, but we expect it to be $default_table. $missings used.
9592END
9593 );
9594 $default_table = $missings;
9595 $default_map = $missings->full_name;
9596 }
9597 $property_info{$property_addr}{'default_table'}
9598 = $default_table;
9599 }
9600 elsif ($default_map ne $missings) {
9601 $file->carp_bad_line(<<END
9602The \@missings line for $property_name in $file says that missings default to
9603$missings, but we expect it to be $default_map. $missings used.
9604END
9605 );
9606 $default_map = $missings;
9607 }
9608 }
9609
9610 $property_info{$property_addr}{'default'}
9611 = $default_map;
9612
9613 # If haven't done so already, find the table corresponding
9614 # to this map for non-string properties.
9615 if (! defined $default_table
9616 && $property_type != $STRING
9617 && $property_type != $UNKNOWN)
9618 {
9619 $default_table = $property_info{$property_addr}
9620 {'default_table'}
9621 = $property_object->table($default_map);
9622 }
9623 } # End of is first time for this property
9624 } # End of switching properties.
9625
9626 # Ready to process the line.
9627 # The Unicode files are set up so that if the map is not defined,
9628 # it is a binary property with value 'Y'
9629 if (! defined $map) {
9630 $map = 'Y';
9631 }
9632 else {
9633
9634 # If the map begins with a special command to us (enclosed in
9635 # delimiters), extract the command(s).
a35d7f90
KW
9636 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9637 my $command = $1;
9638 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9639 $replace = $1;
99870f4d 9640 }
a35d7f90
KW
9641 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9642 $map_type = $1;
9643 }
9644 else {
9645 $file->carp_bad_line("Unknown command line: '$1'");
9646 next LINE;
9647 }
9648 }
99870f4d
KW
9649 }
9650
9651 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9652 {
9653
9654 # Here, we have a map to a particular code point, and the
9655 # default map is to a code point itself. If the range
9656 # includes the particular code point, change that portion of
9657 # the range to the default. This makes sure that in the final
9658 # table only the non-defaults are listed.
9659 my $decimal_map = hex $map;
9660 if ($low <= $decimal_map && $decimal_map <= $high) {
9661
9662 # If the range includes stuff before or after the map
9663 # we're changing, split it and process the split-off parts
9664 # later.
9665 if ($low < $decimal_map) {
9666 $file->insert_adjusted_lines(
9667 sprintf("%04X..%04X; %s; %s",
9668 $low,
9669 $decimal_map - 1,
9670 $property_name,
9671 $map));
9672 }
9673 if ($high > $decimal_map) {
9674 $file->insert_adjusted_lines(
9675 sprintf("%04X..%04X; %s; %s",
9676 $decimal_map + 1,
9677 $high,
9678 $property_name,
9679 $map));
9680 }
9681 $low = $high = $decimal_map;
9682 $map = $CODE_POINT;
9683 }
9684 }
9685
9686 # If we can tell that this is a synonym for the default map, use
9687 # the default one instead.
9688 if ($property_type != $STRING
9689 && $property_type != $UNKNOWN)
9690 {
9691 my $table = $property_object->table($map);
9692 if (defined $table && $table == $default_table) {
9693 $map = $default_map;
9694 }
9695 }
9696
9697 # And figure out the map type if not known.
9698 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9699 if ($map eq "") { # Nulls are always $NULL map type
9700 $map_type = $NULL;
9701 } # Otherwise, non-strings, and those that don't allow
9702 # $MULTI_CP, and those that aren't multiple code points are
9703 # 0
9704 elsif
9705 (($property_type != $STRING && $property_type != $UNKNOWN)
9706 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9707 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9708 {
9709 $map_type = 0;
9710 }
9711 else {
9712 $map_type = $MULTI_CP;
9713 }
9714 }
9715
9716 $property_object->add_map($low, $high,
9717 $map,
9718 Type => $map_type,
9719 Replace => $replace);
9720 } # End of loop through file's lines
9721
9722 return;
9723 }
9724}
9725
99870f4d
KW
9726{ # Closure for UnicodeData.txt handling
9727
9728 # This file was the first one in the UCD; its design leads to some
9729 # awkwardness in processing. Here is a sample line:
9730 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9731 # The fields in order are:
9732 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9733 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9734 my $CATEGORY = $i++; # category (e.g. "Lu")
9735 my $CCC = $i++; # Canonical combining class (e.g. "230")
9736 my $BIDI = $i++; # directional class (e.g. "L")
9737 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9738 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9739 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9740 # Dual-use in this program; see below
9741 my $NUMERIC = $i++; # numeric value
9742 my $MIRRORED = $i++; # ? mirrored
9743 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9744 my $COMMENT = $i++; # iso comment
9745 my $UPPER = $i++; # simple uppercase mapping
9746 my $LOWER = $i++; # simple lowercase mapping
9747 my $TITLE = $i++; # simple titlecase mapping
9748 my $input_field_count = $i;
9749
9750 # This routine in addition outputs these extra fields:
9751 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9752
9753 # These fields are modifications of ones above, and are usually
9754 # suppressed; they must come last, as for speed, the loop upper bound is
9755 # normally set to ignore them
9756 my $NAME = $i++; # This is the strict name field, not the one that
9757 # charnames uses.
9758 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9759 # by Unicode::Normalize
99870f4d
KW
9760 my $last_field = $i - 1;
9761
9762 # All these are read into an array for each line, with the indices defined
9763 # above. The empty fields in the example line above indicate that the
9764 # value is defaulted. The handler called for each line of the input
9765 # changes these to their defaults.
9766
9767 # Here are the official names of the properties, in a parallel array:
9768 my @field_names;
9769 $field_names[$BIDI] = 'Bidi_Class';
9770 $field_names[$CATEGORY] = 'General_Category';
9771 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9772 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9773 $field_names[$COMMENT] = 'ISO_Comment';
9774 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9775 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9776 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9777 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9778 $field_names[$NAME] = 'Name';
9779 $field_names[$NUMERIC] = 'Numeric_Value';
9780 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9781 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9782 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9783 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9784 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9785 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9786
28093d0e
KW
9787 # Some of these need a little more explanation:
9788 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9789 # property, but is used in calculating the Numeric_Type. Perl however,
9790 # creates a file from this field, so a Perl property is created from it.
9791 # Similarly, the Other_Digit field is used only for calculating the
9792 # Numeric_Type, and so it can be safely re-used as the place to store
9793 # the value for Numeric_Type; hence it is referred to as
9794 # $NUMERIC_TYPE_OTHER_DIGIT.
9795 # The input field named $PERL_DECOMPOSITION is a combination of both the
9796 # decomposition mapping and its type. Perl creates a file containing
9797 # exactly this field, so it is used for that. The two properties are
9798 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9799 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9800 # output it), as Perl doesn't use it directly.
9801 # The input field named here $CHARNAME is used to construct the
9802 # Perl_Charnames property, which is a combination of the Name property
9803 # (which the input field contains), and the Unicode_1_Name property, and
9804 # others from other files. Since, the strict Name property is not used
9805 # by Perl, this field is used for the table that Perl does use. The
9806 # strict Name property table is usually suppressed (unless the lists are
9807 # changed to output it), so it is accumulated in a separate field,
9808 # $NAME, which to save time is discarded unless the table is actually to
9809 # be output
99870f4d
KW
9810
9811 # This file is processed like most in this program. Control is passed to
9812 # process_generic_property_file() which calls filter_UnicodeData_line()
9813 # for each input line. This filter converts the input into line(s) that
9814 # process_generic_property_file() understands. There is also a setup
9815 # routine called before any of the file is processed, and a handler for
9816 # EOF processing, all in this closure.
9817
9818 # A huge speed-up occurred at the cost of some added complexity when these
9819 # routines were altered to buffer the outputs into ranges. Almost all the
9820 # lines of the input file apply to just one code point, and for most
9821 # properties, the map for the next code point up is the same as the
9822 # current one. So instead of creating a line for each property for each
9823 # input line, filter_UnicodeData_line() remembers what the previous map
9824 # of a property was, and doesn't generate a line to pass on until it has
9825 # to, as when the map changes; and that passed-on line encompasses the
9826 # whole contiguous range of code points that have the same map for that
9827 # property. This means a slight amount of extra setup, and having to
9828 # flush these buffers on EOF, testing if the maps have changed, plus
9829 # remembering state information in the closure. But it means a lot less
9830 # real time in not having to change the data base for each property on
9831 # each line.
9832
9833 # Another complication is that there are already a few ranges designated
9834 # in the input. There are two lines for each, with the same maps except
9835 # the code point and name on each line. This was actually the hardest
9836 # thing to design around. The code points in those ranges may actually
9837 # have real maps not given by these two lines. These maps will either
98dc9551 9838 # be algorithmically determinable, or in the extracted files furnished
99870f4d
KW
9839 # with the UCD. In the event of conflicts between these extracted files,
9840 # and this one, Unicode says that this one prevails. But it shouldn't
9841 # prevail for conflicts that occur in these ranges. The data from the
9842 # extracted files prevails in those cases. So, this program is structured
9843 # so that those files are processed first, storing maps. Then the other
9844 # files are processed, generally overwriting what the extracted files
9845 # stored. But just the range lines in this input file are processed
9846 # without overwriting. This is accomplished by adding a special string to
9847 # the lines output to tell process_generic_property_file() to turn off the
9848 # overwriting for just this one line.
9849 # A similar mechanism is used to tell it that the map is of a non-default
9850 # type.
9851
9852 sub setup_UnicodeData { # Called before any lines of the input are read
9853 my $file = shift;
9854 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9855
28093d0e
KW
9856 # Create a new property specially located that is a combination of the
9857 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9858 # Name_Alias properties. (The final duplicates elements of the
9859 # first.) A comment for it will later be constructed based on the
9860 # actual properties present and used
3e20195b 9861 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9862 Core_Access => '\N{...} and "use charnames"',
9863 Default_Map => "",
9864 Directory => File::Spec->curdir(),
9865 File => 'Name',
9866 Internal_Only_Warning => 1,
9867 Perl_Extension => 1,
b1c167a3 9868 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9869 Type => $STRING,
9870 );
9871
99870f4d 9872 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9873 Directory => File::Spec->curdir(),
99870f4d 9874 File => 'Decomposition',
a14f3cb1 9875 Format => $DECOMP_STRING_FORMAT,
99870f4d
KW
9876 Internal_Only_Warning => 1,
9877 Perl_Extension => 1,
9878 Default_Map => $CODE_POINT,
9879
0c07e538
KW
9880 # normalize.pm can't cope with these
9881 Output_Range_Counts => 0,
9882
99870f4d
KW
9883 # This is a specially formatted table
9884 # explicitly for normalize.pm, which
9885 # is expecting a particular format,
9886 # which means that mappings containing
9887 # multiple code points are in the main
9888 # body of the table
9889 Map_Type => $COMPUTE_NO_MULTI_CP,
9890 Type => $STRING,
9891 );
9892 $Perl_decomp->add_comment(join_lines(<<END
9893This mapping is a combination of the Unicode 'Decomposition_Type' and
9894'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9895identical to the official Unicode 'Decomposition_Mapping' property except for
9896two things:
9897 1) It omits the algorithmically determinable Hangul syllable decompositions,
9898which normalize.pm handles algorithmically.
9899 2) It contains the decomposition type as well. Non-canonical decompositions
9900begin with a word in angle brackets, like <super>, which denotes the
9901compatible decomposition type. If the map does not begin with the <angle
9902brackets>, the decomposition is canonical.
9903END
9904 ));
9905
9906 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9907 Default_Map => "",
9908 Perl_Extension => 1,
9909 File => 'Digit', # Trad. location
9910 Directory => $map_directory,
9911 Type => $STRING,
9912 Range_Size_1 => 1,
9913 );
9914 $Decimal_Digit->add_comment(join_lines(<<END
9915This file gives the mapping of all code points which represent a single
9916decimal digit [0-9] to their respective digits. For example, the code point
9917U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9918that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9919numerals.
9920END
9921 ));
9922
28093d0e
KW
9923 # These properties are not used for generating anything else, and are
9924 # usually not output. By making them last in the list, we can just
99870f4d 9925 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
9926 # generating a table(s) that is/are just going to get thrown away.
9927 if (! property_ref('Decomposition_Mapping')->to_output_map
9928 && ! property_ref('Name')->to_output_map)
9929 {
9930 $last_field = min($NAME, $DECOMP_MAP) - 1;
9931 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9932 $last_field = $DECOMP_MAP;
9933 } elsif (property_ref('Name')->to_output_map) {
9934 $last_field = $NAME;
99870f4d
KW
9935 }
9936 return;
9937 }
9938
9939 my $first_time = 1; # ? Is this the first line of the file
9940 my $in_range = 0; # ? Are we in one of the file's ranges
9941 my $previous_cp; # hex code point of previous line
9942 my $decimal_previous_cp = -1; # And its decimal equivalent
9943 my @start; # For each field, the current starting
9944 # code point in hex for the range
9945 # being accumulated.
9946 my @fields; # The input fields;
9947 my @previous_fields; # And those from the previous call
9948
9949 sub filter_UnicodeData_line {
9950 # Handle a single input line from UnicodeData.txt; see comments above
9951 # Conceptually this takes a single line from the file containing N
9952 # properties, and converts it into N lines with one property per line,
9953 # which is what the final handler expects. But there are
9954 # complications due to the quirkiness of the input file, and to save
9955 # time, it accumulates ranges where the property values don't change
9956 # and only emits lines when necessary. This is about an order of
9957 # magnitude fewer lines emitted.
9958
9959 my $file = shift;
9960 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9961
9962 # $_ contains the input line.
9963 # -1 in split means retain trailing null fields
9964 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9965
9966 #local $to_trace = 1 if main::DEBUG;
9967 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9968 if (@fields > $input_field_count) {
9969 $file->carp_bad_line('Extra fields');
9970 $_ = "";
9971 return;
9972 }
9973
9974 my $decimal_cp = hex $cp;
9975
9976 # We have to output all the buffered ranges when the next code point
9977 # is not exactly one after the previous one, which means there is a
9978 # gap in the ranges.
9979 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9980
9981 # The decomposition mapping field requires special handling. It looks
9982 # like either:
9983 #
9984 # <compat> 0032 0020
9985 # 0041 0300
9986 #
9987 # The decomposition type is enclosed in <brackets>; if missing, it
9988 # means the type is canonical. There are two decomposition mapping
9989 # tables: the one for use by Perl's normalize.pm has a special format
9990 # which is this field intact; the other, for general use is of
9991 # standard format. In either case we have to find the decomposition
9992 # type. Empty fields have None as their type, and map to the code
9993 # point itself
9994 if ($fields[$PERL_DECOMPOSITION] eq "") {
9995 $fields[$DECOMP_TYPE] = 'None';
9996 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9997 }
9998 else {
9999 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10000 =~ / < ( .+? ) > \s* ( .+ ) /x;
10001 if (! defined $fields[$DECOMP_TYPE]) {
10002 $fields[$DECOMP_TYPE] = 'Canonical';
10003 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10004 }
10005 else {
10006 $fields[$DECOMP_MAP] = $map;
10007 }
10008 }
10009
10010 # The 3 numeric fields also require special handling. The 2 digit
10011 # fields must be either empty or match the number field. This means
10012 # that if it is empty, they must be as well, and the numeric type is
10013 # None, and the numeric value is 'Nan'.
10014 # The decimal digit field must be empty or match the other digit
10015 # field. If the decimal digit field is non-empty, the code point is
10016 # a decimal digit, and the other two fields will have the same value.
10017 # If it is empty, but the other digit field is non-empty, the code
10018 # point is an 'other digit', and the number field will have the same
10019 # value as the other digit field. If the other digit field is empty,
10020 # but the number field is non-empty, the code point is a generic
10021 # numeric type.
10022 if ($fields[$NUMERIC] eq "") {
10023 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10024 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10025 ) {
10026 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10027 }
10028 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10029 $fields[$NUMERIC] = 'NaN';
10030 }
10031 else {
10032 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10033 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10034 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10035 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10036 }
10037 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10038 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10039 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10040 }
10041 else {
10042 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10043
10044 # Rationals require extra effort.
10045 register_fraction($fields[$NUMERIC])
10046 if $fields[$NUMERIC] =~ qr{/};
10047 }
10048 }
10049
10050 # For the properties that have empty fields in the file, and which
10051 # mean something different from empty, change them to that default.
10052 # Certain fields just haven't been empty so far in any Unicode
10053 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10054 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 10055 # the defaults; which are very unlikely to ever change.
99870f4d
KW
10056 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10057 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10058
10059 # UAX44 says that if title is empty, it is the same as whatever upper
10060 # is,
10061 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10062
10063 # There are a few pairs of lines like:
10064 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10065 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10066 # that define ranges. These should be processed after the fields are
10067 # adjusted above, as they may override some of them; but mostly what
28093d0e 10068 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
10069 # paired lines start with a '<', but this is also true of '<control>,
10070 # which isn't one of these special ones.
28093d0e 10071 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
10072
10073 # Some code points in this file have the pseudo-name
10074 # '<control>', but the official name for such ones is the null
28093d0e 10075 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 10076 $fields[$NAME] = "";
28093d0e 10077 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
10078
10079 # We had better not be in between range lines.
10080 if ($in_range) {
28093d0e 10081 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10082 $in_range = 0;
10083 }
10084 }
28093d0e 10085 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
10086
10087 # Here is a non-range line. We had better not be in between range
10088 # lines.
10089 if ($in_range) {
28093d0e 10090 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10091 $in_range = 0;
10092 }
edb80b88
KW
10093 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10094
10095 # These are code points whose names end in their code points,
10096 # which means the names are algorithmically derivable from the
10097 # code points. To shorten the output Name file, the algorithm
10098 # for deriving these is placed in the file instead of each
10099 # code point, so they have map type $CP_IN_NAME
10100 $fields[$CHARNAME] = $CMD_DELIM
10101 . $MAP_TYPE_CMD
10102 . '='
10103 . $CP_IN_NAME
10104 . $CMD_DELIM
10105 . $fields[$CHARNAME];
10106 }
28093d0e 10107 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 10108 }
28093d0e
KW
10109 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10110 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
10111
10112 # Here we are at the beginning of a range pair.
10113 if ($in_range) {
28093d0e 10114 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10115 }
10116 $in_range = 1;
10117
10118 # Because the properties in the range do not overwrite any already
10119 # in the db, we must flush the buffers of what's already there, so
10120 # they get handled in the normal scheme.
10121 $force_output = 1;
10122
10123 }
28093d0e
KW
10124 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10125 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
10126 $_ = "";
10127 return;
10128 }
10129 else { # Here, we are at the last line of a range pair.
10130
10131 if (! $in_range) {
28093d0e 10132 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
10133 $_ = "";
10134 return;
10135 }
10136 $in_range = 0;
10137
28093d0e
KW
10138 $fields[$NAME] = $fields[$CHARNAME];
10139
99870f4d
KW
10140 # Check that the input is valid: that the closing of the range is
10141 # the same as the beginning.
10142 foreach my $i (0 .. $last_field) {
10143 next if $fields[$i] eq $previous_fields[$i];
10144 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10145 }
10146
10147 # The processing differs depending on the type of range,
28093d0e
KW
10148 # determined by its $CHARNAME
10149 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
10150
10151 # Check that the data looks right.
10152 if ($decimal_previous_cp != $SBase) {
10153 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10154 }
10155 if ($decimal_cp != $SBase + $SCount - 1) {
10156 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10157 }
10158
10159 # The Hangul syllable range has a somewhat complicated name
10160 # generation algorithm. Each code point in it has a canonical
10161 # decomposition also computable by an algorithm. The
10162 # perl decomposition map table built from these is used only
10163 # by normalize.pm, which has the algorithm built in it, so the
10164 # decomposition maps are not needed, and are large, so are
10165 # omitted from it. If the full decomposition map table is to
10166 # be output, the decompositions are generated for it, in the
10167 # EOF handling code for this input file.
10168
10169 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10170
10171 # This range is stored in our internal structure with its
10172 # own map type, different from all others.
28093d0e
KW
10173 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10174 = $CMD_DELIM
99870f4d
KW
10175 . $MAP_TYPE_CMD
10176 . '='
10177 . $HANGUL_SYLLABLE
10178 . $CMD_DELIM
28093d0e 10179 . $fields[$CHARNAME];
99870f4d 10180 }
28093d0e 10181 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
10182
10183 # The name for these contains the code point itself, and all
10184 # are defined to have the same base name, regardless of what
10185 # is in the file. They are stored in our internal structure
10186 # with a map type of $CP_IN_NAME
28093d0e
KW
10187 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10188 = $CMD_DELIM
99870f4d
KW
10189 . $MAP_TYPE_CMD
10190 . '='
10191 . $CP_IN_NAME
10192 . $CMD_DELIM
10193 . 'CJK UNIFIED IDEOGRAPH';
10194
10195 }
10196 elsif ($fields[$CATEGORY] eq 'Co'
10197 || $fields[$CATEGORY] eq 'Cs')
10198 {
10199 # The names of all the code points in these ranges are set to
10200 # null, as there are no names for the private use and
10201 # surrogate code points.
10202
28093d0e 10203 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
10204 }
10205 else {
28093d0e 10206 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
10207 }
10208
10209 # The first line of the range caused everything else to be output,
10210 # and then its values were stored as the beginning values for the
10211 # next set of ranges, which this one ends. Now, for each value,
10212 # add a command to tell the handler that these values should not
10213 # replace any existing ones in our database.
10214 foreach my $i (0 .. $last_field) {
10215 $previous_fields[$i] = $CMD_DELIM
10216 . $REPLACE_CMD
10217 . '='
10218 . $NO
10219 . $CMD_DELIM
10220 . $previous_fields[$i];
10221 }
10222
10223 # And change things so it looks like the entire range has been
10224 # gone through with this being the final part of it. Adding the
10225 # command above to each field will cause this range to be flushed
10226 # during the next iteration, as it guaranteed that the stored
10227 # field won't match whatever value the next one has.
10228 $previous_cp = $cp;
10229 $decimal_previous_cp = $decimal_cp;
10230
10231 # We are now set up for the next iteration; so skip the remaining
10232 # code in this subroutine that does the same thing, but doesn't
10233 # know about these ranges.
10234 $_ = "";
c1739a4a 10235
99870f4d
KW
10236 return;
10237 }
10238
10239 # On the very first line, we fake it so the code below thinks there is
10240 # nothing to output, and initialize so that when it does get output it
10241 # uses the first line's values for the lowest part of the range.
10242 # (One could avoid this by using peek(), but then one would need to
10243 # know the adjustments done above and do the same ones in the setup
10244 # routine; not worth it)
10245 if ($first_time) {
10246 $first_time = 0;
10247 @previous_fields = @fields;
10248 @start = ($cp) x scalar @fields;
10249 $decimal_previous_cp = $decimal_cp - 1;
10250 }
10251
10252 # For each field, output the stored up ranges that this code point
10253 # doesn't fit in. Earlier we figured out if all ranges should be
10254 # terminated because of changing the replace or map type styles, or if
10255 # there is a gap between this new code point and the previous one, and
10256 # that is stored in $force_output. But even if those aren't true, we
10257 # need to output the range if this new code point's value for the
10258 # given property doesn't match the stored range's.
10259 #local $to_trace = 1 if main::DEBUG;
10260 foreach my $i (0 .. $last_field) {
10261 my $field = $fields[$i];
10262 if ($force_output || $field ne $previous_fields[$i]) {
10263
10264 # Flush the buffer of stored values.
10265 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10266
10267 # Start a new range with this code point and its value
10268 $start[$i] = $cp;
10269 $previous_fields[$i] = $field;
10270 }
10271 }
10272
10273 # Set the values for the next time.
10274 $previous_cp = $cp;
10275 $decimal_previous_cp = $decimal_cp;
10276
10277 # The input line has generated whatever adjusted lines are needed, and
10278 # should not be looked at further.
10279 $_ = "";
10280 return;
10281 }
10282
10283 sub EOF_UnicodeData {
10284 # Called upon EOF to flush the buffers, and create the Hangul
10285 # decomposition mappings if needed.
10286
10287 my $file = shift;
10288 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10289
10290 # Flush the buffers.
10291 foreach my $i (1 .. $last_field) {
10292 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10293 }
10294
10295 if (-e 'Jamo.txt') {
10296
10297 # The algorithm is published by Unicode, based on values in
10298 # Jamo.txt, (which should have been processed before this
10299 # subroutine), and the results left in %Jamo
10300 unless (%Jamo) {
10301 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10302 return;
10303 }
10304
10305 # If the full decomposition map table is being output, insert
10306 # into it the Hangul syllable mappings. This is to avoid having
10307 # to publish a subroutine in it to compute them. (which would
10308 # essentially be this code.) This uses the algorithm published by
10309 # Unicode.
10310 if (property_ref('Decomposition_Mapping')->to_output_map) {
10311 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10312 use integer;
10313 my $SIndex = $S - $SBase;
10314 my $L = $LBase + $SIndex / $NCount;
10315 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10316 my $T = $TBase + $SIndex % $TCount;
10317
10318 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10319 my $decomposition = sprintf("%04X %04X", $L, $V);
10320 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10321 $file->insert_adjusted_lines(
10322 sprintf("%04X; Decomposition_Mapping; %s",
10323 $S,
10324 $decomposition));
10325 }
10326 }
10327 }
10328
10329 return;
10330 }
10331
10332 sub filter_v1_ucd {
10333 # Fix UCD lines in version 1. This is probably overkill, but this
10334 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10335 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10336 # removed. This program retains them
10337 # 2) didn't include ranges, which it should have, and which are now
10338 # added in @corrected_lines below. It was hand populated by
10339 # taking the data from Version 2, verified by analyzing
10340 # DAge.txt.
10341 # 3) There is a syntax error in the entry for U+09F8 which could
10342 # cause problems for utf8_heavy, and so is changed. It's
10343 # numeric value was simply a minus sign, without any number.
10344 # (Eventually Unicode changed the code point to non-numeric.)
10345 # 4) The decomposition types often don't match later versions
10346 # exactly, and the whole syntax of that field is different; so
10347 # the syntax is changed as well as the types to their later
10348 # terminology. Otherwise normalize.pm would be very unhappy
10349 # 5) Many ccc classes are different. These are left intact.
10350 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10351 # fields. These are unchanged because it doesn't really cause
10352 # problems for Perl.
10353 # 7) A number of code points, such as controls, don't have their
10354 # Unicode Version 1 Names in this file. These are unchanged.
10355
10356 my @corrected_lines = split /\n/, <<'END';
103574E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
103589FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10359E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10360F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10361F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10362FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10363END
10364
10365 my $file = shift;
10366 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10367
10368 #local $to_trace = 1 if main::DEBUG;
10369 trace $_ if main::DEBUG && $to_trace;
10370
10371 # -1 => retain trailing null fields
10372 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10373
10374 # At the first place that is wrong in the input, insert all the
10375 # corrections, replacing the wrong line.
10376 if ($code_point eq '4E00') {
10377 my @copy = @corrected_lines;
10378 $_ = shift @copy;
10379 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10380
10381 $file->insert_lines(@copy);
10382 }
10383
10384
10385 if ($fields[$NUMERIC] eq '-') {
10386 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10387 }
10388
10389 if ($fields[$PERL_DECOMPOSITION] ne "") {
10390
10391 # Several entries have this change to superscript 2 or 3 in the
10392 # middle. Convert these to the modern version, which is to use
10393 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10394 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10395 # 'HHHH HHHH 00B3 HHHH'.
10396 # It turns out that all of these that don't have another
10397 # decomposition defined at the beginning of the line have the
10398 # <square> decomposition in later releases.
10399 if ($code_point ne '00B2' && $code_point ne '00B3') {
10400 if ($fields[$PERL_DECOMPOSITION]
10401 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10402 {
10403 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10404 $fields[$PERL_DECOMPOSITION] = '<square> '
10405 . $fields[$PERL_DECOMPOSITION];
10406 }
10407 }
10408 }
10409
10410 # If is like '<+circled> 0052 <-circled>', convert to
10411 # '<circled> 0052'
10412 $fields[$PERL_DECOMPOSITION] =~
10413 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10414
10415 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10416 $fields[$PERL_DECOMPOSITION] =~
10417 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10418 or $fields[$PERL_DECOMPOSITION] =~
10419 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10420 or $fields[$PERL_DECOMPOSITION] =~
10421 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10422 or $fields[$PERL_DECOMPOSITION] =~
10423 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10424
10425 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10426 $fields[$PERL_DECOMPOSITION] =~
10427 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10428
10429 # Change names to modern form.
10430 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10431 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10432 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10433 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10434
10435 # One entry has weird braces
10436 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10437 }
10438
10439 $_ = join ';', $code_point, @fields;
10440 trace $_ if main::DEBUG && $to_trace;
10441 return;
10442 }
10443
10444 sub filter_v2_1_5_ucd {
10445 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10446 # columns swapped; These all had mirrored be 'N'. So if the numeric
10447 # column appears to be N, swap it back.
10448
10449 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10450 if ($fields[$NUMERIC] eq 'N') {
10451 $fields[$NUMERIC] = $fields[$MIRRORED];
10452 $fields[$MIRRORED] = 'N';
10453 $_ = join ';', $code_point, @fields;
10454 }
10455 return;
10456 }
3ffed8c2
KW
10457
10458 sub filter_v6_ucd {
10459
c12f2655
KW
10460 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10461 # accepted that yet to allow for some deprecation cycles.
3ffed8c2 10462
484741e1 10463 return if $_ !~ /^(?:0007|1F514|070F);/;
3ffed8c2
KW
10464
10465 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10466 if ($code_point eq '0007') {
0e429600 10467 $fields[$CHARNAME] = "ALERT";
3ffed8c2 10468 }
484741e1
KW
10469 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10470 # http://www.unicode.org/versions/corrigendum8.html
10471 $fields[$BIDI] = "AL";
10472 }
10914c78 10473 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
3ffed8c2
KW
10474 $fields[$CHARNAME] = "";
10475 }
10476
10477 $_ = join ';', $code_point, @fields;
10478
10479 return;
10480 }
99870f4d
KW
10481} # End closure for UnicodeData
10482
37e2e78e
KW
10483sub process_GCB_test {
10484
10485 my $file = shift;
10486 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10487
10488 while ($file->next_line) {
10489 push @backslash_X_tests, $_;
10490 }
678f13d5 10491
37e2e78e
KW
10492 return;
10493}
10494
99870f4d
KW
10495sub process_NamedSequences {
10496 # NamedSequences.txt entries are just added to an array. Because these
10497 # don't look like the other tables, they have their own handler.
10498 # An example:
10499 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10500 #
10501 # This just adds the sequence to an array for later handling
10502
99870f4d
KW
10503 my $file = shift;
10504 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10505
10506 while ($file->next_line) {
10507 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10508 if (@remainder) {
10509 $file->carp_bad_line(
10510 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10511 next;
10512 }
fb121860
KW
10513
10514 # Note single \t in keeping with special output format of
10515 # Perl_charnames. But it turns out that the code points don't have to
10516 # be 5 digits long, like the rest, based on the internal workings of
10517 # charnames.pm. This could be easily changed for consistency.
10518 push @named_sequences, "$sequence\t$name";
99870f4d
KW
10519 }
10520 return;
10521}
10522
10523{ # Closure
10524
10525 my $first_range;
10526
10527 sub filter_early_ea_lb {
10528 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10529 # third field be the name of the code point, which can be ignored in
10530 # most cases. But it can be meaningful if it marks a range:
10531 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10532 # 3400;W;<CJK Ideograph Extension A, First>
10533 #
10534 # We need to see the First in the example above to know it's a range.
10535 # They did not use the later range syntaxes. This routine changes it
10536 # to use the modern syntax.
10537 # $1 is the Input_file object.
10538
10539 my @fields = split /\s*;\s*/;
10540 if ($fields[2] =~ /^<.*, First>/) {
10541 $first_range = $fields[0];
10542 $_ = "";
10543 }
10544 elsif ($fields[2] =~ /^<.*, Last>/) {
10545 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10546 }
10547 else {
10548 undef $first_range;
10549 $_ = "$fields[0]; $fields[1]";
10550 }
10551
10552 return;
10553 }
10554}
10555
10556sub filter_old_style_arabic_shaping {
10557 # Early versions used a different term for the later one.
10558
10559 my @fields = split /\s*;\s*/;
10560 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10561 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10562 $_ = join ';', @fields;
10563 return;
10564}
10565
10566sub filter_arabic_shaping_line {
10567 # ArabicShaping.txt has entries that look like:
10568 # 062A; TEH; D; BEH
10569 # The field containing 'TEH' is not used. The next field is Joining_Type
10570 # and the last is Joining_Group
10571 # This generates two lines to pass on, one for each property on the input
10572 # line.
10573
10574 my $file = shift;
10575 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10576
10577 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10578
10579 if (@fields > 4) {
10580 $file->carp_bad_line('Extra fields');
10581 $_ = "";
10582 return;
10583 }
10584
10585 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10586 $_ = "$fields[0]; Joining_Type; $fields[2]";
10587
10588 return;
10589}
10590
d3fed3dd
KW
10591{ # Closure
10592 my $lc; # Table for lowercase mapping
10593 my $tc;
10594 my $uc;
10595
6c0259ad
KW
10596 sub setup_special_casing {
10597 # SpecialCasing.txt contains the non-simple case change mappings. The
10598 # simple ones are in UnicodeData.txt, which should already have been
10599 # read in to the full property data structures, so as to initialize
10600 # these with the simple ones. Then the SpecialCasing.txt entries
10601 # overwrite the ones which have different full mappings.
10602
10603 # This routine sees if the simple mappings are to be output, and if
10604 # so, copies what has already been put into the full mapping tables,
10605 # while they still contain only the simple mappings.
10606
10607 # The reason it is done this way is that the simple mappings are
10608 # probably not going to be output, so it saves work to initialize the
10609 # full tables with the simple mappings, and then overwrite those
10610 # relatively few entries in them that have different full mappings,
10611 # and thus skip the simple mapping tables altogether.
10612
c12f2655
KW
10613 # New tables with just the simple mappings that are overridden by the
10614 # full ones are constructed. These are for Unicode::UCD, which
10615 # requires the simple mappings. The Case_Folding table is a combined
10616 # table of both the simple and full mappings, with the full ones being
10617 # in the hash, and the simple ones, even those overridden by the hash,
10618 # being in the base table. That same mechanism could have been
10619 # employed here, except that the docs have said that the generated
10620 # files are usuable directly by programs, so we dare not change the
10621 # format in any way.
10622
6c0259ad
KW
10623 my $file= shift;
10624 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10625
6c0259ad
KW
10626 $lc = property_ref('lc');
10627 $tc = property_ref('tc');
10628 $uc = property_ref('uc');
10629
10630 # For each of the case change mappings...
10631 foreach my $case_table ($lc, $tc, $uc) {
10632 my $case = $case_table->name;
10633 my $full = property_ref($case);
10634 unless (defined $full && ! $full->is_empty) {
10635 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10636 }
10637
10638 # The simple version's name in each mapping merely has an 's' in
10639 # front of the full one's
10640 my $simple = property_ref('s' . $case);
10641 $simple->initialize($full) if $simple->to_output_map();
10642
10643 my $simple_only = Property->new("_s$case",
10644 Type => $STRING,
10645 Default_Map => $CODE_POINT,
10646 Perl_Extension => 1,
10647 Description => "The simple mappings for $case for code points that have full mappings as well");
10648 $simple_only->set_to_output_map($INTERNAL_MAP);
10649 $simple_only->add_comment(join_lines( <<END
d3fed3dd
KW
10650This file is for UCD.pm so that it can construct simple mappings that would
10651otherwise be lost because they are overridden by full mappings.
10652END
6c0259ad
KW
10653 ));
10654 }
99870f4d 10655
6c0259ad
KW
10656 return;
10657 }
99870f4d 10658
6c0259ad
KW
10659 sub filter_special_casing_line {
10660 # Change the format of $_ from SpecialCasing.txt into something that
10661 # the generic handler understands. Each input line contains three
10662 # case mappings. This will generate three lines to pass to the
10663 # generic handler for each of those.
99870f4d 10664
6c0259ad
KW
10665 # The input syntax (after stripping comments and trailing white space
10666 # is like one of the following (with the final two being entries that
10667 # we ignore):
10668 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10669 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10670 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10671 # Note the trailing semi-colon, unlike many of the input files. That
10672 # means that there will be an extra null field generated by the split
99870f4d 10673
6c0259ad
KW
10674 my $file = shift;
10675 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10676
6c0259ad
KW
10677 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10678 # fields
10679
10680 # field #4 is when this mapping is conditional. If any of these get
10681 # implemented, it would be by hard-coding in the casing functions in
10682 # the Perl core, not through tables. But if there is a new condition
10683 # we don't know about, output a warning. We know about all the
10684 # conditions through 6.0
10685 if ($fields[4] ne "") {
10686 my @conditions = split ' ', $fields[4];
10687 if ($conditions[0] ne 'tr' # We know that these languages have
10688 # conditions, and some are multiple
10689 && $conditions[0] ne 'az'
10690 && $conditions[0] ne 'lt'
10691
10692 # And, we know about a single condition Final_Sigma, but
10693 # nothing else.
10694 && ($v_version gt v5.2.0
10695 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10696 {
10697 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10698 }
10699 elsif ($conditions[0] ne 'Final_Sigma') {
99870f4d 10700
6c0259ad
KW
10701 # Don't print out a message for Final_Sigma, because we
10702 # have hard-coded handling for it. (But the standard
10703 # could change what the rule should be, but it wouldn't
10704 # show up here anyway.
99870f4d 10705
6c0259ad 10706 print "# SKIPPING Special Casing: $_\n"
99870f4d 10707 if $verbosity >= $VERBOSE;
6c0259ad
KW
10708 }
10709 $_ = "";
10710 return;
10711 }
10712 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10713 $file->carp_bad_line('Extra fields');
10714 $_ = "";
10715 return;
99870f4d 10716 }
99870f4d 10717
6c0259ad
KW
10718 $_ = "$fields[0]; lc; $fields[1]";
10719 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10720 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
99870f4d 10721
6c0259ad
KW
10722 # Copy any simple case change to the special tables constructed if
10723 # being overridden by a multi-character case change.
10724 if ($fields[1] ne $fields[0]
10725 && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10726 {
10727 $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10728 }
10729 if ($fields[2] ne $fields[0]
10730 && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10731 {
10732 $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10733 }
10734 if ($fields[3] ne $fields[0]
10735 && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10736 {
10737 $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10738 }
d3fed3dd 10739
6c0259ad
KW
10740 return;
10741 }
d3fed3dd 10742}
99870f4d
KW
10743
10744sub filter_old_style_case_folding {
10745 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 10746 # and later style. Different letters were used in the earlier.
99870f4d
KW
10747
10748 my $file = shift;
10749 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10750
10751 my @fields = split /\s*;\s*/;
10752 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10753 $fields[1] = 'I';
10754 }
10755 elsif ($fields[1] eq 'L') {
10756 $fields[1] = 'C'; # L => C always
10757 }
10758 elsif ($fields[1] eq 'E') {
10759 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10760 $fields[1] = 'F'
10761 }
10762 else {
10763 $fields[1] = 'C'
10764 }
10765 }
10766 else {
10767 $file->carp_bad_line("Expecting L or E in second field");
10768 $_ = "";
10769 return;
10770 }
10771 $_ = join("; ", @fields) . ';';
10772 return;
10773}
10774
10775{ # Closure for case folding
10776
10777 # Create the map for simple only if are going to output it, for otherwise
10778 # it takes no part in anything we do.
10779 my $to_output_simple;
10780
99870f4d
KW
10781 sub setup_case_folding($) {
10782 # Read in the case foldings in CaseFolding.txt. This handles both
10783 # simple and full case folding.
10784
10785 $to_output_simple
10786 = property_ref('Simple_Case_Folding')->to_output_map;
10787
10788 return;
10789 }
10790
10791 sub filter_case_folding_line {
10792 # Called for each line in CaseFolding.txt
10793 # Input lines look like:
10794 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10795 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10796 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10797 #
10798 # 'C' means that folding is the same for both simple and full
10799 # 'F' that it is only for full folding
10800 # 'S' that it is only for simple folding
10801 # 'T' is locale-dependent, and ignored
10802 # 'I' is a type of 'F' used in some early releases.
10803 # Note the trailing semi-colon, unlike many of the input files. That
10804 # means that there will be an extra null field generated by the split
10805 # below, which we ignore and hence is not an error.
10806
10807 my $file = shift;
10808 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10809
10810 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10811 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10812 $file->carp_bad_line('Extra fields');
10813 $_ = "";
10814 return;
10815 }
10816
10817 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10818 $_ = "";
10819 return;
10820 }
10821
10822 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
3c099872
KW
10823 # I are all full foldings; S is single-char. For S, there is always
10824 # an F entry, so we must allow multiple values for the same code
10825 # point. Fortunately this table doesn't need further manipulation
10826 # which would preclude using multiple-values. The S is now included
10827 # so that _swash_inversion_hash() is able to construct closures
10828 # without having to worry about F mappings.
10829 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10830 $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
99870f4d
KW
10831 }
10832 else {
10833 $_ = "";
3c099872 10834 $file->carp_bad_line('Expecting C F I S or T in second field');
99870f4d
KW
10835 }
10836
10837 # C and S are simple foldings, but simple case folding is not needed
10838 # unless we explicitly want its map table output.
10839 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10840 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10841 }
10842
99870f4d
KW
10843 return;
10844 }
10845
99870f4d
KW
10846} # End case fold closure
10847
10848sub filter_jamo_line {
10849 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10850 # from this file that is used in generating the Name property for Jamo
10851 # code points. But, it also is used to convert early versions' syntax
10852 # into the modern form. Here are two examples:
10853 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10854 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10855 #
10856 # The input is $_, the output is $_ filtered.
10857
10858 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10859
10860 # Let the caller handle unexpected input. In earlier versions, there was
10861 # a third field which is supposed to be a comment, but did not have a '#'
10862 # before it.
10863 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10864
10865 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10866 # beginning.
10867
10868 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10869 $fields[1] = 'R' if $fields[0] eq '1105';
10870
10871 # Add to structure so can generate Names from it.
10872 my $cp = hex $fields[0];
10873 my $short_name = $fields[1];
10874 $Jamo{$cp} = $short_name;
10875 if ($cp <= $LBase + $LCount) {
10876 $Jamo_L{$short_name} = $cp - $LBase;
10877 }
10878 elsif ($cp <= $VBase + $VCount) {
10879 $Jamo_V{$short_name} = $cp - $VBase;
10880 }
10881 elsif ($cp <= $TBase + $TCount) {
10882 $Jamo_T{$short_name} = $cp - $TBase;
10883 }
10884 else {
10885 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10886 }
10887
10888
10889 # Reassemble using just the first two fields to look like a typical
10890 # property file line
10891 $_ = "$fields[0]; $fields[1]";
10892
10893 return;
10894}
10895
99870f4d
KW
10896sub register_fraction($) {
10897 # This registers the input rational number so that it can be passed on to
10898 # utf8_heavy.pl, both in rational and floating forms.
10899
10900 my $rational = shift;
10901
10902 my $float = eval $rational;
10903 $nv_floating_to_rational{$float} = $rational;
10904 return;
10905}
10906
10907sub filter_numeric_value_line {
10908 # DNumValues contains lines of a different syntax than the typical
10909 # property file:
10910 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10911 #
10912 # This routine transforms $_ containing the anomalous syntax to the
10913 # typical, by filtering out the extra columns, and convert early version
10914 # decimal numbers to strings that look like rational numbers.
10915
10916 my $file = shift;
10917 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10918
10919 # Starting in 5.1, there is a rational field. Just use that, omitting the
10920 # extra columns. Otherwise convert the decimal number in the second field
10921 # to a rational, and omit extraneous columns.
10922 my @fields = split /\s*;\s*/, $_, -1;
10923 my $rational;
10924
10925 if ($v_version ge v5.1.0) {
10926 if (@fields != 4) {
10927 $file->carp_bad_line('Not 4 semi-colon separated fields');
10928 $_ = "";
10929 return;
10930 }
10931 $rational = $fields[3];
10932 $_ = join '; ', @fields[ 0, 3 ];
10933 }
10934 else {
10935
10936 # Here, is an older Unicode file, which has decimal numbers instead of
10937 # rationals in it. Use the fraction to calculate the denominator and
10938 # convert to rational.
10939
10940 if (@fields != 2 && @fields != 3) {
10941 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10942 $_ = "";
10943 return;
10944 }
10945
10946 my $codepoints = $fields[0];
10947 my $decimal = $fields[1];
10948 if ($decimal =~ s/\.0+$//) {
10949
10950 # Anything ending with a decimal followed by nothing but 0's is an
10951 # integer
10952 $_ = "$codepoints; $decimal";
10953 $rational = $decimal;
10954 }
10955 else {
10956
10957 my $denominator;
10958 if ($decimal =~ /\.50*$/) {
10959 $denominator = 2;
10960 }
10961
10962 # Here have the hardcoded repeating decimals in the fraction, and
10963 # the denominator they imply. There were only a few denominators
10964 # in the older Unicode versions of this file which this code
10965 # handles, so it is easy to convert them.
10966
10967 # The 4 is because of a round-off error in the Unicode 3.2 files
10968 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10969 $denominator = 3;
10970 }
10971 elsif ($decimal =~ /\.[27]50*$/) {
10972 $denominator = 4;
10973 }
10974 elsif ($decimal =~ /\.[2468]0*$/) {
10975 $denominator = 5;
10976 }
10977 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10978 $denominator = 6;
10979 }
10980 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10981 $denominator = 8;
10982 }
10983 if ($denominator) {
10984 my $sign = ($decimal < 0) ? "-" : "";
10985 my $numerator = int((abs($decimal) * $denominator) + .5);
10986 $rational = "$sign$numerator/$denominator";
10987 $_ = "$codepoints; $rational";
10988 }
10989 else {
10990 $file->carp_bad_line("Can't cope with number '$decimal'.");
10991 $_ = "";
10992 return;
10993 }
10994 }
10995 }
10996
10997 register_fraction($rational) if $rational =~ qr{/};
10998 return;
10999}
11000
11001{ # Closure
11002 my %unihan_properties;
11003 my $iicore;
11004
11005
11006 sub setup_unihan {
11007 # Do any special setup for Unihan properties.
11008
11009 # This property gives the wrong computed type, so override.
11010 my $usource = property_ref('kIRG_USource');
11011 $usource->set_type($STRING) if defined $usource;
11012
b2abbe5b
KW
11013 # This property is to be considered binary (it says so in
11014 # http://www.unicode.org/reports/tr38/)
99870f4d
KW
11015 $iicore = property_ref('kIICore');
11016 if (defined $iicore) {
b2abbe5b 11017 $iicore->set_type($BINARY);
99870f4d
KW
11018
11019 # We have to change the default map, because the @missing line is
11020 # misleading, given that we are treating it as binary.
11021 $iicore->set_default_map('N');
cc6d1d88
KW
11022 $iicore->table("Y")
11023 ->add_note("Converted to a binary property as per unicode.org UAX #38.");
99870f4d
KW
11024 }
11025
11026 return;
11027 }
11028
11029 sub filter_unihan_line {
11030 # Change unihan db lines to look like the others in the db. Here is
11031 # an input sample:
11032 # U+341C kCangjie IEKN
11033
11034 # Tabs are used instead of semi-colons to separate fields; therefore
11035 # they may have semi-colons embedded in them. Change these to periods
11036 # so won't screw up the rest of the code.
11037 s/;/./g;
11038
11039 # Remove lines that don't look like ones we accept.
11040 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11041 $_ = "";
11042 return;
11043 }
11044
11045 # Extract the property, and save a reference to its object.
11046 my $property = $1;
11047 if (! exists $unihan_properties{$property}) {
11048 $unihan_properties{$property} = property_ref($property);
11049 }
11050
11051 # Don't do anything unless the property is one we're handling, which
11052 # we determine by seeing if there is an object defined for it or not
11053 if (! defined $unihan_properties{$property}) {
11054 $_ = "";
11055 return;
11056 }
11057
11058 # The iicore property is supposed to be a boolean, so convert to our
11059 # standard boolean form.
11060 if (defined $iicore && $unihan_properties{$property} == $iicore) {
11061 $_ =~ s/$property.*/$property\tY/
11062 }
11063
11064 # Convert the tab separators to our standard semi-colons, and convert
11065 # the U+HHHH notation to the rest of the standard's HHHH
11066 s/\t/;/g;
11067 s/\b U \+ (?= $code_point_re )//xg;
11068
11069 #local $to_trace = 1 if main::DEBUG;
11070 trace $_ if main::DEBUG && $to_trace;
11071
11072 return;
11073 }
11074}
11075
11076sub filter_blocks_lines {
11077 # In the Blocks.txt file, the names of the blocks don't quite match the
11078 # names given in PropertyValueAliases.txt, so this changes them so they
11079 # do match: Blanks and hyphens are changed into underscores. Also makes
11080 # early release versions look like later ones
11081 #
11082 # $_ is transformed to the correct value.
11083
11084 my $file = shift;
11085 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11086
11087 if ($v_version lt v3.2.0) {
11088 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11089 $_ = "";
11090 return;
11091 }
11092
11093 # Old versions used a different syntax to mark the range.
11094 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11095 }
11096
11097 my @fields = split /\s*;\s*/, $_, -1;
11098 if (@fields != 2) {
11099 $file->carp_bad_line("Expecting exactly two fields");
11100 $_ = "";
11101 return;
11102 }
11103
11104 # Change hyphens and blanks in the block name field only
11105 $fields[1] =~ s/[ -]/_/g;
11106 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11107
11108 $_ = join("; ", @fields);
11109 return;
11110}
11111
11112{ # Closure
11113 my $current_property;
11114
11115 sub filter_old_style_proplist {
11116 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11117 # was in a completely different syntax. Ken Whistler of Unicode says
11118 # that it was something he used as an aid for his own purposes, but
11119 # was never an official part of the standard. However, comments in
11120 # DAge.txt indicate that non-character code points were available in
11121 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11122 # there except through this file (but on the other hand, they first
11123 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11124 # not. But the claim is that it was published as an aid to others who
11125 # might want some more information than was given in the official UCD
11126 # of the time. Many of the properties in it were incorporated into
11127 # the later PropList.txt, but some were not. This program uses this
11128 # early file to generate property tables that are otherwise not
11129 # accessible in the early UCD's, and most were probably not really
11130 # official at that time, so one could argue that it should be ignored,
11131 # and you can easily modify things to skip this. And there are bugs
11132 # in this file in various versions. (For example, the 2.1.9 version
11133 # removes from Alphabetic the CJK range starting at 4E00, and they
11134 # weren't added back in until 3.1.0.) Many of this file's properties
11135 # were later sanctioned, so this code generates tables for those
11136 # properties that aren't otherwise in the UCD of the time but
11137 # eventually did become official, and throws away the rest. Here is a
11138 # list of all the ones that are thrown away:
11139 # Bidi=* duplicates UnicodeData.txt
11140 # Combining never made into official property;
11141 # is \P{ccc=0}
11142 # Composite never made into official property.
11143 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11144 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11145 # Delimiter never made into official property;
11146 # removed in 3.0.1
11147 # Format Control never made into official property;
11148 # similar to gc=cf
11149 # High Surrogate duplicates Blocks.txt
11150 # Ignorable Control never made into official property;
11151 # similar to di=y
11152 # ISO Control duplicates UnicodeData.txt: gc=cc
11153 # Left of Pair never made into official property;
11154 # Line Separator duplicates UnicodeData.txt: gc=zl
11155 # Low Surrogate duplicates Blocks.txt
11156 # Non-break was actually listed as a property
11157 # in 3.2, but without any code
11158 # points. Unicode denies that this
11159 # was ever an official property
11160 # Non-spacing duplicate UnicodeData.txt: gc=mn
11161 # Numeric duplicates UnicodeData.txt: gc=cc
11162 # Paired Punctuation never made into official property;
11163 # appears to be gc=ps + gc=pe
11164 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11165 # Private Use duplicates UnicodeData.txt: gc=co
11166 # Private Use High Surrogate duplicates Blocks.txt
11167 # Punctuation duplicates UnicodeData.txt: gc=p
11168 # Space different definition than eventual
11169 # one.
11170 # Titlecase duplicates UnicodeData.txt: gc=lt
11171 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
98dc9551 11172 # Zero-width never made into official property;
99870f4d
KW
11173 # subset of gc=cf
11174 # Most of the properties have the same names in this file as in later
11175 # versions, but a couple do not.
11176 #
11177 # This subroutine filters $_, converting it from the old style into
11178 # the new style. Here's a sample of the old-style
11179 #
11180 # *******************************************
11181 #
11182 # Property dump for: 0x100000A0 (Join Control)
11183 #
11184 # 200C..200D (2 chars)
11185 #
11186 # In the example, the property is "Join Control". It is kept in this
11187 # closure between calls to the subroutine. The numbers beginning with
11188 # 0x were internal to Ken's program that generated this file.
11189
11190 # If this line contains the property name, extract it.
11191 if (/^Property dump for: [^(]*\((.*)\)/) {
11192 $_ = $1;
11193
11194 # Convert white space to underscores.
11195 s/ /_/g;
11196
11197 # Convert the few properties that don't have the same name as
11198 # their modern counterparts
11199 s/Identifier_Part/ID_Continue/
11200 or s/Not_a_Character/NChar/;
11201
11202 # If the name matches an existing property, use it.
11203 if (defined property_ref($_)) {
11204 trace "new property=", $_ if main::DEBUG && $to_trace;
11205 $current_property = $_;
11206 }
11207 else { # Otherwise discard it
11208 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11209 undef $current_property;
11210 }
11211 $_ = ""; # The property is saved for the next lines of the
11212 # file, but this defining line is of no further use,
11213 # so clear it so that the caller won't process it
11214 # further.
11215 }
11216 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11217
11218 # Here, the input line isn't a header defining a property for the
11219 # following section, and either we aren't in such a section, or
11220 # the line doesn't look like one that defines the code points in
11221 # such a section. Ignore this line.
11222 $_ = "";
11223 }
11224 else {
11225
11226 # Here, we have a line defining the code points for the current
11227 # stashed property. Anything starting with the first blank is
11228 # extraneous. Otherwise, it should look like a normal range to
11229 # the caller. Append the property name so that it looks just like
11230 # a modern PropList entry.
11231
11232 $_ =~ s/\s.*//;
11233 $_ .= "; $current_property";
11234 }
11235 trace $_ if main::DEBUG && $to_trace;
11236 return;
11237 }
11238} # End closure for old style proplist
11239
11240sub filter_old_style_normalization_lines {
11241 # For early releases of Unicode, the lines were like:
11242 # 74..2A76 ; NFKD_NO
11243 # For later releases this became:
11244 # 74..2A76 ; NFKD_QC; N
11245 # Filter $_ to look like those in later releases.
11246 # Similarly for MAYBEs
11247
11248 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11249
11250 # Also, the property FC_NFKC was abbreviated to FNC
11251 s/FNC/FC_NFKC/;
11252 return;
11253}
11254
82aed44a
KW
11255sub setup_script_extensions {
11256 # The Script_Extensions property starts out with a clone of the Script
11257 # property.
11258
11259 my $sc = property_ref("Script");
11260 my $scx = Property->new("scx", Full_Name => "Script_Extensions",
11261 Initialize => $sc,
11262 Default_Map => $sc->default_map,
11263 Pre_Declared_Maps => 0,
11264 );
11265 $scx->add_comment(join_lines( <<END
11266The values for code points that appear in one script are just the same as for
11267the 'Script' property. Likewise the values for those that appear in many
11268scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11269values of code points that appear in a few scripts are a space separated list
11270of those scripts.
11271END
11272 ));
11273
11274 # Make the scx's tables and aliases for them the same as sc's
11275 foreach my $table ($sc->tables) {
11276 my $scx_table = $scx->add_match_table($table->name,
11277 Full_Name => $table->full_name);
11278 foreach my $alias ($table->aliases) {
11279 $scx_table->add_alias($alias->name);
11280 }
11281 }
11282}
11283
fbe1e607
KW
11284sub filter_script_extensions_line {
11285 # The Scripts file comes with the full name for the scripts; the
11286 # ScriptExtensions, with the short name. The final mapping file is a
11287 # combination of these, and without adjustment, would have inconsistent
11288 # entries. This filters the latter file to convert to full names.
11289 # Entries look like this:
11290 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
11291
11292 my @fields = split /\s*;\s*/;
11293 my @full_names;
11294 foreach my $short_name (split " ", $fields[1]) {
11295 push @full_names, $script->table($short_name)->full_name;
11296 }
11297 $fields[1] = join " ", @full_names;
11298 $_ = join "; ", @fields;
11299
11300 return;
11301}
11302
99870f4d
KW
11303sub finish_Unicode() {
11304 # This routine should be called after all the Unicode files have been read
11305 # in. It:
11306 # 1) Adds the mappings for code points missing from the files which have
11307 # defaults specified for them.
11308 # 2) At this this point all mappings are known, so it computes the type of
11309 # each property whose type hasn't been determined yet.
11310 # 3) Calculates all the regular expression match tables based on the
11311 # mappings.
11312 # 3) Calculates and adds the tables which are defined by Unicode, but
11313 # which aren't derived by them
11314
11315 # For each property, fill in any missing mappings, and calculate the re
11316 # match tables. If a property has more than one missing mapping, the
11317 # default is a reference to a data structure, and requires data from other
11318 # properties to resolve. The sort is used to cause these to be processed
11319 # last, after all the other properties have been calculated.
11320 # (Fortunately, the missing properties so far don't depend on each other.)
11321 foreach my $property
11322 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11323 property_ref('*'))
11324 {
11325 # $perl has been defined, but isn't one of the Unicode properties that
11326 # need to be finished up.
11327 next if $property == $perl;
11328
11329 # Handle the properties that have more than one possible default
11330 if (ref $property->default_map) {
11331 my $default_map = $property->default_map;
11332
11333 # These properties have stored in the default_map:
11334 # One or more of:
11335 # 1) A default map which applies to all code points in a
11336 # certain class
11337 # 2) an expression which will evaluate to the list of code
11338 # points in that class
11339 # And
11340 # 3) the default map which applies to every other missing code
11341 # point.
11342 #
11343 # Go through each list.
11344 while (my ($default, $eval) = $default_map->get_next_defaults) {
11345
11346 # Get the class list, and intersect it with all the so-far
11347 # unspecified code points yielding all the code points
11348 # in the class that haven't been specified.
11349 my $list = eval $eval;
11350 if ($@) {
11351 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11352 last;
11353 }
11354
11355 # Narrow down the list to just those code points we don't have
11356 # maps for yet.
11357 $list = $list & $property->inverse_list;
11358
11359 # Add mappings to the property for each code point in the list
11360 foreach my $range ($list->ranges) {
56343c78
KW
11361 $property->add_map($range->start, $range->end, $default,
11362 Replace => $CROAK);
99870f4d
KW
11363 }
11364 }
11365
11366 # All remaining code points have the other mapping. Set that up
11367 # so the normal single-default mapping code will work on them
11368 $property->set_default_map($default_map->other_default);
11369
11370 # And fall through to do that
11371 }
11372
11373 # We should have enough data now to compute the type of the property.
11374 $property->compute_type;
11375 my $property_type = $property->type;
11376
11377 next if ! $property->to_create_match_tables;
11378
11379 # Here want to create match tables for this property
11380
11381 # The Unicode db always (so far, and they claim into the future) have
11382 # the default for missing entries in binary properties be 'N' (unless
11383 # there is a '@missing' line that specifies otherwise)
11384 if ($property_type == $BINARY && ! defined $property->default_map) {
11385 $property->set_default_map('N');
11386 }
11387
11388 # Add any remaining code points to the mapping, using the default for
5d7f7709 11389 # missing code points.
d8fb1cc3 11390 my $default_table;
99870f4d 11391 if (defined (my $default_map = $property->default_map)) {
1520492f 11392
f4c2a127 11393 # Make sure there is a match table for the default
f4c2a127
KW
11394 if (! defined ($default_table = $property->table($default_map))) {
11395 $default_table = $property->add_match_table($default_map);
11396 }
11397
a92d5c2e
KW
11398 # And, if the property is binary, the default table will just
11399 # be the complement of the other table.
11400 if ($property_type == $BINARY) {
11401 my $non_default_table;
11402
11403 # Find the non-default table.
11404 for my $table ($property->tables) {
11405 next if $table == $default_table;
11406 $non_default_table = $table;
11407 }
11408 $default_table->set_complement($non_default_table);
11409 }
11410
e1759d04
KW
11411 # This fills in any missing values with the default. It's
11412 # tempting to save some time and memory in running this program
11413 # by skipping this step for binary tables where the default
11414 # is easily calculated. But it is needed for generating
11415 # the test file, and other changes would also be required to do
11416 # so.
1520492f
KW
11417 $property->add_map(0, $LAST_UNICODE_CODEPOINT,
11418 $default_map, Replace => $NO);
99870f4d
KW
11419 }
11420
11421 # Have all we need to populate the match tables.
11422 my $property_name = $property->name;
56557540 11423 my $maps_should_be_defined = $property->pre_declared_maps;
99870f4d
KW
11424 foreach my $range ($property->ranges) {
11425 my $map = $range->value;
f5e9a6ca 11426 my $table = $property->table($map);
99870f4d
KW
11427 if (! defined $table) {
11428
11429 # Integral and rational property values are not necessarily
56557540
KW
11430 # defined in PropValueAliases, but whether all the other ones
11431 # should be depends on the property.
11432 if ($maps_should_be_defined
99870f4d
KW
11433 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11434 {
11435 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
11436 }
f5e9a6ca 11437 $table = $property->add_match_table($map);
99870f4d
KW
11438 }
11439
11440 $table->add_range($range->start, $range->end);
11441 }
11442
807807b7
KW
11443 # For Perl 5.6 compatibility, all properties matchable in regexes can
11444 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
11445 # But warn if this creates a conflict with a (new) Unicode property
11446 # name, although it appears that Unicode has made a decision never to
11447 # begin a property name with 'Is_', so this shouldn't happen.
99870f4d
KW
11448 foreach my $alias ($property->aliases) {
11449 my $Is_name = 'Is_' . $alias->name;
807807b7 11450 if (defined (my $pre_existing = property_ref($Is_name))) {
99870f4d 11451 Carp::my_carp(<<END
807807b7
KW
11452There is already an alias named $Is_name (from " . $pre_existing . "), so
11453creating one for $property won't work. This is bad news. If it is not too
11454late, get Unicode to back off. Otherwise go back to the old scheme (findable
11455from the git blame log for this area of the code that suppressed individual
11456aliases that conflict with the new Unicode names. Proceeding anyway.
99870f4d
KW
11457END
11458 );
99870f4d
KW
11459 }
11460 } # End of loop through aliases for this property
11461 } # End of loop through all Unicode properties.
11462
11463 # Fill in the mappings that Unicode doesn't completely furnish. First the
11464 # single letter major general categories. If Unicode were to start
11465 # delivering the values, this would be redundant, but better that than to
11466 # try to figure out if should skip and not get it right. Ths could happen
11467 # if a new major category were to be introduced, and the hard-coded test
11468 # wouldn't know about it.
11469 # This routine depends on the standard names for the general categories
11470 # being what it thinks they are, like 'Cn'. The major categories are the
11471 # union of all the general category tables which have the same first
11472 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11473 foreach my $minor_table ($gc->tables) {
11474 my $minor_name = $minor_table->name;
11475 next if length $minor_name == 1;
11476 if (length $minor_name != 2) {
11477 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11478 next;
11479 }
11480
11481 my $major_name = uc(substr($minor_name, 0, 1));
11482 my $major_table = $gc->table($major_name);
11483 $major_table += $minor_table;
11484 }
11485
11486 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11487 # defines it as LC)
11488 my $LC = $gc->table('LC');
11489 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11490 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11491
11492
11493 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11494 # deliver the correct values in it
11495 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11496
11497 # Lt not in release 1.
a5c376b7
KW
11498 if (defined $gc->table('Lt')) {
11499 $LC += $gc->table('Lt');
11500 $gc->table('Lt')->set_caseless_equivalent($LC);
11501 }
99870f4d
KW
11502 }
11503 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11504
a5c376b7
KW
11505 $gc->table('Ll')->set_caseless_equivalent($LC);
11506 $gc->table('Lu')->set_caseless_equivalent($LC);
11507
99870f4d 11508 my $Cs = $gc->table('Cs');
99870f4d
KW
11509
11510
11511 # Folding information was introduced later into Unicode data. To get
11512 # Perl's case ignore (/i) to work at all in releases that don't have
11513 # folding, use the best available alternative, which is lower casing.
11514 my $fold = property_ref('Simple_Case_Folding');
11515 if ($fold->is_empty) {
11516 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11517 $fold->add_note(join_lines(<<END
11518WARNING: This table uses lower case as a substitute for missing fold
11519information
11520END
11521 ));
11522 }
11523
11524 # Multiple-character mapping was introduced later into Unicode data. If
11525 # missing, use the single-characters maps as best available alternative
11526 foreach my $map (qw { Uppercase_Mapping
11527 Lowercase_Mapping
11528 Titlecase_Mapping
11529 Case_Folding
11530 } ) {
11531 my $full = property_ref($map);
11532 if ($full->is_empty) {
11533 my $simple = property_ref('Simple_' . $map);
11534 $full->initialize($simple);
11535 $full->add_comment($simple->comment) if ($simple->comment);
11536 $full->add_note(join_lines(<<END
11537WARNING: This table uses simple mapping (single-character only) as a
11538substitute for missing multiple-character information
11539END
11540 ));
11541 }
11542 }
82aed44a
KW
11543
11544 # The Script_Extensions property started out as a clone of the Script
11545 # property. But processing its data file caused some elements to be
11546 # replaced with different data. (These elements were for the Common and
11547 # Inherited properties.) This data is a qw() list of all the scripts that
11548 # the code points in the given range are in. An example line is:
11549 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
11550 #
11551 # The code above has created a new match table named "Arab Syrc Thaa"
11552 # which contains 060C. (The cloned table started out with this code point
11553 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
11554 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
11555 # match table. This is repeated for all these tables and ranges. The map
11556 # data is retained in the map table for reference, but the spurious match
11557 # tables are deleted.
11558
11559 my $scx = property_ref("Script_Extensions");
d53a7e7d 11560 if (defined $scx) {
c3a37f64
KW
11561 foreach my $table ($scx->tables) {
11562 next unless $table->name =~ /\s/; # All the new and only the new
11563 # tables have a space in their
11564 # names
11565 my @scripts = split /\s+/, $table->name;
11566 foreach my $script (@scripts) {
11567 my $script_table = $scx->table($script);
11568 $script_table += $table;
11569 }
11570 $scx->delete_match_table($table);
82aed44a 11571 }
d53a7e7d 11572 }
82aed44a
KW
11573
11574 return;
99870f4d
KW
11575}
11576
11577sub compile_perl() {
11578 # Create perl-defined tables. Almost all are part of the pseudo-property
11579 # named 'perl' internally to this program. Many of these are recommended
11580 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11581 # on those found there.
11582 # Almost all of these are equivalent to some Unicode property.
11583 # A number of these properties have equivalents restricted to the ASCII
11584 # range, with their names prefaced by 'Posix', to signify that these match
11585 # what the Posix standard says they should match. A couple are
11586 # effectively this, but the name doesn't have 'Posix' in it because there
cbc24f92
KW
11587 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
11588 # to the full Unicode range, by our guesses as to what is appropriate.
99870f4d
KW
11589
11590 # 'Any' is all code points. As an error check, instead of just setting it
11591 # to be that, construct it to be the union of all the major categories
7fc6cb55 11592 $Any = $perl->add_match_table('Any',
99870f4d
KW
11593 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
11594 Matches_All => 1);
11595
11596 foreach my $major_table ($gc->tables) {
11597
11598 # Major categories are the ones with single letter names.
11599 next if length($major_table->name) != 1;
11600
11601 $Any += $major_table;
11602 }
11603
11604 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
11605 Carp::my_carp_bug("Generated highest code point ("
11606 . sprintf("%X", $Any->max)
11607 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
11608 }
11609 if ($Any->range_count != 1 || $Any->min != 0) {
11610 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11611 }
11612
11613 $Any->add_alias('All');
11614
11615 # Assigned is the opposite of gc=unassigned
11616 my $Assigned = $perl->add_match_table('Assigned',
11617 Description => "All assigned code points",
11618 Initialize => ~ $gc->table('Unassigned'),
11619 );
11620
11621 # Our internal-only property should be treated as more than just a
11622 # synonym.
11623 $perl->add_match_table('_CombAbove')
11624 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11625 Related => 1);
11626
11627 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11628 if (defined $block) { # This is equivalent to the block if have it.
11629 my $Unicode_ASCII = $block->table('Basic_Latin');
11630 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11631 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11632 }
11633 }
11634
11635 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11636 # necessary
11637 if ($ASCII->is_empty) {
11638 $ASCII->initialize([ 0..127 ]);
11639 }
11640
99870f4d
KW
11641 # Get the best available case definitions. Early Unicode versions didn't
11642 # have Uppercase and Lowercase defined, so use the general category
11643 # instead for them.
11644 my $Lower = $perl->add_match_table('Lower');
11645 my $Unicode_Lower = property_ref('Lowercase');
11646 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11647 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
a5c376b7
KW
11648 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11649 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11650 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11651
99870f4d
KW
11652 }
11653 else {
11654 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11655 Related => 1);
11656 }
cbc24f92 11657 $Lower->add_alias('XPosixLower');
a5c376b7 11658 my $Posix_Lower = $perl->add_match_table("PosixLower",
ad5e8af1
KW
11659 Description => "[a-z]",
11660 Initialize => $Lower & $ASCII,
11661 );
99870f4d
KW
11662
11663 my $Upper = $perl->add_match_table('Upper');
11664 my $Unicode_Upper = property_ref('Uppercase');
11665 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11666 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
a5c376b7
KW
11667 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11668 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11669 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
99870f4d
KW
11670 }
11671 else {
11672 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11673 Related => 1);
11674 }
cbc24f92 11675 $Upper->add_alias('XPosixUpper');
a5c376b7 11676 my $Posix_Upper = $perl->add_match_table("PosixUpper",
ad5e8af1
KW
11677 Description => "[A-Z]",
11678 Initialize => $Upper & $ASCII,
11679 );
99870f4d
KW
11680
11681 # Earliest releases didn't have title case. Initialize it to empty if not
11682 # otherwise present
4364919a
KW
11683 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11684 Description => '(= \p{Gc=Lt})');
99870f4d 11685 my $lt = $gc->table('Lt');
a5c376b7
KW
11686
11687 # Earlier versions of mktables had this related to $lt since they have
c12f2655
KW
11688 # identical code points, but their caseless equivalents are not the same,
11689 # one being 'Cased' and the other being 'LC', and so now must be kept as
11690 # separate entities.
a5c376b7 11691 $Title += $lt if defined $lt;
99870f4d
KW
11692
11693 # If this Unicode version doesn't have Cased, set up our own. From
11694 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11695 # and only if C has the Lowercase or Uppercase property or has a
11696 # General_Category value of Titlecase_Letter.
a5c376b7
KW
11697 my $Unicode_Cased = property_ref('Cased');
11698 unless (defined $Unicode_Cased) {
99870f4d
KW
11699 my $cased = $perl->add_match_table('Cased',
11700 Initialize => $Lower + $Upper + $Title,
11701 Description => 'Uppercase or Lowercase or Titlecase',
11702 );
a5c376b7 11703 $Unicode_Cased = $cased;
99870f4d 11704 }
a5c376b7 11705 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
99870f4d
KW
11706
11707 # Similarly, set up our own Case_Ignorable property if this Unicode
11708 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11709 # C is defined to be case-ignorable if C has the value MidLetter or the
11710 # value MidNumLet for the Word_Break property or its General_Category is
11711 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11712 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11713
11714 # Perl has long had an internal-only alias for this property.
11715 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
11716 my $case_ignorable = property_ref('Case_Ignorable');
11717 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11718 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11719 Related => 1);
11720 }
11721 else {
11722
11723 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11724
11725 # The following three properties are not in early releases
11726 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11727 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11728 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11729
11730 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11731 # correspondingly the case-ignorable definition lacks that one. For
11732 # 4.0, it appears that it was meant to be the same definition, but was
11733 # inadvertently omitted from the standard's text, so add it if the
11734 # property actually is there
11735 my $wb = property_ref('Word_Break');
11736 if (defined $wb) {
11737 my $midlet = $wb->table('MidLetter');
11738 $perl_case_ignorable += $midlet if defined $midlet;
11739 my $midnumlet = $wb->table('MidNumLet');
11740 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11741 }
11742 else {
11743
11744 # In earlier versions of the standard, instead of the above two
11745 # properties , just the following characters were used:
11746 $perl_case_ignorable += 0x0027 # APOSTROPHE
11747 + 0x00AD # SOFT HYPHEN (SHY)
11748 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11749 }
11750 }
11751
11752 # The remaining perl defined tables are mostly based on Unicode TR 18,
11753 # "Annex C: Compatibility Properties". All of these have two versions,
11754 # one whose name generally begins with Posix that is posix-compliant, and
11755 # one that matches Unicode characters beyond the Posix, ASCII range
11756
ad5e8af1 11757 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
11758
11759 # Alphabetic was not present in early releases
11760 my $Alphabetic = property_ref('Alphabetic');
11761 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11762 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11763 }
11764 else {
11765
11766 # For early releases, we don't get it exactly right. The below
11767 # includes more than it should, which in 5.2 terms is: L + Nl +
11768 # Other_Alphabetic. Other_Alphabetic contains many characters from
11769 # Mn and Mc. It's better to match more than we should, than less than
11770 # we should.
11771 $Alpha->initialize($gc->table('Letter')
11772 + $gc->table('Mn')
11773 + $gc->table('Mc'));
11774 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 11775 $Alpha->add_description('Alphabetic');
99870f4d 11776 }
cbc24f92 11777 $Alpha->add_alias('XPosixAlpha');
a5c376b7 11778 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
ad5e8af1
KW
11779 Description => "[A-Za-z]",
11780 Initialize => $Alpha & $ASCII,
11781 );
a5c376b7
KW
11782 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11783 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
99870f4d
KW
11784
11785 my $Alnum = $perl->add_match_table('Alnum',
ad5e8af1 11786 Description => 'Alphabetic and (Decimal) Numeric',
99870f4d
KW
11787 Initialize => $Alpha + $gc->table('Decimal_Number'),
11788 );
cbc24f92 11789 $Alnum->add_alias('XPosixAlnum');
ad5e8af1
KW
11790 $perl->add_match_table("PosixAlnum",
11791 Description => "[A-Za-z0-9]",
11792 Initialize => $Alnum & $ASCII,
11793 );
99870f4d
KW
11794
11795 my $Word = $perl->add_match_table('Word',
d35dd6c6
KW
11796 Description => '\w, including beyond ASCII;'
11797 . ' = \p{Alnum} + \pM + \p{Pc}',
99870f4d
KW
11798 Initialize => $Alnum + $gc->table('Mark'),
11799 );
cbc24f92 11800 $Word->add_alias('XPosixWord');
99870f4d
KW
11801 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11802 $Word += $Pc if defined $Pc;
11803
f38f76ae 11804 # This is a Perl extension, so the name doesn't begin with Posix.
cbc24f92 11805 my $PerlWord = $perl->add_match_table('PerlWord',
99870f4d
KW
11806 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11807 Initialize => $Word & $ASCII,
11808 );
cbc24f92 11809 $PerlWord->add_alias('PosixWord');
99870f4d
KW
11810
11811 my $Blank = $perl->add_match_table('Blank',
11812 Description => '\h, Horizontal white space',
11813
11814 # 200B is Zero Width Space which is for line
11815 # break control, and was listed as
11816 # Space_Separator in early releases
11817 Initialize => $gc->table('Space_Separator')
11818 + 0x0009 # TAB
11819 - 0x200B, # ZWSP
11820 );
11821 $Blank->add_alias('HorizSpace'); # Another name for it.
cbc24f92 11822 $Blank->add_alias('XPosixBlank');
ad5e8af1
KW
11823 $perl->add_match_table("PosixBlank",
11824 Description => "\\t and ' '",
11825 Initialize => $Blank & $ASCII,
11826 );
99870f4d
KW
11827
11828 my $VertSpace = $perl->add_match_table('VertSpace',
11829 Description => '\v',
11830 Initialize => $gc->table('Line_Separator')
11831 + $gc->table('Paragraph_Separator')
11832 + 0x000A # LINE FEED
11833 + 0x000B # VERTICAL TAB
11834 + 0x000C # FORM FEED
11835 + 0x000D # CARRIAGE RETURN
11836 + 0x0085, # NEL
11837 );
11838 # No Posix equivalent for vertical space
11839
11840 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
11841 Description => '\s including beyond ASCII plus vertical tab',
11842 Initialize => $Blank + $VertSpace,
99870f4d 11843 );
cbc24f92 11844 $Space->add_alias('XPosixSpace');
ad5e8af1 11845 $perl->add_match_table("PosixSpace",
f38f76ae 11846 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
11847 Initialize => $Space & $ASCII,
11848 );
99870f4d
KW
11849
11850 # Perl's traditional space doesn't include Vertical Tab
cbc24f92 11851 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
99870f4d
KW
11852 Description => '\s, including beyond ASCII',
11853 Initialize => $Space - 0x000B,
11854 );
cbc24f92
KW
11855 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
11856 my $PerlSpace = $perl->add_match_table('PerlSpace',
de25ec47
KW
11857 Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11858 Initialize => $XPerlSpace & $ASCII,
99870f4d
KW
11859 );
11860
cbc24f92 11861
99870f4d 11862 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 11863 Description => 'Control characters');
99870f4d 11864 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
cbc24f92 11865 $Cntrl->add_alias('XPosixCntrl');
ad5e8af1 11866 $perl->add_match_table("PosixCntrl",
f38f76ae 11867 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
11868 Initialize => $Cntrl & $ASCII,
11869 );
99870f4d
KW
11870
11871 # $controls is a temporary used to construct Graph.
11872 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11873 + $gc->table('Control'));
11874 # Cs not in release 1
11875 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11876
11877 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11878 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 11879 Description => 'Characters that are graphical',
99870f4d
KW
11880 Initialize => ~ ($Space + $controls),
11881 );
cbc24f92 11882 $Graph->add_alias('XPosixGraph');
ad5e8af1 11883 $perl->add_match_table("PosixGraph",
f38f76ae
KW
11884 Description =>
11885 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
11886 Initialize => $Graph & $ASCII,
11887 );
99870f4d 11888
3e20195b 11889 $print = $perl->add_match_table('Print',
ad5e8af1 11890 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 11891 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 11892 );
cbc24f92 11893 $print->add_alias('XPosixPrint');
ad5e8af1 11894 $perl->add_match_table("PosixPrint",
66fd7fd0 11895 Description =>
f38f76ae 11896 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 11897 Initialize => $print & $ASCII,
ad5e8af1 11898 );
99870f4d
KW
11899
11900 my $Punct = $perl->add_match_table('Punct');
11901 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11902
11903 # \p{punct} doesn't include the symbols, which posix does
cbc24f92
KW
11904 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
11905 Description => '\p{Punct} + ASCII-range \p{Symbol}',
11906 Initialize => $gc->table('Punctuation')
11907 + ($ASCII & $gc->table('Symbol')),
11908 );
99870f4d 11909 $perl->add_match_table('PosixPunct',
f38f76ae 11910 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
cbc24f92 11911 Initialize => $ASCII & $XPosixPunct,
ad5e8af1 11912 );
99870f4d
KW
11913
11914 my $Digit = $perl->add_match_table('Digit',
f3a73f6e 11915 Description => '[0-9] + all other decimal digits');
99870f4d 11916 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
cbc24f92 11917 $Digit->add_alias('XPosixDigit');
ad5e8af1
KW
11918 my $PosixDigit = $perl->add_match_table("PosixDigit",
11919 Description => '[0-9]',
11920 Initialize => $Digit & $ASCII,
11921 );
99870f4d 11922
eadadd41
KW
11923 # Hex_Digit was not present in first release
11924 my $Xdigit = $perl->add_match_table('XDigit');
cbc24f92 11925 $Xdigit->add_alias('XPosixXDigit');
eadadd41
KW
11926 my $Hex = property_ref('Hex_Digit');
11927 if (defined $Hex && ! $Hex->is_empty) {
11928 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
11929 }
11930 else {
eadadd41
KW
11931 # (Have to use hex instead of e.g. '0', because could be running on an
11932 # non-ASCII machine, and we want the Unicode (ASCII) values)
11933 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11934 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11935 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d 11936 }
4efcc33b
KW
11937
11938 # AHex was not present in early releases
11939 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
11940 my $AHex = property_ref('ASCII_Hex_Digit');
11941 if (defined $AHex && ! $AHex->is_empty) {
11942 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
11943 }
11944 else {
11945 $PosixXDigit->initialize($Xdigit & $ASCII);
11946 }
11947 $PosixXDigit->add_description('[0-9A-Fa-f]');
99870f4d 11948
99870f4d
KW
11949 my $dt = property_ref('Decomposition_Type');
11950 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11951 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11952 Perl_Extension => 1,
d57ccc9a 11953 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
11954 );
11955
11956 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11957 # than SD appeared, construct it ourselves, based on the first release SD
11958 # was in.
11959 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11960 my $soft_dotted = property_ref('Soft_Dotted');
11961 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11962 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11963 }
11964 else {
11965
11966 # This list came from 3.2 Soft_Dotted.
11967 $CanonDCIJ->initialize([ 0x0069,
11968 0x006A,
11969 0x012F,
11970 0x0268,
11971 0x0456,
11972 0x0458,
11973 0x1E2D,
11974 0x1ECB,
11975 ]);
11976 $CanonDCIJ = $CanonDCIJ & $Assigned;
11977 }
11978
f86864ac 11979 # These are used in Unicode's definition of \X
37e2e78e
KW
11980 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11981 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11982
ee24a51c
KW
11983 # For backward compatibility, Perl has its own definition for IDStart
11984 # First, we include the underscore, and then the regular XID_Start also
11985 # have to be Words
11986 $perl->add_match_table('_Perl_IDStart',
11987 Perl_Extension => 1,
11988 Internal_Only => 1,
11989 Initialize =>
11990 ord('_')
11991 + (property_ref('XID_Start')->table('Y') & $Word)
11992 );
11993
99870f4d 11994 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 11995
678f13d5 11996 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
11997 # definition differs too much from the traditional Perl one to use.
11998 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11999
12000 # Note that assumes HST is defined; it came in an earlier release than
12001 # GCB. In the line below, two negatives means: yes hangul
12002 $begin += ~ property_ref('Hangul_Syllable_Type')
12003 ->table('Not_Applicable')
12004 + ~ ($gcb->table('Control')
12005 + $gcb->table('CR')
12006 + $gcb->table('LF'));
12007 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
12008
12009 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
12010 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
12011 }
12012 else { # Old definition, used on early releases.
f86864ac 12013 $extend += $gc->table('Mark')
37e2e78e
KW
12014 + 0x200C # ZWNJ
12015 + 0x200D; # ZWJ
12016 $begin += ~ $extend;
12017
12018 # Here we may have a release that has the regular grapheme cluster
12019 # defined, or a release that doesn't have anything defined.
12020 # We set things up so the Perl core degrades gracefully, possibly with
12021 # placeholders that match nothing.
12022
12023 if (! defined $gcb) {
12024 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
12025 }
12026 my $hst = property_ref('HST');
12027 if (!defined $hst) {
12028 $hst = Property->new('HST', Status => $PLACEHOLDER);
12029 $hst->add_match_table('Not_Applicable',
12030 Initialize => $Any,
12031 Matches_All => 1);
12032 }
12033
12034 # On some releases, here we may not have the needed tables for the
12035 # perl core, in some releases we may.
12036 foreach my $name (qw{ L LV LVT T V prepend }) {
12037 my $table = $gcb->table($name);
12038 if (! defined $table) {
12039 $table = $gcb->add_match_table($name);
12040 push @tables_that_may_be_empty, $table->complete_name;
12041 }
12042
12043 # The HST property predates the GCB one, and has identical tables
12044 # for some of them, so use it if we can.
12045 if ($table->is_empty
12046 && defined $hst
12047 && defined $hst->table($name))
12048 {
12049 $table += $hst->table($name);
12050 }
12051 }
12052 }
12053
12054 # More GCB. If we found some hangul syllables, populate a combined
12055 # table.
12056 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
12057 my $LV = $gcb->table('LV');
12058 if ($LV->is_empty) {
12059 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
12060 } else {
12061 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
12062 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
12063 }
12064
28093d0e 12065 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
12066 my @composition = ('Name', 'Unicode_1_Name');
12067
12068 if (@named_sequences) {
12069 push @composition, 'Named_Sequence';
12070 foreach my $sequence (@named_sequences) {
12071 $perl_charname->add_anomalous_entry($sequence);
12072 }
12073 }
12074
12075 my $alias_sentence = "";
12076 my $alias = property_ref('Name_Alias');
12077 if (defined $alias) {
12078 push @composition, 'Name_Alias';
12079 $alias->reset_each_range;
12080 while (my ($range) = $alias->each_range) {
12081 next if $range->value eq "";
12082 if ($range->start != $range->end) {
12083 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
12084 }
12085 $perl_charname->add_duplicate($range->start, $range->value);
12086 }
12087 $alias_sentence = <<END;
12088The Name_Alias property adds duplicate code point entries with a corrected
12089name. The original (less correct, but still valid) name will be physically
53d84487 12090last.
99870f4d
KW
12091END
12092 }
12093 my $comment;
12094 if (@composition <= 2) { # Always at least 2
12095 $comment = join " and ", @composition;
12096 }
12097 else {
12098 $comment = join ", ", @composition[0 .. scalar @composition - 2];
12099 $comment .= ", and $composition[-1]";
12100 }
12101
99870f4d
KW
12102 $perl_charname->add_comment(join_lines( <<END
12103This file is for charnames.pm. It is the union of the $comment properties.
12104Unicode_1_Name entries are used only for otherwise nameless code
12105points.
12106$alias_sentence
12107END
12108 ));
12109
99870f4d
KW
12110 # Construct the Present_In property from the Age property.
12111 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12112 my $default_map = $age->default_map;
12113 my $in = Property->new('In',
12114 Default_Map => $default_map,
12115 Full_Name => "Present_In",
12116 Internal_Only_Warning => 1,
12117 Perl_Extension => 1,
12118 Type => $ENUM,
12119 Initialize => $age,
12120 );
12121 $in->add_comment(join_lines(<<END
c12f2655 12122THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
99870f4d
KW
12123same as for $age, and not for what $in really means. This is because anything
12124defined in a given release should have multiple values: that release and all
12125higher ones. But only one value per code point can be represented in a table
12126like this.
12127END
12128 ));
12129
12130 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
12131 # lowest numbered (earliest) come first, with the non-numeric one
12132 # last.
12133 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12134 ? 1
12135 : ($b->name !~ /^[\d.]*$/)
12136 ? -1
12137 : $a->name <=> $b->name
12138 } $age->tables;
12139
12140 # The Present_In property is the cumulative age properties. The first
12141 # one hence is identical to the first age one.
12142 my $previous_in = $in->add_match_table($first_age->name);
12143 $previous_in->set_equivalent_to($first_age, Related => 1);
12144
12145 my $description_start = "Code point's usage introduced in version ";
12146 $first_age->add_description($description_start . $first_age->name);
12147
98dc9551 12148 # To construct the accumulated values, for each of the age tables
99870f4d
KW
12149 # starting with the 2nd earliest, merge the earliest with it, to get
12150 # all those code points existing in the 2nd earliest. Repeat merging
12151 # the new 2nd earliest with the 3rd earliest to get all those existing
12152 # in the 3rd earliest, and so on.
12153 foreach my $current_age (@rest_ages) {
12154 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
12155
12156 my $current_in = $in->add_match_table(
12157 $current_age->name,
12158 Initialize => $current_age + $previous_in,
12159 Description => $description_start
12160 . $current_age->name
12161 . ' or earlier',
12162 );
12163 $previous_in = $current_in;
12164
12165 # Add clarifying material for the corresponding age file. This is
12166 # in part because of the confusing and contradictory information
12167 # given in the Standard's documentation itself, as of 5.2.
12168 $current_age->add_description(
12169 "Code point's usage was introduced in version "
12170 . $current_age->name);
12171 $current_age->add_note("See also $in");
12172
12173 }
12174
12175 # And finally the code points whose usages have yet to be decided are
12176 # the same in both properties. Note that permanently unassigned code
12177 # points actually have their usage assigned (as being permanently
12178 # unassigned), so that these tables are not the same as gc=cn.
12179 my $unassigned = $in->add_match_table($default_map);
12180 my $age_default = $age->table($default_map);
12181 $age_default->add_description(<<END
12182Code point's usage has not been assigned in any Unicode release thus far.
12183END
12184 );
12185 $unassigned->set_equivalent_to($age_default, Related => 1);
12186 }
12187
12188
12189 # Finished creating all the perl properties. All non-internal non-string
12190 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
12191 # an underscore.) These do not get a separate entry in the pod file
12192 foreach my $table ($perl->tables) {
12193 foreach my $alias ($table->aliases) {
12194 next if $alias->name =~ /^_/;
12195 $table->add_alias('Is_' . $alias->name,
12196 Pod_Entry => 0,
12197 Status => $alias->status,
12198 Externally_Ok => 0);
12199 }
12200 }
12201
c4019d52
KW
12202 # Here done with all the basic stuff. Ready to populate the information
12203 # about each character if annotating them.
558712cf 12204 if ($annotate) {
c4019d52
KW
12205
12206 # See comments at its declaration
12207 $annotate_ranges = Range_Map->new;
12208
12209 # This separates out the non-characters from the other unassigneds, so
12210 # can give different annotations for each.
12211 $unassigned_sans_noncharacters = Range_List->new(
12212 Initialize => $gc->table('Unassigned')
12213 & property_ref('Noncharacter_Code_Point')->table('N'));
12214
12215 for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) {
12216 $i = populate_char_info($i); # Note sets $i so may cause skips
12217 }
12218 }
12219
99870f4d
KW
12220 return;
12221}
12222
12223sub add_perl_synonyms() {
12224 # A number of Unicode tables have Perl synonyms that are expressed in
12225 # the single-form, \p{name}. These are:
12226 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12227 # \p{Is_Name} as synonyms
12228 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12229 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12230 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12231 # conflict, \p{Value} and \p{Is_Value} as well
12232 #
12233 # This routine generates these synonyms, warning of any unexpected
12234 # conflicts.
12235
12236 # Construct the list of tables to get synonyms for. Start with all the
12237 # binary and the General_Category ones.
12238 my @tables = grep { $_->type == $BINARY } property_ref('*');
12239 push @tables, $gc->tables;
12240
12241 # If the version of Unicode includes the Script property, add its tables
359523e2 12242 push @tables, $script->tables if defined $script;
99870f4d
KW
12243
12244 # The Block tables are kept separate because they are treated differently.
12245 # And the earliest versions of Unicode didn't include them, so add only if
12246 # there are some.
12247 my @blocks;
12248 push @blocks, $block->tables if defined $block;
12249
12250 # Here, have the lists of tables constructed. Process blocks last so that
12251 # if there are name collisions with them, blocks have lowest priority.
12252 # Should there ever be other collisions, manual intervention would be
12253 # required. See the comments at the beginning of the program for a
12254 # possible way to handle those semi-automatically.
12255 foreach my $table (@tables, @blocks) {
12256
12257 # For non-binary properties, the synonym is just the name of the
12258 # table, like Greek, but for binary properties the synonym is the name
12259 # of the property, and means the code points in its 'Y' table.
12260 my $nominal = $table;
12261 my $nominal_property = $nominal->property;
12262 my $actual;
12263 if (! $nominal->isa('Property')) {
12264 $actual = $table;
12265 }
12266 else {
12267
12268 # Here is a binary property. Use the 'Y' table. Verify that is
12269 # there
12270 my $yes = $nominal->table('Y');
12271 unless (defined $yes) { # Must be defined, but is permissible to
12272 # be empty.
12273 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
12274 next;
12275 }
12276 $actual = $yes;
12277 }
12278
12279 foreach my $alias ($nominal->aliases) {
12280
12281 # Attempt to create a table in the perl directory for the
12282 # candidate table, using whatever aliases in it that don't
12283 # conflict. Also add non-conflicting aliases for all these
12284 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12285 PREFIX:
12286 foreach my $prefix ("", 'Is_', 'In_') {
12287
12288 # Only Block properties can have added 'In_' aliases.
12289 next if $prefix eq 'In_' and $nominal_property != $block;
12290
12291 my $proposed_name = $prefix . $alias->name;
12292
12293 # No Is_Is, In_In, nor combinations thereof
12294 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12295 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12296
12297 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12298
12299 # Get a reference to any existing table in the perl
12300 # directory with the desired name.
12301 my $pre_existing = $perl->table($proposed_name);
12302
12303 if (! defined $pre_existing) {
12304
12305 # No name collision, so ok to add the perl synonym.
12306
12307 my $make_pod_entry;
12308 my $externally_ok;
4cd1260a 12309 my $status = $alias->status;
99870f4d
KW
12310 if ($nominal_property == $block) {
12311
12312 # For block properties, the 'In' form is preferred for
12313 # external use; the pod file contains wild cards for
12314 # this and the 'Is' form so no entries for those; and
12315 # we don't want people using the name without the
12316 # 'In', so discourage that.
12317 if ($prefix eq "") {
12318 $make_pod_entry = 1;
12319 $status = $status || $DISCOURAGED;
12320 $externally_ok = 0;
12321 }
12322 elsif ($prefix eq 'In_') {
12323 $make_pod_entry = 0;
12324 $status = $status || $NORMAL;
12325 $externally_ok = 1;
12326 }
12327 else {
12328 $make_pod_entry = 0;
12329 $status = $status || $DISCOURAGED;
12330 $externally_ok = 0;
12331 }
12332 }
12333 elsif ($prefix ne "") {
12334
12335 # The 'Is' prefix is handled in the pod by a wild
12336 # card, and we won't use it for an external name
12337 $make_pod_entry = 0;
12338 $status = $status || $NORMAL;
12339 $externally_ok = 0;
12340 }
12341 else {
12342
12343 # Here, is an empty prefix, non block. This gets its
12344 # own pod entry and can be used for an external name.
12345 $make_pod_entry = 1;
12346 $status = $status || $NORMAL;
12347 $externally_ok = 1;
12348 }
12349
12350 # Here, there isn't a perl pre-existing table with the
12351 # name. Look through the list of equivalents of this
12352 # table to see if one is a perl table.
12353 foreach my $equivalent ($actual->leader->equivalents) {
12354 next if $equivalent->property != $perl;
12355
12356 # Here, have found a table for $perl. Add this alias
12357 # to it, and are done with this prefix.
12358 $equivalent->add_alias($proposed_name,
12359 Pod_Entry => $make_pod_entry,
12360 Status => $status,
12361 Externally_Ok => $externally_ok);
12362 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12363 next PREFIX;
12364 }
12365
12366 # Here, $perl doesn't already have a table that is a
12367 # synonym for this property, add one.
12368 my $added_table = $perl->add_match_table($proposed_name,
12369 Pod_Entry => $make_pod_entry,
12370 Status => $status,
12371 Externally_Ok => $externally_ok);
12372 # And it will be related to the actual table, since it is
12373 # based on it.
12374 $added_table->set_equivalent_to($actual, Related => 1);
12375 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12376 next;
12377 } # End of no pre-existing.
12378
12379 # Here, there is a pre-existing table that has the proposed
12380 # name. We could be in trouble, but not if this is just a
12381 # synonym for another table that we have already made a child
12382 # of the pre-existing one.
6505c6e2 12383 if ($pre_existing->is_set_equivalent_to($actual)) {
99870f4d
KW
12384 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12385 $pre_existing->add_alias($proposed_name);
12386 next;
12387 }
12388
12389 # Here, there is a name collision, but it still could be ok if
12390 # the tables match the identical set of code points, in which
12391 # case, we can combine the names. Compare each table's code
12392 # point list to see if they are identical.
12393 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12394 if ($pre_existing->matches_identically_to($actual)) {
12395
12396 # Here, they do match identically. Not a real conflict.
12397 # Make the perl version a child of the Unicode one, except
12398 # in the non-obvious case of where the perl name is
12399 # already a synonym of another Unicode property. (This is
12400 # excluded by the test for it being its own parent.) The
12401 # reason for this exclusion is that then the two Unicode
12402 # properties become related; and we don't really know if
12403 # they are or not. We generate documentation based on
12404 # relatedness, and this would be misleading. Code
12405 # later executed in the process will cause the tables to
12406 # be represented by a single file anyway, without making
12407 # it look in the pod like they are necessarily related.
12408 if ($pre_existing->parent == $pre_existing
12409 && ($pre_existing->property == $perl
12410 || $actual->property == $perl))
12411 {
12412 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12413 $pre_existing->set_equivalent_to($actual, Related => 1);
12414 }
12415 elsif (main::DEBUG && $to_trace) {
12416 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12417 trace $pre_existing->parent;
12418 }
12419 next PREFIX;
12420 }
12421
12422 # Here they didn't match identically, there is a real conflict
12423 # between our new name and a pre-existing property.
12424 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12425 $pre_existing->add_conflicting($nominal->full_name,
12426 'p',
12427 $actual);
12428
12429 # Don't output a warning for aliases for the block
12430 # properties (unless they start with 'In_') as it is
12431 # expected that there will be conflicts and the block
12432 # form loses.
12433 if ($verbosity >= $NORMAL_VERBOSITY
12434 && ($actual->property != $block || $prefix eq 'In_'))
12435 {
12436 print simple_fold(join_lines(<<END
12437There is already an alias named $proposed_name (from " . $pre_existing . "),
12438so not creating this alias for " . $actual
12439END
12440 ), "", 4);
12441 }
12442
12443 # Keep track for documentation purposes.
12444 $has_In_conflicts++ if $prefix eq 'In_';
12445 $has_Is_conflicts++ if $prefix eq 'Is_';
12446 }
12447 }
12448 }
12449
12450 # There are some properties which have No and Yes (and N and Y) as
12451 # property values, but aren't binary, and could possibly be confused with
12452 # binary ones. So create caveats for them. There are tables that are
12453 # named 'No', and tables that are named 'N', but confusion is not likely
12454 # unless they are the same table. For example, N meaning Number or
12455 # Neutral is not likely to cause confusion, so don't add caveats to things
12456 # like them.
12457 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
12458 my $yes = $property->table('Yes');
12459 if (defined $yes) {
12460 my $y = $property->table('Y');
12461 if (defined $y && $yes == $y) {
12462 foreach my $alias ($property->aliases) {
12463 $yes->add_conflicting($alias->name);
12464 }
12465 }
12466 }
12467 my $no = $property->table('No');
12468 if (defined $no) {
12469 my $n = $property->table('N');
12470 if (defined $n && $no == $n) {
12471 foreach my $alias ($property->aliases) {
12472 $no->add_conflicting($alias->name, 'P');
12473 }
12474 }
12475 }
12476 }
12477
12478 return;
12479}
12480
12481sub register_file_for_name($$$) {
12482 # Given info about a table and a datafile that it should be associated
98dc9551 12483 # with, register that association
99870f4d
KW
12484
12485 my $table = shift;
12486 my $directory_ref = shift; # Array of the directory path for the file
e6ebc4c0 12487 my $file = shift; # The file name in the final directory.
99870f4d
KW
12488 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12489
12490 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12491
12492 if ($table->isa('Property')) {
12493 $table->set_file_path(@$directory_ref, $file);
12494 push @map_properties, $table
12495 if $directory_ref->[0] eq $map_directory;
12496 return;
12497 }
12498
12499 # Do all of the work for all equivalent tables when called with the leader
12500 # table, so skip if isn't the leader.
12501 return if $table->leader != $table;
12502
a92d5c2e
KW
12503 # If this is a complement of another file, use that other file instead,
12504 # with a ! prepended to it.
12505 my $complement;
12506 if (($complement = $table->complement) != 0) {
12507 my @directories = $complement->file_path;
12508
12509 # This assumes that the 0th element is something like 'lib',
12510 # the 1th element the property name (in its own directory), like
12511 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12512 # appended to it later.
12513 $directories[1] =~ s/^/!/;
12514 $file = pop @directories;
12515 $directory_ref =\@directories;
12516 }
12517
99870f4d
KW
12518 # Join all the file path components together, using slashes.
12519 my $full_filename = join('/', @$directory_ref, $file);
12520
12521 # All go in the same subdirectory of unicore
12522 if ($directory_ref->[0] ne $matches_directory) {
12523 Carp::my_carp("Unexpected directory in "
12524 . join('/', @{$directory_ref}, $file));
12525 }
12526
12527 # For this table and all its equivalents ...
12528 foreach my $table ($table, $table->equivalents) {
12529
12530 # Associate it with its file internally. Don't include the
12531 # $matches_directory first component
12532 $table->set_file_path(@$directory_ref, $file);
12533 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12534
12535 my $property = $table->property;
12536 $property = ($property == $perl)
12537 ? "" # 'perl' is never explicitly stated
12538 : standardize($property->name) . '=';
12539
12540 my $deprecated = ($table->status eq $DEPRECATED)
12541 ? $table->status_info
12542 : "";
d867ccfb 12543 my $caseless_equivalent = $table->caseless_equivalent;
99870f4d
KW
12544
12545 # And for each of the table's aliases... This inner loop eventually
12546 # goes through all aliases in the UCD that we generate regex match
12547 # files for
12548 foreach my $alias ($table->aliases) {
c85f591a 12549 my $standard = utf8_heavy_name($table, $alias);
99870f4d
KW
12550
12551 # Generate an entry in either the loose or strict hashes, which
12552 # will translate the property and alias names combination into the
12553 # file where the table for them is stored.
99870f4d 12554 if ($alias->loose_match) {
99870f4d
KW
12555 if (exists $loose_to_file_of{$standard}) {
12556 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12557 }
12558 else {
12559 $loose_to_file_of{$standard} = $sub_filename;
12560 }
12561 }
12562 else {
99870f4d
KW
12563 if (exists $stricter_to_file_of{$standard}) {
12564 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12565 }
12566 else {
12567 $stricter_to_file_of{$standard} = $sub_filename;
12568
12569 # Tightly coupled with how utf8_heavy.pl works, for a
12570 # floating point number that is a whole number, get rid of
12571 # the trailing decimal point and 0's, so that utf8_heavy
12572 # will work. Also note that this assumes that such a
12573 # number is matched strictly; so if that were to change,
12574 # this would be wrong.
c85f591a 12575 if ((my $integer_name = $alias->name)
99870f4d
KW
12576 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12577 {
12578 $stricter_to_file_of{$property . $integer_name}
c12f2655 12579 = $sub_filename;
99870f4d
KW
12580 }
12581 }
12582 }
12583
12584 # Keep a list of the deprecated properties and their filenames
a92d5c2e 12585 if ($deprecated && $complement == 0) {
99870f4d
KW
12586 $utf8::why_deprecated{$sub_filename} = $deprecated;
12587 }
d867ccfb
KW
12588
12589 # And a substitute table, if any, for case-insensitive matching
12590 if ($caseless_equivalent != 0) {
12591 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12592 }
99870f4d
KW
12593 }
12594 }
12595
12596 return;
12597}
12598
12599{ # Closure
12600 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12601 # conflicts
12602 my %full_dir_name_of; # Full length names of directories used.
12603
12604 sub construct_filename($$$) {
12605 # Return a file name for a table, based on the table name, but perhaps
12606 # changed to get rid of non-portable characters in it, and to make
12607 # sure that it is unique on a file system that allows the names before
12608 # any period to be at most 8 characters (DOS). While we're at it
12609 # check and complain if there are any directory conflicts.
12610
12611 my $name = shift; # The name to start with
12612 my $mutable = shift; # Boolean: can it be changed? If no, but
12613 # yet it must be to work properly, a warning
12614 # is given
12615 my $directories_ref = shift; # A reference to an array containing the
12616 # path to the file, with each element one path
12617 # component. This is used because the same
12618 # name can be used in different directories.
12619 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12620
12621 my $warn = ! defined wantarray; # If true, then if the name is
12622 # changed, a warning is issued as well.
12623
12624 if (! defined $name) {
12625 Carp::my_carp("Undefined name in directory "
12626 . File::Spec->join(@$directories_ref)
12627 . ". '_' used");
12628 return '_';
12629 }
12630
12631 # Make sure that no directory names conflict with each other. Look at
12632 # each directory in the input file's path. If it is already in use,
12633 # assume it is correct, and is merely being re-used, but if we
12634 # truncate it to 8 characters, and find that there are two directories
12635 # that are the same for the first 8 characters, but differ after that,
12636 # then that is a problem.
12637 foreach my $directory (@$directories_ref) {
12638 my $short_dir = substr($directory, 0, 8);
12639 if (defined $full_dir_name_of{$short_dir}) {
12640 next if $full_dir_name_of{$short_dir} eq $directory;
12641 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12642 }
12643 else {
12644 $full_dir_name_of{$short_dir} = $directory;
12645 }
12646 }
12647
12648 my $path = join '/', @$directories_ref;
12649 $path .= '/' if $path;
12650
12651 # Remove interior underscores.
12652 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12653
12654 # Change any non-word character into an underscore, and truncate to 8.
12655 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12656 substr($filename, 8) = "" if length($filename) > 8;
12657
12658 # Make sure the basename doesn't conflict with something we
12659 # might have already written. If we have, say,
12660 # InGreekExtended1
12661 # InGreekExtended2
12662 # they become
12663 # InGreekE
12664 # InGreek2
12665 my $warned = 0;
12666 while (my $num = $base_names{$path}{lc $filename}++) {
12667 $num++; # so basenames with numbers start with '2', which
12668 # just looks more natural.
12669
12670 # Want to append $num, but if it'll make the basename longer
12671 # than 8 characters, pre-truncate $filename so that the result
12672 # is acceptable.
12673 my $delta = length($filename) + length($num) - 8;
12674 if ($delta > 0) {
12675 substr($filename, -$delta) = $num;
12676 }
12677 else {
12678 $filename .= $num;
12679 }
12680 if ($warn && ! $warned) {
12681 $warned = 1;
12682 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12683 }
12684 }
12685
12686 return $filename if $mutable;
12687
12688 # If not changeable, must return the input name, but warn if needed to
12689 # change it beyond shortening it.
12690 if ($name ne $filename
12691 && substr($name, 0, length($filename)) ne $filename) {
12692 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12693 }
12694 return $name;
12695 }
12696}
12697
12698# The pod file contains a very large table. Many of the lines in that table
12699# would exceed a typical output window's size, and so need to be wrapped with
12700# a hanging indent to make them look good. The pod language is really
12701# insufficient here. There is no general construct to do that in pod, so it
12702# is done here by beginning each such line with a space to cause the result to
12703# be output without formatting, and doing all the formatting here. This leads
12704# to the result that if the eventual display window is too narrow it won't
12705# look good, and if the window is too wide, no advantage is taken of that
12706# extra width. A further complication is that the output may be indented by
12707# the formatter so that there is less space than expected. What I (khw) have
12708# done is to assume that that indent is a particular number of spaces based on
12709# what it is in my Linux system; people can always resize their windows if
12710# necessary, but this is obviously less than desirable, but the best that can
12711# be expected.
12712my $automatic_pod_indent = 8;
12713
12714# Try to format so that uses fewest lines, but few long left column entries
12715# slide into the right column. An experiment on 5.1 data yielded the
12716# following percentages that didn't cut into the other side along with the
12717# associated first-column widths
12718# 69% = 24
12719# 80% not too bad except for a few blocks
12720# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12721# 95% = 37;
12722my $indent_info_column = 27; # 75% of lines didn't have overlap
12723
12724my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12725 # The 3 is because of:
12726 # 1 for the leading space to tell the pod formatter to
12727 # output as-is
12728 # 1 for the flag
12729 # 1 for the space between the flag and the main data
12730
12731sub format_pod_line ($$$;$$) {
12732 # Take a pod line and return it, formatted properly
12733
12734 my $first_column_width = shift;
12735 my $entry = shift; # Contents of left column
12736 my $info = shift; # Contents of right column
12737
12738 my $status = shift || ""; # Any flag
12739
12740 my $loose_match = shift; # Boolean.
12741 $loose_match = 1 unless defined $loose_match;
12742
12743 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12744
12745 my $flags = "";
12746 $flags .= $STRICTER if ! $loose_match;
12747
12748 $flags .= $status if $status;
12749
12750 # There is a blank in the left column to cause the pod formatter to
12751 # output the line as-is.
12752 return sprintf " %-*s%-*s %s\n",
12753 # The first * in the format is replaced by this, the -1 is
12754 # to account for the leading blank. There isn't a
12755 # hard-coded blank after this to separate the flags from
12756 # the rest of the line, so that in the unlikely event that
12757 # multiple flags are shown on the same line, they both
12758 # will get displayed at the expense of that separation,
12759 # but since they are left justified, a blank will be
12760 # inserted in the normal case.
12761 $FILLER - 1,
12762 $flags,
12763
12764 # The other * in the format is replaced by this number to
12765 # cause the first main column to right fill with blanks.
12766 # The -1 is for the guaranteed blank following it.
12767 $first_column_width - $FILLER - 1,
12768 $entry,
12769 $info;
12770}
12771
12772my @zero_match_tables; # List of tables that have no matches in this release
12773
12774sub make_table_pod_entries($) {
12775 # This generates the entries for the pod file for a given table.
12776 # Also done at this time are any children tables. The output looks like:
12777 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
12778
12779 my $input_table = shift; # Table the entry is for
12780 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12781
12782 # Generate parent and all its children at the same time.
12783 return if $input_table->parent != $input_table;
12784
12785 my $property = $input_table->property;
12786 my $type = $property->type;
12787 my $full_name = $property->full_name;
12788
12789 my $count = $input_table->count;
12790 my $string_count = clarify_number($count);
12791 my $status = $input_table->status;
12792 my $status_info = $input_table->status_info;
56ca34ca 12793 my $caseless_equivalent = $input_table->caseless_equivalent;
99870f4d
KW
12794
12795 my $entry_for_first_table; # The entry for the first table output.
12796 # Almost certainly, it is the parent.
12797
12798 # For each related table (including itself), we will generate a pod entry
12799 # for each name each table goes by
12800 foreach my $table ($input_table, $input_table->children) {
12801
d4da3f74
KW
12802 # utf8_heavy.pl cannot deal with null string property values, so skip
12803 # any tables that have no non-null names.
12804 next if ! grep { $_->name ne "" } $table->aliases;
99870f4d
KW
12805
12806 # First, gather all the info that applies to this table as a whole.
12807
12808 push @zero_match_tables, $table if $count == 0;
12809
12810 my $table_property = $table->property;
12811
12812 # The short name has all the underscores removed, while the full name
12813 # retains them. Later, we decide whether to output a short synonym
12814 # for the full one, we need to compare apples to apples, so we use the
12815 # short name's length including underscores.
12816 my $table_property_short_name_length;
12817 my $table_property_short_name
12818 = $table_property->short_name(\$table_property_short_name_length);
12819 my $table_property_full_name = $table_property->full_name;
12820
12821 # Get how much savings there is in the short name over the full one
12822 # (delta will always be <= 0)
12823 my $table_property_short_delta = $table_property_short_name_length
12824 - length($table_property_full_name);
12825 my @table_description = $table->description;
12826 my @table_note = $table->note;
12827
12828 # Generate an entry for each alias in this table.
12829 my $entry_for_first_alias; # saves the first one encountered.
12830 foreach my $alias ($table->aliases) {
12831
12832 # Skip if not to go in pod.
12833 next unless $alias->make_pod_entry;
12834
12835 # Start gathering all the components for the entry
12836 my $name = $alias->name;
12837
d4da3f74
KW
12838 # Skip if name is empty, as can't be accessed by regexes.
12839 next if $name eq "";
12840
99870f4d
KW
12841 my $entry; # Holds the left column, may include extras
12842 my $entry_ref; # To refer to the left column's contents from
12843 # another entry; has no extras
12844
12845 # First the left column of the pod entry. Tables for the $perl
12846 # property always use the single form.
12847 if ($table_property == $perl) {
12848 $entry = "\\p{$name}";
12849 $entry_ref = "\\p{$name}";
12850 }
12851 else { # Compound form.
12852
12853 # Only generate one entry for all the aliases that mean true
12854 # or false in binary properties. Append a '*' to indicate
12855 # some are missing. (The heading comment notes this.)
60e471b3 12856 my $rhs;
99870f4d
KW
12857 if ($type == $BINARY) {
12858 next if $name ne 'N' && $name ne 'Y';
60e471b3 12859 $rhs = "$name*";
99870f4d
KW
12860 }
12861 else {
60e471b3 12862 $rhs = $name;
99870f4d
KW
12863 }
12864
12865 # Colon-space is used to give a little more space to be easier
12866 # to read;
12867 $entry = "\\p{"
12868 . $table_property_full_name
60e471b3 12869 . ": $rhs}";
99870f4d
KW
12870
12871 # But for the reference to this entry, which will go in the
12872 # right column, where space is at a premium, use equals
12873 # without a space
12874 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
12875 }
12876
12877 # Then the right (info) column. This is stored as components of
12878 # an array for the moment, then joined into a string later. For
12879 # non-internal only properties, begin the info with the entry for
12880 # the first table we encountered (if any), as things are ordered
12881 # so that that one is the most descriptive. This leads to the
12882 # info column of an entry being a more descriptive version of the
12883 # name column
12884 my @info;
12885 if ($name =~ /^_/) {
12886 push @info,
12887 '(For internal use by Perl, not necessarily stable)';
12888 }
12889 elsif ($entry_for_first_alias) {
12890 push @info, $entry_for_first_alias;
12891 }
12892
12893 # If this entry is equivalent to another, add that to the info,
12894 # using the first such table we encountered
12895 if ($entry_for_first_table) {
12896 if (@info) {
12897 push @info, "(= $entry_for_first_table)";
12898 }
12899 else {
12900 push @info, $entry_for_first_table;
12901 }
12902 }
12903
12904 # If the name is a large integer, add an equivalent with an
12905 # exponent for better readability
12906 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12907 push @info, sprintf "(= %.1e)", $name
12908 }
12909
12910 my $parenthesized = "";
12911 if (! $entry_for_first_alias) {
12912
12913 # This is the first alias for the current table. The alias
12914 # array is ordered so that this is the fullest, most
12915 # descriptive alias, so it gets the fullest info. The other
12916 # aliases are mostly merely pointers to this one, using the
12917 # information already added above.
12918
12919 # Display any status message, but only on the parent table
12920 if ($status && ! $entry_for_first_table) {
12921 push @info, $status_info;
12922 }
12923
12924 # Put out any descriptive info
12925 if (@table_description || @table_note) {
12926 push @info, join "; ", @table_description, @table_note;
12927 }
12928
12929 # Look to see if there is a shorter name we can point people
12930 # at
12931 my $standard_name = standardize($name);
12932 my $short_name;
12933 my $proposed_short = $table->short_name;
12934 if (defined $proposed_short) {
12935 my $standard_short = standardize($proposed_short);
12936
12937 # If the short name is shorter than the standard one, or
12938 # even it it's not, but the combination of it and its
12939 # short property name (as in \p{prop=short} ($perl doesn't
12940 # have this form)) saves at least two characters, then,
12941 # cause it to be listed as a shorter synonym.
12942 if (length $standard_short < length $standard_name
12943 || ($table_property != $perl
12944 && (length($standard_short)
12945 - length($standard_name)
12946 + $table_property_short_delta) # (<= 0)
12947 < -2))
12948 {
12949 $short_name = $proposed_short;
12950 if ($table_property != $perl) {
12951 $short_name = $table_property_short_name
12952 . "=$short_name";
12953 }
12954 $short_name = "\\p{$short_name}";
12955 }
12956 }
12957
12958 # And if this is a compound form name, see if there is a
12959 # single form equivalent
12960 my $single_form;
12961 if ($table_property != $perl) {
12962
12963 # Special case the binary N tables, so that will print
12964 # \P{single}, but use the Y table values to populate
c12f2655 12965 # 'single', as we haven't likewise populated the N table.
99870f4d
KW
12966 my $test_table;
12967 my $p;
12968 if ($type == $BINARY
12969 && $input_table == $property->table('No'))
12970 {
12971 $test_table = $property->table('Yes');
12972 $p = 'P';
12973 }
12974 else {
12975 $test_table = $input_table;
12976 $p = 'p';
12977 }
12978
12979 # Look for a single form amongst all the children.
12980 foreach my $table ($test_table->children) {
12981 next if $table->property != $perl;
12982 my $proposed_name = $table->short_name;
12983 next if ! defined $proposed_name;
12984
12985 # Don't mention internal-only properties as a possible
12986 # single form synonym
12987 next if substr($proposed_name, 0, 1) eq '_';
12988
12989 $proposed_name = "\\$p\{$proposed_name}";
12990 if (! defined $single_form
12991 || length($proposed_name) < length $single_form)
12992 {
12993 $single_form = $proposed_name;
12994
12995 # The goal here is to find a single form; not the
12996 # shortest possible one. We've already found a
12997 # short name. So, stop at the first single form
12998 # found, which is likely to be closer to the
12999 # original.
13000 last;
13001 }
13002 }
13003 }
13004
13005 # Ouput both short and single in the same parenthesized
13006 # expression, but with only one of 'Single', 'Short' if there
13007 # are both items.
13008 if ($short_name || $single_form || $table->conflicting) {
99870f4d
KW
13009 $parenthesized .= "Short: $short_name" if $short_name;
13010 if ($short_name && $single_form) {
13011 $parenthesized .= ', ';
13012 }
13013 elsif ($single_form) {
13014 $parenthesized .= 'Single: ';
13015 }
13016 $parenthesized .= $single_form if $single_form;
13017 }
13018 }
13019
56ca34ca
KW
13020 if ($caseless_equivalent != 0) {
13021 $parenthesized .= '; ' if $parenthesized ne "";
13022 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
13023 }
13024
99870f4d
KW
13025
13026 # Warn if this property isn't the same as one that a
13027 # semi-casual user might expect. The other components of this
13028 # parenthesized structure are calculated only for the first entry
13029 # for this table, but the conflicting is deemed important enough
13030 # to go on every entry.
13031 my $conflicting = join " NOR ", $table->conflicting;
13032 if ($conflicting) {
e5228720 13033 $parenthesized .= '; ' if $parenthesized ne "";
99870f4d
KW
13034 $parenthesized .= "NOT $conflicting";
13035 }
99870f4d 13036
e5228720 13037 push @info, "($parenthesized)" if $parenthesized;
d57ccc9a 13038
0f88d393
KW
13039 if ($name =~ /_$/ && $alias->loose_match) {
13040 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
13041 }
13042
d57ccc9a
KW
13043 if ($table_property != $perl && $table->perl_extension) {
13044 push @info, '(Perl extension)';
13045 }
2cf724d4 13046 push @info, "($string_count)";
99870f4d
KW
13047
13048 # Now, we have both the entry and info so add them to the
13049 # list of all the properties.
13050 push @match_properties,
13051 format_pod_line($indent_info_column,
13052 $entry,
13053 join( " ", @info),
13054 $alias->status,
13055 $alias->loose_match);
13056
13057 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
13058 } # End of looping through the aliases for this table.
13059
13060 if (! $entry_for_first_table) {
13061 $entry_for_first_table = $entry_for_first_alias;
13062 }
13063 } # End of looping through all the related tables
13064 return;
13065}
13066
13067sub pod_alphanumeric_sort {
13068 # Sort pod entries alphanumerically.
13069
99f78760
KW
13070 # The first few character columns are filler, plus the '\p{'; and get rid
13071 # of all the trailing stuff, starting with the trailing '}', so as to sort
13072 # on just 'Name=Value'
13073 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 13074 $a =~ s/}.*//;
99f78760 13075 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
13076 $b =~ s/}.*//;
13077
99f78760
KW
13078 # Determine if the two operands are both internal only or both not.
13079 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13080 # should be the underscore that begins internal only
13081 my $a_is_internal = (substr($a, 0, 1) eq '_');
13082 my $b_is_internal = (substr($b, 0, 1) eq '_');
13083
13084 # Sort so the internals come last in the table instead of first (which the
13085 # leading underscore would otherwise indicate).
13086 if ($a_is_internal != $b_is_internal) {
13087 return 1 if $a_is_internal;
13088 return -1
13089 }
13090
99870f4d 13091 # Determine if the two operands are numeric property values or not.
99f78760 13092 # A numeric property will look like xyz: 3. But the number
99870f4d 13093 # can begin with an optional minus sign, and may have a
99f78760 13094 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
13095 # isn't numeric, use alphabetic sort.
13096 my ($a_initial, $a_number) =
99f78760 13097 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13098 return $a cmp $b unless defined $a_number;
13099 my ($b_initial, $b_number) =
99f78760 13100 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13101 return $a cmp $b unless defined $b_number;
13102
13103 # Here they are both numeric, but use alphabetic sort if the
13104 # initial parts don't match
13105 return $a cmp $b if $a_initial ne $b_initial;
13106
13107 # Convert rationals to floating for the comparison.
13108 $a_number = eval $a_number if $a_number =~ qr{/};
13109 $b_number = eval $b_number if $b_number =~ qr{/};
13110
13111 return $a_number <=> $b_number;
13112}
13113
13114sub make_pod () {
13115 # Create the .pod file. This generates the various subsections and then
13116 # combines them in one big HERE document.
13117
13118 return unless defined $pod_directory;
13119 print "Making pod file\n" if $verbosity >= $PROGRESS;
13120
13121 my $exception_message =
13122 '(Any exceptions are individually noted beginning with the word NOT.)';
13123 my @block_warning;
13124 if (-e 'Blocks.txt') {
13125
13126 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
13127 # if the global $has_In_conflicts indicates we have them.
13128 push @match_properties, format_pod_line($indent_info_column,
13129 '\p{In_*}',
13130 '\p{Block: *}'
13131 . (($has_In_conflicts)
13132 ? " $exception_message"
13133 : ""));
13134 @block_warning = << "END";
13135
77173124
KW
13136Matches in the Block property have shortcuts that begin with "In_". For
13137example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
13138backward compatibility, if there is no conflict with another shortcut, these
13139may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
13140are numerous such conflicting shortcuts. Use of these forms for Block is
13141discouraged, and are flagged as such, not only because of the potential
13142confusion as to what is meant, but also because a later release of Unicode may
13143preempt the shortcut, and your program would no longer be correct. Use the
13144"In_" form instead to avoid this, or even more clearly, use the compound form,
13145e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
13146about this.
99870f4d
KW
13147END
13148 }
77173124 13149 my $text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
99870f4d
KW
13150 $text = "$exception_message $text" if $has_Is_conflicts;
13151
13152 # And the 'Is_ line';
13153 push @match_properties, format_pod_line($indent_info_column,
13154 '\p{Is_*}',
13155 "\\p{*} $text");
13156
13157 # Sort the properties array for output. It is sorted alphabetically
13158 # except numerically for numeric properties, and only output unique lines.
13159 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13160
13161 my $formatted_properties = simple_fold(\@match_properties,
13162 "",
13163 # indent succeeding lines by two extra
13164 # which looks better
13165 $indent_info_column + 2,
13166
13167 # shorten the line length by how much
13168 # the formatter indents, so the folded
13169 # line will fit in the space
13170 # presumably available
13171 $automatic_pod_indent);
13172 # Add column headings, indented to be a little more centered, but not
13173 # exactly
13174 $formatted_properties = format_pod_line($indent_info_column,
13175 ' NAME',
13176 ' INFO')
13177 . "\n"
13178 . $formatted_properties;
13179
13180 # Generate pod documentation lines for the tables that match nothing
0090c5d1 13181 my $zero_matches = "";
99870f4d
KW
13182 if (@zero_match_tables) {
13183 @zero_match_tables = uniques(@zero_match_tables);
13184 $zero_matches = join "\n\n",
13185 map { $_ = '=item \p{' . $_->complete_name . "}" }
13186 sort { $a->complete_name cmp $b->complete_name }
c0de960f 13187 @zero_match_tables;
99870f4d
KW
13188
13189 $zero_matches = <<END;
13190
77173124 13191=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
99870f4d
KW
13192
13193Unicode has some property-value pairs that currently don't match anything.
c12f2655
KW
13194This happens generally either because they are obsolete, or they exist for
13195symmetry with other forms, but no language has yet been encoded that uses
13196them. In this version of Unicode, the following match zero code points:
99870f4d
KW
13197
13198=over 4
13199
13200$zero_matches
13201
13202=back
13203
13204END
13205 }
13206
13207 # Generate list of properties that we don't accept, grouped by the reasons
13208 # why. This is so only put out the 'why' once, and then list all the
13209 # properties that have that reason under it.
13210
13211 my %why_list; # The keys are the reasons; the values are lists of
13212 # properties that have the key as their reason
13213
13214 # For each property, add it to the list that are suppressed for its reason
13215 # The sort will cause the alphabetically first properties to be added to
13216 # each list first, so each list will be sorted.
13217 foreach my $property (sort keys %why_suppressed) {
13218 push @{$why_list{$why_suppressed{$property}}}, $property;
13219 }
13220
13221 # For each reason (sorted by the first property that has that reason)...
13222 my @bad_re_properties;
13223 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13224 keys %why_list)
13225 {
13226 # Add to the output, all the properties that have that reason. Start
13227 # with an empty line.
13228 push @bad_re_properties, "\n\n";
13229
13230 my $has_item = 0; # Flag if actually output anything.
13231 foreach my $name (@{$why_list{$why}}) {
13232
13233 # Split compound names into $property and $table components
13234 my $property = $name;
13235 my $table;
13236 if ($property =~ / (.*) = (.*) /x) {
13237 $property = $1;
13238 $table = $2;
13239 }
13240
13241 # This release of Unicode may not have a property that is
13242 # suppressed, so don't reference a non-existent one.
13243 $property = property_ref($property);
13244 next if ! defined $property;
13245
13246 # And since this list is only for match tables, don't list the
13247 # ones that don't have match tables.
13248 next if ! $property->to_create_match_tables;
13249
13250 # Find any abbreviation, and turn it into a compound name if this
13251 # is a property=value pair.
13252 my $short_name = $property->name;
13253 $short_name .= '=' . $property->table($table)->name if $table;
13254
13255 # And add the property as an item for the reason.
13256 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13257 $has_item = 1;
13258 }
13259
13260 # And add the reason under the list of properties, if such a list
13261 # actually got generated. Note that the header got added
13262 # unconditionally before. But pod ignores extra blank lines, so no
13263 # harm.
13264 push @bad_re_properties, "\n$why\n" if $has_item;
13265
13266 } # End of looping through each reason.
13267
13268 # Generate a list of the properties whose map table we output, from the
13269 # global @map_properties.
13270 my @map_tables_actually_output;
13271 my $info_indent = 20; # Left column is narrower than \p{} table.
13272 foreach my $property (@map_properties) {
13273
13274 # Get the path to the file; don't output any not in the standard
13275 # directory.
13276 my @path = $property->file_path;
13277 next if $path[0] ne $map_directory;
8572ace0
KW
13278
13279 # Don't mention map tables that are for internal-use only
13280 next if $property->to_output_map == $INTERNAL_MAP;
13281
99870f4d
KW
13282 shift @path; # Remove the standard name
13283
13284 my $file = join '/', @path; # In case is in sub directory
13285 my $info = $property->full_name;
13286 my $short_name = $property->name;
13287 if ($info ne $short_name) {
13288 $info .= " ($short_name)";
13289 }
13290 foreach my $more_info ($property->description,
13291 $property->note,
13292 $property->status_info)
13293 {
13294 next unless $more_info;
13295 $info =~ s/\.\Z//;
13296 $info .= ". $more_info";
13297 }
13298 push @map_tables_actually_output, format_pod_line($info_indent,
13299 $file,
13300 $info,
13301 $property->status);
13302 }
13303
13304 # Sort alphabetically, and fold for output
13305 @map_tables_actually_output = sort
13306 pod_alphanumeric_sort @map_tables_actually_output;
13307 @map_tables_actually_output
13308 = simple_fold(\@map_tables_actually_output,
13309 ' ',
13310 $info_indent,
13311 $automatic_pod_indent);
13312
13313 # Generate a list of the formats that can appear in the map tables.
13314 my @map_table_formats;
13315 foreach my $format (sort keys %map_table_formats) {
12916dad 13316 push @map_table_formats, " $format $map_table_formats{$format}\n";
99870f4d
KW
13317 }
13318
12916dad
MS
13319 local $" = "";
13320
99870f4d
KW
13321 # Everything is ready to assemble.
13322 my @OUT = << "END";
13323=begin comment
13324
13325$HEADER
13326
13327To change this file, edit $0 instead.
13328
13329=end comment
13330
13331=head1 NAME
13332
51f494cc 13333$pod_file - Index of Unicode Version $string_version properties in Perl
99870f4d
KW
13334
13335=head1 DESCRIPTION
13336
13337There are many properties in Unicode, and Perl provides access to almost all of
13338them, as well as some additional extensions and short-cut synonyms.
13339
13340And just about all of the few that aren't accessible through the Perl
77173124
KW
13341core are accessible through the modules: L<Unicode::Normalize> and
13342L<Unicode::UCD>, and for Unihan properties, via the CPAN module
13343L<Unicode::Unihan>.
99870f4d
KW
13344
13345This document merely lists all available properties and does not attempt to
13346explain what each property really means. There is a brief description of each
13347Perl extension. There is some detail about Blocks, Scripts, General_Category,
13348and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
13349Unicode properties, refer to the Unicode standard. A good starting place is
13350L<$unicode_reference_url>. More information on the Perl extensions is in
78bb419c 13351L<perlunicode/Other Properties>.
99870f4d
KW
13352
13353Note that you can define your own properties; see
13354L<perlunicode/"User-Defined Character Properties">.
13355
77173124 13356=head1 Properties accessible through C<\\p{}> and C<\\P{}>
99870f4d 13357
77173124
KW
13358The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13359most of the Unicode character properties. The table below shows all these
13360constructs, both single and compound forms.
99870f4d
KW
13361
13362B<Compound forms> consist of two components, separated by an equals sign or a
13363colon. The first component is the property name, and the second component is
13364the particular value of the property to match against, for example,
77173124 13365C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
99870f4d
KW
13366whose Script property is Greek.
13367
77173124 13368B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
99870f4d 13369their equivalent compound forms. The table shows these equivalences. (In our
77173124 13370example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
99870f4d 13371There are also a few Perl-defined single forms that are not shortcuts for a
77173124 13372compound form. One such is C<\\p{Word}>. These are also listed in the table.
99870f4d
KW
13373
13374In parsing these constructs, Perl always ignores Upper/lower case differences
77173124
KW
13375everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
13376C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
13377the left brace completely changes the meaning of the construct, from "match"
13378(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
13379for improved legibility.
99870f4d
KW
13380
13381Also, white space, hyphens, and underscores are also normally ignored
13382everywhere between the {braces}, and hence can be freely added or removed
13383even if the C</x> modifier hasn't been specified on the regular expression.
13384But $a_bold_stricter at the beginning of an entry in the table below
13385means that tighter (stricter) rules are used for that entry:
13386
13387=over 4
13388
77173124 13389=item Single form (C<\\p{name}>) tighter rules:
99870f4d
KW
13390
13391White space, hyphens, and underscores ARE significant
13392except for:
13393
13394=over 4
13395
13396=item * white space adjacent to a non-word character
13397
13398=item * underscores separating digits in numbers
13399
13400=back
13401
13402That means, for example, that you can freely add or remove white space
13403adjacent to (but within) the braces without affecting the meaning.
13404
77173124 13405=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
99870f4d
KW
13406
13407The tighter rules given above for the single form apply to everything to the
13408right of the colon or equals; the looser rules still apply to everything to
13409the left.
13410
13411That means, for example, that you can freely add or remove white space
13412adjacent to (but within) the braces and the colon or equal sign.
13413
13414=back
13415
78bb419c
KW
13416Some properties are considered obsolete by Unicode, but still available.
13417There are several varieties of obsolescence:
99870f4d
KW
13418
13419=over 4
13420
99870f4d
KW
13421=item Stabilized
13422
5f7264c7
KW
13423Obsolete properties may be stabilized. Such a determination does not indicate
13424that the property should or should not be used; instead it is a declaration
13425that the property will not be maintained nor extended for newly encoded
13426characters. Such properties are marked with $a_bold_stabilized in the
13427table.
99870f4d
KW
13428
13429=item Deprecated
13430
5f7264c7 13431An obsolete property may be deprecated, perhaps because its original intent
78bb419c
KW
13432has been replaced by another property, or because its specification was
13433somehow defective. This means that its use is strongly
99870f4d
KW
13434discouraged, so much so that a warning will be issued if used, unless the
13435regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13436statement. $A_bold_deprecated flags each such entry in the table, and
13437the entry there for the longest, most descriptive version of the property will
13438give the reason it is deprecated, and perhaps advice. Perl may issue such a
13439warning, even for properties that aren't officially deprecated by Unicode,
13440when there used to be characters or code points that were matched by them, but
13441no longer. This is to warn you that your program may not work like it did on
13442earlier Unicode releases.
13443
13444A deprecated property may be made unavailable in a future Perl version, so it
13445is best to move away from them.
13446
c12f2655
KW
13447A deprecated property may also be stabilized, but this fact is not shown.
13448
13449=item Obsolete
13450
13451Properties marked with $a_bold_obsolete in the table are considered (plain)
13452obsolete. Generally this designation is given to properties that Unicode once
13453used for internal purposes (but not any longer).
13454
99870f4d
KW
13455=back
13456
13457Some Perl extensions are present for backwards compatibility and are
c12f2655
KW
13458discouraged from being used, but are not obsolete. $A_bold_discouraged
13459flags each such entry in the table. Future Unicode versions may force
13460some of these extensions to be removed without warning, replaced by another
13461property with the same name that means something different. Use the
13462equivalent shown instead.
99870f4d
KW
13463
13464@block_warning
13465
77173124 13466The table below has two columns. The left column contains the C<\\p{}>
98dc9551 13467constructs to look up, possibly preceded by the flags mentioned above; and
99870f4d
KW
13468the right column contains information about them, like a description, or
13469synonyms. It shows both the single and compound forms for each property that
13470has them. If the left column is a short name for a property, the right column
13471will give its longer, more descriptive name; and if the left column is the
13472longest name, the right column will show any equivalent shortest name, in both
13473single and compound forms if applicable.
13474
13475The right column will also caution you if a property means something different
13476than what might normally be expected.
13477
d57ccc9a
KW
13478All single forms are Perl extensions; a few compound forms are as well, and
13479are noted as such.
13480
99870f4d
KW
13481Numbers in (parentheses) indicate the total number of code points matched by
13482the property. For emphasis, those properties that match no code points at all
13483are listed as well in a separate section following the table.
13484
56ca34ca
KW
13485Most properties match the same code points regardless of whether C<"/i">
13486case-insensitive matching is specified or not. But a few properties are
13487affected. These are shown with the notation
13488
13489 (/i= other_property)
13490
13491in the second column. Under case-insensitive matching they match the
13492same code pode points as the property "other_property".
13493
99870f4d 13494There is no description given for most non-Perl defined properties (See
77173124 13495L<$unicode_reference_url> for that).
d73e5302 13496
99870f4d
KW
13497For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13498combinations. For example, entries like:
d73e5302 13499
99870f4d 13500 \\p{Gc: *} \\p{General_Category: *}
5beb625e 13501
99870f4d
KW
13502mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13503for the latter is also valid for the former. Similarly,
5beb625e 13504
99870f4d 13505 \\p{Is_*} \\p{*}
5beb625e 13506
77173124
KW
13507means that if and only if, for example, C<\\p{Foo}> exists, then
13508C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13509And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13510C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
13511underscore.
5beb625e 13512
99870f4d
KW
13513Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13514And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
13515'N*' to indicate this, and doesn't have separate entries for the other
13516possibilities. Note that not all properties which have values 'Yes' and 'No'
13517are binary, and they have all their values spelled out without using this wild
13518card, and a C<NOT> clause in their description that highlights their not being
13519binary. These also require the compound form to match them, whereas true
13520binary properties have both single and compound forms available.
5beb625e 13521
99870f4d
KW
13522Note that all non-essential underscores are removed in the display of the
13523short names below.
5beb625e 13524
c12f2655 13525B<Legend summary:>
5beb625e 13526
99870f4d 13527=over 4
cf25bb62 13528
21405004 13529=item Z<>B<*> is a wild-card
cf25bb62 13530
99870f4d
KW
13531=item B<(\\d+)> in the info column gives the number of code points matched by
13532this property.
cf25bb62 13533
99870f4d 13534=item B<$DEPRECATED> means this is deprecated.
cf25bb62 13535
99870f4d 13536=item B<$OBSOLETE> means this is obsolete.
cf25bb62 13537
99870f4d 13538=item B<$STABILIZED> means this is stabilized.
cf25bb62 13539
99870f4d 13540=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 13541
c12f2655
KW
13542=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13543stable.
5beb625e 13544
99870f4d 13545=back
da7fcca4 13546
99870f4d 13547$formatted_properties
cf25bb62 13548
99870f4d 13549$zero_matches
cf25bb62 13550
99870f4d 13551=head1 Properties not accessible through \\p{} and \\P{}
cf25bb62 13552
99870f4d
KW
13553A few properties are accessible in Perl via various function calls only.
13554These are:
78bb419c 13555
99870f4d
KW
13556 Lowercase_Mapping lc() and lcfirst()
13557 Titlecase_Mapping ucfirst()
13558 Uppercase_Mapping uc()
12ac2576 13559
77173124 13560Case_Folding is accessible through the C</i> modifier in regular expressions.
cf25bb62 13561
77173124 13562The Name property is accessible through the C<\\N{}> interpolation in
99870f4d 13563double-quoted strings and regular expressions, but both usages require a C<use
fb121860
KW
13564charnames;> to be specified, which also contains related functions viacode(),
13565vianame(), and string_vianame().
cf25bb62 13566
99870f4d 13567=head1 Unicode regular expression properties that are NOT accepted by Perl
d2d499f5 13568
99870f4d
KW
13569Perl will generate an error for a few character properties in Unicode when
13570used in a regular expression. The non-Unihan ones are listed below, with the
13571reasons they are not accepted, perhaps with work-arounds. The short names for
13572the properties are listed enclosed in (parentheses).
c12f2655
KW
13573As described after the list, an installation can change the defaults and choose
13574to accept any of these. The list is machine generated based on the
13575choices made for the installation that generated this document.
ae6979a8 13576
99870f4d 13577=over 4
ae6979a8 13578
99870f4d 13579@bad_re_properties
a3a8c5f0 13580
99870f4d 13581=back
a3a8c5f0 13582
b7986f4f
KW
13583An installation can choose to allow any of these to be matched by downloading
13584the Unicode database from L<http://www.unicode.org/Public/> to
f3514a2f
KW
13585C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
13586controlling lists contained in the program
13587C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
13588(C<\%Config> is available from the Config module).
d73e5302 13589
99870f4d 13590=head1 Files in the I<To> directory (for serious hackers only)
12ac2576 13591
99870f4d
KW
13592All Unicode properties are really mappings (in the mathematical sense) from
13593code points to their respective values. As part of its build process,
13594Perl constructs tables containing these mappings for all properties that it
50b27e73 13595deals with. Some, but not all, of these are written out into files.
99870f4d 13596Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
77173124 13597(C<%Config> is available from the C<Config> module).
7ebf06b3 13598
50b27e73
KW
13599Perl reserves the right to change the format and even the existence of any of
13600those files without notice, except the ones that were in existence prior to
c6d31e50 13601release 5.14. If those change, a deprecation cycle will be done first. These
50b27e73 13602are:
12ac2576 13603
99870f4d 13604@map_tables_actually_output
12ac2576 13605
ec2f0128
KW
13606Each of the files in this directory defines several hash entries to help
13607reading programs decipher it. One of them looks like this:
12ac2576 13608
99870f4d 13609 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
d73e5302 13610
77173124
KW
13611where "NAME" is a name to indicate the property. For backwards compatibility,
13612this is not necessarily the property's official Unicode name. (The "To" is
99870f4d
KW
13613also for backwards compatibility.) The hash entry gives the format of the
13614mapping fields of the table, currently one of the following:
d73e5302 13615
12916dad 13616@map_table_formats
d73e5302 13617
99870f4d
KW
13618This format applies only to the entries in the main body of the table.
13619Entries defined in hashes or ones that are missing from the list can have a
13620different format.
d73e5302 13621
ec2f0128 13622The value that the missing entries have is given by another SwashInfo hash
99870f4d 13623entry line; it looks like this:
d73e5302 13624
99870f4d 13625 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
d73e5302 13626
99870f4d 13627This example line says that any Unicode code points not explicitly listed in
77173124 13628the file have the value "NaN" under the property indicated by NAME. If the
99870f4d
KW
13629value is the special string C<< <code point> >>, it means that the value for
13630any missing code point is the code point itself. This happens, for example,
13631in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
77173124 13632character "A", are missing because the uppercase of "A" is itself.
d73e5302 13633
ec2f0128
KW
13634Finally, if the file contains a hash for special case entries, its name is
13635specified by an entry that looks like this:
13636
13637 \$utf8::SwashInfo{'ToNAME'}{'specials_name'} = 'utf8::ToSpecNAME';
13638
99870f4d 13639=head1 SEE ALSO
d73e5302 13640
99870f4d 13641L<$unicode_reference_url>
12ac2576 13642
99870f4d 13643L<perlrecharclass>
12ac2576 13644
99870f4d 13645L<perlunicode>
d73e5302 13646
99870f4d 13647END
d73e5302 13648
9218f1cf
KW
13649 # And write it. The 0 means no utf8.
13650 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
99870f4d
KW
13651 return;
13652}
d73e5302 13653
99870f4d
KW
13654sub make_Heavy () {
13655 # Create and write Heavy.pl, which passes info about the tables to
13656 # utf8_heavy.pl
12ac2576 13657
99870f4d
KW
13658 my @heavy = <<END;
13659$HEADER
13660$INTERNAL_ONLY
d73e5302 13661
99870f4d 13662# This file is for the use of utf8_heavy.pl
12ac2576 13663
c12f2655
KW
13664# Maps Unicode (not Perl single-form extensions) property names in loose
13665# standard form to their corresponding standard names
99870f4d
KW
13666\%utf8::loose_property_name_of = (
13667END
cf25bb62 13668
99870f4d
KW
13669 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
13670 push @heavy, <<END;
13671);
12ac2576 13672
99870f4d
KW
13673# Maps property, table to file for those using stricter matching
13674\%utf8::stricter_to_file_of = (
13675END
13676 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
13677 push @heavy, <<END;
13678);
12ac2576 13679
99870f4d
KW
13680# Maps property, table to file for those using loose matching
13681\%utf8::loose_to_file_of = (
13682END
13683 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
13684 push @heavy, <<END;
13685);
12ac2576 13686
99870f4d
KW
13687# Maps floating point to fractional form
13688\%utf8::nv_floating_to_rational = (
13689END
13690 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
13691 push @heavy, <<END;
13692);
12ac2576 13693
99870f4d
KW
13694# If a floating point number doesn't have enough digits in it to get this
13695# close to a fraction, it isn't considered to be that fraction even if all the
13696# digits it does have match.
13697\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 13698
99870f4d
KW
13699# Deprecated tables to generate a warning for. The key is the file containing
13700# the table, so as to avoid duplication, as many property names can map to the
13701# file, but we only need one entry for all of them.
13702\%utf8::why_deprecated = (
13703END
12ac2576 13704
99870f4d
KW
13705 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
13706 push @heavy, <<END;
13707);
12ac2576 13708
d867ccfb
KW
13709# A few properties have different behavior under /i matching. This maps the
13710# those to substitute files to use under /i.
13711\%utf8::caseless_equivalent = (
13712END
13713
d867ccfb
KW
13714 # We set the key to the file when we associated files with tables, but we
13715 # couldn't do the same for the value then, as we might not have the file
13716 # for the alternate table figured out at that time.
13717 foreach my $cased (keys %caseless_equivalent_to) {
13718 my @path = $caseless_equivalent_to{$cased}->file_path;
13719 my $path = join '/', @path[1, -1];
d867ccfb
KW
13720 $utf8::caseless_equivalent_to{$cased} = $path;
13721 }
13722 push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
13723 push @heavy, <<END;
13724);
13725
99870f4d
KW
137261;
13727END
12ac2576 13728
9218f1cf 13729 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
99870f4d 13730 return;
12ac2576
JP
13731}
13732
99870f4d
KW
13733sub write_all_tables() {
13734 # Write out all the tables generated by this program to files, as well as
13735 # the supporting data structures, pod file, and .t file.
13736
13737 my @writables; # List of tables that actually get written
13738 my %match_tables_to_write; # Used to collapse identical match tables
13739 # into one file. Each key is a hash function
13740 # result to partition tables into buckets.
13741 # Each value is an array of the tables that
13742 # fit in the bucket.
13743
13744 # For each property ...
13745 # (sort so that if there is an immutable file name, it has precedence, so
13746 # some other property can't come in and take over its file name. If b's
13747 # file name is defined, will return 1, meaning to take it first; don't
7fc6cb55
KW
13748 # care if both defined, as they had better be different anyway. And the
13749 # property named 'Perl' needs to be first (it doesn't have any immutable
13750 # file name) because empty properties are defined in terms of it's table
13751 # named 'Any'.)
99870f4d 13752 PROPERTY:
7fc6cb55
KW
13753 foreach my $property (sort { return -1 if $a == $perl;
13754 return 1 if $b == $perl;
13755 return defined $b->file
13756 } property_ref('*'))
13757 {
99870f4d
KW
13758 my $type = $property->type;
13759
13760 # And for each table for that property, starting with the mapping
13761 # table for it ...
13762 TABLE:
13763 foreach my $table($property,
13764
13765 # and all the match tables for it (if any), sorted so
13766 # the ones with the shortest associated file name come
13767 # first. The length sorting prevents problems of a
13768 # longer file taking a name that might have to be used
13769 # by a shorter one. The alphabetic sorting prevents
13770 # differences between releases
13771 sort { my $ext_a = $a->external_name;
13772 return 1 if ! defined $ext_a;
13773 my $ext_b = $b->external_name;
13774 return -1 if ! defined $ext_b;
a92d5c2e
KW
13775
13776 # But return the non-complement table before
13777 # the complement one, as the latter is defined
13778 # in terms of the former, and needs to have
13779 # the information for the former available.
13780 return 1 if $a->complement != 0;
13781 return -1 if $b->complement != 0;
13782
0a695432
KW
13783 # Similarly, return a subservient table after
13784 # a leader
13785 return 1 if $a->leader != $a;
13786 return -1 if $b->leader != $b;
13787
99870f4d
KW
13788 my $cmp = length $ext_a <=> length $ext_b;
13789
13790 # Return result if lengths not equal
13791 return $cmp if $cmp;
13792
13793 # Alphabetic if lengths equal
13794 return $ext_a cmp $ext_b
13795 } $property->tables
13796 )
13797 {
12ac2576 13798
99870f4d
KW
13799 # Here we have a table associated with a property. It could be
13800 # the map table (done first for each property), or one of the
13801 # other tables. Determine which type.
13802 my $is_property = $table->isa('Property');
13803
13804 my $name = $table->name;
13805 my $complete_name = $table->complete_name;
13806
13807 # See if should suppress the table if is empty, but warn if it
13808 # contains something.
13809 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
13810 keys %why_suppress_if_empty_warn_if_not;
13811
13812 # Calculate if this table should have any code points associated
13813 # with it or not.
13814 my $expected_empty =
13815
13816 # $perl should be empty, as well as properties that we just
13817 # don't do anything with
13818 ($is_property
13819 && ($table == $perl
13820 || grep { $complete_name eq $_ }
13821 @unimplemented_properties
13822 )
13823 )
13824
13825 # Match tables in properties we skipped populating should be
13826 # empty
13827 || (! $is_property && ! $property->to_create_match_tables)
13828
13829 # Tables and properties that are expected to have no code
13830 # points should be empty
13831 || $suppress_if_empty_warn_if_not
13832 ;
13833
13834 # Set a boolean if this table is the complement of an empty binary
13835 # table
13836 my $is_complement_of_empty_binary =
13837 $type == $BINARY &&
13838 (($table == $property->table('Y')
13839 && $property->table('N')->is_empty)
13840 || ($table == $property->table('N')
13841 && $property->table('Y')->is_empty));
13842
13843
13844 # Some tables should match everything
13845 my $expected_full =
13846 ($is_property)
13847 ? # All these types of map tables will be full because
13848 # they will have been populated with defaults
13849 ($type == $ENUM || $type == $BINARY)
13850
13851 : # A match table should match everything if its method
13852 # shows it should
13853 ($table->matches_all
13854
13855 # The complement of an empty binary table will match
13856 # everything
13857 || $is_complement_of_empty_binary
13858 )
13859 ;
13860
13861 if ($table->is_empty) {
13862
99870f4d
KW
13863 if ($suppress_if_empty_warn_if_not) {
13864 $table->set_status($SUPPRESSED,
13865 $why_suppress_if_empty_warn_if_not{$complete_name});
13866 }
12ac2576 13867
c12f2655 13868 # Suppress (by skipping them) expected empty tables.
99870f4d
KW
13869 next TABLE if $expected_empty;
13870
13871 # And setup to later output a warning for those that aren't
13872 # known to be allowed to be empty. Don't do the warning if
13873 # this table is a child of another one to avoid duplicating
13874 # the warning that should come from the parent one.
13875 if (($table == $property || $table->parent == $table)
13876 && $table->status ne $SUPPRESSED
13877 && ! grep { $complete_name =~ /^$_$/ }
13878 @tables_that_may_be_empty)
13879 {
13880 push @unhandled_properties, "$table";
13881 }
7fc6cb55
KW
13882
13883 # An empty table is just the complement of everything.
13884 $table->set_complement($Any) if $table != $property;
99870f4d
KW
13885 }
13886 elsif ($expected_empty) {
13887 my $because = "";
13888 if ($suppress_if_empty_warn_if_not) {
13889 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
13890 }
12ac2576 13891
99870f4d
KW
13892 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
13893 }
12ac2576 13894
99870f4d
KW
13895 my $count = $table->count;
13896 if ($expected_full) {
13897 if ($count != $MAX_UNICODE_CODEPOINTS) {
13898 Carp::my_carp("$table matches only "
13899 . clarify_number($count)
13900 . " Unicode code points but should match "
13901 . clarify_number($MAX_UNICODE_CODEPOINTS)
13902 . " (off by "
13903 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
13904 . "). Proceeding anyway.");
13905 }
12ac2576 13906
99870f4d
KW
13907 # Here is expected to be full. If it is because it is the
13908 # complement of an (empty) binary table that is to be
13909 # suppressed, then suppress this one as well.
13910 if ($is_complement_of_empty_binary) {
13911 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
13912 my $opposing = $property->table($opposing_name);
13913 my $opposing_status = $opposing->status;
13914 if ($opposing_status) {
13915 $table->set_status($opposing_status,
13916 $opposing->status_info);
13917 }
13918 }
13919 }
13920 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
13921 if ($table == $property || $table->leader == $table) {
13922 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
13923 }
13924 }
d73e5302 13925
99870f4d
KW
13926 if ($table->status eq $SUPPRESSED) {
13927 if (! $is_property) {
13928 my @children = $table->children;
13929 foreach my $child (@children) {
13930 if ($child->status ne $SUPPRESSED) {
13931 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
13932 }
13933 }
13934 }
13935 next TABLE;
d73e5302 13936
99870f4d
KW
13937 }
13938 if (! $is_property) {
13939
13940 # Several things need to be done just once for each related
13941 # group of match tables. Do them on the parent.
13942 if ($table->parent == $table) {
13943
13944 # Add an entry in the pod file for the table; it also does
13945 # the children.
23e33b60 13946 make_table_pod_entries($table) if defined $pod_directory;
99870f4d
KW
13947
13948 # See if the the table matches identical code points with
13949 # something that has already been output. In that case,
13950 # no need to have two files with the same code points in
13951 # them. We use the table's hash() method to store these
13952 # in buckets, so that it is quite likely that if two
13953 # tables are in the same bucket they will be identical, so
13954 # don't have to compare tables frequently. The tables
13955 # have to have the same status to share a file, so add
13956 # this to the bucket hash. (The reason for this latter is
13957 # that Heavy.pl associates a status with a file.)
06671cbc
KW
13958 # We don't check tables that are inverses of others, as it
13959 # would lead to some coding complications, and checking
13960 # all the regular ones should find everything.
13961 if ($table->complement == 0) {
21be712a 13962 my $hash = $table->hash . ';' . $table->status;
99870f4d 13963
21be712a
KW
13964 # Look at each table that is in the same bucket as
13965 # this one would be.
13966 foreach my $comparison
13967 (@{$match_tables_to_write{$hash}})
13968 {
13969 if ($table->matches_identically_to($comparison)) {
13970 $table->set_equivalent_to($comparison,
99870f4d 13971 Related => 0);
21be712a
KW
13972 next TABLE;
13973 }
99870f4d 13974 }
d73e5302 13975
21be712a
KW
13976 # Here, not equivalent, add this table to the bucket.
13977 push @{$match_tables_to_write{$hash}}, $table;
06671cbc 13978 }
99870f4d
KW
13979 }
13980 }
13981 else {
13982
13983 # Here is the property itself.
13984 # Don't write out or make references to the $perl property
13985 next if $table == $perl;
13986
13987 if ($type != $STRING) {
13988
13989 # There is a mapping stored of the various synonyms to the
13990 # standardized name of the property for utf8_heavy.pl.
13991 # Also, the pod file contains entries of the form:
13992 # \p{alias: *} \p{full: *}
13993 # rather than show every possible combination of things.
13994
13995 my @property_aliases = $property->aliases;
13996
13997 # The full name of this property is stored by convention
13998 # first in the alias array
13999 my $full_property_name =
14000 '\p{' . $property_aliases[0]->name . ': *}';
14001 my $standard_property_name = standardize($table->name);
14002
14003 # For each synonym ...
14004 for my $i (0 .. @property_aliases - 1) {
14005 my $alias = $property_aliases[$i];
14006 my $alias_name = $alias->name;
14007 my $alias_standard = standardize($alias_name);
14008
c12f2655 14009 # For utf8_heavy, set the mapping of the alias to the
99870f4d
KW
14010 # property
14011 if (exists ($loose_property_name_of{$alias_standard}))
14012 {
14013 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
14014 }
14015 else {
14016 $loose_property_name_of{$alias_standard}
14017 = $standard_property_name;
14018 }
14019
23e33b60
KW
14020 # Now for the pod entry for this alias. Skip if not
14021 # outputting a pod; skip the first one, which is the
14022 # full name so won't have an entry like: '\p{full: *}
14023 # \p{full: *}', and skip if don't want an entry for
14024 # this one.
14025 next if $i == 0
14026 || ! defined $pod_directory
14027 || ! $alias->make_pod_entry;
99870f4d 14028
d57ccc9a
KW
14029 my $rhs = $full_property_name;
14030 if ($property != $perl && $table->perl_extension) {
14031 $rhs .= ' (Perl extension)';
14032 }
99870f4d
KW
14033 push @match_properties,
14034 format_pod_line($indent_info_column,
14035 '\p{' . $alias->name . ': *}',
d57ccc9a 14036 $rhs,
99870f4d
KW
14037 $alias->status);
14038 }
14039 } # End of non-string-like property code
d73e5302 14040
d73e5302 14041
c12f2655 14042 # Don't write out a mapping file if not desired.
99870f4d
KW
14043 next if ! $property->to_output_map;
14044 }
d73e5302 14045
99870f4d
KW
14046 # Here, we know we want to write out the table, but don't do it
14047 # yet because there may be other tables that come along and will
14048 # want to share the file, and the file's comments will change to
14049 # mention them. So save for later.
14050 push @writables, $table;
14051
14052 } # End of looping through the property and all its tables.
14053 } # End of looping through all properties.
14054
14055 # Now have all the tables that will have files written for them. Do it.
14056 foreach my $table (@writables) {
14057 my @directory;
14058 my $filename;
14059 my $property = $table->property;
14060 my $is_property = ($table == $property);
14061 if (! $is_property) {
14062
14063 # Match tables for the property go in lib/$subdirectory, which is
14064 # the property's name. Don't use the standard file name for this,
14065 # as may get an unfamiliar alias
14066 @directory = ($matches_directory, $property->external_name);
14067 }
14068 else {
d73e5302 14069
99870f4d
KW
14070 @directory = $table->directory;
14071 $filename = $table->file;
14072 }
d73e5302 14073
98dc9551 14074 # Use specified filename if available, or default to property's
99870f4d
KW
14075 # shortest name. We need an 8.3 safe filename (which means "an 8
14076 # safe" filename, since after the dot is only 'pl', which is < 3)
14077 # The 2nd parameter is if the filename shouldn't be changed, and
14078 # it shouldn't iff there is a hard-coded name for this table.
14079 $filename = construct_filename(
14080 $filename || $table->external_name,
14081 ! $filename, # mutable if no filename
14082 \@directory);
d73e5302 14083
99870f4d 14084 register_file_for_name($table, \@directory, $filename);
d73e5302 14085
99870f4d
KW
14086 # Only need to write one file when shared by more than one
14087 # property
a92d5c2e
KW
14088 next if ! $is_property
14089 && ($table->leader != $table || $table->complement != 0);
d73e5302 14090
99870f4d
KW
14091 # Construct a nice comment to add to the file
14092 $table->set_final_comment;
14093
14094 $table->write;
cf25bb62 14095 }
d73e5302 14096
d73e5302 14097
99870f4d
KW
14098 # Write out the pod file
14099 make_pod;
14100
14101 # And Heavy.pl
14102 make_Heavy;
d73e5302 14103
99870f4d
KW
14104 make_property_test_script() if $make_test_script;
14105 return;
cf25bb62 14106}
d73e5302 14107
99870f4d
KW
14108my @white_space_separators = ( # This used only for making the test script.
14109 "",
14110 ' ',
14111 "\t",
14112 ' '
14113 );
d73e5302 14114
99870f4d
KW
14115sub generate_separator($) {
14116 # This used only for making the test script. It generates the colon or
14117 # equal separator between the property and property value, with random
14118 # white space surrounding the separator
d73e5302 14119
99870f4d 14120 my $lhs = shift;
d73e5302 14121
99870f4d 14122 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 14123
99870f4d
KW
14124 # Choose space before and after randomly
14125 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
14126 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 14127
99870f4d
KW
14128 # And return the whole complex, half the time using a colon, half the
14129 # equals
14130 return $spaces_before
14131 . (rand() < 0.5) ? '=' : ':'
14132 . $spaces_after;
14133}
76ccdbe2 14134
430ada4c 14135sub generate_tests($$$$$) {
99870f4d
KW
14136 # This used only for making the test script. It generates test cases that
14137 # are expected to compile successfully in perl. Note that the lhs and
14138 # rhs are assumed to already be as randomized as the caller wants.
14139
99870f4d
KW
14140 my $lhs = shift; # The property: what's to the left of the colon
14141 # or equals separator
14142 my $rhs = shift; # The property value; what's to the right
14143 my $valid_code = shift; # A code point that's known to be in the
14144 # table given by lhs=rhs; undef if table is
14145 # empty
14146 my $invalid_code = shift; # A code point known to not be in the table;
14147 # undef if the table is all code points
14148 my $warning = shift;
14149
14150 # Get the colon or equal
14151 my $separator = generate_separator($lhs);
14152
14153 # The whole 'property=value'
14154 my $name = "$lhs$separator$rhs";
14155
430ada4c 14156 my @output;
99870f4d
KW
14157 # Create a complete set of tests, with complements.
14158 if (defined $valid_code) {
430ada4c
NC
14159 push @output, <<"EOC"
14160Expect(1, $valid_code, '\\p{$name}', $warning);
14161Expect(0, $valid_code, '\\p{^$name}', $warning);
14162Expect(0, $valid_code, '\\P{$name}', $warning);
14163Expect(1, $valid_code, '\\P{^$name}', $warning);
14164EOC
99870f4d
KW
14165 }
14166 if (defined $invalid_code) {
430ada4c
NC
14167 push @output, <<"EOC"
14168Expect(0, $invalid_code, '\\p{$name}', $warning);
14169Expect(1, $invalid_code, '\\p{^$name}', $warning);
14170Expect(1, $invalid_code, '\\P{$name}', $warning);
14171Expect(0, $invalid_code, '\\P{^$name}', $warning);
14172EOC
14173 }
14174 return @output;
99870f4d 14175}
cf25bb62 14176
430ada4c 14177sub generate_error($$$) {
99870f4d
KW
14178 # This used only for making the test script. It generates test cases that
14179 # are expected to not only not match, but to be syntax or similar errors
14180
99870f4d
KW
14181 my $lhs = shift; # The property: what's to the left of the
14182 # colon or equals separator
14183 my $rhs = shift; # The property value; what's to the right
14184 my $already_in_error = shift; # Boolean; if true it's known that the
14185 # unmodified lhs and rhs will cause an error.
14186 # This routine should not force another one
14187 # Get the colon or equal
14188 my $separator = generate_separator($lhs);
14189
14190 # Since this is an error only, don't bother to randomly decide whether to
14191 # put the error on the left or right side; and assume that the rhs is
14192 # loosely matched, again for convenience rather than rigor.
14193 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
14194
14195 my $property = $lhs . $separator . $rhs;
14196
430ada4c
NC
14197 return <<"EOC";
14198Error('\\p{$property}');
14199Error('\\P{$property}');
14200EOC
d73e5302
JH
14201}
14202
99870f4d
KW
14203# These are used only for making the test script
14204# XXX Maybe should also have a bad strict seps, which includes underscore.
14205
14206my @good_loose_seps = (
14207 " ",
14208 "-",
14209 "\t",
14210 "",
14211 "_",
14212 );
14213my @bad_loose_seps = (
14214 "/a/",
14215 ':=',
14216 );
14217
14218sub randomize_stricter_name {
14219 # This used only for making the test script. Take the input name and
14220 # return a randomized, but valid version of it under the stricter matching
14221 # rules.
14222
14223 my $name = shift;
14224 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14225
14226 # If the name looks like a number (integer, floating, or rational), do
14227 # some extra work
14228 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
14229 my $sign = $1;
14230 my $number = $2;
14231 my $separator = $3;
14232
14233 # If there isn't a sign, part of the time add a plus
14234 # Note: Not testing having any denominator having a minus sign
14235 if (! $sign) {
14236 $sign = '+' if rand() <= .3;
14237 }
14238
14239 # And add 0 or more leading zeros.
14240 $name = $sign . ('0' x int rand(10)) . $number;
14241
14242 if (defined $separator) {
14243 my $extra_zeros = '0' x int rand(10);
cf25bb62 14244
99870f4d
KW
14245 if ($separator eq '.') {
14246
14247 # Similarly, add 0 or more trailing zeros after a decimal
14248 # point
14249 $name .= $extra_zeros;
14250 }
14251 else {
14252
14253 # Or, leading zeros before the denominator
14254 $name =~ s,/,/$extra_zeros,;
14255 }
14256 }
cf25bb62 14257 }
d73e5302 14258
99870f4d
KW
14259 # For legibility of the test, only change the case of whole sections at a
14260 # time. To do this, first split into sections. The split returns the
14261 # delimiters
14262 my @sections;
14263 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
14264 trace $section if main::DEBUG && $to_trace;
14265
14266 if (length $section > 1 && $section !~ /\D/) {
14267
14268 # If the section is a sequence of digits, about half the time
14269 # randomly add underscores between some of them.
14270 if (rand() > .5) {
14271
14272 # Figure out how many underscores to add. max is 1 less than
14273 # the number of digits. (But add 1 at the end to make sure
14274 # result isn't 0, and compensate earlier by subtracting 2
14275 # instead of 1)
14276 my $num_underscores = int rand(length($section) - 2) + 1;
14277
14278 # And add them evenly throughout, for convenience, not rigor
14279 use integer;
14280 my $spacing = (length($section) - 1)/ $num_underscores;
14281 my $temp = $section;
14282 $section = "";
14283 for my $i (1 .. $num_underscores) {
14284 $section .= substr($temp, 0, $spacing, "") . '_';
14285 }
14286 $section .= $temp;
14287 }
14288 push @sections, $section;
14289 }
14290 else {
d73e5302 14291
99870f4d
KW
14292 # Here not a sequence of digits. Change the case of the section
14293 # randomly
14294 my $switch = int rand(4);
14295 if ($switch == 0) {
14296 push @sections, uc $section;
14297 }
14298 elsif ($switch == 1) {
14299 push @sections, lc $section;
14300 }
14301 elsif ($switch == 2) {
14302 push @sections, ucfirst $section;
14303 }
14304 else {
14305 push @sections, $section;
14306 }
14307 }
cf25bb62 14308 }
99870f4d
KW
14309 trace "returning", join "", @sections if main::DEBUG && $to_trace;
14310 return join "", @sections;
14311}
71d929cb 14312
99870f4d
KW
14313sub randomize_loose_name($;$) {
14314 # This used only for making the test script
71d929cb 14315
99870f4d
KW
14316 my $name = shift;
14317 my $want_error = shift; # if true, make an error
14318 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14319
14320 $name = randomize_stricter_name($name);
5beb625e
JH
14321
14322 my @parts;
99870f4d 14323 push @parts, $good_loose_seps[rand(@good_loose_seps)];
45376db6
KW
14324
14325 # Preserve trailing ones for the sake of not stripping the underscore from
14326 # 'L_'
14327 for my $part (split /[-\s_]+ (?= . )/, $name) {
5beb625e 14328 if (@parts) {
99870f4d
KW
14329 if ($want_error and rand() < 0.3) {
14330 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
14331 $want_error = 0;
14332 }
14333 else {
14334 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
14335 }
14336 }
99870f4d 14337 push @parts, $part;
5beb625e 14338 }
99870f4d
KW
14339 my $new = join("", @parts);
14340 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 14341
99870f4d 14342 if ($want_error) {
5beb625e 14343 if (rand() >= 0.5) {
99870f4d
KW
14344 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
14345 }
14346 else {
14347 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
14348 }
14349 }
14350 return $new;
14351}
14352
99870f4d
KW
14353# Used to make sure don't generate duplicate test cases.
14354my %test_generated;
5beb625e 14355
99870f4d
KW
14356sub make_property_test_script() {
14357 # This used only for making the test script
14358 # this written directly -- it's huge.
5beb625e 14359
99870f4d 14360 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 14361
99870f4d
KW
14362 # This uses randomness to test different possibilities without testing all
14363 # possibilities. To ensure repeatability, set the seed to 0. But if
14364 # tests are added, it will perturb all later ones in the .t file
14365 srand 0;
5beb625e 14366
3df51b85
KW
14367 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
14368
99870f4d
KW
14369 # Keep going down an order of magnitude
14370 # until find that adding this quantity to
14371 # 1 remains 1; but put an upper limit on
14372 # this so in case this algorithm doesn't
14373 # work properly on some platform, that we
14374 # won't loop forever.
14375 my $digits = 0;
14376 my $min_floating_slop = 1;
14377 while (1+ $min_floating_slop != 1
14378 && $digits++ < 50)
5beb625e 14379 {
99870f4d
KW
14380 my $next = $min_floating_slop / 10;
14381 last if $next == 0; # If underflows,
14382 # use previous one
14383 $min_floating_slop = $next;
5beb625e 14384 }
430ada4c
NC
14385
14386 # It doesn't matter whether the elements of this array contain single lines
14387 # or multiple lines. main::write doesn't count the lines.
14388 my @output;
99870f4d
KW
14389
14390 foreach my $property (property_ref('*')) {
14391 foreach my $table ($property->tables) {
14392
14393 # Find code points that match, and don't match this table.
14394 my $valid = $table->get_valid_code_point;
14395 my $invalid = $table->get_invalid_code_point;
14396 my $warning = ($table->status eq $DEPRECATED)
14397 ? "'deprecated'"
14398 : '""';
14399
14400 # Test each possible combination of the property's aliases with
14401 # the table's. If this gets to be too many, could do what is done
14402 # in the set_final_comment() for Tables
14403 my @table_aliases = $table->aliases;
14404 my @property_aliases = $table->property->aliases;
807807b7
KW
14405
14406 # Every property can be optionally be prefixed by 'Is_', so test
14407 # that those work, by creating such a new alias for each
14408 # pre-existing one.
14409 push @property_aliases, map { Alias->new("Is_" . $_->name,
14410 $_->loose_match,
14411 $_->make_pod_entry,
14412 $_->externally_ok,
14413 $_->status)
14414 } @property_aliases;
99870f4d
KW
14415 my $max = max(scalar @table_aliases, scalar @property_aliases);
14416 for my $j (0 .. $max - 1) {
14417
14418 # The current alias for property is the next one on the list,
14419 # or if beyond the end, start over. Similarly for table
14420 my $property_name
14421 = $property_aliases[$j % @property_aliases]->name;
14422
14423 $property_name = "" if $table->property == $perl;
14424 my $table_alias = $table_aliases[$j % @table_aliases];
14425 my $table_name = $table_alias->name;
14426 my $loose_match = $table_alias->loose_match;
14427
14428 # If the table doesn't have a file, any test for it is
14429 # already guaranteed to be in error
14430 my $already_error = ! $table->file_path;
14431
14432 # Generate error cases for this alias.
430ada4c
NC
14433 push @output, generate_error($property_name,
14434 $table_name,
14435 $already_error);
99870f4d
KW
14436
14437 # If the table is guaranteed to always generate an error,
14438 # quit now without generating success cases.
14439 next if $already_error;
14440
14441 # Now for the success cases.
14442 my $random;
14443 if ($loose_match) {
14444
14445 # For loose matching, create an extra test case for the
14446 # standard name.
14447 my $standard = standardize($table_name);
14448
14449 # $test_name should be a unique combination for each test
14450 # case; used just to avoid duplicate tests
14451 my $test_name = "$property_name=$standard";
14452
14453 # Don't output duplicate test cases.
14454 if (! exists $test_generated{$test_name}) {
14455 $test_generated{$test_name} = 1;
430ada4c
NC
14456 push @output, generate_tests($property_name,
14457 $standard,
14458 $valid,
14459 $invalid,
14460 $warning,
14461 );
5beb625e 14462 }
99870f4d
KW
14463 $random = randomize_loose_name($table_name)
14464 }
14465 else { # Stricter match
14466 $random = randomize_stricter_name($table_name);
99598c8c 14467 }
99598c8c 14468
99870f4d
KW
14469 # Now for the main test case for this alias.
14470 my $test_name = "$property_name=$random";
14471 if (! exists $test_generated{$test_name}) {
14472 $test_generated{$test_name} = 1;
430ada4c
NC
14473 push @output, generate_tests($property_name,
14474 $random,
14475 $valid,
14476 $invalid,
14477 $warning,
14478 );
99870f4d
KW
14479
14480 # If the name is a rational number, add tests for the
14481 # floating point equivalent.
14482 if ($table_name =~ qr{/}) {
14483
14484 # Calculate the float, and find just the fraction.
14485 my $float = eval $table_name;
14486 my ($whole, $fraction)
14487 = $float =~ / (.*) \. (.*) /x;
14488
14489 # Starting with one digit after the decimal point,
14490 # create a test for each possible precision (number of
14491 # digits past the decimal point) until well beyond the
14492 # native number found on this machine. (If we started
14493 # with 0 digits, it would be an integer, which could
14494 # well match an unrelated table)
14495 PLACE:
14496 for my $i (1 .. $min_floating_slop + 3) {
14497 my $table_name = sprintf("%.*f", $i, $float);
14498 if ($i < $MIN_FRACTION_LENGTH) {
14499
14500 # If the test case has fewer digits than the
14501 # minimum acceptable precision, it shouldn't
14502 # succeed, so we expect an error for it.
14503 # E.g., 2/3 = .7 at one decimal point, and we
14504 # shouldn't say it matches .7. We should make
14505 # it be .667 at least before agreeing that the
14506 # intent was to match 2/3. But at the
14507 # less-than- acceptable level of precision, it
14508 # might actually match an unrelated number.
14509 # So don't generate a test case if this
14510 # conflating is possible. In our example, we
14511 # don't want 2/3 matching 7/10, if there is
14512 # a 7/10 code point.
14513 for my $existing
14514 (keys %nv_floating_to_rational)
14515 {
14516 next PLACE
14517 if abs($table_name - $existing)
14518 < $MAX_FLOATING_SLOP;
14519 }
430ada4c
NC
14520 push @output, generate_error($property_name,
14521 $table_name,
14522 1 # 1 => already an error
14523 );
99870f4d
KW
14524 }
14525 else {
14526
14527 # Here the number of digits exceeds the
14528 # minimum we think is needed. So generate a
14529 # success test case for it.
430ada4c
NC
14530 push @output, generate_tests($property_name,
14531 $table_name,
14532 $valid,
14533 $invalid,
14534 $warning,
14535 );
99870f4d
KW
14536 }
14537 }
99598c8c
JH
14538 }
14539 }
99870f4d
KW
14540 }
14541 }
14542 }
37e2e78e 14543
9218f1cf
KW
14544 &write($t_path,
14545 0, # Not utf8;
14546 [<DATA>,
14547 @output,
14548 (map {"Test_X('$_');\n"} @backslash_X_tests),
14549 "Finished();\n"]);
99870f4d
KW
14550 return;
14551}
99598c8c 14552
99870f4d
KW
14553# This is a list of the input files and how to handle them. The files are
14554# processed in their order in this list. Some reordering is possible if
14555# desired, but the v0 files should be first, and the extracted before the
14556# others except DAge.txt (as data in an extracted file can be over-ridden by
14557# the non-extracted. Some other files depend on data derived from an earlier
14558# file, like UnicodeData requires data from Jamo, and the case changing and
14559# folding requires data from Unicode. Mostly, it safest to order by first
14560# version releases in (except the Jamo). DAge.txt is read before the
14561# extracted ones because of the rarely used feature $compare_versions. In the
14562# unlikely event that there were ever an extracted file that contained the Age
14563# property information, it would have to go in front of DAge.
14564#
14565# The version strings allow the program to know whether to expect a file or
14566# not, but if a file exists in the directory, it will be processed, even if it
14567# is in a version earlier than expected, so you can copy files from a later
14568# release into an earlier release's directory.
14569my @input_file_objects = (
14570 Input_file->new('PropertyAliases.txt', v0,
14571 Handler => \&process_PropertyAliases,
14572 ),
14573 Input_file->new(undef, v0, # No file associated with this
3df51b85 14574 Progress_Message => 'Finishing property setup',
99870f4d
KW
14575 Handler => \&finish_property_setup,
14576 ),
14577 Input_file->new('PropValueAliases.txt', v0,
14578 Handler => \&process_PropValueAliases,
14579 Has_Missings_Defaults => $NOT_IGNORED,
14580 ),
14581 Input_file->new('DAge.txt', v3.2.0,
14582 Has_Missings_Defaults => $NOT_IGNORED,
14583 Property => 'Age'
14584 ),
14585 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
14586 Property => 'General_Category',
14587 ),
14588 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
14589 Property => 'Canonical_Combining_Class',
14590 Has_Missings_Defaults => $NOT_IGNORED,
14591 ),
14592 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
14593 Property => 'Numeric_Type',
14594 Has_Missings_Defaults => $NOT_IGNORED,
14595 ),
14596 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
14597 Property => 'East_Asian_Width',
14598 Has_Missings_Defaults => $NOT_IGNORED,
14599 ),
14600 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
14601 Property => 'Line_Break',
14602 Has_Missings_Defaults => $NOT_IGNORED,
14603 ),
14604 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
14605 Property => 'Bidi_Class',
14606 Has_Missings_Defaults => $NOT_IGNORED,
14607 ),
14608 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
14609 Property => 'Decomposition_Type',
14610 Has_Missings_Defaults => $NOT_IGNORED,
14611 ),
14612 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
14613 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
14614 Property => 'Numeric_Value',
14615 Each_Line_Handler => \&filter_numeric_value_line,
14616 Has_Missings_Defaults => $NOT_IGNORED,
14617 ),
14618 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
14619 Property => 'Joining_Group',
14620 Has_Missings_Defaults => $NOT_IGNORED,
14621 ),
14622
14623 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
14624 Property => 'Joining_Type',
14625 Has_Missings_Defaults => $NOT_IGNORED,
14626 ),
14627 Input_file->new('Jamo.txt', v2.0.0,
14628 Property => 'Jamo_Short_Name',
14629 Each_Line_Handler => \&filter_jamo_line,
14630 ),
14631 Input_file->new('UnicodeData.txt', v1.1.5,
14632 Pre_Handler => \&setup_UnicodeData,
14633
14634 # We clean up this file for some early versions.
14635 Each_Line_Handler => [ (($v_version lt v2.0.0 )
14636 ? \&filter_v1_ucd
14637 : ($v_version eq v2.1.5)
14638 ? \&filter_v2_1_5_ucd
3ffed8c2
KW
14639
14640 # And for 5.14 Perls with 6.0,
14641 # have to also make changes
14642 : ($v_version ge v6.0.0)
14643 ? \&filter_v6_ucd
14644 : undef),
99870f4d
KW
14645
14646 # And the main filter
14647 \&filter_UnicodeData_line,
14648 ],
14649 EOF_Handler => \&EOF_UnicodeData,
14650 ),
14651 Input_file->new('ArabicShaping.txt', v2.0.0,
14652 Each_Line_Handler =>
14653 [ ($v_version lt 4.1.0)
14654 ? \&filter_old_style_arabic_shaping
14655 : undef,
14656 \&filter_arabic_shaping_line,
14657 ],
14658 Has_Missings_Defaults => $NOT_IGNORED,
14659 ),
14660 Input_file->new('Blocks.txt', v2.0.0,
14661 Property => 'Block',
14662 Has_Missings_Defaults => $NOT_IGNORED,
14663 Each_Line_Handler => \&filter_blocks_lines
14664 ),
14665 Input_file->new('PropList.txt', v2.0.0,
14666 Each_Line_Handler => (($v_version lt v3.1.0)
14667 ? \&filter_old_style_proplist
14668 : undef),
14669 ),
14670 Input_file->new('Unihan.txt', v2.0.0,
14671 Pre_Handler => \&setup_unihan,
14672 Optional => 1,
14673 Each_Line_Handler => \&filter_unihan_line,
14674 ),
14675 Input_file->new('SpecialCasing.txt', v2.1.8,
14676 Each_Line_Handler => \&filter_special_casing_line,
14677 Pre_Handler => \&setup_special_casing,
14678 ),
14679 Input_file->new(
14680 'LineBreak.txt', v3.0.0,
14681 Has_Missings_Defaults => $NOT_IGNORED,
14682 Property => 'Line_Break',
14683 # Early versions had problematic syntax
14684 Each_Line_Handler => (($v_version lt v3.1.0)
14685 ? \&filter_early_ea_lb
14686 : undef),
14687 ),
14688 Input_file->new('EastAsianWidth.txt', v3.0.0,
14689 Property => 'East_Asian_Width',
14690 Has_Missings_Defaults => $NOT_IGNORED,
14691 # Early versions had problematic syntax
14692 Each_Line_Handler => (($v_version lt v3.1.0)
14693 ? \&filter_early_ea_lb
14694 : undef),
14695 ),
14696 Input_file->new('CompositionExclusions.txt', v3.0.0,
14697 Property => 'Composition_Exclusion',
14698 ),
14699 Input_file->new('BidiMirroring.txt', v3.0.1,
14700 Property => 'Bidi_Mirroring_Glyph',
14701 ),
37e2e78e
KW
14702 Input_file->new("NormalizationTest.txt", v3.0.1,
14703 Skip => 1,
14704 ),
99870f4d
KW
14705 Input_file->new('CaseFolding.txt', v3.0.1,
14706 Pre_Handler => \&setup_case_folding,
14707 Each_Line_Handler =>
14708 [ ($v_version lt v3.1.0)
14709 ? \&filter_old_style_case_folding
14710 : undef,
14711 \&filter_case_folding_line
14712 ],
99870f4d
KW
14713 ),
14714 Input_file->new('DCoreProperties.txt', v3.1.0,
14715 # 5.2 changed this file
14716 Has_Missings_Defaults => (($v_version ge v5.2.0)
14717 ? $NOT_IGNORED
14718 : $NO_DEFAULTS),
14719 ),
14720 Input_file->new('Scripts.txt', v3.1.0,
14721 Property => 'Script',
14722 Has_Missings_Defaults => $NOT_IGNORED,
14723 ),
14724 Input_file->new('DNormalizationProps.txt', v3.1.0,
14725 Has_Missings_Defaults => $NOT_IGNORED,
14726 Each_Line_Handler => (($v_version lt v4.0.1)
14727 ? \&filter_old_style_normalization_lines
14728 : undef),
14729 ),
14730 Input_file->new('HangulSyllableType.txt', v4.0.0,
14731 Has_Missings_Defaults => $NOT_IGNORED,
14732 Property => 'Hangul_Syllable_Type'),
14733 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
14734 Property => 'Word_Break',
14735 Has_Missings_Defaults => $NOT_IGNORED,
14736 ),
14737 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
14738 Property => 'Grapheme_Cluster_Break',
14739 Has_Missings_Defaults => $NOT_IGNORED,
14740 ),
37e2e78e
KW
14741 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
14742 Handler => \&process_GCB_test,
14743 ),
14744 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
14745 Skip => 1,
14746 ),
14747 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
14748 Skip => 1,
14749 ),
14750 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
14751 Skip => 1,
14752 ),
99870f4d
KW
14753 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
14754 Property => 'Sentence_Break',
14755 Has_Missings_Defaults => $NOT_IGNORED,
14756 ),
14757 Input_file->new('NamedSequences.txt', v4.1.0,
14758 Handler => \&process_NamedSequences
14759 ),
14760 Input_file->new('NameAliases.txt', v5.0.0,
14761 Property => 'Name_Alias',
14762 ),
37e2e78e
KW
14763 Input_file->new("BidiTest.txt", v5.2.0,
14764 Skip => 1,
14765 ),
99870f4d
KW
14766 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
14767 Optional => 1,
14768 Each_Line_Handler => \&filter_unihan_line,
14769 ),
14770 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
14771 Optional => 1,
14772 Each_Line_Handler => \&filter_unihan_line,
14773 ),
14774 Input_file->new('UnihanIRGSources.txt', v5.2.0,
14775 Optional => 1,
14776 Pre_Handler => \&setup_unihan,
14777 Each_Line_Handler => \&filter_unihan_line,
14778 ),
14779 Input_file->new('UnihanNumericValues.txt', v5.2.0,
14780 Optional => 1,
14781 Each_Line_Handler => \&filter_unihan_line,
14782 ),
14783 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
14784 Optional => 1,
14785 Each_Line_Handler => \&filter_unihan_line,
14786 ),
14787 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
14788 Optional => 1,
14789 Each_Line_Handler => \&filter_unihan_line,
14790 ),
14791 Input_file->new('UnihanReadings.txt', v5.2.0,
14792 Optional => 1,
14793 Each_Line_Handler => \&filter_unihan_line,
14794 ),
14795 Input_file->new('UnihanVariants.txt', v5.2.0,
14796 Optional => 1,
14797 Each_Line_Handler => \&filter_unihan_line,
14798 ),
82aed44a
KW
14799 Input_file->new('ScriptExtensions.txt', v6.0.0,
14800 Property => 'Script_Extensions',
14801 Pre_Handler => \&setup_script_extensions,
fbe1e607 14802 Each_Line_Handler => \&filter_script_extensions_line,
82aed44a 14803 ),
99870f4d 14804);
99598c8c 14805
99870f4d
KW
14806# End of all the preliminaries.
14807# Do it...
99598c8c 14808
99870f4d
KW
14809if ($compare_versions) {
14810 Carp::my_carp(<<END
14811Warning. \$compare_versions is set. Output is not suitable for production
14812END
14813 );
14814}
99598c8c 14815
99870f4d
KW
14816# Put into %potential_files a list of all the files in the directory structure
14817# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 14818# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
14819my @ignored_files_full_names = map { File::Spec->rel2abs(
14820 internal_file_to_platform($_))
14821 } keys %ignored_files;
14822File::Find::find({
14823 wanted=>sub {
37e2e78e 14824 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 14825 my $full = lc(File::Spec->rel2abs($_));
99870f4d 14826 $potential_files{$full} = 1
37e2e78e 14827 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
14828 return;
14829 }
14830}, File::Spec->curdir());
99598c8c 14831
99870f4d 14832my @mktables_list_output_files;
cdcef19a 14833my $old_start_time = 0;
cf25bb62 14834
3644ba60
KW
14835if (! -e $file_list) {
14836 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
14837 $write_unchanged_files = 1;
14838} elsif ($write_unchanged_files) {
99870f4d
KW
14839 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
14840}
14841else {
14842 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
14843 my $file_handle;
23e33b60 14844 if (! open $file_handle, "<", $file_list) {
3644ba60 14845 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
14846 $glob_list = 1;
14847 }
14848 else {
14849 my @input;
14850
14851 # Read and parse mktables.lst, placing the results from the first part
14852 # into @input, and the second part into @mktables_list_output_files
14853 for my $list ( \@input, \@mktables_list_output_files ) {
14854 while (<$file_handle>) {
14855 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
14856 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
14857 $old_start_time = $1;
14858 }
99870f4d
KW
14859 next if /^ \s* (?: \# .* )? $/x;
14860 last if /^ =+ $/x;
14861 my ( $file ) = split /\t/;
14862 push @$list, $file;
cf25bb62 14863 }
99870f4d
KW
14864 @$list = uniques(@$list);
14865 next;
cf25bb62
JH
14866 }
14867
99870f4d
KW
14868 # Look through all the input files
14869 foreach my $input (@input) {
14870 next if $input eq 'version'; # Already have checked this.
cf25bb62 14871
99870f4d
KW
14872 # Ignore if doesn't exist. The checking about whether we care or
14873 # not is done via the Input_file object.
14874 next if ! file_exists($input);
5beb625e 14875
99870f4d
KW
14876 # The paths are stored with relative names, and with '/' as the
14877 # delimiter; convert to absolute on this machine
517956bf 14878 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
99870f4d 14879 $potential_files{$full} = 1
517956bf 14880 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 14881 }
5beb625e 14882 }
cf25bb62 14883
99870f4d
KW
14884 close $file_handle;
14885}
14886
14887if ($glob_list) {
14888
14889 # Here wants to process all .txt files in the directory structure.
14890 # Convert them to full path names. They are stored in the platform's
14891 # relative style
f86864ac
KW
14892 my @known_files;
14893 foreach my $object (@input_file_objects) {
14894 my $file = $object->file;
14895 next unless defined $file;
14896 push @known_files, File::Spec->rel2abs($file);
14897 }
99870f4d
KW
14898
14899 my @unknown_input_files;
14900 foreach my $file (keys %potential_files) {
517956bf 14901 next if grep { lc($file) eq lc($_) } @known_files;
99870f4d
KW
14902
14903 # Here, the file is unknown to us. Get relative path name
14904 $file = File::Spec->abs2rel($file);
14905 push @unknown_input_files, $file;
14906
14907 # What will happen is we create a data structure for it, and add it to
14908 # the list of input files to process. First get the subdirectories
14909 # into an array
14910 my (undef, $directories, undef) = File::Spec->splitpath($file);
14911 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14912 my @directories = File::Spec->splitdir($directories);
14913
14914 # If the file isn't extracted (meaning none of the directories is the
14915 # extracted one), just add it to the end of the list of inputs.
14916 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 14917 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
14918 }
14919 else {
14920
14921 # Here, the file is extracted. It needs to go ahead of most other
14922 # processing. Search for the first input file that isn't a
14923 # special required property (that is, find one whose first_release
14924 # is non-0), and isn't extracted. Also, the Age property file is
14925 # processed before the extracted ones, just in case
14926 # $compare_versions is set.
14927 for (my $i = 0; $i < @input_file_objects; $i++) {
14928 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
14929 && lc($input_file_objects[$i]->file) ne 'dage.txt'
14930 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 14931 {
99f78760 14932 splice @input_file_objects, $i, 0,
37e2e78e 14933 Input_file->new($file, v0);
99870f4d
KW
14934 last;
14935 }
cf25bb62 14936 }
99870f4d 14937
cf25bb62 14938 }
d2d499f5 14939 }
99870f4d 14940 if (@unknown_input_files) {
23e33b60 14941 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
14942
14943The following files are unknown as to how to handle. Assuming they are
14944typical property files. You'll know by later error messages if it worked or
14945not:
14946END
99f78760 14947 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
14948 }
14949} # End of looking through directory structure for more .txt files.
5beb625e 14950
99870f4d
KW
14951# Create the list of input files from the objects we have defined, plus
14952# version
14953my @input_files = 'version';
14954foreach my $object (@input_file_objects) {
14955 my $file = $object->file;
14956 next if ! defined $file; # Not all objects have files
14957 next if $object->optional && ! -e $file;
14958 push @input_files, $file;
14959}
5beb625e 14960
99870f4d
KW
14961if ( $verbosity >= $VERBOSE ) {
14962 print "Expecting ".scalar( @input_files )." input files. ",
14963 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
14964}
cf25bb62 14965
aeab6150
KW
14966# We set $most_recent to be the most recently changed input file, including
14967# this program itself (done much earlier in this file)
99870f4d 14968foreach my $in (@input_files) {
cdcef19a
KW
14969 next unless -e $in; # Keep going even if missing a file
14970 my $mod_time = (stat $in)[9];
aeab6150 14971 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
14972
14973 # See that the input files have distinct names, to warn someone if they
14974 # are adding a new one
14975 if ($make_list) {
14976 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
14977 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14978 my @directories = File::Spec->splitdir($directories);
14979 my $base = $file =~ s/\.txt$//;
14980 construct_filename($file, 'mutable', \@directories);
cf25bb62 14981 }
99870f4d 14982}
cf25bb62 14983
dff6c046 14984my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 14985 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 14986 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 14987
99870f4d
KW
14988# Now we check to see if any output files are older than youngest, if
14989# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 14990if (! $rebuild) {
99870f4d
KW
14991 foreach my $out (@mktables_list_output_files) {
14992 if ( ! file_exists($out)) {
14993 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 14994 $rebuild = 1;
99870f4d
KW
14995 last;
14996 }
14997 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
14998 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
14999 if ( (stat $out)[9] <= $most_recent ) {
15000 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 15001 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 15002 $rebuild = 1;
99870f4d 15003 last;
cf25bb62 15004 }
cf25bb62 15005 }
99870f4d 15006}
d1d1cd7a 15007if (! $rebuild) {
1265e11f 15008 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
15009 exit(0);
15010}
15011print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 15012
99870f4d
KW
15013# Ready to do the major processing. First create the perl pseudo-property.
15014$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 15015
99870f4d
KW
15016# Process each input file
15017foreach my $file (@input_file_objects) {
15018 $file->run;
d2d499f5
JH
15019}
15020
99870f4d 15021# Finish the table generation.
c4051cc5 15022
99870f4d
KW
15023print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
15024finish_Unicode();
c4051cc5 15025
99870f4d
KW
15026print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
15027compile_perl();
c4051cc5 15028
99870f4d
KW
15029print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
15030add_perl_synonyms();
c4051cc5 15031
99870f4d
KW
15032print "Writing tables\n" if $verbosity >= $PROGRESS;
15033write_all_tables();
c4051cc5 15034
99870f4d
KW
15035# Write mktables.lst
15036if ( $file_list and $make_list ) {
c4051cc5 15037
99870f4d
KW
15038 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
15039 foreach my $file (@input_files, @files_actually_output) {
15040 my (undef, $directories, $file) = File::Spec->splitpath($file);
15041 my @directories = File::Spec->splitdir($directories);
15042 $file = join '/', @directories, $file;
15043 }
15044
15045 my $ofh;
15046 if (! open $ofh,">",$file_list) {
15047 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
15048 return
15049 }
15050 else {
cdcef19a 15051 my $localtime = localtime $start_time;
99870f4d
KW
15052 print $ofh <<"END";
15053#
15054# $file_list -- File list for $0.
97050450 15055#
cdcef19a 15056# Autogenerated starting on $start_time ($localtime)
97050450
YO
15057#
15058# - First section is input files
99870f4d 15059# ($0 itself is not listed but is automatically considered an input)
98dc9551 15060# - Section separator is /^=+\$/
97050450
YO
15061# - Second section is a list of output files.
15062# - Lines matching /^\\s*#/ are treated as comments
15063# which along with blank lines are ignored.
15064#
15065
15066# Input files:
15067
99870f4d
KW
15068END
15069 print $ofh "$_\n" for sort(@input_files);
15070 print $ofh "\n=================================\n# Output files:\n\n";
15071 print $ofh "$_\n" for sort @files_actually_output;
15072 print $ofh "\n# ",scalar(@input_files)," input files\n",
15073 "# ",scalar(@files_actually_output)+1," output files\n\n",
15074 "# End list\n";
15075 close $ofh
15076 or Carp::my_carp("Failed to close $ofh: $!");
15077
15078 print "Filelist has ",scalar(@input_files)," input files and ",
15079 scalar(@files_actually_output)+1," output files\n"
15080 if $verbosity >= $VERBOSE;
15081 }
15082}
15083
15084# Output these warnings unless -q explicitly specified.
c83dffeb 15085if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
99870f4d
KW
15086 if (@unhandled_properties) {
15087 print "\nProperties and tables that unexpectedly have no code points\n";
15088 foreach my $property (sort @unhandled_properties) {
15089 print $property, "\n";
15090 }
15091 }
15092
15093 if (%potential_files) {
15094 print "\nInput files that are not considered:\n";
15095 foreach my $file (sort keys %potential_files) {
15096 print File::Spec->abs2rel($file), "\n";
15097 }
15098 }
15099 print "\nAll done\n" if $verbosity >= $VERBOSE;
15100}
5beb625e 15101exit(0);
cf25bb62 15102
99870f4d 15103# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 15104__DATA__
99870f4d 15105
5beb625e
JH
15106use strict;
15107use warnings;
15108
66fd7fd0
KW
15109# If run outside the normal test suite on an ASCII platform, you can
15110# just create a latin1_to_native() function that just returns its
15111# inputs, because that's the only function used from test.pl
15112require "test.pl";
15113
37e2e78e
KW
15114# Test qr/\X/ and the \p{} regular expression constructs. This file is
15115# constructed by mktables from the tables it generates, so if mktables is
15116# buggy, this won't necessarily catch those bugs. Tests are generated for all
15117# feasible properties; a few aren't currently feasible; see
15118# is_code_point_usable() in mktables for details.
99870f4d
KW
15119
15120# Standard test packages are not used because this manipulates SIG_WARN. It
15121# exits 0 if every non-skipped test succeeded; -1 if any failed.
15122
5beb625e
JH
15123my $Tests = 0;
15124my $Fails = 0;
99870f4d 15125
99870f4d
KW
15126sub Expect($$$$) {
15127 my $expected = shift;
15128 my $ord = shift;
15129 my $regex = shift;
15130 my $warning_type = shift; # Type of warning message, like 'deprecated'
15131 # or empty if none
15132 my $line = (caller)[2];
66fd7fd0 15133 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 15134
99870f4d 15135 # Convert the code point to hex form
23e33b60 15136 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 15137
99870f4d 15138 my @tests = "";
5beb625e 15139
37e2e78e
KW
15140 # The first time through, use all warnings. If the input should generate
15141 # a warning, add another time through with them turned off
99870f4d
KW
15142 push @tests, "no warnings '$warning_type';" if $warning_type;
15143
15144 foreach my $no_warnings (@tests) {
15145
15146 # Store any warning messages instead of outputting them
15147 local $SIG{__WARN__} = $SIG{__WARN__};
15148 my $warning_message;
15149 $SIG{__WARN__} = sub { $warning_message = $_[0] };
15150
15151 $Tests++;
15152
15153 # A string eval is needed because of the 'no warnings'.
15154 # Assumes no parens in the regular expression
15155 my $result = eval "$no_warnings
15156 my \$RegObj = qr($regex);
15157 $string =~ \$RegObj ? 1 : 0";
15158 if (not defined $result) {
15159 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
15160 $Fails++;
15161 }
15162 elsif ($result ^ $expected) {
15163 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
15164 $Fails++;
15165 }
15166 elsif ($warning_message) {
15167 if (! $warning_type || ($warning_type && $no_warnings)) {
15168 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
15169 $Fails++;
15170 }
15171 else {
15172 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
15173 }
15174 }
15175 elsif ($warning_type && ! $no_warnings) {
15176 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
15177 $Fails++;
15178 }
15179 else {
15180 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
15181 }
5beb625e 15182 }
99870f4d 15183 return;
5beb625e 15184}
d73e5302 15185
99870f4d
KW
15186sub Error($) {
15187 my $regex = shift;
5beb625e 15188 $Tests++;
99870f4d 15189 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 15190 $Fails++;
99870f4d
KW
15191 my $line = (caller)[2];
15192 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 15193 }
99870f4d
KW
15194 else {
15195 my $line = (caller)[2];
15196 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
15197 }
15198 return;
5beb625e
JH
15199}
15200
37e2e78e
KW
15201# GCBTest.txt character that separates grapheme clusters
15202my $breakable_utf8 = my $breakable = chr(0xF7);
15203utf8::upgrade($breakable_utf8);
15204
15205# GCBTest.txt character that indicates that the adjoining code points are part
15206# of the same grapheme cluster
15207my $nobreak_utf8 = my $nobreak = chr(0xD7);
15208utf8::upgrade($nobreak_utf8);
15209
15210sub Test_X($) {
15211 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
15212 # Each such line is a sequence of code points given by their hex numbers,
15213 # separated by the two characters defined just before this subroutine that
15214 # indicate that either there can or cannot be a break between the adjacent
15215 # code points. If there isn't a break, that means the sequence forms an
15216 # extended grapheme cluster, which means that \X should match the whole
15217 # thing. If there is a break, \X should stop there. This is all
15218 # converted by this routine into a match:
15219 # $string =~ /(\X)/,
15220 # Each \X should match the next cluster; and that is what is checked.
15221
15222 my $template = shift;
15223
15224 my $line = (caller)[2];
15225
15226 # The line contains characters above the ASCII range, but in Latin1. It
15227 # may or may not be in utf8, and if it is, it may or may not know it. So,
15228 # convert these characters to 8 bits. If knows is in utf8, simply
15229 # downgrade.
15230 if (utf8::is_utf8($template)) {
15231 utf8::downgrade($template);
15232 } else {
15233
15234 # Otherwise, if it is in utf8, but doesn't know it, the next lines
15235 # convert the two problematic characters to their 8-bit equivalents.
15236 # If it isn't in utf8, they don't harm anything.
15237 use bytes;
15238 $template =~ s/$nobreak_utf8/$nobreak/g;
15239 $template =~ s/$breakable_utf8/$breakable/g;
15240 }
15241
15242 # Get rid of the leading and trailing breakables
15243 $template =~ s/^ \s* $breakable \s* //x;
15244 $template =~ s/ \s* $breakable \s* $ //x;
15245
15246 # And no-breaks become just a space.
15247 $template =~ s/ \s* $nobreak \s* / /xg;
15248
15249 # Split the input into segments that are breakable between them.
15250 my @segments = split /\s*$breakable\s*/, $template;
15251
15252 my $string = "";
15253 my $display_string = "";
15254 my @should_match;
15255 my @should_display;
15256
15257 # Convert the code point sequence in each segment into a Perl string of
15258 # characters
15259 foreach my $segment (@segments) {
15260 my @code_points = split /\s+/, $segment;
15261 my $this_string = "";
15262 my $this_display = "";
15263 foreach my $code_point (@code_points) {
66fd7fd0 15264 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
15265 $this_display .= "\\x{$code_point}";
15266 }
15267
15268 # The next cluster should match the string in this segment.
15269 push @should_match, $this_string;
15270 push @should_display, $this_display;
15271 $string .= $this_string;
15272 $display_string .= $this_display;
15273 }
15274
15275 # If a string can be represented in both non-ut8 and utf8, test both cases
15276 UPGRADE:
15277 for my $to_upgrade (0 .. 1) {
678f13d5 15278
37e2e78e
KW
15279 if ($to_upgrade) {
15280
15281 # If already in utf8, would just be a repeat
15282 next UPGRADE if utf8::is_utf8($string);
15283
15284 utf8::upgrade($string);
15285 }
15286
15287 # Finally, do the \X match.
15288 my @matches = $string =~ /(\X)/g;
15289
15290 # Look through each matched cluster to verify that it matches what we
15291 # expect.
15292 my $min = (@matches < @should_match) ? @matches : @should_match;
15293 for my $i (0 .. $min - 1) {
15294 $Tests++;
15295 if ($matches[$i] eq $should_match[$i]) {
15296 print "ok $Tests - ";
15297 if ($i == 0) {
15298 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
15299 } else {
15300 print "And \\X #", $i + 1,
15301 }
15302 print " correctly matched $should_display[$i]; line $line\n";
15303 } else {
15304 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
15305 unpack("U*", $matches[$i]));
15306 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
15307 $i + 1,
15308 " should have matched $should_display[$i]",
15309 " but instead matched $matches[$i]",
15310 ". Abandoning rest of line $line\n";
15311 next UPGRADE;
15312 }
15313 }
15314
15315 # And the number of matches should equal the number of expected matches.
15316 $Tests++;
15317 if (@matches == @should_match) {
15318 print "ok $Tests - Nothing was left over; line $line\n";
15319 } else {
15320 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
15321 }
15322 }
15323
15324 return;
15325}
15326
99870f4d 15327sub Finished() {
f86864ac 15328 print "1..$Tests\n";
99870f4d 15329 exit($Fails ? -1 : 0);
5beb625e 15330}
99870f4d
KW
15331
15332Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 15333Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
15334Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
15335Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 15336Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726