This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Improve comments, white-space
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a 19my $start_time;
98dc9551 20BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
21 # get it as close as possible
22 $start_time= time;
23}
24
23e33b60 25require 5.010_001;
d73e5302 26use strict;
99870f4d 27use warnings;
cf25bb62 28use Carp;
bd9ebcfd 29use Config;
99870f4d
KW
30use File::Find;
31use File::Path;
d07a55ed 32use File::Spec;
99870f4d
KW
33use Text::Tabs;
34
35sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 36my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d
KW
37
38##########################################################################
39#
40# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
41# from the Unicode database files (lib/unicore/.../*.txt), It also generates
42# a pod file and a .t file
43#
44# The structure of this file is:
45# First these introductory comments; then
46# code needed for everywhere, such as debugging stuff; then
47# code to handle input parameters; then
48# data structures likely to be of external interest (some of which depend on
49# the input parameters, so follows them; then
50# more data structures and subroutine and package (class) definitions; then
51# the small actual loop to process the input files and finish up; then
52# a __DATA__ section, for the .t tests
53#
5f7264c7 54# This program works on all releases of Unicode through at least 6.0. The
99870f4d
KW
55# outputs have been scrutinized most intently for release 5.1. The others
56# have been checked for somewhat more than just sanity. It can handle all
57# existing Unicode character properties in those releases.
58#
99870f4d
KW
59# This program is mostly about Unicode character (or code point) properties.
60# A property describes some attribute or quality of a code point, like if it
61# is lowercase or not, its name, what version of Unicode it was first defined
62# in, or what its uppercase equivalent is. Unicode deals with these disparate
63# possibilities by making all properties into mappings from each code point
64# into some corresponding value. In the case of it being lowercase or not,
65# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
66# property maps each Unicode code point to a single value, called a "property
67# value". (Hence each Unicode property is a true mathematical function with
68# exactly one value per code point.)
69#
70# When using a property in a regular expression, what is desired isn't the
71# mapping of the code point to its property's value, but the reverse (or the
72# mathematical "inverse relation"): starting with the property value, "Does a
73# code point map to it?" These are written in a "compound" form:
74# \p{property=value}, e.g., \p{category=punctuation}. This program generates
75# files containing the lists of code points that map to each such regular
76# expression property value, one file per list
77#
78# There is also a single form shortcut that Perl adds for many of the commonly
79# used properties. This happens for all binary properties, plus script,
80# general_category, and block properties.
81#
82# Thus the outputs of this program are files. There are map files, mostly in
83# the 'To' directory; and there are list files for use in regular expression
84# matching, all in subdirectories of the 'lib' directory, with each
85# subdirectory being named for the property that the lists in it are for.
86# Bookkeeping, test, and documentation files are also generated.
87
88my $matches_directory = 'lib'; # Where match (\p{}) files go.
89my $map_directory = 'To'; # Where map files go.
90
91# DATA STRUCTURES
92#
93# The major data structures of this program are Property, of course, but also
94# Table. There are two kinds of tables, very similar to each other.
95# "Match_Table" is the data structure giving the list of code points that have
96# a particular property value, mentioned above. There is also a "Map_Table"
97# data structure which gives the property's mapping from code point to value.
98# There are two structures because the match tables need to be combined in
99# various ways, such as constructing unions, intersections, complements, etc.,
100# and the map ones don't. And there would be problems, perhaps subtle, if
101# a map table were inadvertently operated on in some of those ways.
102# The use of separate classes with operations defined on one but not the other
103# prevents accidentally confusing the two.
104#
105# At the heart of each table's data structure is a "Range_List", which is just
106# an ordered list of "Ranges", plus ancillary information, and methods to
107# operate on them. A Range is a compact way to store property information.
108# Each range has a starting code point, an ending code point, and a value that
109# is meant to apply to all the code points between the two end points,
110# inclusive. For a map table, this value is the property value for those
111# code points. Two such ranges could be written like this:
112# 0x41 .. 0x5A, 'Upper',
113# 0x61 .. 0x7A, 'Lower'
114#
115# Each range also has a type used as a convenience to classify the values.
116# Most ranges in this program will be Type 0, or normal, but there are some
117# ranges that have a non-zero type. These are used only in map tables, and
118# are for mappings that don't fit into the normal scheme of things. Mappings
119# that require a hash entry to communicate with utf8.c are one example;
120# another example is mappings for charnames.pm to use which indicate a name
121# that is algorithmically determinable from its code point (and vice-versa).
122# These are used to significantly compact these tables, instead of listing
123# each one of the tens of thousands individually.
124#
125# In a match table, the value of a range is irrelevant (and hence the type as
126# well, which will always be 0), and arbitrarily set to the null string.
127# Using the example above, there would be two match tables for those two
128# entries, one named Upper would contain the 0x41..0x5A range, and the other
129# named Lower would contain 0x61..0x7A.
130#
131# Actually, there are two types of range lists, "Range_Map" is the one
132# associated with map tables, and "Range_List" with match tables.
133# Again, this is so that methods can be defined on one and not the other so as
134# to prevent operating on them in incorrect ways.
135#
136# Eventually, most tables are written out to files to be read by utf8_heavy.pl
137# in the perl core. All tables could in theory be written, but some are
138# suppressed because there is no current practical use for them. It is easy
139# to change which get written by changing various lists that are near the top
140# of the actual code in this file. The table data structures contain enough
141# ancillary information to allow them to be treated as separate entities for
142# writing, such as the path to each one's file. There is a heading in each
143# map table that gives the format of its entries, and what the map is for all
144# the code points missing from it. (This allows tables to be more compact.)
678f13d5 145#
99870f4d
KW
146# The Property data structure contains one or more tables. All properties
147# contain a map table (except the $perl property which is a
148# pseudo-property containing only match tables), and any properties that
149# are usable in regular expression matches also contain various matching
150# tables, one for each value the property can have. A binary property can
151# have two values, True and False (or Y and N, which are preferred by Unicode
152# terminology). Thus each of these properties will have a map table that
153# takes every code point and maps it to Y or N (but having ranges cuts the
154# number of entries in that table way down), and two match tables, one
155# which has a list of all the code points that map to Y, and one for all the
156# code points that map to N. (For each of these, a third table is also
157# generated for the pseudo Perl property. It contains the identical code
158# points as the Y table, but can be written, not in the compound form, but in
159# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
160# properties have several possible values, some have many, and properties like
161# Name have a different value for every named code point. Those will not,
162# unless the controlling lists are changed, have their match tables written
163# out. But all the ones which can be used in regular expression \p{} and \P{}
c12f2655
KW
164# constructs will. Prior to 5.14, generally a property would have either its
165# map table or its match tables written but not both. Again, what gets
166# written is controlled by lists which can easily be changed. Starting in
167# 5.14, advantage was taken of this, and all the map tables needed to
168# reconstruct the Unicode db are now written out, while suppressing the
169# Unicode .txt files that contain the data. Our tables are much more compact
170# than the .txt files, so a significant space savings was achieved.
171
172# Properties have a 'Type', like binary, or string, or enum depending on how
173# many match tables there are and the content of the maps. This 'Type' is
174# different than a range 'Type', so don't get confused by the two concepts
175# having the same name.
678f13d5 176#
99870f4d
KW
177# For information about the Unicode properties, see Unicode's UAX44 document:
178
179my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
180
181# As stated earlier, this program will work on any release of Unicode so far.
182# Most obvious problems in earlier data have NOT been corrected except when
183# necessary to make Perl or this program work reasonably. For example, no
c12f2655
KW
184# folding information was given in early releases, so this program substitutes
185# lower case instead, just so that a regular expression with the /i option
186# will do something that actually gives the right results in many cases.
187# There are also a couple other corrections for version 1.1.5, commented at
188# the point they are made. As an example of corrections that weren't made
189# (but could be) is this statement from DerivedAge.txt: "The supplementary
190# private use code points and the non-character code points were assigned in
191# version 2.0, but not specifically listed in the UCD until versions 3.0 and
192# 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0) More information
193# on Unicode version glitches is further down in these introductory comments.
99870f4d 194#
5f7264c7
KW
195# This program works on all non-provisional properties as of 6.0, though the
196# files for some are suppressed from apparent lack of demand for them. You
197# can change which are output by changing lists in this program.
678f13d5 198#
dc85bd38 199# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
200# loose matchings rules (from Unicode TR18):
201#
202# The recommended names for UCD properties and property values are in
203# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
204# [PropValue]. There are both abbreviated names and longer, more
205# descriptive names. It is strongly recommended that both names be
206# recognized, and that loose matching of property names be used,
207# whereby the case distinctions, whitespace, hyphens, and underbar
208# are ignored.
209# The program still allows Fuzzy to override its determination of if loose
210# matching should be used, but it isn't currently used, as it is no longer
211# needed; the calculations it makes are good enough.
678f13d5 212#
99870f4d
KW
213# SUMMARY OF HOW IT WORKS:
214#
215# Process arguments
216#
217# A list is constructed containing each input file that is to be processed
218#
219# Each file on the list is processed in a loop, using the associated handler
220# code for each:
221# The PropertyAliases.txt and PropValueAliases.txt files are processed
222# first. These files name the properties and property values.
223# Objects are created of all the property and property value names
224# that the rest of the input should expect, including all synonyms.
225# The other input files give mappings from properties to property
226# values. That is, they list code points and say what the mapping
227# is under the given property. Some files give the mappings for
228# just one property; and some for many. This program goes through
229# each file and populates the properties from them. Some properties
230# are listed in more than one file, and Unicode has set up a
231# precedence as to which has priority if there is a conflict. Thus
232# the order of processing matters, and this program handles the
233# conflict possibility by processing the overriding input files
234# last, so that if necessary they replace earlier values.
235# After this is all done, the program creates the property mappings not
236# furnished by Unicode, but derivable from what it does give.
237# The tables of code points that match each property value in each
238# property that is accessible by regular expressions are created.
239# The Perl-defined properties are created and populated. Many of these
240# require data determined from the earlier steps
241# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 242# and Unicode are reconciled and warned about.
99870f4d
KW
243# All the properties are written to files
244# Any other files are written, and final warnings issued.
678f13d5 245#
99870f4d
KW
246# For clarity, a number of operators have been overloaded to work on tables:
247# ~ means invert (take all characters not in the set). The more
248# conventional '!' is not used because of the possibility of confusing
249# it with the actual boolean operation.
250# + means union
251# - means subtraction
252# & means intersection
253# The precedence of these is the order listed. Parentheses should be
254# copiously used. These are not a general scheme. The operations aren't
255# defined for a number of things, deliberately, to avoid getting into trouble.
256# Operations are done on references and affect the underlying structures, so
257# that the copy constructors for them have been overloaded to not return a new
258# clone, but the input object itself.
678f13d5 259#
99870f4d
KW
260# The bool operator is deliberately not overloaded to avoid confusion with
261# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
262#
263# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
264#
265# This program needs to be able to run under miniperl. Therefore, it uses a
266# minimum of other modules, and hence implements some things itself that could
267# be gotten from CPAN
268#
269# This program uses inputs published by the Unicode Consortium. These can
270# change incompatibly between releases without the Perl maintainers realizing
271# it. Therefore this program is now designed to try to flag these. It looks
272# at the directories where the inputs are, and flags any unrecognized files.
273# It keeps track of all the properties in the files it handles, and flags any
274# that it doesn't know how to handle. It also flags any input lines that
275# don't match the expected syntax, among other checks.
276#
277# It is also designed so if a new input file matches one of the known
278# templates, one hopefully just needs to add it to a list to have it
279# processed.
280#
281# As mentioned earlier, some properties are given in more than one file. In
282# particular, the files in the extracted directory are supposedly just
283# reformattings of the others. But they contain information not easily
284# derivable from the other files, including results for Unihan, which this
285# program doesn't ordinarily look at, and for unassigned code points. They
286# also have historically had errors or been incomplete. In an attempt to
287# create the best possible data, this program thus processes them first to
288# glean information missing from the other files; then processes those other
289# files to override any errors in the extracted ones. Much of the design was
290# driven by this need to store things and then possibly override them.
291#
292# It tries to keep fatal errors to a minimum, to generate something usable for
293# testing purposes. It always looks for files that could be inputs, and will
294# warn about any that it doesn't know how to handle (the -q option suppresses
295# the warning).
99870f4d 296#
678f13d5
KW
297# Why is there more than one type of range?
298# This simplified things. There are some very specialized code points that
299# have to be handled specially for output, such as Hangul syllable names.
300# By creating a range type (done late in the development process), it
301# allowed this to be stored with the range, and overridden by other input.
302# Originally these were stored in another data structure, and it became a
303# mess trying to decide if a second file that was for the same property was
304# overriding the earlier one or not.
305#
306# Why are there two kinds of tables, match and map?
307# (And there is a base class shared by the two as well.) As stated above,
308# they actually are for different things. Development proceeded much more
309# smoothly when I (khw) realized the distinction. Map tables are used to
310# give the property value for every code point (actually every code point
311# that doesn't map to a default value). Match tables are used for regular
312# expression matches, and are essentially the inverse mapping. Separating
313# the two allows more specialized methods, and error checks so that one
314# can't just take the intersection of two map tables, for example, as that
315# is nonsensical.
99870f4d 316#
23e33b60
KW
317# DEBUGGING
318#
678f13d5
KW
319# This program is written so it will run under miniperl. Occasionally changes
320# will cause an error where the backtrace doesn't work well under miniperl.
321# To diagnose the problem, you can instead run it under regular perl, if you
322# have one compiled.
323#
324# There is a good trace facility. To enable it, first sub DEBUG must be set
325# to return true. Then a line like
326#
327# local $to_trace = 1 if main::DEBUG;
328#
329# can be added to enable tracing in its lexical scope or until you insert
330# another line:
331#
332# local $to_trace = 0 if main::DEBUG;
333#
334# then use a line like "trace $a, @b, %c, ...;
335#
336# Some of the more complex subroutines already have trace statements in them.
337# Permanent trace statements should be like:
338#
339# trace ... if main::DEBUG && $to_trace;
340#
341# If there is just one or a few files that you're debugging, you can easily
342# cause most everything else to be skipped. Change the line
343#
344# my $debug_skip = 0;
345#
346# to 1, and every file whose object is in @input_file_objects and doesn't have
347# a, 'non_skip => 1,' in its constructor will be skipped.
348#
b4a0206c 349# To compare the output tables, it may be useful to specify the -annotate
c4019d52
KW
350# flag. This causes the tables to expand so there is one entry for each
351# non-algorithmically named code point giving, currently its name, and its
352# graphic representation if printable (and you have a font that knows about
353# it). This makes it easier to see what the particular code points are in
354# each output table. The tables are usable, but because they don't have
355# ranges (for the most part), a Perl using them will run slower. Non-named
356# code points are annotated with a description of their status, and contiguous
357# ones with the same description will be output as a range rather than
358# individually. Algorithmically named characters are also output as ranges,
359# except when there are just a few contiguous ones.
360#
99870f4d
KW
361# FUTURE ISSUES
362#
363# The program would break if Unicode were to change its names so that
364# interior white space, underscores, or dashes differences were significant
365# within property and property value names.
366#
367# It might be easier to use the xml versions of the UCD if this program ever
368# would need heavy revision, and the ability to handle old versions was not
369# required.
370#
371# There is the potential for name collisions, in that Perl has chosen names
372# that Unicode could decide it also likes. There have been such collisions in
373# the past, with mostly Perl deciding to adopt the Unicode definition of the
374# name. However in the 5.2 Unicode beta testing, there were a number of such
375# collisions, which were withdrawn before the final release, because of Perl's
376# and other's protests. These all involved new properties which began with
377# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
378# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
379# Unicode document, so they are unlikely to be used by Unicode for another
380# purpose. However, they might try something beginning with 'In', or use any
381# of the other Perl-defined properties. This program will warn you of name
382# collisions, and refuse to generate tables with them, but manual intervention
383# will be required in this event. One scheme that could be implemented, if
384# necessary, would be to have this program generate another file, or add a
385# field to mktables.lst that gives the date of first definition of a property.
386# Each new release of Unicode would use that file as a basis for the next
387# iteration. And the Perl synonym addition code could sort based on the age
388# of the property, so older properties get priority, and newer ones that clash
389# would be refused; hence existing code would not be impacted, and some other
390# synonym would have to be used for the new property. This is ugly, and
391# manual intervention would certainly be easier to do in the short run; lets
392# hope it never comes to this.
678f13d5 393#
99870f4d
KW
394# A NOTE ON UNIHAN
395#
396# This program can generate tables from the Unihan database. But it doesn't
397# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
398# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
399# database was split into 8 different files, all beginning with the letters
400# 'Unihan'. This program will read those file(s) if present, but it needs to
401# know which of the many properties in the file(s) should have tables created
402# for them. It will create tables for any properties listed in
403# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
404# @cjk_properties array and the @cjk_property_values array. Thus, if a
405# property you want is not in those files of the release you are building
406# against, you must add it to those two arrays. Starting in 4.0, the
407# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
408# is present in the directory, a table will be generated for that property.
409# In 5.2, several more properties were added. For your convenience, the two
5f7264c7 410# arrays are initialized with all the 6.0 listed properties that are also in
99870f4d
KW
411# earlier releases. But these are commented out. You can just uncomment the
412# ones you want, or use them as a template for adding entries for other
413# properties.
414#
415# You may need to adjust the entries to suit your purposes. setup_unihan(),
416# and filter_unihan_line() are the functions where this is done. This program
417# already does some adjusting to make the lines look more like the rest of the
418# Unicode DB; You can see what that is in filter_unihan_line()
419#
420# There is a bug in the 3.2 data file in which some values for the
421# kPrimaryNumeric property have commas and an unexpected comment. A filter
422# could be added for these; or for a particular installation, the Unihan.txt
423# file could be edited to fix them.
99870f4d 424#
678f13d5
KW
425# HOW TO ADD A FILE TO BE PROCESSED
426#
427# A new file from Unicode needs to have an object constructed for it in
428# @input_file_objects, probably at the end or at the end of the extracted
429# ones. The program should warn you if its name will clash with others on
430# restrictive file systems, like DOS. If so, figure out a better name, and
431# add lines to the README.perl file giving that. If the file is a character
432# property, it should be in the format that Unicode has by default
433# standardized for such files for the more recently introduced ones.
434# If so, the Input_file constructor for @input_file_objects can just be the
435# file name and release it first appeared in. If not, then it should be
436# possible to construct an each_line_handler() to massage the line into the
437# standardized form.
438#
439# For non-character properties, more code will be needed. You can look at
440# the existing entries for clues.
441#
442# UNICODE VERSIONS NOTES
443#
444# The Unicode UCD has had a number of errors in it over the versions. And
445# these remain, by policy, in the standard for that version. Therefore it is
446# risky to correct them, because code may be expecting the error. So this
447# program doesn't generally make changes, unless the error breaks the Perl
448# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
449# for U+1105, which causes real problems for the algorithms for Jamo
450# calculations, so it is changed here.
451#
452# But it isn't so clear cut as to what to do about concepts that are
453# introduced in a later release; should they extend back to earlier releases
454# where the concept just didn't exist? It was easier to do this than to not,
455# so that's what was done. For example, the default value for code points not
456# in the files for various properties was probably undefined until changed by
457# some version. No_Block for blocks is such an example. This program will
458# assign No_Block even in Unicode versions that didn't have it. This has the
459# benefit that code being written doesn't have to special case earlier
460# versions; and the detriment that it doesn't match the Standard precisely for
461# the affected versions.
462#
463# Here are some observations about some of the issues in early versions:
464#
6426c51b 465# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
678f13d5
KW
466# the reason is that the CJK block starting at 4E00 was removed from PropList,
467# and was not put back in until 3.1.0
468#
469# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
470# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
471# reason is that 3.2 introduced U+205F=medium math space, which was not
472# classed as white space, but Perl figured out that it should have been. 4.0
473# reclassified it correctly.
474#
475# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
476# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
477# was left with no code points, as all the ones that mapped to 202 stayed
478# mapped to 202. Thus if your program used the numeric name for the class,
479# it would not have been affected, but if it used the mnemonic, it would have
480# been.
481#
482# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
483# points which eventually came to have this script property value, instead
484# mapped to "Unknown". But in the next release all these code points were
485# moved to \p{sc=common} instead.
99870f4d
KW
486#
487# The default for missing code points for BidiClass is complicated. Starting
488# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
489# tries to do the best it can for earlier releases. It is done in
490# process_PropertyAliases()
491#
492##############################################################################
493
494my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
495 # and errors
496my $MAX_LINE_WIDTH = 78;
497
498# Debugging aid to skip most files so as to not be distracted by them when
499# concentrating on the ones being debugged. Add
500# non_skip => 1,
501# to the constructor for those files you want processed when you set this.
502# Files with a first version number of 0 are special: they are always
c12f2655
KW
503# processed regardless of the state of this flag. Generally, Jamo.txt and
504# UnicodeData.txt must not be skipped if you want this program to not die
505# before normal completion.
99870f4d
KW
506my $debug_skip = 0;
507
508# Set to 1 to enable tracing.
509our $to_trace = 0;
510
511{ # Closure for trace: debugging aid
512 my $print_caller = 1; # ? Include calling subroutine name
513 my $main_with_colon = 'main::';
514 my $main_colon_length = length($main_with_colon);
515
516 sub trace {
517 return unless $to_trace; # Do nothing if global flag not set
518
519 my @input = @_;
520
521 local $DB::trace = 0;
522 $DB::trace = 0; # Quiet 'used only once' message
523
524 my $line_number;
525
526 # Loop looking up the stack to get the first non-trace caller
527 my $caller_line;
528 my $caller_name;
529 my $i = 0;
530 do {
531 $line_number = $caller_line;
532 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
533 $caller = $main_with_colon unless defined $caller;
534
535 $caller_name = $caller;
536
537 # get rid of pkg
538 $caller_name =~ s/.*:://;
539 if (substr($caller_name, 0, $main_colon_length)
540 eq $main_with_colon)
541 {
542 $caller_name = substr($caller_name, $main_colon_length);
543 }
544
545 } until ($caller_name ne 'trace');
546
547 # If the stack was empty, we were called from the top level
548 $caller_name = 'main' if ($caller_name eq ""
549 || $caller_name eq 'trace');
550
551 my $output = "";
552 foreach my $string (@input) {
553 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
554 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
555 $output .= simple_dumper($string);
556 }
557 else {
558 $string = "$string" if ref $string;
559 $string = $UNDEF unless defined $string;
560 chomp $string;
561 $string = '""' if $string eq "";
562 $output .= " " if $output ne ""
563 && $string ne ""
564 && substr($output, -1, 1) ne " "
565 && substr($string, 0, 1) ne " ";
566 $output .= $string;
567 }
568 }
569
99f78760
KW
570 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
571 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
572 print STDERR $output, "\n";
573 return;
574 }
575}
576
577# This is for a rarely used development feature that allows you to compare two
578# versions of the Unicode standard without having to deal with changes caused
c12f2655
KW
579# by the code points introduced in the later version. Change the 0 to a
580# string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
581# code points introduced in that release and earlier will be used; later ones
582# are thrown away. You use the version number of the earliest one you want to
583# compare; then run this program on directory structures containing each
584# release, and compare the outputs. These outputs will therefore include only
585# the code points common to both releases, and you can see the changes caused
586# just by the underlying release semantic changes. For versions earlier than
587# 3.2, you must copy a version of DAge.txt into the directory.
588my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
99870f4d
KW
589my $compare_versions = DEBUG
590 && $string_compare_versions
591 && pack "C*", split /\./, $string_compare_versions;
592
593sub uniques {
594 # Returns non-duplicated input values. From "Perl Best Practices:
595 # Encapsulated Cleverness". p. 455 in first edition.
596
597 my %seen;
0e407844
NC
598 # Arguably this breaks encapsulation, if the goal is to permit multiple
599 # distinct objects to stringify to the same value, and be interchangeable.
600 # However, for this program, no two objects stringify identically, and all
601 # lists passed to this function are either objects or strings. So this
602 # doesn't affect correctness, but it does give a couple of percent speedup.
603 no overloading;
99870f4d
KW
604 return grep { ! $seen{$_}++ } @_;
605}
606
607$0 = File::Spec->canonpath($0);
608
609my $make_test_script = 0; # ? Should we output a test script
610my $write_unchanged_files = 0; # ? Should we update the output files even if
611 # we don't think they have changed
612my $use_directory = ""; # ? Should we chdir somewhere.
613my $pod_directory; # input directory to store the pod file.
614my $pod_file = 'perluniprops';
615my $t_path; # Path to the .t test file
616my $file_list = 'mktables.lst'; # File to store input and output file names.
617 # This is used to speed up the build, by not
618 # executing the main body of the program if
619 # nothing on the list has changed since the
620 # previous build
621my $make_list = 1; # ? Should we write $file_list. Set to always
622 # make a list so that when the pumpking is
623 # preparing a release, s/he won't have to do
624 # special things
625my $glob_list = 0; # ? Should we try to include unknown .txt files
626 # in the input.
bd9ebcfd
KW
627my $output_range_counts = $debugging_build; # ? Should we include the number
628 # of code points in ranges in
629 # the output
558712cf 630my $annotate = 0; # ? Should character names be in the output
9ef2b94f 631
99870f4d
KW
632# Verbosity levels; 0 is quiet
633my $NORMAL_VERBOSITY = 1;
634my $PROGRESS = 2;
635my $VERBOSE = 3;
636
637my $verbosity = $NORMAL_VERBOSITY;
638
639# Process arguments
640while (@ARGV) {
cf25bb62
JH
641 my $arg = shift @ARGV;
642 if ($arg eq '-v') {
99870f4d
KW
643 $verbosity = $VERBOSE;
644 }
645 elsif ($arg eq '-p') {
646 $verbosity = $PROGRESS;
647 $| = 1; # Flush buffers as we go.
648 }
649 elsif ($arg eq '-q') {
650 $verbosity = 0;
651 }
652 elsif ($arg eq '-w') {
653 $write_unchanged_files = 1; # update the files even if havent changed
654 }
655 elsif ($arg eq '-check') {
6ae7e459
YO
656 my $this = shift @ARGV;
657 my $ok = shift @ARGV;
658 if ($this ne $ok) {
659 print "Skipping as check params are not the same.\n";
660 exit(0);
661 }
00a8df5c 662 }
99870f4d
KW
663 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
664 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
665 }
3df51b85
KW
666 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
667 {
99870f4d 668 $make_test_script = 1;
99870f4d
KW
669 }
670 elsif ($arg eq '-makelist') {
671 $make_list = 1;
672 }
673 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
674 -d $use_directory or croak "Unknown directory '$use_directory'";
675 }
676 elsif ($arg eq '-L') {
677
678 # Existence not tested until have chdir'd
679 $file_list = shift;
680 }
681 elsif ($arg eq '-globlist') {
682 $glob_list = 1;
683 }
684 elsif ($arg eq '-c') {
685 $output_range_counts = ! $output_range_counts
686 }
b4a0206c 687 elsif ($arg eq '-annotate') {
558712cf 688 $annotate = 1;
bd9ebcfd
KW
689 $debugging_build = 1;
690 $output_range_counts = 1;
9ef2b94f 691 }
99870f4d
KW
692 else {
693 my $with_c = 'with';
694 $with_c .= 'out' if $output_range_counts; # Complements the state
695 croak <<END;
696usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
697 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
698 [-check A B ]
699 -c : Output comments $with_c number of code points in ranges
700 -q : Quiet Mode: Only output serious warnings.
701 -p : Set verbosity level to normal plus show progress.
702 -v : Set Verbosity level high: Show progress and non-serious
703 warnings
704 -w : Write files regardless
705 -C dir : Change to this directory before proceeding. All relative paths
706 except those specified by the -P and -T options will be done
707 with respect to this directory.
708 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 709 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
710 -L filelist : Use alternate 'filelist' instead of standard one
711 -globlist : Take as input all non-Test *.txt files in current and sub
712 directories
3df51b85
KW
713 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
714 overrides -T
99870f4d 715 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 716 -annotate : Output an annotation for each character in the table files;
c4019d52
KW
717 useful for debugging mktables, looking at diffs; but is slow,
718 memory intensive; resulting tables are usable but slow and
719 very large.
99870f4d
KW
720 -check A B : Executes $0 only if A and B are the same
721END
722 }
723}
724
725# Stores the most-recently changed file. If none have changed, can skip the
726# build
aeab6150 727my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
728
729# Change directories now, because need to read 'version' early.
730if ($use_directory) {
3df51b85 731 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
732 $pod_directory = File::Spec->rel2abs($pod_directory);
733 }
3df51b85 734 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 735 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 736 }
99870f4d 737 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 738 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 739 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 740 }
3df51b85 741 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 742 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 743 }
00a8df5c
YO
744}
745
99870f4d
KW
746# Get Unicode version into regular and v-string. This is done now because
747# various tables below get populated based on it. These tables are populated
748# here to be near the top of the file, and so easily seeable by those needing
749# to modify things.
750open my $VERSION, "<", "version"
751 or croak "$0: can't open required file 'version': $!\n";
752my $string_version = <$VERSION>;
753close $VERSION;
754chomp $string_version;
755my $v_version = pack "C*", split /\./, $string_version; # v string
756
757# The following are the complete names of properties with property values that
758# are known to not match any code points in some versions of Unicode, but that
759# may change in the future so they should be matchable, hence an empty file is
760# generated for them.
761my @tables_that_may_be_empty = (
762 'Joining_Type=Left_Joining',
763 );
764push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
765push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
766push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
767 if $v_version ge v4.1.0;
82aed44a
KW
768push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
769 if $v_version ge v6.0.0;
99870f4d
KW
770
771# The lists below are hashes, so the key is the item in the list, and the
772# value is the reason why it is in the list. This makes generation of
773# documentation easier.
774
775my %why_suppressed; # No file generated for these.
776
777# Files aren't generated for empty extraneous properties. This is arguable.
778# Extraneous properties generally come about because a property is no longer
779# used in a newer version of Unicode. If we generated a file without code
780# points, programs that used to work on that property will still execute
781# without errors. It just won't ever match (or will always match, with \P{}).
782# This means that the logic is now likely wrong. I (khw) think its better to
783# find this out by getting an error message. Just move them to the table
784# above to change this behavior
785my %why_suppress_if_empty_warn_if_not = (
786
787 # It is the only property that has ever officially been removed from the
788 # Standard. The database never contained any code points for it.
789 'Special_Case_Condition' => 'Obsolete',
790
791 # Apparently never official, but there were code points in some versions of
792 # old-style PropList.txt
793 'Non_Break' => 'Obsolete',
794);
795
796# These would normally go in the warn table just above, but they were changed
797# a long time before this program was written, so warnings about them are
798# moot.
799if ($v_version gt v3.2.0) {
800 push @tables_that_may_be_empty,
801 'Canonical_Combining_Class=Attached_Below_Left'
802}
803
5f7264c7 804# These are listed in the Property aliases file in 6.0, but Unihan is ignored
99870f4d
KW
805# unless explicitly added.
806if ($v_version ge v5.2.0) {
807 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 808 foreach my $table (qw (
99870f4d
KW
809 kAccountingNumeric
810 kOtherNumeric
811 kPrimaryNumeric
812 kCompatibilityVariant
813 kIICore
814 kIRG_GSource
815 kIRG_HSource
816 kIRG_JSource
817 kIRG_KPSource
818 kIRG_MSource
819 kIRG_KSource
820 kIRG_TSource
821 kIRG_USource
822 kIRG_VSource
823 kRSUnicode
ea25a9b2 824 ))
99870f4d
KW
825 {
826 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
827 }
ca12659b
NC
828}
829
272501f6
KW
830# Enum values for to_output_map() method in the Map_Table package.
831my $EXTERNAL_MAP = 1;
832my $INTERNAL_MAP = 2;
833
fcf1973c
KW
834# To override computed values for writing the map tables for these properties.
835# The default for enum map tables is to write them out, so that the Unicode
836# .txt files can be removed, but all the data to compute any property value
837# for any code point is available in a more compact form.
838my %global_to_output_map = (
839 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
c12f2655
KW
840 # get stuck supporting it if things change. Since it is a STRING
841 # property, it normally would be listed in the pod, but INTERNAL_MAP
842 # suppresses that.
fcf1973c
KW
843 Unicode_1_Name => $INTERNAL_MAP,
844
845 Present_In => 0, # Suppress, as easily computed from Age
fcf1973c
KW
846 Block => 0, # Suppress, as Blocks.txt is retained.
847);
848
99870f4d
KW
849# Properties that this program ignores.
850my @unimplemented_properties = (
851'Unicode_Radical_Stroke' # Remove if changing to handle this one.
852);
d73e5302 853
99870f4d
KW
854# There are several types of obsolete properties defined by Unicode. These
855# must be hand-edited for every new Unicode release.
856my %why_deprecated; # Generates a deprecated warning message if used.
857my %why_stabilized; # Documentation only
858my %why_obsolete; # Documentation only
859
860{ # Closure
861 my $simple = 'Perl uses the more complete version of this property';
862 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
863
864 my $other_properties = 'other properties';
865 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 866 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
867
868 %why_deprecated = (
5f7264c7 869 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
870 'Jamo_Short_Name' => $contributory,
871 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
872 'Other_Alphabetic' => $contributory,
873 'Other_Default_Ignorable_Code_Point' => $contributory,
874 'Other_Grapheme_Extend' => $contributory,
875 'Other_ID_Continue' => $contributory,
876 'Other_ID_Start' => $contributory,
877 'Other_Lowercase' => $contributory,
878 'Other_Math' => $contributory,
879 'Other_Uppercase' => $contributory,
880 );
881
882 %why_suppressed = (
5f7264c7 883 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
884 # contains the same information, but without the algorithmically
885 # determinable Hangul syllables'. This file is not published, so it's
886 # existence is not noted in the comment.
887 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
888
889 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
99870f4d
KW
890
891 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
892 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
893 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
894 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
895
896 'Name' => "Accessible via 'use charnames;'",
897 'Name_Alias' => "Accessible via 'use charnames;'",
898
5f7264c7 899 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
900 Expands_On_NFC => $why_no_expand,
901 Expands_On_NFD => $why_no_expand,
902 Expands_On_NFKC => $why_no_expand,
903 Expands_On_NFKD => $why_no_expand,
904 );
905
906 # The following are suppressed because they were made contributory or
907 # deprecated by Unicode before Perl ever thought about supporting them.
908 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
909 $why_suppressed{$property} = $why_deprecated{$property};
910 }
cf25bb62 911
99870f4d
KW
912 # Customize the message for all the 'Other_' properties
913 foreach my $property (keys %why_deprecated) {
914 next if (my $main_property = $property) !~ s/^Other_//;
915 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
916 }
917}
918
919if ($v_version ge 4.0.0) {
920 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
921 if ($v_version ge 6.0.0) {
922 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
923 }
99870f4d 924}
5f7264c7 925if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 926 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7
KW
927 if ($v_version ge 6.0.0) {
928 $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed';
929 }
99870f4d
KW
930}
931
932# Probably obsolete forever
933if ($v_version ge v4.1.0) {
82aed44a
KW
934 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
935}
936if ($v_version ge v6.0.0) {
2b352efd
KW
937 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana (or both)"';
938 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
99870f4d
KW
939}
940
941# This program can create files for enumerated-like properties, such as
942# 'Numeric_Type'. This file would be the same format as for a string
943# property, with a mapping from code point to its value, so you could look up,
944# for example, the script a code point is in. But no one so far wants this
945# mapping, or they have found another way to get it since this is a new
946# feature. So no file is generated except if it is in this list.
947my @output_mapped_properties = split "\n", <<END;
948END
949
c12f2655
KW
950# If you are using the Unihan database in a Unicode version before 5.2, you
951# need to add the properties that you want to extract from it to this table.
952# For your convenience, the properties in the 6.0 PropertyAliases.txt file are
953# listed, commented out
99870f4d
KW
954my @cjk_properties = split "\n", <<'END';
955#cjkAccountingNumeric; kAccountingNumeric
956#cjkOtherNumeric; kOtherNumeric
957#cjkPrimaryNumeric; kPrimaryNumeric
958#cjkCompatibilityVariant; kCompatibilityVariant
959#cjkIICore ; kIICore
960#cjkIRG_GSource; kIRG_GSource
961#cjkIRG_HSource; kIRG_HSource
962#cjkIRG_JSource; kIRG_JSource
963#cjkIRG_KPSource; kIRG_KPSource
964#cjkIRG_KSource; kIRG_KSource
965#cjkIRG_TSource; kIRG_TSource
966#cjkIRG_USource; kIRG_USource
967#cjkIRG_VSource; kIRG_VSource
968#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
969END
970
971# Similarly for the property values. For your convenience, the lines in the
5f7264c7 972# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
c12f2655 973# '#' marks (for Unicode versions before 5.2)
99870f4d
KW
974my @cjk_property_values = split "\n", <<'END';
975## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
976## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
977## @missing: 0000..10FFFF; cjkIICore; <none>
978## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
979## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
980## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
981## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
982## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
983## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
984## @missing: 0000..10FFFF; cjkIRG_USource; <none>
985## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
986## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
987## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
988## @missing: 0000..10FFFF; cjkRSUnicode; <none>
989END
990
991# The input files don't list every code point. Those not listed are to be
992# defaulted to some value. Below are hard-coded what those values are for
993# non-binary properties as of 5.1. Starting in 5.0, there are
994# machine-parsable comment lines in the files the give the defaults; so this
995# list shouldn't have to be extended. The claim is that all missing entries
996# for binary properties will default to 'N'. Unicode tried to change that in
997# 5.2, but the beta period produced enough protest that they backed off.
998#
999# The defaults for the fields that appear in UnicodeData.txt in this hash must
1000# be in the form that it expects. The others may be synonyms.
1001my $CODE_POINT = '<code point>';
1002my %default_mapping = (
1003 Age => "Unassigned",
1004 # Bidi_Class => Complicated; set in code
1005 Bidi_Mirroring_Glyph => "",
1006 Block => 'No_Block',
1007 Canonical_Combining_Class => 0,
1008 Case_Folding => $CODE_POINT,
1009 Decomposition_Mapping => $CODE_POINT,
1010 Decomposition_Type => 'None',
1011 East_Asian_Width => "Neutral",
1012 FC_NFKC_Closure => $CODE_POINT,
1013 General_Category => 'Cn',
1014 Grapheme_Cluster_Break => 'Other',
1015 Hangul_Syllable_Type => 'NA',
1016 ISO_Comment => "",
1017 Jamo_Short_Name => "",
1018 Joining_Group => "No_Joining_Group",
1019 # Joining_Type => Complicated; set in code
1020 kIICore => 'N', # Is converted to binary
1021 #Line_Break => Complicated; set in code
1022 Lowercase_Mapping => $CODE_POINT,
1023 Name => "",
1024 Name_Alias => "",
1025 NFC_QC => 'Yes',
1026 NFD_QC => 'Yes',
1027 NFKC_QC => 'Yes',
1028 NFKD_QC => 'Yes',
1029 Numeric_Type => 'None',
1030 Numeric_Value => 'NaN',
1031 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1032 Sentence_Break => 'Other',
1033 Simple_Case_Folding => $CODE_POINT,
1034 Simple_Lowercase_Mapping => $CODE_POINT,
1035 Simple_Titlecase_Mapping => $CODE_POINT,
1036 Simple_Uppercase_Mapping => $CODE_POINT,
1037 Titlecase_Mapping => $CODE_POINT,
1038 Unicode_1_Name => "",
1039 Unicode_Radical_Stroke => "",
1040 Uppercase_Mapping => $CODE_POINT,
1041 Word_Break => 'Other',
1042);
1043
1044# Below are files that Unicode furnishes, but this program ignores, and why
1045my %ignored_files = (
1046 'CJKRadicals.txt' => 'Unihan data',
1047 'Index.txt' => 'An index, not actual data',
1048 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1049 'NamesList.txt' => 'Just adds commentary',
1050 'NormalizationCorrections.txt' => 'Data is already in other files.',
1051 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1052 'ReadMe.txt' => 'Just comments',
1053 'README.TXT' => 'Just comments',
1054 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
5f7264c7
KW
1055 'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications',
1056 'IndicMatraCategory.txt' => 'Provisional',
1057 'IndicSyllabicCategory.txt' => 'Provisional',
99870f4d
KW
1058);
1059
678f13d5 1060### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1061
1062my $HEADER=<<"EOF";
1063# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1064# This file is machine-generated by $0 from the Unicode
1065# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1066EOF
1067
b6922eda 1068my $INTERNAL_ONLY=<<"EOF";
99870f4d
KW
1069
1070# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
fac53429
KW
1071# This file is for internal use by core Perl only. The format and even the
1072# name or existence of this file are subject to change without notice. Don't
1073# use it directly.
99870f4d
KW
1074EOF
1075
1076my $DEVELOPMENT_ONLY=<<"EOF";
1077# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1078# This file contains information artificially constrained to code points
1079# present in Unicode release $string_compare_versions.
1080# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1081# not be used for production.
b6922eda
KW
1082
1083EOF
1084
99870f4d
KW
1085my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1086my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1087my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1088
1089# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1090# two must be 10; if there are 5, the first must not be a 0. Written this way
8c32d378
KW
1091# to decrease backtracking. The first one allows the code point to be at the
1092# end of a word, but to work properly, the word shouldn't end with a valid hex
1093# character. The second one won't match a code point at the end of a word,
1094# and doesn't have the run-on issue
1095my $run_on_code_point_re =
1096 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1097my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1098
1099# This matches the beginning of the line in the Unicode db files that give the
1100# defaults for code points not listed (i.e., missing) in the file. The code
1101# depends on this ending with a semi-colon, so it can assume it is a valid
1102# field when the line is split() by semi-colons
1103my $missing_defaults_prefix =
1104 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1105
1106# Property types. Unicode has more types, but these are sufficient for our
1107# purposes.
1108my $UNKNOWN = -1; # initialized to illegal value
1109my $NON_STRING = 1; # Either binary or enum
1110my $BINARY = 2;
1111my $ENUM = 3; # Include catalog
1112my $STRING = 4; # Anything else: string or misc
1113
1114# Some input files have lines that give default values for code points not
1115# contained in the file. Sometimes these should be ignored.
1116my $NO_DEFAULTS = 0; # Must evaluate to false
1117my $NOT_IGNORED = 1;
1118my $IGNORED = 2;
1119
1120# Range types. Each range has a type. Most ranges are type 0, for normal,
1121# and will appear in the main body of the tables in the output files, but
1122# there are other types of ranges as well, listed below, that are specially
1123# handled. There are pseudo-types as well that will never be stored as a
1124# type, but will affect the calculation of the type.
1125
1126# 0 is for normal, non-specials
1127my $MULTI_CP = 1; # Sequence of more than code point
1128my $HANGUL_SYLLABLE = 2;
1129my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1130my $NULL = 4; # The map is to the null string; utf8.c can't
1131 # handle these, nor is there an accepted syntax
1132 # for them in \p{} constructs
f86864ac 1133my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1134 # otherwise be $MULTI_CP type are instead type 0
1135
1136# process_generic_property_file() can accept certain overrides in its input.
1137# Each of these must begin AND end with $CMD_DELIM.
1138my $CMD_DELIM = "\a";
1139my $REPLACE_CMD = 'replace'; # Override the Replace
1140my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1141
1142my $NO = 0;
1143my $YES = 1;
1144
1145# Values for the Replace argument to add_range.
1146# $NO # Don't replace; add only the code points not
1147 # already present.
1148my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1149 # the comments at the subroutine definition.
1150my $UNCONDITIONALLY = 2; # Replace without conditions.
1151my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1152 # already there
56343c78 1153my $CROAK = 5; # Die with an error if is already there
99870f4d
KW
1154
1155# Flags to give property statuses. The phrases are to remind maintainers that
1156# if the flag is changed, the indefinite article referring to it in the
1157# documentation may need to be as well.
1158my $NORMAL = "";
1159my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1160 # it is suppressed
c12f2655
KW
1161my $PLACEHOLDER = 'P'; # A property that is defined as a placeholder in a
1162 # Unicode version that doesn't have it, but we need it
1163 # to be defined, if empty, to have things work.
1164 # Implies no pod entry generated
99870f4d
KW
1165my $DEPRECATED = 'D';
1166my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1167my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1168my $DISCOURAGED = 'X';
1169my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1170my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1171my $STRICTER = 'T';
1172my $a_bold_stricter = "a 'B<$STRICTER>'";
1173my $A_bold_stricter = "A 'B<$STRICTER>'";
1174my $STABILIZED = 'S';
1175my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1176my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1177my $OBSOLETE = 'O';
1178my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1179my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1180
1181my %status_past_participles = (
1182 $DISCOURAGED => 'discouraged',
1183 $SUPPRESSED => 'should never be generated',
1184 $STABILIZED => 'stabilized',
1185 $OBSOLETE => 'obsolete',
37e2e78e 1186 $DEPRECATED => 'deprecated',
99870f4d
KW
1187);
1188
f5817e0a
KW
1189# The format of the values of the tables:
1190my $EMPTY_FORMAT = "";
99870f4d
KW
1191my $BINARY_FORMAT = 'b';
1192my $DECIMAL_FORMAT = 'd';
1193my $FLOAT_FORMAT = 'f';
1194my $INTEGER_FORMAT = 'i';
1195my $HEX_FORMAT = 'x';
1196my $RATIONAL_FORMAT = 'r';
1197my $STRING_FORMAT = 's';
a14f3cb1 1198my $DECOMP_STRING_FORMAT = 'c';
99870f4d
KW
1199
1200my %map_table_formats = (
1201 $BINARY_FORMAT => 'binary',
1202 $DECIMAL_FORMAT => 'single decimal digit',
1203 $FLOAT_FORMAT => 'floating point number',
1204 $INTEGER_FORMAT => 'integer',
add63c13 1205 $HEX_FORMAT => 'non-negative hex whole number; a code point',
99870f4d 1206 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1207 $STRING_FORMAT => 'string',
92f9d56c 1208 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
99870f4d
KW
1209);
1210
1211# Unicode didn't put such derived files in a separate directory at first.
1212my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1213my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1214my $AUXILIARY = 'auxiliary';
1215
1216# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1217my %loose_to_file_of; # loosely maps table names to their respective
1218 # files
1219my %stricter_to_file_of; # same; but for stricter mapping.
1220my %nv_floating_to_rational; # maps numeric values floating point numbers to
1221 # their rational equivalent
c12f2655
KW
1222my %loose_property_name_of; # Loosely maps (non_string) property names to
1223 # standard form
99870f4d 1224
d867ccfb
KW
1225# Most properties are immune to caseless matching, otherwise you would get
1226# nonsensical results, as properties are a function of a code point, not
1227# everything that is caselessly equivalent to that code point. For example,
1228# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1229# be true because 's' and 'S' are equivalent caselessly. However,
1230# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1231# extend that concept to those very few properties that are like this. Each
1232# such property will match the full range caselessly. They are hard-coded in
1233# the program; it's not worth trying to make it general as it's extremely
1234# unlikely that they will ever change.
1235my %caseless_equivalent_to;
1236
99870f4d
KW
1237# These constants names and values were taken from the Unicode standard,
1238# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1239# syllables. The '_string' versions are so generated tables can retain the
1240# hex format, which is the more familiar value
1241my $SBase_string = "0xAC00";
1242my $SBase = CORE::hex $SBase_string;
1243my $LBase_string = "0x1100";
1244my $LBase = CORE::hex $LBase_string;
1245my $VBase_string = "0x1161";
1246my $VBase = CORE::hex $VBase_string;
1247my $TBase_string = "0x11A7";
1248my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1249my $SCount = 11172;
1250my $LCount = 19;
1251my $VCount = 21;
1252my $TCount = 28;
1253my $NCount = $VCount * $TCount;
1254
1255# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1256# with the above published constants.
1257my %Jamo;
1258my %Jamo_L; # Leading consonants
1259my %Jamo_V; # Vowels
1260my %Jamo_T; # Trailing consonants
1261
37e2e78e 1262my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1263my @unhandled_properties; # Will contain a list of properties found in
1264 # the input that we didn't process.
f86864ac 1265my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1266 # listed in the pod
1267my @map_properties; # Properties that get map files written
1268my @named_sequences; # NamedSequences.txt contents.
1269my %potential_files; # Generated list of all .txt files in the directory
1270 # structure so we can warn if something is being
1271 # ignored.
1272my @files_actually_output; # List of files we generated.
1273my @more_Names; # Some code point names are compound; this is used
1274 # to store the extra components of them.
1275my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1276 # the minimum before we consider it equivalent to a
1277 # candidate rational
1278my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1279
1280# These store references to certain commonly used property objects
1281my $gc;
1282my $perl;
1283my $block;
3e20195b
KW
1284my $perl_charname;
1285my $print;
7fc6cb55 1286my $Any;
99870f4d
KW
1287
1288# Are there conflicting names because of beginning with 'In_', or 'Is_'
1289my $has_In_conflicts = 0;
1290my $has_Is_conflicts = 0;
1291
1292sub internal_file_to_platform ($) {
1293 # Convert our file paths which have '/' separators to those of the
1294 # platform.
1295
1296 my $file = shift;
1297 return undef unless defined $file;
1298
1299 return File::Spec->join(split '/', $file);
d07a55ed 1300}
5beb625e 1301
99870f4d
KW
1302sub file_exists ($) { # platform independent '-e'. This program internally
1303 # uses slash as a path separator.
1304 my $file = shift;
1305 return 0 if ! defined $file;
1306 return -e internal_file_to_platform($file);
1307}
5beb625e 1308
99870f4d 1309sub objaddr($) {
23e33b60
KW
1310 # Returns the address of the blessed input object.
1311 # It doesn't check for blessedness because that would do a string eval
1312 # every call, and the program is structured so that this is never called
1313 # for a non-blessed object.
99870f4d 1314
23e33b60 1315 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1316
1317 # Numifying a ref gives its address.
051df77b 1318 return pack 'J', $_[0];
99870f4d
KW
1319}
1320
558712cf 1321# These are used only if $annotate is true.
c4019d52
KW
1322# The entire range of Unicode characters is examined to populate these
1323# after all the input has been processed. But most can be skipped, as they
1324# have the same descriptive phrases, such as being unassigned
1325my @viacode; # Contains the 1 million character names
1326my @printable; # boolean: And are those characters printable?
1327my @annotate_char_type; # Contains a type of those characters, specifically
1328 # for the purposes of annotation.
1329my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1330 # name for the purposes of annotation. They map to the
c4019d52
KW
1331 # upper edge of the range, so that the end point can
1332 # be immediately found. This is used to skip ahead to
1333 # the end of a range, and avoid processing each
1334 # individual code point in it.
1335my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1336 # characters, but excluding those which are
1337 # also noncharacter code points
1338
1339# The annotation types are an extension of the regular range types, though
1340# some of the latter are folded into one. Make the new types negative to
1341# avoid conflicting with the regular types
1342my $SURROGATE_TYPE = -1;
1343my $UNASSIGNED_TYPE = -2;
1344my $PRIVATE_USE_TYPE = -3;
1345my $NONCHARACTER_TYPE = -4;
1346my $CONTROL_TYPE = -5;
1347my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1348
1349sub populate_char_info ($) {
558712cf 1350 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1351 # input code point's info that are needed for outputting more detailed
1352 # comments. If calling context wants a return, it is the end point of
1353 # any contiguous range of characters that share essentially the same info
1354
1355 my $i = shift;
1356 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1357
1358 $viacode[$i] = $perl_charname->value_of($i) || "";
1359
1360 # A character is generally printable if Unicode says it is,
1361 # but below we make sure that most Unicode general category 'C' types
1362 # aren't.
1363 $printable[$i] = $print->contains($i);
1364
1365 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1366
1367 # Only these two regular types are treated specially for annotations
1368 # purposes
1369 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1370 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1371
1372 # Give a generic name to all code points that don't have a real name.
1373 # We output ranges, if applicable, for these. Also calculate the end
1374 # point of the range.
1375 my $end;
1376 if (! $viacode[$i]) {
1377 if ($gc-> table('Surrogate')->contains($i)) {
1378 $viacode[$i] = 'Surrogate';
1379 $annotate_char_type[$i] = $SURROGATE_TYPE;
1380 $printable[$i] = 0;
1381 $end = $gc->table('Surrogate')->containing_range($i)->end;
1382 }
1383 elsif ($gc-> table('Private_use')->contains($i)) {
1384 $viacode[$i] = 'Private Use';
1385 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1386 $printable[$i] = 0;
1387 $end = $gc->table('Private_Use')->containing_range($i)->end;
1388 }
1389 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1390 contains($i))
1391 {
1392 $viacode[$i] = 'Noncharacter';
1393 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1394 $printable[$i] = 0;
1395 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1396 containing_range($i)->end;
1397 }
1398 elsif ($gc-> table('Control')->contains($i)) {
1399 $viacode[$i] = 'Control';
1400 $annotate_char_type[$i] = $CONTROL_TYPE;
1401 $printable[$i] = 0;
1402 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1403 }
1404 elsif ($gc-> table('Unassigned')->contains($i)) {
1405 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1406 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1407 $printable[$i] = 0;
1408
1409 # Because we name the unassigned by the blocks they are in, it
1410 # can't go past the end of that block, and it also can't go past
1411 # the unassigned range it is in. The special table makes sure
1412 # that the non-characters, which are unassigned, are separated
1413 # out.
1414 $end = min($block->containing_range($i)->end,
1415 $unassigned_sans_noncharacters-> containing_range($i)->
1416 end);
13ca76ff
KW
1417 }
1418 else {
1419 Carp::my_carp_bug("Can't figure out how to annotate "
1420 . sprintf("U+%04X", $i)
1421 . ". Proceeding anyway.");
c4019d52
KW
1422 $viacode[$i] = 'UNKNOWN';
1423 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1424 $printable[$i] = 0;
1425 }
1426 }
1427
1428 # Here, has a name, but if it's one in which the code point number is
1429 # appended to the name, do that.
1430 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1431 $viacode[$i] .= sprintf("-%04X", $i);
1432 $end = $perl_charname->containing_range($i)->end;
1433 }
1434
1435 # And here, has a name, but if it's a hangul syllable one, replace it with
1436 # the correct name from the Unicode algorithm
1437 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1438 use integer;
1439 my $SIndex = $i - $SBase;
1440 my $L = $LBase + $SIndex / $NCount;
1441 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1442 my $T = $TBase + $SIndex % $TCount;
1443 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1444 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1445 $end = $perl_charname->containing_range($i)->end;
1446 }
1447
1448 return if ! defined wantarray;
1449 return $i if ! defined $end; # If not a range, return the input
1450
1451 # Save this whole range so can find the end point quickly
1452 $annotate_ranges->add_map($i, $end, $end);
1453
1454 return $end;
1455}
1456
23e33b60
KW
1457# Commented code below should work on Perl 5.8.
1458## This 'require' doesn't necessarily work in miniperl, and even if it does,
1459## the native perl version of it (which is what would operate under miniperl)
1460## is extremely slow, as it does a string eval every call.
1461#my $has_fast_scalar_util = $\18 !~ /miniperl/
1462# && defined eval "require Scalar::Util";
1463#
1464#sub objaddr($) {
1465# # Returns the address of the blessed input object. Uses the XS version if
1466# # available. It doesn't check for blessedness because that would do a
1467# # string eval every call, and the program is structured so that this is
1468# # never called for a non-blessed object.
1469#
1470# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1471#
1472# # Check at least that is a ref.
1473# my $pkg = ref($_[0]) or return undef;
1474#
1475# # Change to a fake package to defeat any overloaded stringify
1476# bless $_[0], 'main::Fake';
1477#
1478# # Numifying a ref gives its address.
051df77b 1479# my $addr = pack 'J', $_[0];
23e33b60
KW
1480#
1481# # Return to original class
1482# bless $_[0], $pkg;
1483# return $addr;
1484#}
1485
99870f4d
KW
1486sub max ($$) {
1487 my $a = shift;
1488 my $b = shift;
1489 return $a if $a >= $b;
1490 return $b;
1491}
1492
1493sub min ($$) {
1494 my $a = shift;
1495 my $b = shift;
1496 return $a if $a <= $b;
1497 return $b;
1498}
1499
1500sub clarify_number ($) {
1501 # This returns the input number with underscores inserted every 3 digits
1502 # in large (5 digits or more) numbers. Input must be entirely digits, not
1503 # checked.
1504
1505 my $number = shift;
1506 my $pos = length($number) - 3;
1507 return $number if $pos <= 1;
1508 while ($pos > 0) {
1509 substr($number, $pos, 0) = '_';
1510 $pos -= 3;
5beb625e 1511 }
99870f4d 1512 return $number;
99598c8c
JH
1513}
1514
12ac2576 1515
99870f4d 1516package Carp;
7ebf06b3 1517
99870f4d
KW
1518# These routines give a uniform treatment of messages in this program. They
1519# are placed in the Carp package to cause the stack trace to not include them,
1520# although an alternative would be to use another package and set @CARP_NOT
1521# for it.
12ac2576 1522
99870f4d 1523our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1524
99f78760
KW
1525# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1526# and overload trying to load Scalar:Util under miniperl. See
1527# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1528undef $overload::VERSION;
1529
99870f4d
KW
1530sub my_carp {
1531 my $message = shift || "";
1532 my $nofold = shift || 0;
7ebf06b3 1533
99870f4d
KW
1534 if ($message) {
1535 $message = main::join_lines($message);
1536 $message =~ s/^$0: *//; # Remove initial program name
1537 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1538 $message = "\n$0: $message;";
12ac2576 1539
99870f4d
KW
1540 # Fold the message with program name, semi-colon end punctuation
1541 # (which looks good with the message that carp appends to it), and a
1542 # hanging indent for continuation lines.
1543 $message = main::simple_fold($message, "", 4) unless $nofold;
1544 $message =~ s/\n$//; # Remove the trailing nl so what carp
1545 # appends is to the same line
1546 }
12ac2576 1547
99870f4d 1548 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1549
99870f4d
KW
1550 carp $message;
1551 return;
1552}
7ebf06b3 1553
99870f4d
KW
1554sub my_carp_bug {
1555 # This is called when it is clear that the problem is caused by a bug in
1556 # this program.
7ebf06b3 1557
99870f4d
KW
1558 my $message = shift;
1559 $message =~ s/^$0: *//;
1560 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1561 carp $message;
1562 return;
1563}
7ebf06b3 1564
99870f4d
KW
1565sub carp_too_few_args {
1566 if (@_ != 2) {
1567 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1568 return;
12ac2576 1569 }
7ebf06b3 1570
99870f4d
KW
1571 my $args_ref = shift;
1572 my $count = shift;
7ebf06b3 1573
99870f4d
KW
1574 my_carp_bug("Need at least $count arguments to "
1575 . (caller 1)[3]
1576 . ". Instead got: '"
1577 . join ', ', @$args_ref
1578 . "'. No action taken.");
1579 return;
12ac2576
JP
1580}
1581
99870f4d
KW
1582sub carp_extra_args {
1583 my $args_ref = shift;
1584 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1585
99870f4d
KW
1586 unless (ref $args_ref) {
1587 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1588 return;
1589 }
1590 my ($package, $file, $line) = caller;
1591 my $subroutine = (caller 1)[3];
cf25bb62 1592
99870f4d
KW
1593 my $list;
1594 if (ref $args_ref eq 'HASH') {
1595 foreach my $key (keys %$args_ref) {
1596 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1597 }
99870f4d 1598 $list = join ', ', each %{$args_ref};
cf25bb62 1599 }
99870f4d
KW
1600 elsif (ref $args_ref eq 'ARRAY') {
1601 foreach my $arg (@$args_ref) {
1602 $arg = $UNDEF unless defined $arg;
1603 }
1604 $list = join ', ', @$args_ref;
1605 }
1606 else {
1607 my_carp_bug("Can't cope with ref "
1608 . ref($args_ref)
1609 . " . argument to 'carp_extra_args'. Not checking arguments.");
1610 return;
1611 }
1612
1613 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1614 return;
d73e5302
JH
1615}
1616
99870f4d
KW
1617package main;
1618
1619{ # Closure
1620
1621 # This program uses the inside-out method for objects, as recommended in
1622 # "Perl Best Practices". This closure aids in generating those. There
1623 # are two routines. setup_package() is called once per package to set
1624 # things up, and then set_access() is called for each hash representing a
1625 # field in the object. These routines arrange for the object to be
1626 # properly destroyed when no longer used, and for standard accessor
1627 # functions to be generated. If you need more complex accessors, just
1628 # write your own and leave those accesses out of the call to set_access().
1629 # More details below.
1630
1631 my %constructor_fields; # fields that are to be used in constructors; see
1632 # below
1633
1634 # The values of this hash will be the package names as keys to other
1635 # hashes containing the name of each field in the package as keys, and
1636 # references to their respective hashes as values.
1637 my %package_fields;
1638
1639 sub setup_package {
1640 # Sets up the package, creating standard DESTROY and dump methods
1641 # (unless already defined). The dump method is used in debugging by
1642 # simple_dumper().
1643 # The optional parameters are:
1644 # a) a reference to a hash, that gets populated by later
1645 # set_access() calls with one of the accesses being
1646 # 'constructor'. The caller can then refer to this, but it is
1647 # not otherwise used by these two routines.
1648 # b) a reference to a callback routine to call during destruction
1649 # of the object, before any fields are actually destroyed
1650
1651 my %args = @_;
1652 my $constructor_ref = delete $args{'Constructor_Fields'};
1653 my $destroy_callback = delete $args{'Destroy_Callback'};
1654 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1655
1656 my %fields;
1657 my $package = (caller)[0];
1658
1659 $package_fields{$package} = \%fields;
1660 $constructor_fields{$package} = $constructor_ref;
1661
1662 unless ($package->can('DESTROY')) {
1663 my $destroy_name = "${package}::DESTROY";
1664 no strict "refs";
1665
1666 # Use typeglob to give the anonymous subroutine the name we want
1667 *$destroy_name = sub {
1668 my $self = shift;
ffe43484 1669 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1670
1671 $self->$destroy_callback if $destroy_callback;
1672 foreach my $field (keys %{$package_fields{$package}}) {
1673 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1674 delete $package_fields{$package}{$field}{$addr};
1675 }
1676 return;
1677 }
1678 }
1679
1680 unless ($package->can('dump')) {
1681 my $dump_name = "${package}::dump";
1682 no strict "refs";
1683 *$dump_name = sub {
1684 my $self = shift;
1685 return dump_inside_out($self, $package_fields{$package}, @_);
1686 }
1687 }
1688 return;
1689 }
1690
1691 sub set_access {
1692 # Arrange for the input field to be garbage collected when no longer
1693 # needed. Also, creates standard accessor functions for the field
1694 # based on the optional parameters-- none if none of these parameters:
1695 # 'addable' creates an 'add_NAME()' accessor function.
1696 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1697 # function.
1698 # 'settable' creates a 'set_NAME()' accessor function.
1699 # 'constructor' doesn't create an accessor function, but adds the
1700 # field to the hash that was previously passed to
1701 # setup_package();
1702 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1703 # 'add' etc. all mean 'addable'.
1704 # The read accessor function will work on both array and scalar
1705 # values. If another accessor in the parameter list is 'a', the read
1706 # access assumes an array. You can also force it to be array access
1707 # by specifying 'readable_array' instead of 'readable'
1708 #
1709 # A sort-of 'protected' access can be set-up by preceding the addable,
1710 # readable or settable with some initial portion of 'protected_' (but,
1711 # the underscore is required), like 'p_a', 'pro_set', etc. The
1712 # "protection" is only by convention. All that happens is that the
1713 # accessor functions' names begin with an underscore. So instead of
1714 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1715 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1716 # end of each package, and then storing the __LINE__ ranges and
1717 # checking them on every accessor. But that is way overkill.)
1718
1719 # We create anonymous subroutines as the accessors and then use
1720 # typeglobs to assign them to the proper package and name
1721
1722 my $name = shift; # Name of the field
1723 my $field = shift; # Reference to the inside-out hash containing the
1724 # field
1725
1726 my $package = (caller)[0];
1727
1728 if (! exists $package_fields{$package}) {
1729 croak "$0: Must call 'setup_package' before 'set_access'";
1730 }
d73e5302 1731
99870f4d
KW
1732 # Stash the field so DESTROY can get it.
1733 $package_fields{$package}{$name} = $field;
cf25bb62 1734
99870f4d
KW
1735 # Remaining arguments are the accessors. For each...
1736 foreach my $access (@_) {
1737 my $access = lc $access;
cf25bb62 1738
99870f4d 1739 my $protected = "";
cf25bb62 1740
99870f4d
KW
1741 # Match the input as far as it goes.
1742 if ($access =~ /^(p[^_]*)_/) {
1743 $protected = $1;
1744 if (substr('protected_', 0, length $protected)
1745 eq $protected)
1746 {
1747
1748 # Add 1 for the underscore not included in $protected
1749 $access = substr($access, length($protected) + 1);
1750 $protected = '_';
1751 }
1752 else {
1753 $protected = "";
1754 }
1755 }
1756
1757 if (substr('addable', 0, length $access) eq $access) {
1758 my $subname = "${package}::${protected}add_$name";
1759 no strict "refs";
1760
1761 # add_ accessor. Don't add if already there, which we
1762 # determine using 'eq' for scalars and '==' otherwise.
1763 *$subname = sub {
1764 use strict "refs";
1765 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1766 my $self = shift;
1767 my $value = shift;
ffe43484 1768 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1769 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1770 if (ref $value) {
f998e60c 1771 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1772 }
1773 else {
f998e60c 1774 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1775 }
f998e60c 1776 push @{$field->{$addr}}, $value;
99870f4d
KW
1777 return;
1778 }
1779 }
1780 elsif (substr('constructor', 0, length $access) eq $access) {
1781 if ($protected) {
1782 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1783 }
1784 else {
1785 $constructor_fields{$package}{$name} = $field;
1786 }
1787 }
1788 elsif (substr('readable_array', 0, length $access) eq $access) {
1789
1790 # Here has read access. If one of the other parameters for
1791 # access is array, or this one specifies array (by being more
1792 # than just 'readable_'), then create a subroutine that
1793 # assumes the data is an array. Otherwise just a scalar
1794 my $subname = "${package}::${protected}$name";
1795 if (grep { /^a/i } @_
1796 or length($access) > length('readable_'))
1797 {
1798 no strict "refs";
1799 *$subname = sub {
1800 use strict "refs";
23e33b60 1801 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1802 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1803 if (ref $field->{$addr} ne 'ARRAY') {
1804 my $type = ref $field->{$addr};
1805 $type = 'scalar' unless $type;
1806 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1807 return;
1808 }
1809 return scalar @{$field->{$addr}} unless wantarray;
1810
1811 # Make a copy; had problems with caller modifying the
1812 # original otherwise
1813 my @return = @{$field->{$addr}};
1814 return @return;
1815 }
1816 }
1817 else {
1818
1819 # Here not an array value, a simpler function.
1820 no strict "refs";
1821 *$subname = sub {
1822 use strict "refs";
23e33b60 1823 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1824 no overloading;
051df77b 1825 return $field->{pack 'J', $_[0]};
99870f4d
KW
1826 }
1827 }
1828 }
1829 elsif (substr('settable', 0, length $access) eq $access) {
1830 my $subname = "${package}::${protected}set_$name";
1831 no strict "refs";
1832 *$subname = sub {
1833 use strict "refs";
23e33b60
KW
1834 if (main::DEBUG) {
1835 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1836 Carp::carp_extra_args(\@_) if @_ > 2;
1837 }
1838 # $self is $_[0]; $value is $_[1]
f998e60c 1839 no overloading;
051df77b 1840 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1841 return;
1842 }
1843 }
1844 else {
1845 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1846 }
cf25bb62 1847 }
99870f4d 1848 return;
cf25bb62 1849 }
99870f4d
KW
1850}
1851
1852package Input_file;
1853
1854# All input files use this object, which stores various attributes about them,
1855# and provides for convenient, uniform handling. The run method wraps the
1856# processing. It handles all the bookkeeping of opening, reading, and closing
1857# the file, returning only significant input lines.
1858#
1859# Each object gets a handler which processes the body of the file, and is
1860# called by run(). Most should use the generic, default handler, which has
1861# code scrubbed to handle things you might not expect. A handler should
1862# basically be a while(next_line()) {...} loop.
1863#
1864# You can also set up handlers to
1865# 1) call before the first line is read for pre processing
1866# 2) call to adjust each line of the input before the main handler gets them
1867# 3) call upon EOF before the main handler exits its loop
1868# 4) call at the end for post processing
1869#
1870# $_ is used to store the input line, and is to be filtered by the
1871# each_line_handler()s. So, if the format of the line is not in the desired
1872# format for the main handler, these are used to do that adjusting. They can
1873# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1874# so the $_ output of one is used as the input to the next. None of the other
1875# handlers are stackable, but could easily be changed to be so.
1876#
1877# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1878# which insert the parameters as lines to be processed before the next input
1879# file line is read. This allows the EOF handler to flush buffers, for
1880# example. The difference between the two routines is that the lines inserted
1881# by insert_lines() are subjected to the each_line_handler()s. (So if you
1882# called it from such a handler, you would get infinite recursion.) Lines
1883# inserted by insert_adjusted_lines() go directly to the main handler without
1884# any adjustments. If the post-processing handler calls any of these, there
1885# will be no effect. Some error checking for these conditions could be added,
1886# but it hasn't been done.
1887#
1888# carp_bad_line() should be called to warn of bad input lines, which clears $_
1889# to prevent further processing of the line. This routine will output the
1890# message as a warning once, and then keep a count of the lines that have the
1891# same message, and output that count at the end of the file's processing.
1892# This keeps the number of messages down to a manageable amount.
1893#
1894# get_missings() should be called to retrieve any @missing input lines.
1895# Messages will be raised if this isn't done if the options aren't to ignore
1896# missings.
1897
1898sub trace { return main::trace(@_); }
1899
99870f4d
KW
1900{ # Closure
1901 # Keep track of fields that are to be put into the constructor.
1902 my %constructor_fields;
1903
1904 main::setup_package(Constructor_Fields => \%constructor_fields);
1905
1906 my %file; # Input file name, required
1907 main::set_access('file', \%file, qw{ c r });
1908
1909 my %first_released; # Unicode version file was first released in, required
1910 main::set_access('first_released', \%first_released, qw{ c r });
1911
1912 my %handler; # Subroutine to process the input file, defaults to
1913 # 'process_generic_property_file'
1914 main::set_access('handler', \%handler, qw{ c });
1915
1916 my %property;
1917 # name of property this file is for. defaults to none, meaning not
1918 # applicable, or is otherwise determinable, for example, from each line.
1919 main::set_access('property', \%property, qw{ c });
1920
1921 my %optional;
1922 # If this is true, the file is optional. If not present, no warning is
1923 # output. If it is present, the string given by this parameter is
1924 # evaluated, and if false the file is not processed.
1925 main::set_access('optional', \%optional, 'c', 'r');
1926
1927 my %non_skip;
1928 # This is used for debugging, to skip processing of all but a few input
1929 # files. Add 'non_skip => 1' to the constructor for those files you want
1930 # processed when you set the $debug_skip global.
1931 main::set_access('non_skip', \%non_skip, 'c');
1932
37e2e78e
KW
1933 my %skip;
1934 # This is used to skip processing of this input file semi-permanently.
1935 # It is used for files that we aren't planning to process anytime soon,
1936 # but want to allow to be in the directory and not raise a message that we
1937 # are not handling. Mostly for test files. This is in contrast to the
1938 # non_skip element, which is supposed to be used very temporarily for
1939 # debugging. Sets 'optional' to 1
1940 main::set_access('skip', \%skip, 'c');
1941
99870f4d
KW
1942 my %each_line_handler;
1943 # list of subroutines to look at and filter each non-comment line in the
1944 # file. defaults to none. The subroutines are called in order, each is
1945 # to adjust $_ for the next one, and the final one adjusts it for
1946 # 'handler'
1947 main::set_access('each_line_handler', \%each_line_handler, 'c');
1948
1949 my %has_missings_defaults;
1950 # ? Are there lines in the file giving default values for code points
1951 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1952 # the norm, but IGNORED means it has such lines, but the handler doesn't
1953 # use them. Having these three states allows us to catch changes to the
1954 # UCD that this program should track
1955 main::set_access('has_missings_defaults',
1956 \%has_missings_defaults, qw{ c r });
1957
1958 my %pre_handler;
1959 # Subroutine to call before doing anything else in the file. If undef, no
1960 # such handler is called.
1961 main::set_access('pre_handler', \%pre_handler, qw{ c });
1962
1963 my %eof_handler;
1964 # Subroutine to call upon getting an EOF on the input file, but before
1965 # that is returned to the main handler. This is to allow buffers to be
1966 # flushed. The handler is expected to call insert_lines() or
1967 # insert_adjusted() with the buffered material
1968 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1969
1970 my %post_handler;
1971 # Subroutine to call after all the lines of the file are read in and
1972 # processed. If undef, no such handler is called.
1973 main::set_access('post_handler', \%post_handler, qw{ c });
1974
1975 my %progress_message;
1976 # Message to print to display progress in lieu of the standard one
1977 main::set_access('progress_message', \%progress_message, qw{ c });
1978
1979 my %handle;
1980 # cache open file handle, internal. Is undef if file hasn't been
1981 # processed at all, empty if has;
1982 main::set_access('handle', \%handle);
1983
1984 my %added_lines;
1985 # cache of lines added virtually to the file, internal
1986 main::set_access('added_lines', \%added_lines);
1987
1988 my %errors;
1989 # cache of errors found, internal
1990 main::set_access('errors', \%errors);
1991
1992 my %missings;
1993 # storage of '@missing' defaults lines
1994 main::set_access('missings', \%missings);
1995
1996 sub new {
1997 my $class = shift;
1998
1999 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2000 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2001
2002 # Set defaults
2003 $handler{$addr} = \&main::process_generic_property_file;
2004 $non_skip{$addr} = 0;
37e2e78e 2005 $skip{$addr} = 0;
99870f4d
KW
2006 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2007 $handle{$addr} = undef;
2008 $added_lines{$addr} = [ ];
2009 $each_line_handler{$addr} = [ ];
2010 $errors{$addr} = { };
2011 $missings{$addr} = [ ];
2012
2013 # Two positional parameters.
99f78760 2014 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2015 $file{$addr} = main::internal_file_to_platform(shift);
2016 $first_released{$addr} = shift;
2017
2018 # The rest of the arguments are key => value pairs
2019 # %constructor_fields has been set up earlier to list all possible
2020 # ones. Either set or push, depending on how the default has been set
2021 # up just above.
2022 my %args = @_;
2023 foreach my $key (keys %args) {
2024 my $argument = $args{$key};
2025
2026 # Note that the fields are the lower case of the constructor keys
2027 my $hash = $constructor_fields{lc $key};
2028 if (! defined $hash) {
2029 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2030 next;
2031 }
2032 if (ref $hash->{$addr} eq 'ARRAY') {
2033 if (ref $argument eq 'ARRAY') {
2034 foreach my $argument (@{$argument}) {
2035 next if ! defined $argument;
2036 push @{$hash->{$addr}}, $argument;
2037 }
2038 }
2039 else {
2040 push @{$hash->{$addr}}, $argument if defined $argument;
2041 }
2042 }
2043 else {
2044 $hash->{$addr} = $argument;
2045 }
2046 delete $args{$key};
2047 };
2048
2049 # If the file has a property for it, it means that the property is not
2050 # listed in the file's entries. So add a handler to the list of line
2051 # handlers to insert the property name into the lines, to provide a
2052 # uniform interface to the final processing subroutine.
2053 # the final code doesn't have to worry about that.
2054 if ($property{$addr}) {
2055 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2056 }
2057
2058 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2059 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 2060 }
99870f4d 2061
37e2e78e
KW
2062 $optional{$addr} = 1 if $skip{$addr};
2063
99870f4d 2064 return $self;
d73e5302
JH
2065 }
2066
cf25bb62 2067
99870f4d
KW
2068 use overload
2069 fallback => 0,
2070 qw("") => "_operator_stringify",
2071 "." => \&main::_operator_dot,
2072 ;
cf25bb62 2073
99870f4d
KW
2074 sub _operator_stringify {
2075 my $self = shift;
cf25bb62 2076
99870f4d 2077 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2078 }
d73e5302 2079
99870f4d
KW
2080 # flag to make sure extracted files are processed early
2081 my $seen_non_extracted_non_age = 0;
d73e5302 2082
99870f4d
KW
2083 sub run {
2084 # Process the input object $self. This opens and closes the file and
2085 # calls all the handlers for it. Currently, this can only be called
2086 # once per file, as it destroy's the EOF handler
d73e5302 2087
99870f4d
KW
2088 my $self = shift;
2089 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2090
ffe43484 2091 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2092
99870f4d 2093 my $file = $file{$addr};
d73e5302 2094
99870f4d
KW
2095 # Don't process if not expecting this file (because released later
2096 # than this Unicode version), and isn't there. This means if someone
2097 # copies it into an earlier version's directory, we will go ahead and
2098 # process it.
2099 return if $first_released{$addr} gt $v_version && ! -e $file;
2100
2101 # If in debugging mode and this file doesn't have the non-skip
2102 # flag set, and isn't one of the critical files, skip it.
2103 if ($debug_skip
2104 && $first_released{$addr} ne v0
2105 && ! $non_skip{$addr})
2106 {
2107 print "Skipping $file in debugging\n" if $verbosity;
2108 return;
2109 }
2110
2111 # File could be optional
37e2e78e 2112 if ($optional{$addr}) {
99870f4d
KW
2113 return unless -e $file;
2114 my $result = eval $optional{$addr};
2115 if (! defined $result) {
2116 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2117 return;
2118 }
2119 if (! $result) {
2120 if ($verbosity) {
2121 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2122 }
2123 return;
2124 }
2125 }
2126
2127 if (! defined $file || ! -e $file) {
2128
2129 # If the file doesn't exist, see if have internal data for it
2130 # (based on first_released being 0).
2131 if ($first_released{$addr} eq v0) {
2132 $handle{$addr} = 'pretend_is_open';
2133 }
2134 else {
2135 if (! $optional{$addr} # File could be optional
2136 && $v_version ge $first_released{$addr})
2137 {
2138 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2139 }
2140 return;
2141 }
2142 }
2143 else {
2144
37e2e78e
KW
2145 # Here, the file exists. Some platforms may change the case of
2146 # its name
99870f4d 2147 if ($seen_non_extracted_non_age) {
517956bf 2148 if ($file =~ /$EXTRACTED/i) {
99870f4d 2149 Carp::my_carp_bug(join_lines(<<END
99f78760 2150$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2151anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2152have subtle problems
2153END
2154 ));
2155 }
2156 }
2157 elsif ($EXTRACTED_DIR
2158 && $first_released{$addr} ne v0
517956bf
CB
2159 && $file !~ /$EXTRACTED/i
2160 && lc($file) ne 'dage.txt')
99870f4d
KW
2161 {
2162 # We don't set this (by the 'if' above) if we have no
2163 # extracted directory, so if running on an early version,
2164 # this test won't work. Not worth worrying about.
2165 $seen_non_extracted_non_age = 1;
2166 }
2167
2168 # And mark the file as having being processed, and warn if it
2169 # isn't a file we are expecting. As we process the files,
2170 # they are deleted from the hash, so any that remain at the
2171 # end of the program are files that we didn't process.
517956bf
CB
2172 my $fkey = File::Spec->rel2abs($file);
2173 my $expecting = delete $potential_files{$fkey};
2174 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
678f13d5
KW
2175 Carp::my_carp("Was not expecting '$file'.") if
2176 ! $expecting
99870f4d
KW
2177 && ! defined $handle{$addr};
2178
37e2e78e
KW
2179 # Having deleted from expected files, we can quit if not to do
2180 # anything. Don't print progress unless really want verbosity
2181 if ($skip{$addr}) {
2182 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2183 return;
2184 }
2185
99870f4d
KW
2186 # Open the file, converting the slashes used in this program
2187 # into the proper form for the OS
2188 my $file_handle;
2189 if (not open $file_handle, "<", $file) {
2190 Carp::my_carp("Can't open $file. Skipping: $!");
2191 return 0;
2192 }
2193 $handle{$addr} = $file_handle; # Cache the open file handle
2194 }
2195
2196 if ($verbosity >= $PROGRESS) {
2197 if ($progress_message{$addr}) {
2198 print "$progress_message{$addr}\n";
2199 }
2200 else {
2201 # If using a virtual file, say so.
2202 print "Processing ", (-e $file)
2203 ? $file
2204 : "substitute $file",
2205 "\n";
2206 }
2207 }
2208
2209
2210 # Call any special handler for before the file.
2211 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2212
2213 # Then the main handler
2214 &{$handler{$addr}}($self);
2215
2216 # Then any special post-file handler.
2217 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2218
2219 # If any errors have been accumulated, output the counts (as the first
2220 # error message in each class was output when it was encountered).
2221 if ($errors{$addr}) {
2222 my $total = 0;
2223 my $types = 0;
2224 foreach my $error (keys %{$errors{$addr}}) {
2225 $total += $errors{$addr}->{$error};
2226 delete $errors{$addr}->{$error};
2227 $types++;
2228 }
2229 if ($total > 1) {
2230 my $message
2231 = "A total of $total lines had errors in $file. ";
2232
2233 $message .= ($types == 1)
2234 ? '(Only the first one was displayed.)'
2235 : '(Only the first of each type was displayed.)';
2236 Carp::my_carp($message);
2237 }
2238 }
2239
2240 if (@{$missings{$addr}}) {
2241 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2242 }
2243
2244 # If a real file handle, close it.
2245 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2246 ref $handle{$addr};
2247 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2248 # the file, as opposed to undef
2249 return;
2250 }
2251
2252 sub next_line {
2253 # Sets $_ to be the next logical input line, if any. Returns non-zero
2254 # if such a line exists. 'logical' means that any lines that have
2255 # been added via insert_lines() will be returned in $_ before the file
2256 # is read again.
2257
2258 my $self = shift;
2259 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2260
ffe43484 2261 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2262
2263 # Here the file is open (or if the handle is not a ref, is an open
2264 # 'virtual' file). Get the next line; any inserted lines get priority
2265 # over the file itself.
2266 my $adjusted;
2267
2268 LINE:
2269 while (1) { # Loop until find non-comment, non-empty line
2270 #local $to_trace = 1 if main::DEBUG;
2271 my $inserted_ref = shift @{$added_lines{$addr}};
2272 if (defined $inserted_ref) {
2273 ($adjusted, $_) = @{$inserted_ref};
2274 trace $adjusted, $_ if main::DEBUG && $to_trace;
2275 return 1 if $adjusted;
2276 }
2277 else {
2278 last if ! ref $handle{$addr}; # Don't read unless is real file
2279 last if ! defined ($_ = readline $handle{$addr});
2280 }
2281 chomp;
2282 trace $_ if main::DEBUG && $to_trace;
2283
2284 # See if this line is the comment line that defines what property
2285 # value that code points that are not listed in the file should
2286 # have. The format or existence of these lines is not guaranteed
2287 # by Unicode since they are comments, but the documentation says
2288 # that this was added for machine-readability, so probably won't
2289 # change. This works starting in Unicode Version 5.0. They look
2290 # like:
2291 #
2292 # @missing: 0000..10FFFF; Not_Reordered
2293 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2294 # @missing: 0000..10FFFF; ; NaN
2295 #
2296 # Save the line for a later get_missings() call.
2297 if (/$missing_defaults_prefix/) {
2298 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2299 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2300 }
2301 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2302 my @defaults = split /\s* ; \s*/x, $_;
2303
2304 # The first field is the @missing, which ends in a
2305 # semi-colon, so can safely shift.
2306 shift @defaults;
2307
2308 # Some of these lines may have empty field placeholders
2309 # which get in the way. An example is:
2310 # @missing: 0000..10FFFF; ; NaN
2311 # Remove them. Process starting from the top so the
2312 # splice doesn't affect things still to be looked at.
2313 for (my $i = @defaults - 1; $i >= 0; $i--) {
2314 next if $defaults[$i] ne "";
2315 splice @defaults, $i, 1;
2316 }
2317
2318 # What's left should be just the property (maybe) and the
2319 # default. Having only one element means it doesn't have
2320 # the property.
2321 my $default;
2322 my $property;
2323 if (@defaults >= 1) {
2324 if (@defaults == 1) {
2325 $default = $defaults[0];
2326 }
2327 else {
2328 $property = $defaults[0];
2329 $default = $defaults[1];
2330 }
2331 }
2332
2333 if (@defaults < 1
2334 || @defaults > 2
2335 || ($default =~ /^</
2336 && $default !~ /^<code *point>$/i
2337 && $default !~ /^<none>$/i))
2338 {
2339 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2340 }
2341 else {
2342
2343 # If the property is missing from the line, it should
2344 # be the one for the whole file
2345 $property = $property{$addr} if ! defined $property;
2346
2347 # Change <none> to the null string, which is what it
2348 # really means. If the default is the code point
2349 # itself, set it to <code point>, which is what
2350 # Unicode uses (but sometimes they've forgotten the
2351 # space)
2352 if ($default =~ /^<none>$/i) {
2353 $default = "";
2354 }
2355 elsif ($default =~ /^<code *point>$/i) {
2356 $default = $CODE_POINT;
2357 }
2358
2359 # Store them as a sub-arrays with both components.
2360 push @{$missings{$addr}}, [ $default, $property ];
2361 }
2362 }
2363
2364 # There is nothing for the caller to process on this comment
2365 # line.
2366 next;
2367 }
2368
2369 # Remove comments and trailing space, and skip this line if the
2370 # result is empty
2371 s/#.*//;
2372 s/\s+$//;
2373 next if /^$/;
2374
2375 # Call any handlers for this line, and skip further processing of
2376 # the line if the handler sets the line to null.
2377 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2378 &{$sub_ref}($self);
2379 next LINE if /^$/;
2380 }
2381
2382 # Here the line is ok. return success.
2383 return 1;
2384 } # End of looping through lines.
2385
2386 # If there is an EOF handler, call it (only once) and if it generates
2387 # more lines to process go back in the loop to handle them.
2388 if ($eof_handler{$addr}) {
2389 &{$eof_handler{$addr}}($self);
2390 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2391 goto LINE if $added_lines{$addr};
2392 }
2393
2394 # Return failure -- no more lines.
2395 return 0;
2396
2397 }
2398
2399# Not currently used, not fully tested.
2400# sub peek {
2401# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2402# # record. Not callable from an each_line_handler(), nor does it call
2403# # an each_line_handler() on the line.
2404#
2405# my $self = shift;
ffe43484 2406# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2407#
2408# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2409# my ($adjusted, $line) = @{$inserted_ref};
2410# next if $adjusted;
2411#
2412# # Remove comments and trailing space, and return a non-empty
2413# # resulting line
2414# $line =~ s/#.*//;
2415# $line =~ s/\s+$//;
2416# return $line if $line ne "";
2417# }
2418#
2419# return if ! ref $handle{$addr}; # Don't read unless is real file
2420# while (1) { # Loop until find non-comment, non-empty line
2421# local $to_trace = 1 if main::DEBUG;
2422# trace $_ if main::DEBUG && $to_trace;
2423# return if ! defined (my $line = readline $handle{$addr});
2424# chomp $line;
2425# push @{$added_lines{$addr}}, [ 0, $line ];
2426#
2427# $line =~ s/#.*//;
2428# $line =~ s/\s+$//;
2429# return $line if $line ne "";
2430# }
2431#
2432# return;
2433# }
2434
2435
2436 sub insert_lines {
2437 # Lines can be inserted so that it looks like they were in the input
2438 # file at the place it was when this routine is called. See also
2439 # insert_adjusted_lines(). Lines inserted via this routine go through
2440 # any each_line_handler()
2441
2442 my $self = shift;
2443
2444 # Each inserted line is an array, with the first element being 0 to
2445 # indicate that this line hasn't been adjusted, and needs to be
2446 # processed.
f998e60c 2447 no overloading;
051df77b 2448 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2449 return;
2450 }
2451
2452 sub insert_adjusted_lines {
2453 # Lines can be inserted so that it looks like they were in the input
2454 # file at the place it was when this routine is called. See also
2455 # insert_lines(). Lines inserted via this routine are already fully
2456 # adjusted, ready to be processed; each_line_handler()s handlers will
2457 # not be called. This means this is not a completely general
2458 # facility, as only the last each_line_handler on the stack should
2459 # call this. It could be made more general, by passing to each of the
2460 # line_handlers their position on the stack, which they would pass on
2461 # to this routine, and that would replace the boolean first element in
2462 # the anonymous array pushed here, so that the next_line routine could
2463 # use that to call only those handlers whose index is after it on the
2464 # stack. But this is overkill for what is needed now.
2465
2466 my $self = shift;
2467 trace $_[0] if main::DEBUG && $to_trace;
2468
2469 # Each inserted line is an array, with the first element being 1 to
2470 # indicate that this line has been adjusted
f998e60c 2471 no overloading;
051df77b 2472 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2473 return;
2474 }
2475
2476 sub get_missings {
2477 # Returns the stored up @missings lines' values, and clears the list.
2478 # The values are in an array, consisting of the default in the first
2479 # element, and the property in the 2nd. However, since these lines
2480 # can be stacked up, the return is an array of all these arrays.
2481
2482 my $self = shift;
2483 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2484
ffe43484 2485 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2486
2487 # If not accepting a list return, just return the first one.
2488 return shift @{$missings{$addr}} unless wantarray;
2489
2490 my @return = @{$missings{$addr}};
2491 undef @{$missings{$addr}};
2492 return @return;
2493 }
2494
2495 sub _insert_property_into_line {
2496 # Add a property field to $_, if this file requires it.
2497
f998e60c 2498 my $self = shift;
ffe43484 2499 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2500 my $property = $property{$addr};
99870f4d
KW
2501 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2502
2503 $_ =~ s/(;|$)/; $property$1/;
2504 return;
2505 }
2506
2507 sub carp_bad_line {
2508 # Output consistent error messages, using either a generic one, or the
2509 # one given by the optional parameter. To avoid gazillions of the
2510 # same message in case the syntax of a file is way off, this routine
2511 # only outputs the first instance of each message, incrementing a
2512 # count so the totals can be output at the end of the file.
2513
2514 my $self = shift;
2515 my $message = shift;
2516 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2517
ffe43484 2518 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2519
2520 $message = 'Unexpected line' unless $message;
2521
2522 # No trailing punctuation so as to fit with our addenda.
2523 $message =~ s/[.:;,]$//;
2524
2525 # If haven't seen this exact message before, output it now. Otherwise
2526 # increment the count of how many times it has occurred
2527 unless ($errors{$addr}->{$message}) {
2528 Carp::my_carp("$message in '$_' in "
f998e60c 2529 . $file{$addr}
99870f4d
KW
2530 . " at line $.. Skipping this line;");
2531 $errors{$addr}->{$message} = 1;
2532 }
2533 else {
2534 $errors{$addr}->{$message}++;
2535 }
2536
2537 # Clear the line to prevent any further (meaningful) processing of it.
2538 $_ = "";
2539
2540 return;
2541 }
2542} # End closure
2543
2544package Multi_Default;
2545
2546# Certain properties in early versions of Unicode had more than one possible
2547# default for code points missing from the files. In these cases, one
2548# default applies to everything left over after all the others are applied,
2549# and for each of the others, there is a description of which class of code
2550# points applies to it. This object helps implement this by storing the
2551# defaults, and for all but that final default, an eval string that generates
2552# the class that it applies to.
2553
2554
2555{ # Closure
2556
2557 main::setup_package();
2558
2559 my %class_defaults;
2560 # The defaults structure for the classes
2561 main::set_access('class_defaults', \%class_defaults);
2562
2563 my %other_default;
2564 # The default that applies to everything left over.
2565 main::set_access('other_default', \%other_default, 'r');
2566
2567
2568 sub new {
2569 # The constructor is called with default => eval pairs, terminated by
2570 # the left-over default. e.g.
2571 # Multi_Default->new(
2572 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2573 # - 0x200D',
2574 # 'R' => 'some other expression that evaluates to code points',
2575 # .
2576 # .
2577 # .
2578 # 'U'));
2579
2580 my $class = shift;
2581
2582 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2583 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2584
2585 while (@_ > 1) {
2586 my $default = shift;
2587 my $eval = shift;
2588 $class_defaults{$addr}->{$default} = $eval;
2589 }
2590
2591 $other_default{$addr} = shift;
2592
2593 return $self;
2594 }
2595
2596 sub get_next_defaults {
2597 # Iterates and returns the next class of defaults.
2598 my $self = shift;
2599 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2600
ffe43484 2601 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2602
2603 return each %{$class_defaults{$addr}};
2604 }
2605}
2606
2607package Alias;
2608
2609# An alias is one of the names that a table goes by. This class defines them
2610# including some attributes. Everything is currently setup in the
2611# constructor.
2612
2613
2614{ # Closure
2615
2616 main::setup_package();
2617
2618 my %name;
2619 main::set_access('name', \%name, 'r');
2620
2621 my %loose_match;
c12f2655 2622 # Should this name match loosely or not.
99870f4d
KW
2623 main::set_access('loose_match', \%loose_match, 'r');
2624
2625 my %make_pod_entry;
2626 # Some aliases should not get their own entries because they are covered
2627 # by a wild-card, and some we want to discourage use of. Binary
2628 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2629
2630 my %status;
2631 # Aliases have a status, like deprecated, or even suppressed (which means
2632 # they don't appear in documentation). Enum
2633 main::set_access('status', \%status, 'r');
2634
2635 my %externally_ok;
2636 # Similarly, some aliases should not be considered as usable ones for
2637 # external use, such as file names, or we don't want documentation to
2638 # recommend them. Boolean
2639 main::set_access('externally_ok', \%externally_ok, 'r');
2640
2641 sub new {
2642 my $class = shift;
2643
2644 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2645 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2646
2647 $name{$addr} = shift;
2648 $loose_match{$addr} = shift;
2649 $make_pod_entry{$addr} = shift;
2650 $externally_ok{$addr} = shift;
2651 $status{$addr} = shift;
2652
2653 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2654
2655 # Null names are never ok externally
2656 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2657
2658 return $self;
2659 }
2660}
2661
2662package Range;
2663
2664# A range is the basic unit for storing code points, and is described in the
2665# comments at the beginning of the program. Each range has a starting code
2666# point; an ending code point (not less than the starting one); a value
2667# that applies to every code point in between the two end-points, inclusive;
2668# and an enum type that applies to the value. The type is for the user's
2669# convenience, and has no meaning here, except that a non-zero type is
2670# considered to not obey the normal Unicode rules for having standard forms.
2671#
2672# The same structure is used for both map and match tables, even though in the
2673# latter, the value (and hence type) is irrelevant and could be used as a
2674# comment. In map tables, the value is what all the code points in the range
2675# map to. Type 0 values have the standardized version of the value stored as
2676# well, so as to not have to recalculate it a lot.
2677
2678sub trace { return main::trace(@_); }
2679
2680{ # Closure
2681
2682 main::setup_package();
2683
2684 my %start;
2685 main::set_access('start', \%start, 'r', 's');
2686
2687 my %end;
2688 main::set_access('end', \%end, 'r', 's');
2689
2690 my %value;
2691 main::set_access('value', \%value, 'r');
2692
2693 my %type;
2694 main::set_access('type', \%type, 'r');
2695
2696 my %standard_form;
2697 # The value in internal standard form. Defined only if the type is 0.
2698 main::set_access('standard_form', \%standard_form);
2699
2700 # Note that if these fields change, the dump() method should as well
2701
2702 sub new {
2703 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2704 my $class = shift;
2705
2706 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2707 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2708
2709 $start{$addr} = shift;
2710 $end{$addr} = shift;
2711
2712 my %args = @_;
2713
2714 my $value = delete $args{'Value'}; # Can be 0
2715 $value = "" unless defined $value;
2716 $value{$addr} = $value;
2717
2718 $type{$addr} = delete $args{'Type'} || 0;
2719
2720 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2721
2722 if (! $type{$addr}) {
2723 $standard_form{$addr} = main::standardize($value);
2724 }
2725
2726 return $self;
2727 }
2728
2729 use overload
2730 fallback => 0,
2731 qw("") => "_operator_stringify",
2732 "." => \&main::_operator_dot,
2733 ;
2734
2735 sub _operator_stringify {
2736 my $self = shift;
ffe43484 2737 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2738
2739 # Output it like '0041..0065 (value)'
2740 my $return = sprintf("%04X", $start{$addr})
2741 . '..'
2742 . sprintf("%04X", $end{$addr});
2743 my $value = $value{$addr};
2744 my $type = $type{$addr};
2745 $return .= ' (';
2746 $return .= "$value";
2747 $return .= ", Type=$type" if $type != 0;
2748 $return .= ')';
2749
2750 return $return;
2751 }
2752
2753 sub standard_form {
2754 # The standard form is the value itself if the standard form is
2755 # undefined (that is if the value is special)
2756
2757 my $self = shift;
2758 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2759
ffe43484 2760 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2761
2762 return $standard_form{$addr} if defined $standard_form{$addr};
2763 return $value{$addr};
2764 }
2765
2766 sub dump {
2767 # Human, not machine readable. For machine readable, comment out this
2768 # entire routine and let the standard one take effect.
2769 my $self = shift;
2770 my $indent = shift;
2771 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2772
ffe43484 2773 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2774
2775 my $return = $indent
2776 . sprintf("%04X", $start{$addr})
2777 . '..'
2778 . sprintf("%04X", $end{$addr})
2779 . " '$value{$addr}';";
2780 if (! defined $standard_form{$addr}) {
2781 $return .= "(type=$type{$addr})";
2782 }
2783 elsif ($standard_form{$addr} ne $value{$addr}) {
2784 $return .= "(standard '$standard_form{$addr}')";
2785 }
2786 return $return;
2787 }
2788} # End closure
2789
2790package _Range_List_Base;
2791
2792# Base class for range lists. A range list is simply an ordered list of
2793# ranges, so that the ranges with the lowest starting numbers are first in it.
2794#
2795# When a new range is added that is adjacent to an existing range that has the
2796# same value and type, it merges with it to form a larger range.
2797#
2798# Ranges generally do not overlap, except that there can be multiple entries
2799# of single code point ranges. This is because of NameAliases.txt.
2800#
2801# In this program, there is a standard value such that if two different
2802# values, have the same standard value, they are considered equivalent. This
2803# value was chosen so that it gives correct results on Unicode data
2804
2805# There are a number of methods to manipulate range lists, and some operators
2806# are overloaded to handle them.
2807
99870f4d
KW
2808sub trace { return main::trace(@_); }
2809
2810{ # Closure
2811
2812 our $addr;
2813
2814 main::setup_package();
2815
2816 my %ranges;
2817 # The list of ranges
2818 main::set_access('ranges', \%ranges, 'readable_array');
2819
2820 my %max;
2821 # The highest code point in the list. This was originally a method, but
2822 # actual measurements said it was used a lot.
2823 main::set_access('max', \%max, 'r');
2824
2825 my %each_range_iterator;
2826 # Iterator position for each_range()
2827 main::set_access('each_range_iterator', \%each_range_iterator);
2828
2829 my %owner_name_of;
2830 # Name of parent this is attached to, if any. Solely for better error
2831 # messages.
2832 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2833
2834 my %_search_ranges_cache;
2835 # A cache of the previous result from _search_ranges(), for better
2836 # performance
2837 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2838
2839 sub new {
2840 my $class = shift;
2841 my %args = @_;
2842
2843 # Optional initialization data for the range list.
2844 my $initialize = delete $args{'Initialize'};
2845
2846 my $self;
2847
2848 # Use _union() to initialize. _union() returns an object of this
2849 # class, which means that it will call this constructor recursively.
2850 # But it won't have this $initialize parameter so that it won't
2851 # infinitely loop on this.
2852 return _union($class, $initialize, %args) if defined $initialize;
2853
2854 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2855 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2856
2857 # Optional parent object, only for debug info.
2858 $owner_name_of{$addr} = delete $args{'Owner'};
2859 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2860
2861 # Stringify, in case it is an object.
2862 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2863
2864 # This is used only for error messages, and so a colon is added
2865 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2866
2867 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2868
2869 # Max is initialized to a negative value that isn't adjacent to 0,
2870 # for simpler tests
2871 $max{$addr} = -2;
2872
2873 $_search_ranges_cache{$addr} = 0;
2874 $ranges{$addr} = [];
2875
2876 return $self;
2877 }
2878
2879 use overload
2880 fallback => 0,
2881 qw("") => "_operator_stringify",
2882 "." => \&main::_operator_dot,
2883 ;
2884
2885 sub _operator_stringify {
2886 my $self = shift;
ffe43484 2887 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2888
2889 return "Range_List attached to '$owner_name_of{$addr}'"
2890 if $owner_name_of{$addr};
2891 return "anonymous Range_List " . \$self;
2892 }
2893
2894 sub _union {
2895 # Returns the union of the input code points. It can be called as
2896 # either a constructor or a method. If called as a method, the result
2897 # will be a new() instance of the calling object, containing the union
2898 # of that object with the other parameter's code points; if called as
2899 # a constructor, the first parameter gives the class the new object
2900 # should be, and the second parameter gives the code points to go into
2901 # it.
2902 # In either case, there are two parameters looked at by this routine;
2903 # any additional parameters are passed to the new() constructor.
2904 #
2905 # The code points can come in the form of some object that contains
2906 # ranges, and has a conventionally named method to access them; or
2907 # they can be an array of individual code points (as integers); or
2908 # just a single code point.
2909 #
2910 # If they are ranges, this routine doesn't make any effort to preserve
2911 # the range values of one input over the other. Therefore this base
2912 # class should not allow _union to be called from other than
2913 # initialization code, so as to prevent two tables from being added
2914 # together where the range values matter. The general form of this
2915 # routine therefore belongs in a derived class, but it was moved here
2916 # to avoid duplication of code. The failure to overload this in this
2917 # class keeps it safe.
2918 #
2919
2920 my $self;
2921 my @args; # Arguments to pass to the constructor
2922
2923 my $class = shift;
2924
2925 # If a method call, will start the union with the object itself, and
2926 # the class of the new object will be the same as self.
2927 if (ref $class) {
2928 $self = $class;
2929 $class = ref $self;
2930 push @args, $self;
2931 }
2932
2933 # Add the other required parameter.
2934 push @args, shift;
2935 # Rest of parameters are passed on to the constructor
2936
2937 # Accumulate all records from both lists.
2938 my @records;
2939 for my $arg (@args) {
2940 #local $to_trace = 0 if main::DEBUG;
2941 trace "argument = $arg" if main::DEBUG && $to_trace;
2942 if (! defined $arg) {
2943 my $message = "";
2944 if (defined $self) {
f998e60c 2945 no overloading;
051df77b 2946 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2947 }
2948 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2949 return;
2950 }
2951 $arg = [ $arg ] if ! ref $arg;
2952 my $type = ref $arg;
2953 if ($type eq 'ARRAY') {
2954 foreach my $element (@$arg) {
2955 push @records, Range->new($element, $element);
2956 }
2957 }
2958 elsif ($arg->isa('Range')) {
2959 push @records, $arg;
2960 }
2961 elsif ($arg->can('ranges')) {
2962 push @records, $arg->ranges;
2963 }
2964 else {
2965 my $message = "";
2966 if (defined $self) {
f998e60c 2967 no overloading;
051df77b 2968 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2969 }
2970 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2971 return;
2972 }
2973 }
2974
2975 # Sort with the range containing the lowest ordinal first, but if
2976 # two ranges start at the same code point, sort with the bigger range
2977 # of the two first, because it takes fewer cycles.
2978 @records = sort { ($a->start <=> $b->start)
2979 or
2980 # if b is shorter than a, b->end will be
2981 # less than a->end, and we want to select
2982 # a, so want to return -1
2983 ($b->end <=> $a->end)
2984 } @records;
2985
2986 my $new = $class->new(@_);
2987
2988 # Fold in records so long as they add new information.
2989 for my $set (@records) {
2990 my $start = $set->start;
2991 my $end = $set->end;
2992 my $value = $set->value;
2993 if ($start > $new->max) {
2994 $new->_add_delete('+', $start, $end, $value);
2995 }
2996 elsif ($end > $new->max) {
2997 $new->_add_delete('+', $new->max +1, $end, $value);
2998 }
2999 }
3000
3001 return $new;
3002 }
3003
3004 sub range_count { # Return the number of ranges in the range list
3005 my $self = shift;
3006 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3007
f998e60c 3008 no overloading;
051df77b 3009 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
3010 }
3011
3012 sub min {
3013 # Returns the minimum code point currently in the range list, or if
3014 # the range list is empty, 2 beyond the max possible. This is a
3015 # method because used so rarely, that not worth saving between calls,
3016 # and having to worry about changing it as ranges are added and
3017 # deleted.
3018
3019 my $self = shift;
3020 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3021
ffe43484 3022 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3023
3024 # If the range list is empty, return a large value that isn't adjacent
3025 # to any that could be in the range list, for simpler tests
3026 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3027 return $ranges{$addr}->[0]->start;
3028 }
3029
3030 sub contains {
3031 # Boolean: Is argument in the range list? If so returns $i such that:
3032 # range[$i]->end < $codepoint <= range[$i+1]->end
3033 # which is one beyond what you want; this is so that the 0th range
3034 # doesn't return false
3035 my $self = shift;
3036 my $codepoint = shift;
3037 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3038
99870f4d
KW
3039 my $i = $self->_search_ranges($codepoint);
3040 return 0 unless defined $i;
3041
3042 # The search returns $i, such that
3043 # range[$i-1]->end < $codepoint <= range[$i]->end
3044 # So is in the table if and only iff it is at least the start position
3045 # of range $i.
f998e60c 3046 no overloading;
051df77b 3047 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
3048 return $i + 1;
3049 }
3050
2f7a8815
KW
3051 sub containing_range {
3052 # Returns the range object that contains the code point, undef if none
3053
3054 my $self = shift;
3055 my $codepoint = shift;
3056 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3057
3058 my $i = $self->contains($codepoint);
3059 return unless $i;
3060
3061 # contains() returns 1 beyond where we should look
3062 no overloading;
3063 return $ranges{pack 'J', $self}->[$i-1];
3064 }
3065
99870f4d
KW
3066 sub value_of {
3067 # Returns the value associated with the code point, undef if none
3068
3069 my $self = shift;
3070 my $codepoint = shift;
3071 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3072
d69c231b
KW
3073 my $range = $self->containing_range($codepoint);
3074 return unless defined $range;
99870f4d 3075
d69c231b 3076 return $range->value;
99870f4d
KW
3077 }
3078
0a9dbafc
KW
3079 sub type_of {
3080 # Returns the type of the range containing the code point, undef if
3081 # the code point is not in the table
3082
3083 my $self = shift;
3084 my $codepoint = shift;
3085 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3086
3087 my $range = $self->containing_range($codepoint);
3088 return unless defined $range;
3089
3090 return $range->type;
3091 }
3092
99870f4d
KW
3093 sub _search_ranges {
3094 # Find the range in the list which contains a code point, or where it
3095 # should go if were to add it. That is, it returns $i, such that:
3096 # range[$i-1]->end < $codepoint <= range[$i]->end
3097 # Returns undef if no such $i is possible (e.g. at end of table), or
3098 # if there is an error.
3099
3100 my $self = shift;
3101 my $code_point = shift;
3102 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3103
ffe43484 3104 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3105
3106 return if $code_point > $max{$addr};
3107 my $r = $ranges{$addr}; # The current list of ranges
3108 my $range_list_size = scalar @$r;
3109 my $i;
3110
3111 use integer; # want integer division
3112
3113 # Use the cached result as the starting guess for this one, because,
3114 # an experiment on 5.1 showed that 90% of the time the cache was the
3115 # same as the result on the next call (and 7% it was one less).
3116 $i = $_search_ranges_cache{$addr};
3117 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3118 # from an intervening deletion
3119 #local $to_trace = 1 if main::DEBUG;
3120 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3121 return $i if $code_point <= $r->[$i]->end
3122 && ($i == 0 || $r->[$i-1]->end < $code_point);
3123
3124 # Here the cache doesn't yield the correct $i. Try adding 1.
3125 if ($i < $range_list_size - 1
3126 && $r->[$i]->end < $code_point &&
3127 $code_point <= $r->[$i+1]->end)
3128 {
3129 $i++;
3130 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3131 $_search_ranges_cache{$addr} = $i;
3132 return $i;
3133 }
3134
3135 # Here, adding 1 also didn't work. We do a binary search to
3136 # find the correct position, starting with current $i
3137 my $lower = 0;
3138 my $upper = $range_list_size - 1;
3139 while (1) {
3140 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3141
3142 if ($code_point <= $r->[$i]->end) {
3143
3144 # Here we have met the upper constraint. We can quit if we
3145 # also meet the lower one.
3146 last if $i == 0 || $r->[$i-1]->end < $code_point;
3147
3148 $upper = $i; # Still too high.
3149
3150 }
3151 else {
3152
3153 # Here, $r[$i]->end < $code_point, so look higher up.
3154 $lower = $i;
3155 }
3156
3157 # Split search domain in half to try again.
3158 my $temp = ($upper + $lower) / 2;
3159
3160 # No point in continuing unless $i changes for next time
3161 # in the loop.
3162 if ($temp == $i) {
3163
3164 # We can't reach the highest element because of the averaging.
3165 # So if one below the upper edge, force it there and try one
3166 # more time.
3167 if ($i == $range_list_size - 2) {
3168
3169 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3170 $i = $range_list_size - 1;
3171
3172 # Change $lower as well so if fails next time through,
3173 # taking the average will yield the same $i, and we will
3174 # quit with the error message just below.
3175 $lower = $i;
3176 next;
3177 }
3178 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3179 return;
3180 }
3181 $i = $temp;
3182 } # End of while loop
3183
3184 if (main::DEBUG && $to_trace) {
3185 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3186 trace "i= [ $i ]", $r->[$i];
3187 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3188 }
3189
3190 # Here we have found the offset. Cache it as a starting point for the
3191 # next call.
3192 $_search_ranges_cache{$addr} = $i;
3193 return $i;
3194 }
3195
3196 sub _add_delete {
3197 # Add, replace or delete ranges to or from a list. The $type
3198 # parameter gives which:
3199 # '+' => insert or replace a range, returning a list of any changed
3200 # ranges.
3201 # '-' => delete a range, returning a list of any deleted ranges.
3202 #
3203 # The next three parameters give respectively the start, end, and
3204 # value associated with the range. 'value' should be null unless the
3205 # operation is '+';
3206 #
3207 # The range list is kept sorted so that the range with the lowest
3208 # starting position is first in the list, and generally, adjacent
c1739a4a 3209 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3210 # exceptions below).
3211 #
c1739a4a 3212 # There are more parameters; all are key => value pairs:
99870f4d
KW
3213 # Type gives the type of the value. It is only valid for '+'.
3214 # All ranges have types; if this parameter is omitted, 0 is
3215 # assumed. Ranges with type 0 are assumed to obey the
3216 # Unicode rules for casing, etc; ranges with other types are
3217 # not. Otherwise, the type is arbitrary, for the caller's
3218 # convenience, and looked at only by this routine to keep
3219 # adjacent ranges of different types from being merged into
3220 # a single larger range, and when Replace =>
3221 # $IF_NOT_EQUIVALENT is specified (see just below).
3222 # Replace determines what to do if the range list already contains
3223 # ranges which coincide with all or portions of the input
3224 # range. It is only valid for '+':
3225 # => $NO means that the new value is not to replace
3226 # any existing ones, but any empty gaps of the
3227 # range list coinciding with the input range
3228 # will be filled in with the new value.
3229 # => $UNCONDITIONALLY means to replace the existing values with
3230 # this one unconditionally. However, if the
3231 # new and old values are identical, the
3232 # replacement is skipped to save cycles
3233 # => $IF_NOT_EQUIVALENT means to replace the existing values
3234 # with this one if they are not equivalent.
3235 # Ranges are equivalent if their types are the
c1739a4a 3236 # same, and they are the same string; or if
99870f4d
KW
3237 # both are type 0 ranges, if their Unicode
3238 # standard forms are identical. In this last
3239 # case, the routine chooses the more "modern"
3240 # one to use. This is because some of the
3241 # older files are formatted with values that
3242 # are, for example, ALL CAPs, whereas the
3243 # derived files have a more modern style,
3244 # which looks better. By looking for this
3245 # style when the pre-existing and replacement
3246 # standard forms are the same, we can move to
3247 # the modern style
3248 # => $MULTIPLE means that if this range duplicates an
3249 # existing one, but has a different value,
3250 # don't replace the existing one, but insert
3251 # this, one so that the same range can occur
53d84487
KW
3252 # multiple times. They are stored LIFO, so
3253 # that the final one inserted is the first one
3254 # returned in an ordered search of the table.
99870f4d
KW
3255 # => anything else is the same as => $IF_NOT_EQUIVALENT
3256 #
c1739a4a
KW
3257 # "same value" means identical for non-type-0 ranges, and it means
3258 # having the same standard forms for type-0 ranges.
99870f4d
KW
3259
3260 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3261
3262 my $self = shift;
3263 my $operation = shift; # '+' for add/replace; '-' for delete;
3264 my $start = shift;
3265 my $end = shift;
3266 my $value = shift;
3267
3268 my %args = @_;
3269
3270 $value = "" if not defined $value; # warning: $value can be "0"
3271
3272 my $replace = delete $args{'Replace'};
3273 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3274
3275 my $type = delete $args{'Type'};
3276 $type = 0 unless defined $type;
3277
3278 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3279
ffe43484 3280 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3281
3282 if ($operation ne '+' && $operation ne '-') {
3283 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3284 return;
3285 }
3286 unless (defined $start && defined $end) {
3287 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3288 return;
3289 }
3290 unless ($end >= $start) {
3291 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3292 return;
3293 }
3294 #local $to_trace = 1 if main::DEBUG;
3295
3296 if ($operation eq '-') {
3297 if ($replace != $IF_NOT_EQUIVALENT) {
3298 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3299 $replace = $IF_NOT_EQUIVALENT;
3300 }
3301 if ($type) {
3302 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3303 $type = 0;
3304 }
3305 if ($value ne "") {
3306 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3307 $value = "";
3308 }
3309 }
3310
3311 my $r = $ranges{$addr}; # The current list of ranges
3312 my $range_list_size = scalar @$r; # And its size
3313 my $max = $max{$addr}; # The current high code point in
3314 # the list of ranges
3315
3316 # Do a special case requiring fewer machine cycles when the new range
3317 # starts after the current highest point. The Unicode input data is
3318 # structured so this is common.
3319 if ($start > $max) {
3320
3321 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3322 return if $operation eq '-'; # Deleting a non-existing range is a
3323 # no-op
3324
3325 # If the new range doesn't logically extend the current final one
3326 # in the range list, create a new range at the end of the range
3327 # list. (max cleverly is initialized to a negative number not
3328 # adjacent to 0 if the range list is empty, so even adding a range
3329 # to an empty range list starting at 0 will have this 'if'
3330 # succeed.)
3331 if ($start > $max + 1 # non-adjacent means can't extend.
3332 || @{$r}[-1]->value ne $value # values differ, can't extend.
3333 || @{$r}[-1]->type != $type # types differ, can't extend.
3334 ) {
3335 push @$r, Range->new($start, $end,
3336 Value => $value,
3337 Type => $type);
3338 }
3339 else {
3340
3341 # Here, the new range starts just after the current highest in
3342 # the range list, and they have the same type and value.
3343 # Extend the current range to incorporate the new one.
3344 @{$r}[-1]->set_end($end);
3345 }
3346
3347 # This becomes the new maximum.
3348 $max{$addr} = $end;
3349
3350 return;
3351 }
3352 #local $to_trace = 0 if main::DEBUG;
3353
3354 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3355
3356 # Here, the input range isn't after the whole rest of the range list.
3357 # Most likely 'splice' will be needed. The rest of the routine finds
3358 # the needed splice parameters, and if necessary, does the splice.
3359 # First, find the offset parameter needed by the splice function for
3360 # the input range. Note that the input range may span multiple
3361 # existing ones, but we'll worry about that later. For now, just find
3362 # the beginning. If the input range is to be inserted starting in a
3363 # position not currently in the range list, it must (obviously) come
3364 # just after the range below it, and just before the range above it.
3365 # Slightly less obviously, it will occupy the position currently
3366 # occupied by the range that is to come after it. More formally, we
3367 # are looking for the position, $i, in the array of ranges, such that:
3368 #
3369 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3370 #
3371 # (The ordered relationships within existing ranges are also shown in
3372 # the equation above). However, if the start of the input range is
3373 # within an existing range, the splice offset should point to that
3374 # existing range's position in the list; that is $i satisfies a
3375 # somewhat different equation, namely:
3376 #
3377 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3378 #
3379 # More briefly, $start can come before or after r[$i]->start, and at
3380 # this point, we don't know which it will be. However, these
3381 # two equations share these constraints:
3382 #
3383 # r[$i-1]->end < $start <= r[$i]->end
3384 #
3385 # And that is good enough to find $i.
3386
3387 my $i = $self->_search_ranges($start);
3388 if (! defined $i) {
3389 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3390 return;
3391 }
3392
3393 # The search function returns $i such that:
3394 #
3395 # r[$i-1]->end < $start <= r[$i]->end
3396 #
3397 # That means that $i points to the first range in the range list
3398 # that could possibly be affected by this operation. We still don't
3399 # know if the start of the input range is within r[$i], or if it
3400 # points to empty space between r[$i-1] and r[$i].
3401 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3402
3403 # Special case the insertion of data that is not to replace any
3404 # existing data.
3405 if ($replace == $NO) { # If $NO, has to be operation '+'
3406 #local $to_trace = 1 if main::DEBUG;
3407 trace "Doesn't replace" if main::DEBUG && $to_trace;
3408
3409 # Here, the new range is to take effect only on those code points
3410 # that aren't already in an existing range. This can be done by
3411 # looking through the existing range list and finding the gaps in
3412 # the ranges that this new range affects, and then calling this
3413 # function recursively on each of those gaps, leaving untouched
3414 # anything already in the list. Gather up a list of the changed
3415 # gaps first so that changes to the internal state as new ranges
3416 # are added won't be a problem.
3417 my @gap_list;
3418
3419 # First, if the starting point of the input range is outside an
3420 # existing one, there is a gap from there to the beginning of the
3421 # existing range -- add a span to fill the part that this new
3422 # range occupies
3423 if ($start < $r->[$i]->start) {
3424 push @gap_list, Range->new($start,
3425 main::min($end,
3426 $r->[$i]->start - 1),
3427 Type => $type);
3428 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3429 }
3430
3431 # Then look through the range list for other gaps until we reach
3432 # the highest range affected by the input one.
3433 my $j;
3434 for ($j = $i+1; $j < $range_list_size; $j++) {
3435 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3436 last if $end < $r->[$j]->start;
3437
3438 # If there is a gap between when this range starts and the
3439 # previous one ends, add a span to fill it. Note that just
3440 # because there are two ranges doesn't mean there is a
3441 # non-zero gap between them. It could be that they have
3442 # different values or types
3443 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3444 push @gap_list,
3445 Range->new($r->[$j-1]->end + 1,
3446 $r->[$j]->start - 1,
3447 Type => $type);
3448 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3449 }
3450 }
3451
3452 # Here, we have either found an existing range in the range list,
3453 # beyond the area affected by the input one, or we fell off the
3454 # end of the loop because the input range affects the whole rest
3455 # of the range list. In either case, $j is 1 higher than the
3456 # highest affected range. If $j == $i, it means that there are no
3457 # affected ranges, that the entire insertion is in the gap between
3458 # r[$i-1], and r[$i], which we already have taken care of before
3459 # the loop.
3460 # On the other hand, if there are affected ranges, it might be
3461 # that there is a gap that needs filling after the final such
3462 # range to the end of the input range
3463 if ($r->[$j-1]->end < $end) {
3464 push @gap_list, Range->new(main::max($start,
3465 $r->[$j-1]->end + 1),
3466 $end,
3467 Type => $type);
3468 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3469 }
3470
3471 # Call recursively to fill in all the gaps.
3472 foreach my $gap (@gap_list) {
3473 $self->_add_delete($operation,
3474 $gap->start,
3475 $gap->end,
3476 $value,
3477 Type => $type);
3478 }
3479
3480 return;
3481 }
3482
53d84487
KW
3483 # Here, we have taken care of the case where $replace is $NO.
3484 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3485 # If inserting a multiple record, this is where it goes, before the
3486 # first (if any) existing one. This implies an insertion, and no
3487 # change to any existing ranges. Note that $i can be -1 if this new
3488 # range doesn't actually duplicate any existing, and comes at the
3489 # beginning of the list.
3490 if ($replace == $MULTIPLE) {
3491
3492 if ($start != $end) {
3493 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3494 return;
3495 }
3496
3497 # Don't add an exact duplicate, as it isn't really a multiple
3498 if ($end >= $r->[$i]->start) {
1f6798c4
KW
3499 my $existing_value = $r->[$i]->value;
3500 my $existing_type = $r->[$i]->type;
3501 return if $value eq $existing_value && $type eq $existing_type;
3502
3503 # If the multiple value is part of an existing range, we want
3504 # to split up that range, so that only the single code point
3505 # is affected. To do this, we first call ourselves
3506 # recursively to delete that code point from the table, having
3507 # preserved its current data above. Then we call ourselves
3508 # recursively again to add the new multiple, which we know by
3509 # the test just above is different than the current code
3510 # point's value, so it will become a range containing a single
3511 # code point: just itself. Finally, we add back in the
3512 # pre-existing code point, which will again be a single code
3513 # point range. Because 'i' likely will have changed as a
3514 # result of these operations, we can't just continue on, but
3515 # do this operation recursively as well.
53d84487 3516 if ($r->[$i]->start != $r->[$i]->end) {
1f6798c4
KW
3517 $self->_add_delete('-', $start, $end, "");
3518 $self->_add_delete('+', $start, $end, $value, Type => $type);
3519 return $self->_add_delete('+', $start, $end, $existing_value, Type => $existing_type, Replace => $MULTIPLE);
53d84487 3520 }
53d84487
KW
3521 }
3522
3523 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3524 my @return = splice @$r,
3525 $i,
3526 0,
3527 Range->new($start,
3528 $end,
3529 Value => $value,
3530 Type => $type);
3531 if (main::DEBUG && $to_trace) {
3532 trace "After splice:";
3533 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3534 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3535 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3536 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3537 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3538 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3539 }
3540 return @return;
3541 }
3542
3543 # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
3544 # delete, insert, and replace either unconditionally or if not
3545 # equivalent. $i still points to the first potential affected range.
3546 # Now find the highest range affected, which will determine the length
3547 # parameter to splice. (The input range can span multiple existing
3548 # ones.) If this isn't a deletion, while we are looking through the
3549 # range list, see also if this is a replacement rather than a clean
3550 # insertion; that is if it will change the values of at least one
3551 # existing range. Start off assuming it is an insert, until find it
3552 # isn't.
3553 my $clean_insert = $operation eq '+';
99870f4d
KW
3554 my $j; # This will point to the highest affected range
3555
3556 # For non-zero types, the standard form is the value itself;
3557 my $standard_form = ($type) ? $value : main::standardize($value);
3558
3559 for ($j = $i; $j < $range_list_size; $j++) {
3560 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3561
3562 # If find a range that it doesn't overlap into, we can stop
3563 # searching
3564 last if $end < $r->[$j]->start;
3565
969a34cc
KW
3566 # Here, overlaps the range at $j. If the values don't match,
3567 # and so far we think this is a clean insertion, it becomes a
3568 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3569 if ($clean_insert) {
99870f4d 3570 if ($r->[$j]->standard_form ne $standard_form) {
969a34cc 3571 $clean_insert = 0;
56343c78
KW
3572 if ($replace == $CROAK) {
3573 main::croak("The range to add "
3574 . sprintf("%04X", $start)
3575 . '-'
3576 . sprintf("%04X", $end)
3577 . " with value '$value' overlaps an existing range $r->[$j]");
3578 }
99870f4d
KW
3579 }
3580 else {
3581
3582 # Here, the two values are essentially the same. If the
3583 # two are actually identical, replacing wouldn't change
3584 # anything so skip it.
3585 my $pre_existing = $r->[$j]->value;
3586 if ($pre_existing ne $value) {
3587
3588 # Here the new and old standardized values are the
3589 # same, but the non-standardized values aren't. If
3590 # replacing unconditionally, then replace
3591 if( $replace == $UNCONDITIONALLY) {
969a34cc 3592 $clean_insert = 0;
99870f4d
KW
3593 }
3594 else {
3595
3596 # Here, are replacing conditionally. Decide to
3597 # replace or not based on which appears to look
3598 # the "nicest". If one is mixed case and the
3599 # other isn't, choose the mixed case one.
3600 my $new_mixed = $value =~ /[A-Z]/
3601 && $value =~ /[a-z]/;
3602 my $old_mixed = $pre_existing =~ /[A-Z]/
3603 && $pre_existing =~ /[a-z]/;
3604
3605 if ($old_mixed != $new_mixed) {
969a34cc 3606 $clean_insert = 0 if $new_mixed;
99870f4d 3607 if (main::DEBUG && $to_trace) {
969a34cc
KW
3608 if ($clean_insert) {
3609 trace "Retaining $pre_existing over $value";
99870f4d
KW
3610 }
3611 else {
969a34cc 3612 trace "Replacing $pre_existing with $value";
99870f4d
KW
3613 }
3614 }
3615 }
3616 else {
3617
3618 # Here casing wasn't different between the two.
3619 # If one has hyphens or underscores and the
3620 # other doesn't, choose the one with the
3621 # punctuation.
3622 my $new_punct = $value =~ /[-_]/;
3623 my $old_punct = $pre_existing =~ /[-_]/;
3624
3625 if ($old_punct != $new_punct) {
969a34cc 3626 $clean_insert = 0 if $new_punct;
99870f4d 3627 if (main::DEBUG && $to_trace) {
969a34cc
KW
3628 if ($clean_insert) {
3629 trace "Retaining $pre_existing over $value";
99870f4d
KW
3630 }
3631 else {
969a34cc 3632 trace "Replacing $pre_existing with $value";
99870f4d
KW
3633 }
3634 }
3635 } # else existing one is just as "good";
3636 # retain it to save cycles.
3637 }
3638 }
3639 }
3640 }
3641 }
3642 } # End of loop looking for highest affected range.
3643
3644 # Here, $j points to one beyond the highest range that this insertion
3645 # affects (hence to beyond the range list if that range is the final
3646 # one in the range list).
3647
3648 # The splice length is all the affected ranges. Get it before
3649 # subtracting, for efficiency, so we don't have to later add 1.
3650 my $length = $j - $i;
3651
3652 $j--; # $j now points to the highest affected range.
3653 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3654
99870f4d
KW
3655 # Here, have taken care of $NO and $MULTIPLE replaces.
3656 # $j points to the highest affected range. But it can be < $i or even
3657 # -1. These happen only if the insertion is entirely in the gap
3658 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3659 # above exited first time through with $end < $r->[$i]->start. (And
3660 # then we subtracted one from j) This implies also that $start <
3661 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3662 # $start, so the entire input range is in the gap.
3663 if ($j < $i) {
3664
3665 # Here the entire input range is in the gap before $i.
3666
3667 if (main::DEBUG && $to_trace) {
3668 if ($i) {
3669 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3670 }
3671 else {
3672 trace "Entire range is before $r->[$i]";
3673 }
3674 }
3675 return if $operation ne '+'; # Deletion of a non-existent range is
3676 # a no-op
3677 }
3678 else {
3679
969a34cc
KW
3680 # Here part of the input range is not in the gap before $i. Thus,
3681 # there is at least one affected one, and $j points to the highest
3682 # such one.
99870f4d
KW
3683
3684 # At this point, here is the situation:
3685 # This is not an insertion of a multiple, nor of tentative ($NO)
3686 # data.
3687 # $i points to the first element in the current range list that
3688 # may be affected by this operation. In fact, we know
3689 # that the range at $i is affected because we are in
3690 # the else branch of this 'if'
3691 # $j points to the highest affected range.
3692 # In other words,
3693 # r[$i-1]->end < $start <= r[$i]->end
3694 # And:
3695 # r[$i-1]->end < $start <= $end <= r[$j]->end
3696 #
3697 # Also:
969a34cc
KW
3698 # $clean_insert is a boolean which is set true if and only if
3699 # this is a "clean insertion", i.e., not a change nor a
3700 # deletion (multiple was handled above).
99870f4d
KW
3701
3702 # We now have enough information to decide if this call is a no-op
969a34cc
KW
3703 # or not. It is a no-op if this is an insertion of already
3704 # existing data.
99870f4d 3705
969a34cc 3706 if (main::DEBUG && $to_trace && $clean_insert
99870f4d
KW
3707 && $i == $j
3708 && $start >= $r->[$i]->start)
3709 {
3710 trace "no-op";
3711 }
969a34cc 3712 return if $clean_insert
99870f4d
KW
3713 && $i == $j # more than one affected range => not no-op
3714
3715 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3716 # Further, $start and/or $end is >= r[$i]->start
3717 # The test below hence guarantees that
3718 # r[$i]->start < $start <= $end <= r[$i]->end
3719 # This means the input range is contained entirely in
3720 # the one at $i, so is a no-op
3721 && $start >= $r->[$i]->start;
3722 }
3723
3724 # Here, we know that some action will have to be taken. We have
3725 # calculated the offset and length (though adjustments may be needed)
3726 # for the splice. Now start constructing the replacement list.
3727 my @replacement;
3728 my $splice_start = $i;
3729
3730 my $extends_below;
3731 my $extends_above;
3732
3733 # See if should extend any adjacent ranges.
3734 if ($operation eq '-') { # Don't extend deletions
3735 $extends_below = $extends_above = 0;
3736 }
3737 else { # Here, should extend any adjacent ranges. See if there are
3738 # any.
3739 $extends_below = ($i > 0
3740 # can't extend unless adjacent
3741 && $r->[$i-1]->end == $start -1
3742 # can't extend unless are same standard value
3743 && $r->[$i-1]->standard_form eq $standard_form
3744 # can't extend unless share type
3745 && $r->[$i-1]->type == $type);
3746 $extends_above = ($j+1 < $range_list_size
3747 && $r->[$j+1]->start == $end +1
3748 && $r->[$j+1]->standard_form eq $standard_form
23822bda 3749 && $r->[$j+1]->type == $type);
99870f4d
KW
3750 }
3751 if ($extends_below && $extends_above) { # Adds to both
3752 $splice_start--; # start replace at element below
3753 $length += 2; # will replace on both sides
3754 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3755
3756 # The result will fill in any gap, replacing both sides, and
3757 # create one large range.
3758 @replacement = Range->new($r->[$i-1]->start,
3759 $r->[$j+1]->end,
3760 Value => $value,
3761 Type => $type);
3762 }
3763 else {
3764
3765 # Here we know that the result won't just be the conglomeration of
3766 # a new range with both its adjacent neighbors. But it could
3767 # extend one of them.
3768
3769 if ($extends_below) {
3770
3771 # Here the new element adds to the one below, but not to the
3772 # one above. If inserting, and only to that one range, can
3773 # just change its ending to include the new one.
969a34cc 3774 if ($length == 0 && $clean_insert) {
99870f4d
KW
3775 $r->[$i-1]->set_end($end);
3776 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3777 return;
3778 }
3779 else {
3780 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3781 $splice_start--; # start replace at element below
3782 $length++; # will replace the element below
3783 $start = $r->[$i-1]->start;
3784 }
3785 }
3786 elsif ($extends_above) {
3787
3788 # Here the new element adds to the one above, but not below.
3789 # Mirror the code above
969a34cc 3790 if ($length == 0 && $clean_insert) {
99870f4d
KW
3791 $r->[$j+1]->set_start($start);
3792 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3793 return;
3794 }
3795 else {
3796 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3797 $length++; # will replace the element above
3798 $end = $r->[$j+1]->end;
3799 }
3800 }
3801
3802 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3803
3804 # Finally, here we know there will have to be a splice.
3805 # If the change or delete affects only the highest portion of the
3806 # first affected range, the range will have to be split. The
3807 # splice will remove the whole range, but will replace it by a new
3808 # range containing just the unaffected part. So, in this case,
3809 # add to the replacement list just this unaffected portion.
3810 if (! $extends_below
3811 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3812 {
3813 push @replacement,
3814 Range->new($r->[$i]->start,
3815 $start - 1,
3816 Value => $r->[$i]->value,
3817 Type => $r->[$i]->type);
3818 }
3819
3820 # In the case of an insert or change, but not a delete, we have to
3821 # put in the new stuff; this comes next.
3822 if ($operation eq '+') {
3823 push @replacement, Range->new($start,
3824 $end,
3825 Value => $value,
3826 Type => $type);
3827 }
3828
3829 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3830 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3831
3832 # And finally, if we're changing or deleting only a portion of the
3833 # highest affected range, it must be split, as the lowest one was.
3834 if (! $extends_above
3835 && $j >= 0 # Remember that j can be -1 if before first
3836 # current element
3837 && $end >= $r->[$j]->start
3838 && $end < $r->[$j]->end)
3839 {
3840 push @replacement,
3841 Range->new($end + 1,
3842 $r->[$j]->end,
3843 Value => $r->[$j]->value,
3844 Type => $r->[$j]->type);
3845 }
3846 }
3847
3848 # And do the splice, as calculated above
3849 if (main::DEBUG && $to_trace) {
3850 trace "replacing $length element(s) at $i with ";
3851 foreach my $replacement (@replacement) {
3852 trace " $replacement";
3853 }
3854 trace "Before splice:";
3855 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3856 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3857 trace "i =[", $i, "]", $r->[$i];
3858 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3859 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3860 }
3861
3862 my @return = splice @$r, $splice_start, $length, @replacement;
3863
3864 if (main::DEBUG && $to_trace) {
3865 trace "After splice:";
3866 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3867 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3868 trace "i =[", $i, "]", $r->[$i];
3869 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3870 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
e6451557 3871 trace "removed ", @return if @return;
99870f4d
KW
3872 }
3873
3874 # An actual deletion could have changed the maximum in the list.
3875 # There was no deletion if the splice didn't return something, but
3876 # otherwise recalculate it. This is done too rarely to worry about
3877 # performance.
3878 if ($operation eq '-' && @return) {
3879 $max{$addr} = $r->[-1]->end;
3880 }
3881 return @return;
3882 }
3883
3884 sub reset_each_range { # reset the iterator for each_range();
3885 my $self = shift;
3886 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3887
f998e60c 3888 no overloading;
051df77b 3889 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
3890 return;
3891 }
3892
3893 sub each_range {
3894 # Iterate over each range in a range list. Results are undefined if
3895 # the range list is changed during the iteration.
3896
3897 my $self = shift;
3898 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3899
ffe43484 3900 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3901
3902 return if $self->is_empty;
3903
3904 $each_range_iterator{$addr} = -1
3905 if ! defined $each_range_iterator{$addr};
3906 $each_range_iterator{$addr}++;
3907 return $ranges{$addr}->[$each_range_iterator{$addr}]
3908 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3909 undef $each_range_iterator{$addr};
3910 return;
3911 }
3912
3913 sub count { # Returns count of code points in range list
3914 my $self = shift;
3915 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3916
ffe43484 3917 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3918
3919 my $count = 0;
3920 foreach my $range (@{$ranges{$addr}}) {
3921 $count += $range->end - $range->start + 1;
3922 }
3923 return $count;
3924 }
3925
3926 sub delete_range { # Delete a range
3927 my $self = shift;
3928 my $start = shift;
3929 my $end = shift;
3930
3931 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3932
3933 return $self->_add_delete('-', $start, $end, "");
3934 }
3935
3936 sub is_empty { # Returns boolean as to if a range list is empty
3937 my $self = shift;
3938 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3939
f998e60c 3940 no overloading;
051df77b 3941 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
3942 }
3943
3944 sub hash {
3945 # Quickly returns a scalar suitable for separating tables into
3946 # buckets, i.e. it is a hash function of the contents of a table, so
3947 # there are relatively few conflicts.
3948
3949 my $self = shift;
3950 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3951
ffe43484 3952 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3953
3954 # These are quickly computable. Return looks like 'min..max;count'
3955 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3956 }
3957} # End closure for _Range_List_Base
3958
3959package Range_List;
3960use base '_Range_List_Base';
3961
3962# A Range_List is a range list for match tables; i.e. the range values are
3963# not significant. Thus a number of operations can be safely added to it,
3964# such as inversion, intersection. Note that union is also an unsafe
3965# operation when range values are cared about, and that method is in the base
3966# class, not here. But things are set up so that that method is callable only
3967# during initialization. Only in this derived class, is there an operation
3968# that combines two tables. A Range_Map can thus be used to initialize a
3969# Range_List, and its mappings will be in the list, but are not significant to
3970# this class.
3971
3972sub trace { return main::trace(@_); }
3973
3974{ # Closure
3975
3976 use overload
3977 fallback => 0,
3978 '+' => sub { my $self = shift;
3979 my $other = shift;
3980
3981 return $self->_union($other)
3982 },
3983 '&' => sub { my $self = shift;
3984 my $other = shift;
3985
3986 return $self->_intersect($other, 0);
3987 },
3988 '~' => "_invert",
3989 '-' => "_subtract",
3990 ;
3991
3992 sub _invert {
3993 # Returns a new Range_List that gives all code points not in $self.
3994
3995 my $self = shift;
3996
3997 my $new = Range_List->new;
3998
3999 # Go through each range in the table, finding the gaps between them
4000 my $max = -1; # Set so no gap before range beginning at 0
4001 for my $range ($self->ranges) {
4002 my $start = $range->start;
4003 my $end = $range->end;
4004
4005 # If there is a gap before this range, the inverse will contain
4006 # that gap.
4007 if ($start > $max + 1) {
4008 $new->add_range($max + 1, $start - 1);
4009 }
4010 $max = $end;
4011 }
4012
4013 # And finally, add the gap from the end of the table to the max
4014 # possible code point
4015 if ($max < $LAST_UNICODE_CODEPOINT) {
4016 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
4017 }
4018 return $new;
4019 }
4020
4021 sub _subtract {
4022 # Returns a new Range_List with the argument deleted from it. The
4023 # argument can be a single code point, a range, or something that has
4024 # a range, with the _range_list() method on it returning them
4025
4026 my $self = shift;
4027 my $other = shift;
4028 my $reversed = shift;
4029 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4030
4031 if ($reversed) {
4032 Carp::my_carp_bug("Can't cope with a "
4033 . __PACKAGE__
4034 . " being the second parameter in a '-'. Subtraction ignored.");
4035 return $self;
4036 }
4037
4038 my $new = Range_List->new(Initialize => $self);
4039
4040 if (! ref $other) { # Single code point
4041 $new->delete_range($other, $other);
4042 }
4043 elsif ($other->isa('Range')) {
4044 $new->delete_range($other->start, $other->end);
4045 }
4046 elsif ($other->can('_range_list')) {
4047 foreach my $range ($other->_range_list->ranges) {
4048 $new->delete_range($range->start, $range->end);
4049 }
4050 }
4051 else {
4052 Carp::my_carp_bug("Can't cope with a "
4053 . ref($other)
4054 . " argument to '-'. Subtraction ignored."
4055 );
4056 return $self;
4057 }
4058
4059 return $new;
4060 }
4061
4062 sub _intersect {
4063 # Returns either a boolean giving whether the two inputs' range lists
4064 # intersect (overlap), or a new Range_List containing the intersection
4065 # of the two lists. The optional final parameter being true indicates
4066 # to do the check instead of the intersection.
4067
4068 my $a_object = shift;
4069 my $b_object = shift;
4070 my $check_if_overlapping = shift;
4071 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4072 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4073
4074 if (! defined $b_object) {
4075 my $message = "";
4076 $message .= $a_object->_owner_name_of if defined $a_object;
4077 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4078 return;
4079 }
4080
4081 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4082 # Thus the intersection could be much more simply be written:
4083 # return ~(~$a_object + ~$b_object);
4084 # But, this is slower, and when taking the inverse of a large
4085 # range_size_1 table, back when such tables were always stored that
4086 # way, it became prohibitively slow, hence the code was changed to the
4087 # below
4088
4089 if ($b_object->isa('Range')) {
4090 $b_object = Range_List->new(Initialize => $b_object,
4091 Owner => $a_object->_owner_name_of);
4092 }
4093 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4094
4095 my @a_ranges = $a_object->ranges;
4096 my @b_ranges = $b_object->ranges;
4097
4098 #local $to_trace = 1 if main::DEBUG;
4099 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4100
4101 # Start with the first range in each list
4102 my $a_i = 0;
4103 my $range_a = $a_ranges[$a_i];
4104 my $b_i = 0;
4105 my $range_b = $b_ranges[$b_i];
4106
4107 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4108 if ! $check_if_overlapping;
4109
4110 # If either list is empty, there is no intersection and no overlap
4111 if (! defined $range_a || ! defined $range_b) {
4112 return $check_if_overlapping ? 0 : $new;
4113 }
4114 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4115
4116 # Otherwise, must calculate the intersection/overlap. Start with the
4117 # very first code point in each list
4118 my $a = $range_a->start;
4119 my $b = $range_b->start;
4120
4121 # Loop through all the ranges of each list; in each iteration, $a and
4122 # $b are the current code points in their respective lists
4123 while (1) {
4124
4125 # If $a and $b are the same code point, ...
4126 if ($a == $b) {
4127
4128 # it means the lists overlap. If just checking for overlap
4129 # know the answer now,
4130 return 1 if $check_if_overlapping;
4131
4132 # The intersection includes this code point plus anything else
4133 # common to both current ranges.
4134 my $start = $a;
4135 my $end = main::min($range_a->end, $range_b->end);
4136 if (! $check_if_overlapping) {
4137 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4138 $new->add_range($start, $end);
4139 }
4140
4141 # Skip ahead to the end of the current intersect
4142 $a = $b = $end;
4143
4144 # If the current intersect ends at the end of either range (as
4145 # it must for at least one of them), the next possible one
4146 # will be the beginning code point in it's list's next range.
4147 if ($a == $range_a->end) {
4148 $range_a = $a_ranges[++$a_i];
4149 last unless defined $range_a;
4150 $a = $range_a->start;
4151 }
4152 if ($b == $range_b->end) {
4153 $range_b = $b_ranges[++$b_i];
4154 last unless defined $range_b;
4155 $b = $range_b->start;
4156 }
4157
4158 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4159 }
4160 elsif ($a < $b) {
4161
4162 # Not equal, but if the range containing $a encompasses $b,
4163 # change $a to be the middle of the range where it does equal
4164 # $b, so the next iteration will get the intersection
4165 if ($range_a->end >= $b) {
4166 $a = $b;
4167 }
4168 else {
4169
4170 # Here, the current range containing $a is entirely below
4171 # $b. Go try to find a range that could contain $b.
4172 $a_i = $a_object->_search_ranges($b);
4173
4174 # If no range found, quit.
4175 last unless defined $a_i;
4176
4177 # The search returns $a_i, such that
4178 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4179 # Set $a to the beginning of this new range, and repeat.
4180 $range_a = $a_ranges[$a_i];
4181 $a = $range_a->start;
4182 }
4183 }
4184 else { # Here, $b < $a.
4185
4186 # Mirror image code to the leg just above
4187 if ($range_b->end >= $a) {
4188 $b = $a;
4189 }
4190 else {
4191 $b_i = $b_object->_search_ranges($a);
4192 last unless defined $b_i;
4193 $range_b = $b_ranges[$b_i];
4194 $b = $range_b->start;
4195 }
4196 }
4197 } # End of looping through ranges.
4198
4199 # Intersection fully computed, or now know that there is no overlap
4200 return $check_if_overlapping ? 0 : $new;
4201 }
4202
4203 sub overlaps {
4204 # Returns boolean giving whether the two arguments overlap somewhere
4205
4206 my $self = shift;
4207 my $other = shift;
4208 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4209
4210 return $self->_intersect($other, 1);
4211 }
4212
4213 sub add_range {
4214 # Add a range to the list.
4215
4216 my $self = shift;
4217 my $start = shift;
4218 my $end = shift;
4219 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4220
4221 return $self->_add_delete('+', $start, $end, "");
4222 }
4223
09aba7e4
KW
4224 sub matches_identically_to {
4225 # Return a boolean as to whether or not two Range_Lists match identical
4226 # sets of code points.
4227
4228 my $self = shift;
4229 my $other = shift;
4230 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4231
4232 # These are ordered in increasing real time to figure out (at least
4233 # until a patch changes that and doesn't change this)
4234 return 0 if $self->max != $other->max;
4235 return 0 if $self->min != $other->min;
4236 return 0 if $self->range_count != $other->range_count;
4237 return 0 if $self->count != $other->count;
4238
4239 # Here they could be identical because all the tests above passed.
4240 # The loop below is somewhat simpler since we know they have the same
4241 # number of elements. Compare range by range, until reach the end or
4242 # find something that differs.
4243 my @a_ranges = $self->ranges;
4244 my @b_ranges = $other->ranges;
4245 for my $i (0 .. @a_ranges - 1) {
4246 my $a = $a_ranges[$i];
4247 my $b = $b_ranges[$i];
4248 trace "self $a; other $b" if main::DEBUG && $to_trace;
4249 return 0 if $a->start != $b->start || $a->end != $b->end;
4250 }
4251 return 1;
4252 }
4253
99870f4d
KW
4254 sub is_code_point_usable {
4255 # This used only for making the test script. See if the input
4256 # proposed trial code point is one that Perl will handle. If second
4257 # parameter is 0, it won't select some code points for various
4258 # reasons, noted below.
4259
4260 my $code = shift;
4261 my $try_hard = shift;
4262 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4263
4264 return 0 if $code < 0; # Never use a negative
4265
99870f4d
KW
4266 # shun null. I'm (khw) not sure why this was done, but NULL would be
4267 # the character very frequently used.
4268 return $try_hard if $code == 0x0000;
4269
99870f4d
KW
4270 # shun non-character code points.
4271 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4272 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4273
4274 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
4275 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4276
4277 return 1;
4278 }
4279
4280 sub get_valid_code_point {
4281 # Return a code point that's part of the range list. Returns nothing
4282 # if the table is empty or we can't find a suitable code point. This
4283 # used only for making the test script.
4284
4285 my $self = shift;
4286 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4287
ffe43484 4288 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4289
4290 # On first pass, don't choose less desirable code points; if no good
4291 # one is found, repeat, allowing a less desirable one to be selected.
4292 for my $try_hard (0, 1) {
4293
4294 # Look through all the ranges for a usable code point.
4295 for my $set ($self->ranges) {
4296
4297 # Try the edge cases first, starting with the end point of the
4298 # range.
4299 my $end = $set->end;
4300 return $end if is_code_point_usable($end, $try_hard);
4301
4302 # End point didn't, work. Start at the beginning and try
4303 # every one until find one that does work.
4304 for my $trial ($set->start .. $end - 1) {
4305 return $trial if is_code_point_usable($trial, $try_hard);
4306 }
4307 }
4308 }
4309 return (); # If none found, give up.
4310 }
4311
4312 sub get_invalid_code_point {
4313 # Return a code point that's not part of the table. Returns nothing
4314 # if the table covers all code points or a suitable code point can't
4315 # be found. This used only for making the test script.
4316
4317 my $self = shift;
4318 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4319
4320 # Just find a valid code point of the inverse, if any.
4321 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4322 }
4323} # end closure for Range_List
4324
4325package Range_Map;
4326use base '_Range_List_Base';
4327
4328# A Range_Map is a range list in which the range values (called maps) are
4329# significant, and hence shouldn't be manipulated by our other code, which
4330# could be ambiguous or lose things. For example, in taking the union of two
4331# lists, which share code points, but which have differing values, which one
4332# has precedence in the union?
4333# It turns out that these operations aren't really necessary for map tables,
4334# and so this class was created to make sure they aren't accidentally
4335# applied to them.
4336
4337{ # Closure
4338
4339 sub add_map {
4340 # Add a range containing a mapping value to the list
4341
4342 my $self = shift;
4343 # Rest of parameters passed on
4344
4345 return $self->_add_delete('+', @_);
4346 }
4347
4348 sub add_duplicate {
4349 # Adds entry to a range list which can duplicate an existing entry
4350
4351 my $self = shift;
4352 my $code_point = shift;
4353 my $value = shift;
4354 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4355
4356 return $self->add_map($code_point, $code_point,
4357 $value, Replace => $MULTIPLE);
4358 }
4359} # End of closure for package Range_Map
4360
4361package _Base_Table;
4362
4363# A table is the basic data structure that gets written out into a file for
4364# use by the Perl core. This is the abstract base class implementing the
4365# common elements from the derived ones. A list of the methods to be
4366# furnished by an implementing class is just after the constructor.
4367
4368sub standardize { return main::standardize($_[0]); }
4369sub trace { return main::trace(@_); }
4370
4371{ # Closure
4372
4373 main::setup_package();
4374
4375 my %range_list;
4376 # Object containing the ranges of the table.
4377 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4378
4379 my %full_name;
4380 # The full table name.
4381 main::set_access('full_name', \%full_name, 'r');
4382
4383 my %name;
4384 # The table name, almost always shorter
4385 main::set_access('name', \%name, 'r');
4386
4387 my %short_name;
4388 # The shortest of all the aliases for this table, with underscores removed
4389 main::set_access('short_name', \%short_name);
4390
4391 my %nominal_short_name_length;
4392 # The length of short_name before removing underscores
4393 main::set_access('nominal_short_name_length',
4394 \%nominal_short_name_length);
4395
23e33b60
KW
4396 my %complete_name;
4397 # The complete name, including property.
4398 main::set_access('complete_name', \%complete_name, 'r');
4399
99870f4d
KW
4400 my %property;
4401 # Parent property this table is attached to.
4402 main::set_access('property', \%property, 'r');
4403
4404 my %aliases;
c12f2655
KW
4405 # Ordered list of alias objects of the table's name. The first ones in
4406 # the list are output first in comments
99870f4d
KW
4407 main::set_access('aliases', \%aliases, 'readable_array');
4408
4409 my %comment;
4410 # A comment associated with the table for human readers of the files
4411 main::set_access('comment', \%comment, 's');
4412
4413 my %description;
4414 # A comment giving a short description of the table's meaning for human
4415 # readers of the files.
4416 main::set_access('description', \%description, 'readable_array');
4417
4418 my %note;
4419 # A comment giving a short note about the table for human readers of the
4420 # files.
4421 main::set_access('note', \%note, 'readable_array');
4422
4423 my %internal_only;
c12f2655 4424 # Boolean; if set this table is for internal core Perl only use.
99870f4d
KW
4425 main::set_access('internal_only', \%internal_only);
4426
4427 my %find_table_from_alias;
4428 # The parent property passes this pointer to a hash which this class adds
4429 # all its aliases to, so that the parent can quickly take an alias and
4430 # find this table.
4431 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4432
4433 my %locked;
4434 # After this table is made equivalent to another one; we shouldn't go
4435 # changing the contents because that could mean it's no longer equivalent
4436 main::set_access('locked', \%locked, 'r');
4437
4438 my %file_path;
4439 # This gives the final path to the file containing the table. Each
4440 # directory in the path is an element in the array
4441 main::set_access('file_path', \%file_path, 'readable_array');
4442
4443 my %status;
4444 # What is the table's status, normal, $OBSOLETE, etc. Enum
4445 main::set_access('status', \%status, 'r');
4446
4447 my %status_info;
4448 # A comment about its being obsolete, or whatever non normal status it has
4449 main::set_access('status_info', \%status_info, 'r');
4450
d867ccfb
KW
4451 my %caseless_equivalent;
4452 # The table this is equivalent to under /i matching, if any.
4453 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4454
99870f4d
KW
4455 my %range_size_1;
4456 # Is the table to be output with each range only a single code point?
4457 # This is done to avoid breaking existing code that may have come to rely
4458 # on this behavior in previous versions of this program.)
4459 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4460
4461 my %perl_extension;
4462 # A boolean set iff this table is a Perl extension to the Unicode
4463 # standard.
4464 main::set_access('perl_extension', \%perl_extension, 'r');
4465
0c07e538
KW
4466 my %output_range_counts;
4467 # A boolean set iff this table is to have comments written in the
4468 # output file that contain the number of code points in the range.
4469 # The constructor can override the global flag of the same name.
4470 main::set_access('output_range_counts', \%output_range_counts, 'r');
4471
f5817e0a
KW
4472 my %format;
4473 # The format of the entries of the table. This is calculated from the
4474 # data in the table (or passed in the constructor). This is an enum e.g.,
4475 # $STRING_FORMAT
4476 main::set_access('format', \%format, 'r', 'p_s');
4477
99870f4d
KW
4478 sub new {
4479 # All arguments are key => value pairs, which you can see below, most
4480 # of which match fields documented above. Otherwise: Pod_Entry,
4481 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4482 # documented in the Alias package
4483
4484 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4485
4486 my $class = shift;
4487
4488 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4489 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4490
4491 my %args = @_;
4492
4493 $name{$addr} = delete $args{'Name'};
4494 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4495 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4496 my $complete_name = $complete_name{$addr}
4497 = delete $args{'Complete_Name'};
f5817e0a 4498 $format{$addr} = delete $args{'Format'};
99870f4d 4499 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
0c07e538 4500 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4501 $property{$addr} = delete $args{'_Property'};
4502 $range_list{$addr} = delete $args{'_Range_List'};
4503 $status{$addr} = delete $args{'Status'} || $NORMAL;
4504 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4505 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
d867ccfb 4506 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
99870f4d
KW
4507
4508 my $description = delete $args{'Description'};
4509 my $externally_ok = delete $args{'Externally_Ok'};
4510 my $loose_match = delete $args{'Fuzzy'};
4511 my $note = delete $args{'Note'};
4512 my $make_pod_entry = delete $args{'Pod_Entry'};
37e2e78e 4513 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4514
4515 # Shouldn't have any left over
4516 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4517
4518 # Can't use || above because conceivably the name could be 0, and
4519 # can't use // operator in case this program gets used in Perl 5.8
4520 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4521 $output_range_counts{$addr} = $output_range_counts if
4522 ! defined $output_range_counts{$addr};
99870f4d
KW
4523
4524 $aliases{$addr} = [ ];
4525 $comment{$addr} = [ ];
4526 $description{$addr} = [ ];
4527 $note{$addr} = [ ];
4528 $file_path{$addr} = [ ];
4529 $locked{$addr} = "";
4530
4531 push @{$description{$addr}}, $description if $description;
4532 push @{$note{$addr}}, $note if $note;
4533
37e2e78e
KW
4534 if ($status{$addr} eq $PLACEHOLDER) {
4535
4536 # A placeholder table doesn't get documented, is a perl extension,
4537 # and quite likely will be empty
4538 $make_pod_entry = 0 if ! defined $make_pod_entry;
4539 $perl_extension = 1 if ! defined $perl_extension;
4540 push @tables_that_may_be_empty, $complete_name{$addr};
4541 }
4542 elsif (! $status{$addr}) {
4543
4544 # If hasn't set its status already, see if it is on one of the
4545 # lists of properties or tables that have particular statuses; if
4546 # not, is normal. The lists are prioritized so the most serious
4547 # ones are checked first
ec11e5f4 4548 if (exists $why_suppressed{$complete_name}
98dc9551 4549 # Don't suppress if overridden
ec11e5f4
KW
4550 && ! grep { $_ eq $complete_name{$addr} }
4551 @output_mapped_properties)
4552 {
99870f4d
KW
4553 $status{$addr} = $SUPPRESSED;
4554 }
4555 elsif (exists $why_deprecated{$complete_name}) {
4556 $status{$addr} = $DEPRECATED;
4557 }
4558 elsif (exists $why_stabilized{$complete_name}) {
4559 $status{$addr} = $STABILIZED;
4560 }
4561 elsif (exists $why_obsolete{$complete_name}) {
4562 $status{$addr} = $OBSOLETE;
4563 }
4564
4565 # Existence above doesn't necessarily mean there is a message
4566 # associated with it. Use the most serious message.
4567 if ($status{$addr}) {
4568 if ($why_suppressed{$complete_name}) {
4569 $status_info{$addr}
4570 = $why_suppressed{$complete_name};
4571 }
4572 elsif ($why_deprecated{$complete_name}) {
4573 $status_info{$addr}
4574 = $why_deprecated{$complete_name};
4575 }
4576 elsif ($why_stabilized{$complete_name}) {
4577 $status_info{$addr}
4578 = $why_stabilized{$complete_name};
4579 }
4580 elsif ($why_obsolete{$complete_name}) {
4581 $status_info{$addr}
4582 = $why_obsolete{$complete_name};
4583 }
4584 }
4585 }
4586
37e2e78e
KW
4587 $perl_extension{$addr} = $perl_extension || 0;
4588
99870f4d
KW
4589 # By convention what typically gets printed only or first is what's
4590 # first in the list, so put the full name there for good output
4591 # clarity. Other routines rely on the full name being first on the
4592 # list
4593 $self->add_alias($full_name{$addr},
4594 Externally_Ok => $externally_ok,
4595 Fuzzy => $loose_match,
4596 Pod_Entry => $make_pod_entry,
4597 Status => $status{$addr},
4598 );
4599
4600 # Then comes the other name, if meaningfully different.
4601 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4602 $self->add_alias($name{$addr},
4603 Externally_Ok => $externally_ok,
4604 Fuzzy => $loose_match,
4605 Pod_Entry => $make_pod_entry,
4606 Status => $status{$addr},
4607 );
4608 }
4609
4610 return $self;
4611 }
4612
4613 # Here are the methods that are required to be defined by any derived
4614 # class
ea25a9b2 4615 for my $sub (qw(
668b3bfc 4616 handle_special_range
99870f4d 4617 append_to_body
99870f4d 4618 pre_body
ea25a9b2 4619 ))
668b3bfc
KW
4620 # write() knows how to write out normal ranges, but it calls
4621 # handle_special_range() when it encounters a non-normal one.
4622 # append_to_body() is called by it after it has handled all
4623 # ranges to add anything after the main portion of the table.
4624 # And finally, pre_body() is called after all this to build up
4625 # anything that should appear before the main portion of the
4626 # table. Doing it this way allows things in the middle to
4627 # affect what should appear before the main portion of the
99870f4d 4628 # table.
99870f4d
KW
4629 {
4630 no strict "refs";
4631 *$sub = sub {
4632 Carp::my_carp_bug( __LINE__
4633 . ": Must create method '$sub()' for "
4634 . ref shift);
4635 return;
4636 }
4637 }
4638
4639 use overload
4640 fallback => 0,
4641 "." => \&main::_operator_dot,
4642 '!=' => \&main::_operator_not_equal,
4643 '==' => \&main::_operator_equal,
4644 ;
4645
4646 sub ranges {
4647 # Returns the array of ranges associated with this table.
4648
f998e60c 4649 no overloading;
051df77b 4650 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4651 }
4652
4653 sub add_alias {
4654 # Add a synonym for this table.
4655
4656 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4657
4658 my $self = shift;
4659 my $name = shift; # The name to add.
4660 my $pointer = shift; # What the alias hash should point to. For
4661 # map tables, this is the parent property;
4662 # for match tables, it is the table itself.
4663
4664 my %args = @_;
4665 my $loose_match = delete $args{'Fuzzy'};
4666
4667 my $make_pod_entry = delete $args{'Pod_Entry'};
4668 $make_pod_entry = $YES unless defined $make_pod_entry;
4669
4670 my $externally_ok = delete $args{'Externally_Ok'};
4671 $externally_ok = 1 unless defined $externally_ok;
4672
4673 my $status = delete $args{'Status'};
4674 $status = $NORMAL unless defined $status;
4675
4676 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4677
4678 # Capitalize the first letter of the alias unless it is one of the CJK
4679 # ones which specifically begins with a lower 'k'. Do this because
4680 # Unicode has varied whether they capitalize first letters or not, and
4681 # have later changed their minds and capitalized them, but not the
4682 # other way around. So do it always and avoid changes from release to
4683 # release
4684 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4685
ffe43484 4686 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4687
4688 # Figure out if should be loosely matched if not already specified.
4689 if (! defined $loose_match) {
4690
4691 # Is a loose_match if isn't null, and doesn't begin with an
4692 # underscore and isn't just a number
4693 if ($name ne ""
4694 && substr($name, 0, 1) ne '_'
4695 && $name !~ qr{^[0-9_.+-/]+$})
4696 {
4697 $loose_match = 1;
4698 }
4699 else {
4700 $loose_match = 0;
4701 }
4702 }
4703
4704 # If this alias has already been defined, do nothing.
4705 return if defined $find_table_from_alias{$addr}->{$name};
4706
4707 # That includes if it is standardly equivalent to an existing alias,
4708 # in which case, add this name to the list, so won't have to search
4709 # for it again.
4710 my $standard_name = main::standardize($name);
4711 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4712 $find_table_from_alias{$addr}->{$name}
4713 = $find_table_from_alias{$addr}->{$standard_name};
4714 return;
4715 }
4716
4717 # Set the index hash for this alias for future quick reference.
4718 $find_table_from_alias{$addr}->{$name} = $pointer;
4719 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4720 local $to_trace = 0 if main::DEBUG;
4721 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4722 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4723
4724
4725 # Put the new alias at the end of the list of aliases unless the final
4726 # element begins with an underscore (meaning it is for internal perl
4727 # use) or is all numeric, in which case, put the new one before that
4728 # one. This floats any all-numeric or underscore-beginning aliases to
4729 # the end. This is done so that they are listed last in output lists,
4730 # to encourage the user to use a better name (either more descriptive
4731 # or not an internal-only one) instead. This ordering is relied on
4732 # implicitly elsewhere in this program, like in short_name()
4733 my $list = $aliases{$addr};
4734 my $insert_position = (@$list == 0
4735 || (substr($list->[-1]->name, 0, 1) ne '_'
4736 && $list->[-1]->name =~ /\D/))
4737 ? @$list
4738 : @$list - 1;
4739 splice @$list,
4740 $insert_position,
4741 0,
4742 Alias->new($name, $loose_match, $make_pod_entry,
4743 $externally_ok, $status);
4744
4745 # This name may be shorter than any existing ones, so clear the cache
4746 # of the shortest, so will have to be recalculated.
f998e60c 4747 no overloading;
051df77b 4748 undef $short_name{pack 'J', $self};
99870f4d
KW
4749 return;
4750 }
4751
4752 sub short_name {
4753 # Returns a name suitable for use as the base part of a file name.
4754 # That is, shorter wins. It can return undef if there is no suitable
4755 # name. The name has all non-essential underscores removed.
4756
4757 # The optional second parameter is a reference to a scalar in which
4758 # this routine will store the length the returned name had before the
4759 # underscores were removed, or undef if the return is undef.
4760
4761 # The shortest name can change if new aliases are added. So using
4762 # this should be deferred until after all these are added. The code
4763 # that does that should clear this one's cache.
4764 # Any name with alphabetics is preferred over an all numeric one, even
4765 # if longer.
4766
4767 my $self = shift;
4768 my $nominal_length_ptr = shift;
4769 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4770
ffe43484 4771 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4772
4773 # For efficiency, don't recalculate, but this means that adding new
4774 # aliases could change what the shortest is, so the code that does
4775 # that needs to undef this.
4776 if (defined $short_name{$addr}) {
4777 if ($nominal_length_ptr) {
4778 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4779 }
4780 return $short_name{$addr};
4781 }
4782
4783 # Look at each alias
4784 foreach my $alias ($self->aliases()) {
4785
4786 # Don't use an alias that isn't ok to use for an external name.
4787 next if ! $alias->externally_ok;
4788
4789 my $name = main::Standardize($alias->name);
4790 trace $self, $name if main::DEBUG && $to_trace;
4791
4792 # Take the first one, or a shorter one that isn't numeric. This
4793 # relies on numeric aliases always being last in the array
4794 # returned by aliases(). Any alpha one will have precedence.
4795 if (! defined $short_name{$addr}
4796 || ($name =~ /\D/
4797 && length($name) < length($short_name{$addr})))
4798 {
4799 # Remove interior underscores.
4800 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4801
4802 $nominal_short_name_length{$addr} = length $name;
4803 }
4804 }
4805
4806 # If no suitable external name return undef
4807 if (! defined $short_name{$addr}) {
4808 $$nominal_length_ptr = undef if $nominal_length_ptr;
4809 return;
4810 }
4811
c12f2655 4812 # Don't allow a null short name.
99870f4d
KW
4813 if ($short_name{$addr} eq "") {
4814 $short_name{$addr} = '_';
4815 $nominal_short_name_length{$addr} = 1;
4816 }
4817
4818 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4819
4820 if ($nominal_length_ptr) {
4821 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4822 }
4823 return $short_name{$addr};
4824 }
4825
4826 sub external_name {
4827 # Returns the external name that this table should be known by. This
c12f2655
KW
4828 # is usually the short_name, but not if the short_name is undefined,
4829 # in which case the external_name is arbitrarily set to the
4830 # underscore.
99870f4d
KW
4831
4832 my $self = shift;
4833 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4834
4835 my $short = $self->short_name;
4836 return $short if defined $short;
4837
4838 return '_';
4839 }
4840
4841 sub add_description { # Adds the parameter as a short description.
4842
4843 my $self = shift;
4844 my $description = shift;
4845 chomp $description;
4846 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4847
f998e60c 4848 no overloading;
051df77b 4849 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
4850
4851 return;
4852 }
4853
4854 sub add_note { # Adds the parameter as a short note.
4855
4856 my $self = shift;
4857 my $note = shift;
4858 chomp $note;
4859 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4860
f998e60c 4861 no overloading;
051df77b 4862 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
4863
4864 return;
4865 }
4866
4867 sub add_comment { # Adds the parameter as a comment.
4868
bd9ebcfd
KW
4869 return unless $debugging_build;
4870
99870f4d
KW
4871 my $self = shift;
4872 my $comment = shift;
4873 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4874
4875 chomp $comment;
f998e60c
KW
4876
4877 no overloading;
051df77b 4878 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
4879
4880 return;
4881 }
4882
4883 sub comment {
4884 # Return the current comment for this table. If called in list
4885 # context, returns the array of comments. In scalar, returns a string
4886 # of each element joined together with a period ending each.
4887
4888 my $self = shift;
4889 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4890
ffe43484 4891 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 4892 my @list = @{$comment{$addr}};
99870f4d
KW
4893 return @list if wantarray;
4894 my $return = "";
4895 foreach my $sentence (@list) {
4896 $return .= '. ' if $return;
4897 $return .= $sentence;
4898 $return =~ s/\.$//;
4899 }
4900 $return .= '.' if $return;
4901 return $return;
4902 }
4903
4904 sub initialize {
4905 # Initialize the table with the argument which is any valid
4906 # initialization for range lists.
4907
4908 my $self = shift;
ffe43484 4909 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4910 my $initialization = shift;
4911 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4912
4913 # Replace the current range list with a new one of the same exact
4914 # type.
f998e60c
KW
4915 my $class = ref $range_list{$addr};
4916 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
4917 Initialize => $initialization);
4918 return;
4919
4920 }
4921
4922 sub header {
4923 # The header that is output for the table in the file it is written
4924 # in.
4925
4926 my $self = shift;
4927 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4928
4929 my $return = "";
4930 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4931 $return .= $HEADER;
f998e60c 4932 no overloading;
051df77b 4933 $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
99870f4d
KW
4934 return $return;
4935 }
4936
4937 sub write {
668b3bfc
KW
4938 # Write a representation of the table to its file. It calls several
4939 # functions furnished by sub-classes of this abstract base class to
4940 # handle non-normal ranges, to add stuff before the table, and at its
4941 # end.
99870f4d
KW
4942
4943 my $self = shift;
4944 my $tab_stops = shift; # The number of tab stops over to put any
4945 # comment.
4946 my $suppress_value = shift; # Optional, if the value associated with
4947 # a range equals this one, don't write
4948 # the range
4949 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4950
ffe43484 4951 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4952
4953 # Start with the header
668b3bfc 4954 my @HEADER = $self->header;
99870f4d
KW
4955
4956 # Then the comments
668b3bfc 4957 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
99870f4d
KW
4958 if $comment{$addr};
4959
668b3bfc
KW
4960 # Things discovered processing the main body of the document may
4961 # affect what gets output before it, therefore pre_body() isn't called
4962 # until after all other processing of the table is done.
99870f4d 4963
c4019d52
KW
4964 # The main body looks like a 'here' document. If annotating, get rid
4965 # of the comments before passing to the caller, as some callers, such
4966 # as charnames.pm, can't cope with them. (Outputting range counts
4967 # also introduces comments, but these don't show up in the tables that
4968 # can't cope with comments, and there aren't that many of them that
4969 # it's worth the extra real time to get rid of them).
668b3bfc 4970 my @OUT;
558712cf 4971 if ($annotate) {
c4019d52
KW
4972 # Use the line below in Perls that don't have /r
4973 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
4974 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
4975 } else {
4976 push @OUT, "return <<'END';\n";
4977 }
99870f4d
KW
4978
4979 if ($range_list{$addr}->is_empty) {
4980
4981 # This is a kludge for empty tables to silence a warning in
4982 # utf8.c, which can't really deal with empty tables, but it can
4983 # deal with a table that matches nothing, as the inverse of 'Any'
4984 # does.
67a53d68 4985 push @OUT, "!utf8::Any\n";
99870f4d 4986 }
c69a9c68
KW
4987 elsif ($self->name eq 'N'
4988
4989 # To save disk space and table cache space, avoid putting out
4990 # binary N tables, but instead create a file which just inverts
4991 # the Y table. Since the file will still exist and occupy a
4992 # certain number of blocks, might as well output the whole
4993 # thing if it all will fit in one block. The number of
4994 # ranges below is an approximate number for that.
4995 && $self->property->type == $BINARY
4996 # && $self->property->tables == 2 Can't do this because the
4997 # non-binary properties, like NFDQC aren't specifiable
4998 # by the notation
4999 && $range_list{$addr}->ranges > 15
5000 && ! $annotate) # Under --annotate, want to see everything
5001 {
5002 push @OUT, "!utf8::" . $self->property->name . "\n";
5003 }
99870f4d
KW
5004 else {
5005 my $range_size_1 = $range_size_1{$addr};
558712cf
KW
5006 my $format; # Used only in $annotate option
5007 my $include_name; # Used only in $annotate option
c4019d52 5008
558712cf 5009 if ($annotate) {
c4019d52
KW
5010
5011 # if annotating each code point, must print 1 per line.
5012 # The variable could point to a subroutine, and we don't want
5013 # to lose that fact, so only set if not set already
5014 $range_size_1 = 1 if ! $range_size_1;
5015
5016 $format = $self->format;
5017
5018 # The name of the character is output only for tables that
5019 # don't already include the name in the output.
5020 my $property = $self->property;
5021 $include_name =
5022 ! ($property == $perl_charname
5023 || $property == main::property_ref('Unicode_1_Name')
5024 || $property == main::property_ref('Name')
5025 || $property == main::property_ref('Name_Alias')
5026 );
5027 }
99870f4d
KW
5028
5029 # Output each range as part of the here document.
5a2b5ddb 5030 RANGE:
99870f4d 5031 for my $set ($range_list{$addr}->ranges) {
5a2b5ddb
KW
5032 if ($set->type != 0) {
5033 $self->handle_special_range($set);
5034 next RANGE;
5035 }
99870f4d
KW
5036 my $start = $set->start;
5037 my $end = $set->end;
5038 my $value = $set->value;
5039
5040 # Don't output ranges whose value is the one to suppress
c4019d52
KW
5041 next RANGE if defined $suppress_value
5042 && $value eq $suppress_value;
99870f4d 5043
c4019d52
KW
5044 # If there is a range and doesn't need a single point range
5045 # output
5046 if ($start != $end && ! $range_size_1) {
bd9ebcfd
KW
5047 push @OUT, sprintf "%04X\t%04X", $start, $end;
5048 $OUT[-1] .= "\t$value" if $value ne "";
99870f4d
KW
5049
5050 # Add a comment with the size of the range, if requested.
5051 # Expand Tabs to make sure they all start in the same
5052 # column, and then unexpand to use mostly tabs.
0c07e538 5053 if (! $output_range_counts{$addr}) {
99870f4d
KW
5054 $OUT[-1] .= "\n";
5055 }
5056 else {
5057 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5058 my $count = main::clarify_number($end - $start + 1);
5059 use integer;
5060
5061 my $width = $tab_stops * 8 - 1;
5062 $OUT[-1] = sprintf("%-*s # [%s]\n",
5063 $width,
5064 $OUT[-1],
5065 $count);
5066 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5067 }
c4019d52
KW
5068 next RANGE;
5069 }
5070
5071 # Here to output a single code point per line
5072
5073 # If not to annotate, use the simple formats
558712cf 5074 if (! $annotate) {
c4019d52
KW
5075
5076 # Use any passed in subroutine to output.
5077 if (ref $range_size_1 eq 'CODE') {
5078 for my $i ($start .. $end) {
5079 push @OUT, &{$range_size_1}($i, $value);
5080 }
5081 }
5082 else {
5083
5084 # Here, caller is ok with default output.
5085 for (my $i = $start; $i <= $end; $i++) {
5086 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5087 }
5088 }
5089 next RANGE;
5090 }
5091
5092 # Here, wants annotation.
5093 for (my $i = $start; $i <= $end; $i++) {
5094
5095 # Get character information if don't have it already
5096 main::populate_char_info($i)
5097 if ! defined $viacode[$i];
5098 my $type = $annotate_char_type[$i];
5099
5100 # Figure out if should output the next code points as part
5101 # of a range or not. If this is not in an annotation
5102 # range, then won't output as a range, so returns $i.
5103 # Otherwise use the end of the annotation range, but no
5104 # further than the maximum possible end point of the loop.
5105 my $range_end = main::min($annotate_ranges->value_of($i)
5106 || $i,
5107 $end);
5108
5109 # Use a range if it is a range, and either is one of the
5110 # special annotation ranges, or the range is at most 3
5111 # long. This last case causes the algorithmically named
5112 # code points to be output individually in spans of at
5113 # most 3, as they are the ones whose $type is > 0.
5114 if ($range_end != $i
5115 && ( $type < 0 || $range_end - $i > 2))
5116 {
5117 # Here is to output a range. We don't allow a
5118 # caller-specified output format--just use the
5119 # standard one.
5120 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5121 $range_end,
5122 $value;
5123 my $range_name = $viacode[$i];
5124
5125 # For the code points which end in their hex value, we
5126 # eliminate that from the output annotation, and
5127 # capitalize only the first letter of each word.
5128 if ($type == $CP_IN_NAME) {
5129 my $hex = sprintf "%04X", $i;
5130 $range_name =~ s/-$hex$//;
5131 my @words = split " ", $range_name;
5132 for my $word (@words) {
5133 $word = ucfirst(lc($word)) if $word ne 'CJK';
5134 }
5135 $range_name = join " ", @words;
5136 }
5137 elsif ($type == $HANGUL_SYLLABLE) {
5138 $range_name = "Hangul Syllable";
5139 }
5140
5141 $OUT[-1] .= " $range_name" if $range_name;
5142
5143 # Include the number of code points in the range
5144 my $count = main::clarify_number($range_end - $i + 1);
5145 $OUT[-1] .= " [$count]\n";
5146
5147 # Skip to the end of the range
5148 $i = $range_end;
5149 }
5150 else { # Not in a range.
5151 my $comment = "";
5152
5153 # When outputting the names of each character, use
5154 # the character itself if printable
5155 $comment .= "'" . chr($i) . "' " if $printable[$i];
5156
5157 # To make it more readable, use a minimum indentation
5158 my $comment_indent;
5159
5160 # Determine the annotation
5161 if ($format eq $DECOMP_STRING_FORMAT) {
5162
5163 # This is very specialized, with the type of
5164 # decomposition beginning the line enclosed in
5165 # <...>, and the code points that the code point
5166 # decomposes to separated by blanks. Create two
5167 # strings, one of the printable characters, and
5168 # one of their official names.
5169 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5170 my $tostr = "";
5171 my $to_name = "";
5172 my $to_chr = "";
5173 foreach my $to (split " ", $map) {
5174 $to = CORE::hex $to;
5175 $to_name .= " + " if $to_name;
5176 $to_chr .= chr($to);
5177 main::populate_char_info($to)
5178 if ! defined $viacode[$to];
5179 $to_name .= $viacode[$to];
5180 }
5181
5182 $comment .=
5183 "=> '$to_chr'; $viacode[$i] => $to_name";
5184 $comment_indent = 25; # Determined by experiment
5185 }
5186 else {
5187
5188 # Assume that any table that has hex format is a
5189 # mapping of one code point to another.
5190 if ($format eq $HEX_FORMAT) {
5191 my $decimal_value = CORE::hex $value;
5192 main::populate_char_info($decimal_value)
5193 if ! defined $viacode[$decimal_value];
5194 $comment .= "=> '"
5195 . chr($decimal_value)
5196 . "'; " if $printable[$decimal_value];
5197 }
5198 $comment .= $viacode[$i] if $include_name
5199 && $viacode[$i];
5200 if ($format eq $HEX_FORMAT) {
5201 my $decimal_value = CORE::hex $value;
5202 $comment .= " => $viacode[$decimal_value]"
5203 if $viacode[$decimal_value];
5204 }
5205
5206 # If including the name, no need to indent, as the
5207 # name will already be way across the line.
5208 $comment_indent = ($include_name) ? 0 : 60;
5209 }
5210
5211 # Use any passed in routine to output the base part of
5212 # the line.
5213 if (ref $range_size_1 eq 'CODE') {
5214 my $base_part = &{$range_size_1}($i, $value);
5215 chomp $base_part;
5216 push @OUT, $base_part;
5217 }
5218 else {
5219 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5220 }
5221
5222 # And add the annotation.
5223 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5224 $OUT[-1],
5225 $comment if $comment;
5226 $OUT[-1] .= "\n";
5227 }
99870f4d
KW
5228 }
5229 } # End of loop through all the table's ranges
5230 }
5231
5232 # Add anything that goes after the main body, but within the here
5233 # document,
5234 my $append_to_body = $self->append_to_body;
5235 push @OUT, $append_to_body if $append_to_body;
5236
5237 # And finish the here document.
5238 push @OUT, "END\n";
5239
668b3bfc
KW
5240 # Done with the main portion of the body. Can now figure out what
5241 # should appear before it in the file.
5242 my $pre_body = $self->pre_body;
5243 push @HEADER, $pre_body, "\n" if $pre_body;
668b3bfc 5244
6b0079b5
KW
5245 # All these files should have a .pl suffix added to them.
5246 my @file_with_pl = @{$file_path{$addr}};
5247 $file_with_pl[-1] .= '.pl';
99870f4d 5248
6b0079b5 5249 main::write(\@file_with_pl,
558712cf 5250 $annotate, # utf8 iff annotating
9218f1cf
KW
5251 \@HEADER,
5252 \@OUT);
99870f4d
KW
5253 return;
5254 }
5255
5256 sub set_status { # Set the table's status
5257 my $self = shift;
5258 my $status = shift; # The status enum value
5259 my $info = shift; # Any message associated with it.
5260 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5261
ffe43484 5262 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5263
5264 $status{$addr} = $status;
5265 $status_info{$addr} = $info;
5266 return;
5267 }
5268
5269 sub lock {
5270 # Don't allow changes to the table from now on. This stores a stack
5271 # trace of where it was called, so that later attempts to modify it
5272 # can immediately show where it got locked.
5273
5274 my $self = shift;
5275 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5276
ffe43484 5277 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5278
5279 $locked{$addr} = "";
5280
5281 my $line = (caller(0))[2];
5282 my $i = 1;
5283
5284 # Accumulate the stack trace
5285 while (1) {
5286 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5287
5288 last unless defined $caller;
5289
5290 $locked{$addr} .= " called from $caller() at line $line\n";
5291 $line = $caller_line;
5292 }
5293 $locked{$addr} .= " called from main at line $line\n";
5294
5295 return;
5296 }
5297
5298 sub carp_if_locked {
5299 # Return whether a table is locked or not, and, by the way, complain
5300 # if is locked
5301
5302 my $self = shift;
5303 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5304
ffe43484 5305 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5306
5307 return 0 if ! $locked{$addr};
5308 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5309 return 1;
5310 }
5311
5312 sub set_file_path { # Set the final directory path for this table
5313 my $self = shift;
5314 # Rest of parameters passed on
5315
f998e60c 5316 no overloading;
051df77b 5317 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
5318 return
5319 }
5320
5321 # Accessors for the range list stored in this table. First for
5322 # unconditional
ea25a9b2 5323 for my $sub (qw(
2f7a8815 5324 containing_range
99870f4d
KW
5325 contains
5326 count
5327 each_range
5328 hash
5329 is_empty
09aba7e4 5330 matches_identically_to
99870f4d
KW
5331 max
5332 min
5333 range_count
5334 reset_each_range
0a9dbafc 5335 type_of
99870f4d 5336 value_of
ea25a9b2 5337 ))
99870f4d
KW
5338 {
5339 no strict "refs";
5340 *$sub = sub {
5341 use strict "refs";
5342 my $self = shift;
f998e60c 5343 no overloading;
051df77b 5344 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
5345 }
5346 }
5347
5348 # Then for ones that should fail if locked
ea25a9b2 5349 for my $sub (qw(
99870f4d 5350 delete_range
ea25a9b2 5351 ))
99870f4d
KW
5352 {
5353 no strict "refs";
5354 *$sub = sub {
5355 use strict "refs";
5356 my $self = shift;
5357
5358 return if $self->carp_if_locked;
f998e60c 5359 no overloading;
051df77b 5360 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
5361 }
5362 }
5363
5364} # End closure
5365
5366package Map_Table;
5367use base '_Base_Table';
5368
5369# A Map Table is a table that contains the mappings from code points to
5370# values. There are two weird cases:
5371# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5372# are written in the table's file at the end of the table nonetheless. It
5373# requires specially constructed code to handle these; utf8.c can not read
5374# these in, so they should not go in $map_directory. As of this writing,
5375# the only case that these happen is for named sequences used in
5376# charnames.pm. But this code doesn't enforce any syntax on these, so
5377# something else could come along that uses it.
5378# 2) Specials are anything that doesn't fit syntactically into the body of the
5379# table. The ranges for these have a map type of non-zero. The code below
5380# knows about and handles each possible type. In most cases, these are
5381# written as part of the header.
5382#
5383# A map table deliberately can't be manipulated at will unlike match tables.
5384# This is because of the ambiguities having to do with what to do with
5385# overlapping code points. And there just isn't a need for those things;
5386# what one wants to do is just query, add, replace, or delete mappings, plus
5387# write the final result.
5388# However, there is a method to get the list of possible ranges that aren't in
5389# this table to use for defaulting missing code point mappings. And,
5390# map_add_or_replace_non_nulls() does allow one to add another table to this
5391# one, but it is clearly very specialized, and defined that the other's
5392# non-null values replace this one's if there is any overlap.
5393
5394sub trace { return main::trace(@_); }
5395
5396{ # Closure
5397
5398 main::setup_package();
5399
5400 my %default_map;
5401 # Many input files omit some entries; this gives what the mapping for the
5402 # missing entries should be
5403 main::set_access('default_map', \%default_map, 'r');
5404
5405 my %anomalous_entries;
5406 # Things that go in the body of the table which don't fit the normal
5407 # scheme of things, like having a range. Not much can be done with these
5408 # once there except to output them. This was created to handle named
5409 # sequences.
5410 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5411 main::set_access('anomalous_entries', # Append singular, read plural
5412 \%anomalous_entries,
5413 'readable_array');
5414
99870f4d
KW
5415 my %core_access;
5416 # This is a string, solely for documentation, indicating how one can get
5417 # access to this property via the Perl core.
5418 main::set_access('core_access', \%core_access, 'r', 's');
5419
99870f4d 5420 my %to_output_map;
8572ace0 5421 # Enum as to whether or not to write out this map table:
c12f2655 5422 # 0 don't output
8572ace0
KW
5423 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5424 # it should not be removed nor its format changed. This
5425 # is done for those files that have traditionally been
5426 # output.
5427 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5428 # with this file
99870f4d
KW
5429 main::set_access('to_output_map', \%to_output_map, 's');
5430
5431
5432 sub new {
5433 my $class = shift;
5434 my $name = shift;
5435
5436 my %args = @_;
5437
5438 # Optional initialization data for the table.
5439 my $initialize = delete $args{'Initialize'};
5440
5441 my $core_access = delete $args{'Core_Access'};
5442 my $default_map = delete $args{'Default_Map'};
99870f4d 5443 my $property = delete $args{'_Property'};
23e33b60 5444 my $full_name = delete $args{'Full_Name'};
20863809 5445
99870f4d
KW
5446 # Rest of parameters passed on
5447
5448 my $range_list = Range_Map->new(Owner => $property);
5449
5450 my $self = $class->SUPER::new(
5451 Name => $name,
23e33b60
KW
5452 Complete_Name => $full_name,
5453 Full_Name => $full_name,
99870f4d
KW
5454 _Property => $property,
5455 _Range_List => $range_list,
5456 %args);
5457
ffe43484 5458 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5459
5460 $anomalous_entries{$addr} = [];
5461 $core_access{$addr} = $core_access;
5462 $default_map{$addr} = $default_map;
99870f4d
KW
5463
5464 $self->initialize($initialize) if defined $initialize;
5465
5466 return $self;
5467 }
5468
5469 use overload
5470 fallback => 0,
5471 qw("") => "_operator_stringify",
5472 ;
5473
5474 sub _operator_stringify {
5475 my $self = shift;
5476
5477 my $name = $self->property->full_name;
5478 $name = '""' if $name eq "";
5479 return "Map table for Property '$name'";
5480 }
5481
99870f4d
KW
5482 sub add_alias {
5483 # Add a synonym for this table (which means the property itself)
5484 my $self = shift;
5485 my $name = shift;
5486 # Rest of parameters passed on.
5487
5488 $self->SUPER::add_alias($name, $self->property, @_);
5489 return;
5490 }
5491
5492 sub add_map {
5493 # Add a range of code points to the list of specially-handled code
5494 # points. $MULTI_CP is assumed if the type of special is not passed
5495 # in.
5496
5497 my $self = shift;
5498 my $lower = shift;
5499 my $upper = shift;
5500 my $string = shift;
5501 my %args = @_;
5502
5503 my $type = delete $args{'Type'} || 0;
5504 # Rest of parameters passed on
5505
5506 # Can't change the table if locked.
5507 return if $self->carp_if_locked;
5508
ffe43484 5509 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 5510
99870f4d
KW
5511 $self->_range_list->add_map($lower, $upper,
5512 $string,
5513 @_,
5514 Type => $type);
5515 return;
5516 }
5517
5518 sub append_to_body {
5519 # Adds to the written HERE document of the table's body any anomalous
5520 # entries in the table..
5521
5522 my $self = shift;
5523 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5524
ffe43484 5525 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5526
5527 return "" unless @{$anomalous_entries{$addr}};
5528 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5529 }
5530
5531 sub map_add_or_replace_non_nulls {
5532 # This adds the mappings in the table $other to $self. Non-null
5533 # mappings from $other override those in $self. It essentially merges
5534 # the two tables, with the second having priority except for null
5535 # mappings.
5536
5537 my $self = shift;
5538 my $other = shift;
5539 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5540
5541 return if $self->carp_if_locked;
5542
5543 if (! $other->isa(__PACKAGE__)) {
5544 Carp::my_carp_bug("$other should be a "
5545 . __PACKAGE__
5546 . ". Not a '"
5547 . ref($other)
5548 . "'. Not added;");
5549 return;
5550 }
5551
ffe43484
NC
5552 my $addr = do { no overloading; pack 'J', $self; };
5553 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5554
5555 local $to_trace = 0 if main::DEBUG;
5556
5557 my $self_range_list = $self->_range_list;
5558 my $other_range_list = $other->_range_list;
5559 foreach my $range ($other_range_list->ranges) {
5560 my $value = $range->value;
5561 next if $value eq "";
5562 $self_range_list->_add_delete('+',
5563 $range->start,
5564 $range->end,
5565 $value,
5566 Type => $range->type,
5567 Replace => $UNCONDITIONALLY);
5568 }
5569
99870f4d
KW
5570 return;
5571 }
5572
5573 sub set_default_map {
5574 # Define what code points that are missing from the input files should
5575 # map to
5576
5577 my $self = shift;
5578 my $map = shift;
5579 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5580
ffe43484 5581 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5582
5583 # Convert the input to the standard equivalent, if any (won't have any
5584 # for $STRING properties)
5585 my $standard = $self->_find_table_from_alias->{$map};
5586 $map = $standard->name if defined $standard;
5587
5588 # Warn if there already is a non-equivalent default map for this
5589 # property. Note that a default map can be a ref, which means that
5590 # what it actually means is delayed until later in the program, and it
5591 # IS permissible to override it here without a message.
5592 my $default_map = $default_map{$addr};
5593 if (defined $default_map
5594 && ! ref($default_map)
5595 && $default_map ne $map
5596 && main::Standardize($map) ne $default_map)
5597 {
5598 my $property = $self->property;
5599 my $map_table = $property->table($map);
5600 my $default_table = $property->table($default_map);
5601 if (defined $map_table
5602 && defined $default_table
5603 && $map_table != $default_table)
5604 {
5605 Carp::my_carp("Changing the default mapping for "
5606 . $property
5607 . " from $default_map to $map'");
5608 }
5609 }
5610
5611 $default_map{$addr} = $map;
5612
5613 # Don't also create any missing table for this map at this point,
5614 # because if we did, it could get done before the main table add is
5615 # done for PropValueAliases.txt; instead the caller will have to make
5616 # sure it exists, if desired.
5617 return;
5618 }
5619
5620 sub to_output_map {
5621 # Returns boolean: should we write this map table?
5622
5623 my $self = shift;
5624 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5625
ffe43484 5626 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5627
5628 # If overridden, use that
5629 return $to_output_map{$addr} if defined $to_output_map{$addr};
5630
5631 my $full_name = $self->full_name;
fcf1973c
KW
5632 return $global_to_output_map{$full_name}
5633 if defined $global_to_output_map{$full_name};
99870f4d 5634
20863809 5635 # If table says to output, do so; if says to suppress it, do so.
8572ace0 5636 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
99870f4d
KW
5637 return 0 if $self->status eq $SUPPRESSED;
5638
5639 my $type = $self->property->type;
5640
5641 # Don't want to output binary map tables even for debugging.
5642 return 0 if $type == $BINARY;
5643
5644 # But do want to output string ones.
8572ace0 5645 return $EXTERNAL_MAP if $type == $STRING;
99870f4d 5646
8572ace0
KW
5647 # Otherwise is an $ENUM, do output it, for Perl's purposes
5648 return $INTERNAL_MAP;
99870f4d
KW
5649 }
5650
5651 sub inverse_list {
5652 # Returns a Range_List that is gaps of the current table. That is,
5653 # the inversion
5654
5655 my $self = shift;
5656 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5657
5658 my $current = Range_List->new(Initialize => $self->_range_list,
5659 Owner => $self->property);
5660 return ~ $current;
5661 }
5662
8572ace0
KW
5663 sub header {
5664 my $self = shift;
5665 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5666
5667 my $return = $self->SUPER::header();
5668
5669 $return .= $INTERNAL_ONLY if $self->to_output_map == $INTERNAL_MAP;
5670 return $return;
5671 }
5672
99870f4d
KW
5673 sub set_final_comment {
5674 # Just before output, create the comment that heads the file
5675 # containing this table.
5676
bd9ebcfd
KW
5677 return unless $debugging_build;
5678
99870f4d
KW
5679 my $self = shift;
5680 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5681
5682 # No sense generating a comment if aren't going to write it out.
5683 return if ! $self->to_output_map;
5684
ffe43484 5685 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5686
5687 my $property = $self->property;
5688
5689 # Get all the possible names for this property. Don't use any that
5690 # aren't ok for use in a file name, etc. This is perhaps causing that
5691 # flag to do double duty, and may have to be changed in the future to
5692 # have our own flag for just this purpose; but it works now to exclude
5693 # Perl generated synonyms from the lists for properties, where the
5694 # name is always the proper Unicode one.
5695 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5696
5697 my $count = $self->count;
5698 my $default_map = $default_map{$addr};
5699
5700 # The ranges that map to the default aren't output, so subtract that
5701 # to get those actually output. A property with matching tables
5702 # already has the information calculated.
5703 if ($property->type != $STRING) {
5704 $count -= $property->table($default_map)->count;
5705 }
5706 elsif (defined $default_map) {
5707
5708 # But for $STRING properties, must calculate now. Subtract the
5709 # count from each range that maps to the default.
5710 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5711 if ($range->value eq $default_map) {
5712 $count -= $range->end +1 - $range->start;
5713 }
5714 }
5715
5716 }
5717
5718 # Get a string version of $count with underscores in large numbers,
5719 # for clarity.
5720 my $string_count = main::clarify_number($count);
5721
5722 my $code_points = ($count == 1)
5723 ? 'single code point'
5724 : "$string_count code points";
5725
5726 my $mapping;
5727 my $these_mappings;
5728 my $are;
5729 if (@property_aliases <= 1) {
5730 $mapping = 'mapping';
5731 $these_mappings = 'this mapping';
5732 $are = 'is'
5733 }
5734 else {
5735 $mapping = 'synonymous mappings';
5736 $these_mappings = 'these mappings';
5737 $are = 'are'
5738 }
5739 my $cp;
5740 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5741 $cp = "any code point in Unicode Version $string_version";
5742 }
5743 else {
5744 my $map_to;
5745 if ($default_map eq "") {
5746 $map_to = 'the null string';
5747 }
5748 elsif ($default_map eq $CODE_POINT) {
5749 $map_to = "itself";
5750 }
5751 else {
5752 $map_to = "'$default_map'";
5753 }
5754 if ($count == 1) {
5755 $cp = "the single code point";
5756 }
5757 else {
5758 $cp = "one of the $code_points";
5759 }
5760 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5761 }
5762
5763 my $comment = "";
5764
5765 my $status = $self->status;
5766 if ($status) {
5767 my $warn = uc $status_past_participles{$status};
5768 $comment .= <<END;
5769
5770!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5771 All property or property=value combinations contained in this file are $warn.
5772 See $unicode_reference_url for what this means.
5773
5774END
5775 }
5776 $comment .= "This file returns the $mapping:\n";
5777
5778 for my $i (0 .. @property_aliases - 1) {
5779 $comment .= sprintf("%-8s%s\n",
5780 " ",
5781 $property_aliases[$i]->name . '(cp)'
5782 );
5783 }
5784 $comment .=
5785 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5786
5787 my $access = $core_access{$addr};
5788 if ($access) {
5789 $comment .= "accessible through the Perl core via $access.";
5790 }
5791 else {
5792 $comment .= "not accessible through the Perl core directly.";
5793 }
5794
5795 # And append any commentary already set from the actual property.
5796 $comment .= "\n\n" . $self->comment if $self->comment;
5797 if ($self->description) {
5798 $comment .= "\n\n" . join " ", $self->description;
5799 }
5800 if ($self->note) {
5801 $comment .= "\n\n" . join " ", $self->note;
5802 }
5803 $comment .= "\n";
5804
5805 if (! $self->perl_extension) {
5806 $comment .= <<END;
5807
5808For information about what this property really means, see:
5809$unicode_reference_url
5810END
5811 }
5812
5813 if ($count) { # Format differs for empty table
5814 $comment.= "\nThe format of the ";
5815 if ($self->range_size_1) {
5816 $comment.= <<END;
5817main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5818is in hex; MAPPING is what CODE_POINT maps to.
5819END
5820 }
5821 else {
5822
5823 # There are tables which end up only having one element per
5824 # range, but it is not worth keeping track of for making just
5825 # this comment a little better.
5826 $comment.= <<END;
5827non-comment portions of the main body of lines of this file is:
5828START\\tSTOP\\tMAPPING where START is the starting code point of the
5829range, in hex; STOP is the ending point, or if omitted, the range has just one
5830code point; MAPPING is what each code point between START and STOP maps to.
5831END
0c07e538 5832 if ($self->output_range_counts) {
99870f4d
KW
5833 $comment .= <<END;
5834Numbers in comments in [brackets] indicate how many code points are in the
5835range (omitted when the range is a single code point or if the mapping is to
5836the null string).
5837END
5838 }
5839 }
5840 }
5841 $self->set_comment(main::join_lines($comment));
5842 return;
5843 }
5844
5845 my %swash_keys; # Makes sure don't duplicate swash names.
5846
668b3bfc
KW
5847 # The remaining variables are temporaries used while writing each table,
5848 # to output special ranges.
5849 my $has_hangul_syllables;
5850 my @multi_code_point_maps; # Map is to more than one code point.
5851
5852 # The key is the base name of the code point, and the value is an
5853 # array giving all the ranges that use this base name. Each range
5854 # is actually a hash giving the 'low' and 'high' values of it.
5855 my %names_ending_in_code_point;
8c32d378 5856 my %loose_names_ending_in_code_point;
668b3bfc
KW
5857
5858 # Inverse mapping. The list of ranges that have these kinds of
c12f2655
KW
5859 # names. Each element contains the low, high, and base names in an
5860 # anonymous hash.
668b3bfc
KW
5861 my @code_points_ending_in_code_point;
5862
5863 sub handle_special_range {
5864 # Called in the middle of write when it finds a range it doesn't know
5865 # how to handle.
5866
5867 my $self = shift;
5868 my $range = shift;
5869 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5870
5871 my $addr = do { no overloading; pack 'J', $self; };
5872
5873 my $type = $range->type;
5874
5875 my $low = $range->start;
5876 my $high = $range->end;
5877 my $map = $range->value;
5878
5879 # No need to output the range if it maps to the default.
5880 return if $map eq $default_map{$addr};
5881
5882 # Switch based on the map type...
5883 if ($type == $HANGUL_SYLLABLE) {
5884
5885 # These are entirely algorithmically determinable based on
5886 # some constants furnished by Unicode; for now, just set a
5887 # flag to indicate that have them. After everything is figured
5888 # out, we will output the code that does the algorithm.
5889 $has_hangul_syllables = 1;
5890 }
5891 elsif ($type == $CP_IN_NAME) {
5892
5893 # Code points whose the name ends in their code point are also
5894 # algorithmically determinable, but need information about the map
5895 # to do so. Both the map and its inverse are stored in data
5896 # structures output in the file.
5897 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5898 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5899
8c32d378
KW
5900 my $squeezed = $map =~ s/[-\s]+//gr;
5901 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, $low;
5902 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, $high;
5903
668b3bfc 5904 push @code_points_ending_in_code_point, { low => $low,
c12f2655
KW
5905 high => $high,
5906 name => $map
668b3bfc
KW
5907 };
5908 }
5909 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5910
5911 # Multi-code point maps and null string maps have an entry
5912 # for each code point in the range. They use the same
5913 # output format.
5914 for my $code_point ($low .. $high) {
5915
c12f2655
KW
5916 # The pack() below can't cope with surrogates. XXX This may
5917 # no longer be true
668b3bfc 5918 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
98dc9551 5919 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
668b3bfc
KW
5920 next;
5921 }
5922
5923 # Generate the hash entries for these in the form that
5924 # utf8.c understands.
5925 my $tostr = "";
5926 my $to_name = "";
5927 my $to_chr = "";
5928 foreach my $to (split " ", $map) {
5929 if ($to !~ /^$code_point_re$/) {
5930 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5931 next;
5932 }
5933 $tostr .= sprintf "\\x{%s}", $to;
5934 $to = CORE::hex $to;
558712cf 5935 if ($annotate) {
c4019d52
KW
5936 $to_name .= " + " if $to_name;
5937 $to_chr .= chr($to);
5938 main::populate_char_info($to)
5939 if ! defined $viacode[$to];
5940 $to_name .= $viacode[$to];
5941 }
668b3bfc
KW
5942 }
5943
5944 # I (khw) have never waded through this line to
5945 # understand it well enough to comment it.
5946 my $utf8 = sprintf(qq["%s" => "$tostr",],
5947 join("", map { sprintf "\\x%02X", $_ }
5948 unpack("U0C*", pack("U", $code_point))));
5949
5950 # Add a comment so that a human reader can more easily
5951 # see what's going on.
5952 push @multi_code_point_maps,
5953 sprintf("%-45s # U+%04X", $utf8, $code_point);
558712cf 5954 if (! $annotate) {
c4019d52
KW
5955 $multi_code_point_maps[-1] .= " => $map";
5956 }
5957 else {
5958 main::populate_char_info($code_point)
5959 if ! defined $viacode[$code_point];
5960 $multi_code_point_maps[-1] .= " '"
5961 . chr($code_point)
5962 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
5963 }
668b3bfc
KW
5964 }
5965 }
5966 else {
5967 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
5968 }
5969
5970 return;
5971 }
5972
99870f4d
KW
5973 sub pre_body {
5974 # Returns the string that should be output in the file before the main
668b3bfc
KW
5975 # body of this table. It isn't called until the main body is
5976 # calculated, saving a pass. The string includes some hash entries
5977 # identifying the format of the body, and what the single value should
5978 # be for all ranges missing from it. It also includes any code points
5979 # which have map_types that don't go in the main table.
99870f4d
KW
5980
5981 my $self = shift;
5982 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5983
ffe43484 5984 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5985
5986 my $name = $self->property->swash_name;
5987
5988 if (defined $swash_keys{$name}) {
5989 Carp::my_carp(join_lines(<<END
5990Already created a swash name '$name' for $swash_keys{$name}. This means that
5991the same name desired for $self shouldn't be used. Bad News. This must be
5992fixed before production use, but proceeding anyway
5993END
5994 ));
5995 }
5996 $swash_keys{$name} = "$self";
5997
99870f4d 5998 my $pre_body = "";
99870f4d 5999
668b3bfc
KW
6000 # Here we assume we were called after have gone through the whole
6001 # file. If we actually generated anything for each map type, add its
6002 # respective header and trailer
ec2f0128 6003 my $specials_name = "";
668b3bfc 6004 if (@multi_code_point_maps) {
ec2f0128 6005 $specials_name = "utf8::ToSpec$name";
668b3bfc 6006 $pre_body .= <<END;
99870f4d
KW
6007
6008# Some code points require special handling because their mappings are each to
6009# multiple code points. These do not appear in the main body, but are defined
6010# in the hash below.
6011
76591e2b
KW
6012# Each key is the string of N bytes that together make up the UTF-8 encoding
6013# for the code point. (i.e. the same as looking at the code point's UTF-8
6014# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
ec2f0128 6015\%$specials_name = (
99870f4d 6016END
668b3bfc
KW
6017 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6018 }
99870f4d 6019
668b3bfc
KW
6020 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
6021
6022 # Convert these structures to output format.
6023 my $code_points_ending_in_code_point =
6024 main::simple_dumper(\@code_points_ending_in_code_point,
6025 ' ' x 8);
6026 my $names = main::simple_dumper(\%names_ending_in_code_point,
6027 ' ' x 8);
8c32d378
KW
6028 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
6029 ' ' x 8);
668b3bfc
KW
6030
6031 # Do the same with the Hangul names,
6032 my $jamo;
6033 my $jamo_l;
6034 my $jamo_v;
6035 my $jamo_t;
6036 my $jamo_re;
6037 if ($has_hangul_syllables) {
6038
6039 # Construct a regular expression of all the possible
6040 # combinations of the Hangul syllables.
6041 my @L_re; # Leading consonants
6042 for my $i ($LBase .. $LBase + $LCount - 1) {
6043 push @L_re, $Jamo{$i}
6044 }
6045 my @V_re; # Middle vowels
6046 for my $i ($VBase .. $VBase + $VCount - 1) {
6047 push @V_re, $Jamo{$i}
6048 }
6049 my @T_re; # Trailing consonants
6050 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
6051 push @T_re, $Jamo{$i}
99870f4d
KW
6052 }
6053
668b3bfc
KW
6054 # The whole re is made up of the L V T combination.
6055 $jamo_re = '('
6056 . join ('|', sort @L_re)
6057 . ')('
6058 . join ('|', sort @V_re)
6059 . ')('
6060 . join ('|', sort @T_re)
6061 . ')?';
6062
6063 # These hashes needed by the algorithm were generated
6064 # during reading of the Jamo.txt file
6065 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
6066 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
6067 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
6068 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
6069 }
6070
6071 $pre_body .= <<END;
99870f4d
KW
6072
6073# To achieve significant memory savings when this file is read in,
6074# algorithmically derivable code points are omitted from the main body below.
6075# Instead, the following routines can be used to translate between name and
6076# code point and vice versa
6077
6078{ # Closure
6079
6080 # Matches legal code point. 4-6 hex numbers, If there are 6, the
6081 # first two must be '10'; if there are 5, the first must not be a '0'.
8c32d378
KW
6082 # First can match at the end of a word provided that the end of the
6083 # word doesn't look like a hex number.
6084 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
99870f4d
KW
6085 my \$code_point_re = qr/$code_point_re/;
6086
6087 # In the following hash, the keys are the bases of names which includes
6088 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
6089 # of each key is another hash which is used to get the low and high ends
8c32d378 6090 # for each range of code points that apply to the name.
99870f4d
KW
6091 my %names_ending_in_code_point = (
6092$names
6093 );
6094
8c32d378
KW
6095 # The following hash is a copy of the previous one, except is for loose
6096 # matching, so each name has blanks and dashes squeezed out
6097 my %loose_names_ending_in_code_point = (
6098$loose_names
6099 );
6100
99870f4d
KW
6101 # And the following array gives the inverse mapping from code points to
6102 # names. Lowest code points are first
6103 my \@code_points_ending_in_code_point = (
6104$code_points_ending_in_code_point
6105 );
6106END
668b3bfc
KW
6107 # Earlier releases didn't have Jamos. No sense outputting
6108 # them unless will be used.
6109 if ($has_hangul_syllables) {
6110 $pre_body .= <<END;
99870f4d
KW
6111
6112 # Convert from code point to Jamo short name for use in composing Hangul
6113 # syllable names
6114 my %Jamo = (
6115$jamo
6116 );
6117
6118 # Leading consonant (can be null)
6119 my %Jamo_L = (
6120$jamo_l
6121 );
6122
6123 # Vowel
6124 my %Jamo_V = (
6125$jamo_v
6126 );
6127
6128 # Optional trailing consonant
6129 my %Jamo_T = (
6130$jamo_t
6131 );
6132
6133 # Computed re that splits up a Hangul name into LVT or LV syllables
6134 my \$syllable_re = qr/$jamo_re/;
6135
6136 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
8c32d378 6137 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
99870f4d
KW
6138
6139 # These constants names and values were taken from the Unicode standard,
6140 # version 5.1, section 3.12. They are used in conjunction with Hangul
6141 # syllables
6e5a209b
KW
6142 my \$SBase = $SBase_string;
6143 my \$LBase = $LBase_string;
6144 my \$VBase = $VBase_string;
6145 my \$TBase = $TBase_string;
6146 my \$SCount = $SCount;
6147 my \$LCount = $LCount;
6148 my \$VCount = $VCount;
6149 my \$TCount = $TCount;
99870f4d
KW
6150 my \$NCount = \$VCount * \$TCount;
6151END
668b3bfc 6152 } # End of has Jamos
99870f4d 6153
668b3bfc 6154 $pre_body .= << 'END';
99870f4d
KW
6155
6156 sub name_to_code_point_special {
8c32d378 6157 my ($name, $loose) = @_;
99870f4d
KW
6158
6159 # Returns undef if not one of the specially handled names; otherwise
6160 # returns the code point equivalent to the input name
8c32d378
KW
6161 # $loose is non-zero if to use loose matching, 'name' in that case
6162 # must be input as upper case with all blanks and dashes squeezed out.
99870f4d 6163END
668b3bfc
KW
6164 if ($has_hangul_syllables) {
6165 $pre_body .= << 'END';
99870f4d 6166
8c32d378
KW
6167 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
6168 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
6169 {
99870f4d
KW
6170 return if $name !~ qr/^$syllable_re$/;
6171 my $L = $Jamo_L{$1};
6172 my $V = $Jamo_V{$2};
6173 my $T = (defined $3) ? $Jamo_T{$3} : 0;
6174 return ($L * $VCount + $V) * $TCount + $T + $SBase;
6175 }
6176END
668b3bfc
KW
6177 }
6178 $pre_body .= << 'END';
99870f4d 6179
8c32d378
KW
6180 # Name must end in 'code_point' for this to handle.
6181 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
6182 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
99870f4d
KW
6183
6184 my $base = $1;
6185 my $code_point = CORE::hex $2;
8c32d378
KW
6186 my $names_ref;
6187
6188 if ($loose) {
6189 $names_ref = \%loose_names_ending_in_code_point;
6190 }
6191 else {
6192 return if $base !~ s/-$//;
6193 $names_ref = \%names_ending_in_code_point;
6194 }
99870f4d
KW
6195
6196 # Name must be one of the ones which has the code point in it.
8c32d378 6197 return if ! $names_ref->{$base};
99870f4d
KW
6198
6199 # Look through the list of ranges that apply to this name to see if
6200 # the code point is in one of them.
8c32d378
KW
6201 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
6202 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
6203 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
99870f4d
KW
6204
6205 # Here, the code point is in the range.
6206 return $code_point;
6207 }
6208
6209 # Here, looked like the name had a code point number in it, but
6210 # did not match one of the valid ones.
6211 return;
6212 }
6213
6214 sub code_point_to_name_special {
6215 my $code_point = shift;
6216
6217 # Returns the name of a code point if algorithmically determinable;
6218 # undef if not
6219END
668b3bfc
KW
6220 if ($has_hangul_syllables) {
6221 $pre_body .= << 'END';
99870f4d
KW
6222
6223 # If in the Hangul range, calculate the name based on Unicode's
6224 # algorithm
6225 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
6226 use integer;
6227 my $SIndex = $code_point - $SBase;
6228 my $L = $LBase + $SIndex / $NCount;
6229 my $V = $VBase + ($SIndex % $NCount) / $TCount;
6230 my $T = $TBase + $SIndex % $TCount;
03e1aa51 6231 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
99870f4d
KW
6232 $name .= $Jamo{$T} if $T != $TBase;
6233 return $name;
6234 }
6235END
668b3bfc
KW
6236 }
6237 $pre_body .= << 'END';
99870f4d
KW
6238
6239 # Look through list of these code points for one in range.
6240 foreach my $hash (@code_points_ending_in_code_point) {
6241 return if $code_point < $hash->{'low'};
6242 if ($code_point <= $hash->{'high'}) {
6243 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
6244 }
6245 }
6246 return; # None found
6247 }
6248} # End closure
6249
6250END
668b3bfc
KW
6251 } # End of has hangul or code point in name maps.
6252
6253 my $format = $self->format;
6254
6255 my $return = <<END;
6256# The name this swash is to be known by, with the format of the mappings in
6257# the main body of the table, and what all code points missing from this file
6258# map to.
6259\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6260END
ec2f0128
KW
6261 if ($specials_name) {
6262 $return .= <<END;
6263\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6264END
6265 }
668b3bfc
KW
6266 my $default_map = $default_map{$addr};
6267 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6268
6269 if ($default_map eq $CODE_POINT) {
6270 $return .= ' # code point maps to itself';
6271 }
6272 elsif ($default_map eq "") {
6273 $return .= ' # code point maps to the null string';
6274 }
6275 $return .= "\n";
6276
6277 $return .= $pre_body;
6278
6279 return $return;
6280 }
6281
6282 sub write {
6283 # Write the table to the file.
6284
6285 my $self = shift;
6286 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6287
6288 my $addr = do { no overloading; pack 'J', $self; };
6289
6290 # Clear the temporaries
6291 $has_hangul_syllables = 0;
6292 undef @multi_code_point_maps;
6293 undef %names_ending_in_code_point;
8c32d378 6294 undef %loose_names_ending_in_code_point;
668b3bfc 6295 undef @code_points_ending_in_code_point;
99870f4d
KW
6296
6297 # Calculate the format of the table if not already done.
f5817e0a 6298 my $format = $self->format;
668b3bfc
KW
6299 my $type = $self->property->type;
6300 my $default_map = $self->default_map;
99870f4d
KW
6301 if (! defined $format) {
6302 if ($type == $BINARY) {
6303
6304 # Don't bother checking the values, because we elsewhere
6305 # verify that a binary table has only 2 values.
6306 $format = $BINARY_FORMAT;
6307 }
6308 else {
6309 my @ranges = $self->_range_list->ranges;
6310
6311 # default an empty table based on its type and default map
6312 if (! @ranges) {
6313
6314 # But it turns out that the only one we can say is a
6315 # non-string (besides binary, handled above) is when the
6316 # table is a string and the default map is to a code point
6317 if ($type == $STRING && $default_map eq $CODE_POINT) {
6318 $format = $HEX_FORMAT;
6319 }
6320 else {
6321 $format = $STRING_FORMAT;
6322 }
6323 }
6324 else {
6325
6326 # Start with the most restrictive format, and as we find
6327 # something that doesn't fit with that, change to the next
6328 # most restrictive, and so on.
6329 $format = $DECIMAL_FORMAT;
6330 foreach my $range (@ranges) {
668b3bfc
KW
6331 next if $range->type != 0; # Non-normal ranges don't
6332 # affect the main body
99870f4d
KW
6333 my $map = $range->value;
6334 if ($map ne $default_map) {
6335 last if $format eq $STRING_FORMAT; # already at
6336 # least
6337 # restrictive
6338 $format = $INTEGER_FORMAT
6339 if $format eq $DECIMAL_FORMAT
6340 && $map !~ / ^ [0-9] $ /x;
6341 $format = $FLOAT_FORMAT
6342 if $format eq $INTEGER_FORMAT
6343 && $map !~ / ^ -? [0-9]+ $ /x;
6344 $format = $RATIONAL_FORMAT
6345 if $format eq $FLOAT_FORMAT
6346 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6347 $format = $HEX_FORMAT
6348 if $format eq $RATIONAL_FORMAT
6349 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6350 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6351 && $map =~ /[^0-9A-F]/;
6352 }
6353 }
6354 }
6355 }
6356 } # end of calculating format
6357
668b3bfc 6358 if ($default_map eq $CODE_POINT
99870f4d 6359 && $format ne $HEX_FORMAT
668b3bfc
KW
6360 && ! defined $self->format) # manual settings are always
6361 # considered ok
99870f4d
KW
6362 {
6363 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6364 }
99870f4d 6365
668b3bfc 6366 $self->_set_format($format);
99870f4d 6367
0911a63d
KW
6368 # Core Perl has a different definition of mapping ranges than we do,
6369 # that is applicable mainly to mapping code points, so for tables
6370 # where it is possible that core Perl could be used to read it,
6371 # make it range size 1 to prevent possible confusion
6372 $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6373
99870f4d
KW
6374 return $self->SUPER::write(
6375 ($self->property == $block)
6376 ? 7 # block file needs more tab stops
6377 : 3,
668b3bfc 6378 $default_map); # don't write defaulteds
99870f4d
KW
6379 }
6380
6381 # Accessors for the underlying list that should fail if locked.
ea25a9b2 6382 for my $sub (qw(
99870f4d 6383 add_duplicate
ea25a9b2 6384 ))
99870f4d
KW
6385 {
6386 no strict "refs";
6387 *$sub = sub {
6388 use strict "refs";
6389 my $self = shift;
6390
6391 return if $self->carp_if_locked;
6392 return $self->_range_list->$sub(@_);
6393 }
6394 }
6395} # End closure for Map_Table
6396
6397package Match_Table;
6398use base '_Base_Table';
6399
6400# A Match table is one which is a list of all the code points that have
6401# the same property and property value, for use in \p{property=value}
6402# constructs in regular expressions. It adds very little data to the base
6403# structure, but many methods, as these lists can be combined in many ways to
6404# form new ones.
6405# There are only a few concepts added:
6406# 1) Equivalents and Relatedness.
6407# Two tables can match the identical code points, but have different names.
6408# This always happens when there is a perl single form extension
6409# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6410# tables are set to be related, with the Perl extension being a child, and
6411# the Unicode property being the parent.
6412#
6413# It may be that two tables match the identical code points and we don't
6414# know if they are related or not. This happens most frequently when the
6415# Block and Script properties have the exact range. But note that a
6416# revision to Unicode could add new code points to the script, which would
6417# now have to be in a different block (as the block was filled, or there
6418# would have been 'Unknown' script code points in it and they wouldn't have
6419# been identical). So we can't rely on any two properties from Unicode
6420# always matching the same code points from release to release, and thus
6421# these tables are considered coincidentally equivalent--not related. When
6422# two tables are unrelated but equivalent, one is arbitrarily chosen as the
6423# 'leader', and the others are 'equivalents'. This concept is useful
6424# to minimize the number of tables written out. Only one file is used for
6425# any identical set of code points, with entries in Heavy.pl mapping all
6426# the involved tables to it.
6427#
6428# Related tables will always be identical; we set them up to be so. Thus
6429# if the Unicode one is deprecated, the Perl one will be too. Not so for
6430# unrelated tables. Relatedness makes generating the documentation easier.
6431#
c12f2655
KW
6432# 2) Complement.
6433# Like equivalents, two tables may be the inverses of each other, the
6434# intersection between them is null, and the union is every Unicode code
6435# point. The two tables that occupy a binary property are necessarily like
6436# this. By specifying one table as the complement of another, we can avoid
6437# storing it on disk (using the other table and performing a fast
6438# transform), and some memory and calculations.
6439#
6440# 3) Conflicting. It may be that there will eventually be name clashes, with
99870f4d
KW
6441# the same name meaning different things. For a while, there actually were
6442# conflicts, but they have so far been resolved by changing Perl's or
6443# Unicode's definitions to match the other, but when this code was written,
6444# it wasn't clear that that was what was going to happen. (Unicode changed
6445# because of protests during their beta period.) Name clashes are warned
6446# about during compilation, and the documentation. The generated tables
6447# are sane, free of name clashes, because the code suppresses the Perl
6448# version. But manual intervention to decide what the actual behavior
6449# should be may be required should this happen. The introductory comments
6450# have more to say about this.
6451
6452sub standardize { return main::standardize($_[0]); }
6453sub trace { return main::trace(@_); }
6454
6455
6456{ # Closure
6457
6458 main::setup_package();
6459
6460 my %leader;
6461 # The leader table of this one; initially $self.
6462 main::set_access('leader', \%leader, 'r');
6463
6464 my %equivalents;
6465 # An array of any tables that have this one as their leader
6466 main::set_access('equivalents', \%equivalents, 'readable_array');
6467
6468 my %parent;
6469 # The parent table to this one, initially $self. This allows us to
c12f2655
KW
6470 # distinguish between equivalent tables that are related (for which this
6471 # is set to), and those which may not be, but share the same output file
6472 # because they match the exact same set of code points in the current
6473 # Unicode release.
99870f4d
KW
6474 main::set_access('parent', \%parent, 'r');
6475
6476 my %children;
6477 # An array of any tables that have this one as their parent
6478 main::set_access('children', \%children, 'readable_array');
6479
6480 my %conflicting;
6481 # Array of any tables that would have the same name as this one with
6482 # a different meaning. This is used for the generated documentation.
6483 main::set_access('conflicting', \%conflicting, 'readable_array');
6484
6485 my %matches_all;
6486 # Set in the constructor for tables that are expected to match all code
6487 # points.
6488 main::set_access('matches_all', \%matches_all, 'r');
6489
a92d5c2e
KW
6490 my %complement;
6491 # Points to the complement that this table is expressed in terms of; 0 if
6492 # none.
6493 main::set_access('complement', \%complement, 'r', 's' );
6494
99870f4d
KW
6495 sub new {
6496 my $class = shift;
6497
6498 my %args = @_;
6499
6500 # The property for which this table is a listing of property values.
6501 my $property = delete $args{'_Property'};
6502
23e33b60
KW
6503 my $name = delete $args{'Name'};
6504 my $full_name = delete $args{'Full_Name'};
6505 $full_name = $name if ! defined $full_name;
6506
99870f4d
KW
6507 # Optional
6508 my $initialize = delete $args{'Initialize'};
6509 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 6510 my $format = delete $args{'Format'};
99870f4d
KW
6511 # Rest of parameters passed on.
6512
6513 my $range_list = Range_List->new(Initialize => $initialize,
6514 Owner => $property);
6515
23e33b60
KW
6516 my $complete = $full_name;
6517 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6518 # but this helps debug if it
6519 # does
6520 # The complete name for a match table includes it's property in a
6521 # compound form 'property=table', except if the property is the
6522 # pseudo-property, perl, in which case it is just the single form,
6523 # 'table' (If you change the '=' must also change the ':' in lots of
6524 # places in this program that assume an equal sign)
6525 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 6526
99870f4d 6527 my $self = $class->SUPER::new(%args,
23e33b60
KW
6528 Name => $name,
6529 Complete_Name => $complete,
6530 Full_Name => $full_name,
99870f4d
KW
6531 _Property => $property,
6532 _Range_List => $range_list,
f5817e0a 6533 Format => $EMPTY_FORMAT,
99870f4d 6534 );
ffe43484 6535 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6536
6537 $conflicting{$addr} = [ ];
6538 $equivalents{$addr} = [ ];
6539 $children{$addr} = [ ];
6540 $matches_all{$addr} = $matches_all;
6541 $leader{$addr} = $self;
6542 $parent{$addr} = $self;
a92d5c2e 6543 $complement{$addr} = 0;
99870f4d 6544
f5817e0a
KW
6545 if (defined $format && $format ne $EMPTY_FORMAT) {
6546 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6547 }
6548
99870f4d
KW
6549 return $self;
6550 }
6551
6552 # See this program's beginning comment block about overloading these.
6553 use overload
6554 fallback => 0,
6555 qw("") => "_operator_stringify",
6556 '=' => sub {
6557 my $self = shift;
6558
6559 return if $self->carp_if_locked;
6560 return $self;
6561 },
6562
6563 '+' => sub {
6564 my $self = shift;
6565 my $other = shift;
6566
6567 return $self->_range_list + $other;
6568 },
6569 '&' => sub {
6570 my $self = shift;
6571 my $other = shift;
6572
6573 return $self->_range_list & $other;
6574 },
6575 '+=' => sub {
6576 my $self = shift;
6577 my $other = shift;
6578
6579 return if $self->carp_if_locked;
6580
ffe43484 6581 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6582
6583 if (ref $other) {
6584
6585 # Change the range list of this table to be the
6586 # union of the two.
6587 $self->_set_range_list($self->_range_list
6588 + $other);
6589 }
6590 else { # $other is just a simple value
6591 $self->add_range($other, $other);
6592 }
6593 return $self;
6594 },
6595 '-' => sub { my $self = shift;
6596 my $other = shift;
6597 my $reversed = shift;
6598
6599 if ($reversed) {
6600 Carp::my_carp_bug("Can't cope with a "
6601 . __PACKAGE__
6602 . " being the first parameter in a '-'. Subtraction ignored.");
6603 return;
6604 }
6605
6606 return $self->_range_list - $other;
6607 },
6608 '~' => sub { my $self = shift;
6609 return ~ $self->_range_list;
6610 },
6611 ;
6612
6613 sub _operator_stringify {
6614 my $self = shift;
6615
23e33b60 6616 my $name = $self->complete_name;
99870f4d
KW
6617 return "Table '$name'";
6618 }
6619
6620 sub add_alias {
6621 # Add a synonym for this table. See the comments in the base class
6622
6623 my $self = shift;
6624 my $name = shift;
6625 # Rest of parameters passed on.
6626
6627 $self->SUPER::add_alias($name, $self, @_);
6628 return;
6629 }
6630
6631 sub add_conflicting {
6632 # Add the name of some other object to the list of ones that name
6633 # clash with this match table.
6634
6635 my $self = shift;
6636 my $conflicting_name = shift; # The name of the conflicting object
6637 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6638 my $conflicting_object = shift; # Optional, the conflicting object
6639 # itself. This is used to
6640 # disambiguate the text if the input
6641 # name is identical to any of the
6642 # aliases $self is known by.
6643 # Sometimes the conflicting object is
6644 # merely hypothetical, so this has to
6645 # be an optional parameter.
6646 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6647
ffe43484 6648 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6649
6650 # Check if the conflicting name is exactly the same as any existing
6651 # alias in this table (as long as there is a real object there to
6652 # disambiguate with).
6653 if (defined $conflicting_object) {
6654 foreach my $alias ($self->aliases) {
6655 if ($alias->name eq $conflicting_name) {
6656
6657 # Here, there is an exact match. This results in
6658 # ambiguous comments, so disambiguate by changing the
6659 # conflicting name to its object's complete equivalent.
6660 $conflicting_name = $conflicting_object->complete_name;
6661 last;
6662 }
6663 }
6664 }
6665
6666 # Convert to the \p{...} final name
6667 $conflicting_name = "\\$p" . "{$conflicting_name}";
6668
6669 # Only add once
6670 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6671
6672 push @{$conflicting{$addr}}, $conflicting_name;
6673
6674 return;
6675 }
6676
6505c6e2 6677 sub is_set_equivalent_to {
99870f4d
KW
6678 # Return boolean of whether or not the other object is a table of this
6679 # type and has been marked equivalent to this one.
6680
6681 my $self = shift;
6682 my $other = shift;
6683 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6684
6685 return 0 if ! defined $other; # Can happen for incomplete early
6686 # releases
6687 unless ($other->isa(__PACKAGE__)) {
6688 my $ref_other = ref $other;
6689 my $ref_self = ref $self;
6505c6e2 6690 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
99870f4d
KW
6691 return 0;
6692 }
6693
6694 # Two tables are equivalent if they have the same leader.
f998e60c 6695 no overloading;
051df77b 6696 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6697 return;
6698 }
6699
99870f4d
KW
6700 sub set_equivalent_to {
6701 # Set $self equivalent to the parameter table.
6702 # The required Related => 'x' parameter is a boolean indicating
6703 # whether these tables are related or not. If related, $other becomes
6704 # the 'parent' of $self; if unrelated it becomes the 'leader'
6705 #
6706 # Related tables share all characteristics except names; equivalents
6707 # not quite so many.
6708 # If they are related, one must be a perl extension. This is because
6709 # we can't guarantee that Unicode won't change one or the other in a
98dc9551 6710 # later release even if they are identical now.
99870f4d
KW
6711
6712 my $self = shift;
6713 my $other = shift;
6714
6715 my %args = @_;
6716 my $related = delete $args{'Related'};
6717
6718 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6719
6720 return if ! defined $other; # Keep on going; happens in some early
6721 # Unicode releases.
6722
6723 if (! defined $related) {
6724 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6725 $related = 0;
6726 }
6727
6728 # If already are equivalent, no need to re-do it; if subroutine
6729 # returns null, it found an error, also do nothing
6505c6e2 6730 my $are_equivalent = $self->is_set_equivalent_to($other);
99870f4d
KW
6731 return if ! defined $are_equivalent || $are_equivalent;
6732
ffe43484 6733 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6734 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d 6735
45e32b91
KW
6736 if ($related) {
6737 if ($current_leader->perl_extension) {
6738 if ($other->perl_extension) {
6739 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6740 return;
6741 }
6742 } elsif (! $other->perl_extension) {
6743 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6744 $related = 0;
6745 }
6746 }
6747
6748 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6749 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
6750 return;
99870f4d
KW
6751 }
6752
ffe43484
NC
6753 my $leader = do { no overloading; pack 'J', $current_leader; };
6754 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6755
6756 # Any tables that are equivalent to or children of this table must now
6757 # instead be equivalent to or (children) to the new leader (parent),
6758 # still equivalent. The equivalency includes their matches_all info,
6759 # and for related tables, their status
6760 # All related tables are of necessity equivalent, but the converse
6761 # isn't necessarily true
6762 my $status = $other->status;
6763 my $status_info = $other->status_info;
6764 my $matches_all = $matches_all{other_addr};
d867ccfb 6765 my $caseless_equivalent = $other->caseless_equivalent;
99870f4d
KW
6766 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6767 next if $table == $other;
6768 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6769
ffe43484 6770 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6771 $leader{$table_addr} = $other;
6772 $matches_all{$table_addr} = $matches_all;
6773 $self->_set_range_list($other->_range_list);
6774 push @{$equivalents{$other_addr}}, $table;
6775 if ($related) {
6776 $parent{$table_addr} = $other;
6777 push @{$children{$other_addr}}, $table;
6778 $table->set_status($status, $status_info);
d867ccfb 6779 $self->set_caseless_equivalent($caseless_equivalent);
99870f4d
KW
6780 }
6781 }
6782
6783 # Now that we've declared these to be equivalent, any changes to one
6784 # of the tables would invalidate that equivalency.
6785 $self->lock;
6786 $other->lock;
6787 return;
6788 }
6789
6790 sub add_range { # Add a range to the list for this table.
6791 my $self = shift;
6792 # Rest of parameters passed on
6793
6794 return if $self->carp_if_locked;
6795 return $self->_range_list->add_range(@_);
6796 }
6797
99870f4d
KW
6798 sub pre_body { # Does nothing for match tables.
6799 return
6800 }
6801
6802 sub append_to_body { # Does nothing for match tables.
6803 return
6804 }
6805
6806 sub write {
6807 my $self = shift;
6808 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6809
6810 return $self->SUPER::write(2); # 2 tab stops
6811 }
6812
6813 sub set_final_comment {
6814 # This creates a comment for the file that is to hold the match table
6815 # $self. It is somewhat convoluted to make the English read nicely,
6816 # but, heh, it's just a comment.
6817 # This should be called only with the leader match table of all the
6818 # ones that share the same file. It lists all such tables, ordered so
6819 # that related ones are together.
6820
bd9ebcfd
KW
6821 return unless $debugging_build;
6822
99870f4d
KW
6823 my $leader = shift; # Should only be called on the leader table of
6824 # an equivalent group
6825 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6826
ffe43484 6827 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6828
6829 if ($leader{$addr} != $leader) {
6830 Carp::my_carp_bug(<<END
6831set_final_comment() must be called on a leader table, which $leader is not.
6832It is equivalent to $leader{$addr}. No comment created
6833END
6834 );
6835 return;
6836 }
6837
6838 # Get the number of code points matched by each of the tables in this
6839 # file, and add underscores for clarity.
6840 my $count = $leader->count;
6841 my $string_count = main::clarify_number($count);
6842
6843 my $loose_count = 0; # how many aliases loosely matched
6844 my $compound_name = ""; # ? Are any names compound?, and if so, an
6845 # example
6846 my $properties_with_compound_names = 0; # count of these
6847
6848
6849 my %flags; # The status flags used in the file
6850 my $total_entries = 0; # number of entries written in the comment
6851 my $matches_comment = ""; # The portion of the comment about the
6852 # \p{}'s
6853 my @global_comments; # List of all the tables' comments that are
6854 # there before this routine was called.
6855
6856 # Get list of all the parent tables that are equivalent to this one
6857 # (including itself).
6858 my @parents = grep { $parent{main::objaddr $_} == $_ }
6859 main::uniques($leader, @{$equivalents{$addr}});
6860 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6861 # tables
6862
6863 for my $parent (@parents) {
6864
6865 my $property = $parent->property;
6866
6867 # Special case 'N' tables in properties with two match tables when
6868 # the other is a 'Y' one. These are likely to be binary tables,
6869 # but not necessarily. In either case, \P{} will match the
6870 # complement of \p{}, and so if something is a synonym of \p, the
6871 # complement of that something will be the synonym of \P. This
6872 # would be true of any property with just two match tables, not
6873 # just those whose values are Y and N; but that would require a
6874 # little extra work, and there are none such so far in Unicode.
6875 my $perl_p = 'p'; # which is it? \p{} or \P{}
6876 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6877
6878 if (scalar $property->tables == 2
6879 && $parent == $property->table('N')
6880 && defined (my $yes = $property->table('Y')))
6881 {
ffe43484 6882 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6883 @yes_perl_synonyms
6884 = grep { $_->property == $perl }
6885 main::uniques($yes,
6886 $parent{$yes_addr},
6887 $parent{$yes_addr}->children);
6888
6889 # But these synonyms are \P{} ,not \p{}
6890 $perl_p = 'P';
6891 }
6892
6893 my @description; # Will hold the table description
6894 my @note; # Will hold the table notes.
6895 my @conflicting; # Will hold the table conflicts.
6896
6897 # Look at the parent, any yes synonyms, and all the children
ffe43484 6898 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6899 for my $table ($parent,
6900 @yes_perl_synonyms,
f998e60c 6901 @{$children{$parent_addr}})
99870f4d 6902 {
ffe43484 6903 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6904 my $table_property = $table->property;
6905
6906 # Tables are separated by a blank line to create a grouping.
6907 $matches_comment .= "\n" if $matches_comment;
6908
6909 # The table is named based on the property and value
6910 # combination it is for, like script=greek. But there may be
6911 # a number of synonyms for each side, like 'sc' for 'script',
6912 # and 'grek' for 'greek'. Any combination of these is a valid
6913 # name for this table. In this case, there are three more,
6914 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6915 # listing all possible combinations in the comment, we make
6916 # sure that each synonym occurs at least once, and add
6917 # commentary that the other combinations are possible.
6918 my @property_aliases = $table_property->aliases;
6919 my @table_aliases = $table->aliases;
6920
6921 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6922
6923 # The alias lists above are already ordered in the order we
6924 # want to output them. To ensure that each synonym is listed,
6925 # we must use the max of the two numbers.
6926 my $listed_combos = main::max(scalar @table_aliases,
6927 scalar @property_aliases);
6928 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6929
6930 my $property_had_compound_name = 0;
6931
6932 for my $i (0 .. $listed_combos - 1) {
6933 $total_entries++;
6934
6935 # The current alias for the property is the next one on
6936 # the list, or if beyond the end, start over. Similarly
6937 # for the table (\p{prop=table})
6938 my $property_alias = $property_aliases
6939 [$i % @property_aliases]->name;
6940 my $table_alias_object = $table_aliases
6941 [$i % @table_aliases];
6942 my $table_alias = $table_alias_object->name;
6943 my $loose_match = $table_alias_object->loose_match;
6944
6945 if ($table_alias !~ /\D/) { # Clarify large numbers.
6946 $table_alias = main::clarify_number($table_alias)
6947 }
6948
6949 # Add a comment for this alias combination
6950 my $current_match_comment;
6951 if ($table_property == $perl) {
6952 $current_match_comment = "\\$perl_p"
6953 . "{$table_alias}";
6954 }
6955 else {
6956 $current_match_comment
6957 = "\\p{$property_alias=$table_alias}";
6958 $property_had_compound_name = 1;
6959 }
6960
6961 # Flag any abnormal status for this table.
6962 my $flag = $property->status
6963 || $table->status
6964 || $table_alias_object->status;
37e2e78e
KW
6965 if ($flag) {
6966 if ($flag ne $PLACEHOLDER) {
6967 $flags{$flag} = $status_past_participles{$flag};
6968 } else {
6969 $flags{$flag} = <<END;
6970a placeholder because it is not in Version $string_version of Unicode, but is
6971needed by the Perl core to work gracefully. Because it is not in this version
6972of Unicode, it will not be listed in $pod_file.pod
6973END
6974 }
6975 }
99870f4d
KW
6976
6977 $loose_count++;
6978
6979 # Pretty up the comment. Note the \b; it says don't make
6980 # this line a continuation.
6981 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6982 $flag,
6983 " " x 7,
6984 $current_match_comment);
6985 } # End of generating the entries for this table.
6986
6987 # Save these for output after this group of related tables.
6988 push @description, $table->description;
6989 push @note, $table->note;
6990 push @conflicting, $table->conflicting;
6991
37e2e78e
KW
6992 # And this for output after all the tables.
6993 push @global_comments, $table->comment;
6994
99870f4d
KW
6995 # Compute an alternate compound name using the final property
6996 # synonym and the first table synonym with a colon instead of
6997 # the equal sign used elsewhere.
6998 if ($property_had_compound_name) {
6999 $properties_with_compound_names ++;
7000 if (! $compound_name || @property_aliases > 1) {
7001 $compound_name = $property_aliases[-1]->name
7002 . ': '
7003 . $table_aliases[0]->name;
7004 }
7005 }
7006 } # End of looping through all children of this table
7007
7008 # Here have assembled in $matches_comment all the related tables
7009 # to the current parent (preceded by the same info for all the
7010 # previous parents). Put out information that applies to all of
7011 # the current family.
7012 if (@conflicting) {
7013
7014 # But output the conflicting information now, as it applies to
7015 # just this table.
7016 my $conflicting = join ", ", @conflicting;
7017 if ($conflicting) {
7018 $matches_comment .= <<END;
7019
7020 Note that contrary to what you might expect, the above is NOT the same as
7021END
7022 $matches_comment .= "any of: " if @conflicting > 1;
7023 $matches_comment .= "$conflicting\n";
7024 }
7025 }
7026 if (@description) {
7027 $matches_comment .= "\n Meaning: "
7028 . join('; ', @description)
7029 . "\n";
7030 }
7031 if (@note) {
7032 $matches_comment .= "\n Note: "
7033 . join("\n ", @note)
7034 . "\n";
7035 }
7036 } # End of looping through all tables
7037
7038
7039 my $code_points;
7040 my $match;
7041 my $any_of_these;
7042 if ($count == 1) {
7043 $match = 'matches';
7044 $code_points = 'single code point';
7045 }
7046 else {
7047 $match = 'match';
7048 $code_points = "$string_count code points";
7049 }
7050
7051 my $synonyms;
7052 my $entries;
7053 if ($total_entries <= 1) {
7054 $synonyms = "";
7055 $entries = 'entry';
7056 $any_of_these = 'this'
7057 }
7058 else {
7059 $synonyms = " any of the following regular expression constructs";
7060 $entries = 'entries';
7061 $any_of_these = 'any of these'
7062 }
7063
7064 my $comment = "";
7065 if ($has_unrelated) {
7066 $comment .= <<END;
7067This file is for tables that are not necessarily related: To conserve
7068resources, every table that matches the identical set of code points in this
7069version of Unicode uses this file. Each one is listed in a separate group
7070below. It could be that the tables will match the same set of code points in
7071other Unicode releases, or it could be purely coincidence that they happen to
7072be the same in Unicode $string_version, and hence may not in other versions.
7073
7074END
7075 }
7076
7077 if (%flags) {
7078 foreach my $flag (sort keys %flags) {
7079 $comment .= <<END;
37e2e78e 7080'$flag' below means that this form is $flags{$flag}.
99870f4d 7081END
37e2e78e
KW
7082 next if $flag eq $PLACEHOLDER;
7083 $comment .= "Consult $pod_file.pod\n";
99870f4d
KW
7084 }
7085 $comment .= "\n";
7086 }
7087
7088 $comment .= <<END;
7089This file returns the $code_points in Unicode Version $string_version that
7090$match$synonyms:
7091
7092$matches_comment
37e2e78e 7093$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
7094including if adding or subtracting white space, underscore, and hyphen
7095characters matters or doesn't matter, and other permissible syntactic
7096variants. Upper/lower case distinctions never matter.
7097END
7098
7099 if ($compound_name) {
7100 $comment .= <<END;
7101
7102A colon can be substituted for the equals sign, and
7103END
7104 if ($properties_with_compound_names > 1) {
7105 $comment .= <<END;
7106within each group above,
7107END
7108 }
7109 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7110
7111 # Note the \b below, it says don't make that line a continuation.
7112 $comment .= <<END;
7113anything to the left of the equals (or colon) can be combined with anything to
7114the right. Thus, for example,
7115$compound_name
7116\bis also valid.
7117END
7118 }
7119
7120 # And append any comment(s) from the actual tables. They are all
7121 # gathered here, so may not read all that well.
37e2e78e
KW
7122 if (@global_comments) {
7123 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7124 }
99870f4d
KW
7125
7126 if ($count) { # The format differs if no code points, and needs no
7127 # explanation in that case
7128 $comment.= <<END;
7129
7130The format of the lines of this file is:
7131END
7132 $comment.= <<END;
7133START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7134STOP is the ending point, or if omitted, the range has just one code point.
7135END
0c07e538 7136 if ($leader->output_range_counts) {
99870f4d
KW
7137 $comment .= <<END;
7138Numbers in comments in [brackets] indicate how many code points are in the
7139range.
7140END
7141 }
7142 }
7143
7144 $leader->set_comment(main::join_lines($comment));
7145 return;
7146 }
7147
7148 # Accessors for the underlying list
ea25a9b2 7149 for my $sub (qw(
99870f4d
KW
7150 get_valid_code_point
7151 get_invalid_code_point
ea25a9b2 7152 ))
99870f4d
KW
7153 {
7154 no strict "refs";
7155 *$sub = sub {
7156 use strict "refs";
7157 my $self = shift;
7158
7159 return $self->_range_list->$sub(@_);
7160 }
7161 }
7162} # End closure for Match_Table
7163
7164package Property;
7165
7166# The Property class represents a Unicode property, or the $perl
7167# pseudo-property. It contains a map table initialized empty at construction
7168# time, and for properties accessible through regular expressions, various
7169# match tables, created through the add_match_table() method, and referenced
7170# by the table('NAME') or tables() methods, the latter returning a list of all
7171# of the match tables. Otherwise table operations implicitly are for the map
7172# table.
7173#
7174# Most of the data in the property is actually about its map table, so it
7175# mostly just uses that table's accessors for most methods. The two could
7176# have been combined into one object, but for clarity because of their
7177# differing semantics, they have been kept separate. It could be argued that
7178# the 'file' and 'directory' fields should be kept with the map table.
7179#
7180# Each property has a type. This can be set in the constructor, or in the
7181# set_type accessor, but mostly it is figured out by the data. Every property
7182# starts with unknown type, overridden by a parameter to the constructor, or
7183# as match tables are added, or ranges added to the map table, the data is
7184# inspected, and the type changed. After the table is mostly or entirely
7185# filled, compute_type() should be called to finalize they analysis.
7186#
7187# There are very few operations defined. One can safely remove a range from
7188# the map table, and property_add_or_replace_non_nulls() adds the maps from another
7189# table to this one, replacing any in the intersection of the two.
7190
7191sub standardize { return main::standardize($_[0]); }
7192sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7193
7194{ # Closure
7195
7196 # This hash will contain as keys, all the aliases of all properties, and
7197 # as values, pointers to their respective property objects. This allows
7198 # quick look-up of a property from any of its names.
7199 my %alias_to_property_of;
7200
7201 sub dump_alias_to_property_of {
7202 # For debugging
7203
7204 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7205 return;
7206 }
7207
7208 sub property_ref {
7209 # This is a package subroutine, not called as a method.
7210 # If the single parameter is a literal '*' it returns a list of all
7211 # defined properties.
7212 # Otherwise, the single parameter is a name, and it returns a pointer
7213 # to the corresponding property object, or undef if none.
7214 #
7215 # Properties can have several different names. The 'standard' form of
7216 # each of them is stored in %alias_to_property_of as they are defined.
7217 # But it's possible that this subroutine will be called with some
7218 # variant, so if the initial lookup fails, it is repeated with the
98dc9551 7219 # standardized form of the input name. If found, besides returning the
99870f4d
KW
7220 # result, the input name is added to the list so future calls won't
7221 # have to do the conversion again.
7222
7223 my $name = shift;
7224
7225 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7226
7227 if (! defined $name) {
7228 Carp::my_carp_bug("Undefined input property. No action taken.");
7229 return;
7230 }
7231
7232 return main::uniques(values %alias_to_property_of) if $name eq '*';
7233
7234 # Return cached result if have it.
7235 my $result = $alias_to_property_of{$name};
7236 return $result if defined $result;
7237
7238 # Convert the input to standard form.
7239 my $standard_name = standardize($name);
7240
7241 $result = $alias_to_property_of{$standard_name};
7242 return unless defined $result; # Don't cache undefs
7243
7244 # Cache the result before returning it.
7245 $alias_to_property_of{$name} = $result;
7246 return $result;
7247 }
7248
7249
7250 main::setup_package();
7251
7252 my %map;
7253 # A pointer to the map table object for this property
7254 main::set_access('map', \%map);
7255
7256 my %full_name;
7257 # The property's full name. This is a duplicate of the copy kept in the
7258 # map table, but is needed because stringify needs it during
7259 # construction of the map table, and then would have a chicken before egg
7260 # problem.
7261 main::set_access('full_name', \%full_name, 'r');
7262
7263 my %table_ref;
7264 # This hash will contain as keys, all the aliases of any match tables
7265 # attached to this property, and as values, the pointers to their
7266 # respective tables. This allows quick look-up of a table from any of its
7267 # names.
7268 main::set_access('table_ref', \%table_ref);
7269
7270 my %type;
7271 # The type of the property, $ENUM, $BINARY, etc
7272 main::set_access('type', \%type, 'r');
7273
7274 my %file;
7275 # The filename where the map table will go (if actually written).
7276 # Normally defaulted, but can be overridden.
7277 main::set_access('file', \%file, 'r', 's');
7278
7279 my %directory;
7280 # The directory where the map table will go (if actually written).
7281 # Normally defaulted, but can be overridden.
7282 main::set_access('directory', \%directory, 's');
7283
7284 my %pseudo_map_type;
7285 # This is used to affect the calculation of the map types for all the
7286 # ranges in the table. It should be set to one of the values that signify
7287 # to alter the calculation.
7288 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7289
7290 my %has_only_code_point_maps;
7291 # A boolean used to help in computing the type of data in the map table.
7292 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7293
7294 my %unique_maps;
7295 # A list of the first few distinct mappings this property has. This is
7296 # used to disambiguate between binary and enum property types, so don't
7297 # have to keep more than three.
7298 main::set_access('unique_maps', \%unique_maps);
7299
56557540
KW
7300 my %pre_declared_maps;
7301 # A boolean that gives whether the input data should declare all the
7302 # tables used, or not. If the former, unknown ones raise a warning.
7303 main::set_access('pre_declared_maps',
7304 \%pre_declared_maps, 'r');
7305
99870f4d
KW
7306 sub new {
7307 # The only required parameter is the positionally first, name. All
7308 # other parameters are key => value pairs. See the documentation just
7309 # above for the meanings of the ones not passed directly on to the map
7310 # table constructor.
7311
7312 my $class = shift;
7313 my $name = shift || "";
7314
7315 my $self = property_ref($name);
7316 if (defined $self) {
7317 my $options_string = join ", ", @_;
7318 $options_string = ". Ignoring options $options_string" if $options_string;
7319 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7320 return $self;
7321 }
7322
7323 my %args = @_;
7324
7325 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 7326 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7327
7328 $directory{$addr} = delete $args{'Directory'};
7329 $file{$addr} = delete $args{'File'};
7330 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7331 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7332 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
56557540
KW
7333 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7334 # Starting in this release, property
7335 # values should be defined for all
7336 # properties, except those overriding this
7337 // $v_version ge v5.1.0;
c12f2655 7338
99870f4d
KW
7339 # Rest of parameters passed on.
7340
7341 $has_only_code_point_maps{$addr} = 1;
7342 $table_ref{$addr} = { };
7343 $unique_maps{$addr} = { };
7344
7345 $map{$addr} = Map_Table->new($name,
7346 Full_Name => $full_name{$addr},
7347 _Alias_Hash => \%alias_to_property_of,
7348 _Property => $self,
7349 %args);
7350 return $self;
7351 }
7352
7353 # See this program's beginning comment block about overloading the copy
7354 # constructor. Few operations are defined on properties, but a couple are
7355 # useful. It is safe to take the inverse of a property, and to remove a
7356 # single code point from it.
7357 use overload
7358 fallback => 0,
7359 qw("") => "_operator_stringify",
7360 "." => \&main::_operator_dot,
7361 '==' => \&main::_operator_equal,
7362 '!=' => \&main::_operator_not_equal,
7363 '=' => sub { return shift },
7364 '-=' => "_minus_and_equal",
7365 ;
7366
7367 sub _operator_stringify {
7368 return "Property '" . shift->full_name . "'";
7369 }
7370
7371 sub _minus_and_equal {
7372 # Remove a single code point from the map table of a property.
7373
7374 my $self = shift;
7375 my $other = shift;
7376 my $reversed = shift;
7377 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7378
7379 if (ref $other) {
7380 Carp::my_carp_bug("Can't cope with a "
7381 . ref($other)
7382 . " argument to '-='. Subtraction ignored.");
7383 return $self;
7384 }
98dc9551 7385 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
99870f4d
KW
7386 Carp::my_carp_bug("Can't cope with a "
7387 . __PACKAGE__
7388 . " being the first parameter in a '-='. Subtraction ignored.");
7389 return $self;
7390 }
7391 else {
f998e60c 7392 no overloading;
051df77b 7393 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
7394 }
7395 return $self;
7396 }
7397
7398 sub add_match_table {
7399 # Add a new match table for this property, with name given by the
7400 # parameter. It returns a pointer to the table.
7401
7402 my $self = shift;
7403 my $name = shift;
7404 my %args = @_;
7405
ffe43484 7406 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7407
7408 my $table = $table_ref{$addr}{$name};
7409 my $standard_name = main::standardize($name);
7410 if (defined $table
7411 || (defined ($table = $table_ref{$addr}{$standard_name})))
7412 {
7413 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7414 $table_ref{$addr}{$name} = $table;
7415 return $table;
7416 }
7417 else {
7418
7419 # See if this is a perl extension, if not passed in.
7420 my $perl_extension = delete $args{'Perl_Extension'};
7421 $perl_extension
7422 = $self->perl_extension if ! defined $perl_extension;
7423
7424 $table = Match_Table->new(
7425 Name => $name,
7426 Perl_Extension => $perl_extension,
7427 _Alias_Hash => $table_ref{$addr},
7428 _Property => $self,
7429
7430 # gets property's status by default
7431 Status => $self->status,
7432 _Status_Info => $self->status_info,
7433 %args,
7434 Internal_Only_Warning => 1); # Override any
7435 # input param
7436 return unless defined $table;
7437 }
7438
7439 # Save the names for quick look up
7440 $table_ref{$addr}{$standard_name} = $table;
7441 $table_ref{$addr}{$name} = $table;
7442
7443 # Perhaps we can figure out the type of this property based on the
7444 # fact of adding this match table. First, string properties don't
7445 # have match tables; second, a binary property can't have 3 match
7446 # tables
7447 if ($type{$addr} == $UNKNOWN) {
7448 $type{$addr} = $NON_STRING;
7449 }
7450 elsif ($type{$addr} == $STRING) {
7451 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7452 $type{$addr} = $NON_STRING;
7453 }
7454 elsif ($type{$addr} != $ENUM) {
7455 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7456 && $type{$addr} == $BINARY)
7457 {
7458 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7459 $type{$addr} = $ENUM;
7460 }
7461 }
7462
7463 return $table;
7464 }
7465
4b9b0bc5
KW
7466 sub delete_match_table {
7467 # Delete the table referred to by $2 from the property $1.
7468
7469 my $self = shift;
7470 my $table_to_remove = shift;
7471 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7472
7473 my $addr = do { no overloading; pack 'J', $self; };
7474
7475 # Remove all names that refer to it.
7476 foreach my $key (keys %{$table_ref{$addr}}) {
7477 delete $table_ref{$addr}{$key}
7478 if $table_ref{$addr}{$key} == $table_to_remove;
7479 }
7480
7481 $table_to_remove->DESTROY;
7482 return;
7483 }
7484
99870f4d
KW
7485 sub table {
7486 # Return a pointer to the match table (with name given by the
7487 # parameter) associated with this property; undef if none.
7488
7489 my $self = shift;
7490 my $name = shift;
7491 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7492
ffe43484 7493 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7494
7495 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7496
7497 # If quick look-up failed, try again using the standard form of the
7498 # input name. If that succeeds, cache the result before returning so
7499 # won't have to standardize this input name again.
7500 my $standard_name = main::standardize($name);
7501 return unless defined $table_ref{$addr}{$standard_name};
7502
7503 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7504 return $table_ref{$addr}{$name};
7505 }
7506
7507 sub tables {
7508 # Return a list of pointers to all the match tables attached to this
7509 # property
7510
f998e60c 7511 no overloading;
051df77b 7512 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
7513 }
7514
7515 sub directory {
7516 # Returns the directory the map table for this property should be
7517 # output in. If a specific directory has been specified, that has
7518 # priority; 'undef' is returned if the type isn't defined;
7519 # or $map_directory for everything else.
7520
ffe43484 7521 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
7522
7523 return $directory{$addr} if defined $directory{$addr};
7524 return undef if $type{$addr} == $UNKNOWN;
7525 return $map_directory;
7526 }
7527
7528 sub swash_name {
7529 # Return the name that is used to both:
7530 # 1) Name the file that the map table is written to.
7531 # 2) The name of swash related stuff inside that file.
7532 # The reason for this is that the Perl core historically has used
7533 # certain names that aren't the same as the Unicode property names.
7534 # To continue using these, $file is hard-coded in this file for those,
7535 # but otherwise the standard name is used. This is different from the
7536 # external_name, so that the rest of the files, like in lib can use
7537 # the standard name always, without regard to historical precedent.
7538
7539 my $self = shift;
7540 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7541
ffe43484 7542 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7543
7544 return $file{$addr} if defined $file{$addr};
7545 return $map{$addr}->external_name;
7546 }
7547
7548 sub to_create_match_tables {
7549 # Returns a boolean as to whether or not match tables should be
7550 # created for this property.
7551
7552 my $self = shift;
7553 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7554
7555 # The whole point of this pseudo property is match tables.
7556 return 1 if $self == $perl;
7557
ffe43484 7558 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7559
7560 # Don't generate tables of code points that match the property values
7561 # of a string property. Such a list would most likely have many
7562 # property values, each with just one or very few code points mapping
7563 # to it.
7564 return 0 if $type{$addr} == $STRING;
7565
7566 # Don't generate anything for unimplemented properties.
7567 return 0 if grep { $self->complete_name eq $_ }
7568 @unimplemented_properties;
7569 # Otherwise, do.
7570 return 1;
7571 }
7572
7573 sub property_add_or_replace_non_nulls {
7574 # This adds the mappings in the property $other to $self. Non-null
7575 # mappings from $other override those in $self. It essentially merges
7576 # the two properties, with the second having priority except for null
7577 # mappings.
7578
7579 my $self = shift;
7580 my $other = shift;
7581 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7582
7583 if (! $other->isa(__PACKAGE__)) {
7584 Carp::my_carp_bug("$other should be a "
7585 . __PACKAGE__
7586 . ". Not a '"
7587 . ref($other)
7588 . "'. Not added;");
7589 return;
7590 }
7591
f998e60c 7592 no overloading;
051df77b 7593 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
7594 }
7595
7596 sub set_type {
7597 # Set the type of the property. Mostly this is figured out by the
7598 # data in the table. But this is used to set it explicitly. The
7599 # reason it is not a standard accessor is that when setting a binary
7600 # property, we need to make sure that all the true/false aliases are
7601 # present, as they were omitted in early Unicode releases.
7602
7603 my $self = shift;
7604 my $type = shift;
7605 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7606
7607 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
7608 Carp::my_carp("Unrecognized type '$type'. Type not set");
7609 return;
7610 }
7611
051df77b 7612 { no overloading; $type{pack 'J', $self} = $type; }
99870f4d
KW
7613 return if $type != $BINARY;
7614
7615 my $yes = $self->table('Y');
7616 $yes = $self->table('Yes') if ! defined $yes;
01adf4be
KW
7617 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
7618 if ! defined $yes;
7619
7620 # Add aliases in order wanted, duplicates will be ignored. Note, that
7621 # could run into problems in outputting things in that we don't
7622 # distinguish between the name and full name of these. Hopefully, if
7623 # the table was already created before this code is executed, it was
7624 # done with these set properly.
7625 $yes->add_alias('Y');
99870f4d
KW
7626 $yes->add_alias('Yes');
7627 $yes->add_alias('T');
7628 $yes->add_alias('True');
7629
7630 my $no = $self->table('N');
7631 $no = $self->table('No') if ! defined $no;
01adf4be
KW
7632 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
7633 $no->add_alias('N');
99870f4d
KW
7634 $no->add_alias('No');
7635 $no->add_alias('F');
7636 $no->add_alias('False');
c12f2655 7637
99870f4d
KW
7638 return;
7639 }
7640
7641 sub add_map {
7642 # Add a map to the property's map table. This also keeps
7643 # track of the maps so that the property type can be determined from
7644 # its data.
7645
7646 my $self = shift;
7647 my $start = shift; # First code point in range
7648 my $end = shift; # Final code point in range
7649 my $map = shift; # What the range maps to.
7650 # Rest of parameters passed on.
7651
ffe43484 7652 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7653
7654 # If haven't the type of the property, gather information to figure it
7655 # out.
7656 if ($type{$addr} == $UNKNOWN) {
7657
7658 # If the map contains an interior blank or dash, or most other
7659 # nonword characters, it will be a string property. This
7660 # heuristic may actually miss some string properties. If so, they
7661 # may need to have explicit set_types called for them. This
7662 # happens in the Unihan properties.
7663 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7664 || $map =~ / [^\w.\/\ -] /x)
7665 {
7666 $self->set_type($STRING);
7667
7668 # $unique_maps is used for disambiguating between ENUM and
7669 # BINARY later; since we know the property is not going to be
7670 # one of those, no point in keeping the data around
7671 undef $unique_maps{$addr};
7672 }
7673 else {
7674
7675 # Not necessarily a string. The final decision has to be
7676 # deferred until all the data are in. We keep track of if all
7677 # the values are code points for that eventual decision.
7678 $has_only_code_point_maps{$addr} &=
7679 $map =~ / ^ $code_point_re $/x;
7680
7681 # For the purposes of disambiguating between binary and other
7682 # enumerations at the end, we keep track of the first three
7683 # distinct property values. Once we get to three, we know
7684 # it's not going to be binary, so no need to track more.
7685 if (scalar keys %{$unique_maps{$addr}} < 3) {
7686 $unique_maps{$addr}{main::standardize($map)} = 1;
7687 }
7688 }
7689 }
7690
7691 # Add the mapping by calling our map table's method
7692 return $map{$addr}->add_map($start, $end, $map, @_);
7693 }
7694
7695 sub compute_type {
7696 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7697 # should be called after the property is mostly filled with its maps.
7698 # We have been keeping track of what the property values have been,
7699 # and now have the necessary information to figure out the type.
7700
7701 my $self = shift;
7702 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7703
ffe43484 7704 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7705
7706 my $type = $type{$addr};
7707
7708 # If already have figured these out, no need to do so again, but we do
7709 # a double check on ENUMS to make sure that a string property hasn't
7710 # improperly been classified as an ENUM, so continue on with those.
7711 return if $type == $STRING || $type == $BINARY;
7712
7713 # If every map is to a code point, is a string property.
7714 if ($type == $UNKNOWN
7715 && ($has_only_code_point_maps{$addr}
7716 || (defined $map{$addr}->default_map
7717 && $map{$addr}->default_map eq "")))
7718 {
7719 $self->set_type($STRING);
7720 }
7721 else {
7722
7723 # Otherwise, it is to some sort of enumeration. (The case where
7724 # it is a Unicode miscellaneous property, and treated like a
7725 # string in this program is handled in add_map()). Distinguish
7726 # between binary and some other enumeration type. Of course, if
7727 # there are more than two values, it's not binary. But more
7728 # subtle is the test that the default mapping is defined means it
7729 # isn't binary. This in fact may change in the future if Unicode
7730 # changes the way its data is structured. But so far, no binary
7731 # properties ever have @missing lines for them, so the default map
7732 # isn't defined for them. The few properties that are two-valued
7733 # and aren't considered binary have the default map defined
7734 # starting in Unicode 5.0, when the @missing lines appeared; and
7735 # this program has special code to put in a default map for them
7736 # for earlier than 5.0 releases.
7737 if ($type == $ENUM
7738 || scalar keys %{$unique_maps{$addr}} > 2
7739 || defined $self->default_map)
7740 {
7741 my $tables = $self->tables;
7742 my $count = $self->count;
7743 if ($verbosity && $count > 500 && $tables/$count > .1) {
7744 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7745 }
7746 $self->set_type($ENUM);
7747 }
7748 else {
7749 $self->set_type($BINARY);
7750 }
7751 }
7752 undef $unique_maps{$addr}; # Garbage collect
7753 return;
7754 }
7755
7756 # Most of the accessors for a property actually apply to its map table.
7757 # Setup up accessor functions for those, referring to %map
ea25a9b2 7758 for my $sub (qw(
99870f4d
KW
7759 add_alias
7760 add_anomalous_entry
7761 add_comment
7762 add_conflicting
7763 add_description
7764 add_duplicate
7765 add_note
7766 aliases
7767 comment
7768 complete_name
2f7a8815 7769 containing_range
99870f4d
KW
7770 core_access
7771 count
7772 default_map
7773 delete_range
7774 description
7775 each_range
7776 external_name
7777 file_path
7778 format
7779 initialize
7780 inverse_list
7781 is_empty
7782 name
7783 note
7784 perl_extension
7785 property
7786 range_count
7787 ranges
7788 range_size_1
7789 reset_each_range
7790 set_comment
7791 set_core_access
7792 set_default_map
7793 set_file_path
7794 set_final_comment
7795 set_range_size_1
7796 set_status
7797 set_to_output_map
7798 short_name
7799 status
7800 status_info
7801 to_output_map
0a9dbafc 7802 type_of
99870f4d
KW
7803 value_of
7804 write
ea25a9b2 7805 ))
99870f4d
KW
7806 # 'property' above is for symmetry, so that one can take
7807 # the property of a property and get itself, and so don't
7808 # have to distinguish between properties and tables in
7809 # calling code
7810 {
7811 no strict "refs";
7812 *$sub = sub {
7813 use strict "refs";
7814 my $self = shift;
f998e60c 7815 no overloading;
051df77b 7816 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7817 }
7818 }
7819
7820
7821} # End closure
7822
7823package main;
7824
7825sub join_lines($) {
7826 # Returns lines of the input joined together, so that they can be folded
7827 # properly.
7828 # This causes continuation lines to be joined together into one long line
7829 # for folding. A continuation line is any line that doesn't begin with a
7830 # space or "\b" (the latter is stripped from the output). This is so
7831 # lines can be be in a HERE document so as to fit nicely in the terminal
7832 # width, but be joined together in one long line, and then folded with
7833 # indents, '#' prefixes, etc, properly handled.
7834 # A blank separates the joined lines except if there is a break; an extra
7835 # blank is inserted after a period ending a line.
7836
98dc9551 7837 # Initialize the return with the first line.
99870f4d
KW
7838 my ($return, @lines) = split "\n", shift;
7839
7840 # If the first line is null, it was an empty line, add the \n back in
7841 $return = "\n" if $return eq "";
7842
7843 # Now join the remainder of the physical lines.
7844 for my $line (@lines) {
7845
7846 # An empty line means wanted a blank line, so add two \n's to get that
7847 # effect, and go to the next line.
7848 if (length $line == 0) {
7849 $return .= "\n\n";
7850 next;
7851 }
7852
7853 # Look at the last character of what we have so far.
7854 my $previous_char = substr($return, -1, 1);
7855
7856 # And at the next char to be output.
7857 my $next_char = substr($line, 0, 1);
7858
7859 if ($previous_char ne "\n") {
7860
7861 # Here didn't end wth a nl. If the next char a blank or \b, it
7862 # means that here there is a break anyway. So add a nl to the
7863 # output.
7864 if ($next_char eq " " || $next_char eq "\b") {
7865 $previous_char = "\n";
7866 $return .= $previous_char;
7867 }
7868
7869 # Add an extra space after periods.
7870 $return .= " " if $previous_char eq '.';
7871 }
7872
7873 # Here $previous_char is still the latest character to be output. If
7874 # it isn't a nl, it means that the next line is to be a continuation
7875 # line, with a blank inserted between them.
7876 $return .= " " if $previous_char ne "\n";
7877
7878 # Get rid of any \b
7879 substr($line, 0, 1) = "" if $next_char eq "\b";
7880
7881 # And append this next line.
7882 $return .= $line;
7883 }
7884
7885 return $return;
7886}
7887
7888sub simple_fold($;$$$) {
7889 # Returns a string of the input (string or an array of strings) folded
7890 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7891 # a \n
7892 # This is tailored for the kind of text written by this program,
7893 # especially the pod file, which can have very long names with
7894 # underscores in the middle, or words like AbcDefgHij.... We allow
7895 # breaking in the middle of such constructs if the line won't fit
7896 # otherwise. The break in such cases will come either just after an
7897 # underscore, or just before one of the Capital letters.
7898
7899 local $to_trace = 0 if main::DEBUG;
7900
7901 my $line = shift;
7902 my $prefix = shift; # Optional string to prepend to each output
7903 # line
7904 $prefix = "" unless defined $prefix;
7905
7906 my $hanging_indent = shift; # Optional number of spaces to indent
7907 # continuation lines
7908 $hanging_indent = 0 unless $hanging_indent;
7909
7910 my $right_margin = shift; # Optional number of spaces to narrow the
7911 # total width by.
7912 $right_margin = 0 unless defined $right_margin;
7913
7914 # Call carp with the 'nofold' option to avoid it from trying to call us
7915 # recursively
7916 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7917
7918 # The space available doesn't include what's automatically prepended
7919 # to each line, or what's reserved on the right.
7920 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7921 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7922
7923 if (DEBUG && $hanging_indent >= $max) {
7924 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7925 $hanging_indent = 0;
7926 }
7927
7928 # First, split into the current physical lines.
7929 my @line;
7930 if (ref $line) { # Better be an array, because not bothering to
7931 # test
7932 foreach my $line (@{$line}) {
7933 push @line, split /\n/, $line;
7934 }
7935 }
7936 else {
7937 @line = split /\n/, $line;
7938 }
7939
7940 #local $to_trace = 1 if main::DEBUG;
7941 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7942
7943 # Look at each current physical line.
7944 for (my $i = 0; $i < @line; $i++) {
7945 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7946 #local $to_trace = 1 if main::DEBUG;
7947 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7948
7949 # Remove prefix, because will be added back anyway, don't want
7950 # doubled prefix
7951 $line[$i] =~ s/^$prefix//;
7952
7953 # Remove trailing space
7954 $line[$i] =~ s/\s+\Z//;
7955
7956 # If the line is too long, fold it.
7957 if (length $line[$i] > $max) {
7958 my $remainder;
7959
7960 # Here needs to fold. Save the leading space in the line for
7961 # later.
7962 $line[$i] =~ /^ ( \s* )/x;
7963 my $leading_space = $1;
7964 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7965
7966 # If character at final permissible position is white space,
7967 # fold there, which will delete that white space
7968 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7969 $remainder = substr($line[$i], $max);
7970 $line[$i] = substr($line[$i], 0, $max - 1);
7971 }
7972 else {
7973
7974 # Otherwise fold at an acceptable break char closest to
7975 # the max length. Look at just the maximal initial
7976 # segment of the line
7977 my $segment = substr($line[$i], 0, $max - 1);
7978 if ($segment =~
7979 /^ ( .{$hanging_indent} # Don't look before the
7980 # indent.
7981 \ * # Don't look in leading
7982 # blanks past the indent
7983 [^ ] .* # Find the right-most
7984 (?: # acceptable break:
7985 [ \s = ] # space or equal
7986 | - (?! [.0-9] ) # or non-unary minus.
7987 ) # $1 includes the character
7988 )/x)
7989 {
7990 # Split into the initial part that fits, and remaining
7991 # part of the input
7992 $remainder = substr($line[$i], length $1);
7993 $line[$i] = $1;
7994 trace $line[$i] if DEBUG && $to_trace;
7995 trace $remainder if DEBUG && $to_trace;
7996 }
7997
7998 # If didn't find a good breaking spot, see if there is a
7999 # not-so-good breaking spot. These are just after
8000 # underscores or where the case changes from lower to
8001 # upper. Use \a as a soft hyphen, but give up
8002 # and don't break the line if there is actually a \a
8003 # already in the input. We use an ascii character for the
8004 # soft-hyphen to avoid any attempt by miniperl to try to
8005 # access the files that this program is creating.
8006 elsif ($segment !~ /\a/
8007 && ($segment =~ s/_/_\a/g
8008 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8009 {
8010 # Here were able to find at least one place to insert
8011 # our substitute soft hyphen. Find the right-most one
8012 # and replace it by a real hyphen.
8013 trace $segment if DEBUG && $to_trace;
8014 substr($segment,
8015 rindex($segment, "\a"),
8016 1) = '-';
8017
8018 # Then remove the soft hyphen substitutes.
8019 $segment =~ s/\a//g;
8020 trace $segment if DEBUG && $to_trace;
8021
8022 # And split into the initial part that fits, and
8023 # remainder of the line
8024 my $pos = rindex($segment, '-');
8025 $remainder = substr($line[$i], $pos);
8026 trace $remainder if DEBUG && $to_trace;
8027 $line[$i] = substr($segment, 0, $pos + 1);
8028 }
8029 }
8030
8031 # Here we know if we can fold or not. If we can, $remainder
8032 # is what remains to be processed in the next iteration.
8033 if (defined $remainder) {
8034 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8035
8036 # Insert the folded remainder of the line as a new element
8037 # of the array. (It may still be too long, but we will
8038 # deal with that next time through the loop.) Omit any
8039 # leading space in the remainder.
8040 $remainder =~ s/^\s+//;
8041 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8042
8043 # But then indent by whichever is larger of:
8044 # 1) the leading space on the input line;
8045 # 2) the hanging indent.
8046 # This preserves indentation in the original line.
8047 my $lead = ($leading_space)
8048 ? length $leading_space
8049 : $hanging_indent;
8050 $lead = max($lead, $hanging_indent);
8051 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8052 }
8053 }
8054
8055 # Ready to output the line. Get rid of any trailing space
8056 # And prefix by the required $prefix passed in.
8057 $line[$i] =~ s/\s+$//;
8058 $line[$i] = "$prefix$line[$i]\n";
8059 } # End of looping through all the lines.
8060
8061 return join "", @line;
8062}
8063
8064sub property_ref { # Returns a reference to a property object.
8065 return Property::property_ref(@_);
8066}
8067
8068sub force_unlink ($) {
8069 my $filename = shift;
8070 return unless file_exists($filename);
8071 return if CORE::unlink($filename);
8072
8073 # We might need write permission
8074 chmod 0777, $filename;
8075 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8076 return;
8077}
8078
9218f1cf 8079sub write ($$@) {
9abe8df8
KW
8080 # Given a filename and references to arrays of lines, write the lines of
8081 # each array to the file
99870f4d
KW
8082 # Filename can be given as an arrayref of directory names
8083
9218f1cf 8084 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
99870f4d 8085
9abe8df8 8086 my $file = shift;
9218f1cf 8087 my $use_utf8 = shift;
99870f4d
KW
8088
8089 # Get into a single string if an array, and get rid of, in Unix terms, any
8090 # leading '.'
8091 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8092 $file = File::Spec->canonpath($file);
8093
8094 # If has directories, make sure that they all exist
8095 (undef, my $directories, undef) = File::Spec->splitpath($file);
8096 File::Path::mkpath($directories) if $directories && ! -d $directories;
8097
8098 push @files_actually_output, $file;
8099
99870f4d
KW
8100 force_unlink ($file);
8101
8102 my $OUT;
8103 if (not open $OUT, ">", $file) {
8104 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8105 return;
8106 }
430ada4c 8107
9218f1cf
KW
8108 binmode $OUT, ":utf8" if $use_utf8;
8109
9abe8df8
KW
8110 while (defined (my $lines_ref = shift)) {
8111 unless (@$lines_ref) {
8112 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8113 }
8114
8115 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8116 }
430ada4c
NC
8117 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8118
99870f4d
KW
8119 print "$file written.\n" if $verbosity >= $VERBOSE;
8120
99870f4d
KW
8121 return;
8122}
8123
8124
8125sub Standardize($) {
8126 # This converts the input name string into a standardized equivalent to
8127 # use internally.
8128
8129 my $name = shift;
8130 unless (defined $name) {
8131 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8132 return;
8133 }
8134
8135 # Remove any leading or trailing white space
8136 $name =~ s/^\s+//g;
8137 $name =~ s/\s+$//g;
8138
98dc9551 8139 # Convert interior white space and hyphens into underscores.
99870f4d
KW
8140 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8141
8142 # Capitalize the letter following an underscore, and convert a sequence of
8143 # multiple underscores to a single one
8144 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8145
8146 # And capitalize the first letter, but not for the special cjk ones.
8147 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8148 return $name;
8149}
8150
8151sub standardize ($) {
8152 # Returns a lower-cased standardized name, without underscores. This form
8153 # is chosen so that it can distinguish between any real versus superficial
8154 # Unicode name differences. It relies on the fact that Unicode doesn't
8155 # have interior underscores, white space, nor dashes in any
8156 # stricter-matched name. It should not be used on Unicode code point
8157 # names (the Name property), as they mostly, but not always follow these
8158 # rules.
8159
8160 my $name = Standardize(shift);
8161 return if !defined $name;
8162
8163 $name =~ s/ (?<= .) _ (?= . ) //xg;
8164 return lc $name;
8165}
8166
c85f591a
KW
8167sub utf8_heavy_name ($$) {
8168 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8169 # perhaps this function should be placed somewhere, like Heavy.pl so that
8170 # utf8_heavy can use it directly without duplicating code that can get
8171 # out-of sync.
8172
8173 my $table = shift;
8174 my $alias = shift;
8175 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8176
8177 my $property = $table->property;
8178 $property = ($property == $perl)
8179 ? "" # 'perl' is never explicitly stated
8180 : standardize($property->name) . '=';
8181 if ($alias->loose_match) {
8182 return $property . standardize($alias->name);
8183 }
8184 else {
8185 return lc ($property . $alias->name);
8186 }
8187
8188 return;
8189}
8190
99870f4d
KW
8191{ # Closure
8192
8193 my $indent_increment = " " x 2;
8194 my %already_output;
8195
8196 $main::simple_dumper_nesting = 0;
8197
8198 sub simple_dumper {
8199 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8200 # the real thing as we have to run under miniperl.
8201
8202 # It is designed so that on input it is at the beginning of a line,
8203 # and the final thing output in any call is a trailing ",\n".
8204
8205 my $item = shift;
8206 my $indent = shift;
8207 $indent = "" if ! defined $indent;
8208
8209 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8210
8211 # nesting level is localized, so that as the call stack pops, it goes
8212 # back to the prior value.
8213 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8214 undef %already_output if $main::simple_dumper_nesting == 0;
8215 $main::simple_dumper_nesting++;
8216 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8217
8218 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8219
8220 # Determine the indent for recursive calls.
8221 my $next_indent = $indent . $indent_increment;
8222
8223 my $output;
8224 if (! ref $item) {
8225
8226 # Dump of scalar: just output it in quotes if not a number. To do
8227 # so we must escape certain characters, and therefore need to
8228 # operate on a copy to avoid changing the original
8229 my $copy = $item;
8230 $copy = $UNDEF unless defined $copy;
8231
8232 # Quote non-numbers (numbers also have optional leading '-' and
8233 # fractions)
8234 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
8235
8236 # Escape apostrophe and backslash
8237 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8238 $copy = "'$copy'";
8239 }
8240 $output = "$indent$copy,\n";
8241 }
8242 else {
8243
8244 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 8245 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 8246 if (defined $already_output{$addr}) {
99870f4d
KW
8247 return "${indent}ALREADY OUTPUT: $item\n";
8248 }
f998e60c 8249 $already_output{$addr} = $item;
99870f4d
KW
8250
8251 if (ref $item eq 'ARRAY') {
8252 my $using_brackets;
8253 $output = $indent;
8254 if ($main::simple_dumper_nesting > 1) {
8255 $output .= '[';
8256 $using_brackets = 1;
8257 }
8258 else {
8259 $using_brackets = 0;
8260 }
8261
8262 # If the array is empty, put the closing bracket on the same
8263 # line. Otherwise, recursively add each array element
8264 if (@$item == 0) {
8265 $output .= " ";
8266 }
8267 else {
8268 $output .= "\n";
8269 for (my $i = 0; $i < @$item; $i++) {
8270
8271 # Indent array elements one level
8272 $output .= &simple_dumper($item->[$i], $next_indent);
c12f2655
KW
8273 $output =~ s/\n$//; # Remove any trailing nl so
8274 $output .= " # [$i]\n"; # as to add a comment giving
8275 # the array index
99870f4d
KW
8276 }
8277 $output .= $indent; # Indent closing ']' to orig level
8278 }
8279 $output .= ']' if $using_brackets;
8280 $output .= ",\n";
8281 }
8282 elsif (ref $item eq 'HASH') {
8283 my $is_first_line;
8284 my $using_braces;
8285 my $body_indent;
8286
8287 # No surrounding braces at top level
8288 $output .= $indent;
8289 if ($main::simple_dumper_nesting > 1) {
8290 $output .= "{\n";
8291 $is_first_line = 0;
8292 $body_indent = $next_indent;
8293 $next_indent .= $indent_increment;
8294 $using_braces = 1;
8295 }
8296 else {
8297 $is_first_line = 1;
8298 $body_indent = $indent;
8299 $using_braces = 0;
8300 }
8301
8302 # Output hashes sorted alphabetically instead of apparently
8303 # random. Use caseless alphabetic sort
8304 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8305 {
8306 if ($is_first_line) {
8307 $is_first_line = 0;
8308 }
8309 else {
8310 $output .= "$body_indent";
8311 }
8312
8313 # The key must be a scalar, but this recursive call quotes
8314 # it
8315 $output .= &simple_dumper($key);
8316
8317 # And change the trailing comma and nl to the hash fat
8318 # comma for clarity, and so the value can be on the same
8319 # line
8320 $output =~ s/,\n$/ => /;
8321
8322 # Recursively call to get the value's dump.
8323 my $next = &simple_dumper($item->{$key}, $next_indent);
8324
8325 # If the value is all on one line, remove its indent, so
8326 # will follow the => immediately. If it takes more than
8327 # one line, start it on a new line.
8328 if ($next !~ /\n.*\n/) {
8329 $next =~ s/^ *//;
8330 }
8331 else {
8332 $output .= "\n";
8333 }
8334 $output .= $next;
8335 }
8336
8337 $output .= "$indent},\n" if $using_braces;
8338 }
8339 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8340 $output = $indent . ref($item) . "\n";
8341 # XXX see if blessed
8342 }
8343 elsif ($item->can('dump')) {
8344
8345 # By convention in this program, objects furnish a 'dump'
8346 # method. Since not doing any output at this level, just pass
8347 # on the input indent
8348 $output = $item->dump($indent);
8349 }
8350 else {
8351 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8352 }
8353 }
8354 return $output;
8355 }
8356}
8357
8358sub dump_inside_out {
8359 # Dump inside-out hashes in an object's state by converting them to a
8360 # regular hash and then calling simple_dumper on that.
8361
8362 my $object = shift;
8363 my $fields_ref = shift;
8364 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8365
ffe43484 8366 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
8367
8368 my %hash;
8369 foreach my $key (keys %$fields_ref) {
8370 $hash{$key} = $fields_ref->{$key}{$addr};
8371 }
8372
8373 return simple_dumper(\%hash, @_);
8374}
8375
8376sub _operator_dot {
8377 # Overloaded '.' method that is common to all packages. It uses the
8378 # package's stringify method.
8379
8380 my $self = shift;
8381 my $other = shift;
8382 my $reversed = shift;
8383 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8384
8385 $other = "" unless defined $other;
8386
8387 foreach my $which (\$self, \$other) {
8388 next unless ref $$which;
8389 if ($$which->can('_operator_stringify')) {
8390 $$which = $$which->_operator_stringify;
8391 }
8392 else {
8393 my $ref = ref $$which;
ffe43484 8394 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
8395 $$which = "$ref ($addr)";
8396 }
8397 }
8398 return ($reversed)
8399 ? "$other$self"
8400 : "$self$other";
8401}
8402
8403sub _operator_equal {
8404 # Generic overloaded '==' routine. To be equal, they must be the exact
8405 # same object
8406
8407 my $self = shift;
8408 my $other = shift;
8409
8410 return 0 unless defined $other;
8411 return 0 unless ref $other;
f998e60c 8412 no overloading;
2100aa98 8413 return $self == $other;
99870f4d
KW
8414}
8415
8416sub _operator_not_equal {
8417 my $self = shift;
8418 my $other = shift;
8419
8420 return ! _operator_equal($self, $other);
8421}
8422
8423sub process_PropertyAliases($) {
8424 # This reads in the PropertyAliases.txt file, which contains almost all
8425 # the character properties in Unicode and their equivalent aliases:
8426 # scf ; Simple_Case_Folding ; sfc
8427 #
8428 # Field 0 is the preferred short name for the property.
8429 # Field 1 is the full name.
8430 # Any succeeding ones are other accepted names.
8431
8432 my $file= shift;
8433 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8434
8435 # This whole file was non-existent in early releases, so use our own
8436 # internal one.
8437 $file->insert_lines(get_old_property_aliases())
8438 if ! -e 'PropertyAliases.txt';
8439
8440 # Add any cjk properties that may have been defined.
8441 $file->insert_lines(@cjk_properties);
8442
8443 while ($file->next_line) {
8444
8445 my @data = split /\s*;\s*/;
8446
8447 my $full = $data[1];
8448
8449 my $this = Property->new($data[0], Full_Name => $full);
8450
8451 # Start looking for more aliases after these two.
8452 for my $i (2 .. @data - 1) {
8453 $this->add_alias($data[$i]);
8454 }
8455
8456 }
8457 return;
8458}
8459
8460sub finish_property_setup {
8461 # Finishes setting up after PropertyAliases.
8462
8463 my $file = shift;
8464 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8465
8466 # This entry was missing from this file in earlier Unicode versions
8467 if (-e 'Jamo.txt') {
8468 my $jsn = property_ref('JSN');
8469 if (! defined $jsn) {
8470 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8471 }
8472 }
8473
5f7264c7 8474 # This entry is still missing as of 6.0, perhaps because no short name for
99870f4d
KW
8475 # it.
8476 if (-e 'NameAliases.txt') {
8477 my $aliases = property_ref('Name_Alias');
8478 if (! defined $aliases) {
8479 $aliases = Property->new('Name_Alias');
8480 }
8481 }
8482
8483 # These are used so much, that we set globals for them.
8484 $gc = property_ref('General_Category');
8485 $block = property_ref('Block');
8486
8487 # Perl adds this alias.
8488 $gc->add_alias('Category');
8489
8490 # For backwards compatibility, these property files have particular names.
8491 my $upper = property_ref('Uppercase_Mapping');
8492 $upper->set_core_access('uc()');
8493 $upper->set_file('Upper'); # This is what utf8.c calls it
8494
8495 my $lower = property_ref('Lowercase_Mapping');
8496 $lower->set_core_access('lc()');
8497 $lower->set_file('Lower');
8498
8499 my $title = property_ref('Titlecase_Mapping');
8500 $title->set_core_access('ucfirst()');
8501 $title->set_file('Title');
8502
8503 my $fold = property_ref('Case_Folding');
8504 $fold->set_file('Fold') if defined $fold;
8505
d3cbe105
KW
8506 # Unicode::Normalize expects this file with this name and directory.
8507 my $ccc = property_ref('Canonical_Combining_Class');
8508 if (defined $ccc) {
8509 $ccc->set_file('CombiningClass');
8510 $ccc->set_directory(File::Spec->curdir());
8511 }
8512
2cd56239
KW
8513 # utf8.c has a different meaning for non range-size-1 for map properties
8514 # that this program doesn't currently handle; and even if it were changed
8515 # to do so, some other code may be using them expecting range size 1.
99870f4d
KW
8516 foreach my $property (qw {
8517 Case_Folding
8518 Lowercase_Mapping
8519 Titlecase_Mapping
8520 Uppercase_Mapping
8521 })
8522 {
8523 property_ref($property)->set_range_size_1(1);
8524 }
8525
8526 # These two properties aren't actually used in the core, but unfortunately
8527 # the names just above that are in the core interfere with these, so
8528 # choose different names. These aren't a problem unless the map tables
8529 # for these files get written out.
8530 my $lowercase = property_ref('Lowercase');
8531 $lowercase->set_file('IsLower') if defined $lowercase;
8532 my $uppercase = property_ref('Uppercase');
8533 $uppercase->set_file('IsUpper') if defined $uppercase;
8534
8535 # Set up the hard-coded default mappings, but only on properties defined
8536 # for this release
8537 foreach my $property (keys %default_mapping) {
8538 my $property_object = property_ref($property);
8539 next if ! defined $property_object;
8540 my $default_map = $default_mapping{$property};
8541 $property_object->set_default_map($default_map);
8542
8543 # A map of <code point> implies the property is string.
8544 if ($property_object->type == $UNKNOWN
8545 && $default_map eq $CODE_POINT)
8546 {
8547 $property_object->set_type($STRING);
8548 }
8549 }
8550
8551 # The following use the Multi_Default class to create objects for
8552 # defaults.
8553
8554 # Bidi class has a complicated default, but the derived file takes care of
8555 # the complications, leaving just 'L'.
8556 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8557 property_ref('Bidi_Class')->set_default_map('L');
8558 }
8559 else {
8560 my $default;
8561
8562 # The derived file was introduced in 3.1.1. The values below are
8563 # taken from table 3-8, TUS 3.0
8564 my $default_R =
8565 'my $default = Range_List->new;
8566 $default->add_range(0x0590, 0x05FF);
8567 $default->add_range(0xFB1D, 0xFB4F);'
8568 ;
8569
8570 # The defaults apply only to unassigned characters
a67f160a 8571 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
8572
8573 if ($v_version lt v3.0.0) {
8574 $default = Multi_Default->new(R => $default_R, 'L');
8575 }
8576 else {
8577
8578 # AL apparently not introduced until 3.0: TUS 2.x references are
8579 # not on-line to check it out
8580 my $default_AL =
8581 'my $default = Range_List->new;
8582 $default->add_range(0x0600, 0x07BF);
8583 $default->add_range(0xFB50, 0xFDFF);
8584 $default->add_range(0xFE70, 0xFEFF);'
8585 ;
8586
8587 # Non-character code points introduced in this release; aren't AL
8588 if ($v_version ge 3.1.0) {
8589 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8590 }
a67f160a 8591 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
8592 $default = Multi_Default->new(AL => $default_AL,
8593 R => $default_R,
8594 'L');
8595 }
8596 property_ref('Bidi_Class')->set_default_map($default);
8597 }
8598
8599 # Joining type has a complicated default, but the derived file takes care
8600 # of the complications, leaving just 'U' (or Non_Joining), except the file
8601 # is bad in 3.1.0
8602 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8603 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8604 property_ref('Joining_Type')->set_default_map('Non_Joining');
8605 }
8606 else {
8607
8608 # Otherwise, there are not one, but two possibilities for the
8609 # missing defaults: T and U.
8610 # The missing defaults that evaluate to T are given by:
8611 # T = Mn + Cf - ZWNJ - ZWJ
8612 # where Mn and Cf are the general category values. In other words,
8613 # any non-spacing mark or any format control character, except
8614 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8615 # WIDTH JOINER (joining type C).
8616 my $default = Multi_Default->new(
8617 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8618 'Non_Joining');
8619 property_ref('Joining_Type')->set_default_map($default);
8620 }
8621 }
8622
8623 # Line break has a complicated default in early releases. It is 'Unknown'
8624 # for non-assigned code points; 'AL' for assigned.
8625 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8626 my $lb = property_ref('Line_Break');
8627 if ($v_version gt 3.2.0) {
8628 $lb->set_default_map('Unknown');
8629 }
8630 else {
8631 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8632 'AL');
8633 $lb->set_default_map($default);
8634 }
8635
8636 # If has the URS property, make sure that the standard aliases are in
8637 # it, since not in the input tables in some versions.
8638 my $urs = property_ref('Unicode_Radical_Stroke');
8639 if (defined $urs) {
8640 $urs->add_alias('cjkRSUnicode');
8641 $urs->add_alias('kRSUnicode');
8642 }
8643 }
8644 return;
8645}
8646
8647sub get_old_property_aliases() {
8648 # Returns what would be in PropertyAliases.txt if it existed in very old
8649 # versions of Unicode. It was derived from the one in 3.2, and pared
8650 # down based on the data that was actually in the older releases.
8651 # An attempt was made to use the existence of files to mean inclusion or
8652 # not of various aliases, but if this was not sufficient, using version
8653 # numbers was resorted to.
8654
8655 my @return;
8656
8657 # These are to be used in all versions (though some are constructed by
8658 # this program if missing)
8659 push @return, split /\n/, <<'END';
8660bc ; Bidi_Class
8661Bidi_M ; Bidi_Mirrored
8662cf ; Case_Folding
8663ccc ; Canonical_Combining_Class
8664dm ; Decomposition_Mapping
8665dt ; Decomposition_Type
8666gc ; General_Category
8667isc ; ISO_Comment
8668lc ; Lowercase_Mapping
8669na ; Name
8670na1 ; Unicode_1_Name
8671nt ; Numeric_Type
8672nv ; Numeric_Value
8673sfc ; Simple_Case_Folding
8674slc ; Simple_Lowercase_Mapping
8675stc ; Simple_Titlecase_Mapping
8676suc ; Simple_Uppercase_Mapping
8677tc ; Titlecase_Mapping
8678uc ; Uppercase_Mapping
8679END
8680
8681 if (-e 'Blocks.txt') {
8682 push @return, "blk ; Block\n";
8683 }
8684 if (-e 'ArabicShaping.txt') {
8685 push @return, split /\n/, <<'END';
8686jg ; Joining_Group
8687jt ; Joining_Type
8688END
8689 }
8690 if (-e 'PropList.txt') {
8691
8692 # This first set is in the original old-style proplist.
8693 push @return, split /\n/, <<'END';
8694Alpha ; Alphabetic
8695Bidi_C ; Bidi_Control
8696Dash ; Dash
8697Dia ; Diacritic
8698Ext ; Extender
8699Hex ; Hex_Digit
8700Hyphen ; Hyphen
8701IDC ; ID_Continue
8702Ideo ; Ideographic
8703Join_C ; Join_Control
8704Math ; Math
8705QMark ; Quotation_Mark
8706Term ; Terminal_Punctuation
8707WSpace ; White_Space
8708END
8709 # The next sets were added later
8710 if ($v_version ge v3.0.0) {
8711 push @return, split /\n/, <<'END';
8712Upper ; Uppercase
8713Lower ; Lowercase
8714END
8715 }
8716 if ($v_version ge v3.0.1) {
8717 push @return, split /\n/, <<'END';
8718NChar ; Noncharacter_Code_Point
8719END
8720 }
8721 # The next sets were added in the new-style
8722 if ($v_version ge v3.1.0) {
8723 push @return, split /\n/, <<'END';
8724OAlpha ; Other_Alphabetic
8725OLower ; Other_Lowercase
8726OMath ; Other_Math
8727OUpper ; Other_Uppercase
8728END
8729 }
8730 if ($v_version ge v3.1.1) {
8731 push @return, "AHex ; ASCII_Hex_Digit\n";
8732 }
8733 }
8734 if (-e 'EastAsianWidth.txt') {
8735 push @return, "ea ; East_Asian_Width\n";
8736 }
8737 if (-e 'CompositionExclusions.txt') {
8738 push @return, "CE ; Composition_Exclusion\n";
8739 }
8740 if (-e 'LineBreak.txt') {
8741 push @return, "lb ; Line_Break\n";
8742 }
8743 if (-e 'BidiMirroring.txt') {
8744 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8745 }
8746 if (-e 'Scripts.txt') {
8747 push @return, "sc ; Script\n";
8748 }
8749 if (-e 'DNormalizationProps.txt') {
8750 push @return, split /\n/, <<'END';
8751Comp_Ex ; Full_Composition_Exclusion
8752FC_NFKC ; FC_NFKC_Closure
8753NFC_QC ; NFC_Quick_Check
8754NFD_QC ; NFD_Quick_Check
8755NFKC_QC ; NFKC_Quick_Check
8756NFKD_QC ; NFKD_Quick_Check
8757XO_NFC ; Expands_On_NFC
8758XO_NFD ; Expands_On_NFD
8759XO_NFKC ; Expands_On_NFKC
8760XO_NFKD ; Expands_On_NFKD
8761END
8762 }
8763 if (-e 'DCoreProperties.txt') {
8764 push @return, split /\n/, <<'END';
8765IDS ; ID_Start
8766XIDC ; XID_Continue
8767XIDS ; XID_Start
8768END
8769 # These can also appear in some versions of PropList.txt
8770 push @return, "Lower ; Lowercase\n"
8771 unless grep { $_ =~ /^Lower\b/} @return;
8772 push @return, "Upper ; Uppercase\n"
8773 unless grep { $_ =~ /^Upper\b/} @return;
8774 }
8775
8776 # This flag requires the DAge.txt file to be copied into the directory.
8777 if (DEBUG && $compare_versions) {
8778 push @return, 'age ; Age';
8779 }
8780
8781 return @return;
8782}
8783
8784sub process_PropValueAliases {
8785 # This file contains values that properties look like:
8786 # bc ; AL ; Arabic_Letter
8787 # blk; n/a ; Greek_And_Coptic ; Greek
8788 #
8789 # Field 0 is the property.
8790 # Field 1 is the short name of a property value or 'n/a' if no
8791 # short name exists;
8792 # Field 2 is the full property value name;
8793 # Any other fields are more synonyms for the property value.
8794 # Purely numeric property values are omitted from the file; as are some
8795 # others, fewer and fewer in later releases
8796
8797 # Entries for the ccc property have an extra field before the
8798 # abbreviation:
8799 # ccc; 0; NR ; Not_Reordered
8800 # It is the numeric value that the names are synonyms for.
8801
8802 # There are comment entries for values missing from this file:
8803 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8804 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8805
8806 my $file= shift;
8807 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8808
8809 # This whole file was non-existent in early releases, so use our own
8810 # internal one if necessary.
8811 if (! -e 'PropValueAliases.txt') {
8812 $file->insert_lines(get_old_property_value_aliases());
8813 }
8814
8815 # Add any explicit cjk values
8816 $file->insert_lines(@cjk_property_values);
8817
8818 # This line is used only for testing the code that checks for name
8819 # conflicts. There is a script Inherited, and when this line is executed
8820 # it causes there to be a name conflict with the 'Inherited' that this
8821 # program generates for this block property value
8822 #$file->insert_lines('blk; n/a; Herited');
8823
8824
8825 # Process each line of the file ...
8826 while ($file->next_line) {
8827
8828 my ($property, @data) = split /\s*;\s*/;
8829
66b4eb0a
KW
8830 # The ccc property has an extra field at the beginning, which is the
8831 # numeric value. Move it to be after the other two, mnemonic, fields,
8832 # so that those will be used as the property value's names, and the
8833 # number will be an extra alias. (Rightmost splice removes field 1-2,
8834 # returning them in a slice; left splice inserts that before anything,
8835 # thus shifting the former field 0 to after them.)
8836 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
8837
8838 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
8839 # there is no short name, use the full one in element 1
027866c1
KW
8840 if ($data[0] eq "n/a") {
8841 $data[0] = $data[1];
8842 }
8843 elsif ($data[0] ne $data[1]
8844 && standardize($data[0]) eq standardize($data[1])
8845 && $data[1] !~ /[[:upper:]]/)
8846 {
8847 # Also, there is a bug in the file in which "n/a" is omitted, and
8848 # the two fields are identical except for case, and the full name
8849 # is all lower case. Copy the "short" name unto the full one to
8850 # give it some upper case.
8851
8852 $data[1] = $data[0];
8853 }
99870f4d
KW
8854
8855 # Earlier releases had the pseudo property 'qc' that should expand to
8856 # the ones that replace it below.
8857 if ($property eq 'qc') {
8858 if (lc $data[0] eq 'y') {
8859 $file->insert_lines('NFC_QC; Y ; Yes',
8860 'NFD_QC; Y ; Yes',
8861 'NFKC_QC; Y ; Yes',
8862 'NFKD_QC; Y ; Yes',
8863 );
8864 }
8865 elsif (lc $data[0] eq 'n') {
8866 $file->insert_lines('NFC_QC; N ; No',
8867 'NFD_QC; N ; No',
8868 'NFKC_QC; N ; No',
8869 'NFKD_QC; N ; No',
8870 );
8871 }
8872 elsif (lc $data[0] eq 'm') {
8873 $file->insert_lines('NFC_QC; M ; Maybe',
8874 'NFKC_QC; M ; Maybe',
8875 );
8876 }
8877 else {
8878 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8879 }
8880 next;
8881 }
8882
8883 # The first field is the short name, 2nd is the full one.
8884 my $property_object = property_ref($property);
8885 my $table = $property_object->add_match_table($data[0],
8886 Full_Name => $data[1]);
8887
8888 # Start looking for more aliases after these two.
8889 for my $i (2 .. @data - 1) {
8890 $table->add_alias($data[$i]);
8891 }
8892 } # End of looping through the file
8893
8894 # As noted in the comments early in the program, it generates tables for
8895 # the default values for all releases, even those for which the concept
8896 # didn't exist at the time. Here we add those if missing.
8897 my $age = property_ref('age');
8898 if (defined $age && ! defined $age->table('Unassigned')) {
8899 $age->add_match_table('Unassigned');
8900 }
8901 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8902 && ! defined $block->table('No_Block');
8903
8904
8905 # Now set the default mappings of the properties from the file. This is
8906 # done after the loop because a number of properties have only @missings
8907 # entries in the file, and may not show up until the end.
8908 my @defaults = $file->get_missings;
8909 foreach my $default_ref (@defaults) {
8910 my $default = $default_ref->[0];
8911 my $property = property_ref($default_ref->[1]);
8912 $property->set_default_map($default);
8913 }
8914 return;
8915}
8916
8917sub get_old_property_value_aliases () {
8918 # Returns what would be in PropValueAliases.txt if it existed in very old
8919 # versions of Unicode. It was derived from the one in 3.2, and pared
8920 # down. An attempt was made to use the existence of files to mean
8921 # inclusion or not of various aliases, but if this was not sufficient,
8922 # using version numbers was resorted to.
8923
8924 my @return = split /\n/, <<'END';
8925bc ; AN ; Arabic_Number
8926bc ; B ; Paragraph_Separator
8927bc ; CS ; Common_Separator
8928bc ; EN ; European_Number
8929bc ; ES ; European_Separator
8930bc ; ET ; European_Terminator
8931bc ; L ; Left_To_Right
8932bc ; ON ; Other_Neutral
8933bc ; R ; Right_To_Left
8934bc ; WS ; White_Space
8935
8936# The standard combining classes are very much different in v1, so only use
8937# ones that look right (not checked thoroughly)
8938ccc; 0; NR ; Not_Reordered
8939ccc; 1; OV ; Overlay
8940ccc; 7; NK ; Nukta
8941ccc; 8; KV ; Kana_Voicing
8942ccc; 9; VR ; Virama
8943ccc; 202; ATBL ; Attached_Below_Left
8944ccc; 216; ATAR ; Attached_Above_Right
8945ccc; 218; BL ; Below_Left
8946ccc; 220; B ; Below
8947ccc; 222; BR ; Below_Right
8948ccc; 224; L ; Left
8949ccc; 228; AL ; Above_Left
8950ccc; 230; A ; Above
8951ccc; 232; AR ; Above_Right
8952ccc; 234; DA ; Double_Above
8953
8954dt ; can ; canonical
8955dt ; enc ; circle
8956dt ; fin ; final
8957dt ; font ; font
8958dt ; fra ; fraction
8959dt ; init ; initial
8960dt ; iso ; isolated
8961dt ; med ; medial
8962dt ; n/a ; none
8963dt ; nb ; noBreak
8964dt ; sqr ; square
8965dt ; sub ; sub
8966dt ; sup ; super
8967
8968gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8969gc ; Cc ; Control
8970gc ; Cn ; Unassigned
8971gc ; Co ; Private_Use
8972gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8973gc ; LC ; Cased_Letter # Ll | Lt | Lu
8974gc ; Ll ; Lowercase_Letter
8975gc ; Lm ; Modifier_Letter
8976gc ; Lo ; Other_Letter
8977gc ; Lu ; Uppercase_Letter
8978gc ; M ; Mark # Mc | Me | Mn
8979gc ; Mc ; Spacing_Mark
8980gc ; Mn ; Nonspacing_Mark
8981gc ; N ; Number # Nd | Nl | No
8982gc ; Nd ; Decimal_Number
8983gc ; No ; Other_Number
8984gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8985gc ; Pd ; Dash_Punctuation
8986gc ; Pe ; Close_Punctuation
8987gc ; Po ; Other_Punctuation
8988gc ; Ps ; Open_Punctuation
8989gc ; S ; Symbol # Sc | Sk | Sm | So
8990gc ; Sc ; Currency_Symbol
8991gc ; Sm ; Math_Symbol
8992gc ; So ; Other_Symbol
8993gc ; Z ; Separator # Zl | Zp | Zs
8994gc ; Zl ; Line_Separator
8995gc ; Zp ; Paragraph_Separator
8996gc ; Zs ; Space_Separator
8997
8998nt ; de ; Decimal
8999nt ; di ; Digit
9000nt ; n/a ; None
9001nt ; nu ; Numeric
9002END
9003
9004 if (-e 'ArabicShaping.txt') {
9005 push @return, split /\n/, <<'END';
9006jg ; n/a ; AIN
9007jg ; n/a ; ALEF
9008jg ; n/a ; DAL
9009jg ; n/a ; GAF
9010jg ; n/a ; LAM
9011jg ; n/a ; MEEM
9012jg ; n/a ; NO_JOINING_GROUP
9013jg ; n/a ; NOON
9014jg ; n/a ; QAF
9015jg ; n/a ; SAD
9016jg ; n/a ; SEEN
9017jg ; n/a ; TAH
9018jg ; n/a ; WAW
9019
9020jt ; C ; Join_Causing
9021jt ; D ; Dual_Joining
9022jt ; L ; Left_Joining
9023jt ; R ; Right_Joining
9024jt ; U ; Non_Joining
9025jt ; T ; Transparent
9026END
9027 if ($v_version ge v3.0.0) {
9028 push @return, split /\n/, <<'END';
9029jg ; n/a ; ALAPH
9030jg ; n/a ; BEH
9031jg ; n/a ; BETH
9032jg ; n/a ; DALATH_RISH
9033jg ; n/a ; E
9034jg ; n/a ; FEH
9035jg ; n/a ; FINAL_SEMKATH
9036jg ; n/a ; GAMAL
9037jg ; n/a ; HAH
9038jg ; n/a ; HAMZA_ON_HEH_GOAL
9039jg ; n/a ; HE
9040jg ; n/a ; HEH
9041jg ; n/a ; HEH_GOAL
9042jg ; n/a ; HETH
9043jg ; n/a ; KAF
9044jg ; n/a ; KAPH
9045jg ; n/a ; KNOTTED_HEH
9046jg ; n/a ; LAMADH
9047jg ; n/a ; MIM
9048jg ; n/a ; NUN
9049jg ; n/a ; PE
9050jg ; n/a ; QAPH
9051jg ; n/a ; REH
9052jg ; n/a ; REVERSED_PE
9053jg ; n/a ; SADHE
9054jg ; n/a ; SEMKATH
9055jg ; n/a ; SHIN
9056jg ; n/a ; SWASH_KAF
9057jg ; n/a ; TAW
9058jg ; n/a ; TEH_MARBUTA
9059jg ; n/a ; TETH
9060jg ; n/a ; YEH
9061jg ; n/a ; YEH_BARREE
9062jg ; n/a ; YEH_WITH_TAIL
9063jg ; n/a ; YUDH
9064jg ; n/a ; YUDH_HE
9065jg ; n/a ; ZAIN
9066END
9067 }
9068 }
9069
9070
9071 if (-e 'EastAsianWidth.txt') {
9072 push @return, split /\n/, <<'END';
9073ea ; A ; Ambiguous
9074ea ; F ; Fullwidth
9075ea ; H ; Halfwidth
9076ea ; N ; Neutral
9077ea ; Na ; Narrow
9078ea ; W ; Wide
9079END
9080 }
9081
9082 if (-e 'LineBreak.txt') {
9083 push @return, split /\n/, <<'END';
9084lb ; AI ; Ambiguous
9085lb ; AL ; Alphabetic
9086lb ; B2 ; Break_Both
9087lb ; BA ; Break_After
9088lb ; BB ; Break_Before
9089lb ; BK ; Mandatory_Break
9090lb ; CB ; Contingent_Break
9091lb ; CL ; Close_Punctuation
9092lb ; CM ; Combining_Mark
9093lb ; CR ; Carriage_Return
9094lb ; EX ; Exclamation
9095lb ; GL ; Glue
9096lb ; HY ; Hyphen
9097lb ; ID ; Ideographic
9098lb ; IN ; Inseperable
9099lb ; IS ; Infix_Numeric
9100lb ; LF ; Line_Feed
9101lb ; NS ; Nonstarter
9102lb ; NU ; Numeric
9103lb ; OP ; Open_Punctuation
9104lb ; PO ; Postfix_Numeric
9105lb ; PR ; Prefix_Numeric
9106lb ; QU ; Quotation
9107lb ; SA ; Complex_Context
9108lb ; SG ; Surrogate
9109lb ; SP ; Space
9110lb ; SY ; Break_Symbols
9111lb ; XX ; Unknown
9112lb ; ZW ; ZWSpace
9113END
9114 }
9115
9116 if (-e 'DNormalizationProps.txt') {
9117 push @return, split /\n/, <<'END';
9118qc ; M ; Maybe
9119qc ; N ; No
9120qc ; Y ; Yes
9121END
9122 }
9123
9124 if (-e 'Scripts.txt') {
9125 push @return, split /\n/, <<'END';
9126sc ; Arab ; Arabic
9127sc ; Armn ; Armenian
9128sc ; Beng ; Bengali
9129sc ; Bopo ; Bopomofo
9130sc ; Cans ; Canadian_Aboriginal
9131sc ; Cher ; Cherokee
9132sc ; Cyrl ; Cyrillic
9133sc ; Deva ; Devanagari
9134sc ; Dsrt ; Deseret
9135sc ; Ethi ; Ethiopic
9136sc ; Geor ; Georgian
9137sc ; Goth ; Gothic
9138sc ; Grek ; Greek
9139sc ; Gujr ; Gujarati
9140sc ; Guru ; Gurmukhi
9141sc ; Hang ; Hangul
9142sc ; Hani ; Han
9143sc ; Hebr ; Hebrew
9144sc ; Hira ; Hiragana
9145sc ; Ital ; Old_Italic
9146sc ; Kana ; Katakana
9147sc ; Khmr ; Khmer
9148sc ; Knda ; Kannada
9149sc ; Laoo ; Lao
9150sc ; Latn ; Latin
9151sc ; Mlym ; Malayalam
9152sc ; Mong ; Mongolian
9153sc ; Mymr ; Myanmar
9154sc ; Ogam ; Ogham
9155sc ; Orya ; Oriya
9156sc ; Qaai ; Inherited
9157sc ; Runr ; Runic
9158sc ; Sinh ; Sinhala
9159sc ; Syrc ; Syriac
9160sc ; Taml ; Tamil
9161sc ; Telu ; Telugu
9162sc ; Thaa ; Thaana
9163sc ; Thai ; Thai
9164sc ; Tibt ; Tibetan
9165sc ; Yiii ; Yi
9166sc ; Zyyy ; Common
9167END
9168 }
9169
9170 if ($v_version ge v2.0.0) {
9171 push @return, split /\n/, <<'END';
9172dt ; com ; compat
9173dt ; nar ; narrow
9174dt ; sml ; small
9175dt ; vert ; vertical
9176dt ; wide ; wide
9177
9178gc ; Cf ; Format
9179gc ; Cs ; Surrogate
9180gc ; Lt ; Titlecase_Letter
9181gc ; Me ; Enclosing_Mark
9182gc ; Nl ; Letter_Number
9183gc ; Pc ; Connector_Punctuation
9184gc ; Sk ; Modifier_Symbol
9185END
9186 }
9187 if ($v_version ge v2.1.2) {
9188 push @return, "bc ; S ; Segment_Separator\n";
9189 }
9190 if ($v_version ge v2.1.5) {
9191 push @return, split /\n/, <<'END';
9192gc ; Pf ; Final_Punctuation
9193gc ; Pi ; Initial_Punctuation
9194END
9195 }
9196 if ($v_version ge v2.1.8) {
9197 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9198 }
9199
9200 if ($v_version ge v3.0.0) {
9201 push @return, split /\n/, <<'END';
9202bc ; AL ; Arabic_Letter
9203bc ; BN ; Boundary_Neutral
9204bc ; LRE ; Left_To_Right_Embedding
9205bc ; LRO ; Left_To_Right_Override
9206bc ; NSM ; Nonspacing_Mark
9207bc ; PDF ; Pop_Directional_Format
9208bc ; RLE ; Right_To_Left_Embedding
9209bc ; RLO ; Right_To_Left_Override
9210
9211ccc; 233; DB ; Double_Below
9212END
9213 }
9214
9215 if ($v_version ge v3.1.0) {
9216 push @return, "ccc; 226; R ; Right\n";
9217 }
9218
9219 return @return;
9220}
9221
b1c167a3
KW
9222sub output_perl_charnames_line ($$) {
9223
9224 # Output the entries in Perl_charnames specially, using 5 digits instead
9225 # of four. This makes the entries a constant length, and simplifies
9226 # charnames.pm which this table is for. Unicode can have 6 digit
9227 # ordinals, but they are all private use or noncharacters which do not
9228 # have names, so won't be in this table.
9229
73d9566f 9230 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
9231}
9232
99870f4d
KW
9233{ # Closure
9234 # This is used to store the range list of all the code points usable when
9235 # the little used $compare_versions feature is enabled.
9236 my $compare_versions_range_list;
9237
9238 sub process_generic_property_file {
9239 # This processes a file containing property mappings and puts them
9240 # into internal map tables. It should be used to handle any property
9241 # files that have mappings from a code point or range thereof to
9242 # something else. This means almost all the UCD .txt files.
9243 # each_line_handlers() should be set to adjust the lines of these
9244 # files, if necessary, to what this routine understands:
9245 #
9246 # 0374 ; NFD_QC; N
9247 # 003C..003E ; Math
9248 #
92f9d56c 9249 # the fields are: "codepoint-range ; property; map"
99870f4d
KW
9250 #
9251 # meaning the codepoints in the range all have the value 'map' under
9252 # 'property'.
98dc9551 9253 # Beginning and trailing white space in each field are not significant.
99870f4d
KW
9254 # Note there is not a trailing semi-colon in the above. A trailing
9255 # semi-colon means the map is a null-string. An omitted map, as
9256 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9257 # table syntax. (This could have been hidden from this routine by
9258 # doing it in the $file object, but that would require parsing of the
9259 # line there, so would have to parse it twice, or change the interface
9260 # to pass this an array. So not done.)
9261 #
9262 # The map field may begin with a sequence of commands that apply to
9263 # this range. Each such command begins and ends with $CMD_DELIM.
9264 # These are used to indicate, for example, that the mapping for a
9265 # range has a non-default type.
9266 #
9267 # This loops through the file, calling it's next_line() method, and
9268 # then taking the map and adding it to the property's table.
9269 # Complications arise because any number of properties can be in the
9270 # file, in any order, interspersed in any way. The first time a
9271 # property is seen, it gets information about that property and
f86864ac 9272 # caches it for quick retrieval later. It also normalizes the maps
5d7f7709
KW
9273 # so that only one of many synonyms is stored. The Unicode input
9274 # files do use some multiple synonyms.
99870f4d
KW
9275
9276 my $file = shift;
9277 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9278
9279 my %property_info; # To keep track of what properties
9280 # have already had entries in the
9281 # current file, and info about each,
9282 # so don't have to recompute.
9283 my $property_name; # property currently being worked on
9284 my $property_type; # and its type
9285 my $previous_property_name = ""; # name from last time through loop
9286 my $property_object; # pointer to the current property's
9287 # object
9288 my $property_addr; # the address of that object
9289 my $default_map; # the string that code points missing
9290 # from the file map to
9291 my $default_table; # For non-string properties, a
9292 # reference to the match table that
9293 # will contain the list of code
9294 # points that map to $default_map.
9295
9296 # Get the next real non-comment line
9297 LINE:
9298 while ($file->next_line) {
9299
9300 # Default replacement type; means that if parts of the range have
9301 # already been stored in our tables, the new map overrides them if
9302 # they differ more than cosmetically
9303 my $replace = $IF_NOT_EQUIVALENT;
9304 my $map_type; # Default type for the map of this range
9305
9306 #local $to_trace = 1 if main::DEBUG;
9307 trace $_ if main::DEBUG && $to_trace;
9308
9309 # Split the line into components
9310 my ($range, $property_name, $map, @remainder)
9311 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9312
9313 # If more or less on the line than we are expecting, warn and skip
9314 # the line
9315 if (@remainder) {
9316 $file->carp_bad_line('Extra fields');
9317 next LINE;
9318 }
9319 elsif ( ! defined $property_name) {
9320 $file->carp_bad_line('Missing property');
9321 next LINE;
9322 }
9323
9324 # Examine the range.
9325 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9326 {
9327 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9328 next LINE;
9329 }
9330 my $low = hex $1;
9331 my $high = (defined $2) ? hex $2 : $low;
9332
9333 # For the very specialized case of comparing two Unicode
9334 # versions...
9335 if (DEBUG && $compare_versions) {
9336 if ($property_name eq 'Age') {
9337
9338 # Only allow code points at least as old as the version
9339 # specified.
9340 my $age = pack "C*", split(/\./, $map); # v string
9341 next LINE if $age gt $compare_versions;
9342 }
9343 else {
9344
9345 # Again, we throw out code points younger than those of
9346 # the specified version. By now, the Age property is
9347 # populated. We use the intersection of each input range
9348 # with this property to find what code points in it are
9349 # valid. To do the intersection, we have to convert the
9350 # Age property map to a Range_list. We only have to do
9351 # this once.
9352 if (! defined $compare_versions_range_list) {
9353 my $age = property_ref('Age');
9354 if (! -e 'DAge.txt') {
9355 croak "Need to have 'DAge.txt' file to do version comparison";
9356 }
9357 elsif ($age->count == 0) {
9358 croak "The 'Age' table is empty, but its file exists";
9359 }
9360 $compare_versions_range_list
9361 = Range_List->new(Initialize => $age);
9362 }
9363
9364 # An undefined map is always 'Y'
9365 $map = 'Y' if ! defined $map;
9366
9367 # Calculate the intersection of the input range with the
9368 # code points that are known in the specified version
9369 my @ranges = ($compare_versions_range_list
9370 & Range->new($low, $high))->ranges;
9371
9372 # If the intersection is empty, throw away this range
9373 next LINE unless @ranges;
9374
9375 # Only examine the first range this time through the loop.
9376 my $this_range = shift @ranges;
9377
9378 # Put any remaining ranges in the queue to be processed
9379 # later. Note that there is unnecessary work here, as we
9380 # will do the intersection again for each of these ranges
9381 # during some future iteration of the LINE loop, but this
9382 # code is not used in production. The later intersections
9383 # are guaranteed to not splinter, so this will not become
9384 # an infinite loop.
9385 my $line = join ';', $property_name, $map;
9386 foreach my $range (@ranges) {
9387 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9388 $range->start,
9389 $range->end,
9390 $line));
9391 }
9392
9393 # And process the first range, like any other.
9394 $low = $this_range->start;
9395 $high = $this_range->end;
9396 }
9397 } # End of $compare_versions
9398
9399 # If changing to a new property, get the things constant per
9400 # property
9401 if ($previous_property_name ne $property_name) {
9402
9403 $property_object = property_ref($property_name);
9404 if (! defined $property_object) {
9405 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9406 next LINE;
9407 }
051df77b 9408 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
9409
9410 # Defer changing names until have a line that is acceptable
9411 # (the 'next' statement above means is unacceptable)
9412 $previous_property_name = $property_name;
9413
9414 # If not the first time for this property, retrieve info about
9415 # it from the cache
9416 if (defined ($property_info{$property_addr}{'type'})) {
9417 $property_type = $property_info{$property_addr}{'type'};
9418 $default_map = $property_info{$property_addr}{'default'};
9419 $map_type
9420 = $property_info{$property_addr}{'pseudo_map_type'};
9421 $default_table
9422 = $property_info{$property_addr}{'default_table'};
9423 }
9424 else {
9425
9426 # Here, is the first time for this property. Set up the
9427 # cache.
9428 $property_type = $property_info{$property_addr}{'type'}
9429 = $property_object->type;
9430 $map_type
9431 = $property_info{$property_addr}{'pseudo_map_type'}
9432 = $property_object->pseudo_map_type;
9433
9434 # The Unicode files are set up so that if the map is not
9435 # defined, it is a binary property
9436 if (! defined $map && $property_type != $BINARY) {
9437 if ($property_type != $UNKNOWN
9438 && $property_type != $NON_STRING)
9439 {
9440 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9441 }
9442 else {
9443 $property_object->set_type($BINARY);
9444 $property_type
9445 = $property_info{$property_addr}{'type'}
9446 = $BINARY;
9447 }
9448 }
9449
9450 # Get any @missings default for this property. This
9451 # should precede the first entry for the property in the
9452 # input file, and is located in a comment that has been
9453 # stored by the Input_file class until we access it here.
9454 # It's possible that there is more than one such line
9455 # waiting for us; collect them all, and parse
9456 my @missings_list = $file->get_missings
9457 if $file->has_missings_defaults;
9458 foreach my $default_ref (@missings_list) {
9459 my $default = $default_ref->[0];
ffe43484 9460 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
9461
9462 # For string properties, the default is just what the
9463 # file says, but non-string properties should already
9464 # have set up a table for the default property value;
9465 # use the table for these, so can resolve synonyms
9466 # later to a single standard one.
9467 if ($property_type == $STRING
9468 || $property_type == $UNKNOWN)
9469 {
9470 $property_info{$addr}{'missings'} = $default;
9471 }
9472 else {
9473 $property_info{$addr}{'missings'}
9474 = $property_object->table($default);
9475 }
9476 }
9477
9478 # Finished storing all the @missings defaults in the input
9479 # file so far. Get the one for the current property.
9480 my $missings = $property_info{$property_addr}{'missings'};
9481
9482 # But we likely have separately stored what the default
9483 # should be. (This is to accommodate versions of the
9484 # standard where the @missings lines are absent or
9485 # incomplete.) Hopefully the two will match. But check
9486 # it out.
9487 $default_map = $property_object->default_map;
9488
9489 # If the map is a ref, it means that the default won't be
9490 # processed until later, so undef it, so next few lines
9491 # will redefine it to something that nothing will match
9492 undef $default_map if ref $default_map;
9493
9494 # Create a $default_map if don't have one; maybe a dummy
9495 # that won't match anything.
9496 if (! defined $default_map) {
9497
9498 # Use any @missings line in the file.
9499 if (defined $missings) {
9500 if (ref $missings) {
9501 $default_map = $missings->full_name;
9502 $default_table = $missings;
9503 }
9504 else {
9505 $default_map = $missings;
9506 }
678f13d5 9507
99870f4d
KW
9508 # And store it with the property for outside use.
9509 $property_object->set_default_map($default_map);
9510 }
9511 else {
9512
9513 # Neither an @missings nor a default map. Create
9514 # a dummy one, so won't have to test definedness
9515 # in the main loop.
9516 $default_map = '_Perl This will never be in a file
9517 from Unicode';
9518 }
9519 }
9520
9521 # Here, we have $default_map defined, possibly in terms of
9522 # $missings, but maybe not, and possibly is a dummy one.
9523 if (defined $missings) {
9524
9525 # Make sure there is no conflict between the two.
9526 # $missings has priority.
9527 if (ref $missings) {
23e33b60
KW
9528 $default_table
9529 = $property_object->table($default_map);
99870f4d
KW
9530 if (! defined $default_table
9531 || $default_table != $missings)
9532 {
9533 if (! defined $default_table) {
9534 $default_table = $UNDEF;
9535 }
9536 $file->carp_bad_line(<<END
9537The \@missings line for $property_name in $file says that missings default to
9538$missings, but we expect it to be $default_table. $missings used.
9539END
9540 );
9541 $default_table = $missings;
9542 $default_map = $missings->full_name;
9543 }
9544 $property_info{$property_addr}{'default_table'}
9545 = $default_table;
9546 }
9547 elsif ($default_map ne $missings) {
9548 $file->carp_bad_line(<<END
9549The \@missings line for $property_name in $file says that missings default to
9550$missings, but we expect it to be $default_map. $missings used.
9551END
9552 );
9553 $default_map = $missings;
9554 }
9555 }
9556
9557 $property_info{$property_addr}{'default'}
9558 = $default_map;
9559
9560 # If haven't done so already, find the table corresponding
9561 # to this map for non-string properties.
9562 if (! defined $default_table
9563 && $property_type != $STRING
9564 && $property_type != $UNKNOWN)
9565 {
9566 $default_table = $property_info{$property_addr}
9567 {'default_table'}
9568 = $property_object->table($default_map);
9569 }
9570 } # End of is first time for this property
9571 } # End of switching properties.
9572
9573 # Ready to process the line.
9574 # The Unicode files are set up so that if the map is not defined,
9575 # it is a binary property with value 'Y'
9576 if (! defined $map) {
9577 $map = 'Y';
9578 }
9579 else {
9580
9581 # If the map begins with a special command to us (enclosed in
9582 # delimiters), extract the command(s).
a35d7f90
KW
9583 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9584 my $command = $1;
9585 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9586 $replace = $1;
99870f4d 9587 }
a35d7f90
KW
9588 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9589 $map_type = $1;
9590 }
9591 else {
9592 $file->carp_bad_line("Unknown command line: '$1'");
9593 next LINE;
9594 }
9595 }
99870f4d
KW
9596 }
9597
9598 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9599 {
9600
9601 # Here, we have a map to a particular code point, and the
9602 # default map is to a code point itself. If the range
9603 # includes the particular code point, change that portion of
9604 # the range to the default. This makes sure that in the final
9605 # table only the non-defaults are listed.
9606 my $decimal_map = hex $map;
9607 if ($low <= $decimal_map && $decimal_map <= $high) {
9608
9609 # If the range includes stuff before or after the map
9610 # we're changing, split it and process the split-off parts
9611 # later.
9612 if ($low < $decimal_map) {
9613 $file->insert_adjusted_lines(
9614 sprintf("%04X..%04X; %s; %s",
9615 $low,
9616 $decimal_map - 1,
9617 $property_name,
9618 $map));
9619 }
9620 if ($high > $decimal_map) {
9621 $file->insert_adjusted_lines(
9622 sprintf("%04X..%04X; %s; %s",
9623 $decimal_map + 1,
9624 $high,
9625 $property_name,
9626 $map));
9627 }
9628 $low = $high = $decimal_map;
9629 $map = $CODE_POINT;
9630 }
9631 }
9632
9633 # If we can tell that this is a synonym for the default map, use
9634 # the default one instead.
9635 if ($property_type != $STRING
9636 && $property_type != $UNKNOWN)
9637 {
9638 my $table = $property_object->table($map);
9639 if (defined $table && $table == $default_table) {
9640 $map = $default_map;
9641 }
9642 }
9643
9644 # And figure out the map type if not known.
9645 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9646 if ($map eq "") { # Nulls are always $NULL map type
9647 $map_type = $NULL;
9648 } # Otherwise, non-strings, and those that don't allow
9649 # $MULTI_CP, and those that aren't multiple code points are
9650 # 0
9651 elsif
9652 (($property_type != $STRING && $property_type != $UNKNOWN)
9653 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9654 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9655 {
9656 $map_type = 0;
9657 }
9658 else {
9659 $map_type = $MULTI_CP;
9660 }
9661 }
9662
9663 $property_object->add_map($low, $high,
9664 $map,
9665 Type => $map_type,
9666 Replace => $replace);
9667 } # End of loop through file's lines
9668
9669 return;
9670 }
9671}
9672
99870f4d
KW
9673{ # Closure for UnicodeData.txt handling
9674
9675 # This file was the first one in the UCD; its design leads to some
9676 # awkwardness in processing. Here is a sample line:
9677 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9678 # The fields in order are:
9679 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9680 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9681 my $CATEGORY = $i++; # category (e.g. "Lu")
9682 my $CCC = $i++; # Canonical combining class (e.g. "230")
9683 my $BIDI = $i++; # directional class (e.g. "L")
9684 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9685 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9686 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9687 # Dual-use in this program; see below
9688 my $NUMERIC = $i++; # numeric value
9689 my $MIRRORED = $i++; # ? mirrored
9690 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9691 my $COMMENT = $i++; # iso comment
9692 my $UPPER = $i++; # simple uppercase mapping
9693 my $LOWER = $i++; # simple lowercase mapping
9694 my $TITLE = $i++; # simple titlecase mapping
9695 my $input_field_count = $i;
9696
9697 # This routine in addition outputs these extra fields:
9698 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9699
9700 # These fields are modifications of ones above, and are usually
9701 # suppressed; they must come last, as for speed, the loop upper bound is
9702 # normally set to ignore them
9703 my $NAME = $i++; # This is the strict name field, not the one that
9704 # charnames uses.
9705 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9706 # by Unicode::Normalize
99870f4d
KW
9707 my $last_field = $i - 1;
9708
9709 # All these are read into an array for each line, with the indices defined
9710 # above. The empty fields in the example line above indicate that the
9711 # value is defaulted. The handler called for each line of the input
9712 # changes these to their defaults.
9713
9714 # Here are the official names of the properties, in a parallel array:
9715 my @field_names;
9716 $field_names[$BIDI] = 'Bidi_Class';
9717 $field_names[$CATEGORY] = 'General_Category';
9718 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9719 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9720 $field_names[$COMMENT] = 'ISO_Comment';
9721 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9722 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9723 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9724 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9725 $field_names[$NAME] = 'Name';
9726 $field_names[$NUMERIC] = 'Numeric_Value';
9727 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9728 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9729 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9730 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9731 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9732 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9733
28093d0e
KW
9734 # Some of these need a little more explanation:
9735 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9736 # property, but is used in calculating the Numeric_Type. Perl however,
9737 # creates a file from this field, so a Perl property is created from it.
9738 # Similarly, the Other_Digit field is used only for calculating the
9739 # Numeric_Type, and so it can be safely re-used as the place to store
9740 # the value for Numeric_Type; hence it is referred to as
9741 # $NUMERIC_TYPE_OTHER_DIGIT.
9742 # The input field named $PERL_DECOMPOSITION is a combination of both the
9743 # decomposition mapping and its type. Perl creates a file containing
9744 # exactly this field, so it is used for that. The two properties are
9745 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9746 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9747 # output it), as Perl doesn't use it directly.
9748 # The input field named here $CHARNAME is used to construct the
9749 # Perl_Charnames property, which is a combination of the Name property
9750 # (which the input field contains), and the Unicode_1_Name property, and
9751 # others from other files. Since, the strict Name property is not used
9752 # by Perl, this field is used for the table that Perl does use. The
9753 # strict Name property table is usually suppressed (unless the lists are
9754 # changed to output it), so it is accumulated in a separate field,
9755 # $NAME, which to save time is discarded unless the table is actually to
9756 # be output
99870f4d
KW
9757
9758 # This file is processed like most in this program. Control is passed to
9759 # process_generic_property_file() which calls filter_UnicodeData_line()
9760 # for each input line. This filter converts the input into line(s) that
9761 # process_generic_property_file() understands. There is also a setup
9762 # routine called before any of the file is processed, and a handler for
9763 # EOF processing, all in this closure.
9764
9765 # A huge speed-up occurred at the cost of some added complexity when these
9766 # routines were altered to buffer the outputs into ranges. Almost all the
9767 # lines of the input file apply to just one code point, and for most
9768 # properties, the map for the next code point up is the same as the
9769 # current one. So instead of creating a line for each property for each
9770 # input line, filter_UnicodeData_line() remembers what the previous map
9771 # of a property was, and doesn't generate a line to pass on until it has
9772 # to, as when the map changes; and that passed-on line encompasses the
9773 # whole contiguous range of code points that have the same map for that
9774 # property. This means a slight amount of extra setup, and having to
9775 # flush these buffers on EOF, testing if the maps have changed, plus
9776 # remembering state information in the closure. But it means a lot less
9777 # real time in not having to change the data base for each property on
9778 # each line.
9779
9780 # Another complication is that there are already a few ranges designated
9781 # in the input. There are two lines for each, with the same maps except
9782 # the code point and name on each line. This was actually the hardest
9783 # thing to design around. The code points in those ranges may actually
9784 # have real maps not given by these two lines. These maps will either
98dc9551 9785 # be algorithmically determinable, or in the extracted files furnished
99870f4d
KW
9786 # with the UCD. In the event of conflicts between these extracted files,
9787 # and this one, Unicode says that this one prevails. But it shouldn't
9788 # prevail for conflicts that occur in these ranges. The data from the
9789 # extracted files prevails in those cases. So, this program is structured
9790 # so that those files are processed first, storing maps. Then the other
9791 # files are processed, generally overwriting what the extracted files
9792 # stored. But just the range lines in this input file are processed
9793 # without overwriting. This is accomplished by adding a special string to
9794 # the lines output to tell process_generic_property_file() to turn off the
9795 # overwriting for just this one line.
9796 # A similar mechanism is used to tell it that the map is of a non-default
9797 # type.
9798
9799 sub setup_UnicodeData { # Called before any lines of the input are read
9800 my $file = shift;
9801 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9802
28093d0e
KW
9803 # Create a new property specially located that is a combination of the
9804 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9805 # Name_Alias properties. (The final duplicates elements of the
9806 # first.) A comment for it will later be constructed based on the
9807 # actual properties present and used
3e20195b 9808 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9809 Core_Access => '\N{...} and "use charnames"',
9810 Default_Map => "",
9811 Directory => File::Spec->curdir(),
9812 File => 'Name',
9813 Internal_Only_Warning => 1,
9814 Perl_Extension => 1,
b1c167a3 9815 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9816 Type => $STRING,
9817 );
9818
99870f4d 9819 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9820 Directory => File::Spec->curdir(),
99870f4d 9821 File => 'Decomposition',
a14f3cb1 9822 Format => $DECOMP_STRING_FORMAT,
99870f4d
KW
9823 Internal_Only_Warning => 1,
9824 Perl_Extension => 1,
9825 Default_Map => $CODE_POINT,
9826
0c07e538
KW
9827 # normalize.pm can't cope with these
9828 Output_Range_Counts => 0,
9829
99870f4d
KW
9830 # This is a specially formatted table
9831 # explicitly for normalize.pm, which
9832 # is expecting a particular format,
9833 # which means that mappings containing
9834 # multiple code points are in the main
9835 # body of the table
9836 Map_Type => $COMPUTE_NO_MULTI_CP,
9837 Type => $STRING,
9838 );
9839 $Perl_decomp->add_comment(join_lines(<<END
9840This mapping is a combination of the Unicode 'Decomposition_Type' and
9841'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9842identical to the official Unicode 'Decomposition_Mapping' property except for
9843two things:
9844 1) It omits the algorithmically determinable Hangul syllable decompositions,
9845which normalize.pm handles algorithmically.
9846 2) It contains the decomposition type as well. Non-canonical decompositions
9847begin with a word in angle brackets, like <super>, which denotes the
9848compatible decomposition type. If the map does not begin with the <angle
9849brackets>, the decomposition is canonical.
9850END
9851 ));
9852
9853 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9854 Default_Map => "",
9855 Perl_Extension => 1,
9856 File => 'Digit', # Trad. location
9857 Directory => $map_directory,
9858 Type => $STRING,
9859 Range_Size_1 => 1,
9860 );
9861 $Decimal_Digit->add_comment(join_lines(<<END
9862This file gives the mapping of all code points which represent a single
9863decimal digit [0-9] to their respective digits. For example, the code point
9864U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9865that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9866numerals.
9867END
9868 ));
9869
28093d0e
KW
9870 # These properties are not used for generating anything else, and are
9871 # usually not output. By making them last in the list, we can just
99870f4d 9872 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
9873 # generating a table(s) that is/are just going to get thrown away.
9874 if (! property_ref('Decomposition_Mapping')->to_output_map
9875 && ! property_ref('Name')->to_output_map)
9876 {
9877 $last_field = min($NAME, $DECOMP_MAP) - 1;
9878 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9879 $last_field = $DECOMP_MAP;
9880 } elsif (property_ref('Name')->to_output_map) {
9881 $last_field = $NAME;
99870f4d
KW
9882 }
9883 return;
9884 }
9885
9886 my $first_time = 1; # ? Is this the first line of the file
9887 my $in_range = 0; # ? Are we in one of the file's ranges
9888 my $previous_cp; # hex code point of previous line
9889 my $decimal_previous_cp = -1; # And its decimal equivalent
9890 my @start; # For each field, the current starting
9891 # code point in hex for the range
9892 # being accumulated.
9893 my @fields; # The input fields;
9894 my @previous_fields; # And those from the previous call
9895
9896 sub filter_UnicodeData_line {
9897 # Handle a single input line from UnicodeData.txt; see comments above
9898 # Conceptually this takes a single line from the file containing N
9899 # properties, and converts it into N lines with one property per line,
9900 # which is what the final handler expects. But there are
9901 # complications due to the quirkiness of the input file, and to save
9902 # time, it accumulates ranges where the property values don't change
9903 # and only emits lines when necessary. This is about an order of
9904 # magnitude fewer lines emitted.
9905
9906 my $file = shift;
9907 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9908
9909 # $_ contains the input line.
9910 # -1 in split means retain trailing null fields
9911 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9912
9913 #local $to_trace = 1 if main::DEBUG;
9914 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9915 if (@fields > $input_field_count) {
9916 $file->carp_bad_line('Extra fields');
9917 $_ = "";
9918 return;
9919 }
9920
9921 my $decimal_cp = hex $cp;
9922
9923 # We have to output all the buffered ranges when the next code point
9924 # is not exactly one after the previous one, which means there is a
9925 # gap in the ranges.
9926 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9927
9928 # The decomposition mapping field requires special handling. It looks
9929 # like either:
9930 #
9931 # <compat> 0032 0020
9932 # 0041 0300
9933 #
9934 # The decomposition type is enclosed in <brackets>; if missing, it
9935 # means the type is canonical. There are two decomposition mapping
9936 # tables: the one for use by Perl's normalize.pm has a special format
9937 # which is this field intact; the other, for general use is of
9938 # standard format. In either case we have to find the decomposition
9939 # type. Empty fields have None as their type, and map to the code
9940 # point itself
9941 if ($fields[$PERL_DECOMPOSITION] eq "") {
9942 $fields[$DECOMP_TYPE] = 'None';
9943 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9944 }
9945 else {
9946 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9947 =~ / < ( .+? ) > \s* ( .+ ) /x;
9948 if (! defined $fields[$DECOMP_TYPE]) {
9949 $fields[$DECOMP_TYPE] = 'Canonical';
9950 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9951 }
9952 else {
9953 $fields[$DECOMP_MAP] = $map;
9954 }
9955 }
9956
9957 # The 3 numeric fields also require special handling. The 2 digit
9958 # fields must be either empty or match the number field. This means
9959 # that if it is empty, they must be as well, and the numeric type is
9960 # None, and the numeric value is 'Nan'.
9961 # The decimal digit field must be empty or match the other digit
9962 # field. If the decimal digit field is non-empty, the code point is
9963 # a decimal digit, and the other two fields will have the same value.
9964 # If it is empty, but the other digit field is non-empty, the code
9965 # point is an 'other digit', and the number field will have the same
9966 # value as the other digit field. If the other digit field is empty,
9967 # but the number field is non-empty, the code point is a generic
9968 # numeric type.
9969 if ($fields[$NUMERIC] eq "") {
9970 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9971 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9972 ) {
9973 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9974 }
9975 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9976 $fields[$NUMERIC] = 'NaN';
9977 }
9978 else {
9979 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9980 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9981 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9982 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9983 }
9984 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9985 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9986 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9987 }
9988 else {
9989 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9990
9991 # Rationals require extra effort.
9992 register_fraction($fields[$NUMERIC])
9993 if $fields[$NUMERIC] =~ qr{/};
9994 }
9995 }
9996
9997 # For the properties that have empty fields in the file, and which
9998 # mean something different from empty, change them to that default.
9999 # Certain fields just haven't been empty so far in any Unicode
10000 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10001 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 10002 # the defaults; which are very unlikely to ever change.
99870f4d
KW
10003 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10004 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10005
10006 # UAX44 says that if title is empty, it is the same as whatever upper
10007 # is,
10008 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10009
10010 # There are a few pairs of lines like:
10011 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10012 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10013 # that define ranges. These should be processed after the fields are
10014 # adjusted above, as they may override some of them; but mostly what
28093d0e 10015 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
10016 # paired lines start with a '<', but this is also true of '<control>,
10017 # which isn't one of these special ones.
28093d0e 10018 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
10019
10020 # Some code points in this file have the pseudo-name
10021 # '<control>', but the official name for such ones is the null
28093d0e 10022 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 10023 $fields[$NAME] = "";
28093d0e 10024 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
10025
10026 # We had better not be in between range lines.
10027 if ($in_range) {
28093d0e 10028 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10029 $in_range = 0;
10030 }
10031 }
28093d0e 10032 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
10033
10034 # Here is a non-range line. We had better not be in between range
10035 # lines.
10036 if ($in_range) {
28093d0e 10037 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10038 $in_range = 0;
10039 }
edb80b88
KW
10040 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10041
10042 # These are code points whose names end in their code points,
10043 # which means the names are algorithmically derivable from the
10044 # code points. To shorten the output Name file, the algorithm
10045 # for deriving these is placed in the file instead of each
10046 # code point, so they have map type $CP_IN_NAME
10047 $fields[$CHARNAME] = $CMD_DELIM
10048 . $MAP_TYPE_CMD
10049 . '='
10050 . $CP_IN_NAME
10051 . $CMD_DELIM
10052 . $fields[$CHARNAME];
10053 }
28093d0e 10054 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 10055 }
28093d0e
KW
10056 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10057 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
10058
10059 # Here we are at the beginning of a range pair.
10060 if ($in_range) {
28093d0e 10061 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10062 }
10063 $in_range = 1;
10064
10065 # Because the properties in the range do not overwrite any already
10066 # in the db, we must flush the buffers of what's already there, so
10067 # they get handled in the normal scheme.
10068 $force_output = 1;
10069
10070 }
28093d0e
KW
10071 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10072 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
10073 $_ = "";
10074 return;
10075 }
10076 else { # Here, we are at the last line of a range pair.
10077
10078 if (! $in_range) {
28093d0e 10079 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
10080 $_ = "";
10081 return;
10082 }
10083 $in_range = 0;
10084
28093d0e
KW
10085 $fields[$NAME] = $fields[$CHARNAME];
10086
99870f4d
KW
10087 # Check that the input is valid: that the closing of the range is
10088 # the same as the beginning.
10089 foreach my $i (0 .. $last_field) {
10090 next if $fields[$i] eq $previous_fields[$i];
10091 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10092 }
10093
10094 # The processing differs depending on the type of range,
28093d0e
KW
10095 # determined by its $CHARNAME
10096 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
10097
10098 # Check that the data looks right.
10099 if ($decimal_previous_cp != $SBase) {
10100 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10101 }
10102 if ($decimal_cp != $SBase + $SCount - 1) {
10103 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10104 }
10105
10106 # The Hangul syllable range has a somewhat complicated name
10107 # generation algorithm. Each code point in it has a canonical
10108 # decomposition also computable by an algorithm. The
10109 # perl decomposition map table built from these is used only
10110 # by normalize.pm, which has the algorithm built in it, so the
10111 # decomposition maps are not needed, and are large, so are
10112 # omitted from it. If the full decomposition map table is to
10113 # be output, the decompositions are generated for it, in the
10114 # EOF handling code for this input file.
10115
10116 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10117
10118 # This range is stored in our internal structure with its
10119 # own map type, different from all others.
28093d0e
KW
10120 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10121 = $CMD_DELIM
99870f4d
KW
10122 . $MAP_TYPE_CMD
10123 . '='
10124 . $HANGUL_SYLLABLE
10125 . $CMD_DELIM
28093d0e 10126 . $fields[$CHARNAME];
99870f4d 10127 }
28093d0e 10128 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
10129
10130 # The name for these contains the code point itself, and all
10131 # are defined to have the same base name, regardless of what
10132 # is in the file. They are stored in our internal structure
10133 # with a map type of $CP_IN_NAME
28093d0e
KW
10134 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10135 = $CMD_DELIM
99870f4d
KW
10136 . $MAP_TYPE_CMD
10137 . '='
10138 . $CP_IN_NAME
10139 . $CMD_DELIM
10140 . 'CJK UNIFIED IDEOGRAPH';
10141
10142 }
10143 elsif ($fields[$CATEGORY] eq 'Co'
10144 || $fields[$CATEGORY] eq 'Cs')
10145 {
10146 # The names of all the code points in these ranges are set to
10147 # null, as there are no names for the private use and
10148 # surrogate code points.
10149
28093d0e 10150 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
10151 }
10152 else {
28093d0e 10153 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
10154 }
10155
10156 # The first line of the range caused everything else to be output,
10157 # and then its values were stored as the beginning values for the
10158 # next set of ranges, which this one ends. Now, for each value,
10159 # add a command to tell the handler that these values should not
10160 # replace any existing ones in our database.
10161 foreach my $i (0 .. $last_field) {
10162 $previous_fields[$i] = $CMD_DELIM
10163 . $REPLACE_CMD
10164 . '='
10165 . $NO
10166 . $CMD_DELIM
10167 . $previous_fields[$i];
10168 }
10169
10170 # And change things so it looks like the entire range has been
10171 # gone through with this being the final part of it. Adding the
10172 # command above to each field will cause this range to be flushed
10173 # during the next iteration, as it guaranteed that the stored
10174 # field won't match whatever value the next one has.
10175 $previous_cp = $cp;
10176 $decimal_previous_cp = $decimal_cp;
10177
10178 # We are now set up for the next iteration; so skip the remaining
10179 # code in this subroutine that does the same thing, but doesn't
10180 # know about these ranges.
10181 $_ = "";
c1739a4a 10182
99870f4d
KW
10183 return;
10184 }
10185
10186 # On the very first line, we fake it so the code below thinks there is
10187 # nothing to output, and initialize so that when it does get output it
10188 # uses the first line's values for the lowest part of the range.
10189 # (One could avoid this by using peek(), but then one would need to
10190 # know the adjustments done above and do the same ones in the setup
10191 # routine; not worth it)
10192 if ($first_time) {
10193 $first_time = 0;
10194 @previous_fields = @fields;
10195 @start = ($cp) x scalar @fields;
10196 $decimal_previous_cp = $decimal_cp - 1;
10197 }
10198
10199 # For each field, output the stored up ranges that this code point
10200 # doesn't fit in. Earlier we figured out if all ranges should be
10201 # terminated because of changing the replace or map type styles, or if
10202 # there is a gap between this new code point and the previous one, and
10203 # that is stored in $force_output. But even if those aren't true, we
10204 # need to output the range if this new code point's value for the
10205 # given property doesn't match the stored range's.
10206 #local $to_trace = 1 if main::DEBUG;
10207 foreach my $i (0 .. $last_field) {
10208 my $field = $fields[$i];
10209 if ($force_output || $field ne $previous_fields[$i]) {
10210
10211 # Flush the buffer of stored values.
10212 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10213
10214 # Start a new range with this code point and its value
10215 $start[$i] = $cp;
10216 $previous_fields[$i] = $field;
10217 }
10218 }
10219
10220 # Set the values for the next time.
10221 $previous_cp = $cp;
10222 $decimal_previous_cp = $decimal_cp;
10223
10224 # The input line has generated whatever adjusted lines are needed, and
10225 # should not be looked at further.
10226 $_ = "";
10227 return;
10228 }
10229
10230 sub EOF_UnicodeData {
10231 # Called upon EOF to flush the buffers, and create the Hangul
10232 # decomposition mappings if needed.
10233
10234 my $file = shift;
10235 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10236
10237 # Flush the buffers.
10238 foreach my $i (1 .. $last_field) {
10239 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10240 }
10241
10242 if (-e 'Jamo.txt') {
10243
10244 # The algorithm is published by Unicode, based on values in
10245 # Jamo.txt, (which should have been processed before this
10246 # subroutine), and the results left in %Jamo
10247 unless (%Jamo) {
10248 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10249 return;
10250 }
10251
10252 # If the full decomposition map table is being output, insert
10253 # into it the Hangul syllable mappings. This is to avoid having
10254 # to publish a subroutine in it to compute them. (which would
10255 # essentially be this code.) This uses the algorithm published by
10256 # Unicode.
10257 if (property_ref('Decomposition_Mapping')->to_output_map) {
10258 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10259 use integer;
10260 my $SIndex = $S - $SBase;
10261 my $L = $LBase + $SIndex / $NCount;
10262 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10263 my $T = $TBase + $SIndex % $TCount;
10264
10265 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10266 my $decomposition = sprintf("%04X %04X", $L, $V);
10267 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10268 $file->insert_adjusted_lines(
10269 sprintf("%04X; Decomposition_Mapping; %s",
10270 $S,
10271 $decomposition));
10272 }
10273 }
10274 }
10275
10276 return;
10277 }
10278
10279 sub filter_v1_ucd {
10280 # Fix UCD lines in version 1. This is probably overkill, but this
10281 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10282 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10283 # removed. This program retains them
10284 # 2) didn't include ranges, which it should have, and which are now
10285 # added in @corrected_lines below. It was hand populated by
10286 # taking the data from Version 2, verified by analyzing
10287 # DAge.txt.
10288 # 3) There is a syntax error in the entry for U+09F8 which could
10289 # cause problems for utf8_heavy, and so is changed. It's
10290 # numeric value was simply a minus sign, without any number.
10291 # (Eventually Unicode changed the code point to non-numeric.)
10292 # 4) The decomposition types often don't match later versions
10293 # exactly, and the whole syntax of that field is different; so
10294 # the syntax is changed as well as the types to their later
10295 # terminology. Otherwise normalize.pm would be very unhappy
10296 # 5) Many ccc classes are different. These are left intact.
10297 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10298 # fields. These are unchanged because it doesn't really cause
10299 # problems for Perl.
10300 # 7) A number of code points, such as controls, don't have their
10301 # Unicode Version 1 Names in this file. These are unchanged.
10302
10303 my @corrected_lines = split /\n/, <<'END';
103044E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
103059FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10306E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10307F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10308F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10309FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10310END
10311
10312 my $file = shift;
10313 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10314
10315 #local $to_trace = 1 if main::DEBUG;
10316 trace $_ if main::DEBUG && $to_trace;
10317
10318 # -1 => retain trailing null fields
10319 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10320
10321 # At the first place that is wrong in the input, insert all the
10322 # corrections, replacing the wrong line.
10323 if ($code_point eq '4E00') {
10324 my @copy = @corrected_lines;
10325 $_ = shift @copy;
10326 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10327
10328 $file->insert_lines(@copy);
10329 }
10330
10331
10332 if ($fields[$NUMERIC] eq '-') {
10333 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10334 }
10335
10336 if ($fields[$PERL_DECOMPOSITION] ne "") {
10337
10338 # Several entries have this change to superscript 2 or 3 in the
10339 # middle. Convert these to the modern version, which is to use
10340 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10341 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10342 # 'HHHH HHHH 00B3 HHHH'.
10343 # It turns out that all of these that don't have another
10344 # decomposition defined at the beginning of the line have the
10345 # <square> decomposition in later releases.
10346 if ($code_point ne '00B2' && $code_point ne '00B3') {
10347 if ($fields[$PERL_DECOMPOSITION]
10348 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10349 {
10350 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10351 $fields[$PERL_DECOMPOSITION] = '<square> '
10352 . $fields[$PERL_DECOMPOSITION];
10353 }
10354 }
10355 }
10356
10357 # If is like '<+circled> 0052 <-circled>', convert to
10358 # '<circled> 0052'
10359 $fields[$PERL_DECOMPOSITION] =~
10360 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10361
10362 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10363 $fields[$PERL_DECOMPOSITION] =~
10364 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10365 or $fields[$PERL_DECOMPOSITION] =~
10366 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10367 or $fields[$PERL_DECOMPOSITION] =~
10368 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10369 or $fields[$PERL_DECOMPOSITION] =~
10370 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10371
10372 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10373 $fields[$PERL_DECOMPOSITION] =~
10374 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10375
10376 # Change names to modern form.
10377 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10378 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10379 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10380 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10381
10382 # One entry has weird braces
10383 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10384 }
10385
10386 $_ = join ';', $code_point, @fields;
10387 trace $_ if main::DEBUG && $to_trace;
10388 return;
10389 }
10390
10391 sub filter_v2_1_5_ucd {
10392 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10393 # columns swapped; These all had mirrored be 'N'. So if the numeric
10394 # column appears to be N, swap it back.
10395
10396 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10397 if ($fields[$NUMERIC] eq 'N') {
10398 $fields[$NUMERIC] = $fields[$MIRRORED];
10399 $fields[$MIRRORED] = 'N';
10400 $_ = join ';', $code_point, @fields;
10401 }
10402 return;
10403 }
3ffed8c2
KW
10404
10405 sub filter_v6_ucd {
10406
c12f2655
KW
10407 # Unicode 6.0 co-opted the name BELL for U+1F514, but we haven't
10408 # accepted that yet to allow for some deprecation cycles.
3ffed8c2 10409
484741e1 10410 return if $_ !~ /^(?:0007|1F514|070F);/;
3ffed8c2
KW
10411
10412 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10413 if ($code_point eq '0007') {
0e429600 10414 $fields[$CHARNAME] = "ALERT";
3ffed8c2 10415 }
484741e1
KW
10416 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10417 # http://www.unicode.org/versions/corrigendum8.html
10418 $fields[$BIDI] = "AL";
10419 }
10914c78 10420 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
3ffed8c2
KW
10421 $fields[$CHARNAME] = "";
10422 }
10423
10424 $_ = join ';', $code_point, @fields;
10425
10426 return;
10427 }
99870f4d
KW
10428} # End closure for UnicodeData
10429
37e2e78e
KW
10430sub process_GCB_test {
10431
10432 my $file = shift;
10433 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10434
10435 while ($file->next_line) {
10436 push @backslash_X_tests, $_;
10437 }
678f13d5 10438
37e2e78e
KW
10439 return;
10440}
10441
99870f4d
KW
10442sub process_NamedSequences {
10443 # NamedSequences.txt entries are just added to an array. Because these
10444 # don't look like the other tables, they have their own handler.
10445 # An example:
10446 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10447 #
10448 # This just adds the sequence to an array for later handling
10449
99870f4d
KW
10450 my $file = shift;
10451 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10452
10453 while ($file->next_line) {
10454 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10455 if (@remainder) {
10456 $file->carp_bad_line(
10457 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10458 next;
10459 }
fb121860
KW
10460
10461 # Note single \t in keeping with special output format of
10462 # Perl_charnames. But it turns out that the code points don't have to
10463 # be 5 digits long, like the rest, based on the internal workings of
10464 # charnames.pm. This could be easily changed for consistency.
10465 push @named_sequences, "$sequence\t$name";
99870f4d
KW
10466 }
10467 return;
10468}
10469
10470{ # Closure
10471
10472 my $first_range;
10473
10474 sub filter_early_ea_lb {
10475 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10476 # third field be the name of the code point, which can be ignored in
10477 # most cases. But it can be meaningful if it marks a range:
10478 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10479 # 3400;W;<CJK Ideograph Extension A, First>
10480 #
10481 # We need to see the First in the example above to know it's a range.
10482 # They did not use the later range syntaxes. This routine changes it
10483 # to use the modern syntax.
10484 # $1 is the Input_file object.
10485
10486 my @fields = split /\s*;\s*/;
10487 if ($fields[2] =~ /^<.*, First>/) {
10488 $first_range = $fields[0];
10489 $_ = "";
10490 }
10491 elsif ($fields[2] =~ /^<.*, Last>/) {
10492 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10493 }
10494 else {
10495 undef $first_range;
10496 $_ = "$fields[0]; $fields[1]";
10497 }
10498
10499 return;
10500 }
10501}
10502
10503sub filter_old_style_arabic_shaping {
10504 # Early versions used a different term for the later one.
10505
10506 my @fields = split /\s*;\s*/;
10507 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10508 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10509 $_ = join ';', @fields;
10510 return;
10511}
10512
10513sub filter_arabic_shaping_line {
10514 # ArabicShaping.txt has entries that look like:
10515 # 062A; TEH; D; BEH
10516 # The field containing 'TEH' is not used. The next field is Joining_Type
10517 # and the last is Joining_Group
10518 # This generates two lines to pass on, one for each property on the input
10519 # line.
10520
10521 my $file = shift;
10522 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10523
10524 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10525
10526 if (@fields > 4) {
10527 $file->carp_bad_line('Extra fields');
10528 $_ = "";
10529 return;
10530 }
10531
10532 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10533 $_ = "$fields[0]; Joining_Type; $fields[2]";
10534
10535 return;
10536}
10537
d3fed3dd
KW
10538{ # Closure
10539 my $lc; # Table for lowercase mapping
10540 my $tc;
10541 my $uc;
10542
6c0259ad
KW
10543 sub setup_special_casing {
10544 # SpecialCasing.txt contains the non-simple case change mappings. The
10545 # simple ones are in UnicodeData.txt, which should already have been
10546 # read in to the full property data structures, so as to initialize
10547 # these with the simple ones. Then the SpecialCasing.txt entries
10548 # overwrite the ones which have different full mappings.
10549
10550 # This routine sees if the simple mappings are to be output, and if
10551 # so, copies what has already been put into the full mapping tables,
10552 # while they still contain only the simple mappings.
10553
10554 # The reason it is done this way is that the simple mappings are
10555 # probably not going to be output, so it saves work to initialize the
10556 # full tables with the simple mappings, and then overwrite those
10557 # relatively few entries in them that have different full mappings,
10558 # and thus skip the simple mapping tables altogether.
10559
c12f2655
KW
10560 # New tables with just the simple mappings that are overridden by the
10561 # full ones are constructed. These are for Unicode::UCD, which
10562 # requires the simple mappings. The Case_Folding table is a combined
10563 # table of both the simple and full mappings, with the full ones being
10564 # in the hash, and the simple ones, even those overridden by the hash,
10565 # being in the base table. That same mechanism could have been
10566 # employed here, except that the docs have said that the generated
10567 # files are usuable directly by programs, so we dare not change the
10568 # format in any way.
10569
6c0259ad
KW
10570 my $file= shift;
10571 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10572
6c0259ad
KW
10573 $lc = property_ref('lc');
10574 $tc = property_ref('tc');
10575 $uc = property_ref('uc');
10576
10577 # For each of the case change mappings...
10578 foreach my $case_table ($lc, $tc, $uc) {
10579 my $case = $case_table->name;
10580 my $full = property_ref($case);
10581 unless (defined $full && ! $full->is_empty) {
10582 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10583 }
10584
10585 # The simple version's name in each mapping merely has an 's' in
10586 # front of the full one's
10587 my $simple = property_ref('s' . $case);
10588 $simple->initialize($full) if $simple->to_output_map();
10589
10590 my $simple_only = Property->new("_s$case",
10591 Type => $STRING,
10592 Default_Map => $CODE_POINT,
10593 Perl_Extension => 1,
10594 Description => "The simple mappings for $case for code points that have full mappings as well");
10595 $simple_only->set_to_output_map($INTERNAL_MAP);
10596 $simple_only->add_comment(join_lines( <<END
d3fed3dd
KW
10597This file is for UCD.pm so that it can construct simple mappings that would
10598otherwise be lost because they are overridden by full mappings.
10599END
6c0259ad
KW
10600 ));
10601 }
99870f4d 10602
6c0259ad
KW
10603 return;
10604 }
99870f4d 10605
6c0259ad
KW
10606 sub filter_special_casing_line {
10607 # Change the format of $_ from SpecialCasing.txt into something that
10608 # the generic handler understands. Each input line contains three
10609 # case mappings. This will generate three lines to pass to the
10610 # generic handler for each of those.
99870f4d 10611
6c0259ad
KW
10612 # The input syntax (after stripping comments and trailing white space
10613 # is like one of the following (with the final two being entries that
10614 # we ignore):
10615 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10616 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10617 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10618 # Note the trailing semi-colon, unlike many of the input files. That
10619 # means that there will be an extra null field generated by the split
99870f4d 10620
6c0259ad
KW
10621 my $file = shift;
10622 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10623
6c0259ad
KW
10624 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10625 # fields
10626
10627 # field #4 is when this mapping is conditional. If any of these get
10628 # implemented, it would be by hard-coding in the casing functions in
10629 # the Perl core, not through tables. But if there is a new condition
10630 # we don't know about, output a warning. We know about all the
10631 # conditions through 6.0
10632 if ($fields[4] ne "") {
10633 my @conditions = split ' ', $fields[4];
10634 if ($conditions[0] ne 'tr' # We know that these languages have
10635 # conditions, and some are multiple
10636 && $conditions[0] ne 'az'
10637 && $conditions[0] ne 'lt'
10638
10639 # And, we know about a single condition Final_Sigma, but
10640 # nothing else.
10641 && ($v_version gt v5.2.0
10642 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10643 {
10644 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10645 }
10646 elsif ($conditions[0] ne 'Final_Sigma') {
99870f4d 10647
6c0259ad
KW
10648 # Don't print out a message for Final_Sigma, because we
10649 # have hard-coded handling for it. (But the standard
10650 # could change what the rule should be, but it wouldn't
10651 # show up here anyway.
99870f4d 10652
6c0259ad 10653 print "# SKIPPING Special Casing: $_\n"
99870f4d 10654 if $verbosity >= $VERBOSE;
6c0259ad
KW
10655 }
10656 $_ = "";
10657 return;
10658 }
10659 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10660 $file->carp_bad_line('Extra fields');
10661 $_ = "";
10662 return;
99870f4d 10663 }
99870f4d 10664
6c0259ad
KW
10665 $_ = "$fields[0]; lc; $fields[1]";
10666 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10667 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
99870f4d 10668
6c0259ad
KW
10669 # Copy any simple case change to the special tables constructed if
10670 # being overridden by a multi-character case change.
10671 if ($fields[1] ne $fields[0]
10672 && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10673 {
10674 $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10675 }
10676 if ($fields[2] ne $fields[0]
10677 && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10678 {
10679 $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10680 }
10681 if ($fields[3] ne $fields[0]
10682 && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10683 {
10684 $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10685 }
d3fed3dd 10686
6c0259ad
KW
10687 return;
10688 }
d3fed3dd 10689}
99870f4d
KW
10690
10691sub filter_old_style_case_folding {
10692 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 10693 # and later style. Different letters were used in the earlier.
99870f4d
KW
10694
10695 my $file = shift;
10696 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10697
10698 my @fields = split /\s*;\s*/;
10699 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10700 $fields[1] = 'I';
10701 }
10702 elsif ($fields[1] eq 'L') {
10703 $fields[1] = 'C'; # L => C always
10704 }
10705 elsif ($fields[1] eq 'E') {
10706 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10707 $fields[1] = 'F'
10708 }
10709 else {
10710 $fields[1] = 'C'
10711 }
10712 }
10713 else {
10714 $file->carp_bad_line("Expecting L or E in second field");
10715 $_ = "";
10716 return;
10717 }
10718 $_ = join("; ", @fields) . ';';
10719 return;
10720}
10721
10722{ # Closure for case folding
10723
10724 # Create the map for simple only if are going to output it, for otherwise
10725 # it takes no part in anything we do.
10726 my $to_output_simple;
10727
99870f4d
KW
10728 sub setup_case_folding($) {
10729 # Read in the case foldings in CaseFolding.txt. This handles both
10730 # simple and full case folding.
10731
10732 $to_output_simple
10733 = property_ref('Simple_Case_Folding')->to_output_map;
10734
10735 return;
10736 }
10737
10738 sub filter_case_folding_line {
10739 # Called for each line in CaseFolding.txt
10740 # Input lines look like:
10741 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10742 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10743 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10744 #
10745 # 'C' means that folding is the same for both simple and full
10746 # 'F' that it is only for full folding
10747 # 'S' that it is only for simple folding
10748 # 'T' is locale-dependent, and ignored
10749 # 'I' is a type of 'F' used in some early releases.
10750 # Note the trailing semi-colon, unlike many of the input files. That
10751 # means that there will be an extra null field generated by the split
10752 # below, which we ignore and hence is not an error.
10753
10754 my $file = shift;
10755 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10756
10757 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10758 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10759 $file->carp_bad_line('Extra fields');
10760 $_ = "";
10761 return;
10762 }
10763
10764 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10765 $_ = "";
10766 return;
10767 }
10768
10769 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
3c099872
KW
10770 # I are all full foldings; S is single-char. For S, there is always
10771 # an F entry, so we must allow multiple values for the same code
10772 # point. Fortunately this table doesn't need further manipulation
10773 # which would preclude using multiple-values. The S is now included
10774 # so that _swash_inversion_hash() is able to construct closures
10775 # without having to worry about F mappings.
10776 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10777 $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
99870f4d
KW
10778 }
10779 else {
10780 $_ = "";
3c099872 10781 $file->carp_bad_line('Expecting C F I S or T in second field');
99870f4d
KW
10782 }
10783
10784 # C and S are simple foldings, but simple case folding is not needed
10785 # unless we explicitly want its map table output.
10786 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10787 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10788 }
10789
99870f4d
KW
10790 return;
10791 }
10792
99870f4d
KW
10793} # End case fold closure
10794
10795sub filter_jamo_line {
10796 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10797 # from this file that is used in generating the Name property for Jamo
10798 # code points. But, it also is used to convert early versions' syntax
10799 # into the modern form. Here are two examples:
10800 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10801 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10802 #
10803 # The input is $_, the output is $_ filtered.
10804
10805 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10806
10807 # Let the caller handle unexpected input. In earlier versions, there was
10808 # a third field which is supposed to be a comment, but did not have a '#'
10809 # before it.
10810 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10811
10812 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10813 # beginning.
10814
10815 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10816 $fields[1] = 'R' if $fields[0] eq '1105';
10817
10818 # Add to structure so can generate Names from it.
10819 my $cp = hex $fields[0];
10820 my $short_name = $fields[1];
10821 $Jamo{$cp} = $short_name;
10822 if ($cp <= $LBase + $LCount) {
10823 $Jamo_L{$short_name} = $cp - $LBase;
10824 }
10825 elsif ($cp <= $VBase + $VCount) {
10826 $Jamo_V{$short_name} = $cp - $VBase;
10827 }
10828 elsif ($cp <= $TBase + $TCount) {
10829 $Jamo_T{$short_name} = $cp - $TBase;
10830 }
10831 else {
10832 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10833 }
10834
10835
10836 # Reassemble using just the first two fields to look like a typical
10837 # property file line
10838 $_ = "$fields[0]; $fields[1]";
10839
10840 return;
10841}
10842
99870f4d
KW
10843sub register_fraction($) {
10844 # This registers the input rational number so that it can be passed on to
10845 # utf8_heavy.pl, both in rational and floating forms.
10846
10847 my $rational = shift;
10848
10849 my $float = eval $rational;
10850 $nv_floating_to_rational{$float} = $rational;
10851 return;
10852}
10853
10854sub filter_numeric_value_line {
10855 # DNumValues contains lines of a different syntax than the typical
10856 # property file:
10857 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10858 #
10859 # This routine transforms $_ containing the anomalous syntax to the
10860 # typical, by filtering out the extra columns, and convert early version
10861 # decimal numbers to strings that look like rational numbers.
10862
10863 my $file = shift;
10864 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10865
10866 # Starting in 5.1, there is a rational field. Just use that, omitting the
10867 # extra columns. Otherwise convert the decimal number in the second field
10868 # to a rational, and omit extraneous columns.
10869 my @fields = split /\s*;\s*/, $_, -1;
10870 my $rational;
10871
10872 if ($v_version ge v5.1.0) {
10873 if (@fields != 4) {
10874 $file->carp_bad_line('Not 4 semi-colon separated fields');
10875 $_ = "";
10876 return;
10877 }
10878 $rational = $fields[3];
10879 $_ = join '; ', @fields[ 0, 3 ];
10880 }
10881 else {
10882
10883 # Here, is an older Unicode file, which has decimal numbers instead of
10884 # rationals in it. Use the fraction to calculate the denominator and
10885 # convert to rational.
10886
10887 if (@fields != 2 && @fields != 3) {
10888 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10889 $_ = "";
10890 return;
10891 }
10892
10893 my $codepoints = $fields[0];
10894 my $decimal = $fields[1];
10895 if ($decimal =~ s/\.0+$//) {
10896
10897 # Anything ending with a decimal followed by nothing but 0's is an
10898 # integer
10899 $_ = "$codepoints; $decimal";
10900 $rational = $decimal;
10901 }
10902 else {
10903
10904 my $denominator;
10905 if ($decimal =~ /\.50*$/) {
10906 $denominator = 2;
10907 }
10908
10909 # Here have the hardcoded repeating decimals in the fraction, and
10910 # the denominator they imply. There were only a few denominators
10911 # in the older Unicode versions of this file which this code
10912 # handles, so it is easy to convert them.
10913
10914 # The 4 is because of a round-off error in the Unicode 3.2 files
10915 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10916 $denominator = 3;
10917 }
10918 elsif ($decimal =~ /\.[27]50*$/) {
10919 $denominator = 4;
10920 }
10921 elsif ($decimal =~ /\.[2468]0*$/) {
10922 $denominator = 5;
10923 }
10924 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10925 $denominator = 6;
10926 }
10927 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10928 $denominator = 8;
10929 }
10930 if ($denominator) {
10931 my $sign = ($decimal < 0) ? "-" : "";
10932 my $numerator = int((abs($decimal) * $denominator) + .5);
10933 $rational = "$sign$numerator/$denominator";
10934 $_ = "$codepoints; $rational";
10935 }
10936 else {
10937 $file->carp_bad_line("Can't cope with number '$decimal'.");
10938 $_ = "";
10939 return;
10940 }
10941 }
10942 }
10943
10944 register_fraction($rational) if $rational =~ qr{/};
10945 return;
10946}
10947
10948{ # Closure
10949 my %unihan_properties;
10950 my $iicore;
10951
10952
10953 sub setup_unihan {
10954 # Do any special setup for Unihan properties.
10955
10956 # This property gives the wrong computed type, so override.
10957 my $usource = property_ref('kIRG_USource');
10958 $usource->set_type($STRING) if defined $usource;
10959
b2abbe5b
KW
10960 # This property is to be considered binary (it says so in
10961 # http://www.unicode.org/reports/tr38/)
99870f4d
KW
10962 $iicore = property_ref('kIICore');
10963 if (defined $iicore) {
b2abbe5b 10964 $iicore->set_type($BINARY);
99870f4d
KW
10965
10966 # We have to change the default map, because the @missing line is
10967 # misleading, given that we are treating it as binary.
10968 $iicore->set_default_map('N');
cc6d1d88
KW
10969 $iicore->table("Y")
10970 ->add_note("Converted to a binary property as per unicode.org UAX #38.");
99870f4d
KW
10971 }
10972
10973 return;
10974 }
10975
10976 sub filter_unihan_line {
10977 # Change unihan db lines to look like the others in the db. Here is
10978 # an input sample:
10979 # U+341C kCangjie IEKN
10980
10981 # Tabs are used instead of semi-colons to separate fields; therefore
10982 # they may have semi-colons embedded in them. Change these to periods
10983 # so won't screw up the rest of the code.
10984 s/;/./g;
10985
10986 # Remove lines that don't look like ones we accept.
10987 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10988 $_ = "";
10989 return;
10990 }
10991
10992 # Extract the property, and save a reference to its object.
10993 my $property = $1;
10994 if (! exists $unihan_properties{$property}) {
10995 $unihan_properties{$property} = property_ref($property);
10996 }
10997
10998 # Don't do anything unless the property is one we're handling, which
10999 # we determine by seeing if there is an object defined for it or not
11000 if (! defined $unihan_properties{$property}) {
11001 $_ = "";
11002 return;
11003 }
11004
11005 # The iicore property is supposed to be a boolean, so convert to our
11006 # standard boolean form.
11007 if (defined $iicore && $unihan_properties{$property} == $iicore) {
11008 $_ =~ s/$property.*/$property\tY/
11009 }
11010
11011 # Convert the tab separators to our standard semi-colons, and convert
11012 # the U+HHHH notation to the rest of the standard's HHHH
11013 s/\t/;/g;
11014 s/\b U \+ (?= $code_point_re )//xg;
11015
11016 #local $to_trace = 1 if main::DEBUG;
11017 trace $_ if main::DEBUG && $to_trace;
11018
11019 return;
11020 }
11021}
11022
11023sub filter_blocks_lines {
11024 # In the Blocks.txt file, the names of the blocks don't quite match the
11025 # names given in PropertyValueAliases.txt, so this changes them so they
11026 # do match: Blanks and hyphens are changed into underscores. Also makes
11027 # early release versions look like later ones
11028 #
11029 # $_ is transformed to the correct value.
11030
11031 my $file = shift;
11032 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11033
11034 if ($v_version lt v3.2.0) {
11035 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11036 $_ = "";
11037 return;
11038 }
11039
11040 # Old versions used a different syntax to mark the range.
11041 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11042 }
11043
11044 my @fields = split /\s*;\s*/, $_, -1;
11045 if (@fields != 2) {
11046 $file->carp_bad_line("Expecting exactly two fields");
11047 $_ = "";
11048 return;
11049 }
11050
11051 # Change hyphens and blanks in the block name field only
11052 $fields[1] =~ s/[ -]/_/g;
11053 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11054
11055 $_ = join("; ", @fields);
11056 return;
11057}
11058
11059{ # Closure
11060 my $current_property;
11061
11062 sub filter_old_style_proplist {
11063 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11064 # was in a completely different syntax. Ken Whistler of Unicode says
11065 # that it was something he used as an aid for his own purposes, but
11066 # was never an official part of the standard. However, comments in
11067 # DAge.txt indicate that non-character code points were available in
11068 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11069 # there except through this file (but on the other hand, they first
11070 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11071 # not. But the claim is that it was published as an aid to others who
11072 # might want some more information than was given in the official UCD
11073 # of the time. Many of the properties in it were incorporated into
11074 # the later PropList.txt, but some were not. This program uses this
11075 # early file to generate property tables that are otherwise not
11076 # accessible in the early UCD's, and most were probably not really
11077 # official at that time, so one could argue that it should be ignored,
11078 # and you can easily modify things to skip this. And there are bugs
11079 # in this file in various versions. (For example, the 2.1.9 version
11080 # removes from Alphabetic the CJK range starting at 4E00, and they
11081 # weren't added back in until 3.1.0.) Many of this file's properties
11082 # were later sanctioned, so this code generates tables for those
11083 # properties that aren't otherwise in the UCD of the time but
11084 # eventually did become official, and throws away the rest. Here is a
11085 # list of all the ones that are thrown away:
11086 # Bidi=* duplicates UnicodeData.txt
11087 # Combining never made into official property;
11088 # is \P{ccc=0}
11089 # Composite never made into official property.
11090 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11091 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11092 # Delimiter never made into official property;
11093 # removed in 3.0.1
11094 # Format Control never made into official property;
11095 # similar to gc=cf
11096 # High Surrogate duplicates Blocks.txt
11097 # Ignorable Control never made into official property;
11098 # similar to di=y
11099 # ISO Control duplicates UnicodeData.txt: gc=cc
11100 # Left of Pair never made into official property;
11101 # Line Separator duplicates UnicodeData.txt: gc=zl
11102 # Low Surrogate duplicates Blocks.txt
11103 # Non-break was actually listed as a property
11104 # in 3.2, but without any code
11105 # points. Unicode denies that this
11106 # was ever an official property
11107 # Non-spacing duplicate UnicodeData.txt: gc=mn
11108 # Numeric duplicates UnicodeData.txt: gc=cc
11109 # Paired Punctuation never made into official property;
11110 # appears to be gc=ps + gc=pe
11111 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11112 # Private Use duplicates UnicodeData.txt: gc=co
11113 # Private Use High Surrogate duplicates Blocks.txt
11114 # Punctuation duplicates UnicodeData.txt: gc=p
11115 # Space different definition than eventual
11116 # one.
11117 # Titlecase duplicates UnicodeData.txt: gc=lt
11118 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
98dc9551 11119 # Zero-width never made into official property;
99870f4d
KW
11120 # subset of gc=cf
11121 # Most of the properties have the same names in this file as in later
11122 # versions, but a couple do not.
11123 #
11124 # This subroutine filters $_, converting it from the old style into
11125 # the new style. Here's a sample of the old-style
11126 #
11127 # *******************************************
11128 #
11129 # Property dump for: 0x100000A0 (Join Control)
11130 #
11131 # 200C..200D (2 chars)
11132 #
11133 # In the example, the property is "Join Control". It is kept in this
11134 # closure between calls to the subroutine. The numbers beginning with
11135 # 0x were internal to Ken's program that generated this file.
11136
11137 # If this line contains the property name, extract it.
11138 if (/^Property dump for: [^(]*\((.*)\)/) {
11139 $_ = $1;
11140
11141 # Convert white space to underscores.
11142 s/ /_/g;
11143
11144 # Convert the few properties that don't have the same name as
11145 # their modern counterparts
11146 s/Identifier_Part/ID_Continue/
11147 or s/Not_a_Character/NChar/;
11148
11149 # If the name matches an existing property, use it.
11150 if (defined property_ref($_)) {
11151 trace "new property=", $_ if main::DEBUG && $to_trace;
11152 $current_property = $_;
11153 }
11154 else { # Otherwise discard it
11155 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11156 undef $current_property;
11157 }
11158 $_ = ""; # The property is saved for the next lines of the
11159 # file, but this defining line is of no further use,
11160 # so clear it so that the caller won't process it
11161 # further.
11162 }
11163 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11164
11165 # Here, the input line isn't a header defining a property for the
11166 # following section, and either we aren't in such a section, or
11167 # the line doesn't look like one that defines the code points in
11168 # such a section. Ignore this line.
11169 $_ = "";
11170 }
11171 else {
11172
11173 # Here, we have a line defining the code points for the current
11174 # stashed property. Anything starting with the first blank is
11175 # extraneous. Otherwise, it should look like a normal range to
11176 # the caller. Append the property name so that it looks just like
11177 # a modern PropList entry.
11178
11179 $_ =~ s/\s.*//;
11180 $_ .= "; $current_property";
11181 }
11182 trace $_ if main::DEBUG && $to_trace;
11183 return;
11184 }
11185} # End closure for old style proplist
11186
11187sub filter_old_style_normalization_lines {
11188 # For early releases of Unicode, the lines were like:
11189 # 74..2A76 ; NFKD_NO
11190 # For later releases this became:
11191 # 74..2A76 ; NFKD_QC; N
11192 # Filter $_ to look like those in later releases.
11193 # Similarly for MAYBEs
11194
11195 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11196
11197 # Also, the property FC_NFKC was abbreviated to FNC
11198 s/FNC/FC_NFKC/;
11199 return;
11200}
11201
82aed44a
KW
11202sub setup_script_extensions {
11203 # The Script_Extensions property starts out with a clone of the Script
11204 # property.
11205
11206 my $sc = property_ref("Script");
11207 my $scx = Property->new("scx", Full_Name => "Script_Extensions",
11208 Initialize => $sc,
11209 Default_Map => $sc->default_map,
11210 Pre_Declared_Maps => 0,
11211 );
11212 $scx->add_comment(join_lines( <<END
11213The values for code points that appear in one script are just the same as for
11214the 'Script' property. Likewise the values for those that appear in many
11215scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11216values of code points that appear in a few scripts are a space separated list
11217of those scripts.
11218END
11219 ));
11220
11221 # Make the scx's tables and aliases for them the same as sc's
11222 foreach my $table ($sc->tables) {
11223 my $scx_table = $scx->add_match_table($table->name,
11224 Full_Name => $table->full_name);
11225 foreach my $alias ($table->aliases) {
11226 $scx_table->add_alias($alias->name);
11227 }
11228 }
11229}
11230
99870f4d
KW
11231sub finish_Unicode() {
11232 # This routine should be called after all the Unicode files have been read
11233 # in. It:
11234 # 1) Adds the mappings for code points missing from the files which have
11235 # defaults specified for them.
11236 # 2) At this this point all mappings are known, so it computes the type of
11237 # each property whose type hasn't been determined yet.
11238 # 3) Calculates all the regular expression match tables based on the
11239 # mappings.
11240 # 3) Calculates and adds the tables which are defined by Unicode, but
11241 # which aren't derived by them
11242
11243 # For each property, fill in any missing mappings, and calculate the re
11244 # match tables. If a property has more than one missing mapping, the
11245 # default is a reference to a data structure, and requires data from other
11246 # properties to resolve. The sort is used to cause these to be processed
11247 # last, after all the other properties have been calculated.
11248 # (Fortunately, the missing properties so far don't depend on each other.)
11249 foreach my $property
11250 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11251 property_ref('*'))
11252 {
11253 # $perl has been defined, but isn't one of the Unicode properties that
11254 # need to be finished up.
11255 next if $property == $perl;
11256
11257 # Handle the properties that have more than one possible default
11258 if (ref $property->default_map) {
11259 my $default_map = $property->default_map;
11260
11261 # These properties have stored in the default_map:
11262 # One or more of:
11263 # 1) A default map which applies to all code points in a
11264 # certain class
11265 # 2) an expression which will evaluate to the list of code
11266 # points in that class
11267 # And
11268 # 3) the default map which applies to every other missing code
11269 # point.
11270 #
11271 # Go through each list.
11272 while (my ($default, $eval) = $default_map->get_next_defaults) {
11273
11274 # Get the class list, and intersect it with all the so-far
11275 # unspecified code points yielding all the code points
11276 # in the class that haven't been specified.
11277 my $list = eval $eval;
11278 if ($@) {
11279 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11280 last;
11281 }
11282
11283 # Narrow down the list to just those code points we don't have
11284 # maps for yet.
11285 $list = $list & $property->inverse_list;
11286
11287 # Add mappings to the property for each code point in the list
11288 foreach my $range ($list->ranges) {
56343c78
KW
11289 $property->add_map($range->start, $range->end, $default,
11290 Replace => $CROAK);
99870f4d
KW
11291 }
11292 }
11293
11294 # All remaining code points have the other mapping. Set that up
11295 # so the normal single-default mapping code will work on them
11296 $property->set_default_map($default_map->other_default);
11297
11298 # And fall through to do that
11299 }
11300
11301 # We should have enough data now to compute the type of the property.
11302 $property->compute_type;
11303 my $property_type = $property->type;
11304
11305 next if ! $property->to_create_match_tables;
11306
11307 # Here want to create match tables for this property
11308
11309 # The Unicode db always (so far, and they claim into the future) have
11310 # the default for missing entries in binary properties be 'N' (unless
11311 # there is a '@missing' line that specifies otherwise)
11312 if ($property_type == $BINARY && ! defined $property->default_map) {
11313 $property->set_default_map('N');
11314 }
11315
11316 # Add any remaining code points to the mapping, using the default for
5d7f7709 11317 # missing code points.
99870f4d 11318 if (defined (my $default_map = $property->default_map)) {
1520492f 11319
f4c2a127
KW
11320 # Make sure there is a match table for the default
11321 my $default_table;
11322 if (! defined ($default_table = $property->table($default_map))) {
11323 $default_table = $property->add_match_table($default_map);
11324 }
11325
a92d5c2e
KW
11326 # And, if the property is binary, the default table will just
11327 # be the complement of the other table.
11328 if ($property_type == $BINARY) {
11329 my $non_default_table;
11330
11331 # Find the non-default table.
11332 for my $table ($property->tables) {
11333 next if $table == $default_table;
11334 $non_default_table = $table;
11335 }
11336 $default_table->set_complement($non_default_table);
11337 }
11338
e1759d04
KW
11339 # This fills in any missing values with the default. It's
11340 # tempting to save some time and memory in running this program
11341 # by skipping this step for binary tables where the default
11342 # is easily calculated. But it is needed for generating
11343 # the test file, and other changes would also be required to do
11344 # so.
1520492f
KW
11345 $property->add_map(0, $LAST_UNICODE_CODEPOINT,
11346 $default_map, Replace => $NO);
99870f4d
KW
11347 }
11348
11349 # Have all we need to populate the match tables.
11350 my $property_name = $property->name;
56557540 11351 my $maps_should_be_defined = $property->pre_declared_maps;
99870f4d
KW
11352 foreach my $range ($property->ranges) {
11353 my $map = $range->value;
11354 my $table = property_ref($property_name)->table($map);
11355 if (! defined $table) {
11356
11357 # Integral and rational property values are not necessarily
56557540
KW
11358 # defined in PropValueAliases, but whether all the other ones
11359 # should be depends on the property.
11360 if ($maps_should_be_defined
99870f4d
KW
11361 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11362 {
11363 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
11364 }
11365 $table = property_ref($property_name)->add_match_table($map);
11366 }
11367
11368 $table->add_range($range->start, $range->end);
11369 }
11370
807807b7
KW
11371 # For Perl 5.6 compatibility, all properties matchable in regexes can
11372 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
11373 # But warn if this creates a conflict with a (new) Unicode property
11374 # name, although it appears that Unicode has made a decision never to
11375 # begin a property name with 'Is_', so this shouldn't happen.
99870f4d
KW
11376 foreach my $alias ($property->aliases) {
11377 my $Is_name = 'Is_' . $alias->name;
807807b7 11378 if (defined (my $pre_existing = property_ref($Is_name))) {
99870f4d 11379 Carp::my_carp(<<END
807807b7
KW
11380There is already an alias named $Is_name (from " . $pre_existing . "), so
11381creating one for $property won't work. This is bad news. If it is not too
11382late, get Unicode to back off. Otherwise go back to the old scheme (findable
11383from the git blame log for this area of the code that suppressed individual
11384aliases that conflict with the new Unicode names. Proceeding anyway.
99870f4d
KW
11385END
11386 );
99870f4d
KW
11387 }
11388 } # End of loop through aliases for this property
11389 } # End of loop through all Unicode properties.
11390
11391 # Fill in the mappings that Unicode doesn't completely furnish. First the
11392 # single letter major general categories. If Unicode were to start
11393 # delivering the values, this would be redundant, but better that than to
11394 # try to figure out if should skip and not get it right. Ths could happen
11395 # if a new major category were to be introduced, and the hard-coded test
11396 # wouldn't know about it.
11397 # This routine depends on the standard names for the general categories
11398 # being what it thinks they are, like 'Cn'. The major categories are the
11399 # union of all the general category tables which have the same first
11400 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11401 foreach my $minor_table ($gc->tables) {
11402 my $minor_name = $minor_table->name;
11403 next if length $minor_name == 1;
11404 if (length $minor_name != 2) {
11405 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11406 next;
11407 }
11408
11409 my $major_name = uc(substr($minor_name, 0, 1));
11410 my $major_table = $gc->table($major_name);
11411 $major_table += $minor_table;
11412 }
11413
11414 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11415 # defines it as LC)
11416 my $LC = $gc->table('LC');
11417 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11418 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11419
11420
11421 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11422 # deliver the correct values in it
11423 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11424
11425 # Lt not in release 1.
a5c376b7
KW
11426 if (defined $gc->table('Lt')) {
11427 $LC += $gc->table('Lt');
11428 $gc->table('Lt')->set_caseless_equivalent($LC);
11429 }
99870f4d
KW
11430 }
11431 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11432
a5c376b7
KW
11433 $gc->table('Ll')->set_caseless_equivalent($LC);
11434 $gc->table('Lu')->set_caseless_equivalent($LC);
11435
99870f4d 11436 my $Cs = $gc->table('Cs');
99870f4d
KW
11437
11438
11439 # Folding information was introduced later into Unicode data. To get
11440 # Perl's case ignore (/i) to work at all in releases that don't have
11441 # folding, use the best available alternative, which is lower casing.
11442 my $fold = property_ref('Simple_Case_Folding');
11443 if ($fold->is_empty) {
11444 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11445 $fold->add_note(join_lines(<<END
11446WARNING: This table uses lower case as a substitute for missing fold
11447information
11448END
11449 ));
11450 }
11451
11452 # Multiple-character mapping was introduced later into Unicode data. If
11453 # missing, use the single-characters maps as best available alternative
11454 foreach my $map (qw { Uppercase_Mapping
11455 Lowercase_Mapping
11456 Titlecase_Mapping
11457 Case_Folding
11458 } ) {
11459 my $full = property_ref($map);
11460 if ($full->is_empty) {
11461 my $simple = property_ref('Simple_' . $map);
11462 $full->initialize($simple);
11463 $full->add_comment($simple->comment) if ($simple->comment);
11464 $full->add_note(join_lines(<<END
11465WARNING: This table uses simple mapping (single-character only) as a
11466substitute for missing multiple-character information
11467END
11468 ));
11469 }
11470 }
82aed44a
KW
11471
11472 # The Script_Extensions property started out as a clone of the Script
11473 # property. But processing its data file caused some elements to be
11474 # replaced with different data. (These elements were for the Common and
11475 # Inherited properties.) This data is a qw() list of all the scripts that
11476 # the code points in the given range are in. An example line is:
11477 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
11478 #
11479 # The code above has created a new match table named "Arab Syrc Thaa"
11480 # which contains 060C. (The cloned table started out with this code point
11481 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
11482 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
11483 # match table. This is repeated for all these tables and ranges. The map
11484 # data is retained in the map table for reference, but the spurious match
11485 # tables are deleted.
11486
11487 my $scx = property_ref("Script_Extensions");
11488 foreach my $table ($scx->tables) {
c12f2655
KW
11489 next unless $table->name =~ /\s/; # All the new and only the new
11490 # tables have a space in their
11491 # names
82aed44a
KW
11492 my @scripts = split /\s+/, $table->name;
11493 foreach my $script (@scripts) {
11494 my $script_table = $scx->table($script);
11495 $script_table += $table;
11496 }
11497 $scx->delete_match_table($table);
11498 }
11499
11500 return;
99870f4d
KW
11501}
11502
11503sub compile_perl() {
11504 # Create perl-defined tables. Almost all are part of the pseudo-property
11505 # named 'perl' internally to this program. Many of these are recommended
11506 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11507 # on those found there.
11508 # Almost all of these are equivalent to some Unicode property.
11509 # A number of these properties have equivalents restricted to the ASCII
11510 # range, with their names prefaced by 'Posix', to signify that these match
11511 # what the Posix standard says they should match. A couple are
11512 # effectively this, but the name doesn't have 'Posix' in it because there
cbc24f92
KW
11513 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
11514 # to the full Unicode range, by our guesses as to what is appropriate.
99870f4d
KW
11515
11516 # 'Any' is all code points. As an error check, instead of just setting it
11517 # to be that, construct it to be the union of all the major categories
7fc6cb55 11518 $Any = $perl->add_match_table('Any',
99870f4d
KW
11519 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
11520 Matches_All => 1);
11521
11522 foreach my $major_table ($gc->tables) {
11523
11524 # Major categories are the ones with single letter names.
11525 next if length($major_table->name) != 1;
11526
11527 $Any += $major_table;
11528 }
11529
11530 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
11531 Carp::my_carp_bug("Generated highest code point ("
11532 . sprintf("%X", $Any->max)
11533 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
11534 }
11535 if ($Any->range_count != 1 || $Any->min != 0) {
11536 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11537 }
11538
11539 $Any->add_alias('All');
11540
11541 # Assigned is the opposite of gc=unassigned
11542 my $Assigned = $perl->add_match_table('Assigned',
11543 Description => "All assigned code points",
11544 Initialize => ~ $gc->table('Unassigned'),
11545 );
11546
11547 # Our internal-only property should be treated as more than just a
11548 # synonym.
11549 $perl->add_match_table('_CombAbove')
11550 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11551 Related => 1);
11552
11553 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11554 if (defined $block) { # This is equivalent to the block if have it.
11555 my $Unicode_ASCII = $block->table('Basic_Latin');
11556 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11557 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11558 }
11559 }
11560
11561 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11562 # necessary
11563 if ($ASCII->is_empty) {
11564 $ASCII->initialize([ 0..127 ]);
11565 }
11566
99870f4d
KW
11567 # Get the best available case definitions. Early Unicode versions didn't
11568 # have Uppercase and Lowercase defined, so use the general category
11569 # instead for them.
11570 my $Lower = $perl->add_match_table('Lower');
11571 my $Unicode_Lower = property_ref('Lowercase');
11572 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11573 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
a5c376b7
KW
11574 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11575 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11576 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11577
99870f4d
KW
11578 }
11579 else {
11580 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11581 Related => 1);
11582 }
cbc24f92 11583 $Lower->add_alias('XPosixLower');
a5c376b7 11584 my $Posix_Lower = $perl->add_match_table("PosixLower",
ad5e8af1
KW
11585 Description => "[a-z]",
11586 Initialize => $Lower & $ASCII,
11587 );
99870f4d
KW
11588
11589 my $Upper = $perl->add_match_table('Upper');
11590 my $Unicode_Upper = property_ref('Uppercase');
11591 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11592 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
a5c376b7
KW
11593 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11594 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11595 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
99870f4d
KW
11596 }
11597 else {
11598 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11599 Related => 1);
11600 }
cbc24f92 11601 $Upper->add_alias('XPosixUpper');
a5c376b7 11602 my $Posix_Upper = $perl->add_match_table("PosixUpper",
ad5e8af1
KW
11603 Description => "[A-Z]",
11604 Initialize => $Upper & $ASCII,
11605 );
99870f4d
KW
11606
11607 # Earliest releases didn't have title case. Initialize it to empty if not
11608 # otherwise present
4364919a
KW
11609 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11610 Description => '(= \p{Gc=Lt})');
99870f4d 11611 my $lt = $gc->table('Lt');
a5c376b7
KW
11612
11613 # Earlier versions of mktables had this related to $lt since they have
c12f2655
KW
11614 # identical code points, but their caseless equivalents are not the same,
11615 # one being 'Cased' and the other being 'LC', and so now must be kept as
11616 # separate entities.
a5c376b7 11617 $Title += $lt if defined $lt;
99870f4d
KW
11618
11619 # If this Unicode version doesn't have Cased, set up our own. From
11620 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11621 # and only if C has the Lowercase or Uppercase property or has a
11622 # General_Category value of Titlecase_Letter.
a5c376b7
KW
11623 my $Unicode_Cased = property_ref('Cased');
11624 unless (defined $Unicode_Cased) {
99870f4d
KW
11625 my $cased = $perl->add_match_table('Cased',
11626 Initialize => $Lower + $Upper + $Title,
11627 Description => 'Uppercase or Lowercase or Titlecase',
11628 );
a5c376b7 11629 $Unicode_Cased = $cased;
99870f4d 11630 }
a5c376b7 11631 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
99870f4d
KW
11632
11633 # Similarly, set up our own Case_Ignorable property if this Unicode
11634 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11635 # C is defined to be case-ignorable if C has the value MidLetter or the
11636 # value MidNumLet for the Word_Break property or its General_Category is
11637 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11638 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11639
11640 # Perl has long had an internal-only alias for this property.
11641 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
11642 my $case_ignorable = property_ref('Case_Ignorable');
11643 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11644 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11645 Related => 1);
11646 }
11647 else {
11648
11649 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11650
11651 # The following three properties are not in early releases
11652 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11653 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11654 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11655
11656 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11657 # correspondingly the case-ignorable definition lacks that one. For
11658 # 4.0, it appears that it was meant to be the same definition, but was
11659 # inadvertently omitted from the standard's text, so add it if the
11660 # property actually is there
11661 my $wb = property_ref('Word_Break');
11662 if (defined $wb) {
11663 my $midlet = $wb->table('MidLetter');
11664 $perl_case_ignorable += $midlet if defined $midlet;
11665 my $midnumlet = $wb->table('MidNumLet');
11666 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11667 }
11668 else {
11669
11670 # In earlier versions of the standard, instead of the above two
11671 # properties , just the following characters were used:
11672 $perl_case_ignorable += 0x0027 # APOSTROPHE
11673 + 0x00AD # SOFT HYPHEN (SHY)
11674 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11675 }
11676 }
11677
11678 # The remaining perl defined tables are mostly based on Unicode TR 18,
11679 # "Annex C: Compatibility Properties". All of these have two versions,
11680 # one whose name generally begins with Posix that is posix-compliant, and
11681 # one that matches Unicode characters beyond the Posix, ASCII range
11682
ad5e8af1 11683 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
11684
11685 # Alphabetic was not present in early releases
11686 my $Alphabetic = property_ref('Alphabetic');
11687 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11688 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11689 }
11690 else {
11691
11692 # For early releases, we don't get it exactly right. The below
11693 # includes more than it should, which in 5.2 terms is: L + Nl +
11694 # Other_Alphabetic. Other_Alphabetic contains many characters from
11695 # Mn and Mc. It's better to match more than we should, than less than
11696 # we should.
11697 $Alpha->initialize($gc->table('Letter')
11698 + $gc->table('Mn')
11699 + $gc->table('Mc'));
11700 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 11701 $Alpha->add_description('Alphabetic');
99870f4d 11702 }
cbc24f92 11703 $Alpha->add_alias('XPosixAlpha');
a5c376b7 11704 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
ad5e8af1
KW
11705 Description => "[A-Za-z]",
11706 Initialize => $Alpha & $ASCII,
11707 );
a5c376b7
KW
11708 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11709 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
99870f4d
KW
11710
11711 my $Alnum = $perl->add_match_table('Alnum',
ad5e8af1 11712 Description => 'Alphabetic and (Decimal) Numeric',
99870f4d
KW
11713 Initialize => $Alpha + $gc->table('Decimal_Number'),
11714 );
cbc24f92 11715 $Alnum->add_alias('XPosixAlnum');
ad5e8af1
KW
11716 $perl->add_match_table("PosixAlnum",
11717 Description => "[A-Za-z0-9]",
11718 Initialize => $Alnum & $ASCII,
11719 );
99870f4d
KW
11720
11721 my $Word = $perl->add_match_table('Word',
d35dd6c6
KW
11722 Description => '\w, including beyond ASCII;'
11723 . ' = \p{Alnum} + \pM + \p{Pc}',
99870f4d
KW
11724 Initialize => $Alnum + $gc->table('Mark'),
11725 );
cbc24f92 11726 $Word->add_alias('XPosixWord');
99870f4d
KW
11727 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11728 $Word += $Pc if defined $Pc;
11729
f38f76ae 11730 # This is a Perl extension, so the name doesn't begin with Posix.
cbc24f92 11731 my $PerlWord = $perl->add_match_table('PerlWord',
99870f4d
KW
11732 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11733 Initialize => $Word & $ASCII,
11734 );
cbc24f92 11735 $PerlWord->add_alias('PosixWord');
99870f4d
KW
11736
11737 my $Blank = $perl->add_match_table('Blank',
11738 Description => '\h, Horizontal white space',
11739
11740 # 200B is Zero Width Space which is for line
11741 # break control, and was listed as
11742 # Space_Separator in early releases
11743 Initialize => $gc->table('Space_Separator')
11744 + 0x0009 # TAB
11745 - 0x200B, # ZWSP
11746 );
11747 $Blank->add_alias('HorizSpace'); # Another name for it.
cbc24f92 11748 $Blank->add_alias('XPosixBlank');
ad5e8af1
KW
11749 $perl->add_match_table("PosixBlank",
11750 Description => "\\t and ' '",
11751 Initialize => $Blank & $ASCII,
11752 );
99870f4d
KW
11753
11754 my $VertSpace = $perl->add_match_table('VertSpace',
11755 Description => '\v',
11756 Initialize => $gc->table('Line_Separator')
11757 + $gc->table('Paragraph_Separator')
11758 + 0x000A # LINE FEED
11759 + 0x000B # VERTICAL TAB
11760 + 0x000C # FORM FEED
11761 + 0x000D # CARRIAGE RETURN
11762 + 0x0085, # NEL
11763 );
11764 # No Posix equivalent for vertical space
11765
11766 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
11767 Description => '\s including beyond ASCII plus vertical tab',
11768 Initialize => $Blank + $VertSpace,
99870f4d 11769 );
cbc24f92 11770 $Space->add_alias('XPosixSpace');
ad5e8af1 11771 $perl->add_match_table("PosixSpace",
f38f76ae 11772 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
11773 Initialize => $Space & $ASCII,
11774 );
99870f4d
KW
11775
11776 # Perl's traditional space doesn't include Vertical Tab
cbc24f92 11777 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
99870f4d
KW
11778 Description => '\s, including beyond ASCII',
11779 Initialize => $Space - 0x000B,
11780 );
cbc24f92
KW
11781 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
11782 my $PerlSpace = $perl->add_match_table('PerlSpace',
de25ec47
KW
11783 Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11784 Initialize => $XPerlSpace & $ASCII,
99870f4d
KW
11785 );
11786
cbc24f92 11787
99870f4d 11788 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 11789 Description => 'Control characters');
99870f4d 11790 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
cbc24f92 11791 $Cntrl->add_alias('XPosixCntrl');
ad5e8af1 11792 $perl->add_match_table("PosixCntrl",
f38f76ae 11793 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
11794 Initialize => $Cntrl & $ASCII,
11795 );
99870f4d
KW
11796
11797 # $controls is a temporary used to construct Graph.
11798 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11799 + $gc->table('Control'));
11800 # Cs not in release 1
11801 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11802
11803 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11804 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 11805 Description => 'Characters that are graphical',
99870f4d
KW
11806 Initialize => ~ ($Space + $controls),
11807 );
cbc24f92 11808 $Graph->add_alias('XPosixGraph');
ad5e8af1 11809 $perl->add_match_table("PosixGraph",
f38f76ae
KW
11810 Description =>
11811 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
11812 Initialize => $Graph & $ASCII,
11813 );
99870f4d 11814
3e20195b 11815 $print = $perl->add_match_table('Print',
ad5e8af1 11816 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 11817 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 11818 );
cbc24f92 11819 $print->add_alias('XPosixPrint');
ad5e8af1 11820 $perl->add_match_table("PosixPrint",
66fd7fd0 11821 Description =>
f38f76ae 11822 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 11823 Initialize => $print & $ASCII,
ad5e8af1 11824 );
99870f4d
KW
11825
11826 my $Punct = $perl->add_match_table('Punct');
11827 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11828
11829 # \p{punct} doesn't include the symbols, which posix does
cbc24f92
KW
11830 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
11831 Description => '\p{Punct} + ASCII-range \p{Symbol}',
11832 Initialize => $gc->table('Punctuation')
11833 + ($ASCII & $gc->table('Symbol')),
11834 );
99870f4d 11835 $perl->add_match_table('PosixPunct',
f38f76ae 11836 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
cbc24f92 11837 Initialize => $ASCII & $XPosixPunct,
ad5e8af1 11838 );
99870f4d
KW
11839
11840 my $Digit = $perl->add_match_table('Digit',
f3a73f6e 11841 Description => '[0-9] + all other decimal digits');
99870f4d 11842 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
cbc24f92 11843 $Digit->add_alias('XPosixDigit');
ad5e8af1
KW
11844 my $PosixDigit = $perl->add_match_table("PosixDigit",
11845 Description => '[0-9]',
11846 Initialize => $Digit & $ASCII,
11847 );
99870f4d 11848
eadadd41
KW
11849 # Hex_Digit was not present in first release
11850 my $Xdigit = $perl->add_match_table('XDigit');
cbc24f92 11851 $Xdigit->add_alias('XPosixXDigit');
eadadd41
KW
11852 my $Hex = property_ref('Hex_Digit');
11853 if (defined $Hex && ! $Hex->is_empty) {
11854 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
11855 }
11856 else {
eadadd41
KW
11857 # (Have to use hex instead of e.g. '0', because could be running on an
11858 # non-ASCII machine, and we want the Unicode (ASCII) values)
11859 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11860 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11861 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d 11862 }
4efcc33b
KW
11863
11864 # AHex was not present in early releases
11865 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
11866 my $AHex = property_ref('ASCII_Hex_Digit');
11867 if (defined $AHex && ! $AHex->is_empty) {
11868 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
11869 }
11870 else {
11871 $PosixXDigit->initialize($Xdigit & $ASCII);
11872 }
11873 $PosixXDigit->add_description('[0-9A-Fa-f]');
99870f4d 11874
99870f4d
KW
11875 my $dt = property_ref('Decomposition_Type');
11876 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11877 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11878 Perl_Extension => 1,
d57ccc9a 11879 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
11880 );
11881
11882 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11883 # than SD appeared, construct it ourselves, based on the first release SD
11884 # was in.
11885 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11886 my $soft_dotted = property_ref('Soft_Dotted');
11887 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11888 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11889 }
11890 else {
11891
11892 # This list came from 3.2 Soft_Dotted.
11893 $CanonDCIJ->initialize([ 0x0069,
11894 0x006A,
11895 0x012F,
11896 0x0268,
11897 0x0456,
11898 0x0458,
11899 0x1E2D,
11900 0x1ECB,
11901 ]);
11902 $CanonDCIJ = $CanonDCIJ & $Assigned;
11903 }
11904
f86864ac 11905 # These are used in Unicode's definition of \X
37e2e78e
KW
11906 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11907 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11908
ee24a51c
KW
11909 # For backward compatibility, Perl has its own definition for IDStart
11910 # First, we include the underscore, and then the regular XID_Start also
11911 # have to be Words
11912 $perl->add_match_table('_Perl_IDStart',
11913 Perl_Extension => 1,
11914 Internal_Only => 1,
11915 Initialize =>
11916 ord('_')
11917 + (property_ref('XID_Start')->table('Y') & $Word)
11918 );
11919
99870f4d 11920 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 11921
678f13d5 11922 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
11923 # definition differs too much from the traditional Perl one to use.
11924 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11925
11926 # Note that assumes HST is defined; it came in an earlier release than
11927 # GCB. In the line below, two negatives means: yes hangul
11928 $begin += ~ property_ref('Hangul_Syllable_Type')
11929 ->table('Not_Applicable')
11930 + ~ ($gcb->table('Control')
11931 + $gcb->table('CR')
11932 + $gcb->table('LF'));
11933 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11934
11935 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11936 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
11937 }
11938 else { # Old definition, used on early releases.
f86864ac 11939 $extend += $gc->table('Mark')
37e2e78e
KW
11940 + 0x200C # ZWNJ
11941 + 0x200D; # ZWJ
11942 $begin += ~ $extend;
11943
11944 # Here we may have a release that has the regular grapheme cluster
11945 # defined, or a release that doesn't have anything defined.
11946 # We set things up so the Perl core degrades gracefully, possibly with
11947 # placeholders that match nothing.
11948
11949 if (! defined $gcb) {
11950 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11951 }
11952 my $hst = property_ref('HST');
11953 if (!defined $hst) {
11954 $hst = Property->new('HST', Status => $PLACEHOLDER);
11955 $hst->add_match_table('Not_Applicable',
11956 Initialize => $Any,
11957 Matches_All => 1);
11958 }
11959
11960 # On some releases, here we may not have the needed tables for the
11961 # perl core, in some releases we may.
11962 foreach my $name (qw{ L LV LVT T V prepend }) {
11963 my $table = $gcb->table($name);
11964 if (! defined $table) {
11965 $table = $gcb->add_match_table($name);
11966 push @tables_that_may_be_empty, $table->complete_name;
11967 }
11968
11969 # The HST property predates the GCB one, and has identical tables
11970 # for some of them, so use it if we can.
11971 if ($table->is_empty
11972 && defined $hst
11973 && defined $hst->table($name))
11974 {
11975 $table += $hst->table($name);
11976 }
11977 }
11978 }
11979
11980 # More GCB. If we found some hangul syllables, populate a combined
11981 # table.
11982 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11983 my $LV = $gcb->table('LV');
11984 if ($LV->is_empty) {
11985 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11986 } else {
11987 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11988 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
11989 }
11990
28093d0e 11991 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
11992 my @composition = ('Name', 'Unicode_1_Name');
11993
11994 if (@named_sequences) {
11995 push @composition, 'Named_Sequence';
11996 foreach my $sequence (@named_sequences) {
11997 $perl_charname->add_anomalous_entry($sequence);
11998 }
11999 }
12000
12001 my $alias_sentence = "";
12002 my $alias = property_ref('Name_Alias');
12003 if (defined $alias) {
12004 push @composition, 'Name_Alias';
12005 $alias->reset_each_range;
12006 while (my ($range) = $alias->each_range) {
12007 next if $range->value eq "";
12008 if ($range->start != $range->end) {
12009 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
12010 }
12011 $perl_charname->add_duplicate($range->start, $range->value);
12012 }
12013 $alias_sentence = <<END;
12014The Name_Alias property adds duplicate code point entries with a corrected
12015name. The original (less correct, but still valid) name will be physically
53d84487 12016last.
99870f4d
KW
12017END
12018 }
12019 my $comment;
12020 if (@composition <= 2) { # Always at least 2
12021 $comment = join " and ", @composition;
12022 }
12023 else {
12024 $comment = join ", ", @composition[0 .. scalar @composition - 2];
12025 $comment .= ", and $composition[-1]";
12026 }
12027
99870f4d
KW
12028 $perl_charname->add_comment(join_lines( <<END
12029This file is for charnames.pm. It is the union of the $comment properties.
12030Unicode_1_Name entries are used only for otherwise nameless code
12031points.
12032$alias_sentence
12033END
12034 ));
12035
99870f4d
KW
12036 # Construct the Present_In property from the Age property.
12037 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
12038 my $default_map = $age->default_map;
12039 my $in = Property->new('In',
12040 Default_Map => $default_map,
12041 Full_Name => "Present_In",
12042 Internal_Only_Warning => 1,
12043 Perl_Extension => 1,
12044 Type => $ENUM,
12045 Initialize => $age,
12046 );
12047 $in->add_comment(join_lines(<<END
c12f2655 12048THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
99870f4d
KW
12049same as for $age, and not for what $in really means. This is because anything
12050defined in a given release should have multiple values: that release and all
12051higher ones. But only one value per code point can be represented in a table
12052like this.
12053END
12054 ));
12055
12056 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
12057 # lowest numbered (earliest) come first, with the non-numeric one
12058 # last.
12059 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12060 ? 1
12061 : ($b->name !~ /^[\d.]*$/)
12062 ? -1
12063 : $a->name <=> $b->name
12064 } $age->tables;
12065
12066 # The Present_In property is the cumulative age properties. The first
12067 # one hence is identical to the first age one.
12068 my $previous_in = $in->add_match_table($first_age->name);
12069 $previous_in->set_equivalent_to($first_age, Related => 1);
12070
12071 my $description_start = "Code point's usage introduced in version ";
12072 $first_age->add_description($description_start . $first_age->name);
12073
98dc9551 12074 # To construct the accumulated values, for each of the age tables
99870f4d
KW
12075 # starting with the 2nd earliest, merge the earliest with it, to get
12076 # all those code points existing in the 2nd earliest. Repeat merging
12077 # the new 2nd earliest with the 3rd earliest to get all those existing
12078 # in the 3rd earliest, and so on.
12079 foreach my $current_age (@rest_ages) {
12080 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
12081
12082 my $current_in = $in->add_match_table(
12083 $current_age->name,
12084 Initialize => $current_age + $previous_in,
12085 Description => $description_start
12086 . $current_age->name
12087 . ' or earlier',
12088 );
12089 $previous_in = $current_in;
12090
12091 # Add clarifying material for the corresponding age file. This is
12092 # in part because of the confusing and contradictory information
12093 # given in the Standard's documentation itself, as of 5.2.
12094 $current_age->add_description(
12095 "Code point's usage was introduced in version "
12096 . $current_age->name);
12097 $current_age->add_note("See also $in");
12098
12099 }
12100
12101 # And finally the code points whose usages have yet to be decided are
12102 # the same in both properties. Note that permanently unassigned code
12103 # points actually have their usage assigned (as being permanently
12104 # unassigned), so that these tables are not the same as gc=cn.
12105 my $unassigned = $in->add_match_table($default_map);
12106 my $age_default = $age->table($default_map);
12107 $age_default->add_description(<<END
12108Code point's usage has not been assigned in any Unicode release thus far.
12109END
12110 );
12111 $unassigned->set_equivalent_to($age_default, Related => 1);
12112 }
12113
12114
12115 # Finished creating all the perl properties. All non-internal non-string
12116 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
12117 # an underscore.) These do not get a separate entry in the pod file
12118 foreach my $table ($perl->tables) {
12119 foreach my $alias ($table->aliases) {
12120 next if $alias->name =~ /^_/;
12121 $table->add_alias('Is_' . $alias->name,
12122 Pod_Entry => 0,
12123 Status => $alias->status,
12124 Externally_Ok => 0);
12125 }
12126 }
12127
c4019d52
KW
12128 # Here done with all the basic stuff. Ready to populate the information
12129 # about each character if annotating them.
558712cf 12130 if ($annotate) {
c4019d52
KW
12131
12132 # See comments at its declaration
12133 $annotate_ranges = Range_Map->new;
12134
12135 # This separates out the non-characters from the other unassigneds, so
12136 # can give different annotations for each.
12137 $unassigned_sans_noncharacters = Range_List->new(
12138 Initialize => $gc->table('Unassigned')
12139 & property_ref('Noncharacter_Code_Point')->table('N'));
12140
12141 for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) {
12142 $i = populate_char_info($i); # Note sets $i so may cause skips
12143 }
12144 }
12145
99870f4d
KW
12146 return;
12147}
12148
12149sub add_perl_synonyms() {
12150 # A number of Unicode tables have Perl synonyms that are expressed in
12151 # the single-form, \p{name}. These are:
12152 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12153 # \p{Is_Name} as synonyms
12154 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12155 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12156 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12157 # conflict, \p{Value} and \p{Is_Value} as well
12158 #
12159 # This routine generates these synonyms, warning of any unexpected
12160 # conflicts.
12161
12162 # Construct the list of tables to get synonyms for. Start with all the
12163 # binary and the General_Category ones.
12164 my @tables = grep { $_->type == $BINARY } property_ref('*');
12165 push @tables, $gc->tables;
12166
12167 # If the version of Unicode includes the Script property, add its tables
12168 if (defined property_ref('Script')) {
12169 push @tables, property_ref('Script')->tables;
12170 }
12171
12172 # The Block tables are kept separate because they are treated differently.
12173 # And the earliest versions of Unicode didn't include them, so add only if
12174 # there are some.
12175 my @blocks;
12176 push @blocks, $block->tables if defined $block;
12177
12178 # Here, have the lists of tables constructed. Process blocks last so that
12179 # if there are name collisions with them, blocks have lowest priority.
12180 # Should there ever be other collisions, manual intervention would be
12181 # required. See the comments at the beginning of the program for a
12182 # possible way to handle those semi-automatically.
12183 foreach my $table (@tables, @blocks) {
12184
12185 # For non-binary properties, the synonym is just the name of the
12186 # table, like Greek, but for binary properties the synonym is the name
12187 # of the property, and means the code points in its 'Y' table.
12188 my $nominal = $table;
12189 my $nominal_property = $nominal->property;
12190 my $actual;
12191 if (! $nominal->isa('Property')) {
12192 $actual = $table;
12193 }
12194 else {
12195
12196 # Here is a binary property. Use the 'Y' table. Verify that is
12197 # there
12198 my $yes = $nominal->table('Y');
12199 unless (defined $yes) { # Must be defined, but is permissible to
12200 # be empty.
12201 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
12202 next;
12203 }
12204 $actual = $yes;
12205 }
12206
12207 foreach my $alias ($nominal->aliases) {
12208
12209 # Attempt to create a table in the perl directory for the
12210 # candidate table, using whatever aliases in it that don't
12211 # conflict. Also add non-conflicting aliases for all these
12212 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12213 PREFIX:
12214 foreach my $prefix ("", 'Is_', 'In_') {
12215
12216 # Only Block properties can have added 'In_' aliases.
12217 next if $prefix eq 'In_' and $nominal_property != $block;
12218
12219 my $proposed_name = $prefix . $alias->name;
12220
12221 # No Is_Is, In_In, nor combinations thereof
12222 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12223 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12224
12225 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12226
12227 # Get a reference to any existing table in the perl
12228 # directory with the desired name.
12229 my $pre_existing = $perl->table($proposed_name);
12230
12231 if (! defined $pre_existing) {
12232
12233 # No name collision, so ok to add the perl synonym.
12234
12235 my $make_pod_entry;
12236 my $externally_ok;
4cd1260a 12237 my $status = $alias->status;
99870f4d
KW
12238 if ($nominal_property == $block) {
12239
12240 # For block properties, the 'In' form is preferred for
12241 # external use; the pod file contains wild cards for
12242 # this and the 'Is' form so no entries for those; and
12243 # we don't want people using the name without the
12244 # 'In', so discourage that.
12245 if ($prefix eq "") {
12246 $make_pod_entry = 1;
12247 $status = $status || $DISCOURAGED;
12248 $externally_ok = 0;
12249 }
12250 elsif ($prefix eq 'In_') {
12251 $make_pod_entry = 0;
12252 $status = $status || $NORMAL;
12253 $externally_ok = 1;
12254 }
12255 else {
12256 $make_pod_entry = 0;
12257 $status = $status || $DISCOURAGED;
12258 $externally_ok = 0;
12259 }
12260 }
12261 elsif ($prefix ne "") {
12262
12263 # The 'Is' prefix is handled in the pod by a wild
12264 # card, and we won't use it for an external name
12265 $make_pod_entry = 0;
12266 $status = $status || $NORMAL;
12267 $externally_ok = 0;
12268 }
12269 else {
12270
12271 # Here, is an empty prefix, non block. This gets its
12272 # own pod entry and can be used for an external name.
12273 $make_pod_entry = 1;
12274 $status = $status || $NORMAL;
12275 $externally_ok = 1;
12276 }
12277
12278 # Here, there isn't a perl pre-existing table with the
12279 # name. Look through the list of equivalents of this
12280 # table to see if one is a perl table.
12281 foreach my $equivalent ($actual->leader->equivalents) {
12282 next if $equivalent->property != $perl;
12283
12284 # Here, have found a table for $perl. Add this alias
12285 # to it, and are done with this prefix.
12286 $equivalent->add_alias($proposed_name,
12287 Pod_Entry => $make_pod_entry,
12288 Status => $status,
12289 Externally_Ok => $externally_ok);
12290 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12291 next PREFIX;
12292 }
12293
12294 # Here, $perl doesn't already have a table that is a
12295 # synonym for this property, add one.
12296 my $added_table = $perl->add_match_table($proposed_name,
12297 Pod_Entry => $make_pod_entry,
12298 Status => $status,
12299 Externally_Ok => $externally_ok);
12300 # And it will be related to the actual table, since it is
12301 # based on it.
12302 $added_table->set_equivalent_to($actual, Related => 1);
12303 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12304 next;
12305 } # End of no pre-existing.
12306
12307 # Here, there is a pre-existing table that has the proposed
12308 # name. We could be in trouble, but not if this is just a
12309 # synonym for another table that we have already made a child
12310 # of the pre-existing one.
6505c6e2 12311 if ($pre_existing->is_set_equivalent_to($actual)) {
99870f4d
KW
12312 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12313 $pre_existing->add_alias($proposed_name);
12314 next;
12315 }
12316
12317 # Here, there is a name collision, but it still could be ok if
12318 # the tables match the identical set of code points, in which
12319 # case, we can combine the names. Compare each table's code
12320 # point list to see if they are identical.
12321 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12322 if ($pre_existing->matches_identically_to($actual)) {
12323
12324 # Here, they do match identically. Not a real conflict.
12325 # Make the perl version a child of the Unicode one, except
12326 # in the non-obvious case of where the perl name is
12327 # already a synonym of another Unicode property. (This is
12328 # excluded by the test for it being its own parent.) The
12329 # reason for this exclusion is that then the two Unicode
12330 # properties become related; and we don't really know if
12331 # they are or not. We generate documentation based on
12332 # relatedness, and this would be misleading. Code
12333 # later executed in the process will cause the tables to
12334 # be represented by a single file anyway, without making
12335 # it look in the pod like they are necessarily related.
12336 if ($pre_existing->parent == $pre_existing
12337 && ($pre_existing->property == $perl
12338 || $actual->property == $perl))
12339 {
12340 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12341 $pre_existing->set_equivalent_to($actual, Related => 1);
12342 }
12343 elsif (main::DEBUG && $to_trace) {
12344 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12345 trace $pre_existing->parent;
12346 }
12347 next PREFIX;
12348 }
12349
12350 # Here they didn't match identically, there is a real conflict
12351 # between our new name and a pre-existing property.
12352 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12353 $pre_existing->add_conflicting($nominal->full_name,
12354 'p',
12355 $actual);
12356
12357 # Don't output a warning for aliases for the block
12358 # properties (unless they start with 'In_') as it is
12359 # expected that there will be conflicts and the block
12360 # form loses.
12361 if ($verbosity >= $NORMAL_VERBOSITY
12362 && ($actual->property != $block || $prefix eq 'In_'))
12363 {
12364 print simple_fold(join_lines(<<END
12365There is already an alias named $proposed_name (from " . $pre_existing . "),
12366so not creating this alias for " . $actual
12367END
12368 ), "", 4);
12369 }
12370
12371 # Keep track for documentation purposes.
12372 $has_In_conflicts++ if $prefix eq 'In_';
12373 $has_Is_conflicts++ if $prefix eq 'Is_';
12374 }
12375 }
12376 }
12377
12378 # There are some properties which have No and Yes (and N and Y) as
12379 # property values, but aren't binary, and could possibly be confused with
12380 # binary ones. So create caveats for them. There are tables that are
12381 # named 'No', and tables that are named 'N', but confusion is not likely
12382 # unless they are the same table. For example, N meaning Number or
12383 # Neutral is not likely to cause confusion, so don't add caveats to things
12384 # like them.
12385 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
12386 my $yes = $property->table('Yes');
12387 if (defined $yes) {
12388 my $y = $property->table('Y');
12389 if (defined $y && $yes == $y) {
12390 foreach my $alias ($property->aliases) {
12391 $yes->add_conflicting($alias->name);
12392 }
12393 }
12394 }
12395 my $no = $property->table('No');
12396 if (defined $no) {
12397 my $n = $property->table('N');
12398 if (defined $n && $no == $n) {
12399 foreach my $alias ($property->aliases) {
12400 $no->add_conflicting($alias->name, 'P');
12401 }
12402 }
12403 }
12404 }
12405
12406 return;
12407}
12408
12409sub register_file_for_name($$$) {
12410 # Given info about a table and a datafile that it should be associated
98dc9551 12411 # with, register that association
99870f4d
KW
12412
12413 my $table = shift;
12414 my $directory_ref = shift; # Array of the directory path for the file
e6ebc4c0 12415 my $file = shift; # The file name in the final directory.
99870f4d
KW
12416 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12417
12418 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12419
12420 if ($table->isa('Property')) {
12421 $table->set_file_path(@$directory_ref, $file);
12422 push @map_properties, $table
12423 if $directory_ref->[0] eq $map_directory;
12424 return;
12425 }
12426
12427 # Do all of the work for all equivalent tables when called with the leader
12428 # table, so skip if isn't the leader.
12429 return if $table->leader != $table;
12430
a92d5c2e
KW
12431 # If this is a complement of another file, use that other file instead,
12432 # with a ! prepended to it.
12433 my $complement;
12434 if (($complement = $table->complement) != 0) {
12435 my @directories = $complement->file_path;
12436
12437 # This assumes that the 0th element is something like 'lib',
12438 # the 1th element the property name (in its own directory), like
12439 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12440 # appended to it later.
12441 $directories[1] =~ s/^/!/;
12442 $file = pop @directories;
12443 $directory_ref =\@directories;
12444 }
12445
99870f4d
KW
12446 # Join all the file path components together, using slashes.
12447 my $full_filename = join('/', @$directory_ref, $file);
12448
12449 # All go in the same subdirectory of unicore
12450 if ($directory_ref->[0] ne $matches_directory) {
12451 Carp::my_carp("Unexpected directory in "
12452 . join('/', @{$directory_ref}, $file));
12453 }
12454
12455 # For this table and all its equivalents ...
12456 foreach my $table ($table, $table->equivalents) {
12457
12458 # Associate it with its file internally. Don't include the
12459 # $matches_directory first component
12460 $table->set_file_path(@$directory_ref, $file);
12461 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12462
12463 my $property = $table->property;
12464 $property = ($property == $perl)
12465 ? "" # 'perl' is never explicitly stated
12466 : standardize($property->name) . '=';
12467
12468 my $deprecated = ($table->status eq $DEPRECATED)
12469 ? $table->status_info
12470 : "";
d867ccfb 12471 my $caseless_equivalent = $table->caseless_equivalent;
99870f4d
KW
12472
12473 # And for each of the table's aliases... This inner loop eventually
12474 # goes through all aliases in the UCD that we generate regex match
12475 # files for
12476 foreach my $alias ($table->aliases) {
c85f591a 12477 my $standard = utf8_heavy_name($table, $alias);
99870f4d
KW
12478
12479 # Generate an entry in either the loose or strict hashes, which
12480 # will translate the property and alias names combination into the
12481 # file where the table for them is stored.
99870f4d 12482 if ($alias->loose_match) {
99870f4d
KW
12483 if (exists $loose_to_file_of{$standard}) {
12484 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12485 }
12486 else {
12487 $loose_to_file_of{$standard} = $sub_filename;
12488 }
12489 }
12490 else {
99870f4d
KW
12491 if (exists $stricter_to_file_of{$standard}) {
12492 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12493 }
12494 else {
12495 $stricter_to_file_of{$standard} = $sub_filename;
12496
12497 # Tightly coupled with how utf8_heavy.pl works, for a
12498 # floating point number that is a whole number, get rid of
12499 # the trailing decimal point and 0's, so that utf8_heavy
12500 # will work. Also note that this assumes that such a
12501 # number is matched strictly; so if that were to change,
12502 # this would be wrong.
c85f591a 12503 if ((my $integer_name = $alias->name)
99870f4d
KW
12504 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12505 {
12506 $stricter_to_file_of{$property . $integer_name}
c12f2655 12507 = $sub_filename;
99870f4d
KW
12508 }
12509 }
12510 }
12511
12512 # Keep a list of the deprecated properties and their filenames
a92d5c2e 12513 if ($deprecated && $complement == 0) {
99870f4d
KW
12514 $utf8::why_deprecated{$sub_filename} = $deprecated;
12515 }
d867ccfb
KW
12516
12517 # And a substitute table, if any, for case-insensitive matching
12518 if ($caseless_equivalent != 0) {
12519 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12520 }
99870f4d
KW
12521 }
12522 }
12523
12524 return;
12525}
12526
12527{ # Closure
12528 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12529 # conflicts
12530 my %full_dir_name_of; # Full length names of directories used.
12531
12532 sub construct_filename($$$) {
12533 # Return a file name for a table, based on the table name, but perhaps
12534 # changed to get rid of non-portable characters in it, and to make
12535 # sure that it is unique on a file system that allows the names before
12536 # any period to be at most 8 characters (DOS). While we're at it
12537 # check and complain if there are any directory conflicts.
12538
12539 my $name = shift; # The name to start with
12540 my $mutable = shift; # Boolean: can it be changed? If no, but
12541 # yet it must be to work properly, a warning
12542 # is given
12543 my $directories_ref = shift; # A reference to an array containing the
12544 # path to the file, with each element one path
12545 # component. This is used because the same
12546 # name can be used in different directories.
12547 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12548
12549 my $warn = ! defined wantarray; # If true, then if the name is
12550 # changed, a warning is issued as well.
12551
12552 if (! defined $name) {
12553 Carp::my_carp("Undefined name in directory "
12554 . File::Spec->join(@$directories_ref)
12555 . ". '_' used");
12556 return '_';
12557 }
12558
12559 # Make sure that no directory names conflict with each other. Look at
12560 # each directory in the input file's path. If it is already in use,
12561 # assume it is correct, and is merely being re-used, but if we
12562 # truncate it to 8 characters, and find that there are two directories
12563 # that are the same for the first 8 characters, but differ after that,
12564 # then that is a problem.
12565 foreach my $directory (@$directories_ref) {
12566 my $short_dir = substr($directory, 0, 8);
12567 if (defined $full_dir_name_of{$short_dir}) {
12568 next if $full_dir_name_of{$short_dir} eq $directory;
12569 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12570 }
12571 else {
12572 $full_dir_name_of{$short_dir} = $directory;
12573 }
12574 }
12575
12576 my $path = join '/', @$directories_ref;
12577 $path .= '/' if $path;
12578
12579 # Remove interior underscores.
12580 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12581
12582 # Change any non-word character into an underscore, and truncate to 8.
12583 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12584 substr($filename, 8) = "" if length($filename) > 8;
12585
12586 # Make sure the basename doesn't conflict with something we
12587 # might have already written. If we have, say,
12588 # InGreekExtended1
12589 # InGreekExtended2
12590 # they become
12591 # InGreekE
12592 # InGreek2
12593 my $warned = 0;
12594 while (my $num = $base_names{$path}{lc $filename}++) {
12595 $num++; # so basenames with numbers start with '2', which
12596 # just looks more natural.
12597
12598 # Want to append $num, but if it'll make the basename longer
12599 # than 8 characters, pre-truncate $filename so that the result
12600 # is acceptable.
12601 my $delta = length($filename) + length($num) - 8;
12602 if ($delta > 0) {
12603 substr($filename, -$delta) = $num;
12604 }
12605 else {
12606 $filename .= $num;
12607 }
12608 if ($warn && ! $warned) {
12609 $warned = 1;
12610 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12611 }
12612 }
12613
12614 return $filename if $mutable;
12615
12616 # If not changeable, must return the input name, but warn if needed to
12617 # change it beyond shortening it.
12618 if ($name ne $filename
12619 && substr($name, 0, length($filename)) ne $filename) {
12620 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12621 }
12622 return $name;
12623 }
12624}
12625
12626# The pod file contains a very large table. Many of the lines in that table
12627# would exceed a typical output window's size, and so need to be wrapped with
12628# a hanging indent to make them look good. The pod language is really
12629# insufficient here. There is no general construct to do that in pod, so it
12630# is done here by beginning each such line with a space to cause the result to
12631# be output without formatting, and doing all the formatting here. This leads
12632# to the result that if the eventual display window is too narrow it won't
12633# look good, and if the window is too wide, no advantage is taken of that
12634# extra width. A further complication is that the output may be indented by
12635# the formatter so that there is less space than expected. What I (khw) have
12636# done is to assume that that indent is a particular number of spaces based on
12637# what it is in my Linux system; people can always resize their windows if
12638# necessary, but this is obviously less than desirable, but the best that can
12639# be expected.
12640my $automatic_pod_indent = 8;
12641
12642# Try to format so that uses fewest lines, but few long left column entries
12643# slide into the right column. An experiment on 5.1 data yielded the
12644# following percentages that didn't cut into the other side along with the
12645# associated first-column widths
12646# 69% = 24
12647# 80% not too bad except for a few blocks
12648# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12649# 95% = 37;
12650my $indent_info_column = 27; # 75% of lines didn't have overlap
12651
12652my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12653 # The 3 is because of:
12654 # 1 for the leading space to tell the pod formatter to
12655 # output as-is
12656 # 1 for the flag
12657 # 1 for the space between the flag and the main data
12658
12659sub format_pod_line ($$$;$$) {
12660 # Take a pod line and return it, formatted properly
12661
12662 my $first_column_width = shift;
12663 my $entry = shift; # Contents of left column
12664 my $info = shift; # Contents of right column
12665
12666 my $status = shift || ""; # Any flag
12667
12668 my $loose_match = shift; # Boolean.
12669 $loose_match = 1 unless defined $loose_match;
12670
12671 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12672
12673 my $flags = "";
12674 $flags .= $STRICTER if ! $loose_match;
12675
12676 $flags .= $status if $status;
12677
12678 # There is a blank in the left column to cause the pod formatter to
12679 # output the line as-is.
12680 return sprintf " %-*s%-*s %s\n",
12681 # The first * in the format is replaced by this, the -1 is
12682 # to account for the leading blank. There isn't a
12683 # hard-coded blank after this to separate the flags from
12684 # the rest of the line, so that in the unlikely event that
12685 # multiple flags are shown on the same line, they both
12686 # will get displayed at the expense of that separation,
12687 # but since they are left justified, a blank will be
12688 # inserted in the normal case.
12689 $FILLER - 1,
12690 $flags,
12691
12692 # The other * in the format is replaced by this number to
12693 # cause the first main column to right fill with blanks.
12694 # The -1 is for the guaranteed blank following it.
12695 $first_column_width - $FILLER - 1,
12696 $entry,
12697 $info;
12698}
12699
12700my @zero_match_tables; # List of tables that have no matches in this release
12701
12702sub make_table_pod_entries($) {
12703 # This generates the entries for the pod file for a given table.
12704 # Also done at this time are any children tables. The output looks like:
12705 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
12706
12707 my $input_table = shift; # Table the entry is for
12708 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12709
12710 # Generate parent and all its children at the same time.
12711 return if $input_table->parent != $input_table;
12712
12713 my $property = $input_table->property;
12714 my $type = $property->type;
12715 my $full_name = $property->full_name;
12716
12717 my $count = $input_table->count;
12718 my $string_count = clarify_number($count);
12719 my $status = $input_table->status;
12720 my $status_info = $input_table->status_info;
56ca34ca 12721 my $caseless_equivalent = $input_table->caseless_equivalent;
99870f4d
KW
12722
12723 my $entry_for_first_table; # The entry for the first table output.
12724 # Almost certainly, it is the parent.
12725
12726 # For each related table (including itself), we will generate a pod entry
12727 # for each name each table goes by
12728 foreach my $table ($input_table, $input_table->children) {
12729
12730 # utf8_heavy.pl cannot deal with null string property values, so don't
12731 # output any.
12732 next if $table->name eq "";
12733
12734 # First, gather all the info that applies to this table as a whole.
12735
12736 push @zero_match_tables, $table if $count == 0;
12737
12738 my $table_property = $table->property;
12739
12740 # The short name has all the underscores removed, while the full name
12741 # retains them. Later, we decide whether to output a short synonym
12742 # for the full one, we need to compare apples to apples, so we use the
12743 # short name's length including underscores.
12744 my $table_property_short_name_length;
12745 my $table_property_short_name
12746 = $table_property->short_name(\$table_property_short_name_length);
12747 my $table_property_full_name = $table_property->full_name;
12748
12749 # Get how much savings there is in the short name over the full one
12750 # (delta will always be <= 0)
12751 my $table_property_short_delta = $table_property_short_name_length
12752 - length($table_property_full_name);
12753 my @table_description = $table->description;
12754 my @table_note = $table->note;
12755
12756 # Generate an entry for each alias in this table.
12757 my $entry_for_first_alias; # saves the first one encountered.
12758 foreach my $alias ($table->aliases) {
12759
12760 # Skip if not to go in pod.
12761 next unless $alias->make_pod_entry;
12762
12763 # Start gathering all the components for the entry
12764 my $name = $alias->name;
12765
12766 my $entry; # Holds the left column, may include extras
12767 my $entry_ref; # To refer to the left column's contents from
12768 # another entry; has no extras
12769
12770 # First the left column of the pod entry. Tables for the $perl
12771 # property always use the single form.
12772 if ($table_property == $perl) {
12773 $entry = "\\p{$name}";
12774 $entry_ref = "\\p{$name}";
12775 }
12776 else { # Compound form.
12777
12778 # Only generate one entry for all the aliases that mean true
12779 # or false in binary properties. Append a '*' to indicate
12780 # some are missing. (The heading comment notes this.)
12781 my $wild_card_mark;
12782 if ($type == $BINARY) {
12783 next if $name ne 'N' && $name ne 'Y';
12784 $wild_card_mark = '*';
12785 }
12786 else {
12787 $wild_card_mark = "";
12788 }
12789
12790 # Colon-space is used to give a little more space to be easier
12791 # to read;
12792 $entry = "\\p{"
12793 . $table_property_full_name
12794 . ": $name$wild_card_mark}";
12795
12796 # But for the reference to this entry, which will go in the
12797 # right column, where space is at a premium, use equals
12798 # without a space
12799 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
12800 }
12801
12802 # Then the right (info) column. This is stored as components of
12803 # an array for the moment, then joined into a string later. For
12804 # non-internal only properties, begin the info with the entry for
12805 # the first table we encountered (if any), as things are ordered
12806 # so that that one is the most descriptive. This leads to the
12807 # info column of an entry being a more descriptive version of the
12808 # name column
12809 my @info;
12810 if ($name =~ /^_/) {
12811 push @info,
12812 '(For internal use by Perl, not necessarily stable)';
12813 }
12814 elsif ($entry_for_first_alias) {
12815 push @info, $entry_for_first_alias;
12816 }
12817
12818 # If this entry is equivalent to another, add that to the info,
12819 # using the first such table we encountered
12820 if ($entry_for_first_table) {
12821 if (@info) {
12822 push @info, "(= $entry_for_first_table)";
12823 }
12824 else {
12825 push @info, $entry_for_first_table;
12826 }
12827 }
12828
12829 # If the name is a large integer, add an equivalent with an
12830 # exponent for better readability
12831 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12832 push @info, sprintf "(= %.1e)", $name
12833 }
12834
12835 my $parenthesized = "";
12836 if (! $entry_for_first_alias) {
12837
12838 # This is the first alias for the current table. The alias
12839 # array is ordered so that this is the fullest, most
12840 # descriptive alias, so it gets the fullest info. The other
12841 # aliases are mostly merely pointers to this one, using the
12842 # information already added above.
12843
12844 # Display any status message, but only on the parent table
12845 if ($status && ! $entry_for_first_table) {
12846 push @info, $status_info;
12847 }
12848
12849 # Put out any descriptive info
12850 if (@table_description || @table_note) {
12851 push @info, join "; ", @table_description, @table_note;
12852 }
12853
12854 # Look to see if there is a shorter name we can point people
12855 # at
12856 my $standard_name = standardize($name);
12857 my $short_name;
12858 my $proposed_short = $table->short_name;
12859 if (defined $proposed_short) {
12860 my $standard_short = standardize($proposed_short);
12861
12862 # If the short name is shorter than the standard one, or
12863 # even it it's not, but the combination of it and its
12864 # short property name (as in \p{prop=short} ($perl doesn't
12865 # have this form)) saves at least two characters, then,
12866 # cause it to be listed as a shorter synonym.
12867 if (length $standard_short < length $standard_name
12868 || ($table_property != $perl
12869 && (length($standard_short)
12870 - length($standard_name)
12871 + $table_property_short_delta) # (<= 0)
12872 < -2))
12873 {
12874 $short_name = $proposed_short;
12875 if ($table_property != $perl) {
12876 $short_name = $table_property_short_name
12877 . "=$short_name";
12878 }
12879 $short_name = "\\p{$short_name}";
12880 }
12881 }
12882
12883 # And if this is a compound form name, see if there is a
12884 # single form equivalent
12885 my $single_form;
12886 if ($table_property != $perl) {
12887
12888 # Special case the binary N tables, so that will print
12889 # \P{single}, but use the Y table values to populate
c12f2655 12890 # 'single', as we haven't likewise populated the N table.
99870f4d
KW
12891 my $test_table;
12892 my $p;
12893 if ($type == $BINARY
12894 && $input_table == $property->table('No'))
12895 {
12896 $test_table = $property->table('Yes');
12897 $p = 'P';
12898 }
12899 else {
12900 $test_table = $input_table;
12901 $p = 'p';
12902 }
12903
12904 # Look for a single form amongst all the children.
12905 foreach my $table ($test_table->children) {
12906 next if $table->property != $perl;
12907 my $proposed_name = $table->short_name;
12908 next if ! defined $proposed_name;
12909
12910 # Don't mention internal-only properties as a possible
12911 # single form synonym
12912 next if substr($proposed_name, 0, 1) eq '_';
12913
12914 $proposed_name = "\\$p\{$proposed_name}";
12915 if (! defined $single_form
12916 || length($proposed_name) < length $single_form)
12917 {
12918 $single_form = $proposed_name;
12919
12920 # The goal here is to find a single form; not the
12921 # shortest possible one. We've already found a
12922 # short name. So, stop at the first single form
12923 # found, which is likely to be closer to the
12924 # original.
12925 last;
12926 }
12927 }
12928 }
12929
12930 # Ouput both short and single in the same parenthesized
12931 # expression, but with only one of 'Single', 'Short' if there
12932 # are both items.
12933 if ($short_name || $single_form || $table->conflicting) {
99870f4d
KW
12934 $parenthesized .= "Short: $short_name" if $short_name;
12935 if ($short_name && $single_form) {
12936 $parenthesized .= ', ';
12937 }
12938 elsif ($single_form) {
12939 $parenthesized .= 'Single: ';
12940 }
12941 $parenthesized .= $single_form if $single_form;
12942 }
12943 }
12944
56ca34ca
KW
12945 if ($caseless_equivalent != 0) {
12946 $parenthesized .= '; ' if $parenthesized ne "";
12947 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
12948 }
12949
99870f4d
KW
12950
12951 # Warn if this property isn't the same as one that a
12952 # semi-casual user might expect. The other components of this
12953 # parenthesized structure are calculated only for the first entry
12954 # for this table, but the conflicting is deemed important enough
12955 # to go on every entry.
12956 my $conflicting = join " NOR ", $table->conflicting;
12957 if ($conflicting) {
e5228720 12958 $parenthesized .= '; ' if $parenthesized ne "";
99870f4d
KW
12959 $parenthesized .= "NOT $conflicting";
12960 }
99870f4d 12961
e5228720 12962 push @info, "($parenthesized)" if $parenthesized;
d57ccc9a 12963
0f88d393
KW
12964 if ($name =~ /_$/ && $alias->loose_match) {
12965 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
12966 }
12967
d57ccc9a
KW
12968 if ($table_property != $perl && $table->perl_extension) {
12969 push @info, '(Perl extension)';
12970 }
2cf724d4 12971 push @info, "($string_count)";
99870f4d
KW
12972
12973 # Now, we have both the entry and info so add them to the
12974 # list of all the properties.
12975 push @match_properties,
12976 format_pod_line($indent_info_column,
12977 $entry,
12978 join( " ", @info),
12979 $alias->status,
12980 $alias->loose_match);
12981
12982 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12983 } # End of looping through the aliases for this table.
12984
12985 if (! $entry_for_first_table) {
12986 $entry_for_first_table = $entry_for_first_alias;
12987 }
12988 } # End of looping through all the related tables
12989 return;
12990}
12991
12992sub pod_alphanumeric_sort {
12993 # Sort pod entries alphanumerically.
12994
99f78760
KW
12995 # The first few character columns are filler, plus the '\p{'; and get rid
12996 # of all the trailing stuff, starting with the trailing '}', so as to sort
12997 # on just 'Name=Value'
12998 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 12999 $a =~ s/}.*//;
99f78760 13000 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
13001 $b =~ s/}.*//;
13002
99f78760
KW
13003 # Determine if the two operands are both internal only or both not.
13004 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
13005 # should be the underscore that begins internal only
13006 my $a_is_internal = (substr($a, 0, 1) eq '_');
13007 my $b_is_internal = (substr($b, 0, 1) eq '_');
13008
13009 # Sort so the internals come last in the table instead of first (which the
13010 # leading underscore would otherwise indicate).
13011 if ($a_is_internal != $b_is_internal) {
13012 return 1 if $a_is_internal;
13013 return -1
13014 }
13015
99870f4d 13016 # Determine if the two operands are numeric property values or not.
99f78760 13017 # A numeric property will look like xyz: 3. But the number
99870f4d 13018 # can begin with an optional minus sign, and may have a
99f78760 13019 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
13020 # isn't numeric, use alphabetic sort.
13021 my ($a_initial, $a_number) =
99f78760 13022 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13023 return $a cmp $b unless defined $a_number;
13024 my ($b_initial, $b_number) =
99f78760 13025 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
13026 return $a cmp $b unless defined $b_number;
13027
13028 # Here they are both numeric, but use alphabetic sort if the
13029 # initial parts don't match
13030 return $a cmp $b if $a_initial ne $b_initial;
13031
13032 # Convert rationals to floating for the comparison.
13033 $a_number = eval $a_number if $a_number =~ qr{/};
13034 $b_number = eval $b_number if $b_number =~ qr{/};
13035
13036 return $a_number <=> $b_number;
13037}
13038
13039sub make_pod () {
13040 # Create the .pod file. This generates the various subsections and then
13041 # combines them in one big HERE document.
13042
13043 return unless defined $pod_directory;
13044 print "Making pod file\n" if $verbosity >= $PROGRESS;
13045
13046 my $exception_message =
13047 '(Any exceptions are individually noted beginning with the word NOT.)';
13048 my @block_warning;
13049 if (-e 'Blocks.txt') {
13050
13051 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
13052 # if the global $has_In_conflicts indicates we have them.
13053 push @match_properties, format_pod_line($indent_info_column,
13054 '\p{In_*}',
13055 '\p{Block: *}'
13056 . (($has_In_conflicts)
13057 ? " $exception_message"
13058 : ""));
13059 @block_warning = << "END";
13060
77173124
KW
13061Matches in the Block property have shortcuts that begin with "In_". For
13062example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
13063backward compatibility, if there is no conflict with another shortcut, these
13064may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
13065are numerous such conflicting shortcuts. Use of these forms for Block is
13066discouraged, and are flagged as such, not only because of the potential
13067confusion as to what is meant, but also because a later release of Unicode may
13068preempt the shortcut, and your program would no longer be correct. Use the
13069"In_" form instead to avoid this, or even more clearly, use the compound form,
13070e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
13071about this.
99870f4d
KW
13072END
13073 }
77173124 13074 my $text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
99870f4d
KW
13075 $text = "$exception_message $text" if $has_Is_conflicts;
13076
13077 # And the 'Is_ line';
13078 push @match_properties, format_pod_line($indent_info_column,
13079 '\p{Is_*}',
13080 "\\p{*} $text");
13081
13082 # Sort the properties array for output. It is sorted alphabetically
13083 # except numerically for numeric properties, and only output unique lines.
13084 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13085
13086 my $formatted_properties = simple_fold(\@match_properties,
13087 "",
13088 # indent succeeding lines by two extra
13089 # which looks better
13090 $indent_info_column + 2,
13091
13092 # shorten the line length by how much
13093 # the formatter indents, so the folded
13094 # line will fit in the space
13095 # presumably available
13096 $automatic_pod_indent);
13097 # Add column headings, indented to be a little more centered, but not
13098 # exactly
13099 $formatted_properties = format_pod_line($indent_info_column,
13100 ' NAME',
13101 ' INFO')
13102 . "\n"
13103 . $formatted_properties;
13104
13105 # Generate pod documentation lines for the tables that match nothing
13106 my $zero_matches;
13107 if (@zero_match_tables) {
13108 @zero_match_tables = uniques(@zero_match_tables);
13109 $zero_matches = join "\n\n",
13110 map { $_ = '=item \p{' . $_->complete_name . "}" }
13111 sort { $a->complete_name cmp $b->complete_name }
13112 uniques(@zero_match_tables);
13113
13114 $zero_matches = <<END;
13115
77173124 13116=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
99870f4d
KW
13117
13118Unicode has some property-value pairs that currently don't match anything.
c12f2655
KW
13119This happens generally either because they are obsolete, or they exist for
13120symmetry with other forms, but no language has yet been encoded that uses
13121them. In this version of Unicode, the following match zero code points:
99870f4d
KW
13122
13123=over 4
13124
13125$zero_matches
13126
13127=back
13128
13129END
13130 }
13131
13132 # Generate list of properties that we don't accept, grouped by the reasons
13133 # why. This is so only put out the 'why' once, and then list all the
13134 # properties that have that reason under it.
13135
13136 my %why_list; # The keys are the reasons; the values are lists of
13137 # properties that have the key as their reason
13138
13139 # For each property, add it to the list that are suppressed for its reason
13140 # The sort will cause the alphabetically first properties to be added to
13141 # each list first, so each list will be sorted.
13142 foreach my $property (sort keys %why_suppressed) {
13143 push @{$why_list{$why_suppressed{$property}}}, $property;
13144 }
13145
13146 # For each reason (sorted by the first property that has that reason)...
13147 my @bad_re_properties;
13148 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13149 keys %why_list)
13150 {
13151 # Add to the output, all the properties that have that reason. Start
13152 # with an empty line.
13153 push @bad_re_properties, "\n\n";
13154
13155 my $has_item = 0; # Flag if actually output anything.
13156 foreach my $name (@{$why_list{$why}}) {
13157
13158 # Split compound names into $property and $table components
13159 my $property = $name;
13160 my $table;
13161 if ($property =~ / (.*) = (.*) /x) {
13162 $property = $1;
13163 $table = $2;
13164 }
13165
13166 # This release of Unicode may not have a property that is
13167 # suppressed, so don't reference a non-existent one.
13168 $property = property_ref($property);
13169 next if ! defined $property;
13170
13171 # And since this list is only for match tables, don't list the
13172 # ones that don't have match tables.
13173 next if ! $property->to_create_match_tables;
13174
13175 # Find any abbreviation, and turn it into a compound name if this
13176 # is a property=value pair.
13177 my $short_name = $property->name;
13178 $short_name .= '=' . $property->table($table)->name if $table;
13179
13180 # And add the property as an item for the reason.
13181 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13182 $has_item = 1;
13183 }
13184
13185 # And add the reason under the list of properties, if such a list
13186 # actually got generated. Note that the header got added
13187 # unconditionally before. But pod ignores extra blank lines, so no
13188 # harm.
13189 push @bad_re_properties, "\n$why\n" if $has_item;
13190
13191 } # End of looping through each reason.
13192
13193 # Generate a list of the properties whose map table we output, from the
13194 # global @map_properties.
13195 my @map_tables_actually_output;
13196 my $info_indent = 20; # Left column is narrower than \p{} table.
13197 foreach my $property (@map_properties) {
13198
13199 # Get the path to the file; don't output any not in the standard
13200 # directory.
13201 my @path = $property->file_path;
13202 next if $path[0] ne $map_directory;
8572ace0
KW
13203
13204 # Don't mention map tables that are for internal-use only
13205 next if $property->to_output_map == $INTERNAL_MAP;
13206
99870f4d
KW
13207 shift @path; # Remove the standard name
13208
13209 my $file = join '/', @path; # In case is in sub directory
13210 my $info = $property->full_name;
13211 my $short_name = $property->name;
13212 if ($info ne $short_name) {
13213 $info .= " ($short_name)";
13214 }
13215 foreach my $more_info ($property->description,
13216 $property->note,
13217 $property->status_info)
13218 {
13219 next unless $more_info;
13220 $info =~ s/\.\Z//;
13221 $info .= ". $more_info";
13222 }
13223 push @map_tables_actually_output, format_pod_line($info_indent,
13224 $file,
13225 $info,
13226 $property->status);
13227 }
13228
13229 # Sort alphabetically, and fold for output
13230 @map_tables_actually_output = sort
13231 pod_alphanumeric_sort @map_tables_actually_output;
13232 @map_tables_actually_output
13233 = simple_fold(\@map_tables_actually_output,
13234 ' ',
13235 $info_indent,
13236 $automatic_pod_indent);
13237
13238 # Generate a list of the formats that can appear in the map tables.
13239 my @map_table_formats;
13240 foreach my $format (sort keys %map_table_formats) {
12916dad 13241 push @map_table_formats, " $format $map_table_formats{$format}\n";
99870f4d
KW
13242 }
13243
12916dad
MS
13244 local $" = "";
13245
99870f4d
KW
13246 # Everything is ready to assemble.
13247 my @OUT = << "END";
13248=begin comment
13249
13250$HEADER
13251
13252To change this file, edit $0 instead.
13253
13254=end comment
13255
13256=head1 NAME
13257
51f494cc 13258$pod_file - Index of Unicode Version $string_version properties in Perl
99870f4d
KW
13259
13260=head1 DESCRIPTION
13261
13262There are many properties in Unicode, and Perl provides access to almost all of
13263them, as well as some additional extensions and short-cut synonyms.
13264
13265And just about all of the few that aren't accessible through the Perl
77173124
KW
13266core are accessible through the modules: L<Unicode::Normalize> and
13267L<Unicode::UCD>, and for Unihan properties, via the CPAN module
13268L<Unicode::Unihan>.
99870f4d
KW
13269
13270This document merely lists all available properties and does not attempt to
13271explain what each property really means. There is a brief description of each
13272Perl extension. There is some detail about Blocks, Scripts, General_Category,
13273and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
13274Unicode properties, refer to the Unicode standard. A good starting place is
13275L<$unicode_reference_url>. More information on the Perl extensions is in
78bb419c 13276L<perlunicode/Other Properties>.
99870f4d
KW
13277
13278Note that you can define your own properties; see
13279L<perlunicode/"User-Defined Character Properties">.
13280
77173124 13281=head1 Properties accessible through C<\\p{}> and C<\\P{}>
99870f4d 13282
77173124
KW
13283The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13284most of the Unicode character properties. The table below shows all these
13285constructs, both single and compound forms.
99870f4d
KW
13286
13287B<Compound forms> consist of two components, separated by an equals sign or a
13288colon. The first component is the property name, and the second component is
13289the particular value of the property to match against, for example,
77173124 13290C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
99870f4d
KW
13291whose Script property is Greek.
13292
77173124 13293B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
99870f4d 13294their equivalent compound forms. The table shows these equivalences. (In our
77173124 13295example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
99870f4d 13296There are also a few Perl-defined single forms that are not shortcuts for a
77173124 13297compound form. One such is C<\\p{Word}>. These are also listed in the table.
99870f4d
KW
13298
13299In parsing these constructs, Perl always ignores Upper/lower case differences
77173124
KW
13300everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
13301C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
13302the left brace completely changes the meaning of the construct, from "match"
13303(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
13304for improved legibility.
99870f4d
KW
13305
13306Also, white space, hyphens, and underscores are also normally ignored
13307everywhere between the {braces}, and hence can be freely added or removed
13308even if the C</x> modifier hasn't been specified on the regular expression.
13309But $a_bold_stricter at the beginning of an entry in the table below
13310means that tighter (stricter) rules are used for that entry:
13311
13312=over 4
13313
77173124 13314=item Single form (C<\\p{name}>) tighter rules:
99870f4d
KW
13315
13316White space, hyphens, and underscores ARE significant
13317except for:
13318
13319=over 4
13320
13321=item * white space adjacent to a non-word character
13322
13323=item * underscores separating digits in numbers
13324
13325=back
13326
13327That means, for example, that you can freely add or remove white space
13328adjacent to (but within) the braces without affecting the meaning.
13329
77173124 13330=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
99870f4d
KW
13331
13332The tighter rules given above for the single form apply to everything to the
13333right of the colon or equals; the looser rules still apply to everything to
13334the left.
13335
13336That means, for example, that you can freely add or remove white space
13337adjacent to (but within) the braces and the colon or equal sign.
13338
13339=back
13340
78bb419c
KW
13341Some properties are considered obsolete by Unicode, but still available.
13342There are several varieties of obsolescence:
99870f4d
KW
13343
13344=over 4
13345
99870f4d
KW
13346=item Stabilized
13347
5f7264c7
KW
13348Obsolete properties may be stabilized. Such a determination does not indicate
13349that the property should or should not be used; instead it is a declaration
13350that the property will not be maintained nor extended for newly encoded
13351characters. Such properties are marked with $a_bold_stabilized in the
13352table.
99870f4d
KW
13353
13354=item Deprecated
13355
5f7264c7 13356An obsolete property may be deprecated, perhaps because its original intent
78bb419c
KW
13357has been replaced by another property, or because its specification was
13358somehow defective. This means that its use is strongly
99870f4d
KW
13359discouraged, so much so that a warning will be issued if used, unless the
13360regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13361statement. $A_bold_deprecated flags each such entry in the table, and
13362the entry there for the longest, most descriptive version of the property will
13363give the reason it is deprecated, and perhaps advice. Perl may issue such a
13364warning, even for properties that aren't officially deprecated by Unicode,
13365when there used to be characters or code points that were matched by them, but
13366no longer. This is to warn you that your program may not work like it did on
13367earlier Unicode releases.
13368
13369A deprecated property may be made unavailable in a future Perl version, so it
13370is best to move away from them.
13371
c12f2655
KW
13372A deprecated property may also be stabilized, but this fact is not shown.
13373
13374=item Obsolete
13375
13376Properties marked with $a_bold_obsolete in the table are considered (plain)
13377obsolete. Generally this designation is given to properties that Unicode once
13378used for internal purposes (but not any longer).
13379
99870f4d
KW
13380=back
13381
13382Some Perl extensions are present for backwards compatibility and are
c12f2655
KW
13383discouraged from being used, but are not obsolete. $A_bold_discouraged
13384flags each such entry in the table. Future Unicode versions may force
13385some of these extensions to be removed without warning, replaced by another
13386property with the same name that means something different. Use the
13387equivalent shown instead.
99870f4d
KW
13388
13389@block_warning
13390
77173124 13391The table below has two columns. The left column contains the C<\\p{}>
98dc9551 13392constructs to look up, possibly preceded by the flags mentioned above; and
99870f4d
KW
13393the right column contains information about them, like a description, or
13394synonyms. It shows both the single and compound forms for each property that
13395has them. If the left column is a short name for a property, the right column
13396will give its longer, more descriptive name; and if the left column is the
13397longest name, the right column will show any equivalent shortest name, in both
13398single and compound forms if applicable.
13399
13400The right column will also caution you if a property means something different
13401than what might normally be expected.
13402
d57ccc9a
KW
13403All single forms are Perl extensions; a few compound forms are as well, and
13404are noted as such.
13405
99870f4d
KW
13406Numbers in (parentheses) indicate the total number of code points matched by
13407the property. For emphasis, those properties that match no code points at all
13408are listed as well in a separate section following the table.
13409
56ca34ca
KW
13410Most properties match the same code points regardless of whether C<"/i">
13411case-insensitive matching is specified or not. But a few properties are
13412affected. These are shown with the notation
13413
13414 (/i= other_property)
13415
13416in the second column. Under case-insensitive matching they match the
13417same code pode points as the property "other_property".
13418
99870f4d 13419There is no description given for most non-Perl defined properties (See
77173124 13420L<$unicode_reference_url> for that).
d73e5302 13421
99870f4d
KW
13422For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13423combinations. For example, entries like:
d73e5302 13424
99870f4d 13425 \\p{Gc: *} \\p{General_Category: *}
5beb625e 13426
99870f4d
KW
13427mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13428for the latter is also valid for the former. Similarly,
5beb625e 13429
99870f4d 13430 \\p{Is_*} \\p{*}
5beb625e 13431
77173124
KW
13432means that if and only if, for example, C<\\p{Foo}> exists, then
13433C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13434And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13435C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
13436underscore.
5beb625e 13437
99870f4d
KW
13438Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13439And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
13440'N*' to indicate this, and doesn't have separate entries for the other
13441possibilities. Note that not all properties which have values 'Yes' and 'No'
13442are binary, and they have all their values spelled out without using this wild
13443card, and a C<NOT> clause in their description that highlights their not being
13444binary. These also require the compound form to match them, whereas true
13445binary properties have both single and compound forms available.
5beb625e 13446
99870f4d
KW
13447Note that all non-essential underscores are removed in the display of the
13448short names below.
5beb625e 13449
c12f2655 13450B<Legend summary:>
5beb625e 13451
99870f4d 13452=over 4
cf25bb62 13453
21405004 13454=item Z<>B<*> is a wild-card
cf25bb62 13455
99870f4d
KW
13456=item B<(\\d+)> in the info column gives the number of code points matched by
13457this property.
cf25bb62 13458
99870f4d 13459=item B<$DEPRECATED> means this is deprecated.
cf25bb62 13460
99870f4d 13461=item B<$OBSOLETE> means this is obsolete.
cf25bb62 13462
99870f4d 13463=item B<$STABILIZED> means this is stabilized.
cf25bb62 13464
99870f4d 13465=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 13466
c12f2655
KW
13467=item B<$DISCOURAGED> means use of this form is discouraged, and may not be
13468stable.
5beb625e 13469
99870f4d 13470=back
da7fcca4 13471
99870f4d 13472$formatted_properties
cf25bb62 13473
99870f4d 13474$zero_matches
cf25bb62 13475
99870f4d 13476=head1 Properties not accessible through \\p{} and \\P{}
cf25bb62 13477
99870f4d
KW
13478A few properties are accessible in Perl via various function calls only.
13479These are:
78bb419c 13480
99870f4d
KW
13481 Lowercase_Mapping lc() and lcfirst()
13482 Titlecase_Mapping ucfirst()
13483 Uppercase_Mapping uc()
12ac2576 13484
77173124 13485Case_Folding is accessible through the C</i> modifier in regular expressions.
cf25bb62 13486
77173124 13487The Name property is accessible through the C<\\N{}> interpolation in
99870f4d 13488double-quoted strings and regular expressions, but both usages require a C<use
fb121860
KW
13489charnames;> to be specified, which also contains related functions viacode(),
13490vianame(), and string_vianame().
cf25bb62 13491
99870f4d 13492=head1 Unicode regular expression properties that are NOT accepted by Perl
d2d499f5 13493
99870f4d
KW
13494Perl will generate an error for a few character properties in Unicode when
13495used in a regular expression. The non-Unihan ones are listed below, with the
13496reasons they are not accepted, perhaps with work-arounds. The short names for
13497the properties are listed enclosed in (parentheses).
c12f2655
KW
13498As described after the list, an installation can change the defaults and choose
13499to accept any of these. The list is machine generated based on the
13500choices made for the installation that generated this document.
ae6979a8 13501
99870f4d 13502=over 4
ae6979a8 13503
99870f4d 13504@bad_re_properties
a3a8c5f0 13505
99870f4d 13506=back
a3a8c5f0 13507
b7986f4f
KW
13508An installation can choose to allow any of these to be matched by downloading
13509the Unicode database from L<http://www.unicode.org/Public/> to
f3514a2f
KW
13510C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
13511controlling lists contained in the program
13512C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
13513(C<\%Config> is available from the Config module).
d73e5302 13514
99870f4d 13515=head1 Files in the I<To> directory (for serious hackers only)
12ac2576 13516
99870f4d
KW
13517All Unicode properties are really mappings (in the mathematical sense) from
13518code points to their respective values. As part of its build process,
13519Perl constructs tables containing these mappings for all properties that it
50b27e73 13520deals with. Some, but not all, of these are written out into files.
99870f4d 13521Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
77173124 13522(C<%Config> is available from the C<Config> module).
7ebf06b3 13523
50b27e73
KW
13524Perl reserves the right to change the format and even the existence of any of
13525those files without notice, except the ones that were in existence prior to
c6d31e50 13526release 5.14. If those change, a deprecation cycle will be done first. These
50b27e73 13527are:
12ac2576 13528
99870f4d 13529@map_tables_actually_output
12ac2576 13530
ec2f0128
KW
13531Each of the files in this directory defines several hash entries to help
13532reading programs decipher it. One of them looks like this:
12ac2576 13533
99870f4d 13534 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
d73e5302 13535
77173124
KW
13536where "NAME" is a name to indicate the property. For backwards compatibility,
13537this is not necessarily the property's official Unicode name. (The "To" is
99870f4d
KW
13538also for backwards compatibility.) The hash entry gives the format of the
13539mapping fields of the table, currently one of the following:
d73e5302 13540
12916dad 13541@map_table_formats
d73e5302 13542
99870f4d
KW
13543This format applies only to the entries in the main body of the table.
13544Entries defined in hashes or ones that are missing from the list can have a
13545different format.
d73e5302 13546
ec2f0128 13547The value that the missing entries have is given by another SwashInfo hash
99870f4d 13548entry line; it looks like this:
d73e5302 13549
99870f4d 13550 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
d73e5302 13551
99870f4d 13552This example line says that any Unicode code points not explicitly listed in
77173124 13553the file have the value "NaN" under the property indicated by NAME. If the
99870f4d
KW
13554value is the special string C<< <code point> >>, it means that the value for
13555any missing code point is the code point itself. This happens, for example,
13556in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
77173124 13557character "A", are missing because the uppercase of "A" is itself.
d73e5302 13558
ec2f0128
KW
13559Finally, if the file contains a hash for special case entries, its name is
13560specified by an entry that looks like this:
13561
13562 \$utf8::SwashInfo{'ToNAME'}{'specials_name'} = 'utf8::ToSpecNAME';
13563
99870f4d 13564=head1 SEE ALSO
d73e5302 13565
99870f4d 13566L<$unicode_reference_url>
12ac2576 13567
99870f4d 13568L<perlrecharclass>
12ac2576 13569
99870f4d 13570L<perlunicode>
d73e5302 13571
99870f4d 13572END
d73e5302 13573
9218f1cf
KW
13574 # And write it. The 0 means no utf8.
13575 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
99870f4d
KW
13576 return;
13577}
d73e5302 13578
99870f4d
KW
13579sub make_Heavy () {
13580 # Create and write Heavy.pl, which passes info about the tables to
13581 # utf8_heavy.pl
12ac2576 13582
99870f4d
KW
13583 my @heavy = <<END;
13584$HEADER
13585$INTERNAL_ONLY
d73e5302 13586
99870f4d 13587# This file is for the use of utf8_heavy.pl
12ac2576 13588
c12f2655
KW
13589# Maps Unicode (not Perl single-form extensions) property names in loose
13590# standard form to their corresponding standard names
99870f4d
KW
13591\%utf8::loose_property_name_of = (
13592END
cf25bb62 13593
99870f4d
KW
13594 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
13595 push @heavy, <<END;
13596);
12ac2576 13597
99870f4d
KW
13598# Maps property, table to file for those using stricter matching
13599\%utf8::stricter_to_file_of = (
13600END
13601 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
13602 push @heavy, <<END;
13603);
12ac2576 13604
99870f4d
KW
13605# Maps property, table to file for those using loose matching
13606\%utf8::loose_to_file_of = (
13607END
13608 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
13609 push @heavy, <<END;
13610);
12ac2576 13611
99870f4d
KW
13612# Maps floating point to fractional form
13613\%utf8::nv_floating_to_rational = (
13614END
13615 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
13616 push @heavy, <<END;
13617);
12ac2576 13618
99870f4d
KW
13619# If a floating point number doesn't have enough digits in it to get this
13620# close to a fraction, it isn't considered to be that fraction even if all the
13621# digits it does have match.
13622\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 13623
99870f4d
KW
13624# Deprecated tables to generate a warning for. The key is the file containing
13625# the table, so as to avoid duplication, as many property names can map to the
13626# file, but we only need one entry for all of them.
13627\%utf8::why_deprecated = (
13628END
12ac2576 13629
99870f4d
KW
13630 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
13631 push @heavy, <<END;
13632);
12ac2576 13633
d867ccfb
KW
13634# A few properties have different behavior under /i matching. This maps the
13635# those to substitute files to use under /i.
13636\%utf8::caseless_equivalent = (
13637END
13638
d867ccfb
KW
13639 # We set the key to the file when we associated files with tables, but we
13640 # couldn't do the same for the value then, as we might not have the file
13641 # for the alternate table figured out at that time.
13642 foreach my $cased (keys %caseless_equivalent_to) {
13643 my @path = $caseless_equivalent_to{$cased}->file_path;
13644 my $path = join '/', @path[1, -1];
d867ccfb
KW
13645 $utf8::caseless_equivalent_to{$cased} = $path;
13646 }
13647 push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
13648 push @heavy, <<END;
13649);
13650
99870f4d
KW
136511;
13652END
12ac2576 13653
9218f1cf 13654 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
99870f4d 13655 return;
12ac2576
JP
13656}
13657
99870f4d
KW
13658sub write_all_tables() {
13659 # Write out all the tables generated by this program to files, as well as
13660 # the supporting data structures, pod file, and .t file.
13661
13662 my @writables; # List of tables that actually get written
13663 my %match_tables_to_write; # Used to collapse identical match tables
13664 # into one file. Each key is a hash function
13665 # result to partition tables into buckets.
13666 # Each value is an array of the tables that
13667 # fit in the bucket.
13668
13669 # For each property ...
13670 # (sort so that if there is an immutable file name, it has precedence, so
13671 # some other property can't come in and take over its file name. If b's
13672 # file name is defined, will return 1, meaning to take it first; don't
7fc6cb55
KW
13673 # care if both defined, as they had better be different anyway. And the
13674 # property named 'Perl' needs to be first (it doesn't have any immutable
13675 # file name) because empty properties are defined in terms of it's table
13676 # named 'Any'.)
99870f4d 13677 PROPERTY:
7fc6cb55
KW
13678 foreach my $property (sort { return -1 if $a == $perl;
13679 return 1 if $b == $perl;
13680 return defined $b->file
13681 } property_ref('*'))
13682 {
99870f4d
KW
13683 my $type = $property->type;
13684
13685 # And for each table for that property, starting with the mapping
13686 # table for it ...
13687 TABLE:
13688 foreach my $table($property,
13689
13690 # and all the match tables for it (if any), sorted so
13691 # the ones with the shortest associated file name come
13692 # first. The length sorting prevents problems of a
13693 # longer file taking a name that might have to be used
13694 # by a shorter one. The alphabetic sorting prevents
13695 # differences between releases
13696 sort { my $ext_a = $a->external_name;
13697 return 1 if ! defined $ext_a;
13698 my $ext_b = $b->external_name;
13699 return -1 if ! defined $ext_b;
a92d5c2e
KW
13700
13701 # But return the non-complement table before
13702 # the complement one, as the latter is defined
13703 # in terms of the former, and needs to have
13704 # the information for the former available.
13705 return 1 if $a->complement != 0;
13706 return -1 if $b->complement != 0;
13707
99870f4d
KW
13708 my $cmp = length $ext_a <=> length $ext_b;
13709
13710 # Return result if lengths not equal
13711 return $cmp if $cmp;
13712
13713 # Alphabetic if lengths equal
13714 return $ext_a cmp $ext_b
13715 } $property->tables
13716 )
13717 {
12ac2576 13718
99870f4d
KW
13719 # Here we have a table associated with a property. It could be
13720 # the map table (done first for each property), or one of the
13721 # other tables. Determine which type.
13722 my $is_property = $table->isa('Property');
13723
13724 my $name = $table->name;
13725 my $complete_name = $table->complete_name;
13726
13727 # See if should suppress the table if is empty, but warn if it
13728 # contains something.
13729 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
13730 keys %why_suppress_if_empty_warn_if_not;
13731
13732 # Calculate if this table should have any code points associated
13733 # with it or not.
13734 my $expected_empty =
13735
13736 # $perl should be empty, as well as properties that we just
13737 # don't do anything with
13738 ($is_property
13739 && ($table == $perl
13740 || grep { $complete_name eq $_ }
13741 @unimplemented_properties
13742 )
13743 )
13744
13745 # Match tables in properties we skipped populating should be
13746 # empty
13747 || (! $is_property && ! $property->to_create_match_tables)
13748
13749 # Tables and properties that are expected to have no code
13750 # points should be empty
13751 || $suppress_if_empty_warn_if_not
13752 ;
13753
13754 # Set a boolean if this table is the complement of an empty binary
13755 # table
13756 my $is_complement_of_empty_binary =
13757 $type == $BINARY &&
13758 (($table == $property->table('Y')
13759 && $property->table('N')->is_empty)
13760 || ($table == $property->table('N')
13761 && $property->table('Y')->is_empty));
13762
13763
13764 # Some tables should match everything
13765 my $expected_full =
13766 ($is_property)
13767 ? # All these types of map tables will be full because
13768 # they will have been populated with defaults
13769 ($type == $ENUM || $type == $BINARY)
13770
13771 : # A match table should match everything if its method
13772 # shows it should
13773 ($table->matches_all
13774
13775 # The complement of an empty binary table will match
13776 # everything
13777 || $is_complement_of_empty_binary
13778 )
13779 ;
13780
13781 if ($table->is_empty) {
13782
99870f4d
KW
13783 if ($suppress_if_empty_warn_if_not) {
13784 $table->set_status($SUPPRESSED,
13785 $why_suppress_if_empty_warn_if_not{$complete_name});
13786 }
12ac2576 13787
c12f2655 13788 # Suppress (by skipping them) expected empty tables.
99870f4d
KW
13789 next TABLE if $expected_empty;
13790
13791 # And setup to later output a warning for those that aren't
13792 # known to be allowed to be empty. Don't do the warning if
13793 # this table is a child of another one to avoid duplicating
13794 # the warning that should come from the parent one.
13795 if (($table == $property || $table->parent == $table)
13796 && $table->status ne $SUPPRESSED
13797 && ! grep { $complete_name =~ /^$_$/ }
13798 @tables_that_may_be_empty)
13799 {
13800 push @unhandled_properties, "$table";
13801 }
7fc6cb55
KW
13802
13803 # An empty table is just the complement of everything.
13804 $table->set_complement($Any) if $table != $property;
99870f4d
KW
13805 }
13806 elsif ($expected_empty) {
13807 my $because = "";
13808 if ($suppress_if_empty_warn_if_not) {
13809 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
13810 }
12ac2576 13811
99870f4d
KW
13812 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
13813 }
12ac2576 13814
99870f4d
KW
13815 my $count = $table->count;
13816 if ($expected_full) {
13817 if ($count != $MAX_UNICODE_CODEPOINTS) {
13818 Carp::my_carp("$table matches only "
13819 . clarify_number($count)
13820 . " Unicode code points but should match "
13821 . clarify_number($MAX_UNICODE_CODEPOINTS)
13822 . " (off by "
13823 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
13824 . "). Proceeding anyway.");
13825 }
12ac2576 13826
99870f4d
KW
13827 # Here is expected to be full. If it is because it is the
13828 # complement of an (empty) binary table that is to be
13829 # suppressed, then suppress this one as well.
13830 if ($is_complement_of_empty_binary) {
13831 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
13832 my $opposing = $property->table($opposing_name);
13833 my $opposing_status = $opposing->status;
13834 if ($opposing_status) {
13835 $table->set_status($opposing_status,
13836 $opposing->status_info);
13837 }
13838 }
13839 }
13840 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
13841 if ($table == $property || $table->leader == $table) {
13842 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
13843 }
13844 }
d73e5302 13845
99870f4d
KW
13846 if ($table->status eq $SUPPRESSED) {
13847 if (! $is_property) {
13848 my @children = $table->children;
13849 foreach my $child (@children) {
13850 if ($child->status ne $SUPPRESSED) {
13851 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
13852 }
13853 }
13854 }
13855 next TABLE;
d73e5302 13856
99870f4d
KW
13857 }
13858 if (! $is_property) {
13859
13860 # Several things need to be done just once for each related
13861 # group of match tables. Do them on the parent.
13862 if ($table->parent == $table) {
13863
13864 # Add an entry in the pod file for the table; it also does
13865 # the children.
23e33b60 13866 make_table_pod_entries($table) if defined $pod_directory;
99870f4d
KW
13867
13868 # See if the the table matches identical code points with
13869 # something that has already been output. In that case,
13870 # no need to have two files with the same code points in
13871 # them. We use the table's hash() method to store these
13872 # in buckets, so that it is quite likely that if two
13873 # tables are in the same bucket they will be identical, so
13874 # don't have to compare tables frequently. The tables
13875 # have to have the same status to share a file, so add
13876 # this to the bucket hash. (The reason for this latter is
13877 # that Heavy.pl associates a status with a file.)
13878 my $hash = $table->hash . ';' . $table->status;
13879
13880 # Look at each table that is in the same bucket as this
13881 # one would be.
13882 foreach my $comparison (@{$match_tables_to_write{$hash}})
13883 {
13884 if ($table->matches_identically_to($comparison)) {
13885 $table->set_equivalent_to($comparison,
13886 Related => 0);
13887 next TABLE;
13888 }
13889 }
d73e5302 13890
99870f4d
KW
13891 # Here, not equivalent, add this table to the bucket.
13892 push @{$match_tables_to_write{$hash}}, $table;
13893 }
13894 }
13895 else {
13896
13897 # Here is the property itself.
13898 # Don't write out or make references to the $perl property
13899 next if $table == $perl;
13900
13901 if ($type != $STRING) {
13902
13903 # There is a mapping stored of the various synonyms to the
13904 # standardized name of the property for utf8_heavy.pl.
13905 # Also, the pod file contains entries of the form:
13906 # \p{alias: *} \p{full: *}
13907 # rather than show every possible combination of things.
13908
13909 my @property_aliases = $property->aliases;
13910
13911 # The full name of this property is stored by convention
13912 # first in the alias array
13913 my $full_property_name =
13914 '\p{' . $property_aliases[0]->name . ': *}';
13915 my $standard_property_name = standardize($table->name);
13916
13917 # For each synonym ...
13918 for my $i (0 .. @property_aliases - 1) {
13919 my $alias = $property_aliases[$i];
13920 my $alias_name = $alias->name;
13921 my $alias_standard = standardize($alias_name);
13922
c12f2655 13923 # For utf8_heavy, set the mapping of the alias to the
99870f4d
KW
13924 # property
13925 if (exists ($loose_property_name_of{$alias_standard}))
13926 {
13927 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
13928 }
13929 else {
13930 $loose_property_name_of{$alias_standard}
13931 = $standard_property_name;
13932 }
13933
23e33b60
KW
13934 # Now for the pod entry for this alias. Skip if not
13935 # outputting a pod; skip the first one, which is the
13936 # full name so won't have an entry like: '\p{full: *}
13937 # \p{full: *}', and skip if don't want an entry for
13938 # this one.
13939 next if $i == 0
13940 || ! defined $pod_directory
13941 || ! $alias->make_pod_entry;
99870f4d 13942
d57ccc9a
KW
13943 my $rhs = $full_property_name;
13944 if ($property != $perl && $table->perl_extension) {
13945 $rhs .= ' (Perl extension)';
13946 }
99870f4d
KW
13947 push @match_properties,
13948 format_pod_line($indent_info_column,
13949 '\p{' . $alias->name . ': *}',
d57ccc9a 13950 $rhs,
99870f4d
KW
13951 $alias->status);
13952 }
13953 } # End of non-string-like property code
d73e5302 13954
d73e5302 13955
c12f2655 13956 # Don't write out a mapping file if not desired.
99870f4d
KW
13957 next if ! $property->to_output_map;
13958 }
d73e5302 13959
99870f4d
KW
13960 # Here, we know we want to write out the table, but don't do it
13961 # yet because there may be other tables that come along and will
13962 # want to share the file, and the file's comments will change to
13963 # mention them. So save for later.
13964 push @writables, $table;
13965
13966 } # End of looping through the property and all its tables.
13967 } # End of looping through all properties.
13968
13969 # Now have all the tables that will have files written for them. Do it.
13970 foreach my $table (@writables) {
13971 my @directory;
13972 my $filename;
13973 my $property = $table->property;
13974 my $is_property = ($table == $property);
13975 if (! $is_property) {
13976
13977 # Match tables for the property go in lib/$subdirectory, which is
13978 # the property's name. Don't use the standard file name for this,
13979 # as may get an unfamiliar alias
13980 @directory = ($matches_directory, $property->external_name);
13981 }
13982 else {
d73e5302 13983
99870f4d
KW
13984 @directory = $table->directory;
13985 $filename = $table->file;
13986 }
d73e5302 13987
98dc9551 13988 # Use specified filename if available, or default to property's
99870f4d
KW
13989 # shortest name. We need an 8.3 safe filename (which means "an 8
13990 # safe" filename, since after the dot is only 'pl', which is < 3)
13991 # The 2nd parameter is if the filename shouldn't be changed, and
13992 # it shouldn't iff there is a hard-coded name for this table.
13993 $filename = construct_filename(
13994 $filename || $table->external_name,
13995 ! $filename, # mutable if no filename
13996 \@directory);
d73e5302 13997
99870f4d 13998 register_file_for_name($table, \@directory, $filename);
d73e5302 13999
99870f4d
KW
14000 # Only need to write one file when shared by more than one
14001 # property
a92d5c2e
KW
14002 next if ! $is_property
14003 && ($table->leader != $table || $table->complement != 0);
d73e5302 14004
99870f4d
KW
14005 # Construct a nice comment to add to the file
14006 $table->set_final_comment;
14007
14008 $table->write;
cf25bb62 14009 }
d73e5302 14010
d73e5302 14011
99870f4d
KW
14012 # Write out the pod file
14013 make_pod;
14014
14015 # And Heavy.pl
14016 make_Heavy;
d73e5302 14017
99870f4d
KW
14018 make_property_test_script() if $make_test_script;
14019 return;
cf25bb62 14020}
d73e5302 14021
99870f4d
KW
14022my @white_space_separators = ( # This used only for making the test script.
14023 "",
14024 ' ',
14025 "\t",
14026 ' '
14027 );
d73e5302 14028
99870f4d
KW
14029sub generate_separator($) {
14030 # This used only for making the test script. It generates the colon or
14031 # equal separator between the property and property value, with random
14032 # white space surrounding the separator
d73e5302 14033
99870f4d 14034 my $lhs = shift;
d73e5302 14035
99870f4d 14036 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 14037
99870f4d
KW
14038 # Choose space before and after randomly
14039 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
14040 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 14041
99870f4d
KW
14042 # And return the whole complex, half the time using a colon, half the
14043 # equals
14044 return $spaces_before
14045 . (rand() < 0.5) ? '=' : ':'
14046 . $spaces_after;
14047}
76ccdbe2 14048
430ada4c 14049sub generate_tests($$$$$) {
99870f4d
KW
14050 # This used only for making the test script. It generates test cases that
14051 # are expected to compile successfully in perl. Note that the lhs and
14052 # rhs are assumed to already be as randomized as the caller wants.
14053
99870f4d
KW
14054 my $lhs = shift; # The property: what's to the left of the colon
14055 # or equals separator
14056 my $rhs = shift; # The property value; what's to the right
14057 my $valid_code = shift; # A code point that's known to be in the
14058 # table given by lhs=rhs; undef if table is
14059 # empty
14060 my $invalid_code = shift; # A code point known to not be in the table;
14061 # undef if the table is all code points
14062 my $warning = shift;
14063
14064 # Get the colon or equal
14065 my $separator = generate_separator($lhs);
14066
14067 # The whole 'property=value'
14068 my $name = "$lhs$separator$rhs";
14069
430ada4c 14070 my @output;
99870f4d
KW
14071 # Create a complete set of tests, with complements.
14072 if (defined $valid_code) {
430ada4c
NC
14073 push @output, <<"EOC"
14074Expect(1, $valid_code, '\\p{$name}', $warning);
14075Expect(0, $valid_code, '\\p{^$name}', $warning);
14076Expect(0, $valid_code, '\\P{$name}', $warning);
14077Expect(1, $valid_code, '\\P{^$name}', $warning);
14078EOC
99870f4d
KW
14079 }
14080 if (defined $invalid_code) {
430ada4c
NC
14081 push @output, <<"EOC"
14082Expect(0, $invalid_code, '\\p{$name}', $warning);
14083Expect(1, $invalid_code, '\\p{^$name}', $warning);
14084Expect(1, $invalid_code, '\\P{$name}', $warning);
14085Expect(0, $invalid_code, '\\P{^$name}', $warning);
14086EOC
14087 }
14088 return @output;
99870f4d 14089}
cf25bb62 14090
430ada4c 14091sub generate_error($$$) {
99870f4d
KW
14092 # This used only for making the test script. It generates test cases that
14093 # are expected to not only not match, but to be syntax or similar errors
14094
99870f4d
KW
14095 my $lhs = shift; # The property: what's to the left of the
14096 # colon or equals separator
14097 my $rhs = shift; # The property value; what's to the right
14098 my $already_in_error = shift; # Boolean; if true it's known that the
14099 # unmodified lhs and rhs will cause an error.
14100 # This routine should not force another one
14101 # Get the colon or equal
14102 my $separator = generate_separator($lhs);
14103
14104 # Since this is an error only, don't bother to randomly decide whether to
14105 # put the error on the left or right side; and assume that the rhs is
14106 # loosely matched, again for convenience rather than rigor.
14107 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
14108
14109 my $property = $lhs . $separator . $rhs;
14110
430ada4c
NC
14111 return <<"EOC";
14112Error('\\p{$property}');
14113Error('\\P{$property}');
14114EOC
d73e5302
JH
14115}
14116
99870f4d
KW
14117# These are used only for making the test script
14118# XXX Maybe should also have a bad strict seps, which includes underscore.
14119
14120my @good_loose_seps = (
14121 " ",
14122 "-",
14123 "\t",
14124 "",
14125 "_",
14126 );
14127my @bad_loose_seps = (
14128 "/a/",
14129 ':=',
14130 );
14131
14132sub randomize_stricter_name {
14133 # This used only for making the test script. Take the input name and
14134 # return a randomized, but valid version of it under the stricter matching
14135 # rules.
14136
14137 my $name = shift;
14138 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14139
14140 # If the name looks like a number (integer, floating, or rational), do
14141 # some extra work
14142 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
14143 my $sign = $1;
14144 my $number = $2;
14145 my $separator = $3;
14146
14147 # If there isn't a sign, part of the time add a plus
14148 # Note: Not testing having any denominator having a minus sign
14149 if (! $sign) {
14150 $sign = '+' if rand() <= .3;
14151 }
14152
14153 # And add 0 or more leading zeros.
14154 $name = $sign . ('0' x int rand(10)) . $number;
14155
14156 if (defined $separator) {
14157 my $extra_zeros = '0' x int rand(10);
cf25bb62 14158
99870f4d
KW
14159 if ($separator eq '.') {
14160
14161 # Similarly, add 0 or more trailing zeros after a decimal
14162 # point
14163 $name .= $extra_zeros;
14164 }
14165 else {
14166
14167 # Or, leading zeros before the denominator
14168 $name =~ s,/,/$extra_zeros,;
14169 }
14170 }
cf25bb62 14171 }
d73e5302 14172
99870f4d
KW
14173 # For legibility of the test, only change the case of whole sections at a
14174 # time. To do this, first split into sections. The split returns the
14175 # delimiters
14176 my @sections;
14177 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
14178 trace $section if main::DEBUG && $to_trace;
14179
14180 if (length $section > 1 && $section !~ /\D/) {
14181
14182 # If the section is a sequence of digits, about half the time
14183 # randomly add underscores between some of them.
14184 if (rand() > .5) {
14185
14186 # Figure out how many underscores to add. max is 1 less than
14187 # the number of digits. (But add 1 at the end to make sure
14188 # result isn't 0, and compensate earlier by subtracting 2
14189 # instead of 1)
14190 my $num_underscores = int rand(length($section) - 2) + 1;
14191
14192 # And add them evenly throughout, for convenience, not rigor
14193 use integer;
14194 my $spacing = (length($section) - 1)/ $num_underscores;
14195 my $temp = $section;
14196 $section = "";
14197 for my $i (1 .. $num_underscores) {
14198 $section .= substr($temp, 0, $spacing, "") . '_';
14199 }
14200 $section .= $temp;
14201 }
14202 push @sections, $section;
14203 }
14204 else {
d73e5302 14205
99870f4d
KW
14206 # Here not a sequence of digits. Change the case of the section
14207 # randomly
14208 my $switch = int rand(4);
14209 if ($switch == 0) {
14210 push @sections, uc $section;
14211 }
14212 elsif ($switch == 1) {
14213 push @sections, lc $section;
14214 }
14215 elsif ($switch == 2) {
14216 push @sections, ucfirst $section;
14217 }
14218 else {
14219 push @sections, $section;
14220 }
14221 }
cf25bb62 14222 }
99870f4d
KW
14223 trace "returning", join "", @sections if main::DEBUG && $to_trace;
14224 return join "", @sections;
14225}
71d929cb 14226
99870f4d
KW
14227sub randomize_loose_name($;$) {
14228 # This used only for making the test script
71d929cb 14229
99870f4d
KW
14230 my $name = shift;
14231 my $want_error = shift; # if true, make an error
14232 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14233
14234 $name = randomize_stricter_name($name);
5beb625e
JH
14235
14236 my @parts;
99870f4d 14237 push @parts, $good_loose_seps[rand(@good_loose_seps)];
45376db6
KW
14238
14239 # Preserve trailing ones for the sake of not stripping the underscore from
14240 # 'L_'
14241 for my $part (split /[-\s_]+ (?= . )/, $name) {
5beb625e 14242 if (@parts) {
99870f4d
KW
14243 if ($want_error and rand() < 0.3) {
14244 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
14245 $want_error = 0;
14246 }
14247 else {
14248 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
14249 }
14250 }
99870f4d 14251 push @parts, $part;
5beb625e 14252 }
99870f4d
KW
14253 my $new = join("", @parts);
14254 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 14255
99870f4d 14256 if ($want_error) {
5beb625e 14257 if (rand() >= 0.5) {
99870f4d
KW
14258 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
14259 }
14260 else {
14261 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
14262 }
14263 }
14264 return $new;
14265}
14266
99870f4d
KW
14267# Used to make sure don't generate duplicate test cases.
14268my %test_generated;
5beb625e 14269
99870f4d
KW
14270sub make_property_test_script() {
14271 # This used only for making the test script
14272 # this written directly -- it's huge.
5beb625e 14273
99870f4d 14274 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 14275
99870f4d
KW
14276 # This uses randomness to test different possibilities without testing all
14277 # possibilities. To ensure repeatability, set the seed to 0. But if
14278 # tests are added, it will perturb all later ones in the .t file
14279 srand 0;
5beb625e 14280
3df51b85
KW
14281 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
14282
99870f4d
KW
14283 # Keep going down an order of magnitude
14284 # until find that adding this quantity to
14285 # 1 remains 1; but put an upper limit on
14286 # this so in case this algorithm doesn't
14287 # work properly on some platform, that we
14288 # won't loop forever.
14289 my $digits = 0;
14290 my $min_floating_slop = 1;
14291 while (1+ $min_floating_slop != 1
14292 && $digits++ < 50)
5beb625e 14293 {
99870f4d
KW
14294 my $next = $min_floating_slop / 10;
14295 last if $next == 0; # If underflows,
14296 # use previous one
14297 $min_floating_slop = $next;
5beb625e 14298 }
430ada4c
NC
14299
14300 # It doesn't matter whether the elements of this array contain single lines
14301 # or multiple lines. main::write doesn't count the lines.
14302 my @output;
99870f4d
KW
14303
14304 foreach my $property (property_ref('*')) {
14305 foreach my $table ($property->tables) {
14306
14307 # Find code points that match, and don't match this table.
14308 my $valid = $table->get_valid_code_point;
14309 my $invalid = $table->get_invalid_code_point;
14310 my $warning = ($table->status eq $DEPRECATED)
14311 ? "'deprecated'"
14312 : '""';
14313
14314 # Test each possible combination of the property's aliases with
14315 # the table's. If this gets to be too many, could do what is done
14316 # in the set_final_comment() for Tables
14317 my @table_aliases = $table->aliases;
14318 my @property_aliases = $table->property->aliases;
807807b7
KW
14319
14320 # Every property can be optionally be prefixed by 'Is_', so test
14321 # that those work, by creating such a new alias for each
14322 # pre-existing one.
14323 push @property_aliases, map { Alias->new("Is_" . $_->name,
14324 $_->loose_match,
14325 $_->make_pod_entry,
14326 $_->externally_ok,
14327 $_->status)
14328 } @property_aliases;
99870f4d
KW
14329 my $max = max(scalar @table_aliases, scalar @property_aliases);
14330 for my $j (0 .. $max - 1) {
14331
14332 # The current alias for property is the next one on the list,
14333 # or if beyond the end, start over. Similarly for table
14334 my $property_name
14335 = $property_aliases[$j % @property_aliases]->name;
14336
14337 $property_name = "" if $table->property == $perl;
14338 my $table_alias = $table_aliases[$j % @table_aliases];
14339 my $table_name = $table_alias->name;
14340 my $loose_match = $table_alias->loose_match;
14341
14342 # If the table doesn't have a file, any test for it is
14343 # already guaranteed to be in error
14344 my $already_error = ! $table->file_path;
14345
14346 # Generate error cases for this alias.
430ada4c
NC
14347 push @output, generate_error($property_name,
14348 $table_name,
14349 $already_error);
99870f4d
KW
14350
14351 # If the table is guaranteed to always generate an error,
14352 # quit now without generating success cases.
14353 next if $already_error;
14354
14355 # Now for the success cases.
14356 my $random;
14357 if ($loose_match) {
14358
14359 # For loose matching, create an extra test case for the
14360 # standard name.
14361 my $standard = standardize($table_name);
14362
14363 # $test_name should be a unique combination for each test
14364 # case; used just to avoid duplicate tests
14365 my $test_name = "$property_name=$standard";
14366
14367 # Don't output duplicate test cases.
14368 if (! exists $test_generated{$test_name}) {
14369 $test_generated{$test_name} = 1;
430ada4c
NC
14370 push @output, generate_tests($property_name,
14371 $standard,
14372 $valid,
14373 $invalid,
14374 $warning,
14375 );
5beb625e 14376 }
99870f4d
KW
14377 $random = randomize_loose_name($table_name)
14378 }
14379 else { # Stricter match
14380 $random = randomize_stricter_name($table_name);
99598c8c 14381 }
99598c8c 14382
99870f4d
KW
14383 # Now for the main test case for this alias.
14384 my $test_name = "$property_name=$random";
14385 if (! exists $test_generated{$test_name}) {
14386 $test_generated{$test_name} = 1;
430ada4c
NC
14387 push @output, generate_tests($property_name,
14388 $random,
14389 $valid,
14390 $invalid,
14391 $warning,
14392 );
99870f4d
KW
14393
14394 # If the name is a rational number, add tests for the
14395 # floating point equivalent.
14396 if ($table_name =~ qr{/}) {
14397
14398 # Calculate the float, and find just the fraction.
14399 my $float = eval $table_name;
14400 my ($whole, $fraction)
14401 = $float =~ / (.*) \. (.*) /x;
14402
14403 # Starting with one digit after the decimal point,
14404 # create a test for each possible precision (number of
14405 # digits past the decimal point) until well beyond the
14406 # native number found on this machine. (If we started
14407 # with 0 digits, it would be an integer, which could
14408 # well match an unrelated table)
14409 PLACE:
14410 for my $i (1 .. $min_floating_slop + 3) {
14411 my $table_name = sprintf("%.*f", $i, $float);
14412 if ($i < $MIN_FRACTION_LENGTH) {
14413
14414 # If the test case has fewer digits than the
14415 # minimum acceptable precision, it shouldn't
14416 # succeed, so we expect an error for it.
14417 # E.g., 2/3 = .7 at one decimal point, and we
14418 # shouldn't say it matches .7. We should make
14419 # it be .667 at least before agreeing that the
14420 # intent was to match 2/3. But at the
14421 # less-than- acceptable level of precision, it
14422 # might actually match an unrelated number.
14423 # So don't generate a test case if this
14424 # conflating is possible. In our example, we
14425 # don't want 2/3 matching 7/10, if there is
14426 # a 7/10 code point.
14427 for my $existing
14428 (keys %nv_floating_to_rational)
14429 {
14430 next PLACE
14431 if abs($table_name - $existing)
14432 < $MAX_FLOATING_SLOP;
14433 }
430ada4c
NC
14434 push @output, generate_error($property_name,
14435 $table_name,
14436 1 # 1 => already an error
14437 );
99870f4d
KW
14438 }
14439 else {
14440
14441 # Here the number of digits exceeds the
14442 # minimum we think is needed. So generate a
14443 # success test case for it.
430ada4c
NC
14444 push @output, generate_tests($property_name,
14445 $table_name,
14446 $valid,
14447 $invalid,
14448 $warning,
14449 );
99870f4d
KW
14450 }
14451 }
99598c8c
JH
14452 }
14453 }
99870f4d
KW
14454 }
14455 }
14456 }
37e2e78e 14457
9218f1cf
KW
14458 &write($t_path,
14459 0, # Not utf8;
14460 [<DATA>,
14461 @output,
14462 (map {"Test_X('$_');\n"} @backslash_X_tests),
14463 "Finished();\n"]);
99870f4d
KW
14464 return;
14465}
99598c8c 14466
99870f4d
KW
14467# This is a list of the input files and how to handle them. The files are
14468# processed in their order in this list. Some reordering is possible if
14469# desired, but the v0 files should be first, and the extracted before the
14470# others except DAge.txt (as data in an extracted file can be over-ridden by
14471# the non-extracted. Some other files depend on data derived from an earlier
14472# file, like UnicodeData requires data from Jamo, and the case changing and
14473# folding requires data from Unicode. Mostly, it safest to order by first
14474# version releases in (except the Jamo). DAge.txt is read before the
14475# extracted ones because of the rarely used feature $compare_versions. In the
14476# unlikely event that there were ever an extracted file that contained the Age
14477# property information, it would have to go in front of DAge.
14478#
14479# The version strings allow the program to know whether to expect a file or
14480# not, but if a file exists in the directory, it will be processed, even if it
14481# is in a version earlier than expected, so you can copy files from a later
14482# release into an earlier release's directory.
14483my @input_file_objects = (
14484 Input_file->new('PropertyAliases.txt', v0,
14485 Handler => \&process_PropertyAliases,
14486 ),
14487 Input_file->new(undef, v0, # No file associated with this
3df51b85 14488 Progress_Message => 'Finishing property setup',
99870f4d
KW
14489 Handler => \&finish_property_setup,
14490 ),
14491 Input_file->new('PropValueAliases.txt', v0,
14492 Handler => \&process_PropValueAliases,
14493 Has_Missings_Defaults => $NOT_IGNORED,
14494 ),
14495 Input_file->new('DAge.txt', v3.2.0,
14496 Has_Missings_Defaults => $NOT_IGNORED,
14497 Property => 'Age'
14498 ),
14499 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
14500 Property => 'General_Category',
14501 ),
14502 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
14503 Property => 'Canonical_Combining_Class',
14504 Has_Missings_Defaults => $NOT_IGNORED,
14505 ),
14506 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
14507 Property => 'Numeric_Type',
14508 Has_Missings_Defaults => $NOT_IGNORED,
14509 ),
14510 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
14511 Property => 'East_Asian_Width',
14512 Has_Missings_Defaults => $NOT_IGNORED,
14513 ),
14514 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
14515 Property => 'Line_Break',
14516 Has_Missings_Defaults => $NOT_IGNORED,
14517 ),
14518 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
14519 Property => 'Bidi_Class',
14520 Has_Missings_Defaults => $NOT_IGNORED,
14521 ),
14522 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
14523 Property => 'Decomposition_Type',
14524 Has_Missings_Defaults => $NOT_IGNORED,
14525 ),
14526 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
14527 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
14528 Property => 'Numeric_Value',
14529 Each_Line_Handler => \&filter_numeric_value_line,
14530 Has_Missings_Defaults => $NOT_IGNORED,
14531 ),
14532 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
14533 Property => 'Joining_Group',
14534 Has_Missings_Defaults => $NOT_IGNORED,
14535 ),
14536
14537 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
14538 Property => 'Joining_Type',
14539 Has_Missings_Defaults => $NOT_IGNORED,
14540 ),
14541 Input_file->new('Jamo.txt', v2.0.0,
14542 Property => 'Jamo_Short_Name',
14543 Each_Line_Handler => \&filter_jamo_line,
14544 ),
14545 Input_file->new('UnicodeData.txt', v1.1.5,
14546 Pre_Handler => \&setup_UnicodeData,
14547
14548 # We clean up this file for some early versions.
14549 Each_Line_Handler => [ (($v_version lt v2.0.0 )
14550 ? \&filter_v1_ucd
14551 : ($v_version eq v2.1.5)
14552 ? \&filter_v2_1_5_ucd
3ffed8c2
KW
14553
14554 # And for 5.14 Perls with 6.0,
14555 # have to also make changes
14556 : ($v_version ge v6.0.0)
14557 ? \&filter_v6_ucd
14558 : undef),
99870f4d
KW
14559
14560 # And the main filter
14561 \&filter_UnicodeData_line,
14562 ],
14563 EOF_Handler => \&EOF_UnicodeData,
14564 ),
14565 Input_file->new('ArabicShaping.txt', v2.0.0,
14566 Each_Line_Handler =>
14567 [ ($v_version lt 4.1.0)
14568 ? \&filter_old_style_arabic_shaping
14569 : undef,
14570 \&filter_arabic_shaping_line,
14571 ],
14572 Has_Missings_Defaults => $NOT_IGNORED,
14573 ),
14574 Input_file->new('Blocks.txt', v2.0.0,
14575 Property => 'Block',
14576 Has_Missings_Defaults => $NOT_IGNORED,
14577 Each_Line_Handler => \&filter_blocks_lines
14578 ),
14579 Input_file->new('PropList.txt', v2.0.0,
14580 Each_Line_Handler => (($v_version lt v3.1.0)
14581 ? \&filter_old_style_proplist
14582 : undef),
14583 ),
14584 Input_file->new('Unihan.txt', v2.0.0,
14585 Pre_Handler => \&setup_unihan,
14586 Optional => 1,
14587 Each_Line_Handler => \&filter_unihan_line,
14588 ),
14589 Input_file->new('SpecialCasing.txt', v2.1.8,
14590 Each_Line_Handler => \&filter_special_casing_line,
14591 Pre_Handler => \&setup_special_casing,
14592 ),
14593 Input_file->new(
14594 'LineBreak.txt', v3.0.0,
14595 Has_Missings_Defaults => $NOT_IGNORED,
14596 Property => 'Line_Break',
14597 # Early versions had problematic syntax
14598 Each_Line_Handler => (($v_version lt v3.1.0)
14599 ? \&filter_early_ea_lb
14600 : undef),
14601 ),
14602 Input_file->new('EastAsianWidth.txt', v3.0.0,
14603 Property => 'East_Asian_Width',
14604 Has_Missings_Defaults => $NOT_IGNORED,
14605 # Early versions had problematic syntax
14606 Each_Line_Handler => (($v_version lt v3.1.0)
14607 ? \&filter_early_ea_lb
14608 : undef),
14609 ),
14610 Input_file->new('CompositionExclusions.txt', v3.0.0,
14611 Property => 'Composition_Exclusion',
14612 ),
14613 Input_file->new('BidiMirroring.txt', v3.0.1,
14614 Property => 'Bidi_Mirroring_Glyph',
14615 ),
37e2e78e
KW
14616 Input_file->new("NormalizationTest.txt", v3.0.1,
14617 Skip => 1,
14618 ),
99870f4d
KW
14619 Input_file->new('CaseFolding.txt', v3.0.1,
14620 Pre_Handler => \&setup_case_folding,
14621 Each_Line_Handler =>
14622 [ ($v_version lt v3.1.0)
14623 ? \&filter_old_style_case_folding
14624 : undef,
14625 \&filter_case_folding_line
14626 ],
99870f4d
KW
14627 ),
14628 Input_file->new('DCoreProperties.txt', v3.1.0,
14629 # 5.2 changed this file
14630 Has_Missings_Defaults => (($v_version ge v5.2.0)
14631 ? $NOT_IGNORED
14632 : $NO_DEFAULTS),
14633 ),
14634 Input_file->new('Scripts.txt', v3.1.0,
14635 Property => 'Script',
14636 Has_Missings_Defaults => $NOT_IGNORED,
14637 ),
14638 Input_file->new('DNormalizationProps.txt', v3.1.0,
14639 Has_Missings_Defaults => $NOT_IGNORED,
14640 Each_Line_Handler => (($v_version lt v4.0.1)
14641 ? \&filter_old_style_normalization_lines
14642 : undef),
14643 ),
14644 Input_file->new('HangulSyllableType.txt', v4.0.0,
14645 Has_Missings_Defaults => $NOT_IGNORED,
14646 Property => 'Hangul_Syllable_Type'),
14647 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
14648 Property => 'Word_Break',
14649 Has_Missings_Defaults => $NOT_IGNORED,
14650 ),
14651 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
14652 Property => 'Grapheme_Cluster_Break',
14653 Has_Missings_Defaults => $NOT_IGNORED,
14654 ),
37e2e78e
KW
14655 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
14656 Handler => \&process_GCB_test,
14657 ),
14658 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
14659 Skip => 1,
14660 ),
14661 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
14662 Skip => 1,
14663 ),
14664 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
14665 Skip => 1,
14666 ),
99870f4d
KW
14667 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
14668 Property => 'Sentence_Break',
14669 Has_Missings_Defaults => $NOT_IGNORED,
14670 ),
14671 Input_file->new('NamedSequences.txt', v4.1.0,
14672 Handler => \&process_NamedSequences
14673 ),
14674 Input_file->new('NameAliases.txt', v5.0.0,
14675 Property => 'Name_Alias',
14676 ),
37e2e78e
KW
14677 Input_file->new("BidiTest.txt", v5.2.0,
14678 Skip => 1,
14679 ),
99870f4d
KW
14680 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
14681 Optional => 1,
14682 Each_Line_Handler => \&filter_unihan_line,
14683 ),
14684 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
14685 Optional => 1,
14686 Each_Line_Handler => \&filter_unihan_line,
14687 ),
14688 Input_file->new('UnihanIRGSources.txt', v5.2.0,
14689 Optional => 1,
14690 Pre_Handler => \&setup_unihan,
14691 Each_Line_Handler => \&filter_unihan_line,
14692 ),
14693 Input_file->new('UnihanNumericValues.txt', v5.2.0,
14694 Optional => 1,
14695 Each_Line_Handler => \&filter_unihan_line,
14696 ),
14697 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
14698 Optional => 1,
14699 Each_Line_Handler => \&filter_unihan_line,
14700 ),
14701 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
14702 Optional => 1,
14703 Each_Line_Handler => \&filter_unihan_line,
14704 ),
14705 Input_file->new('UnihanReadings.txt', v5.2.0,
14706 Optional => 1,
14707 Each_Line_Handler => \&filter_unihan_line,
14708 ),
14709 Input_file->new('UnihanVariants.txt', v5.2.0,
14710 Optional => 1,
14711 Each_Line_Handler => \&filter_unihan_line,
14712 ),
82aed44a
KW
14713 Input_file->new('ScriptExtensions.txt', v6.0.0,
14714 Property => 'Script_Extensions',
14715 Pre_Handler => \&setup_script_extensions,
14716 ),
99870f4d 14717);
99598c8c 14718
99870f4d
KW
14719# End of all the preliminaries.
14720# Do it...
99598c8c 14721
99870f4d
KW
14722if ($compare_versions) {
14723 Carp::my_carp(<<END
14724Warning. \$compare_versions is set. Output is not suitable for production
14725END
14726 );
14727}
99598c8c 14728
99870f4d
KW
14729# Put into %potential_files a list of all the files in the directory structure
14730# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 14731# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
14732my @ignored_files_full_names = map { File::Spec->rel2abs(
14733 internal_file_to_platform($_))
14734 } keys %ignored_files;
14735File::Find::find({
14736 wanted=>sub {
37e2e78e 14737 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 14738 my $full = lc(File::Spec->rel2abs($_));
99870f4d 14739 $potential_files{$full} = 1
37e2e78e 14740 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
14741 return;
14742 }
14743}, File::Spec->curdir());
99598c8c 14744
99870f4d 14745my @mktables_list_output_files;
cdcef19a 14746my $old_start_time = 0;
cf25bb62 14747
3644ba60
KW
14748if (! -e $file_list) {
14749 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
14750 $write_unchanged_files = 1;
14751} elsif ($write_unchanged_files) {
99870f4d
KW
14752 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
14753}
14754else {
14755 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
14756 my $file_handle;
23e33b60 14757 if (! open $file_handle, "<", $file_list) {
3644ba60 14758 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
14759 $glob_list = 1;
14760 }
14761 else {
14762 my @input;
14763
14764 # Read and parse mktables.lst, placing the results from the first part
14765 # into @input, and the second part into @mktables_list_output_files
14766 for my $list ( \@input, \@mktables_list_output_files ) {
14767 while (<$file_handle>) {
14768 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
14769 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
14770 $old_start_time = $1;
14771 }
99870f4d
KW
14772 next if /^ \s* (?: \# .* )? $/x;
14773 last if /^ =+ $/x;
14774 my ( $file ) = split /\t/;
14775 push @$list, $file;
cf25bb62 14776 }
99870f4d
KW
14777 @$list = uniques(@$list);
14778 next;
cf25bb62
JH
14779 }
14780
99870f4d
KW
14781 # Look through all the input files
14782 foreach my $input (@input) {
14783 next if $input eq 'version'; # Already have checked this.
cf25bb62 14784
99870f4d
KW
14785 # Ignore if doesn't exist. The checking about whether we care or
14786 # not is done via the Input_file object.
14787 next if ! file_exists($input);
5beb625e 14788
99870f4d
KW
14789 # The paths are stored with relative names, and with '/' as the
14790 # delimiter; convert to absolute on this machine
517956bf 14791 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
99870f4d 14792 $potential_files{$full} = 1
517956bf 14793 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 14794 }
5beb625e 14795 }
cf25bb62 14796
99870f4d
KW
14797 close $file_handle;
14798}
14799
14800if ($glob_list) {
14801
14802 # Here wants to process all .txt files in the directory structure.
14803 # Convert them to full path names. They are stored in the platform's
14804 # relative style
f86864ac
KW
14805 my @known_files;
14806 foreach my $object (@input_file_objects) {
14807 my $file = $object->file;
14808 next unless defined $file;
14809 push @known_files, File::Spec->rel2abs($file);
14810 }
99870f4d
KW
14811
14812 my @unknown_input_files;
14813 foreach my $file (keys %potential_files) {
517956bf 14814 next if grep { lc($file) eq lc($_) } @known_files;
99870f4d
KW
14815
14816 # Here, the file is unknown to us. Get relative path name
14817 $file = File::Spec->abs2rel($file);
14818 push @unknown_input_files, $file;
14819
14820 # What will happen is we create a data structure for it, and add it to
14821 # the list of input files to process. First get the subdirectories
14822 # into an array
14823 my (undef, $directories, undef) = File::Spec->splitpath($file);
14824 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14825 my @directories = File::Spec->splitdir($directories);
14826
14827 # If the file isn't extracted (meaning none of the directories is the
14828 # extracted one), just add it to the end of the list of inputs.
14829 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 14830 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
14831 }
14832 else {
14833
14834 # Here, the file is extracted. It needs to go ahead of most other
14835 # processing. Search for the first input file that isn't a
14836 # special required property (that is, find one whose first_release
14837 # is non-0), and isn't extracted. Also, the Age property file is
14838 # processed before the extracted ones, just in case
14839 # $compare_versions is set.
14840 for (my $i = 0; $i < @input_file_objects; $i++) {
14841 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
14842 && lc($input_file_objects[$i]->file) ne 'dage.txt'
14843 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 14844 {
99f78760 14845 splice @input_file_objects, $i, 0,
37e2e78e 14846 Input_file->new($file, v0);
99870f4d
KW
14847 last;
14848 }
cf25bb62 14849 }
99870f4d 14850
cf25bb62 14851 }
d2d499f5 14852 }
99870f4d 14853 if (@unknown_input_files) {
23e33b60 14854 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
14855
14856The following files are unknown as to how to handle. Assuming they are
14857typical property files. You'll know by later error messages if it worked or
14858not:
14859END
99f78760 14860 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
14861 }
14862} # End of looking through directory structure for more .txt files.
5beb625e 14863
99870f4d
KW
14864# Create the list of input files from the objects we have defined, plus
14865# version
14866my @input_files = 'version';
14867foreach my $object (@input_file_objects) {
14868 my $file = $object->file;
14869 next if ! defined $file; # Not all objects have files
14870 next if $object->optional && ! -e $file;
14871 push @input_files, $file;
14872}
5beb625e 14873
99870f4d
KW
14874if ( $verbosity >= $VERBOSE ) {
14875 print "Expecting ".scalar( @input_files )." input files. ",
14876 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
14877}
cf25bb62 14878
aeab6150
KW
14879# We set $most_recent to be the most recently changed input file, including
14880# this program itself (done much earlier in this file)
99870f4d 14881foreach my $in (@input_files) {
cdcef19a
KW
14882 next unless -e $in; # Keep going even if missing a file
14883 my $mod_time = (stat $in)[9];
aeab6150 14884 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
14885
14886 # See that the input files have distinct names, to warn someone if they
14887 # are adding a new one
14888 if ($make_list) {
14889 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
14890 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14891 my @directories = File::Spec->splitdir($directories);
14892 my $base = $file =~ s/\.txt$//;
14893 construct_filename($file, 'mutable', \@directories);
cf25bb62 14894 }
99870f4d 14895}
cf25bb62 14896
dff6c046 14897my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 14898 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 14899 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 14900
99870f4d
KW
14901# Now we check to see if any output files are older than youngest, if
14902# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 14903if (! $rebuild) {
99870f4d
KW
14904 foreach my $out (@mktables_list_output_files) {
14905 if ( ! file_exists($out)) {
14906 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 14907 $rebuild = 1;
99870f4d
KW
14908 last;
14909 }
14910 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
14911 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
14912 if ( (stat $out)[9] <= $most_recent ) {
14913 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 14914 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 14915 $rebuild = 1;
99870f4d 14916 last;
cf25bb62 14917 }
cf25bb62 14918 }
99870f4d 14919}
d1d1cd7a 14920if (! $rebuild) {
1265e11f 14921 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
14922 exit(0);
14923}
14924print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 14925
99870f4d
KW
14926# Ready to do the major processing. First create the perl pseudo-property.
14927$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 14928
99870f4d
KW
14929# Process each input file
14930foreach my $file (@input_file_objects) {
14931 $file->run;
d2d499f5
JH
14932}
14933
99870f4d 14934# Finish the table generation.
c4051cc5 14935
99870f4d
KW
14936print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
14937finish_Unicode();
c4051cc5 14938
99870f4d
KW
14939print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
14940compile_perl();
c4051cc5 14941
99870f4d
KW
14942print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
14943add_perl_synonyms();
c4051cc5 14944
99870f4d
KW
14945print "Writing tables\n" if $verbosity >= $PROGRESS;
14946write_all_tables();
c4051cc5 14947
99870f4d
KW
14948# Write mktables.lst
14949if ( $file_list and $make_list ) {
c4051cc5 14950
99870f4d
KW
14951 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
14952 foreach my $file (@input_files, @files_actually_output) {
14953 my (undef, $directories, $file) = File::Spec->splitpath($file);
14954 my @directories = File::Spec->splitdir($directories);
14955 $file = join '/', @directories, $file;
14956 }
14957
14958 my $ofh;
14959 if (! open $ofh,">",$file_list) {
14960 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
14961 return
14962 }
14963 else {
cdcef19a 14964 my $localtime = localtime $start_time;
99870f4d
KW
14965 print $ofh <<"END";
14966#
14967# $file_list -- File list for $0.
97050450 14968#
cdcef19a 14969# Autogenerated starting on $start_time ($localtime)
97050450
YO
14970#
14971# - First section is input files
99870f4d 14972# ($0 itself is not listed but is automatically considered an input)
98dc9551 14973# - Section separator is /^=+\$/
97050450
YO
14974# - Second section is a list of output files.
14975# - Lines matching /^\\s*#/ are treated as comments
14976# which along with blank lines are ignored.
14977#
14978
14979# Input files:
14980
99870f4d
KW
14981END
14982 print $ofh "$_\n" for sort(@input_files);
14983 print $ofh "\n=================================\n# Output files:\n\n";
14984 print $ofh "$_\n" for sort @files_actually_output;
14985 print $ofh "\n# ",scalar(@input_files)," input files\n",
14986 "# ",scalar(@files_actually_output)+1," output files\n\n",
14987 "# End list\n";
14988 close $ofh
14989 or Carp::my_carp("Failed to close $ofh: $!");
14990
14991 print "Filelist has ",scalar(@input_files)," input files and ",
14992 scalar(@files_actually_output)+1," output files\n"
14993 if $verbosity >= $VERBOSE;
14994 }
14995}
14996
14997# Output these warnings unless -q explicitly specified.
c83dffeb 14998if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
99870f4d
KW
14999 if (@unhandled_properties) {
15000 print "\nProperties and tables that unexpectedly have no code points\n";
15001 foreach my $property (sort @unhandled_properties) {
15002 print $property, "\n";
15003 }
15004 }
15005
15006 if (%potential_files) {
15007 print "\nInput files that are not considered:\n";
15008 foreach my $file (sort keys %potential_files) {
15009 print File::Spec->abs2rel($file), "\n";
15010 }
15011 }
15012 print "\nAll done\n" if $verbosity >= $VERBOSE;
15013}
5beb625e 15014exit(0);
cf25bb62 15015
99870f4d 15016# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 15017__DATA__
99870f4d 15018
5beb625e
JH
15019use strict;
15020use warnings;
15021
66fd7fd0
KW
15022# If run outside the normal test suite on an ASCII platform, you can
15023# just create a latin1_to_native() function that just returns its
15024# inputs, because that's the only function used from test.pl
15025require "test.pl";
15026
37e2e78e
KW
15027# Test qr/\X/ and the \p{} regular expression constructs. This file is
15028# constructed by mktables from the tables it generates, so if mktables is
15029# buggy, this won't necessarily catch those bugs. Tests are generated for all
15030# feasible properties; a few aren't currently feasible; see
15031# is_code_point_usable() in mktables for details.
99870f4d
KW
15032
15033# Standard test packages are not used because this manipulates SIG_WARN. It
15034# exits 0 if every non-skipped test succeeded; -1 if any failed.
15035
5beb625e
JH
15036my $Tests = 0;
15037my $Fails = 0;
99870f4d 15038
99870f4d
KW
15039sub Expect($$$$) {
15040 my $expected = shift;
15041 my $ord = shift;
15042 my $regex = shift;
15043 my $warning_type = shift; # Type of warning message, like 'deprecated'
15044 # or empty if none
15045 my $line = (caller)[2];
66fd7fd0 15046 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 15047
99870f4d 15048 # Convert the code point to hex form
23e33b60 15049 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 15050
99870f4d 15051 my @tests = "";
5beb625e 15052
37e2e78e
KW
15053 # The first time through, use all warnings. If the input should generate
15054 # a warning, add another time through with them turned off
99870f4d
KW
15055 push @tests, "no warnings '$warning_type';" if $warning_type;
15056
15057 foreach my $no_warnings (@tests) {
15058
15059 # Store any warning messages instead of outputting them
15060 local $SIG{__WARN__} = $SIG{__WARN__};
15061 my $warning_message;
15062 $SIG{__WARN__} = sub { $warning_message = $_[0] };
15063
15064 $Tests++;
15065
15066 # A string eval is needed because of the 'no warnings'.
15067 # Assumes no parens in the regular expression
15068 my $result = eval "$no_warnings
15069 my \$RegObj = qr($regex);
15070 $string =~ \$RegObj ? 1 : 0";
15071 if (not defined $result) {
15072 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
15073 $Fails++;
15074 }
15075 elsif ($result ^ $expected) {
15076 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
15077 $Fails++;
15078 }
15079 elsif ($warning_message) {
15080 if (! $warning_type || ($warning_type && $no_warnings)) {
15081 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
15082 $Fails++;
15083 }
15084 else {
15085 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
15086 }
15087 }
15088 elsif ($warning_type && ! $no_warnings) {
15089 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
15090 $Fails++;
15091 }
15092 else {
15093 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
15094 }
5beb625e 15095 }
99870f4d 15096 return;
5beb625e 15097}
d73e5302 15098
99870f4d
KW
15099sub Error($) {
15100 my $regex = shift;
5beb625e 15101 $Tests++;
99870f4d 15102 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 15103 $Fails++;
99870f4d
KW
15104 my $line = (caller)[2];
15105 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 15106 }
99870f4d
KW
15107 else {
15108 my $line = (caller)[2];
15109 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
15110 }
15111 return;
5beb625e
JH
15112}
15113
37e2e78e
KW
15114# GCBTest.txt character that separates grapheme clusters
15115my $breakable_utf8 = my $breakable = chr(0xF7);
15116utf8::upgrade($breakable_utf8);
15117
15118# GCBTest.txt character that indicates that the adjoining code points are part
15119# of the same grapheme cluster
15120my $nobreak_utf8 = my $nobreak = chr(0xD7);
15121utf8::upgrade($nobreak_utf8);
15122
15123sub Test_X($) {
15124 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
15125 # Each such line is a sequence of code points given by their hex numbers,
15126 # separated by the two characters defined just before this subroutine that
15127 # indicate that either there can or cannot be a break between the adjacent
15128 # code points. If there isn't a break, that means the sequence forms an
15129 # extended grapheme cluster, which means that \X should match the whole
15130 # thing. If there is a break, \X should stop there. This is all
15131 # converted by this routine into a match:
15132 # $string =~ /(\X)/,
15133 # Each \X should match the next cluster; and that is what is checked.
15134
15135 my $template = shift;
15136
15137 my $line = (caller)[2];
15138
15139 # The line contains characters above the ASCII range, but in Latin1. It
15140 # may or may not be in utf8, and if it is, it may or may not know it. So,
15141 # convert these characters to 8 bits. If knows is in utf8, simply
15142 # downgrade.
15143 if (utf8::is_utf8($template)) {
15144 utf8::downgrade($template);
15145 } else {
15146
15147 # Otherwise, if it is in utf8, but doesn't know it, the next lines
15148 # convert the two problematic characters to their 8-bit equivalents.
15149 # If it isn't in utf8, they don't harm anything.
15150 use bytes;
15151 $template =~ s/$nobreak_utf8/$nobreak/g;
15152 $template =~ s/$breakable_utf8/$breakable/g;
15153 }
15154
15155 # Get rid of the leading and trailing breakables
15156 $template =~ s/^ \s* $breakable \s* //x;
15157 $template =~ s/ \s* $breakable \s* $ //x;
15158
15159 # And no-breaks become just a space.
15160 $template =~ s/ \s* $nobreak \s* / /xg;
15161
15162 # Split the input into segments that are breakable between them.
15163 my @segments = split /\s*$breakable\s*/, $template;
15164
15165 my $string = "";
15166 my $display_string = "";
15167 my @should_match;
15168 my @should_display;
15169
15170 # Convert the code point sequence in each segment into a Perl string of
15171 # characters
15172 foreach my $segment (@segments) {
15173 my @code_points = split /\s+/, $segment;
15174 my $this_string = "";
15175 my $this_display = "";
15176 foreach my $code_point (@code_points) {
66fd7fd0 15177 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
15178 $this_display .= "\\x{$code_point}";
15179 }
15180
15181 # The next cluster should match the string in this segment.
15182 push @should_match, $this_string;
15183 push @should_display, $this_display;
15184 $string .= $this_string;
15185 $display_string .= $this_display;
15186 }
15187
15188 # If a string can be represented in both non-ut8 and utf8, test both cases
15189 UPGRADE:
15190 for my $to_upgrade (0 .. 1) {
678f13d5 15191
37e2e78e
KW
15192 if ($to_upgrade) {
15193
15194 # If already in utf8, would just be a repeat
15195 next UPGRADE if utf8::is_utf8($string);
15196
15197 utf8::upgrade($string);
15198 }
15199
15200 # Finally, do the \X match.
15201 my @matches = $string =~ /(\X)/g;
15202
15203 # Look through each matched cluster to verify that it matches what we
15204 # expect.
15205 my $min = (@matches < @should_match) ? @matches : @should_match;
15206 for my $i (0 .. $min - 1) {
15207 $Tests++;
15208 if ($matches[$i] eq $should_match[$i]) {
15209 print "ok $Tests - ";
15210 if ($i == 0) {
15211 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
15212 } else {
15213 print "And \\X #", $i + 1,
15214 }
15215 print " correctly matched $should_display[$i]; line $line\n";
15216 } else {
15217 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
15218 unpack("U*", $matches[$i]));
15219 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
15220 $i + 1,
15221 " should have matched $should_display[$i]",
15222 " but instead matched $matches[$i]",
15223 ". Abandoning rest of line $line\n";
15224 next UPGRADE;
15225 }
15226 }
15227
15228 # And the number of matches should equal the number of expected matches.
15229 $Tests++;
15230 if (@matches == @should_match) {
15231 print "ok $Tests - Nothing was left over; line $line\n";
15232 } else {
15233 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
15234 }
15235 }
15236
15237 return;
15238}
15239
99870f4d 15240sub Finished() {
f86864ac 15241 print "1..$Tests\n";
99870f4d 15242 exit($Fails ? -1 : 0);
5beb625e 15243}
99870f4d
KW
15244
15245Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 15246Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
15247Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
15248Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 15249Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726