This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
perlunicode, perluniprops: \p{Title} is Perl extension
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a 19my $start_time;
98dc9551 20BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
21 # get it as close as possible
22 $start_time= time;
23}
24
25
23e33b60 26require 5.010_001;
d73e5302 27use strict;
99870f4d 28use warnings;
cf25bb62 29use Carp;
bd9ebcfd 30use Config;
99870f4d
KW
31use File::Find;
32use File::Path;
d07a55ed 33use File::Spec;
99870f4d
KW
34use Text::Tabs;
35
36sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 37my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d
KW
38
39##########################################################################
40#
41# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42# from the Unicode database files (lib/unicore/.../*.txt), It also generates
43# a pod file and a .t file
44#
45# The structure of this file is:
46# First these introductory comments; then
47# code needed for everywhere, such as debugging stuff; then
48# code to handle input parameters; then
49# data structures likely to be of external interest (some of which depend on
50# the input parameters, so follows them; then
51# more data structures and subroutine and package (class) definitions; then
52# the small actual loop to process the input files and finish up; then
53# a __DATA__ section, for the .t tests
54#
5f7264c7 55# This program works on all releases of Unicode through at least 6.0. The
99870f4d
KW
56# outputs have been scrutinized most intently for release 5.1. The others
57# have been checked for somewhat more than just sanity. It can handle all
58# existing Unicode character properties in those releases.
59#
99870f4d
KW
60# This program is mostly about Unicode character (or code point) properties.
61# A property describes some attribute or quality of a code point, like if it
62# is lowercase or not, its name, what version of Unicode it was first defined
63# in, or what its uppercase equivalent is. Unicode deals with these disparate
64# possibilities by making all properties into mappings from each code point
65# into some corresponding value. In the case of it being lowercase or not,
66# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67# property maps each Unicode code point to a single value, called a "property
68# value". (Hence each Unicode property is a true mathematical function with
69# exactly one value per code point.)
70#
71# When using a property in a regular expression, what is desired isn't the
72# mapping of the code point to its property's value, but the reverse (or the
73# mathematical "inverse relation"): starting with the property value, "Does a
74# code point map to it?" These are written in a "compound" form:
75# \p{property=value}, e.g., \p{category=punctuation}. This program generates
76# files containing the lists of code points that map to each such regular
77# expression property value, one file per list
78#
79# There is also a single form shortcut that Perl adds for many of the commonly
80# used properties. This happens for all binary properties, plus script,
81# general_category, and block properties.
82#
83# Thus the outputs of this program are files. There are map files, mostly in
84# the 'To' directory; and there are list files for use in regular expression
85# matching, all in subdirectories of the 'lib' directory, with each
86# subdirectory being named for the property that the lists in it are for.
87# Bookkeeping, test, and documentation files are also generated.
88
89my $matches_directory = 'lib'; # Where match (\p{}) files go.
90my $map_directory = 'To'; # Where map files go.
91
92# DATA STRUCTURES
93#
94# The major data structures of this program are Property, of course, but also
95# Table. There are two kinds of tables, very similar to each other.
96# "Match_Table" is the data structure giving the list of code points that have
97# a particular property value, mentioned above. There is also a "Map_Table"
98# data structure which gives the property's mapping from code point to value.
99# There are two structures because the match tables need to be combined in
100# various ways, such as constructing unions, intersections, complements, etc.,
101# and the map ones don't. And there would be problems, perhaps subtle, if
102# a map table were inadvertently operated on in some of those ways.
103# The use of separate classes with operations defined on one but not the other
104# prevents accidentally confusing the two.
105#
106# At the heart of each table's data structure is a "Range_List", which is just
107# an ordered list of "Ranges", plus ancillary information, and methods to
108# operate on them. A Range is a compact way to store property information.
109# Each range has a starting code point, an ending code point, and a value that
110# is meant to apply to all the code points between the two end points,
111# inclusive. For a map table, this value is the property value for those
112# code points. Two such ranges could be written like this:
113# 0x41 .. 0x5A, 'Upper',
114# 0x61 .. 0x7A, 'Lower'
115#
116# Each range also has a type used as a convenience to classify the values.
117# Most ranges in this program will be Type 0, or normal, but there are some
118# ranges that have a non-zero type. These are used only in map tables, and
119# are for mappings that don't fit into the normal scheme of things. Mappings
120# that require a hash entry to communicate with utf8.c are one example;
121# another example is mappings for charnames.pm to use which indicate a name
122# that is algorithmically determinable from its code point (and vice-versa).
123# These are used to significantly compact these tables, instead of listing
124# each one of the tens of thousands individually.
125#
126# In a match table, the value of a range is irrelevant (and hence the type as
127# well, which will always be 0), and arbitrarily set to the null string.
128# Using the example above, there would be two match tables for those two
129# entries, one named Upper would contain the 0x41..0x5A range, and the other
130# named Lower would contain 0x61..0x7A.
131#
132# Actually, there are two types of range lists, "Range_Map" is the one
133# associated with map tables, and "Range_List" with match tables.
134# Again, this is so that methods can be defined on one and not the other so as
135# to prevent operating on them in incorrect ways.
136#
137# Eventually, most tables are written out to files to be read by utf8_heavy.pl
138# in the perl core. All tables could in theory be written, but some are
139# suppressed because there is no current practical use for them. It is easy
140# to change which get written by changing various lists that are near the top
141# of the actual code in this file. The table data structures contain enough
142# ancillary information to allow them to be treated as separate entities for
143# writing, such as the path to each one's file. There is a heading in each
144# map table that gives the format of its entries, and what the map is for all
145# the code points missing from it. (This allows tables to be more compact.)
678f13d5 146#
99870f4d
KW
147# The Property data structure contains one or more tables. All properties
148# contain a map table (except the $perl property which is a
149# pseudo-property containing only match tables), and any properties that
150# are usable in regular expression matches also contain various matching
151# tables, one for each value the property can have. A binary property can
152# have two values, True and False (or Y and N, which are preferred by Unicode
153# terminology). Thus each of these properties will have a map table that
154# takes every code point and maps it to Y or N (but having ranges cuts the
155# number of entries in that table way down), and two match tables, one
156# which has a list of all the code points that map to Y, and one for all the
157# code points that map to N. (For each of these, a third table is also
158# generated for the pseudo Perl property. It contains the identical code
159# points as the Y table, but can be written, not in the compound form, but in
160# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
161# properties have several possible values, some have many, and properties like
162# Name have a different value for every named code point. Those will not,
163# unless the controlling lists are changed, have their match tables written
164# out. But all the ones which can be used in regular expression \p{} and \P{}
165# constructs will. Generally a property will have either its map table or its
166# match tables written but not both. Again, what gets written is controlled
dc85bd38
KW
167# by lists which can easily be changed. Properties have a 'Type', like
168# binary, or string, or enum depending on how many match tables there are and
169# the content of the maps. This 'Type' is different than a range 'Type', so
170# don't get confused by the two concepts having the same name.
678f13d5 171#
99870f4d
KW
172# For information about the Unicode properties, see Unicode's UAX44 document:
173
174my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
175
176# As stated earlier, this program will work on any release of Unicode so far.
177# Most obvious problems in earlier data have NOT been corrected except when
178# necessary to make Perl or this program work reasonably. For example, no
179# folding information was given in early releases, so this program uses the
180# substitute of lower case, just so that a regular expression with the /i
181# option will do something that actually gives the right results in many
182# cases. There are also a couple other corrections for version 1.1.5,
183# commented at the point they are made. As an example of corrections that
184# weren't made (but could be) is this statement from DerivedAge.txt: "The
185# supplementary private use code points and the non-character code points were
186# assigned in version 2.0, but not specifically listed in the UCD until
187# versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0)
188# More information on Unicode version glitches is further down in these
189# introductory comments.
190#
5f7264c7
KW
191# This program works on all non-provisional properties as of 6.0, though the
192# files for some are suppressed from apparent lack of demand for them. You
193# can change which are output by changing lists in this program.
678f13d5 194#
dc85bd38 195# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
196# loose matchings rules (from Unicode TR18):
197#
198# The recommended names for UCD properties and property values are in
199# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
200# [PropValue]. There are both abbreviated names and longer, more
201# descriptive names. It is strongly recommended that both names be
202# recognized, and that loose matching of property names be used,
203# whereby the case distinctions, whitespace, hyphens, and underbar
204# are ignored.
205# The program still allows Fuzzy to override its determination of if loose
206# matching should be used, but it isn't currently used, as it is no longer
207# needed; the calculations it makes are good enough.
678f13d5 208#
99870f4d
KW
209# SUMMARY OF HOW IT WORKS:
210#
211# Process arguments
212#
213# A list is constructed containing each input file that is to be processed
214#
215# Each file on the list is processed in a loop, using the associated handler
216# code for each:
217# The PropertyAliases.txt and PropValueAliases.txt files are processed
218# first. These files name the properties and property values.
219# Objects are created of all the property and property value names
220# that the rest of the input should expect, including all synonyms.
221# The other input files give mappings from properties to property
222# values. That is, they list code points and say what the mapping
223# is under the given property. Some files give the mappings for
224# just one property; and some for many. This program goes through
225# each file and populates the properties from them. Some properties
226# are listed in more than one file, and Unicode has set up a
227# precedence as to which has priority if there is a conflict. Thus
228# the order of processing matters, and this program handles the
229# conflict possibility by processing the overriding input files
230# last, so that if necessary they replace earlier values.
231# After this is all done, the program creates the property mappings not
232# furnished by Unicode, but derivable from what it does give.
233# The tables of code points that match each property value in each
234# property that is accessible by regular expressions are created.
235# The Perl-defined properties are created and populated. Many of these
236# require data determined from the earlier steps
237# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 238# and Unicode are reconciled and warned about.
99870f4d
KW
239# All the properties are written to files
240# Any other files are written, and final warnings issued.
678f13d5 241#
99870f4d
KW
242# For clarity, a number of operators have been overloaded to work on tables:
243# ~ means invert (take all characters not in the set). The more
244# conventional '!' is not used because of the possibility of confusing
245# it with the actual boolean operation.
246# + means union
247# - means subtraction
248# & means intersection
249# The precedence of these is the order listed. Parentheses should be
250# copiously used. These are not a general scheme. The operations aren't
251# defined for a number of things, deliberately, to avoid getting into trouble.
252# Operations are done on references and affect the underlying structures, so
253# that the copy constructors for them have been overloaded to not return a new
254# clone, but the input object itself.
678f13d5 255#
99870f4d
KW
256# The bool operator is deliberately not overloaded to avoid confusion with
257# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
258#
259# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
260#
261# This program needs to be able to run under miniperl. Therefore, it uses a
262# minimum of other modules, and hence implements some things itself that could
263# be gotten from CPAN
264#
265# This program uses inputs published by the Unicode Consortium. These can
266# change incompatibly between releases without the Perl maintainers realizing
267# it. Therefore this program is now designed to try to flag these. It looks
268# at the directories where the inputs are, and flags any unrecognized files.
269# It keeps track of all the properties in the files it handles, and flags any
270# that it doesn't know how to handle. It also flags any input lines that
271# don't match the expected syntax, among other checks.
272#
273# It is also designed so if a new input file matches one of the known
274# templates, one hopefully just needs to add it to a list to have it
275# processed.
276#
277# As mentioned earlier, some properties are given in more than one file. In
278# particular, the files in the extracted directory are supposedly just
279# reformattings of the others. But they contain information not easily
280# derivable from the other files, including results for Unihan, which this
281# program doesn't ordinarily look at, and for unassigned code points. They
282# also have historically had errors or been incomplete. In an attempt to
283# create the best possible data, this program thus processes them first to
284# glean information missing from the other files; then processes those other
285# files to override any errors in the extracted ones. Much of the design was
286# driven by this need to store things and then possibly override them.
287#
288# It tries to keep fatal errors to a minimum, to generate something usable for
289# testing purposes. It always looks for files that could be inputs, and will
290# warn about any that it doesn't know how to handle (the -q option suppresses
291# the warning).
99870f4d
KW
292#
293# Why have files written out for binary 'N' matches?
294# For binary properties, if you know the mapping for either Y or N; the
678f13d5
KW
295# other is trivial to construct, so could be done at Perl run-time by just
296# complementing the result, instead of having a file for it. That is, if
297# someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
298# not need a file. The problem is communicating to Perl that a given
299# property is binary. Perl can't figure it out from looking at the N (or
300# No), as some non-binary properties have these as property values. So
301# rather than inventing a way to communicate this info back to the core,
302# which would have required changes there as well, it was simpler just to
303# add the extra tables.
304#
305# Why is there more than one type of range?
306# This simplified things. There are some very specialized code points that
307# have to be handled specially for output, such as Hangul syllable names.
308# By creating a range type (done late in the development process), it
309# allowed this to be stored with the range, and overridden by other input.
310# Originally these were stored in another data structure, and it became a
311# mess trying to decide if a second file that was for the same property was
312# overriding the earlier one or not.
313#
314# Why are there two kinds of tables, match and map?
315# (And there is a base class shared by the two as well.) As stated above,
316# they actually are for different things. Development proceeded much more
317# smoothly when I (khw) realized the distinction. Map tables are used to
318# give the property value for every code point (actually every code point
319# that doesn't map to a default value). Match tables are used for regular
320# expression matches, and are essentially the inverse mapping. Separating
321# the two allows more specialized methods, and error checks so that one
322# can't just take the intersection of two map tables, for example, as that
323# is nonsensical.
99870f4d
KW
324#
325# There are no match tables generated for matches of the null string. These
c1739a4a 326# would look like qr/\p{JSN=}/ currently without modifying the regex code.
678f13d5
KW
327# Perhaps something like them could be added if necessary. The JSN does have
328# a real code point U+110B that maps to the null string, but it is a
329# contributory property, and therefore not output by default. And it's easily
330# handled so far by making the null string the default where it is a
331# possibility.
99870f4d 332#
23e33b60
KW
333# DEBUGGING
334#
678f13d5
KW
335# This program is written so it will run under miniperl. Occasionally changes
336# will cause an error where the backtrace doesn't work well under miniperl.
337# To diagnose the problem, you can instead run it under regular perl, if you
338# have one compiled.
339#
340# There is a good trace facility. To enable it, first sub DEBUG must be set
341# to return true. Then a line like
342#
343# local $to_trace = 1 if main::DEBUG;
344#
345# can be added to enable tracing in its lexical scope or until you insert
346# another line:
347#
348# local $to_trace = 0 if main::DEBUG;
349#
350# then use a line like "trace $a, @b, %c, ...;
351#
352# Some of the more complex subroutines already have trace statements in them.
353# Permanent trace statements should be like:
354#
355# trace ... if main::DEBUG && $to_trace;
356#
357# If there is just one or a few files that you're debugging, you can easily
358# cause most everything else to be skipped. Change the line
359#
360# my $debug_skip = 0;
361#
362# to 1, and every file whose object is in @input_file_objects and doesn't have
363# a, 'non_skip => 1,' in its constructor will be skipped.
364#
b4a0206c 365# To compare the output tables, it may be useful to specify the -annotate
c4019d52
KW
366# flag. This causes the tables to expand so there is one entry for each
367# non-algorithmically named code point giving, currently its name, and its
368# graphic representation if printable (and you have a font that knows about
369# it). This makes it easier to see what the particular code points are in
370# each output table. The tables are usable, but because they don't have
371# ranges (for the most part), a Perl using them will run slower. Non-named
372# code points are annotated with a description of their status, and contiguous
373# ones with the same description will be output as a range rather than
374# individually. Algorithmically named characters are also output as ranges,
375# except when there are just a few contiguous ones.
376#
99870f4d
KW
377# FUTURE ISSUES
378#
379# The program would break if Unicode were to change its names so that
380# interior white space, underscores, or dashes differences were significant
381# within property and property value names.
382#
383# It might be easier to use the xml versions of the UCD if this program ever
384# would need heavy revision, and the ability to handle old versions was not
385# required.
386#
387# There is the potential for name collisions, in that Perl has chosen names
388# that Unicode could decide it also likes. There have been such collisions in
389# the past, with mostly Perl deciding to adopt the Unicode definition of the
390# name. However in the 5.2 Unicode beta testing, there were a number of such
391# collisions, which were withdrawn before the final release, because of Perl's
392# and other's protests. These all involved new properties which began with
393# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
394# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
395# Unicode document, so they are unlikely to be used by Unicode for another
396# purpose. However, they might try something beginning with 'In', or use any
397# of the other Perl-defined properties. This program will warn you of name
398# collisions, and refuse to generate tables with them, but manual intervention
399# will be required in this event. One scheme that could be implemented, if
400# necessary, would be to have this program generate another file, or add a
401# field to mktables.lst that gives the date of first definition of a property.
402# Each new release of Unicode would use that file as a basis for the next
403# iteration. And the Perl synonym addition code could sort based on the age
404# of the property, so older properties get priority, and newer ones that clash
405# would be refused; hence existing code would not be impacted, and some other
406# synonym would have to be used for the new property. This is ugly, and
407# manual intervention would certainly be easier to do in the short run; lets
408# hope it never comes to this.
678f13d5 409#
99870f4d
KW
410# A NOTE ON UNIHAN
411#
412# This program can generate tables from the Unihan database. But it doesn't
413# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
414# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
415# database was split into 8 different files, all beginning with the letters
416# 'Unihan'. This program will read those file(s) if present, but it needs to
417# know which of the many properties in the file(s) should have tables created
418# for them. It will create tables for any properties listed in
419# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
420# @cjk_properties array and the @cjk_property_values array. Thus, if a
421# property you want is not in those files of the release you are building
422# against, you must add it to those two arrays. Starting in 4.0, the
423# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
424# is present in the directory, a table will be generated for that property.
425# In 5.2, several more properties were added. For your convenience, the two
5f7264c7 426# arrays are initialized with all the 6.0 listed properties that are also in
99870f4d
KW
427# earlier releases. But these are commented out. You can just uncomment the
428# ones you want, or use them as a template for adding entries for other
429# properties.
430#
431# You may need to adjust the entries to suit your purposes. setup_unihan(),
432# and filter_unihan_line() are the functions where this is done. This program
433# already does some adjusting to make the lines look more like the rest of the
434# Unicode DB; You can see what that is in filter_unihan_line()
435#
436# There is a bug in the 3.2 data file in which some values for the
437# kPrimaryNumeric property have commas and an unexpected comment. A filter
438# could be added for these; or for a particular installation, the Unihan.txt
439# file could be edited to fix them.
99870f4d 440#
678f13d5
KW
441# HOW TO ADD A FILE TO BE PROCESSED
442#
443# A new file from Unicode needs to have an object constructed for it in
444# @input_file_objects, probably at the end or at the end of the extracted
445# ones. The program should warn you if its name will clash with others on
446# restrictive file systems, like DOS. If so, figure out a better name, and
447# add lines to the README.perl file giving that. If the file is a character
448# property, it should be in the format that Unicode has by default
449# standardized for such files for the more recently introduced ones.
450# If so, the Input_file constructor for @input_file_objects can just be the
451# file name and release it first appeared in. If not, then it should be
452# possible to construct an each_line_handler() to massage the line into the
453# standardized form.
454#
455# For non-character properties, more code will be needed. You can look at
456# the existing entries for clues.
457#
458# UNICODE VERSIONS NOTES
459#
460# The Unicode UCD has had a number of errors in it over the versions. And
461# these remain, by policy, in the standard for that version. Therefore it is
462# risky to correct them, because code may be expecting the error. So this
463# program doesn't generally make changes, unless the error breaks the Perl
464# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
465# for U+1105, which causes real problems for the algorithms for Jamo
466# calculations, so it is changed here.
467#
468# But it isn't so clear cut as to what to do about concepts that are
469# introduced in a later release; should they extend back to earlier releases
470# where the concept just didn't exist? It was easier to do this than to not,
471# so that's what was done. For example, the default value for code points not
472# in the files for various properties was probably undefined until changed by
473# some version. No_Block for blocks is such an example. This program will
474# assign No_Block even in Unicode versions that didn't have it. This has the
475# benefit that code being written doesn't have to special case earlier
476# versions; and the detriment that it doesn't match the Standard precisely for
477# the affected versions.
478#
479# Here are some observations about some of the issues in early versions:
480#
6426c51b 481# The number of code points in \p{alpha} halved in 2.1.9. It turns out that
678f13d5
KW
482# the reason is that the CJK block starting at 4E00 was removed from PropList,
483# and was not put back in until 3.1.0
484#
485# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
486# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
487# reason is that 3.2 introduced U+205F=medium math space, which was not
488# classed as white space, but Perl figured out that it should have been. 4.0
489# reclassified it correctly.
490#
491# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
492# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
493# was left with no code points, as all the ones that mapped to 202 stayed
494# mapped to 202. Thus if your program used the numeric name for the class,
495# it would not have been affected, but if it used the mnemonic, it would have
496# been.
497#
498# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
499# points which eventually came to have this script property value, instead
500# mapped to "Unknown". But in the next release all these code points were
501# moved to \p{sc=common} instead.
99870f4d
KW
502#
503# The default for missing code points for BidiClass is complicated. Starting
504# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
505# tries to do the best it can for earlier releases. It is done in
506# process_PropertyAliases()
507#
508##############################################################################
509
510my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
511 # and errors
512my $MAX_LINE_WIDTH = 78;
513
514# Debugging aid to skip most files so as to not be distracted by them when
515# concentrating on the ones being debugged. Add
516# non_skip => 1,
517# to the constructor for those files you want processed when you set this.
518# Files with a first version number of 0 are special: they are always
519# processed regardless of the state of this flag.
520my $debug_skip = 0;
521
522# Set to 1 to enable tracing.
523our $to_trace = 0;
524
525{ # Closure for trace: debugging aid
526 my $print_caller = 1; # ? Include calling subroutine name
527 my $main_with_colon = 'main::';
528 my $main_colon_length = length($main_with_colon);
529
530 sub trace {
531 return unless $to_trace; # Do nothing if global flag not set
532
533 my @input = @_;
534
535 local $DB::trace = 0;
536 $DB::trace = 0; # Quiet 'used only once' message
537
538 my $line_number;
539
540 # Loop looking up the stack to get the first non-trace caller
541 my $caller_line;
542 my $caller_name;
543 my $i = 0;
544 do {
545 $line_number = $caller_line;
546 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
547 $caller = $main_with_colon unless defined $caller;
548
549 $caller_name = $caller;
550
551 # get rid of pkg
552 $caller_name =~ s/.*:://;
553 if (substr($caller_name, 0, $main_colon_length)
554 eq $main_with_colon)
555 {
556 $caller_name = substr($caller_name, $main_colon_length);
557 }
558
559 } until ($caller_name ne 'trace');
560
561 # If the stack was empty, we were called from the top level
562 $caller_name = 'main' if ($caller_name eq ""
563 || $caller_name eq 'trace');
564
565 my $output = "";
566 foreach my $string (@input) {
567 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
568 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
569 $output .= simple_dumper($string);
570 }
571 else {
572 $string = "$string" if ref $string;
573 $string = $UNDEF unless defined $string;
574 chomp $string;
575 $string = '""' if $string eq "";
576 $output .= " " if $output ne ""
577 && $string ne ""
578 && substr($output, -1, 1) ne " "
579 && substr($string, 0, 1) ne " ";
580 $output .= $string;
581 }
582 }
583
99f78760
KW
584 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
585 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
586 print STDERR $output, "\n";
587 return;
588 }
589}
590
591# This is for a rarely used development feature that allows you to compare two
592# versions of the Unicode standard without having to deal with changes caused
1c2e8cca 593# by the code points introduced in the later version. Change the 0 to a SINGLE
99870f4d
KW
594# dotted Unicode release number (e.g. 2.1). Only code points introduced in
595# that release and earlier will be used; later ones are thrown away. You use
596# the version number of the earliest one you want to compare; then run this
597# program on directory structures containing each release, and compare the
598# outputs. These outputs will therefore include only the code points common
599# to both releases, and you can see the changes caused just by the underlying
600# release semantic changes. For versions earlier than 3.2, you must copy a
601# version of DAge.txt into the directory.
602my $string_compare_versions = DEBUG && 0; # e.g., v2.1;
603my $compare_versions = DEBUG
604 && $string_compare_versions
605 && pack "C*", split /\./, $string_compare_versions;
606
607sub uniques {
608 # Returns non-duplicated input values. From "Perl Best Practices:
609 # Encapsulated Cleverness". p. 455 in first edition.
610
611 my %seen;
0e407844
NC
612 # Arguably this breaks encapsulation, if the goal is to permit multiple
613 # distinct objects to stringify to the same value, and be interchangeable.
614 # However, for this program, no two objects stringify identically, and all
615 # lists passed to this function are either objects or strings. So this
616 # doesn't affect correctness, but it does give a couple of percent speedup.
617 no overloading;
99870f4d
KW
618 return grep { ! $seen{$_}++ } @_;
619}
620
621$0 = File::Spec->canonpath($0);
622
623my $make_test_script = 0; # ? Should we output a test script
624my $write_unchanged_files = 0; # ? Should we update the output files even if
625 # we don't think they have changed
626my $use_directory = ""; # ? Should we chdir somewhere.
627my $pod_directory; # input directory to store the pod file.
628my $pod_file = 'perluniprops';
629my $t_path; # Path to the .t test file
630my $file_list = 'mktables.lst'; # File to store input and output file names.
631 # This is used to speed up the build, by not
632 # executing the main body of the program if
633 # nothing on the list has changed since the
634 # previous build
635my $make_list = 1; # ? Should we write $file_list. Set to always
636 # make a list so that when the pumpking is
637 # preparing a release, s/he won't have to do
638 # special things
639my $glob_list = 0; # ? Should we try to include unknown .txt files
640 # in the input.
bd9ebcfd
KW
641my $output_range_counts = $debugging_build; # ? Should we include the number
642 # of code points in ranges in
643 # the output
558712cf 644my $annotate = 0; # ? Should character names be in the output
9ef2b94f 645
99870f4d
KW
646# Verbosity levels; 0 is quiet
647my $NORMAL_VERBOSITY = 1;
648my $PROGRESS = 2;
649my $VERBOSE = 3;
650
651my $verbosity = $NORMAL_VERBOSITY;
652
653# Process arguments
654while (@ARGV) {
cf25bb62
JH
655 my $arg = shift @ARGV;
656 if ($arg eq '-v') {
99870f4d
KW
657 $verbosity = $VERBOSE;
658 }
659 elsif ($arg eq '-p') {
660 $verbosity = $PROGRESS;
661 $| = 1; # Flush buffers as we go.
662 }
663 elsif ($arg eq '-q') {
664 $verbosity = 0;
665 }
666 elsif ($arg eq '-w') {
667 $write_unchanged_files = 1; # update the files even if havent changed
668 }
669 elsif ($arg eq '-check') {
6ae7e459
YO
670 my $this = shift @ARGV;
671 my $ok = shift @ARGV;
672 if ($this ne $ok) {
673 print "Skipping as check params are not the same.\n";
674 exit(0);
675 }
00a8df5c 676 }
99870f4d
KW
677 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
678 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
679 }
3df51b85
KW
680 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
681 {
99870f4d 682 $make_test_script = 1;
99870f4d
KW
683 }
684 elsif ($arg eq '-makelist') {
685 $make_list = 1;
686 }
687 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
688 -d $use_directory or croak "Unknown directory '$use_directory'";
689 }
690 elsif ($arg eq '-L') {
691
692 # Existence not tested until have chdir'd
693 $file_list = shift;
694 }
695 elsif ($arg eq '-globlist') {
696 $glob_list = 1;
697 }
698 elsif ($arg eq '-c') {
699 $output_range_counts = ! $output_range_counts
700 }
b4a0206c 701 elsif ($arg eq '-annotate') {
558712cf 702 $annotate = 1;
bd9ebcfd
KW
703 $debugging_build = 1;
704 $output_range_counts = 1;
9ef2b94f 705 }
99870f4d
KW
706 else {
707 my $with_c = 'with';
708 $with_c .= 'out' if $output_range_counts; # Complements the state
709 croak <<END;
710usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
711 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
712 [-check A B ]
713 -c : Output comments $with_c number of code points in ranges
714 -q : Quiet Mode: Only output serious warnings.
715 -p : Set verbosity level to normal plus show progress.
716 -v : Set Verbosity level high: Show progress and non-serious
717 warnings
718 -w : Write files regardless
719 -C dir : Change to this directory before proceeding. All relative paths
720 except those specified by the -P and -T options will be done
721 with respect to this directory.
722 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 723 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
724 -L filelist : Use alternate 'filelist' instead of standard one
725 -globlist : Take as input all non-Test *.txt files in current and sub
726 directories
3df51b85
KW
727 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
728 overrides -T
99870f4d 729 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 730 -annotate : Output an annotation for each character in the table files;
c4019d52
KW
731 useful for debugging mktables, looking at diffs; but is slow,
732 memory intensive; resulting tables are usable but slow and
733 very large.
99870f4d
KW
734 -check A B : Executes $0 only if A and B are the same
735END
736 }
737}
738
739# Stores the most-recently changed file. If none have changed, can skip the
740# build
aeab6150 741my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
742
743# Change directories now, because need to read 'version' early.
744if ($use_directory) {
3df51b85 745 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
746 $pod_directory = File::Spec->rel2abs($pod_directory);
747 }
3df51b85 748 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 749 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 750 }
99870f4d 751 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 752 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 753 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 754 }
3df51b85 755 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 756 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 757 }
00a8df5c
YO
758}
759
99870f4d
KW
760# Get Unicode version into regular and v-string. This is done now because
761# various tables below get populated based on it. These tables are populated
762# here to be near the top of the file, and so easily seeable by those needing
763# to modify things.
764open my $VERSION, "<", "version"
765 or croak "$0: can't open required file 'version': $!\n";
766my $string_version = <$VERSION>;
767close $VERSION;
768chomp $string_version;
769my $v_version = pack "C*", split /\./, $string_version; # v string
770
771# The following are the complete names of properties with property values that
772# are known to not match any code points in some versions of Unicode, but that
773# may change in the future so they should be matchable, hence an empty file is
774# generated for them.
775my @tables_that_may_be_empty = (
776 'Joining_Type=Left_Joining',
777 );
778push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
779push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
780push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
781 if $v_version ge v4.1.0;
82aed44a
KW
782push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
783 if $v_version ge v6.0.0;
99870f4d
KW
784
785# The lists below are hashes, so the key is the item in the list, and the
786# value is the reason why it is in the list. This makes generation of
787# documentation easier.
788
789my %why_suppressed; # No file generated for these.
790
791# Files aren't generated for empty extraneous properties. This is arguable.
792# Extraneous properties generally come about because a property is no longer
793# used in a newer version of Unicode. If we generated a file without code
794# points, programs that used to work on that property will still execute
795# without errors. It just won't ever match (or will always match, with \P{}).
796# This means that the logic is now likely wrong. I (khw) think its better to
797# find this out by getting an error message. Just move them to the table
798# above to change this behavior
799my %why_suppress_if_empty_warn_if_not = (
800
801 # It is the only property that has ever officially been removed from the
802 # Standard. The database never contained any code points for it.
803 'Special_Case_Condition' => 'Obsolete',
804
805 # Apparently never official, but there were code points in some versions of
806 # old-style PropList.txt
807 'Non_Break' => 'Obsolete',
808);
809
810# These would normally go in the warn table just above, but they were changed
811# a long time before this program was written, so warnings about them are
812# moot.
813if ($v_version gt v3.2.0) {
814 push @tables_that_may_be_empty,
815 'Canonical_Combining_Class=Attached_Below_Left'
816}
817
5f7264c7 818# These are listed in the Property aliases file in 6.0, but Unihan is ignored
99870f4d
KW
819# unless explicitly added.
820if ($v_version ge v5.2.0) {
821 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 822 foreach my $table (qw (
99870f4d
KW
823 kAccountingNumeric
824 kOtherNumeric
825 kPrimaryNumeric
826 kCompatibilityVariant
827 kIICore
828 kIRG_GSource
829 kIRG_HSource
830 kIRG_JSource
831 kIRG_KPSource
832 kIRG_MSource
833 kIRG_KSource
834 kIRG_TSource
835 kIRG_USource
836 kIRG_VSource
837 kRSUnicode
ea25a9b2 838 ))
99870f4d
KW
839 {
840 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
841 }
ca12659b
NC
842}
843
272501f6
KW
844# Enum values for to_output_map() method in the Map_Table package.
845my $EXTERNAL_MAP = 1;
846my $INTERNAL_MAP = 2;
847
fcf1973c
KW
848# To override computed values for writing the map tables for these properties.
849# The default for enum map tables is to write them out, so that the Unicode
850# .txt files can be removed, but all the data to compute any property value
851# for any code point is available in a more compact form.
852my %global_to_output_map = (
853 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
854 # get stuck supporting it if things change. Sinc it is a STRING property,
855 # it normally would be listed in the pod, but INTERNAL_MAP suppresses
856 # that.
857 Unicode_1_Name => $INTERNAL_MAP,
858
859 Present_In => 0, # Suppress, as easily computed from Age
860 Canonical_Combining_Class => 0, # Duplicate of CombiningClass.pl
861 Block => 0, # Suppress, as Blocks.txt is retained.
862);
863
99870f4d
KW
864# Properties that this program ignores.
865my @unimplemented_properties = (
866'Unicode_Radical_Stroke' # Remove if changing to handle this one.
867);
d73e5302 868
99870f4d
KW
869# There are several types of obsolete properties defined by Unicode. These
870# must be hand-edited for every new Unicode release.
871my %why_deprecated; # Generates a deprecated warning message if used.
872my %why_stabilized; # Documentation only
873my %why_obsolete; # Documentation only
874
875{ # Closure
876 my $simple = 'Perl uses the more complete version of this property';
877 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
878
879 my $other_properties = 'other properties';
880 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 881 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
882
883 %why_deprecated = (
5f7264c7 884 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
885 'Jamo_Short_Name' => $contributory,
886 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
887 'Other_Alphabetic' => $contributory,
888 'Other_Default_Ignorable_Code_Point' => $contributory,
889 'Other_Grapheme_Extend' => $contributory,
890 'Other_ID_Continue' => $contributory,
891 'Other_ID_Start' => $contributory,
892 'Other_Lowercase' => $contributory,
893 'Other_Math' => $contributory,
894 'Other_Uppercase' => $contributory,
895 );
896
897 %why_suppressed = (
5f7264c7 898 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
899 # contains the same information, but without the algorithmically
900 # determinable Hangul syllables'. This file is not published, so it's
901 # existence is not noted in the comment.
902 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
903
904 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
99870f4d
KW
905
906 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
907 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
908 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
909 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
910
911 'Name' => "Accessible via 'use charnames;'",
912 'Name_Alias' => "Accessible via 'use charnames;'",
913
5f7264c7 914 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
915 Expands_On_NFC => $why_no_expand,
916 Expands_On_NFD => $why_no_expand,
917 Expands_On_NFKC => $why_no_expand,
918 Expands_On_NFKD => $why_no_expand,
919 );
920
921 # The following are suppressed because they were made contributory or
922 # deprecated by Unicode before Perl ever thought about supporting them.
923 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
924 $why_suppressed{$property} = $why_deprecated{$property};
925 }
cf25bb62 926
99870f4d
KW
927 # Customize the message for all the 'Other_' properties
928 foreach my $property (keys %why_deprecated) {
929 next if (my $main_property = $property) !~ s/^Other_//;
930 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
931 }
932}
933
934if ($v_version ge 4.0.0) {
935 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
936 if ($v_version ge 6.0.0) {
937 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
938 }
99870f4d 939}
5f7264c7 940if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 941 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7
KW
942 if ($v_version ge 6.0.0) {
943 $why_deprecated{'ISO_Comment'} = 'No longer needed for chart generation; otherwise not useful, and code points for it have been removed';
944 }
99870f4d
KW
945}
946
947# Probably obsolete forever
948if ($v_version ge v4.1.0) {
82aed44a
KW
949 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
950}
951if ($v_version ge v6.0.0) {
952 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using Script_Extensions=Katakana or Script_Extensions=Hiragana (or both)"';
953 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either Script_Extensions=Katakana or Script_Extensions=Hiragana"';
99870f4d
KW
954}
955
956# This program can create files for enumerated-like properties, such as
957# 'Numeric_Type'. This file would be the same format as for a string
958# property, with a mapping from code point to its value, so you could look up,
959# for example, the script a code point is in. But no one so far wants this
960# mapping, or they have found another way to get it since this is a new
961# feature. So no file is generated except if it is in this list.
962my @output_mapped_properties = split "\n", <<END;
963END
964
965# If you are using the Unihan database, you need to add the properties that
966# you want to extract from it to this table. For your convenience, the
5f7264c7 967# properties in the 6.0 PropertyAliases.txt file are listed, commented out
99870f4d
KW
968my @cjk_properties = split "\n", <<'END';
969#cjkAccountingNumeric; kAccountingNumeric
970#cjkOtherNumeric; kOtherNumeric
971#cjkPrimaryNumeric; kPrimaryNumeric
972#cjkCompatibilityVariant; kCompatibilityVariant
973#cjkIICore ; kIICore
974#cjkIRG_GSource; kIRG_GSource
975#cjkIRG_HSource; kIRG_HSource
976#cjkIRG_JSource; kIRG_JSource
977#cjkIRG_KPSource; kIRG_KPSource
978#cjkIRG_KSource; kIRG_KSource
979#cjkIRG_TSource; kIRG_TSource
980#cjkIRG_USource; kIRG_USource
981#cjkIRG_VSource; kIRG_VSource
982#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
983END
984
985# Similarly for the property values. For your convenience, the lines in the
5f7264c7 986# 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
99870f4d
KW
987# '#' marks
988my @cjk_property_values = split "\n", <<'END';
989## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
990## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
991## @missing: 0000..10FFFF; cjkIICore; <none>
992## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
993## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
994## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
995## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
996## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
997## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
998## @missing: 0000..10FFFF; cjkIRG_USource; <none>
999## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1000## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1001## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1002## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1003END
1004
1005# The input files don't list every code point. Those not listed are to be
1006# defaulted to some value. Below are hard-coded what those values are for
1007# non-binary properties as of 5.1. Starting in 5.0, there are
1008# machine-parsable comment lines in the files the give the defaults; so this
1009# list shouldn't have to be extended. The claim is that all missing entries
1010# for binary properties will default to 'N'. Unicode tried to change that in
1011# 5.2, but the beta period produced enough protest that they backed off.
1012#
1013# The defaults for the fields that appear in UnicodeData.txt in this hash must
1014# be in the form that it expects. The others may be synonyms.
1015my $CODE_POINT = '<code point>';
1016my %default_mapping = (
1017 Age => "Unassigned",
1018 # Bidi_Class => Complicated; set in code
1019 Bidi_Mirroring_Glyph => "",
1020 Block => 'No_Block',
1021 Canonical_Combining_Class => 0,
1022 Case_Folding => $CODE_POINT,
1023 Decomposition_Mapping => $CODE_POINT,
1024 Decomposition_Type => 'None',
1025 East_Asian_Width => "Neutral",
1026 FC_NFKC_Closure => $CODE_POINT,
1027 General_Category => 'Cn',
1028 Grapheme_Cluster_Break => 'Other',
1029 Hangul_Syllable_Type => 'NA',
1030 ISO_Comment => "",
1031 Jamo_Short_Name => "",
1032 Joining_Group => "No_Joining_Group",
1033 # Joining_Type => Complicated; set in code
1034 kIICore => 'N', # Is converted to binary
1035 #Line_Break => Complicated; set in code
1036 Lowercase_Mapping => $CODE_POINT,
1037 Name => "",
1038 Name_Alias => "",
1039 NFC_QC => 'Yes',
1040 NFD_QC => 'Yes',
1041 NFKC_QC => 'Yes',
1042 NFKD_QC => 'Yes',
1043 Numeric_Type => 'None',
1044 Numeric_Value => 'NaN',
1045 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1046 Sentence_Break => 'Other',
1047 Simple_Case_Folding => $CODE_POINT,
1048 Simple_Lowercase_Mapping => $CODE_POINT,
1049 Simple_Titlecase_Mapping => $CODE_POINT,
1050 Simple_Uppercase_Mapping => $CODE_POINT,
1051 Titlecase_Mapping => $CODE_POINT,
1052 Unicode_1_Name => "",
1053 Unicode_Radical_Stroke => "",
1054 Uppercase_Mapping => $CODE_POINT,
1055 Word_Break => 'Other',
1056);
1057
1058# Below are files that Unicode furnishes, but this program ignores, and why
1059my %ignored_files = (
1060 'CJKRadicals.txt' => 'Unihan data',
1061 'Index.txt' => 'An index, not actual data',
1062 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1063 'NamesList.txt' => 'Just adds commentary',
1064 'NormalizationCorrections.txt' => 'Data is already in other files.',
1065 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1066 'ReadMe.txt' => 'Just comments',
1067 'README.TXT' => 'Just comments',
1068 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
5f7264c7
KW
1069 'EmojiSources.txt' => 'Not of general utility: for Japanese legacy cell-phone applications',
1070 'IndicMatraCategory.txt' => 'Provisional',
1071 'IndicSyllabicCategory.txt' => 'Provisional',
99870f4d
KW
1072);
1073
678f13d5 1074### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1075
1076my $HEADER=<<"EOF";
1077# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1078# This file is machine-generated by $0 from the Unicode
1079# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1080EOF
1081
b6922eda 1082my $INTERNAL_ONLY=<<"EOF";
99870f4d
KW
1083
1084# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
b6922eda 1085# This file is for internal use by the Perl program only. The format and even
99870f4d
KW
1086# the name or existence of this file are subject to change without notice.
1087# Don't use it directly.
1088EOF
1089
1090my $DEVELOPMENT_ONLY=<<"EOF";
1091# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1092# This file contains information artificially constrained to code points
1093# present in Unicode release $string_compare_versions.
1094# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1095# not be used for production.
b6922eda
KW
1096
1097EOF
1098
99870f4d
KW
1099my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1100my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1101my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1102
1103# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1104# two must be 10; if there are 5, the first must not be a 0. Written this way
8c32d378
KW
1105# to decrease backtracking. The first one allows the code point to be at the
1106# end of a word, but to work properly, the word shouldn't end with a valid hex
1107# character. The second one won't match a code point at the end of a word,
1108# and doesn't have the run-on issue
1109my $run_on_code_point_re =
1110 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1111my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1112
1113# This matches the beginning of the line in the Unicode db files that give the
1114# defaults for code points not listed (i.e., missing) in the file. The code
1115# depends on this ending with a semi-colon, so it can assume it is a valid
1116# field when the line is split() by semi-colons
1117my $missing_defaults_prefix =
1118 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1119
1120# Property types. Unicode has more types, but these are sufficient for our
1121# purposes.
1122my $UNKNOWN = -1; # initialized to illegal value
1123my $NON_STRING = 1; # Either binary or enum
1124my $BINARY = 2;
1125my $ENUM = 3; # Include catalog
1126my $STRING = 4; # Anything else: string or misc
1127
1128# Some input files have lines that give default values for code points not
1129# contained in the file. Sometimes these should be ignored.
1130my $NO_DEFAULTS = 0; # Must evaluate to false
1131my $NOT_IGNORED = 1;
1132my $IGNORED = 2;
1133
1134# Range types. Each range has a type. Most ranges are type 0, for normal,
1135# and will appear in the main body of the tables in the output files, but
1136# there are other types of ranges as well, listed below, that are specially
1137# handled. There are pseudo-types as well that will never be stored as a
1138# type, but will affect the calculation of the type.
1139
1140# 0 is for normal, non-specials
1141my $MULTI_CP = 1; # Sequence of more than code point
1142my $HANGUL_SYLLABLE = 2;
1143my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1144my $NULL = 4; # The map is to the null string; utf8.c can't
1145 # handle these, nor is there an accepted syntax
1146 # for them in \p{} constructs
f86864ac 1147my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1148 # otherwise be $MULTI_CP type are instead type 0
1149
1150# process_generic_property_file() can accept certain overrides in its input.
1151# Each of these must begin AND end with $CMD_DELIM.
1152my $CMD_DELIM = "\a";
1153my $REPLACE_CMD = 'replace'; # Override the Replace
1154my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1155
1156my $NO = 0;
1157my $YES = 1;
1158
1159# Values for the Replace argument to add_range.
1160# $NO # Don't replace; add only the code points not
1161 # already present.
1162my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1163 # the comments at the subroutine definition.
1164my $UNCONDITIONALLY = 2; # Replace without conditions.
1165my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1166 # already there
56343c78 1167my $CROAK = 5; # Die with an error if is already there
99870f4d
KW
1168
1169# Flags to give property statuses. The phrases are to remind maintainers that
1170# if the flag is changed, the indefinite article referring to it in the
1171# documentation may need to be as well.
1172my $NORMAL = "";
1173my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1174 # it is suppressed
37e2e78e 1175my $PLACEHOLDER = 'P'; # Implies no pod entry generated
99870f4d
KW
1176my $DEPRECATED = 'D';
1177my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1178my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1179my $DISCOURAGED = 'X';
1180my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1181my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1182my $STRICTER = 'T';
1183my $a_bold_stricter = "a 'B<$STRICTER>'";
1184my $A_bold_stricter = "A 'B<$STRICTER>'";
1185my $STABILIZED = 'S';
1186my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1187my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1188my $OBSOLETE = 'O';
1189my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1190my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1191
1192my %status_past_participles = (
1193 $DISCOURAGED => 'discouraged',
1194 $SUPPRESSED => 'should never be generated',
1195 $STABILIZED => 'stabilized',
1196 $OBSOLETE => 'obsolete',
37e2e78e 1197 $DEPRECATED => 'deprecated',
99870f4d
KW
1198);
1199
f5817e0a
KW
1200# The format of the values of the tables:
1201my $EMPTY_FORMAT = "";
99870f4d
KW
1202my $BINARY_FORMAT = 'b';
1203my $DECIMAL_FORMAT = 'd';
1204my $FLOAT_FORMAT = 'f';
1205my $INTEGER_FORMAT = 'i';
1206my $HEX_FORMAT = 'x';
1207my $RATIONAL_FORMAT = 'r';
1208my $STRING_FORMAT = 's';
a14f3cb1 1209my $DECOMP_STRING_FORMAT = 'c';
99870f4d
KW
1210
1211my %map_table_formats = (
1212 $BINARY_FORMAT => 'binary',
1213 $DECIMAL_FORMAT => 'single decimal digit',
1214 $FLOAT_FORMAT => 'floating point number',
1215 $INTEGER_FORMAT => 'integer',
1216 $HEX_FORMAT => 'positive hex whole number; a code point',
1217 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1218 $STRING_FORMAT => 'string',
92f9d56c 1219 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
99870f4d
KW
1220);
1221
1222# Unicode didn't put such derived files in a separate directory at first.
1223my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1224my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1225my $AUXILIARY = 'auxiliary';
1226
1227# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1228my %loose_to_file_of; # loosely maps table names to their respective
1229 # files
1230my %stricter_to_file_of; # same; but for stricter mapping.
1231my %nv_floating_to_rational; # maps numeric values floating point numbers to
1232 # their rational equivalent
1233my %loose_property_name_of; # Loosely maps property names to standard form
1234
d867ccfb
KW
1235# Most properties are immune to caseless matching, otherwise you would get
1236# nonsensical results, as properties are a function of a code point, not
1237# everything that is caselessly equivalent to that code point. For example,
1238# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1239# be true because 's' and 'S' are equivalent caselessly. However,
1240# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1241# extend that concept to those very few properties that are like this. Each
1242# such property will match the full range caselessly. They are hard-coded in
1243# the program; it's not worth trying to make it general as it's extremely
1244# unlikely that they will ever change.
1245my %caseless_equivalent_to;
1246
99870f4d
KW
1247# These constants names and values were taken from the Unicode standard,
1248# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1249# syllables. The '_string' versions are so generated tables can retain the
1250# hex format, which is the more familiar value
1251my $SBase_string = "0xAC00";
1252my $SBase = CORE::hex $SBase_string;
1253my $LBase_string = "0x1100";
1254my $LBase = CORE::hex $LBase_string;
1255my $VBase_string = "0x1161";
1256my $VBase = CORE::hex $VBase_string;
1257my $TBase_string = "0x11A7";
1258my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1259my $SCount = 11172;
1260my $LCount = 19;
1261my $VCount = 21;
1262my $TCount = 28;
1263my $NCount = $VCount * $TCount;
1264
1265# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1266# with the above published constants.
1267my %Jamo;
1268my %Jamo_L; # Leading consonants
1269my %Jamo_V; # Vowels
1270my %Jamo_T; # Trailing consonants
1271
37e2e78e 1272my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1273my @unhandled_properties; # Will contain a list of properties found in
1274 # the input that we didn't process.
f86864ac 1275my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1276 # listed in the pod
1277my @map_properties; # Properties that get map files written
1278my @named_sequences; # NamedSequences.txt contents.
1279my %potential_files; # Generated list of all .txt files in the directory
1280 # structure so we can warn if something is being
1281 # ignored.
1282my @files_actually_output; # List of files we generated.
1283my @more_Names; # Some code point names are compound; this is used
1284 # to store the extra components of them.
1285my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1286 # the minimum before we consider it equivalent to a
1287 # candidate rational
1288my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1289
1290# These store references to certain commonly used property objects
1291my $gc;
1292my $perl;
1293my $block;
3e20195b
KW
1294my $perl_charname;
1295my $print;
7fc6cb55 1296my $Any;
99870f4d
KW
1297
1298# Are there conflicting names because of beginning with 'In_', or 'Is_'
1299my $has_In_conflicts = 0;
1300my $has_Is_conflicts = 0;
1301
1302sub internal_file_to_platform ($) {
1303 # Convert our file paths which have '/' separators to those of the
1304 # platform.
1305
1306 my $file = shift;
1307 return undef unless defined $file;
1308
1309 return File::Spec->join(split '/', $file);
d07a55ed 1310}
5beb625e 1311
99870f4d
KW
1312sub file_exists ($) { # platform independent '-e'. This program internally
1313 # uses slash as a path separator.
1314 my $file = shift;
1315 return 0 if ! defined $file;
1316 return -e internal_file_to_platform($file);
1317}
5beb625e 1318
99870f4d 1319sub objaddr($) {
23e33b60
KW
1320 # Returns the address of the blessed input object.
1321 # It doesn't check for blessedness because that would do a string eval
1322 # every call, and the program is structured so that this is never called
1323 # for a non-blessed object.
99870f4d 1324
23e33b60 1325 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1326
1327 # Numifying a ref gives its address.
051df77b 1328 return pack 'J', $_[0];
99870f4d
KW
1329}
1330
558712cf 1331# These are used only if $annotate is true.
c4019d52
KW
1332# The entire range of Unicode characters is examined to populate these
1333# after all the input has been processed. But most can be skipped, as they
1334# have the same descriptive phrases, such as being unassigned
1335my @viacode; # Contains the 1 million character names
1336my @printable; # boolean: And are those characters printable?
1337my @annotate_char_type; # Contains a type of those characters, specifically
1338 # for the purposes of annotation.
1339my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1340 # name for the purposes of annotation. They map to the
c4019d52
KW
1341 # upper edge of the range, so that the end point can
1342 # be immediately found. This is used to skip ahead to
1343 # the end of a range, and avoid processing each
1344 # individual code point in it.
1345my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1346 # characters, but excluding those which are
1347 # also noncharacter code points
1348
1349# The annotation types are an extension of the regular range types, though
1350# some of the latter are folded into one. Make the new types negative to
1351# avoid conflicting with the regular types
1352my $SURROGATE_TYPE = -1;
1353my $UNASSIGNED_TYPE = -2;
1354my $PRIVATE_USE_TYPE = -3;
1355my $NONCHARACTER_TYPE = -4;
1356my $CONTROL_TYPE = -5;
1357my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1358
1359sub populate_char_info ($) {
558712cf 1360 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1361 # input code point's info that are needed for outputting more detailed
1362 # comments. If calling context wants a return, it is the end point of
1363 # any contiguous range of characters that share essentially the same info
1364
1365 my $i = shift;
1366 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1367
1368 $viacode[$i] = $perl_charname->value_of($i) || "";
1369
1370 # A character is generally printable if Unicode says it is,
1371 # but below we make sure that most Unicode general category 'C' types
1372 # aren't.
1373 $printable[$i] = $print->contains($i);
1374
1375 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1376
1377 # Only these two regular types are treated specially for annotations
1378 # purposes
1379 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1380 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1381
1382 # Give a generic name to all code points that don't have a real name.
1383 # We output ranges, if applicable, for these. Also calculate the end
1384 # point of the range.
1385 my $end;
1386 if (! $viacode[$i]) {
1387 if ($gc-> table('Surrogate')->contains($i)) {
1388 $viacode[$i] = 'Surrogate';
1389 $annotate_char_type[$i] = $SURROGATE_TYPE;
1390 $printable[$i] = 0;
1391 $end = $gc->table('Surrogate')->containing_range($i)->end;
1392 }
1393 elsif ($gc-> table('Private_use')->contains($i)) {
1394 $viacode[$i] = 'Private Use';
1395 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1396 $printable[$i] = 0;
1397 $end = $gc->table('Private_Use')->containing_range($i)->end;
1398 }
1399 elsif (Property::property_ref('Noncharacter_Code_Point')-> table('Y')->
1400 contains($i))
1401 {
1402 $viacode[$i] = 'Noncharacter';
1403 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1404 $printable[$i] = 0;
1405 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1406 containing_range($i)->end;
1407 }
1408 elsif ($gc-> table('Control')->contains($i)) {
1409 $viacode[$i] = 'Control';
1410 $annotate_char_type[$i] = $CONTROL_TYPE;
1411 $printable[$i] = 0;
1412 $end = 0x81 if $i == 0x80; # Hard-code this one known case
1413 }
1414 elsif ($gc-> table('Unassigned')->contains($i)) {
1415 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1416 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1417 $printable[$i] = 0;
1418
1419 # Because we name the unassigned by the blocks they are in, it
1420 # can't go past the end of that block, and it also can't go past
1421 # the unassigned range it is in. The special table makes sure
1422 # that the non-characters, which are unassigned, are separated
1423 # out.
1424 $end = min($block->containing_range($i)->end,
1425 $unassigned_sans_noncharacters-> containing_range($i)->
1426 end);
13ca76ff
KW
1427 }
1428 else {
1429 Carp::my_carp_bug("Can't figure out how to annotate "
1430 . sprintf("U+%04X", $i)
1431 . ". Proceeding anyway.");
c4019d52
KW
1432 $viacode[$i] = 'UNKNOWN';
1433 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1434 $printable[$i] = 0;
1435 }
1436 }
1437
1438 # Here, has a name, but if it's one in which the code point number is
1439 # appended to the name, do that.
1440 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1441 $viacode[$i] .= sprintf("-%04X", $i);
1442 $end = $perl_charname->containing_range($i)->end;
1443 }
1444
1445 # And here, has a name, but if it's a hangul syllable one, replace it with
1446 # the correct name from the Unicode algorithm
1447 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1448 use integer;
1449 my $SIndex = $i - $SBase;
1450 my $L = $LBase + $SIndex / $NCount;
1451 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1452 my $T = $TBase + $SIndex % $TCount;
1453 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1454 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1455 $end = $perl_charname->containing_range($i)->end;
1456 }
1457
1458 return if ! defined wantarray;
1459 return $i if ! defined $end; # If not a range, return the input
1460
1461 # Save this whole range so can find the end point quickly
1462 $annotate_ranges->add_map($i, $end, $end);
1463
1464 return $end;
1465}
1466
23e33b60
KW
1467# Commented code below should work on Perl 5.8.
1468## This 'require' doesn't necessarily work in miniperl, and even if it does,
1469## the native perl version of it (which is what would operate under miniperl)
1470## is extremely slow, as it does a string eval every call.
1471#my $has_fast_scalar_util = $\18 !~ /miniperl/
1472# && defined eval "require Scalar::Util";
1473#
1474#sub objaddr($) {
1475# # Returns the address of the blessed input object. Uses the XS version if
1476# # available. It doesn't check for blessedness because that would do a
1477# # string eval every call, and the program is structured so that this is
1478# # never called for a non-blessed object.
1479#
1480# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1481#
1482# # Check at least that is a ref.
1483# my $pkg = ref($_[0]) or return undef;
1484#
1485# # Change to a fake package to defeat any overloaded stringify
1486# bless $_[0], 'main::Fake';
1487#
1488# # Numifying a ref gives its address.
051df77b 1489# my $addr = pack 'J', $_[0];
23e33b60
KW
1490#
1491# # Return to original class
1492# bless $_[0], $pkg;
1493# return $addr;
1494#}
1495
99870f4d
KW
1496sub max ($$) {
1497 my $a = shift;
1498 my $b = shift;
1499 return $a if $a >= $b;
1500 return $b;
1501}
1502
1503sub min ($$) {
1504 my $a = shift;
1505 my $b = shift;
1506 return $a if $a <= $b;
1507 return $b;
1508}
1509
1510sub clarify_number ($) {
1511 # This returns the input number with underscores inserted every 3 digits
1512 # in large (5 digits or more) numbers. Input must be entirely digits, not
1513 # checked.
1514
1515 my $number = shift;
1516 my $pos = length($number) - 3;
1517 return $number if $pos <= 1;
1518 while ($pos > 0) {
1519 substr($number, $pos, 0) = '_';
1520 $pos -= 3;
5beb625e 1521 }
99870f4d 1522 return $number;
99598c8c
JH
1523}
1524
12ac2576 1525
99870f4d 1526package Carp;
7ebf06b3 1527
99870f4d
KW
1528# These routines give a uniform treatment of messages in this program. They
1529# are placed in the Carp package to cause the stack trace to not include them,
1530# although an alternative would be to use another package and set @CARP_NOT
1531# for it.
12ac2576 1532
99870f4d 1533our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1534
99f78760
KW
1535# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1536# and overload trying to load Scalar:Util under miniperl. See
1537# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1538undef $overload::VERSION;
1539
99870f4d
KW
1540sub my_carp {
1541 my $message = shift || "";
1542 my $nofold = shift || 0;
7ebf06b3 1543
99870f4d
KW
1544 if ($message) {
1545 $message = main::join_lines($message);
1546 $message =~ s/^$0: *//; # Remove initial program name
1547 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1548 $message = "\n$0: $message;";
12ac2576 1549
99870f4d
KW
1550 # Fold the message with program name, semi-colon end punctuation
1551 # (which looks good with the message that carp appends to it), and a
1552 # hanging indent for continuation lines.
1553 $message = main::simple_fold($message, "", 4) unless $nofold;
1554 $message =~ s/\n$//; # Remove the trailing nl so what carp
1555 # appends is to the same line
1556 }
12ac2576 1557
99870f4d 1558 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1559
99870f4d
KW
1560 carp $message;
1561 return;
1562}
7ebf06b3 1563
99870f4d
KW
1564sub my_carp_bug {
1565 # This is called when it is clear that the problem is caused by a bug in
1566 # this program.
7ebf06b3 1567
99870f4d
KW
1568 my $message = shift;
1569 $message =~ s/^$0: *//;
1570 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1571 carp $message;
1572 return;
1573}
7ebf06b3 1574
99870f4d
KW
1575sub carp_too_few_args {
1576 if (@_ != 2) {
1577 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1578 return;
12ac2576 1579 }
7ebf06b3 1580
99870f4d
KW
1581 my $args_ref = shift;
1582 my $count = shift;
7ebf06b3 1583
99870f4d
KW
1584 my_carp_bug("Need at least $count arguments to "
1585 . (caller 1)[3]
1586 . ". Instead got: '"
1587 . join ', ', @$args_ref
1588 . "'. No action taken.");
1589 return;
12ac2576
JP
1590}
1591
99870f4d
KW
1592sub carp_extra_args {
1593 my $args_ref = shift;
1594 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1595
99870f4d
KW
1596 unless (ref $args_ref) {
1597 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1598 return;
1599 }
1600 my ($package, $file, $line) = caller;
1601 my $subroutine = (caller 1)[3];
cf25bb62 1602
99870f4d
KW
1603 my $list;
1604 if (ref $args_ref eq 'HASH') {
1605 foreach my $key (keys %$args_ref) {
1606 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1607 }
99870f4d 1608 $list = join ', ', each %{$args_ref};
cf25bb62 1609 }
99870f4d
KW
1610 elsif (ref $args_ref eq 'ARRAY') {
1611 foreach my $arg (@$args_ref) {
1612 $arg = $UNDEF unless defined $arg;
1613 }
1614 $list = join ', ', @$args_ref;
1615 }
1616 else {
1617 my_carp_bug("Can't cope with ref "
1618 . ref($args_ref)
1619 . " . argument to 'carp_extra_args'. Not checking arguments.");
1620 return;
1621 }
1622
1623 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1624 return;
d73e5302
JH
1625}
1626
99870f4d
KW
1627package main;
1628
1629{ # Closure
1630
1631 # This program uses the inside-out method for objects, as recommended in
1632 # "Perl Best Practices". This closure aids in generating those. There
1633 # are two routines. setup_package() is called once per package to set
1634 # things up, and then set_access() is called for each hash representing a
1635 # field in the object. These routines arrange for the object to be
1636 # properly destroyed when no longer used, and for standard accessor
1637 # functions to be generated. If you need more complex accessors, just
1638 # write your own and leave those accesses out of the call to set_access().
1639 # More details below.
1640
1641 my %constructor_fields; # fields that are to be used in constructors; see
1642 # below
1643
1644 # The values of this hash will be the package names as keys to other
1645 # hashes containing the name of each field in the package as keys, and
1646 # references to their respective hashes as values.
1647 my %package_fields;
1648
1649 sub setup_package {
1650 # Sets up the package, creating standard DESTROY and dump methods
1651 # (unless already defined). The dump method is used in debugging by
1652 # simple_dumper().
1653 # The optional parameters are:
1654 # a) a reference to a hash, that gets populated by later
1655 # set_access() calls with one of the accesses being
1656 # 'constructor'. The caller can then refer to this, but it is
1657 # not otherwise used by these two routines.
1658 # b) a reference to a callback routine to call during destruction
1659 # of the object, before any fields are actually destroyed
1660
1661 my %args = @_;
1662 my $constructor_ref = delete $args{'Constructor_Fields'};
1663 my $destroy_callback = delete $args{'Destroy_Callback'};
1664 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1665
1666 my %fields;
1667 my $package = (caller)[0];
1668
1669 $package_fields{$package} = \%fields;
1670 $constructor_fields{$package} = $constructor_ref;
1671
1672 unless ($package->can('DESTROY')) {
1673 my $destroy_name = "${package}::DESTROY";
1674 no strict "refs";
1675
1676 # Use typeglob to give the anonymous subroutine the name we want
1677 *$destroy_name = sub {
1678 my $self = shift;
ffe43484 1679 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1680
1681 $self->$destroy_callback if $destroy_callback;
1682 foreach my $field (keys %{$package_fields{$package}}) {
1683 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1684 delete $package_fields{$package}{$field}{$addr};
1685 }
1686 return;
1687 }
1688 }
1689
1690 unless ($package->can('dump')) {
1691 my $dump_name = "${package}::dump";
1692 no strict "refs";
1693 *$dump_name = sub {
1694 my $self = shift;
1695 return dump_inside_out($self, $package_fields{$package}, @_);
1696 }
1697 }
1698 return;
1699 }
1700
1701 sub set_access {
1702 # Arrange for the input field to be garbage collected when no longer
1703 # needed. Also, creates standard accessor functions for the field
1704 # based on the optional parameters-- none if none of these parameters:
1705 # 'addable' creates an 'add_NAME()' accessor function.
1706 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1707 # function.
1708 # 'settable' creates a 'set_NAME()' accessor function.
1709 # 'constructor' doesn't create an accessor function, but adds the
1710 # field to the hash that was previously passed to
1711 # setup_package();
1712 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1713 # 'add' etc. all mean 'addable'.
1714 # The read accessor function will work on both array and scalar
1715 # values. If another accessor in the parameter list is 'a', the read
1716 # access assumes an array. You can also force it to be array access
1717 # by specifying 'readable_array' instead of 'readable'
1718 #
1719 # A sort-of 'protected' access can be set-up by preceding the addable,
1720 # readable or settable with some initial portion of 'protected_' (but,
1721 # the underscore is required), like 'p_a', 'pro_set', etc. The
1722 # "protection" is only by convention. All that happens is that the
1723 # accessor functions' names begin with an underscore. So instead of
1724 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1725 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1726 # end of each package, and then storing the __LINE__ ranges and
1727 # checking them on every accessor. But that is way overkill.)
1728
1729 # We create anonymous subroutines as the accessors and then use
1730 # typeglobs to assign them to the proper package and name
1731
1732 my $name = shift; # Name of the field
1733 my $field = shift; # Reference to the inside-out hash containing the
1734 # field
1735
1736 my $package = (caller)[0];
1737
1738 if (! exists $package_fields{$package}) {
1739 croak "$0: Must call 'setup_package' before 'set_access'";
1740 }
d73e5302 1741
99870f4d
KW
1742 # Stash the field so DESTROY can get it.
1743 $package_fields{$package}{$name} = $field;
cf25bb62 1744
99870f4d
KW
1745 # Remaining arguments are the accessors. For each...
1746 foreach my $access (@_) {
1747 my $access = lc $access;
cf25bb62 1748
99870f4d 1749 my $protected = "";
cf25bb62 1750
99870f4d
KW
1751 # Match the input as far as it goes.
1752 if ($access =~ /^(p[^_]*)_/) {
1753 $protected = $1;
1754 if (substr('protected_', 0, length $protected)
1755 eq $protected)
1756 {
1757
1758 # Add 1 for the underscore not included in $protected
1759 $access = substr($access, length($protected) + 1);
1760 $protected = '_';
1761 }
1762 else {
1763 $protected = "";
1764 }
1765 }
1766
1767 if (substr('addable', 0, length $access) eq $access) {
1768 my $subname = "${package}::${protected}add_$name";
1769 no strict "refs";
1770
1771 # add_ accessor. Don't add if already there, which we
1772 # determine using 'eq' for scalars and '==' otherwise.
1773 *$subname = sub {
1774 use strict "refs";
1775 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1776 my $self = shift;
1777 my $value = shift;
ffe43484 1778 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1779 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1780 if (ref $value) {
f998e60c 1781 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1782 }
1783 else {
f998e60c 1784 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1785 }
f998e60c 1786 push @{$field->{$addr}}, $value;
99870f4d
KW
1787 return;
1788 }
1789 }
1790 elsif (substr('constructor', 0, length $access) eq $access) {
1791 if ($protected) {
1792 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1793 }
1794 else {
1795 $constructor_fields{$package}{$name} = $field;
1796 }
1797 }
1798 elsif (substr('readable_array', 0, length $access) eq $access) {
1799
1800 # Here has read access. If one of the other parameters for
1801 # access is array, or this one specifies array (by being more
1802 # than just 'readable_'), then create a subroutine that
1803 # assumes the data is an array. Otherwise just a scalar
1804 my $subname = "${package}::${protected}$name";
1805 if (grep { /^a/i } @_
1806 or length($access) > length('readable_'))
1807 {
1808 no strict "refs";
1809 *$subname = sub {
1810 use strict "refs";
23e33b60 1811 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1812 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1813 if (ref $field->{$addr} ne 'ARRAY') {
1814 my $type = ref $field->{$addr};
1815 $type = 'scalar' unless $type;
1816 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1817 return;
1818 }
1819 return scalar @{$field->{$addr}} unless wantarray;
1820
1821 # Make a copy; had problems with caller modifying the
1822 # original otherwise
1823 my @return = @{$field->{$addr}};
1824 return @return;
1825 }
1826 }
1827 else {
1828
1829 # Here not an array value, a simpler function.
1830 no strict "refs";
1831 *$subname = sub {
1832 use strict "refs";
23e33b60 1833 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1834 no overloading;
051df77b 1835 return $field->{pack 'J', $_[0]};
99870f4d
KW
1836 }
1837 }
1838 }
1839 elsif (substr('settable', 0, length $access) eq $access) {
1840 my $subname = "${package}::${protected}set_$name";
1841 no strict "refs";
1842 *$subname = sub {
1843 use strict "refs";
23e33b60
KW
1844 if (main::DEBUG) {
1845 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1846 Carp::carp_extra_args(\@_) if @_ > 2;
1847 }
1848 # $self is $_[0]; $value is $_[1]
f998e60c 1849 no overloading;
051df77b 1850 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1851 return;
1852 }
1853 }
1854 else {
1855 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1856 }
cf25bb62 1857 }
99870f4d 1858 return;
cf25bb62 1859 }
99870f4d
KW
1860}
1861
1862package Input_file;
1863
1864# All input files use this object, which stores various attributes about them,
1865# and provides for convenient, uniform handling. The run method wraps the
1866# processing. It handles all the bookkeeping of opening, reading, and closing
1867# the file, returning only significant input lines.
1868#
1869# Each object gets a handler which processes the body of the file, and is
1870# called by run(). Most should use the generic, default handler, which has
1871# code scrubbed to handle things you might not expect. A handler should
1872# basically be a while(next_line()) {...} loop.
1873#
1874# You can also set up handlers to
1875# 1) call before the first line is read for pre processing
1876# 2) call to adjust each line of the input before the main handler gets them
1877# 3) call upon EOF before the main handler exits its loop
1878# 4) call at the end for post processing
1879#
1880# $_ is used to store the input line, and is to be filtered by the
1881# each_line_handler()s. So, if the format of the line is not in the desired
1882# format for the main handler, these are used to do that adjusting. They can
1883# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1884# so the $_ output of one is used as the input to the next. None of the other
1885# handlers are stackable, but could easily be changed to be so.
1886#
1887# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1888# which insert the parameters as lines to be processed before the next input
1889# file line is read. This allows the EOF handler to flush buffers, for
1890# example. The difference between the two routines is that the lines inserted
1891# by insert_lines() are subjected to the each_line_handler()s. (So if you
1892# called it from such a handler, you would get infinite recursion.) Lines
1893# inserted by insert_adjusted_lines() go directly to the main handler without
1894# any adjustments. If the post-processing handler calls any of these, there
1895# will be no effect. Some error checking for these conditions could be added,
1896# but it hasn't been done.
1897#
1898# carp_bad_line() should be called to warn of bad input lines, which clears $_
1899# to prevent further processing of the line. This routine will output the
1900# message as a warning once, and then keep a count of the lines that have the
1901# same message, and output that count at the end of the file's processing.
1902# This keeps the number of messages down to a manageable amount.
1903#
1904# get_missings() should be called to retrieve any @missing input lines.
1905# Messages will be raised if this isn't done if the options aren't to ignore
1906# missings.
1907
1908sub trace { return main::trace(@_); }
1909
99870f4d
KW
1910{ # Closure
1911 # Keep track of fields that are to be put into the constructor.
1912 my %constructor_fields;
1913
1914 main::setup_package(Constructor_Fields => \%constructor_fields);
1915
1916 my %file; # Input file name, required
1917 main::set_access('file', \%file, qw{ c r });
1918
1919 my %first_released; # Unicode version file was first released in, required
1920 main::set_access('first_released', \%first_released, qw{ c r });
1921
1922 my %handler; # Subroutine to process the input file, defaults to
1923 # 'process_generic_property_file'
1924 main::set_access('handler', \%handler, qw{ c });
1925
1926 my %property;
1927 # name of property this file is for. defaults to none, meaning not
1928 # applicable, or is otherwise determinable, for example, from each line.
1929 main::set_access('property', \%property, qw{ c });
1930
1931 my %optional;
1932 # If this is true, the file is optional. If not present, no warning is
1933 # output. If it is present, the string given by this parameter is
1934 # evaluated, and if false the file is not processed.
1935 main::set_access('optional', \%optional, 'c', 'r');
1936
1937 my %non_skip;
1938 # This is used for debugging, to skip processing of all but a few input
1939 # files. Add 'non_skip => 1' to the constructor for those files you want
1940 # processed when you set the $debug_skip global.
1941 main::set_access('non_skip', \%non_skip, 'c');
1942
37e2e78e
KW
1943 my %skip;
1944 # This is used to skip processing of this input file semi-permanently.
1945 # It is used for files that we aren't planning to process anytime soon,
1946 # but want to allow to be in the directory and not raise a message that we
1947 # are not handling. Mostly for test files. This is in contrast to the
1948 # non_skip element, which is supposed to be used very temporarily for
1949 # debugging. Sets 'optional' to 1
1950 main::set_access('skip', \%skip, 'c');
1951
99870f4d
KW
1952 my %each_line_handler;
1953 # list of subroutines to look at and filter each non-comment line in the
1954 # file. defaults to none. The subroutines are called in order, each is
1955 # to adjust $_ for the next one, and the final one adjusts it for
1956 # 'handler'
1957 main::set_access('each_line_handler', \%each_line_handler, 'c');
1958
1959 my %has_missings_defaults;
1960 # ? Are there lines in the file giving default values for code points
1961 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1962 # the norm, but IGNORED means it has such lines, but the handler doesn't
1963 # use them. Having these three states allows us to catch changes to the
1964 # UCD that this program should track
1965 main::set_access('has_missings_defaults',
1966 \%has_missings_defaults, qw{ c r });
1967
1968 my %pre_handler;
1969 # Subroutine to call before doing anything else in the file. If undef, no
1970 # such handler is called.
1971 main::set_access('pre_handler', \%pre_handler, qw{ c });
1972
1973 my %eof_handler;
1974 # Subroutine to call upon getting an EOF on the input file, but before
1975 # that is returned to the main handler. This is to allow buffers to be
1976 # flushed. The handler is expected to call insert_lines() or
1977 # insert_adjusted() with the buffered material
1978 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1979
1980 my %post_handler;
1981 # Subroutine to call after all the lines of the file are read in and
1982 # processed. If undef, no such handler is called.
1983 main::set_access('post_handler', \%post_handler, qw{ c });
1984
1985 my %progress_message;
1986 # Message to print to display progress in lieu of the standard one
1987 main::set_access('progress_message', \%progress_message, qw{ c });
1988
1989 my %handle;
1990 # cache open file handle, internal. Is undef if file hasn't been
1991 # processed at all, empty if has;
1992 main::set_access('handle', \%handle);
1993
1994 my %added_lines;
1995 # cache of lines added virtually to the file, internal
1996 main::set_access('added_lines', \%added_lines);
1997
1998 my %errors;
1999 # cache of errors found, internal
2000 main::set_access('errors', \%errors);
2001
2002 my %missings;
2003 # storage of '@missing' defaults lines
2004 main::set_access('missings', \%missings);
2005
2006 sub new {
2007 my $class = shift;
2008
2009 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2010 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2011
2012 # Set defaults
2013 $handler{$addr} = \&main::process_generic_property_file;
2014 $non_skip{$addr} = 0;
37e2e78e 2015 $skip{$addr} = 0;
99870f4d
KW
2016 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2017 $handle{$addr} = undef;
2018 $added_lines{$addr} = [ ];
2019 $each_line_handler{$addr} = [ ];
2020 $errors{$addr} = { };
2021 $missings{$addr} = [ ];
2022
2023 # Two positional parameters.
99f78760 2024 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2025 $file{$addr} = main::internal_file_to_platform(shift);
2026 $first_released{$addr} = shift;
2027
2028 # The rest of the arguments are key => value pairs
2029 # %constructor_fields has been set up earlier to list all possible
2030 # ones. Either set or push, depending on how the default has been set
2031 # up just above.
2032 my %args = @_;
2033 foreach my $key (keys %args) {
2034 my $argument = $args{$key};
2035
2036 # Note that the fields are the lower case of the constructor keys
2037 my $hash = $constructor_fields{lc $key};
2038 if (! defined $hash) {
2039 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2040 next;
2041 }
2042 if (ref $hash->{$addr} eq 'ARRAY') {
2043 if (ref $argument eq 'ARRAY') {
2044 foreach my $argument (@{$argument}) {
2045 next if ! defined $argument;
2046 push @{$hash->{$addr}}, $argument;
2047 }
2048 }
2049 else {
2050 push @{$hash->{$addr}}, $argument if defined $argument;
2051 }
2052 }
2053 else {
2054 $hash->{$addr} = $argument;
2055 }
2056 delete $args{$key};
2057 };
2058
2059 # If the file has a property for it, it means that the property is not
2060 # listed in the file's entries. So add a handler to the list of line
2061 # handlers to insert the property name into the lines, to provide a
2062 # uniform interface to the final processing subroutine.
2063 # the final code doesn't have to worry about that.
2064 if ($property{$addr}) {
2065 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2066 }
2067
2068 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2069 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 2070 }
99870f4d 2071
37e2e78e
KW
2072 $optional{$addr} = 1 if $skip{$addr};
2073
99870f4d 2074 return $self;
d73e5302
JH
2075 }
2076
cf25bb62 2077
99870f4d
KW
2078 use overload
2079 fallback => 0,
2080 qw("") => "_operator_stringify",
2081 "." => \&main::_operator_dot,
2082 ;
cf25bb62 2083
99870f4d
KW
2084 sub _operator_stringify {
2085 my $self = shift;
cf25bb62 2086
99870f4d 2087 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2088 }
d73e5302 2089
99870f4d
KW
2090 # flag to make sure extracted files are processed early
2091 my $seen_non_extracted_non_age = 0;
d73e5302 2092
99870f4d
KW
2093 sub run {
2094 # Process the input object $self. This opens and closes the file and
2095 # calls all the handlers for it. Currently, this can only be called
2096 # once per file, as it destroy's the EOF handler
d73e5302 2097
99870f4d
KW
2098 my $self = shift;
2099 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2100
ffe43484 2101 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2102
99870f4d 2103 my $file = $file{$addr};
d73e5302 2104
99870f4d
KW
2105 # Don't process if not expecting this file (because released later
2106 # than this Unicode version), and isn't there. This means if someone
2107 # copies it into an earlier version's directory, we will go ahead and
2108 # process it.
2109 return if $first_released{$addr} gt $v_version && ! -e $file;
2110
2111 # If in debugging mode and this file doesn't have the non-skip
2112 # flag set, and isn't one of the critical files, skip it.
2113 if ($debug_skip
2114 && $first_released{$addr} ne v0
2115 && ! $non_skip{$addr})
2116 {
2117 print "Skipping $file in debugging\n" if $verbosity;
2118 return;
2119 }
2120
2121 # File could be optional
37e2e78e 2122 if ($optional{$addr}) {
99870f4d
KW
2123 return unless -e $file;
2124 my $result = eval $optional{$addr};
2125 if (! defined $result) {
2126 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2127 return;
2128 }
2129 if (! $result) {
2130 if ($verbosity) {
2131 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2132 }
2133 return;
2134 }
2135 }
2136
2137 if (! defined $file || ! -e $file) {
2138
2139 # If the file doesn't exist, see if have internal data for it
2140 # (based on first_released being 0).
2141 if ($first_released{$addr} eq v0) {
2142 $handle{$addr} = 'pretend_is_open';
2143 }
2144 else {
2145 if (! $optional{$addr} # File could be optional
2146 && $v_version ge $first_released{$addr})
2147 {
2148 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2149 }
2150 return;
2151 }
2152 }
2153 else {
2154
37e2e78e
KW
2155 # Here, the file exists. Some platforms may change the case of
2156 # its name
99870f4d 2157 if ($seen_non_extracted_non_age) {
517956bf 2158 if ($file =~ /$EXTRACTED/i) {
99870f4d 2159 Carp::my_carp_bug(join_lines(<<END
99f78760 2160$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2161anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2162have subtle problems
2163END
2164 ));
2165 }
2166 }
2167 elsif ($EXTRACTED_DIR
2168 && $first_released{$addr} ne v0
517956bf
CB
2169 && $file !~ /$EXTRACTED/i
2170 && lc($file) ne 'dage.txt')
99870f4d
KW
2171 {
2172 # We don't set this (by the 'if' above) if we have no
2173 # extracted directory, so if running on an early version,
2174 # this test won't work. Not worth worrying about.
2175 $seen_non_extracted_non_age = 1;
2176 }
2177
2178 # And mark the file as having being processed, and warn if it
2179 # isn't a file we are expecting. As we process the files,
2180 # they are deleted from the hash, so any that remain at the
2181 # end of the program are files that we didn't process.
517956bf
CB
2182 my $fkey = File::Spec->rel2abs($file);
2183 my $expecting = delete $potential_files{$fkey};
2184 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
678f13d5
KW
2185 Carp::my_carp("Was not expecting '$file'.") if
2186 ! $expecting
99870f4d
KW
2187 && ! defined $handle{$addr};
2188
37e2e78e
KW
2189 # Having deleted from expected files, we can quit if not to do
2190 # anything. Don't print progress unless really want verbosity
2191 if ($skip{$addr}) {
2192 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2193 return;
2194 }
2195
99870f4d
KW
2196 # Open the file, converting the slashes used in this program
2197 # into the proper form for the OS
2198 my $file_handle;
2199 if (not open $file_handle, "<", $file) {
2200 Carp::my_carp("Can't open $file. Skipping: $!");
2201 return 0;
2202 }
2203 $handle{$addr} = $file_handle; # Cache the open file handle
2204 }
2205
2206 if ($verbosity >= $PROGRESS) {
2207 if ($progress_message{$addr}) {
2208 print "$progress_message{$addr}\n";
2209 }
2210 else {
2211 # If using a virtual file, say so.
2212 print "Processing ", (-e $file)
2213 ? $file
2214 : "substitute $file",
2215 "\n";
2216 }
2217 }
2218
2219
2220 # Call any special handler for before the file.
2221 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2222
2223 # Then the main handler
2224 &{$handler{$addr}}($self);
2225
2226 # Then any special post-file handler.
2227 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2228
2229 # If any errors have been accumulated, output the counts (as the first
2230 # error message in each class was output when it was encountered).
2231 if ($errors{$addr}) {
2232 my $total = 0;
2233 my $types = 0;
2234 foreach my $error (keys %{$errors{$addr}}) {
2235 $total += $errors{$addr}->{$error};
2236 delete $errors{$addr}->{$error};
2237 $types++;
2238 }
2239 if ($total > 1) {
2240 my $message
2241 = "A total of $total lines had errors in $file. ";
2242
2243 $message .= ($types == 1)
2244 ? '(Only the first one was displayed.)'
2245 : '(Only the first of each type was displayed.)';
2246 Carp::my_carp($message);
2247 }
2248 }
2249
2250 if (@{$missings{$addr}}) {
2251 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2252 }
2253
2254 # If a real file handle, close it.
2255 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2256 ref $handle{$addr};
2257 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2258 # the file, as opposed to undef
2259 return;
2260 }
2261
2262 sub next_line {
2263 # Sets $_ to be the next logical input line, if any. Returns non-zero
2264 # if such a line exists. 'logical' means that any lines that have
2265 # been added via insert_lines() will be returned in $_ before the file
2266 # is read again.
2267
2268 my $self = shift;
2269 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2270
ffe43484 2271 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2272
2273 # Here the file is open (or if the handle is not a ref, is an open
2274 # 'virtual' file). Get the next line; any inserted lines get priority
2275 # over the file itself.
2276 my $adjusted;
2277
2278 LINE:
2279 while (1) { # Loop until find non-comment, non-empty line
2280 #local $to_trace = 1 if main::DEBUG;
2281 my $inserted_ref = shift @{$added_lines{$addr}};
2282 if (defined $inserted_ref) {
2283 ($adjusted, $_) = @{$inserted_ref};
2284 trace $adjusted, $_ if main::DEBUG && $to_trace;
2285 return 1 if $adjusted;
2286 }
2287 else {
2288 last if ! ref $handle{$addr}; # Don't read unless is real file
2289 last if ! defined ($_ = readline $handle{$addr});
2290 }
2291 chomp;
2292 trace $_ if main::DEBUG && $to_trace;
2293
2294 # See if this line is the comment line that defines what property
2295 # value that code points that are not listed in the file should
2296 # have. The format or existence of these lines is not guaranteed
2297 # by Unicode since they are comments, but the documentation says
2298 # that this was added for machine-readability, so probably won't
2299 # change. This works starting in Unicode Version 5.0. They look
2300 # like:
2301 #
2302 # @missing: 0000..10FFFF; Not_Reordered
2303 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2304 # @missing: 0000..10FFFF; ; NaN
2305 #
2306 # Save the line for a later get_missings() call.
2307 if (/$missing_defaults_prefix/) {
2308 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2309 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2310 }
2311 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2312 my @defaults = split /\s* ; \s*/x, $_;
2313
2314 # The first field is the @missing, which ends in a
2315 # semi-colon, so can safely shift.
2316 shift @defaults;
2317
2318 # Some of these lines may have empty field placeholders
2319 # which get in the way. An example is:
2320 # @missing: 0000..10FFFF; ; NaN
2321 # Remove them. Process starting from the top so the
2322 # splice doesn't affect things still to be looked at.
2323 for (my $i = @defaults - 1; $i >= 0; $i--) {
2324 next if $defaults[$i] ne "";
2325 splice @defaults, $i, 1;
2326 }
2327
2328 # What's left should be just the property (maybe) and the
2329 # default. Having only one element means it doesn't have
2330 # the property.
2331 my $default;
2332 my $property;
2333 if (@defaults >= 1) {
2334 if (@defaults == 1) {
2335 $default = $defaults[0];
2336 }
2337 else {
2338 $property = $defaults[0];
2339 $default = $defaults[1];
2340 }
2341 }
2342
2343 if (@defaults < 1
2344 || @defaults > 2
2345 || ($default =~ /^</
2346 && $default !~ /^<code *point>$/i
2347 && $default !~ /^<none>$/i))
2348 {
2349 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2350 }
2351 else {
2352
2353 # If the property is missing from the line, it should
2354 # be the one for the whole file
2355 $property = $property{$addr} if ! defined $property;
2356
2357 # Change <none> to the null string, which is what it
2358 # really means. If the default is the code point
2359 # itself, set it to <code point>, which is what
2360 # Unicode uses (but sometimes they've forgotten the
2361 # space)
2362 if ($default =~ /^<none>$/i) {
2363 $default = "";
2364 }
2365 elsif ($default =~ /^<code *point>$/i) {
2366 $default = $CODE_POINT;
2367 }
2368
2369 # Store them as a sub-arrays with both components.
2370 push @{$missings{$addr}}, [ $default, $property ];
2371 }
2372 }
2373
2374 # There is nothing for the caller to process on this comment
2375 # line.
2376 next;
2377 }
2378
2379 # Remove comments and trailing space, and skip this line if the
2380 # result is empty
2381 s/#.*//;
2382 s/\s+$//;
2383 next if /^$/;
2384
2385 # Call any handlers for this line, and skip further processing of
2386 # the line if the handler sets the line to null.
2387 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2388 &{$sub_ref}($self);
2389 next LINE if /^$/;
2390 }
2391
2392 # Here the line is ok. return success.
2393 return 1;
2394 } # End of looping through lines.
2395
2396 # If there is an EOF handler, call it (only once) and if it generates
2397 # more lines to process go back in the loop to handle them.
2398 if ($eof_handler{$addr}) {
2399 &{$eof_handler{$addr}}($self);
2400 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2401 goto LINE if $added_lines{$addr};
2402 }
2403
2404 # Return failure -- no more lines.
2405 return 0;
2406
2407 }
2408
2409# Not currently used, not fully tested.
2410# sub peek {
2411# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2412# # record. Not callable from an each_line_handler(), nor does it call
2413# # an each_line_handler() on the line.
2414#
2415# my $self = shift;
ffe43484 2416# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2417#
2418# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2419# my ($adjusted, $line) = @{$inserted_ref};
2420# next if $adjusted;
2421#
2422# # Remove comments and trailing space, and return a non-empty
2423# # resulting line
2424# $line =~ s/#.*//;
2425# $line =~ s/\s+$//;
2426# return $line if $line ne "";
2427# }
2428#
2429# return if ! ref $handle{$addr}; # Don't read unless is real file
2430# while (1) { # Loop until find non-comment, non-empty line
2431# local $to_trace = 1 if main::DEBUG;
2432# trace $_ if main::DEBUG && $to_trace;
2433# return if ! defined (my $line = readline $handle{$addr});
2434# chomp $line;
2435# push @{$added_lines{$addr}}, [ 0, $line ];
2436#
2437# $line =~ s/#.*//;
2438# $line =~ s/\s+$//;
2439# return $line if $line ne "";
2440# }
2441#
2442# return;
2443# }
2444
2445
2446 sub insert_lines {
2447 # Lines can be inserted so that it looks like they were in the input
2448 # file at the place it was when this routine is called. See also
2449 # insert_adjusted_lines(). Lines inserted via this routine go through
2450 # any each_line_handler()
2451
2452 my $self = shift;
2453
2454 # Each inserted line is an array, with the first element being 0 to
2455 # indicate that this line hasn't been adjusted, and needs to be
2456 # processed.
f998e60c 2457 no overloading;
051df77b 2458 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2459 return;
2460 }
2461
2462 sub insert_adjusted_lines {
2463 # Lines can be inserted so that it looks like they were in the input
2464 # file at the place it was when this routine is called. See also
2465 # insert_lines(). Lines inserted via this routine are already fully
2466 # adjusted, ready to be processed; each_line_handler()s handlers will
2467 # not be called. This means this is not a completely general
2468 # facility, as only the last each_line_handler on the stack should
2469 # call this. It could be made more general, by passing to each of the
2470 # line_handlers their position on the stack, which they would pass on
2471 # to this routine, and that would replace the boolean first element in
2472 # the anonymous array pushed here, so that the next_line routine could
2473 # use that to call only those handlers whose index is after it on the
2474 # stack. But this is overkill for what is needed now.
2475
2476 my $self = shift;
2477 trace $_[0] if main::DEBUG && $to_trace;
2478
2479 # Each inserted line is an array, with the first element being 1 to
2480 # indicate that this line has been adjusted
f998e60c 2481 no overloading;
051df77b 2482 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2483 return;
2484 }
2485
2486 sub get_missings {
2487 # Returns the stored up @missings lines' values, and clears the list.
2488 # The values are in an array, consisting of the default in the first
2489 # element, and the property in the 2nd. However, since these lines
2490 # can be stacked up, the return is an array of all these arrays.
2491
2492 my $self = shift;
2493 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2494
ffe43484 2495 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2496
2497 # If not accepting a list return, just return the first one.
2498 return shift @{$missings{$addr}} unless wantarray;
2499
2500 my @return = @{$missings{$addr}};
2501 undef @{$missings{$addr}};
2502 return @return;
2503 }
2504
2505 sub _insert_property_into_line {
2506 # Add a property field to $_, if this file requires it.
2507
f998e60c 2508 my $self = shift;
ffe43484 2509 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2510 my $property = $property{$addr};
99870f4d
KW
2511 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2512
2513 $_ =~ s/(;|$)/; $property$1/;
2514 return;
2515 }
2516
2517 sub carp_bad_line {
2518 # Output consistent error messages, using either a generic one, or the
2519 # one given by the optional parameter. To avoid gazillions of the
2520 # same message in case the syntax of a file is way off, this routine
2521 # only outputs the first instance of each message, incrementing a
2522 # count so the totals can be output at the end of the file.
2523
2524 my $self = shift;
2525 my $message = shift;
2526 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2527
ffe43484 2528 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2529
2530 $message = 'Unexpected line' unless $message;
2531
2532 # No trailing punctuation so as to fit with our addenda.
2533 $message =~ s/[.:;,]$//;
2534
2535 # If haven't seen this exact message before, output it now. Otherwise
2536 # increment the count of how many times it has occurred
2537 unless ($errors{$addr}->{$message}) {
2538 Carp::my_carp("$message in '$_' in "
f998e60c 2539 . $file{$addr}
99870f4d
KW
2540 . " at line $.. Skipping this line;");
2541 $errors{$addr}->{$message} = 1;
2542 }
2543 else {
2544 $errors{$addr}->{$message}++;
2545 }
2546
2547 # Clear the line to prevent any further (meaningful) processing of it.
2548 $_ = "";
2549
2550 return;
2551 }
2552} # End closure
2553
2554package Multi_Default;
2555
2556# Certain properties in early versions of Unicode had more than one possible
2557# default for code points missing from the files. In these cases, one
2558# default applies to everything left over after all the others are applied,
2559# and for each of the others, there is a description of which class of code
2560# points applies to it. This object helps implement this by storing the
2561# defaults, and for all but that final default, an eval string that generates
2562# the class that it applies to.
2563
2564
2565{ # Closure
2566
2567 main::setup_package();
2568
2569 my %class_defaults;
2570 # The defaults structure for the classes
2571 main::set_access('class_defaults', \%class_defaults);
2572
2573 my %other_default;
2574 # The default that applies to everything left over.
2575 main::set_access('other_default', \%other_default, 'r');
2576
2577
2578 sub new {
2579 # The constructor is called with default => eval pairs, terminated by
2580 # the left-over default. e.g.
2581 # Multi_Default->new(
2582 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2583 # - 0x200D',
2584 # 'R' => 'some other expression that evaluates to code points',
2585 # .
2586 # .
2587 # .
2588 # 'U'));
2589
2590 my $class = shift;
2591
2592 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2593 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2594
2595 while (@_ > 1) {
2596 my $default = shift;
2597 my $eval = shift;
2598 $class_defaults{$addr}->{$default} = $eval;
2599 }
2600
2601 $other_default{$addr} = shift;
2602
2603 return $self;
2604 }
2605
2606 sub get_next_defaults {
2607 # Iterates and returns the next class of defaults.
2608 my $self = shift;
2609 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2610
ffe43484 2611 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2612
2613 return each %{$class_defaults{$addr}};
2614 }
2615}
2616
2617package Alias;
2618
2619# An alias is one of the names that a table goes by. This class defines them
2620# including some attributes. Everything is currently setup in the
2621# constructor.
2622
2623
2624{ # Closure
2625
2626 main::setup_package();
2627
2628 my %name;
2629 main::set_access('name', \%name, 'r');
2630
2631 my %loose_match;
2632 # Determined by the constructor code if this name should match loosely or
2633 # not. The constructor parameters can override this, but it isn't fully
2634 # implemented, as should have ability to override Unicode one's via
2635 # something like a set_loose_match()
2636 main::set_access('loose_match', \%loose_match, 'r');
2637
2638 my %make_pod_entry;
2639 # Some aliases should not get their own entries because they are covered
2640 # by a wild-card, and some we want to discourage use of. Binary
2641 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2642
2643 my %status;
2644 # Aliases have a status, like deprecated, or even suppressed (which means
2645 # they don't appear in documentation). Enum
2646 main::set_access('status', \%status, 'r');
2647
2648 my %externally_ok;
2649 # Similarly, some aliases should not be considered as usable ones for
2650 # external use, such as file names, or we don't want documentation to
2651 # recommend them. Boolean
2652 main::set_access('externally_ok', \%externally_ok, 'r');
2653
2654 sub new {
2655 my $class = shift;
2656
2657 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2658 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2659
2660 $name{$addr} = shift;
2661 $loose_match{$addr} = shift;
2662 $make_pod_entry{$addr} = shift;
2663 $externally_ok{$addr} = shift;
2664 $status{$addr} = shift;
2665
2666 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2667
2668 # Null names are never ok externally
2669 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2670
2671 return $self;
2672 }
2673}
2674
2675package Range;
2676
2677# A range is the basic unit for storing code points, and is described in the
2678# comments at the beginning of the program. Each range has a starting code
2679# point; an ending code point (not less than the starting one); a value
2680# that applies to every code point in between the two end-points, inclusive;
2681# and an enum type that applies to the value. The type is for the user's
2682# convenience, and has no meaning here, except that a non-zero type is
2683# considered to not obey the normal Unicode rules for having standard forms.
2684#
2685# The same structure is used for both map and match tables, even though in the
2686# latter, the value (and hence type) is irrelevant and could be used as a
2687# comment. In map tables, the value is what all the code points in the range
2688# map to. Type 0 values have the standardized version of the value stored as
2689# well, so as to not have to recalculate it a lot.
2690
2691sub trace { return main::trace(@_); }
2692
2693{ # Closure
2694
2695 main::setup_package();
2696
2697 my %start;
2698 main::set_access('start', \%start, 'r', 's');
2699
2700 my %end;
2701 main::set_access('end', \%end, 'r', 's');
2702
2703 my %value;
2704 main::set_access('value', \%value, 'r');
2705
2706 my %type;
2707 main::set_access('type', \%type, 'r');
2708
2709 my %standard_form;
2710 # The value in internal standard form. Defined only if the type is 0.
2711 main::set_access('standard_form', \%standard_form);
2712
2713 # Note that if these fields change, the dump() method should as well
2714
2715 sub new {
2716 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2717 my $class = shift;
2718
2719 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2720 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2721
2722 $start{$addr} = shift;
2723 $end{$addr} = shift;
2724
2725 my %args = @_;
2726
2727 my $value = delete $args{'Value'}; # Can be 0
2728 $value = "" unless defined $value;
2729 $value{$addr} = $value;
2730
2731 $type{$addr} = delete $args{'Type'} || 0;
2732
2733 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2734
2735 if (! $type{$addr}) {
2736 $standard_form{$addr} = main::standardize($value);
2737 }
2738
2739 return $self;
2740 }
2741
2742 use overload
2743 fallback => 0,
2744 qw("") => "_operator_stringify",
2745 "." => \&main::_operator_dot,
2746 ;
2747
2748 sub _operator_stringify {
2749 my $self = shift;
ffe43484 2750 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2751
2752 # Output it like '0041..0065 (value)'
2753 my $return = sprintf("%04X", $start{$addr})
2754 . '..'
2755 . sprintf("%04X", $end{$addr});
2756 my $value = $value{$addr};
2757 my $type = $type{$addr};
2758 $return .= ' (';
2759 $return .= "$value";
2760 $return .= ", Type=$type" if $type != 0;
2761 $return .= ')';
2762
2763 return $return;
2764 }
2765
2766 sub standard_form {
2767 # The standard form is the value itself if the standard form is
2768 # undefined (that is if the value is special)
2769
2770 my $self = shift;
2771 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2772
ffe43484 2773 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2774
2775 return $standard_form{$addr} if defined $standard_form{$addr};
2776 return $value{$addr};
2777 }
2778
2779 sub dump {
2780 # Human, not machine readable. For machine readable, comment out this
2781 # entire routine and let the standard one take effect.
2782 my $self = shift;
2783 my $indent = shift;
2784 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2785
ffe43484 2786 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2787
2788 my $return = $indent
2789 . sprintf("%04X", $start{$addr})
2790 . '..'
2791 . sprintf("%04X", $end{$addr})
2792 . " '$value{$addr}';";
2793 if (! defined $standard_form{$addr}) {
2794 $return .= "(type=$type{$addr})";
2795 }
2796 elsif ($standard_form{$addr} ne $value{$addr}) {
2797 $return .= "(standard '$standard_form{$addr}')";
2798 }
2799 return $return;
2800 }
2801} # End closure
2802
2803package _Range_List_Base;
2804
2805# Base class for range lists. A range list is simply an ordered list of
2806# ranges, so that the ranges with the lowest starting numbers are first in it.
2807#
2808# When a new range is added that is adjacent to an existing range that has the
2809# same value and type, it merges with it to form a larger range.
2810#
2811# Ranges generally do not overlap, except that there can be multiple entries
2812# of single code point ranges. This is because of NameAliases.txt.
2813#
2814# In this program, there is a standard value such that if two different
2815# values, have the same standard value, they are considered equivalent. This
2816# value was chosen so that it gives correct results on Unicode data
2817
2818# There are a number of methods to manipulate range lists, and some operators
2819# are overloaded to handle them.
2820
99870f4d
KW
2821sub trace { return main::trace(@_); }
2822
2823{ # Closure
2824
2825 our $addr;
2826
2827 main::setup_package();
2828
2829 my %ranges;
2830 # The list of ranges
2831 main::set_access('ranges', \%ranges, 'readable_array');
2832
2833 my %max;
2834 # The highest code point in the list. This was originally a method, but
2835 # actual measurements said it was used a lot.
2836 main::set_access('max', \%max, 'r');
2837
2838 my %each_range_iterator;
2839 # Iterator position for each_range()
2840 main::set_access('each_range_iterator', \%each_range_iterator);
2841
2842 my %owner_name_of;
2843 # Name of parent this is attached to, if any. Solely for better error
2844 # messages.
2845 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2846
2847 my %_search_ranges_cache;
2848 # A cache of the previous result from _search_ranges(), for better
2849 # performance
2850 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2851
2852 sub new {
2853 my $class = shift;
2854 my %args = @_;
2855
2856 # Optional initialization data for the range list.
2857 my $initialize = delete $args{'Initialize'};
2858
2859 my $self;
2860
2861 # Use _union() to initialize. _union() returns an object of this
2862 # class, which means that it will call this constructor recursively.
2863 # But it won't have this $initialize parameter so that it won't
2864 # infinitely loop on this.
2865 return _union($class, $initialize, %args) if defined $initialize;
2866
2867 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2868 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2869
2870 # Optional parent object, only for debug info.
2871 $owner_name_of{$addr} = delete $args{'Owner'};
2872 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2873
2874 # Stringify, in case it is an object.
2875 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2876
2877 # This is used only for error messages, and so a colon is added
2878 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2879
2880 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2881
2882 # Max is initialized to a negative value that isn't adjacent to 0,
2883 # for simpler tests
2884 $max{$addr} = -2;
2885
2886 $_search_ranges_cache{$addr} = 0;
2887 $ranges{$addr} = [];
2888
2889 return $self;
2890 }
2891
2892 use overload
2893 fallback => 0,
2894 qw("") => "_operator_stringify",
2895 "." => \&main::_operator_dot,
2896 ;
2897
2898 sub _operator_stringify {
2899 my $self = shift;
ffe43484 2900 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2901
2902 return "Range_List attached to '$owner_name_of{$addr}'"
2903 if $owner_name_of{$addr};
2904 return "anonymous Range_List " . \$self;
2905 }
2906
2907 sub _union {
2908 # Returns the union of the input code points. It can be called as
2909 # either a constructor or a method. If called as a method, the result
2910 # will be a new() instance of the calling object, containing the union
2911 # of that object with the other parameter's code points; if called as
2912 # a constructor, the first parameter gives the class the new object
2913 # should be, and the second parameter gives the code points to go into
2914 # it.
2915 # In either case, there are two parameters looked at by this routine;
2916 # any additional parameters are passed to the new() constructor.
2917 #
2918 # The code points can come in the form of some object that contains
2919 # ranges, and has a conventionally named method to access them; or
2920 # they can be an array of individual code points (as integers); or
2921 # just a single code point.
2922 #
2923 # If they are ranges, this routine doesn't make any effort to preserve
2924 # the range values of one input over the other. Therefore this base
2925 # class should not allow _union to be called from other than
2926 # initialization code, so as to prevent two tables from being added
2927 # together where the range values matter. The general form of this
2928 # routine therefore belongs in a derived class, but it was moved here
2929 # to avoid duplication of code. The failure to overload this in this
2930 # class keeps it safe.
2931 #
2932
2933 my $self;
2934 my @args; # Arguments to pass to the constructor
2935
2936 my $class = shift;
2937
2938 # If a method call, will start the union with the object itself, and
2939 # the class of the new object will be the same as self.
2940 if (ref $class) {
2941 $self = $class;
2942 $class = ref $self;
2943 push @args, $self;
2944 }
2945
2946 # Add the other required parameter.
2947 push @args, shift;
2948 # Rest of parameters are passed on to the constructor
2949
2950 # Accumulate all records from both lists.
2951 my @records;
2952 for my $arg (@args) {
2953 #local $to_trace = 0 if main::DEBUG;
2954 trace "argument = $arg" if main::DEBUG && $to_trace;
2955 if (! defined $arg) {
2956 my $message = "";
2957 if (defined $self) {
f998e60c 2958 no overloading;
051df77b 2959 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2960 }
2961 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2962 return;
2963 }
2964 $arg = [ $arg ] if ! ref $arg;
2965 my $type = ref $arg;
2966 if ($type eq 'ARRAY') {
2967 foreach my $element (@$arg) {
2968 push @records, Range->new($element, $element);
2969 }
2970 }
2971 elsif ($arg->isa('Range')) {
2972 push @records, $arg;
2973 }
2974 elsif ($arg->can('ranges')) {
2975 push @records, $arg->ranges;
2976 }
2977 else {
2978 my $message = "";
2979 if (defined $self) {
f998e60c 2980 no overloading;
051df77b 2981 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2982 }
2983 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2984 return;
2985 }
2986 }
2987
2988 # Sort with the range containing the lowest ordinal first, but if
2989 # two ranges start at the same code point, sort with the bigger range
2990 # of the two first, because it takes fewer cycles.
2991 @records = sort { ($a->start <=> $b->start)
2992 or
2993 # if b is shorter than a, b->end will be
2994 # less than a->end, and we want to select
2995 # a, so want to return -1
2996 ($b->end <=> $a->end)
2997 } @records;
2998
2999 my $new = $class->new(@_);
3000
3001 # Fold in records so long as they add new information.
3002 for my $set (@records) {
3003 my $start = $set->start;
3004 my $end = $set->end;
3005 my $value = $set->value;
3006 if ($start > $new->max) {
3007 $new->_add_delete('+', $start, $end, $value);
3008 }
3009 elsif ($end > $new->max) {
3010 $new->_add_delete('+', $new->max +1, $end, $value);
3011 }
3012 }
3013
3014 return $new;
3015 }
3016
3017 sub range_count { # Return the number of ranges in the range list
3018 my $self = shift;
3019 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3020
f998e60c 3021 no overloading;
051df77b 3022 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
3023 }
3024
3025 sub min {
3026 # Returns the minimum code point currently in the range list, or if
3027 # the range list is empty, 2 beyond the max possible. This is a
3028 # method because used so rarely, that not worth saving between calls,
3029 # and having to worry about changing it as ranges are added and
3030 # deleted.
3031
3032 my $self = shift;
3033 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3034
ffe43484 3035 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3036
3037 # If the range list is empty, return a large value that isn't adjacent
3038 # to any that could be in the range list, for simpler tests
3039 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3040 return $ranges{$addr}->[0]->start;
3041 }
3042
3043 sub contains {
3044 # Boolean: Is argument in the range list? If so returns $i such that:
3045 # range[$i]->end < $codepoint <= range[$i+1]->end
3046 # which is one beyond what you want; this is so that the 0th range
3047 # doesn't return false
3048 my $self = shift;
3049 my $codepoint = shift;
3050 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3051
99870f4d
KW
3052 my $i = $self->_search_ranges($codepoint);
3053 return 0 unless defined $i;
3054
3055 # The search returns $i, such that
3056 # range[$i-1]->end < $codepoint <= range[$i]->end
3057 # So is in the table if and only iff it is at least the start position
3058 # of range $i.
f998e60c 3059 no overloading;
051df77b 3060 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
3061 return $i + 1;
3062 }
3063
2f7a8815
KW
3064 sub containing_range {
3065 # Returns the range object that contains the code point, undef if none
3066
3067 my $self = shift;
3068 my $codepoint = shift;
3069 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3070
3071 my $i = $self->contains($codepoint);
3072 return unless $i;
3073
3074 # contains() returns 1 beyond where we should look
3075 no overloading;
3076 return $ranges{pack 'J', $self}->[$i-1];
3077 }
3078
99870f4d
KW
3079 sub value_of {
3080 # Returns the value associated with the code point, undef if none
3081
3082 my $self = shift;
3083 my $codepoint = shift;
3084 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3085
d69c231b
KW
3086 my $range = $self->containing_range($codepoint);
3087 return unless defined $range;
99870f4d 3088
d69c231b 3089 return $range->value;
99870f4d
KW
3090 }
3091
0a9dbafc
KW
3092 sub type_of {
3093 # Returns the type of the range containing the code point, undef if
3094 # the code point is not in the table
3095
3096 my $self = shift;
3097 my $codepoint = shift;
3098 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3099
3100 my $range = $self->containing_range($codepoint);
3101 return unless defined $range;
3102
3103 return $range->type;
3104 }
3105
99870f4d
KW
3106 sub _search_ranges {
3107 # Find the range in the list which contains a code point, or where it
3108 # should go if were to add it. That is, it returns $i, such that:
3109 # range[$i-1]->end < $codepoint <= range[$i]->end
3110 # Returns undef if no such $i is possible (e.g. at end of table), or
3111 # if there is an error.
3112
3113 my $self = shift;
3114 my $code_point = shift;
3115 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3116
ffe43484 3117 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3118
3119 return if $code_point > $max{$addr};
3120 my $r = $ranges{$addr}; # The current list of ranges
3121 my $range_list_size = scalar @$r;
3122 my $i;
3123
3124 use integer; # want integer division
3125
3126 # Use the cached result as the starting guess for this one, because,
3127 # an experiment on 5.1 showed that 90% of the time the cache was the
3128 # same as the result on the next call (and 7% it was one less).
3129 $i = $_search_ranges_cache{$addr};
3130 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3131 # from an intervening deletion
3132 #local $to_trace = 1 if main::DEBUG;
3133 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3134 return $i if $code_point <= $r->[$i]->end
3135 && ($i == 0 || $r->[$i-1]->end < $code_point);
3136
3137 # Here the cache doesn't yield the correct $i. Try adding 1.
3138 if ($i < $range_list_size - 1
3139 && $r->[$i]->end < $code_point &&
3140 $code_point <= $r->[$i+1]->end)
3141 {
3142 $i++;
3143 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3144 $_search_ranges_cache{$addr} = $i;
3145 return $i;
3146 }
3147
3148 # Here, adding 1 also didn't work. We do a binary search to
3149 # find the correct position, starting with current $i
3150 my $lower = 0;
3151 my $upper = $range_list_size - 1;
3152 while (1) {
3153 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3154
3155 if ($code_point <= $r->[$i]->end) {
3156
3157 # Here we have met the upper constraint. We can quit if we
3158 # also meet the lower one.
3159 last if $i == 0 || $r->[$i-1]->end < $code_point;
3160
3161 $upper = $i; # Still too high.
3162
3163 }
3164 else {
3165
3166 # Here, $r[$i]->end < $code_point, so look higher up.
3167 $lower = $i;
3168 }
3169
3170 # Split search domain in half to try again.
3171 my $temp = ($upper + $lower) / 2;
3172
3173 # No point in continuing unless $i changes for next time
3174 # in the loop.
3175 if ($temp == $i) {
3176
3177 # We can't reach the highest element because of the averaging.
3178 # So if one below the upper edge, force it there and try one
3179 # more time.
3180 if ($i == $range_list_size - 2) {
3181
3182 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3183 $i = $range_list_size - 1;
3184
3185 # Change $lower as well so if fails next time through,
3186 # taking the average will yield the same $i, and we will
3187 # quit with the error message just below.
3188 $lower = $i;
3189 next;
3190 }
3191 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3192 return;
3193 }
3194 $i = $temp;
3195 } # End of while loop
3196
3197 if (main::DEBUG && $to_trace) {
3198 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3199 trace "i= [ $i ]", $r->[$i];
3200 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3201 }
3202
3203 # Here we have found the offset. Cache it as a starting point for the
3204 # next call.
3205 $_search_ranges_cache{$addr} = $i;
3206 return $i;
3207 }
3208
3209 sub _add_delete {
3210 # Add, replace or delete ranges to or from a list. The $type
3211 # parameter gives which:
3212 # '+' => insert or replace a range, returning a list of any changed
3213 # ranges.
3214 # '-' => delete a range, returning a list of any deleted ranges.
3215 #
3216 # The next three parameters give respectively the start, end, and
3217 # value associated with the range. 'value' should be null unless the
3218 # operation is '+';
3219 #
3220 # The range list is kept sorted so that the range with the lowest
3221 # starting position is first in the list, and generally, adjacent
c1739a4a 3222 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3223 # exceptions below).
3224 #
c1739a4a 3225 # There are more parameters; all are key => value pairs:
99870f4d
KW
3226 # Type gives the type of the value. It is only valid for '+'.
3227 # All ranges have types; if this parameter is omitted, 0 is
3228 # assumed. Ranges with type 0 are assumed to obey the
3229 # Unicode rules for casing, etc; ranges with other types are
3230 # not. Otherwise, the type is arbitrary, for the caller's
3231 # convenience, and looked at only by this routine to keep
3232 # adjacent ranges of different types from being merged into
3233 # a single larger range, and when Replace =>
3234 # $IF_NOT_EQUIVALENT is specified (see just below).
3235 # Replace determines what to do if the range list already contains
3236 # ranges which coincide with all or portions of the input
3237 # range. It is only valid for '+':
3238 # => $NO means that the new value is not to replace
3239 # any existing ones, but any empty gaps of the
3240 # range list coinciding with the input range
3241 # will be filled in with the new value.
3242 # => $UNCONDITIONALLY means to replace the existing values with
3243 # this one unconditionally. However, if the
3244 # new and old values are identical, the
3245 # replacement is skipped to save cycles
3246 # => $IF_NOT_EQUIVALENT means to replace the existing values
3247 # with this one if they are not equivalent.
3248 # Ranges are equivalent if their types are the
c1739a4a 3249 # same, and they are the same string; or if
99870f4d
KW
3250 # both are type 0 ranges, if their Unicode
3251 # standard forms are identical. In this last
3252 # case, the routine chooses the more "modern"
3253 # one to use. This is because some of the
3254 # older files are formatted with values that
3255 # are, for example, ALL CAPs, whereas the
3256 # derived files have a more modern style,
3257 # which looks better. By looking for this
3258 # style when the pre-existing and replacement
3259 # standard forms are the same, we can move to
3260 # the modern style
3261 # => $MULTIPLE means that if this range duplicates an
3262 # existing one, but has a different value,
3263 # don't replace the existing one, but insert
3264 # this, one so that the same range can occur
53d84487
KW
3265 # multiple times. They are stored LIFO, so
3266 # that the final one inserted is the first one
3267 # returned in an ordered search of the table.
99870f4d
KW
3268 # => anything else is the same as => $IF_NOT_EQUIVALENT
3269 #
c1739a4a
KW
3270 # "same value" means identical for non-type-0 ranges, and it means
3271 # having the same standard forms for type-0 ranges.
99870f4d
KW
3272
3273 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3274
3275 my $self = shift;
3276 my $operation = shift; # '+' for add/replace; '-' for delete;
3277 my $start = shift;
3278 my $end = shift;
3279 my $value = shift;
3280
3281 my %args = @_;
3282
3283 $value = "" if not defined $value; # warning: $value can be "0"
3284
3285 my $replace = delete $args{'Replace'};
3286 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3287
3288 my $type = delete $args{'Type'};
3289 $type = 0 unless defined $type;
3290
3291 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3292
ffe43484 3293 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3294
3295 if ($operation ne '+' && $operation ne '-') {
3296 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3297 return;
3298 }
3299 unless (defined $start && defined $end) {
3300 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3301 return;
3302 }
3303 unless ($end >= $start) {
3304 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3305 return;
3306 }
3307 #local $to_trace = 1 if main::DEBUG;
3308
3309 if ($operation eq '-') {
3310 if ($replace != $IF_NOT_EQUIVALENT) {
3311 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3312 $replace = $IF_NOT_EQUIVALENT;
3313 }
3314 if ($type) {
3315 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3316 $type = 0;
3317 }
3318 if ($value ne "") {
3319 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3320 $value = "";
3321 }
3322 }
3323
3324 my $r = $ranges{$addr}; # The current list of ranges
3325 my $range_list_size = scalar @$r; # And its size
3326 my $max = $max{$addr}; # The current high code point in
3327 # the list of ranges
3328
3329 # Do a special case requiring fewer machine cycles when the new range
3330 # starts after the current highest point. The Unicode input data is
3331 # structured so this is common.
3332 if ($start > $max) {
3333
3334 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3335 return if $operation eq '-'; # Deleting a non-existing range is a
3336 # no-op
3337
3338 # If the new range doesn't logically extend the current final one
3339 # in the range list, create a new range at the end of the range
3340 # list. (max cleverly is initialized to a negative number not
3341 # adjacent to 0 if the range list is empty, so even adding a range
3342 # to an empty range list starting at 0 will have this 'if'
3343 # succeed.)
3344 if ($start > $max + 1 # non-adjacent means can't extend.
3345 || @{$r}[-1]->value ne $value # values differ, can't extend.
3346 || @{$r}[-1]->type != $type # types differ, can't extend.
3347 ) {
3348 push @$r, Range->new($start, $end,
3349 Value => $value,
3350 Type => $type);
3351 }
3352 else {
3353
3354 # Here, the new range starts just after the current highest in
3355 # the range list, and they have the same type and value.
3356 # Extend the current range to incorporate the new one.
3357 @{$r}[-1]->set_end($end);
3358 }
3359
3360 # This becomes the new maximum.
3361 $max{$addr} = $end;
3362
3363 return;
3364 }
3365 #local $to_trace = 0 if main::DEBUG;
3366
3367 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3368
3369 # Here, the input range isn't after the whole rest of the range list.
3370 # Most likely 'splice' will be needed. The rest of the routine finds
3371 # the needed splice parameters, and if necessary, does the splice.
3372 # First, find the offset parameter needed by the splice function for
3373 # the input range. Note that the input range may span multiple
3374 # existing ones, but we'll worry about that later. For now, just find
3375 # the beginning. If the input range is to be inserted starting in a
3376 # position not currently in the range list, it must (obviously) come
3377 # just after the range below it, and just before the range above it.
3378 # Slightly less obviously, it will occupy the position currently
3379 # occupied by the range that is to come after it. More formally, we
3380 # are looking for the position, $i, in the array of ranges, such that:
3381 #
3382 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3383 #
3384 # (The ordered relationships within existing ranges are also shown in
3385 # the equation above). However, if the start of the input range is
3386 # within an existing range, the splice offset should point to that
3387 # existing range's position in the list; that is $i satisfies a
3388 # somewhat different equation, namely:
3389 #
3390 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3391 #
3392 # More briefly, $start can come before or after r[$i]->start, and at
3393 # this point, we don't know which it will be. However, these
3394 # two equations share these constraints:
3395 #
3396 # r[$i-1]->end < $start <= r[$i]->end
3397 #
3398 # And that is good enough to find $i.
3399
3400 my $i = $self->_search_ranges($start);
3401 if (! defined $i) {
3402 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3403 return;
3404 }
3405
3406 # The search function returns $i such that:
3407 #
3408 # r[$i-1]->end < $start <= r[$i]->end
3409 #
3410 # That means that $i points to the first range in the range list
3411 # that could possibly be affected by this operation. We still don't
3412 # know if the start of the input range is within r[$i], or if it
3413 # points to empty space between r[$i-1] and r[$i].
3414 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3415
3416 # Special case the insertion of data that is not to replace any
3417 # existing data.
3418 if ($replace == $NO) { # If $NO, has to be operation '+'
3419 #local $to_trace = 1 if main::DEBUG;
3420 trace "Doesn't replace" if main::DEBUG && $to_trace;
3421
3422 # Here, the new range is to take effect only on those code points
3423 # that aren't already in an existing range. This can be done by
3424 # looking through the existing range list and finding the gaps in
3425 # the ranges that this new range affects, and then calling this
3426 # function recursively on each of those gaps, leaving untouched
3427 # anything already in the list. Gather up a list of the changed
3428 # gaps first so that changes to the internal state as new ranges
3429 # are added won't be a problem.
3430 my @gap_list;
3431
3432 # First, if the starting point of the input range is outside an
3433 # existing one, there is a gap from there to the beginning of the
3434 # existing range -- add a span to fill the part that this new
3435 # range occupies
3436 if ($start < $r->[$i]->start) {
3437 push @gap_list, Range->new($start,
3438 main::min($end,
3439 $r->[$i]->start - 1),
3440 Type => $type);
3441 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3442 }
3443
3444 # Then look through the range list for other gaps until we reach
3445 # the highest range affected by the input one.
3446 my $j;
3447 for ($j = $i+1; $j < $range_list_size; $j++) {
3448 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3449 last if $end < $r->[$j]->start;
3450
3451 # If there is a gap between when this range starts and the
3452 # previous one ends, add a span to fill it. Note that just
3453 # because there are two ranges doesn't mean there is a
3454 # non-zero gap between them. It could be that they have
3455 # different values or types
3456 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3457 push @gap_list,
3458 Range->new($r->[$j-1]->end + 1,
3459 $r->[$j]->start - 1,
3460 Type => $type);
3461 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3462 }
3463 }
3464
3465 # Here, we have either found an existing range in the range list,
3466 # beyond the area affected by the input one, or we fell off the
3467 # end of the loop because the input range affects the whole rest
3468 # of the range list. In either case, $j is 1 higher than the
3469 # highest affected range. If $j == $i, it means that there are no
3470 # affected ranges, that the entire insertion is in the gap between
3471 # r[$i-1], and r[$i], which we already have taken care of before
3472 # the loop.
3473 # On the other hand, if there are affected ranges, it might be
3474 # that there is a gap that needs filling after the final such
3475 # range to the end of the input range
3476 if ($r->[$j-1]->end < $end) {
3477 push @gap_list, Range->new(main::max($start,
3478 $r->[$j-1]->end + 1),
3479 $end,
3480 Type => $type);
3481 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3482 }
3483
3484 # Call recursively to fill in all the gaps.
3485 foreach my $gap (@gap_list) {
3486 $self->_add_delete($operation,
3487 $gap->start,
3488 $gap->end,
3489 $value,
3490 Type => $type);
3491 }
3492
3493 return;
3494 }
3495
53d84487
KW
3496 # Here, we have taken care of the case where $replace is $NO.
3497 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3498 # If inserting a multiple record, this is where it goes, before the
3499 # first (if any) existing one. This implies an insertion, and no
3500 # change to any existing ranges. Note that $i can be -1 if this new
3501 # range doesn't actually duplicate any existing, and comes at the
3502 # beginning of the list.
3503 if ($replace == $MULTIPLE) {
3504
3505 if ($start != $end) {
3506 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3507 return;
3508 }
3509
3510 # Don't add an exact duplicate, as it isn't really a multiple
3511 if ($end >= $r->[$i]->start) {
3512 if ($r->[$i]->start != $r->[$i]->end) {
3513 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the other range ($r->[$i]) contains more than one code point. No action taken.");
3514 return;
3515 }
3516 return if $value eq $r->[$i]->value && $type eq $r->[$i]->type;
3517 }
3518
3519 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3520 my @return = splice @$r,
3521 $i,
3522 0,
3523 Range->new($start,
3524 $end,
3525 Value => $value,
3526 Type => $type);
3527 if (main::DEBUG && $to_trace) {
3528 trace "After splice:";
3529 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3530 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3531 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3532 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3533 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3534 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3535 }
3536 return @return;
3537 }
3538
3539 # Here, we have taken care of $NO and $MULTIPLE replaces. This leaves
3540 # delete, insert, and replace either unconditionally or if not
3541 # equivalent. $i still points to the first potential affected range.
3542 # Now find the highest range affected, which will determine the length
3543 # parameter to splice. (The input range can span multiple existing
3544 # ones.) If this isn't a deletion, while we are looking through the
3545 # range list, see also if this is a replacement rather than a clean
3546 # insertion; that is if it will change the values of at least one
3547 # existing range. Start off assuming it is an insert, until find it
3548 # isn't.
3549 my $clean_insert = $operation eq '+';
99870f4d
KW
3550 my $j; # This will point to the highest affected range
3551
3552 # For non-zero types, the standard form is the value itself;
3553 my $standard_form = ($type) ? $value : main::standardize($value);
3554
3555 for ($j = $i; $j < $range_list_size; $j++) {
3556 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3557
3558 # If find a range that it doesn't overlap into, we can stop
3559 # searching
3560 last if $end < $r->[$j]->start;
3561
969a34cc
KW
3562 # Here, overlaps the range at $j. If the values don't match,
3563 # and so far we think this is a clean insertion, it becomes a
3564 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3565 if ($clean_insert) {
99870f4d 3566 if ($r->[$j]->standard_form ne $standard_form) {
969a34cc 3567 $clean_insert = 0;
56343c78
KW
3568 if ($replace == $CROAK) {
3569 main::croak("The range to add "
3570 . sprintf("%04X", $start)
3571 . '-'
3572 . sprintf("%04X", $end)
3573 . " with value '$value' overlaps an existing range $r->[$j]");
3574 }
99870f4d
KW
3575 }
3576 else {
3577
3578 # Here, the two values are essentially the same. If the
3579 # two are actually identical, replacing wouldn't change
3580 # anything so skip it.
3581 my $pre_existing = $r->[$j]->value;
3582 if ($pre_existing ne $value) {
3583
3584 # Here the new and old standardized values are the
3585 # same, but the non-standardized values aren't. If
3586 # replacing unconditionally, then replace
3587 if( $replace == $UNCONDITIONALLY) {
969a34cc 3588 $clean_insert = 0;
99870f4d
KW
3589 }
3590 else {
3591
3592 # Here, are replacing conditionally. Decide to
3593 # replace or not based on which appears to look
3594 # the "nicest". If one is mixed case and the
3595 # other isn't, choose the mixed case one.
3596 my $new_mixed = $value =~ /[A-Z]/
3597 && $value =~ /[a-z]/;
3598 my $old_mixed = $pre_existing =~ /[A-Z]/
3599 && $pre_existing =~ /[a-z]/;
3600
3601 if ($old_mixed != $new_mixed) {
969a34cc 3602 $clean_insert = 0 if $new_mixed;
99870f4d 3603 if (main::DEBUG && $to_trace) {
969a34cc
KW
3604 if ($clean_insert) {
3605 trace "Retaining $pre_existing over $value";
99870f4d
KW
3606 }
3607 else {
969a34cc 3608 trace "Replacing $pre_existing with $value";
99870f4d
KW
3609 }
3610 }
3611 }
3612 else {
3613
3614 # Here casing wasn't different between the two.
3615 # If one has hyphens or underscores and the
3616 # other doesn't, choose the one with the
3617 # punctuation.
3618 my $new_punct = $value =~ /[-_]/;
3619 my $old_punct = $pre_existing =~ /[-_]/;
3620
3621 if ($old_punct != $new_punct) {
969a34cc 3622 $clean_insert = 0 if $new_punct;
99870f4d 3623 if (main::DEBUG && $to_trace) {
969a34cc
KW
3624 if ($clean_insert) {
3625 trace "Retaining $pre_existing over $value";
99870f4d
KW
3626 }
3627 else {
969a34cc 3628 trace "Replacing $pre_existing with $value";
99870f4d
KW
3629 }
3630 }
3631 } # else existing one is just as "good";
3632 # retain it to save cycles.
3633 }
3634 }
3635 }
3636 }
3637 }
3638 } # End of loop looking for highest affected range.
3639
3640 # Here, $j points to one beyond the highest range that this insertion
3641 # affects (hence to beyond the range list if that range is the final
3642 # one in the range list).
3643
3644 # The splice length is all the affected ranges. Get it before
3645 # subtracting, for efficiency, so we don't have to later add 1.
3646 my $length = $j - $i;
3647
3648 $j--; # $j now points to the highest affected range.
3649 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3650
99870f4d
KW
3651 # Here, have taken care of $NO and $MULTIPLE replaces.
3652 # $j points to the highest affected range. But it can be < $i or even
3653 # -1. These happen only if the insertion is entirely in the gap
3654 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3655 # above exited first time through with $end < $r->[$i]->start. (And
3656 # then we subtracted one from j) This implies also that $start <
3657 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3658 # $start, so the entire input range is in the gap.
3659 if ($j < $i) {
3660
3661 # Here the entire input range is in the gap before $i.
3662
3663 if (main::DEBUG && $to_trace) {
3664 if ($i) {
3665 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3666 }
3667 else {
3668 trace "Entire range is before $r->[$i]";
3669 }
3670 }
3671 return if $operation ne '+'; # Deletion of a non-existent range is
3672 # a no-op
3673 }
3674 else {
3675
969a34cc
KW
3676 # Here part of the input range is not in the gap before $i. Thus,
3677 # there is at least one affected one, and $j points to the highest
3678 # such one.
99870f4d
KW
3679
3680 # At this point, here is the situation:
3681 # This is not an insertion of a multiple, nor of tentative ($NO)
3682 # data.
3683 # $i points to the first element in the current range list that
3684 # may be affected by this operation. In fact, we know
3685 # that the range at $i is affected because we are in
3686 # the else branch of this 'if'
3687 # $j points to the highest affected range.
3688 # In other words,
3689 # r[$i-1]->end < $start <= r[$i]->end
3690 # And:
3691 # r[$i-1]->end < $start <= $end <= r[$j]->end
3692 #
3693 # Also:
969a34cc
KW
3694 # $clean_insert is a boolean which is set true if and only if
3695 # this is a "clean insertion", i.e., not a change nor a
3696 # deletion (multiple was handled above).
99870f4d
KW
3697
3698 # We now have enough information to decide if this call is a no-op
969a34cc
KW
3699 # or not. It is a no-op if this is an insertion of already
3700 # existing data.
99870f4d 3701
969a34cc 3702 if (main::DEBUG && $to_trace && $clean_insert
99870f4d
KW
3703 && $i == $j
3704 && $start >= $r->[$i]->start)
3705 {
3706 trace "no-op";
3707 }
969a34cc 3708 return if $clean_insert
99870f4d
KW
3709 && $i == $j # more than one affected range => not no-op
3710
3711 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3712 # Further, $start and/or $end is >= r[$i]->start
3713 # The test below hence guarantees that
3714 # r[$i]->start < $start <= $end <= r[$i]->end
3715 # This means the input range is contained entirely in
3716 # the one at $i, so is a no-op
3717 && $start >= $r->[$i]->start;
3718 }
3719
3720 # Here, we know that some action will have to be taken. We have
3721 # calculated the offset and length (though adjustments may be needed)
3722 # for the splice. Now start constructing the replacement list.
3723 my @replacement;
3724 my $splice_start = $i;
3725
3726 my $extends_below;
3727 my $extends_above;
3728
3729 # See if should extend any adjacent ranges.
3730 if ($operation eq '-') { # Don't extend deletions
3731 $extends_below = $extends_above = 0;
3732 }
3733 else { # Here, should extend any adjacent ranges. See if there are
3734 # any.
3735 $extends_below = ($i > 0
3736 # can't extend unless adjacent
3737 && $r->[$i-1]->end == $start -1
3738 # can't extend unless are same standard value
3739 && $r->[$i-1]->standard_form eq $standard_form
3740 # can't extend unless share type
3741 && $r->[$i-1]->type == $type);
3742 $extends_above = ($j+1 < $range_list_size
3743 && $r->[$j+1]->start == $end +1
3744 && $r->[$j+1]->standard_form eq $standard_form
23822bda 3745 && $r->[$j+1]->type == $type);
99870f4d
KW
3746 }
3747 if ($extends_below && $extends_above) { # Adds to both
3748 $splice_start--; # start replace at element below
3749 $length += 2; # will replace on both sides
3750 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3751
3752 # The result will fill in any gap, replacing both sides, and
3753 # create one large range.
3754 @replacement = Range->new($r->[$i-1]->start,
3755 $r->[$j+1]->end,
3756 Value => $value,
3757 Type => $type);
3758 }
3759 else {
3760
3761 # Here we know that the result won't just be the conglomeration of
3762 # a new range with both its adjacent neighbors. But it could
3763 # extend one of them.
3764
3765 if ($extends_below) {
3766
3767 # Here the new element adds to the one below, but not to the
3768 # one above. If inserting, and only to that one range, can
3769 # just change its ending to include the new one.
969a34cc 3770 if ($length == 0 && $clean_insert) {
99870f4d
KW
3771 $r->[$i-1]->set_end($end);
3772 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3773 return;
3774 }
3775 else {
3776 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3777 $splice_start--; # start replace at element below
3778 $length++; # will replace the element below
3779 $start = $r->[$i-1]->start;
3780 }
3781 }
3782 elsif ($extends_above) {
3783
3784 # Here the new element adds to the one above, but not below.
3785 # Mirror the code above
969a34cc 3786 if ($length == 0 && $clean_insert) {
99870f4d
KW
3787 $r->[$j+1]->set_start($start);
3788 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3789 return;
3790 }
3791 else {
3792 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3793 $length++; # will replace the element above
3794 $end = $r->[$j+1]->end;
3795 }
3796 }
3797
3798 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3799
3800 # Finally, here we know there will have to be a splice.
3801 # If the change or delete affects only the highest portion of the
3802 # first affected range, the range will have to be split. The
3803 # splice will remove the whole range, but will replace it by a new
3804 # range containing just the unaffected part. So, in this case,
3805 # add to the replacement list just this unaffected portion.
3806 if (! $extends_below
3807 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3808 {
3809 push @replacement,
3810 Range->new($r->[$i]->start,
3811 $start - 1,
3812 Value => $r->[$i]->value,
3813 Type => $r->[$i]->type);
3814 }
3815
3816 # In the case of an insert or change, but not a delete, we have to
3817 # put in the new stuff; this comes next.
3818 if ($operation eq '+') {
3819 push @replacement, Range->new($start,
3820 $end,
3821 Value => $value,
3822 Type => $type);
3823 }
3824
3825 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3826 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3827
3828 # And finally, if we're changing or deleting only a portion of the
3829 # highest affected range, it must be split, as the lowest one was.
3830 if (! $extends_above
3831 && $j >= 0 # Remember that j can be -1 if before first
3832 # current element
3833 && $end >= $r->[$j]->start
3834 && $end < $r->[$j]->end)
3835 {
3836 push @replacement,
3837 Range->new($end + 1,
3838 $r->[$j]->end,
3839 Value => $r->[$j]->value,
3840 Type => $r->[$j]->type);
3841 }
3842 }
3843
3844 # And do the splice, as calculated above
3845 if (main::DEBUG && $to_trace) {
3846 trace "replacing $length element(s) at $i with ";
3847 foreach my $replacement (@replacement) {
3848 trace " $replacement";
3849 }
3850 trace "Before splice:";
3851 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3852 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3853 trace "i =[", $i, "]", $r->[$i];
3854 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3855 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3856 }
3857
3858 my @return = splice @$r, $splice_start, $length, @replacement;
3859
3860 if (main::DEBUG && $to_trace) {
3861 trace "After splice:";
3862 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3863 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3864 trace "i =[", $i, "]", $r->[$i];
3865 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3866 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
e6451557 3867 trace "removed ", @return if @return;
99870f4d
KW
3868 }
3869
3870 # An actual deletion could have changed the maximum in the list.
3871 # There was no deletion if the splice didn't return something, but
3872 # otherwise recalculate it. This is done too rarely to worry about
3873 # performance.
3874 if ($operation eq '-' && @return) {
3875 $max{$addr} = $r->[-1]->end;
3876 }
3877 return @return;
3878 }
3879
3880 sub reset_each_range { # reset the iterator for each_range();
3881 my $self = shift;
3882 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3883
f998e60c 3884 no overloading;
051df77b 3885 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
3886 return;
3887 }
3888
3889 sub each_range {
3890 # Iterate over each range in a range list. Results are undefined if
3891 # the range list is changed during the iteration.
3892
3893 my $self = shift;
3894 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3895
ffe43484 3896 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3897
3898 return if $self->is_empty;
3899
3900 $each_range_iterator{$addr} = -1
3901 if ! defined $each_range_iterator{$addr};
3902 $each_range_iterator{$addr}++;
3903 return $ranges{$addr}->[$each_range_iterator{$addr}]
3904 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3905 undef $each_range_iterator{$addr};
3906 return;
3907 }
3908
3909 sub count { # Returns count of code points in range list
3910 my $self = shift;
3911 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3912
ffe43484 3913 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3914
3915 my $count = 0;
3916 foreach my $range (@{$ranges{$addr}}) {
3917 $count += $range->end - $range->start + 1;
3918 }
3919 return $count;
3920 }
3921
3922 sub delete_range { # Delete a range
3923 my $self = shift;
3924 my $start = shift;
3925 my $end = shift;
3926
3927 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3928
3929 return $self->_add_delete('-', $start, $end, "");
3930 }
3931
3932 sub is_empty { # Returns boolean as to if a range list is empty
3933 my $self = shift;
3934 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3935
f998e60c 3936 no overloading;
051df77b 3937 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
3938 }
3939
3940 sub hash {
3941 # Quickly returns a scalar suitable for separating tables into
3942 # buckets, i.e. it is a hash function of the contents of a table, so
3943 # there are relatively few conflicts.
3944
3945 my $self = shift;
3946 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3947
ffe43484 3948 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3949
3950 # These are quickly computable. Return looks like 'min..max;count'
3951 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3952 }
3953} # End closure for _Range_List_Base
3954
3955package Range_List;
3956use base '_Range_List_Base';
3957
3958# A Range_List is a range list for match tables; i.e. the range values are
3959# not significant. Thus a number of operations can be safely added to it,
3960# such as inversion, intersection. Note that union is also an unsafe
3961# operation when range values are cared about, and that method is in the base
3962# class, not here. But things are set up so that that method is callable only
3963# during initialization. Only in this derived class, is there an operation
3964# that combines two tables. A Range_Map can thus be used to initialize a
3965# Range_List, and its mappings will be in the list, but are not significant to
3966# this class.
3967
3968sub trace { return main::trace(@_); }
3969
3970{ # Closure
3971
3972 use overload
3973 fallback => 0,
3974 '+' => sub { my $self = shift;
3975 my $other = shift;
3976
3977 return $self->_union($other)
3978 },
3979 '&' => sub { my $self = shift;
3980 my $other = shift;
3981
3982 return $self->_intersect($other, 0);
3983 },
3984 '~' => "_invert",
3985 '-' => "_subtract",
3986 ;
3987
3988 sub _invert {
3989 # Returns a new Range_List that gives all code points not in $self.
3990
3991 my $self = shift;
3992
3993 my $new = Range_List->new;
3994
3995 # Go through each range in the table, finding the gaps between them
3996 my $max = -1; # Set so no gap before range beginning at 0
3997 for my $range ($self->ranges) {
3998 my $start = $range->start;
3999 my $end = $range->end;
4000
4001 # If there is a gap before this range, the inverse will contain
4002 # that gap.
4003 if ($start > $max + 1) {
4004 $new->add_range($max + 1, $start - 1);
4005 }
4006 $max = $end;
4007 }
4008
4009 # And finally, add the gap from the end of the table to the max
4010 # possible code point
4011 if ($max < $LAST_UNICODE_CODEPOINT) {
4012 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
4013 }
4014 return $new;
4015 }
4016
4017 sub _subtract {
4018 # Returns a new Range_List with the argument deleted from it. The
4019 # argument can be a single code point, a range, or something that has
4020 # a range, with the _range_list() method on it returning them
4021
4022 my $self = shift;
4023 my $other = shift;
4024 my $reversed = shift;
4025 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4026
4027 if ($reversed) {
4028 Carp::my_carp_bug("Can't cope with a "
4029 . __PACKAGE__
4030 . " being the second parameter in a '-'. Subtraction ignored.");
4031 return $self;
4032 }
4033
4034 my $new = Range_List->new(Initialize => $self);
4035
4036 if (! ref $other) { # Single code point
4037 $new->delete_range($other, $other);
4038 }
4039 elsif ($other->isa('Range')) {
4040 $new->delete_range($other->start, $other->end);
4041 }
4042 elsif ($other->can('_range_list')) {
4043 foreach my $range ($other->_range_list->ranges) {
4044 $new->delete_range($range->start, $range->end);
4045 }
4046 }
4047 else {
4048 Carp::my_carp_bug("Can't cope with a "
4049 . ref($other)
4050 . " argument to '-'. Subtraction ignored."
4051 );
4052 return $self;
4053 }
4054
4055 return $new;
4056 }
4057
4058 sub _intersect {
4059 # Returns either a boolean giving whether the two inputs' range lists
4060 # intersect (overlap), or a new Range_List containing the intersection
4061 # of the two lists. The optional final parameter being true indicates
4062 # to do the check instead of the intersection.
4063
4064 my $a_object = shift;
4065 my $b_object = shift;
4066 my $check_if_overlapping = shift;
4067 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4068 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4069
4070 if (! defined $b_object) {
4071 my $message = "";
4072 $message .= $a_object->_owner_name_of if defined $a_object;
4073 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4074 return;
4075 }
4076
4077 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4078 # Thus the intersection could be much more simply be written:
4079 # return ~(~$a_object + ~$b_object);
4080 # But, this is slower, and when taking the inverse of a large
4081 # range_size_1 table, back when such tables were always stored that
4082 # way, it became prohibitively slow, hence the code was changed to the
4083 # below
4084
4085 if ($b_object->isa('Range')) {
4086 $b_object = Range_List->new(Initialize => $b_object,
4087 Owner => $a_object->_owner_name_of);
4088 }
4089 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4090
4091 my @a_ranges = $a_object->ranges;
4092 my @b_ranges = $b_object->ranges;
4093
4094 #local $to_trace = 1 if main::DEBUG;
4095 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4096
4097 # Start with the first range in each list
4098 my $a_i = 0;
4099 my $range_a = $a_ranges[$a_i];
4100 my $b_i = 0;
4101 my $range_b = $b_ranges[$b_i];
4102
4103 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4104 if ! $check_if_overlapping;
4105
4106 # If either list is empty, there is no intersection and no overlap
4107 if (! defined $range_a || ! defined $range_b) {
4108 return $check_if_overlapping ? 0 : $new;
4109 }
4110 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4111
4112 # Otherwise, must calculate the intersection/overlap. Start with the
4113 # very first code point in each list
4114 my $a = $range_a->start;
4115 my $b = $range_b->start;
4116
4117 # Loop through all the ranges of each list; in each iteration, $a and
4118 # $b are the current code points in their respective lists
4119 while (1) {
4120
4121 # If $a and $b are the same code point, ...
4122 if ($a == $b) {
4123
4124 # it means the lists overlap. If just checking for overlap
4125 # know the answer now,
4126 return 1 if $check_if_overlapping;
4127
4128 # The intersection includes this code point plus anything else
4129 # common to both current ranges.
4130 my $start = $a;
4131 my $end = main::min($range_a->end, $range_b->end);
4132 if (! $check_if_overlapping) {
4133 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4134 $new->add_range($start, $end);
4135 }
4136
4137 # Skip ahead to the end of the current intersect
4138 $a = $b = $end;
4139
4140 # If the current intersect ends at the end of either range (as
4141 # it must for at least one of them), the next possible one
4142 # will be the beginning code point in it's list's next range.
4143 if ($a == $range_a->end) {
4144 $range_a = $a_ranges[++$a_i];
4145 last unless defined $range_a;
4146 $a = $range_a->start;
4147 }
4148 if ($b == $range_b->end) {
4149 $range_b = $b_ranges[++$b_i];
4150 last unless defined $range_b;
4151 $b = $range_b->start;
4152 }
4153
4154 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4155 }
4156 elsif ($a < $b) {
4157
4158 # Not equal, but if the range containing $a encompasses $b,
4159 # change $a to be the middle of the range where it does equal
4160 # $b, so the next iteration will get the intersection
4161 if ($range_a->end >= $b) {
4162 $a = $b;
4163 }
4164 else {
4165
4166 # Here, the current range containing $a is entirely below
4167 # $b. Go try to find a range that could contain $b.
4168 $a_i = $a_object->_search_ranges($b);
4169
4170 # If no range found, quit.
4171 last unless defined $a_i;
4172
4173 # The search returns $a_i, such that
4174 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4175 # Set $a to the beginning of this new range, and repeat.
4176 $range_a = $a_ranges[$a_i];
4177 $a = $range_a->start;
4178 }
4179 }
4180 else { # Here, $b < $a.
4181
4182 # Mirror image code to the leg just above
4183 if ($range_b->end >= $a) {
4184 $b = $a;
4185 }
4186 else {
4187 $b_i = $b_object->_search_ranges($a);
4188 last unless defined $b_i;
4189 $range_b = $b_ranges[$b_i];
4190 $b = $range_b->start;
4191 }
4192 }
4193 } # End of looping through ranges.
4194
4195 # Intersection fully computed, or now know that there is no overlap
4196 return $check_if_overlapping ? 0 : $new;
4197 }
4198
4199 sub overlaps {
4200 # Returns boolean giving whether the two arguments overlap somewhere
4201
4202 my $self = shift;
4203 my $other = shift;
4204 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4205
4206 return $self->_intersect($other, 1);
4207 }
4208
4209 sub add_range {
4210 # Add a range to the list.
4211
4212 my $self = shift;
4213 my $start = shift;
4214 my $end = shift;
4215 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4216
4217 return $self->_add_delete('+', $start, $end, "");
4218 }
4219
09aba7e4
KW
4220 sub matches_identically_to {
4221 # Return a boolean as to whether or not two Range_Lists match identical
4222 # sets of code points.
4223
4224 my $self = shift;
4225 my $other = shift;
4226 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4227
4228 # These are ordered in increasing real time to figure out (at least
4229 # until a patch changes that and doesn't change this)
4230 return 0 if $self->max != $other->max;
4231 return 0 if $self->min != $other->min;
4232 return 0 if $self->range_count != $other->range_count;
4233 return 0 if $self->count != $other->count;
4234
4235 # Here they could be identical because all the tests above passed.
4236 # The loop below is somewhat simpler since we know they have the same
4237 # number of elements. Compare range by range, until reach the end or
4238 # find something that differs.
4239 my @a_ranges = $self->ranges;
4240 my @b_ranges = $other->ranges;
4241 for my $i (0 .. @a_ranges - 1) {
4242 my $a = $a_ranges[$i];
4243 my $b = $b_ranges[$i];
4244 trace "self $a; other $b" if main::DEBUG && $to_trace;
4245 return 0 if $a->start != $b->start || $a->end != $b->end;
4246 }
4247 return 1;
4248 }
4249
99870f4d
KW
4250 sub is_code_point_usable {
4251 # This used only for making the test script. See if the input
4252 # proposed trial code point is one that Perl will handle. If second
4253 # parameter is 0, it won't select some code points for various
4254 # reasons, noted below.
4255
4256 my $code = shift;
4257 my $try_hard = shift;
4258 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4259
4260 return 0 if $code < 0; # Never use a negative
4261
99870f4d
KW
4262 # shun null. I'm (khw) not sure why this was done, but NULL would be
4263 # the character very frequently used.
4264 return $try_hard if $code == 0x0000;
4265
99870f4d
KW
4266 # shun non-character code points.
4267 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4268 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4269
4270 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
4271 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4272
4273 return 1;
4274 }
4275
4276 sub get_valid_code_point {
4277 # Return a code point that's part of the range list. Returns nothing
4278 # if the table is empty or we can't find a suitable code point. This
4279 # used only for making the test script.
4280
4281 my $self = shift;
4282 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4283
ffe43484 4284 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4285
4286 # On first pass, don't choose less desirable code points; if no good
4287 # one is found, repeat, allowing a less desirable one to be selected.
4288 for my $try_hard (0, 1) {
4289
4290 # Look through all the ranges for a usable code point.
4291 for my $set ($self->ranges) {
4292
4293 # Try the edge cases first, starting with the end point of the
4294 # range.
4295 my $end = $set->end;
4296 return $end if is_code_point_usable($end, $try_hard);
4297
4298 # End point didn't, work. Start at the beginning and try
4299 # every one until find one that does work.
4300 for my $trial ($set->start .. $end - 1) {
4301 return $trial if is_code_point_usable($trial, $try_hard);
4302 }
4303 }
4304 }
4305 return (); # If none found, give up.
4306 }
4307
4308 sub get_invalid_code_point {
4309 # Return a code point that's not part of the table. Returns nothing
4310 # if the table covers all code points or a suitable code point can't
4311 # be found. This used only for making the test script.
4312
4313 my $self = shift;
4314 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4315
4316 # Just find a valid code point of the inverse, if any.
4317 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4318 }
4319} # end closure for Range_List
4320
4321package Range_Map;
4322use base '_Range_List_Base';
4323
4324# A Range_Map is a range list in which the range values (called maps) are
4325# significant, and hence shouldn't be manipulated by our other code, which
4326# could be ambiguous or lose things. For example, in taking the union of two
4327# lists, which share code points, but which have differing values, which one
4328# has precedence in the union?
4329# It turns out that these operations aren't really necessary for map tables,
4330# and so this class was created to make sure they aren't accidentally
4331# applied to them.
4332
4333{ # Closure
4334
4335 sub add_map {
4336 # Add a range containing a mapping value to the list
4337
4338 my $self = shift;
4339 # Rest of parameters passed on
4340
4341 return $self->_add_delete('+', @_);
4342 }
4343
4344 sub add_duplicate {
4345 # Adds entry to a range list which can duplicate an existing entry
4346
4347 my $self = shift;
4348 my $code_point = shift;
4349 my $value = shift;
4350 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4351
4352 return $self->add_map($code_point, $code_point,
4353 $value, Replace => $MULTIPLE);
4354 }
4355} # End of closure for package Range_Map
4356
4357package _Base_Table;
4358
4359# A table is the basic data structure that gets written out into a file for
4360# use by the Perl core. This is the abstract base class implementing the
4361# common elements from the derived ones. A list of the methods to be
4362# furnished by an implementing class is just after the constructor.
4363
4364sub standardize { return main::standardize($_[0]); }
4365sub trace { return main::trace(@_); }
4366
4367{ # Closure
4368
4369 main::setup_package();
4370
4371 my %range_list;
4372 # Object containing the ranges of the table.
4373 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4374
4375 my %full_name;
4376 # The full table name.
4377 main::set_access('full_name', \%full_name, 'r');
4378
4379 my %name;
4380 # The table name, almost always shorter
4381 main::set_access('name', \%name, 'r');
4382
4383 my %short_name;
4384 # The shortest of all the aliases for this table, with underscores removed
4385 main::set_access('short_name', \%short_name);
4386
4387 my %nominal_short_name_length;
4388 # The length of short_name before removing underscores
4389 main::set_access('nominal_short_name_length',
4390 \%nominal_short_name_length);
4391
23e33b60
KW
4392 my %complete_name;
4393 # The complete name, including property.
4394 main::set_access('complete_name', \%complete_name, 'r');
4395
99870f4d
KW
4396 my %property;
4397 # Parent property this table is attached to.
4398 main::set_access('property', \%property, 'r');
4399
4400 my %aliases;
4401 # Ordered list of aliases of the table's name. The first ones in the list
4402 # are output first in comments
4403 main::set_access('aliases', \%aliases, 'readable_array');
4404
4405 my %comment;
4406 # A comment associated with the table for human readers of the files
4407 main::set_access('comment', \%comment, 's');
4408
4409 my %description;
4410 # A comment giving a short description of the table's meaning for human
4411 # readers of the files.
4412 main::set_access('description', \%description, 'readable_array');
4413
4414 my %note;
4415 # A comment giving a short note about the table for human readers of the
4416 # files.
4417 main::set_access('note', \%note, 'readable_array');
4418
4419 my %internal_only;
4420 # Boolean; if set means any file that contains this table is marked as for
4421 # internal-only use.
4422 main::set_access('internal_only', \%internal_only);
4423
4424 my %find_table_from_alias;
4425 # The parent property passes this pointer to a hash which this class adds
4426 # all its aliases to, so that the parent can quickly take an alias and
4427 # find this table.
4428 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4429
4430 my %locked;
4431 # After this table is made equivalent to another one; we shouldn't go
4432 # changing the contents because that could mean it's no longer equivalent
4433 main::set_access('locked', \%locked, 'r');
4434
4435 my %file_path;
4436 # This gives the final path to the file containing the table. Each
4437 # directory in the path is an element in the array
4438 main::set_access('file_path', \%file_path, 'readable_array');
4439
4440 my %status;
4441 # What is the table's status, normal, $OBSOLETE, etc. Enum
4442 main::set_access('status', \%status, 'r');
4443
4444 my %status_info;
4445 # A comment about its being obsolete, or whatever non normal status it has
4446 main::set_access('status_info', \%status_info, 'r');
4447
d867ccfb
KW
4448 my %caseless_equivalent;
4449 # The table this is equivalent to under /i matching, if any.
4450 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4451
99870f4d
KW
4452 my %range_size_1;
4453 # Is the table to be output with each range only a single code point?
4454 # This is done to avoid breaking existing code that may have come to rely
4455 # on this behavior in previous versions of this program.)
4456 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4457
4458 my %perl_extension;
4459 # A boolean set iff this table is a Perl extension to the Unicode
4460 # standard.
4461 main::set_access('perl_extension', \%perl_extension, 'r');
4462
0c07e538
KW
4463 my %output_range_counts;
4464 # A boolean set iff this table is to have comments written in the
4465 # output file that contain the number of code points in the range.
4466 # The constructor can override the global flag of the same name.
4467 main::set_access('output_range_counts', \%output_range_counts, 'r');
4468
f5817e0a
KW
4469 my %format;
4470 # The format of the entries of the table. This is calculated from the
4471 # data in the table (or passed in the constructor). This is an enum e.g.,
4472 # $STRING_FORMAT
4473 main::set_access('format', \%format, 'r', 'p_s');
4474
99870f4d
KW
4475 sub new {
4476 # All arguments are key => value pairs, which you can see below, most
4477 # of which match fields documented above. Otherwise: Pod_Entry,
4478 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4479 # documented in the Alias package
4480
4481 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4482
4483 my $class = shift;
4484
4485 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4486 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4487
4488 my %args = @_;
4489
4490 $name{$addr} = delete $args{'Name'};
4491 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4492 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4493 my $complete_name = $complete_name{$addr}
4494 = delete $args{'Complete_Name'};
f5817e0a 4495 $format{$addr} = delete $args{'Format'};
99870f4d 4496 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
0c07e538 4497 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4498 $property{$addr} = delete $args{'_Property'};
4499 $range_list{$addr} = delete $args{'_Range_List'};
4500 $status{$addr} = delete $args{'Status'} || $NORMAL;
4501 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4502 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
d867ccfb 4503 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
99870f4d
KW
4504
4505 my $description = delete $args{'Description'};
4506 my $externally_ok = delete $args{'Externally_Ok'};
4507 my $loose_match = delete $args{'Fuzzy'};
4508 my $note = delete $args{'Note'};
4509 my $make_pod_entry = delete $args{'Pod_Entry'};
37e2e78e 4510 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4511
4512 # Shouldn't have any left over
4513 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4514
4515 # Can't use || above because conceivably the name could be 0, and
4516 # can't use // operator in case this program gets used in Perl 5.8
4517 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4518 $output_range_counts{$addr} = $output_range_counts if
4519 ! defined $output_range_counts{$addr};
99870f4d
KW
4520
4521 $aliases{$addr} = [ ];
4522 $comment{$addr} = [ ];
4523 $description{$addr} = [ ];
4524 $note{$addr} = [ ];
4525 $file_path{$addr} = [ ];
4526 $locked{$addr} = "";
4527
4528 push @{$description{$addr}}, $description if $description;
4529 push @{$note{$addr}}, $note if $note;
4530
37e2e78e
KW
4531 if ($status{$addr} eq $PLACEHOLDER) {
4532
4533 # A placeholder table doesn't get documented, is a perl extension,
4534 # and quite likely will be empty
4535 $make_pod_entry = 0 if ! defined $make_pod_entry;
4536 $perl_extension = 1 if ! defined $perl_extension;
4537 push @tables_that_may_be_empty, $complete_name{$addr};
4538 }
4539 elsif (! $status{$addr}) {
4540
4541 # If hasn't set its status already, see if it is on one of the
4542 # lists of properties or tables that have particular statuses; if
4543 # not, is normal. The lists are prioritized so the most serious
4544 # ones are checked first
ec11e5f4 4545 if (exists $why_suppressed{$complete_name}
98dc9551 4546 # Don't suppress if overridden
ec11e5f4
KW
4547 && ! grep { $_ eq $complete_name{$addr} }
4548 @output_mapped_properties)
4549 {
99870f4d
KW
4550 $status{$addr} = $SUPPRESSED;
4551 }
4552 elsif (exists $why_deprecated{$complete_name}) {
4553 $status{$addr} = $DEPRECATED;
4554 }
4555 elsif (exists $why_stabilized{$complete_name}) {
4556 $status{$addr} = $STABILIZED;
4557 }
4558 elsif (exists $why_obsolete{$complete_name}) {
4559 $status{$addr} = $OBSOLETE;
4560 }
4561
4562 # Existence above doesn't necessarily mean there is a message
4563 # associated with it. Use the most serious message.
4564 if ($status{$addr}) {
4565 if ($why_suppressed{$complete_name}) {
4566 $status_info{$addr}
4567 = $why_suppressed{$complete_name};
4568 }
4569 elsif ($why_deprecated{$complete_name}) {
4570 $status_info{$addr}
4571 = $why_deprecated{$complete_name};
4572 }
4573 elsif ($why_stabilized{$complete_name}) {
4574 $status_info{$addr}
4575 = $why_stabilized{$complete_name};
4576 }
4577 elsif ($why_obsolete{$complete_name}) {
4578 $status_info{$addr}
4579 = $why_obsolete{$complete_name};
4580 }
4581 }
4582 }
4583
37e2e78e
KW
4584 $perl_extension{$addr} = $perl_extension || 0;
4585
99870f4d
KW
4586 # By convention what typically gets printed only or first is what's
4587 # first in the list, so put the full name there for good output
4588 # clarity. Other routines rely on the full name being first on the
4589 # list
4590 $self->add_alias($full_name{$addr},
4591 Externally_Ok => $externally_ok,
4592 Fuzzy => $loose_match,
4593 Pod_Entry => $make_pod_entry,
4594 Status => $status{$addr},
4595 );
4596
4597 # Then comes the other name, if meaningfully different.
4598 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4599 $self->add_alias($name{$addr},
4600 Externally_Ok => $externally_ok,
4601 Fuzzy => $loose_match,
4602 Pod_Entry => $make_pod_entry,
4603 Status => $status{$addr},
4604 );
4605 }
4606
4607 return $self;
4608 }
4609
4610 # Here are the methods that are required to be defined by any derived
4611 # class
ea25a9b2 4612 for my $sub (qw(
668b3bfc 4613 handle_special_range
99870f4d 4614 append_to_body
99870f4d 4615 pre_body
ea25a9b2 4616 ))
668b3bfc
KW
4617 # write() knows how to write out normal ranges, but it calls
4618 # handle_special_range() when it encounters a non-normal one.
4619 # append_to_body() is called by it after it has handled all
4620 # ranges to add anything after the main portion of the table.
4621 # And finally, pre_body() is called after all this to build up
4622 # anything that should appear before the main portion of the
4623 # table. Doing it this way allows things in the middle to
4624 # affect what should appear before the main portion of the
99870f4d 4625 # table.
99870f4d
KW
4626 {
4627 no strict "refs";
4628 *$sub = sub {
4629 Carp::my_carp_bug( __LINE__
4630 . ": Must create method '$sub()' for "
4631 . ref shift);
4632 return;
4633 }
4634 }
4635
4636 use overload
4637 fallback => 0,
4638 "." => \&main::_operator_dot,
4639 '!=' => \&main::_operator_not_equal,
4640 '==' => \&main::_operator_equal,
4641 ;
4642
4643 sub ranges {
4644 # Returns the array of ranges associated with this table.
4645
f998e60c 4646 no overloading;
051df77b 4647 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4648 }
4649
4650 sub add_alias {
4651 # Add a synonym for this table.
4652
4653 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4654
4655 my $self = shift;
4656 my $name = shift; # The name to add.
4657 my $pointer = shift; # What the alias hash should point to. For
4658 # map tables, this is the parent property;
4659 # for match tables, it is the table itself.
4660
4661 my %args = @_;
4662 my $loose_match = delete $args{'Fuzzy'};
4663
4664 my $make_pod_entry = delete $args{'Pod_Entry'};
4665 $make_pod_entry = $YES unless defined $make_pod_entry;
4666
4667 my $externally_ok = delete $args{'Externally_Ok'};
4668 $externally_ok = 1 unless defined $externally_ok;
4669
4670 my $status = delete $args{'Status'};
4671 $status = $NORMAL unless defined $status;
4672
4673 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4674
4675 # Capitalize the first letter of the alias unless it is one of the CJK
4676 # ones which specifically begins with a lower 'k'. Do this because
4677 # Unicode has varied whether they capitalize first letters or not, and
4678 # have later changed their minds and capitalized them, but not the
4679 # other way around. So do it always and avoid changes from release to
4680 # release
4681 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4682
ffe43484 4683 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4684
4685 # Figure out if should be loosely matched if not already specified.
4686 if (! defined $loose_match) {
4687
4688 # Is a loose_match if isn't null, and doesn't begin with an
4689 # underscore and isn't just a number
4690 if ($name ne ""
4691 && substr($name, 0, 1) ne '_'
4692 && $name !~ qr{^[0-9_.+-/]+$})
4693 {
4694 $loose_match = 1;
4695 }
4696 else {
4697 $loose_match = 0;
4698 }
4699 }
4700
4701 # If this alias has already been defined, do nothing.
4702 return if defined $find_table_from_alias{$addr}->{$name};
4703
4704 # That includes if it is standardly equivalent to an existing alias,
4705 # in which case, add this name to the list, so won't have to search
4706 # for it again.
4707 my $standard_name = main::standardize($name);
4708 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4709 $find_table_from_alias{$addr}->{$name}
4710 = $find_table_from_alias{$addr}->{$standard_name};
4711 return;
4712 }
4713
4714 # Set the index hash for this alias for future quick reference.
4715 $find_table_from_alias{$addr}->{$name} = $pointer;
4716 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4717 local $to_trace = 0 if main::DEBUG;
4718 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4719 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4720
4721
4722 # Put the new alias at the end of the list of aliases unless the final
4723 # element begins with an underscore (meaning it is for internal perl
4724 # use) or is all numeric, in which case, put the new one before that
4725 # one. This floats any all-numeric or underscore-beginning aliases to
4726 # the end. This is done so that they are listed last in output lists,
4727 # to encourage the user to use a better name (either more descriptive
4728 # or not an internal-only one) instead. This ordering is relied on
4729 # implicitly elsewhere in this program, like in short_name()
4730 my $list = $aliases{$addr};
4731 my $insert_position = (@$list == 0
4732 || (substr($list->[-1]->name, 0, 1) ne '_'
4733 && $list->[-1]->name =~ /\D/))
4734 ? @$list
4735 : @$list - 1;
4736 splice @$list,
4737 $insert_position,
4738 0,
4739 Alias->new($name, $loose_match, $make_pod_entry,
4740 $externally_ok, $status);
4741
4742 # This name may be shorter than any existing ones, so clear the cache
4743 # of the shortest, so will have to be recalculated.
f998e60c 4744 no overloading;
051df77b 4745 undef $short_name{pack 'J', $self};
99870f4d
KW
4746 return;
4747 }
4748
4749 sub short_name {
4750 # Returns a name suitable for use as the base part of a file name.
4751 # That is, shorter wins. It can return undef if there is no suitable
4752 # name. The name has all non-essential underscores removed.
4753
4754 # The optional second parameter is a reference to a scalar in which
4755 # this routine will store the length the returned name had before the
4756 # underscores were removed, or undef if the return is undef.
4757
4758 # The shortest name can change if new aliases are added. So using
4759 # this should be deferred until after all these are added. The code
4760 # that does that should clear this one's cache.
4761 # Any name with alphabetics is preferred over an all numeric one, even
4762 # if longer.
4763
4764 my $self = shift;
4765 my $nominal_length_ptr = shift;
4766 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4767
ffe43484 4768 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4769
4770 # For efficiency, don't recalculate, but this means that adding new
4771 # aliases could change what the shortest is, so the code that does
4772 # that needs to undef this.
4773 if (defined $short_name{$addr}) {
4774 if ($nominal_length_ptr) {
4775 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4776 }
4777 return $short_name{$addr};
4778 }
4779
4780 # Look at each alias
4781 foreach my $alias ($self->aliases()) {
4782
4783 # Don't use an alias that isn't ok to use for an external name.
4784 next if ! $alias->externally_ok;
4785
4786 my $name = main::Standardize($alias->name);
4787 trace $self, $name if main::DEBUG && $to_trace;
4788
4789 # Take the first one, or a shorter one that isn't numeric. This
4790 # relies on numeric aliases always being last in the array
4791 # returned by aliases(). Any alpha one will have precedence.
4792 if (! defined $short_name{$addr}
4793 || ($name =~ /\D/
4794 && length($name) < length($short_name{$addr})))
4795 {
4796 # Remove interior underscores.
4797 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4798
4799 $nominal_short_name_length{$addr} = length $name;
4800 }
4801 }
4802
4803 # If no suitable external name return undef
4804 if (! defined $short_name{$addr}) {
4805 $$nominal_length_ptr = undef if $nominal_length_ptr;
4806 return;
4807 }
4808
4809 # Don't allow a null external name.
4810 if ($short_name{$addr} eq "") {
4811 $short_name{$addr} = '_';
4812 $nominal_short_name_length{$addr} = 1;
4813 }
4814
4815 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4816
4817 if ($nominal_length_ptr) {
4818 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4819 }
4820 return $short_name{$addr};
4821 }
4822
4823 sub external_name {
4824 # Returns the external name that this table should be known by. This
4825 # is usually the short_name, but not if the short_name is undefined.
4826
4827 my $self = shift;
4828 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4829
4830 my $short = $self->short_name;
4831 return $short if defined $short;
4832
4833 return '_';
4834 }
4835
4836 sub add_description { # Adds the parameter as a short description.
4837
4838 my $self = shift;
4839 my $description = shift;
4840 chomp $description;
4841 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4842
f998e60c 4843 no overloading;
051df77b 4844 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
4845
4846 return;
4847 }
4848
4849 sub add_note { # Adds the parameter as a short note.
4850
4851 my $self = shift;
4852 my $note = shift;
4853 chomp $note;
4854 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4855
f998e60c 4856 no overloading;
051df77b 4857 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
4858
4859 return;
4860 }
4861
4862 sub add_comment { # Adds the parameter as a comment.
4863
bd9ebcfd
KW
4864 return unless $debugging_build;
4865
99870f4d
KW
4866 my $self = shift;
4867 my $comment = shift;
4868 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4869
4870 chomp $comment;
f998e60c
KW
4871
4872 no overloading;
051df77b 4873 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
4874
4875 return;
4876 }
4877
4878 sub comment {
4879 # Return the current comment for this table. If called in list
4880 # context, returns the array of comments. In scalar, returns a string
4881 # of each element joined together with a period ending each.
4882
4883 my $self = shift;
4884 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4885
ffe43484 4886 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 4887 my @list = @{$comment{$addr}};
99870f4d
KW
4888 return @list if wantarray;
4889 my $return = "";
4890 foreach my $sentence (@list) {
4891 $return .= '. ' if $return;
4892 $return .= $sentence;
4893 $return =~ s/\.$//;
4894 }
4895 $return .= '.' if $return;
4896 return $return;
4897 }
4898
4899 sub initialize {
4900 # Initialize the table with the argument which is any valid
4901 # initialization for range lists.
4902
4903 my $self = shift;
ffe43484 4904 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4905 my $initialization = shift;
4906 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4907
4908 # Replace the current range list with a new one of the same exact
4909 # type.
f998e60c
KW
4910 my $class = ref $range_list{$addr};
4911 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
4912 Initialize => $initialization);
4913 return;
4914
4915 }
4916
4917 sub header {
4918 # The header that is output for the table in the file it is written
4919 # in.
4920
4921 my $self = shift;
4922 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4923
4924 my $return = "";
4925 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4926 $return .= $HEADER;
f998e60c 4927 no overloading;
051df77b 4928 $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
99870f4d
KW
4929 return $return;
4930 }
4931
4932 sub write {
668b3bfc
KW
4933 # Write a representation of the table to its file. It calls several
4934 # functions furnished by sub-classes of this abstract base class to
4935 # handle non-normal ranges, to add stuff before the table, and at its
4936 # end.
99870f4d
KW
4937
4938 my $self = shift;
4939 my $tab_stops = shift; # The number of tab stops over to put any
4940 # comment.
4941 my $suppress_value = shift; # Optional, if the value associated with
4942 # a range equals this one, don't write
4943 # the range
4944 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4945
ffe43484 4946 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4947
4948 # Start with the header
668b3bfc 4949 my @HEADER = $self->header;
99870f4d
KW
4950
4951 # Then the comments
668b3bfc 4952 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
99870f4d
KW
4953 if $comment{$addr};
4954
668b3bfc
KW
4955 # Things discovered processing the main body of the document may
4956 # affect what gets output before it, therefore pre_body() isn't called
4957 # until after all other processing of the table is done.
99870f4d 4958
c4019d52
KW
4959 # The main body looks like a 'here' document. If annotating, get rid
4960 # of the comments before passing to the caller, as some callers, such
4961 # as charnames.pm, can't cope with them. (Outputting range counts
4962 # also introduces comments, but these don't show up in the tables that
4963 # can't cope with comments, and there aren't that many of them that
4964 # it's worth the extra real time to get rid of them).
668b3bfc 4965 my @OUT;
558712cf 4966 if ($annotate) {
c4019d52
KW
4967 # Use the line below in Perls that don't have /r
4968 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
4969 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
4970 } else {
4971 push @OUT, "return <<'END';\n";
4972 }
99870f4d
KW
4973
4974 if ($range_list{$addr}->is_empty) {
4975
4976 # This is a kludge for empty tables to silence a warning in
4977 # utf8.c, which can't really deal with empty tables, but it can
4978 # deal with a table that matches nothing, as the inverse of 'Any'
4979 # does.
67a53d68 4980 push @OUT, "!utf8::Any\n";
99870f4d 4981 }
c69a9c68
KW
4982 elsif ($self->name eq 'N'
4983
4984 # To save disk space and table cache space, avoid putting out
4985 # binary N tables, but instead create a file which just inverts
4986 # the Y table. Since the file will still exist and occupy a
4987 # certain number of blocks, might as well output the whole
4988 # thing if it all will fit in one block. The number of
4989 # ranges below is an approximate number for that.
4990 && $self->property->type == $BINARY
4991 # && $self->property->tables == 2 Can't do this because the
4992 # non-binary properties, like NFDQC aren't specifiable
4993 # by the notation
4994 && $range_list{$addr}->ranges > 15
4995 && ! $annotate) # Under --annotate, want to see everything
4996 {
4997 push @OUT, "!utf8::" . $self->property->name . "\n";
4998 }
99870f4d
KW
4999 else {
5000 my $range_size_1 = $range_size_1{$addr};
558712cf
KW
5001 my $format; # Used only in $annotate option
5002 my $include_name; # Used only in $annotate option
c4019d52 5003
558712cf 5004 if ($annotate) {
c4019d52
KW
5005
5006 # if annotating each code point, must print 1 per line.
5007 # The variable could point to a subroutine, and we don't want
5008 # to lose that fact, so only set if not set already
5009 $range_size_1 = 1 if ! $range_size_1;
5010
5011 $format = $self->format;
5012
5013 # The name of the character is output only for tables that
5014 # don't already include the name in the output.
5015 my $property = $self->property;
5016 $include_name =
5017 ! ($property == $perl_charname
5018 || $property == main::property_ref('Unicode_1_Name')
5019 || $property == main::property_ref('Name')
5020 || $property == main::property_ref('Name_Alias')
5021 );
5022 }
99870f4d
KW
5023
5024 # Output each range as part of the here document.
5a2b5ddb 5025 RANGE:
99870f4d 5026 for my $set ($range_list{$addr}->ranges) {
5a2b5ddb
KW
5027 if ($set->type != 0) {
5028 $self->handle_special_range($set);
5029 next RANGE;
5030 }
99870f4d
KW
5031 my $start = $set->start;
5032 my $end = $set->end;
5033 my $value = $set->value;
5034
5035 # Don't output ranges whose value is the one to suppress
c4019d52
KW
5036 next RANGE if defined $suppress_value
5037 && $value eq $suppress_value;
99870f4d 5038
c4019d52
KW
5039 # If there is a range and doesn't need a single point range
5040 # output
5041 if ($start != $end && ! $range_size_1) {
bd9ebcfd
KW
5042 push @OUT, sprintf "%04X\t%04X", $start, $end;
5043 $OUT[-1] .= "\t$value" if $value ne "";
99870f4d
KW
5044
5045 # Add a comment with the size of the range, if requested.
5046 # Expand Tabs to make sure they all start in the same
5047 # column, and then unexpand to use mostly tabs.
0c07e538 5048 if (! $output_range_counts{$addr}) {
99870f4d
KW
5049 $OUT[-1] .= "\n";
5050 }
5051 else {
5052 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5053 my $count = main::clarify_number($end - $start + 1);
5054 use integer;
5055
5056 my $width = $tab_stops * 8 - 1;
5057 $OUT[-1] = sprintf("%-*s # [%s]\n",
5058 $width,
5059 $OUT[-1],
5060 $count);
5061 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5062 }
c4019d52
KW
5063 next RANGE;
5064 }
5065
5066 # Here to output a single code point per line
5067
5068 # If not to annotate, use the simple formats
558712cf 5069 if (! $annotate) {
c4019d52
KW
5070
5071 # Use any passed in subroutine to output.
5072 if (ref $range_size_1 eq 'CODE') {
5073 for my $i ($start .. $end) {
5074 push @OUT, &{$range_size_1}($i, $value);
5075 }
5076 }
5077 else {
5078
5079 # Here, caller is ok with default output.
5080 for (my $i = $start; $i <= $end; $i++) {
5081 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5082 }
5083 }
5084 next RANGE;
5085 }
5086
5087 # Here, wants annotation.
5088 for (my $i = $start; $i <= $end; $i++) {
5089
5090 # Get character information if don't have it already
5091 main::populate_char_info($i)
5092 if ! defined $viacode[$i];
5093 my $type = $annotate_char_type[$i];
5094
5095 # Figure out if should output the next code points as part
5096 # of a range or not. If this is not in an annotation
5097 # range, then won't output as a range, so returns $i.
5098 # Otherwise use the end of the annotation range, but no
5099 # further than the maximum possible end point of the loop.
5100 my $range_end = main::min($annotate_ranges->value_of($i)
5101 || $i,
5102 $end);
5103
5104 # Use a range if it is a range, and either is one of the
5105 # special annotation ranges, or the range is at most 3
5106 # long. This last case causes the algorithmically named
5107 # code points to be output individually in spans of at
5108 # most 3, as they are the ones whose $type is > 0.
5109 if ($range_end != $i
5110 && ( $type < 0 || $range_end - $i > 2))
5111 {
5112 # Here is to output a range. We don't allow a
5113 # caller-specified output format--just use the
5114 # standard one.
5115 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5116 $range_end,
5117 $value;
5118 my $range_name = $viacode[$i];
5119
5120 # For the code points which end in their hex value, we
5121 # eliminate that from the output annotation, and
5122 # capitalize only the first letter of each word.
5123 if ($type == $CP_IN_NAME) {
5124 my $hex = sprintf "%04X", $i;
5125 $range_name =~ s/-$hex$//;
5126 my @words = split " ", $range_name;
5127 for my $word (@words) {
5128 $word = ucfirst(lc($word)) if $word ne 'CJK';
5129 }
5130 $range_name = join " ", @words;
5131 }
5132 elsif ($type == $HANGUL_SYLLABLE) {
5133 $range_name = "Hangul Syllable";
5134 }
5135
5136 $OUT[-1] .= " $range_name" if $range_name;
5137
5138 # Include the number of code points in the range
5139 my $count = main::clarify_number($range_end - $i + 1);
5140 $OUT[-1] .= " [$count]\n";
5141
5142 # Skip to the end of the range
5143 $i = $range_end;
5144 }
5145 else { # Not in a range.
5146 my $comment = "";
5147
5148 # When outputting the names of each character, use
5149 # the character itself if printable
5150 $comment .= "'" . chr($i) . "' " if $printable[$i];
5151
5152 # To make it more readable, use a minimum indentation
5153 my $comment_indent;
5154
5155 # Determine the annotation
5156 if ($format eq $DECOMP_STRING_FORMAT) {
5157
5158 # This is very specialized, with the type of
5159 # decomposition beginning the line enclosed in
5160 # <...>, and the code points that the code point
5161 # decomposes to separated by blanks. Create two
5162 # strings, one of the printable characters, and
5163 # one of their official names.
5164 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5165 my $tostr = "";
5166 my $to_name = "";
5167 my $to_chr = "";
5168 foreach my $to (split " ", $map) {
5169 $to = CORE::hex $to;
5170 $to_name .= " + " if $to_name;
5171 $to_chr .= chr($to);
5172 main::populate_char_info($to)
5173 if ! defined $viacode[$to];
5174 $to_name .= $viacode[$to];
5175 }
5176
5177 $comment .=
5178 "=> '$to_chr'; $viacode[$i] => $to_name";
5179 $comment_indent = 25; # Determined by experiment
5180 }
5181 else {
5182
5183 # Assume that any table that has hex format is a
5184 # mapping of one code point to another.
5185 if ($format eq $HEX_FORMAT) {
5186 my $decimal_value = CORE::hex $value;
5187 main::populate_char_info($decimal_value)
5188 if ! defined $viacode[$decimal_value];
5189 $comment .= "=> '"
5190 . chr($decimal_value)
5191 . "'; " if $printable[$decimal_value];
5192 }
5193 $comment .= $viacode[$i] if $include_name
5194 && $viacode[$i];
5195 if ($format eq $HEX_FORMAT) {
5196 my $decimal_value = CORE::hex $value;
5197 $comment .= " => $viacode[$decimal_value]"
5198 if $viacode[$decimal_value];
5199 }
5200
5201 # If including the name, no need to indent, as the
5202 # name will already be way across the line.
5203 $comment_indent = ($include_name) ? 0 : 60;
5204 }
5205
5206 # Use any passed in routine to output the base part of
5207 # the line.
5208 if (ref $range_size_1 eq 'CODE') {
5209 my $base_part = &{$range_size_1}($i, $value);
5210 chomp $base_part;
5211 push @OUT, $base_part;
5212 }
5213 else {
5214 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5215 }
5216
5217 # And add the annotation.
5218 $OUT[-1] = sprintf "%-*s\t# %s", $comment_indent,
5219 $OUT[-1],
5220 $comment if $comment;
5221 $OUT[-1] .= "\n";
5222 }
99870f4d
KW
5223 }
5224 } # End of loop through all the table's ranges
5225 }
5226
5227 # Add anything that goes after the main body, but within the here
5228 # document,
5229 my $append_to_body = $self->append_to_body;
5230 push @OUT, $append_to_body if $append_to_body;
5231
5232 # And finish the here document.
5233 push @OUT, "END\n";
5234
668b3bfc
KW
5235 # Done with the main portion of the body. Can now figure out what
5236 # should appear before it in the file.
5237 my $pre_body = $self->pre_body;
5238 push @HEADER, $pre_body, "\n" if $pre_body;
668b3bfc 5239
6b0079b5
KW
5240 # All these files should have a .pl suffix added to them.
5241 my @file_with_pl = @{$file_path{$addr}};
5242 $file_with_pl[-1] .= '.pl';
99870f4d 5243
6b0079b5 5244 main::write(\@file_with_pl,
558712cf 5245 $annotate, # utf8 iff annotating
9218f1cf
KW
5246 \@HEADER,
5247 \@OUT);
99870f4d
KW
5248 return;
5249 }
5250
5251 sub set_status { # Set the table's status
5252 my $self = shift;
5253 my $status = shift; # The status enum value
5254 my $info = shift; # Any message associated with it.
5255 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5256
ffe43484 5257 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5258
5259 $status{$addr} = $status;
5260 $status_info{$addr} = $info;
5261 return;
5262 }
5263
5264 sub lock {
5265 # Don't allow changes to the table from now on. This stores a stack
5266 # trace of where it was called, so that later attempts to modify it
5267 # can immediately show where it got locked.
5268
5269 my $self = shift;
5270 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5271
ffe43484 5272 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5273
5274 $locked{$addr} = "";
5275
5276 my $line = (caller(0))[2];
5277 my $i = 1;
5278
5279 # Accumulate the stack trace
5280 while (1) {
5281 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5282
5283 last unless defined $caller;
5284
5285 $locked{$addr} .= " called from $caller() at line $line\n";
5286 $line = $caller_line;
5287 }
5288 $locked{$addr} .= " called from main at line $line\n";
5289
5290 return;
5291 }
5292
5293 sub carp_if_locked {
5294 # Return whether a table is locked or not, and, by the way, complain
5295 # if is locked
5296
5297 my $self = shift;
5298 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5299
ffe43484 5300 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5301
5302 return 0 if ! $locked{$addr};
5303 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5304 return 1;
5305 }
5306
5307 sub set_file_path { # Set the final directory path for this table
5308 my $self = shift;
5309 # Rest of parameters passed on
5310
f998e60c 5311 no overloading;
051df77b 5312 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
5313 return
5314 }
5315
5316 # Accessors for the range list stored in this table. First for
5317 # unconditional
ea25a9b2 5318 for my $sub (qw(
2f7a8815 5319 containing_range
99870f4d
KW
5320 contains
5321 count
5322 each_range
5323 hash
5324 is_empty
09aba7e4 5325 matches_identically_to
99870f4d
KW
5326 max
5327 min
5328 range_count
5329 reset_each_range
0a9dbafc 5330 type_of
99870f4d 5331 value_of
ea25a9b2 5332 ))
99870f4d
KW
5333 {
5334 no strict "refs";
5335 *$sub = sub {
5336 use strict "refs";
5337 my $self = shift;
f998e60c 5338 no overloading;
051df77b 5339 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
5340 }
5341 }
5342
5343 # Then for ones that should fail if locked
ea25a9b2 5344 for my $sub (qw(
99870f4d 5345 delete_range
ea25a9b2 5346 ))
99870f4d
KW
5347 {
5348 no strict "refs";
5349 *$sub = sub {
5350 use strict "refs";
5351 my $self = shift;
5352
5353 return if $self->carp_if_locked;
f998e60c 5354 no overloading;
051df77b 5355 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
5356 }
5357 }
5358
5359} # End closure
5360
5361package Map_Table;
5362use base '_Base_Table';
5363
5364# A Map Table is a table that contains the mappings from code points to
5365# values. There are two weird cases:
5366# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5367# are written in the table's file at the end of the table nonetheless. It
5368# requires specially constructed code to handle these; utf8.c can not read
5369# these in, so they should not go in $map_directory. As of this writing,
5370# the only case that these happen is for named sequences used in
5371# charnames.pm. But this code doesn't enforce any syntax on these, so
5372# something else could come along that uses it.
5373# 2) Specials are anything that doesn't fit syntactically into the body of the
5374# table. The ranges for these have a map type of non-zero. The code below
5375# knows about and handles each possible type. In most cases, these are
5376# written as part of the header.
5377#
5378# A map table deliberately can't be manipulated at will unlike match tables.
5379# This is because of the ambiguities having to do with what to do with
5380# overlapping code points. And there just isn't a need for those things;
5381# what one wants to do is just query, add, replace, or delete mappings, plus
5382# write the final result.
5383# However, there is a method to get the list of possible ranges that aren't in
5384# this table to use for defaulting missing code point mappings. And,
5385# map_add_or_replace_non_nulls() does allow one to add another table to this
5386# one, but it is clearly very specialized, and defined that the other's
5387# non-null values replace this one's if there is any overlap.
5388
5389sub trace { return main::trace(@_); }
5390
5391{ # Closure
5392
5393 main::setup_package();
5394
5395 my %default_map;
5396 # Many input files omit some entries; this gives what the mapping for the
5397 # missing entries should be
5398 main::set_access('default_map', \%default_map, 'r');
5399
5400 my %anomalous_entries;
5401 # Things that go in the body of the table which don't fit the normal
5402 # scheme of things, like having a range. Not much can be done with these
5403 # once there except to output them. This was created to handle named
5404 # sequences.
5405 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5406 main::set_access('anomalous_entries', # Append singular, read plural
5407 \%anomalous_entries,
5408 'readable_array');
5409
99870f4d
KW
5410 my %core_access;
5411 # This is a string, solely for documentation, indicating how one can get
5412 # access to this property via the Perl core.
5413 main::set_access('core_access', \%core_access, 'r', 's');
5414
99870f4d 5415 my %to_output_map;
8572ace0
KW
5416 # Enum as to whether or not to write out this map table:
5417 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5418 # it should not be removed nor its format changed. This
5419 # is done for those files that have traditionally been
5420 # output.
5421 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5422 # with this file
99870f4d
KW
5423 main::set_access('to_output_map', \%to_output_map, 's');
5424
5425
5426 sub new {
5427 my $class = shift;
5428 my $name = shift;
5429
5430 my %args = @_;
5431
5432 # Optional initialization data for the table.
5433 my $initialize = delete $args{'Initialize'};
5434
5435 my $core_access = delete $args{'Core_Access'};
5436 my $default_map = delete $args{'Default_Map'};
99870f4d 5437 my $property = delete $args{'_Property'};
23e33b60 5438 my $full_name = delete $args{'Full_Name'};
20863809 5439
99870f4d
KW
5440 # Rest of parameters passed on
5441
5442 my $range_list = Range_Map->new(Owner => $property);
5443
5444 my $self = $class->SUPER::new(
5445 Name => $name,
23e33b60
KW
5446 Complete_Name => $full_name,
5447 Full_Name => $full_name,
99870f4d
KW
5448 _Property => $property,
5449 _Range_List => $range_list,
5450 %args);
5451
ffe43484 5452 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5453
5454 $anomalous_entries{$addr} = [];
5455 $core_access{$addr} = $core_access;
5456 $default_map{$addr} = $default_map;
99870f4d
KW
5457
5458 $self->initialize($initialize) if defined $initialize;
5459
5460 return $self;
5461 }
5462
5463 use overload
5464 fallback => 0,
5465 qw("") => "_operator_stringify",
5466 ;
5467
5468 sub _operator_stringify {
5469 my $self = shift;
5470
5471 my $name = $self->property->full_name;
5472 $name = '""' if $name eq "";
5473 return "Map table for Property '$name'";
5474 }
5475
99870f4d
KW
5476 sub add_alias {
5477 # Add a synonym for this table (which means the property itself)
5478 my $self = shift;
5479 my $name = shift;
5480 # Rest of parameters passed on.
5481
5482 $self->SUPER::add_alias($name, $self->property, @_);
5483 return;
5484 }
5485
5486 sub add_map {
5487 # Add a range of code points to the list of specially-handled code
5488 # points. $MULTI_CP is assumed if the type of special is not passed
5489 # in.
5490
5491 my $self = shift;
5492 my $lower = shift;
5493 my $upper = shift;
5494 my $string = shift;
5495 my %args = @_;
5496
5497 my $type = delete $args{'Type'} || 0;
5498 # Rest of parameters passed on
5499
5500 # Can't change the table if locked.
5501 return if $self->carp_if_locked;
5502
ffe43484 5503 my $addr = do { no overloading; pack 'J', $self; };
99870f4d 5504
99870f4d
KW
5505 $self->_range_list->add_map($lower, $upper,
5506 $string,
5507 @_,
5508 Type => $type);
5509 return;
5510 }
5511
5512 sub append_to_body {
5513 # Adds to the written HERE document of the table's body any anomalous
5514 # entries in the table..
5515
5516 my $self = shift;
5517 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5518
ffe43484 5519 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5520
5521 return "" unless @{$anomalous_entries{$addr}};
5522 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5523 }
5524
5525 sub map_add_or_replace_non_nulls {
5526 # This adds the mappings in the table $other to $self. Non-null
5527 # mappings from $other override those in $self. It essentially merges
5528 # the two tables, with the second having priority except for null
5529 # mappings.
5530
5531 my $self = shift;
5532 my $other = shift;
5533 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5534
5535 return if $self->carp_if_locked;
5536
5537 if (! $other->isa(__PACKAGE__)) {
5538 Carp::my_carp_bug("$other should be a "
5539 . __PACKAGE__
5540 . ". Not a '"
5541 . ref($other)
5542 . "'. Not added;");
5543 return;
5544 }
5545
ffe43484
NC
5546 my $addr = do { no overloading; pack 'J', $self; };
5547 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5548
5549 local $to_trace = 0 if main::DEBUG;
5550
5551 my $self_range_list = $self->_range_list;
5552 my $other_range_list = $other->_range_list;
5553 foreach my $range ($other_range_list->ranges) {
5554 my $value = $range->value;
5555 next if $value eq "";
5556 $self_range_list->_add_delete('+',
5557 $range->start,
5558 $range->end,
5559 $value,
5560 Type => $range->type,
5561 Replace => $UNCONDITIONALLY);
5562 }
5563
99870f4d
KW
5564 return;
5565 }
5566
5567 sub set_default_map {
5568 # Define what code points that are missing from the input files should
5569 # map to
5570
5571 my $self = shift;
5572 my $map = shift;
5573 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5574
ffe43484 5575 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5576
5577 # Convert the input to the standard equivalent, if any (won't have any
5578 # for $STRING properties)
5579 my $standard = $self->_find_table_from_alias->{$map};
5580 $map = $standard->name if defined $standard;
5581
5582 # Warn if there already is a non-equivalent default map for this
5583 # property. Note that a default map can be a ref, which means that
5584 # what it actually means is delayed until later in the program, and it
5585 # IS permissible to override it here without a message.
5586 my $default_map = $default_map{$addr};
5587 if (defined $default_map
5588 && ! ref($default_map)
5589 && $default_map ne $map
5590 && main::Standardize($map) ne $default_map)
5591 {
5592 my $property = $self->property;
5593 my $map_table = $property->table($map);
5594 my $default_table = $property->table($default_map);
5595 if (defined $map_table
5596 && defined $default_table
5597 && $map_table != $default_table)
5598 {
5599 Carp::my_carp("Changing the default mapping for "
5600 . $property
5601 . " from $default_map to $map'");
5602 }
5603 }
5604
5605 $default_map{$addr} = $map;
5606
5607 # Don't also create any missing table for this map at this point,
5608 # because if we did, it could get done before the main table add is
5609 # done for PropValueAliases.txt; instead the caller will have to make
5610 # sure it exists, if desired.
5611 return;
5612 }
5613
5614 sub to_output_map {
5615 # Returns boolean: should we write this map table?
5616
5617 my $self = shift;
5618 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5619
ffe43484 5620 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5621
5622 # If overridden, use that
5623 return $to_output_map{$addr} if defined $to_output_map{$addr};
5624
5625 my $full_name = $self->full_name;
fcf1973c
KW
5626 return $global_to_output_map{$full_name}
5627 if defined $global_to_output_map{$full_name};
99870f4d 5628
20863809 5629 # If table says to output, do so; if says to suppress it, do so.
8572ace0 5630 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
99870f4d
KW
5631 return 0 if $self->status eq $SUPPRESSED;
5632
5633 my $type = $self->property->type;
5634
5635 # Don't want to output binary map tables even for debugging.
5636 return 0 if $type == $BINARY;
5637
5638 # But do want to output string ones.
8572ace0 5639 return $EXTERNAL_MAP if $type == $STRING;
99870f4d 5640
8572ace0
KW
5641 # Otherwise is an $ENUM, do output it, for Perl's purposes
5642 return $INTERNAL_MAP;
99870f4d
KW
5643 }
5644
5645 sub inverse_list {
5646 # Returns a Range_List that is gaps of the current table. That is,
5647 # the inversion
5648
5649 my $self = shift;
5650 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5651
5652 my $current = Range_List->new(Initialize => $self->_range_list,
5653 Owner => $self->property);
5654 return ~ $current;
5655 }
5656
8572ace0
KW
5657 sub header {
5658 my $self = shift;
5659 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5660
5661 my $return = $self->SUPER::header();
5662
5663 $return .= $INTERNAL_ONLY if $self->to_output_map == $INTERNAL_MAP;
5664 return $return;
5665 }
5666
99870f4d
KW
5667 sub set_final_comment {
5668 # Just before output, create the comment that heads the file
5669 # containing this table.
5670
bd9ebcfd
KW
5671 return unless $debugging_build;
5672
99870f4d
KW
5673 my $self = shift;
5674 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5675
5676 # No sense generating a comment if aren't going to write it out.
5677 return if ! $self->to_output_map;
5678
ffe43484 5679 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5680
5681 my $property = $self->property;
5682
5683 # Get all the possible names for this property. Don't use any that
5684 # aren't ok for use in a file name, etc. This is perhaps causing that
5685 # flag to do double duty, and may have to be changed in the future to
5686 # have our own flag for just this purpose; but it works now to exclude
5687 # Perl generated synonyms from the lists for properties, where the
5688 # name is always the proper Unicode one.
5689 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5690
5691 my $count = $self->count;
5692 my $default_map = $default_map{$addr};
5693
5694 # The ranges that map to the default aren't output, so subtract that
5695 # to get those actually output. A property with matching tables
5696 # already has the information calculated.
5697 if ($property->type != $STRING) {
5698 $count -= $property->table($default_map)->count;
5699 }
5700 elsif (defined $default_map) {
5701
5702 # But for $STRING properties, must calculate now. Subtract the
5703 # count from each range that maps to the default.
5704 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5705 if ($range->value eq $default_map) {
5706 $count -= $range->end +1 - $range->start;
5707 }
5708 }
5709
5710 }
5711
5712 # Get a string version of $count with underscores in large numbers,
5713 # for clarity.
5714 my $string_count = main::clarify_number($count);
5715
5716 my $code_points = ($count == 1)
5717 ? 'single code point'
5718 : "$string_count code points";
5719
5720 my $mapping;
5721 my $these_mappings;
5722 my $are;
5723 if (@property_aliases <= 1) {
5724 $mapping = 'mapping';
5725 $these_mappings = 'this mapping';
5726 $are = 'is'
5727 }
5728 else {
5729 $mapping = 'synonymous mappings';
5730 $these_mappings = 'these mappings';
5731 $are = 'are'
5732 }
5733 my $cp;
5734 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5735 $cp = "any code point in Unicode Version $string_version";
5736 }
5737 else {
5738 my $map_to;
5739 if ($default_map eq "") {
5740 $map_to = 'the null string';
5741 }
5742 elsif ($default_map eq $CODE_POINT) {
5743 $map_to = "itself";
5744 }
5745 else {
5746 $map_to = "'$default_map'";
5747 }
5748 if ($count == 1) {
5749 $cp = "the single code point";
5750 }
5751 else {
5752 $cp = "one of the $code_points";
5753 }
5754 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5755 }
5756
5757 my $comment = "";
5758
5759 my $status = $self->status;
5760 if ($status) {
5761 my $warn = uc $status_past_participles{$status};
5762 $comment .= <<END;
5763
5764!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5765 All property or property=value combinations contained in this file are $warn.
5766 See $unicode_reference_url for what this means.
5767
5768END
5769 }
5770 $comment .= "This file returns the $mapping:\n";
5771
5772 for my $i (0 .. @property_aliases - 1) {
5773 $comment .= sprintf("%-8s%s\n",
5774 " ",
5775 $property_aliases[$i]->name . '(cp)'
5776 );
5777 }
5778 $comment .=
5779 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5780
5781 my $access = $core_access{$addr};
5782 if ($access) {
5783 $comment .= "accessible through the Perl core via $access.";
5784 }
5785 else {
5786 $comment .= "not accessible through the Perl core directly.";
5787 }
5788
5789 # And append any commentary already set from the actual property.
5790 $comment .= "\n\n" . $self->comment if $self->comment;
5791 if ($self->description) {
5792 $comment .= "\n\n" . join " ", $self->description;
5793 }
5794 if ($self->note) {
5795 $comment .= "\n\n" . join " ", $self->note;
5796 }
5797 $comment .= "\n";
5798
5799 if (! $self->perl_extension) {
5800 $comment .= <<END;
5801
5802For information about what this property really means, see:
5803$unicode_reference_url
5804END
5805 }
5806
5807 if ($count) { # Format differs for empty table
5808 $comment.= "\nThe format of the ";
5809 if ($self->range_size_1) {
5810 $comment.= <<END;
5811main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5812is in hex; MAPPING is what CODE_POINT maps to.
5813END
5814 }
5815 else {
5816
5817 # There are tables which end up only having one element per
5818 # range, but it is not worth keeping track of for making just
5819 # this comment a little better.
5820 $comment.= <<END;
5821non-comment portions of the main body of lines of this file is:
5822START\\tSTOP\\tMAPPING where START is the starting code point of the
5823range, in hex; STOP is the ending point, or if omitted, the range has just one
5824code point; MAPPING is what each code point between START and STOP maps to.
5825END
0c07e538 5826 if ($self->output_range_counts) {
99870f4d
KW
5827 $comment .= <<END;
5828Numbers in comments in [brackets] indicate how many code points are in the
5829range (omitted when the range is a single code point or if the mapping is to
5830the null string).
5831END
5832 }
5833 }
5834 }
5835 $self->set_comment(main::join_lines($comment));
5836 return;
5837 }
5838
5839 my %swash_keys; # Makes sure don't duplicate swash names.
5840
668b3bfc
KW
5841 # The remaining variables are temporaries used while writing each table,
5842 # to output special ranges.
5843 my $has_hangul_syllables;
5844 my @multi_code_point_maps; # Map is to more than one code point.
5845
5846 # The key is the base name of the code point, and the value is an
5847 # array giving all the ranges that use this base name. Each range
5848 # is actually a hash giving the 'low' and 'high' values of it.
5849 my %names_ending_in_code_point;
8c32d378 5850 my %loose_names_ending_in_code_point;
668b3bfc
KW
5851
5852 # Inverse mapping. The list of ranges that have these kinds of
5853 # names. Each element contains the low, high, and base names in a
5854 # hash.
5855 my @code_points_ending_in_code_point;
5856
5857 sub handle_special_range {
5858 # Called in the middle of write when it finds a range it doesn't know
5859 # how to handle.
5860
5861 my $self = shift;
5862 my $range = shift;
5863 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5864
5865 my $addr = do { no overloading; pack 'J', $self; };
5866
5867 my $type = $range->type;
5868
5869 my $low = $range->start;
5870 my $high = $range->end;
5871 my $map = $range->value;
5872
5873 # No need to output the range if it maps to the default.
5874 return if $map eq $default_map{$addr};
5875
5876 # Switch based on the map type...
5877 if ($type == $HANGUL_SYLLABLE) {
5878
5879 # These are entirely algorithmically determinable based on
5880 # some constants furnished by Unicode; for now, just set a
5881 # flag to indicate that have them. After everything is figured
5882 # out, we will output the code that does the algorithm.
5883 $has_hangul_syllables = 1;
5884 }
5885 elsif ($type == $CP_IN_NAME) {
5886
5887 # Code points whose the name ends in their code point are also
5888 # algorithmically determinable, but need information about the map
5889 # to do so. Both the map and its inverse are stored in data
5890 # structures output in the file.
5891 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5892 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5893
8c32d378
KW
5894 my $squeezed = $map =~ s/[-\s]+//gr;
5895 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}}, $low;
5896 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}}, $high;
5897
668b3bfc
KW
5898 push @code_points_ending_in_code_point, { low => $low,
5899 high => $high,
5900 name => $map
5901 };
5902 }
5903 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5904
5905 # Multi-code point maps and null string maps have an entry
5906 # for each code point in the range. They use the same
5907 # output format.
5908 for my $code_point ($low .. $high) {
5909
5910 # The pack() below can't cope with surrogates.
5911 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
98dc9551 5912 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
668b3bfc
KW
5913 next;
5914 }
5915
5916 # Generate the hash entries for these in the form that
5917 # utf8.c understands.
5918 my $tostr = "";
5919 my $to_name = "";
5920 my $to_chr = "";
5921 foreach my $to (split " ", $map) {
5922 if ($to !~ /^$code_point_re$/) {
5923 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5924 next;
5925 }
5926 $tostr .= sprintf "\\x{%s}", $to;
5927 $to = CORE::hex $to;
558712cf 5928 if ($annotate) {
c4019d52
KW
5929 $to_name .= " + " if $to_name;
5930 $to_chr .= chr($to);
5931 main::populate_char_info($to)
5932 if ! defined $viacode[$to];
5933 $to_name .= $viacode[$to];
5934 }
668b3bfc
KW
5935 }
5936
5937 # I (khw) have never waded through this line to
5938 # understand it well enough to comment it.
5939 my $utf8 = sprintf(qq["%s" => "$tostr",],
5940 join("", map { sprintf "\\x%02X", $_ }
5941 unpack("U0C*", pack("U", $code_point))));
5942
5943 # Add a comment so that a human reader can more easily
5944 # see what's going on.
5945 push @multi_code_point_maps,
5946 sprintf("%-45s # U+%04X", $utf8, $code_point);
558712cf 5947 if (! $annotate) {
c4019d52
KW
5948 $multi_code_point_maps[-1] .= " => $map";
5949 }
5950 else {
5951 main::populate_char_info($code_point)
5952 if ! defined $viacode[$code_point];
5953 $multi_code_point_maps[-1] .= " '"
5954 . chr($code_point)
5955 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
5956 }
668b3bfc
KW
5957 }
5958 }
5959 else {
5960 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
5961 }
5962
5963 return;
5964 }
5965
99870f4d
KW
5966 sub pre_body {
5967 # Returns the string that should be output in the file before the main
668b3bfc
KW
5968 # body of this table. It isn't called until the main body is
5969 # calculated, saving a pass. The string includes some hash entries
5970 # identifying the format of the body, and what the single value should
5971 # be for all ranges missing from it. It also includes any code points
5972 # which have map_types that don't go in the main table.
99870f4d
KW
5973
5974 my $self = shift;
5975 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5976
ffe43484 5977 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5978
5979 my $name = $self->property->swash_name;
5980
5981 if (defined $swash_keys{$name}) {
5982 Carp::my_carp(join_lines(<<END
5983Already created a swash name '$name' for $swash_keys{$name}. This means that
5984the same name desired for $self shouldn't be used. Bad News. This must be
5985fixed before production use, but proceeding anyway
5986END
5987 ));
5988 }
5989 $swash_keys{$name} = "$self";
5990
99870f4d 5991 my $pre_body = "";
99870f4d 5992
668b3bfc
KW
5993 # Here we assume we were called after have gone through the whole
5994 # file. If we actually generated anything for each map type, add its
5995 # respective header and trailer
ec2f0128 5996 my $specials_name = "";
668b3bfc 5997 if (@multi_code_point_maps) {
ec2f0128 5998 $specials_name = "utf8::ToSpec$name";
668b3bfc 5999 $pre_body .= <<END;
99870f4d
KW
6000
6001# Some code points require special handling because their mappings are each to
6002# multiple code points. These do not appear in the main body, but are defined
6003# in the hash below.
6004
76591e2b
KW
6005# Each key is the string of N bytes that together make up the UTF-8 encoding
6006# for the code point. (i.e. the same as looking at the code point's UTF-8
6007# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
ec2f0128 6008\%$specials_name = (
99870f4d 6009END
668b3bfc
KW
6010 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6011 }
99870f4d 6012
668b3bfc
KW
6013 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
6014
6015 # Convert these structures to output format.
6016 my $code_points_ending_in_code_point =
6017 main::simple_dumper(\@code_points_ending_in_code_point,
6018 ' ' x 8);
6019 my $names = main::simple_dumper(\%names_ending_in_code_point,
6020 ' ' x 8);
8c32d378
KW
6021 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
6022 ' ' x 8);
668b3bfc
KW
6023
6024 # Do the same with the Hangul names,
6025 my $jamo;
6026 my $jamo_l;
6027 my $jamo_v;
6028 my $jamo_t;
6029 my $jamo_re;
6030 if ($has_hangul_syllables) {
6031
6032 # Construct a regular expression of all the possible
6033 # combinations of the Hangul syllables.
6034 my @L_re; # Leading consonants
6035 for my $i ($LBase .. $LBase + $LCount - 1) {
6036 push @L_re, $Jamo{$i}
6037 }
6038 my @V_re; # Middle vowels
6039 for my $i ($VBase .. $VBase + $VCount - 1) {
6040 push @V_re, $Jamo{$i}
6041 }
6042 my @T_re; # Trailing consonants
6043 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
6044 push @T_re, $Jamo{$i}
99870f4d
KW
6045 }
6046
668b3bfc
KW
6047 # The whole re is made up of the L V T combination.
6048 $jamo_re = '('
6049 . join ('|', sort @L_re)
6050 . ')('
6051 . join ('|', sort @V_re)
6052 . ')('
6053 . join ('|', sort @T_re)
6054 . ')?';
6055
6056 # These hashes needed by the algorithm were generated
6057 # during reading of the Jamo.txt file
6058 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
6059 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
6060 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
6061 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
6062 }
6063
6064 $pre_body .= <<END;
99870f4d
KW
6065
6066# To achieve significant memory savings when this file is read in,
6067# algorithmically derivable code points are omitted from the main body below.
6068# Instead, the following routines can be used to translate between name and
6069# code point and vice versa
6070
6071{ # Closure
6072
6073 # Matches legal code point. 4-6 hex numbers, If there are 6, the
6074 # first two must be '10'; if there are 5, the first must not be a '0'.
8c32d378
KW
6075 # First can match at the end of a word provided that the end of the
6076 # word doesn't look like a hex number.
6077 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
99870f4d
KW
6078 my \$code_point_re = qr/$code_point_re/;
6079
6080 # In the following hash, the keys are the bases of names which includes
6081 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
6082 # of each key is another hash which is used to get the low and high ends
8c32d378 6083 # for each range of code points that apply to the name.
99870f4d
KW
6084 my %names_ending_in_code_point = (
6085$names
6086 );
6087
8c32d378
KW
6088 # The following hash is a copy of the previous one, except is for loose
6089 # matching, so each name has blanks and dashes squeezed out
6090 my %loose_names_ending_in_code_point = (
6091$loose_names
6092 );
6093
99870f4d
KW
6094 # And the following array gives the inverse mapping from code points to
6095 # names. Lowest code points are first
6096 my \@code_points_ending_in_code_point = (
6097$code_points_ending_in_code_point
6098 );
6099END
668b3bfc
KW
6100 # Earlier releases didn't have Jamos. No sense outputting
6101 # them unless will be used.
6102 if ($has_hangul_syllables) {
6103 $pre_body .= <<END;
99870f4d
KW
6104
6105 # Convert from code point to Jamo short name for use in composing Hangul
6106 # syllable names
6107 my %Jamo = (
6108$jamo
6109 );
6110
6111 # Leading consonant (can be null)
6112 my %Jamo_L = (
6113$jamo_l
6114 );
6115
6116 # Vowel
6117 my %Jamo_V = (
6118$jamo_v
6119 );
6120
6121 # Optional trailing consonant
6122 my %Jamo_T = (
6123$jamo_t
6124 );
6125
6126 # Computed re that splits up a Hangul name into LVT or LV syllables
6127 my \$syllable_re = qr/$jamo_re/;
6128
6129 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
8c32d378 6130 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
99870f4d
KW
6131
6132 # These constants names and values were taken from the Unicode standard,
6133 # version 5.1, section 3.12. They are used in conjunction with Hangul
6134 # syllables
6e5a209b
KW
6135 my \$SBase = $SBase_string;
6136 my \$LBase = $LBase_string;
6137 my \$VBase = $VBase_string;
6138 my \$TBase = $TBase_string;
6139 my \$SCount = $SCount;
6140 my \$LCount = $LCount;
6141 my \$VCount = $VCount;
6142 my \$TCount = $TCount;
99870f4d
KW
6143 my \$NCount = \$VCount * \$TCount;
6144END
668b3bfc 6145 } # End of has Jamos
99870f4d 6146
668b3bfc 6147 $pre_body .= << 'END';
99870f4d
KW
6148
6149 sub name_to_code_point_special {
8c32d378 6150 my ($name, $loose) = @_;
99870f4d
KW
6151
6152 # Returns undef if not one of the specially handled names; otherwise
6153 # returns the code point equivalent to the input name
8c32d378
KW
6154 # $loose is non-zero if to use loose matching, 'name' in that case
6155 # must be input as upper case with all blanks and dashes squeezed out.
99870f4d 6156END
668b3bfc
KW
6157 if ($has_hangul_syllables) {
6158 $pre_body .= << 'END';
99870f4d 6159
8c32d378
KW
6160 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
6161 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
6162 {
99870f4d
KW
6163 return if $name !~ qr/^$syllable_re$/;
6164 my $L = $Jamo_L{$1};
6165 my $V = $Jamo_V{$2};
6166 my $T = (defined $3) ? $Jamo_T{$3} : 0;
6167 return ($L * $VCount + $V) * $TCount + $T + $SBase;
6168 }
6169END
668b3bfc
KW
6170 }
6171 $pre_body .= << 'END';
99870f4d 6172
8c32d378
KW
6173 # Name must end in 'code_point' for this to handle.
6174 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
6175 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
99870f4d
KW
6176
6177 my $base = $1;
6178 my $code_point = CORE::hex $2;
8c32d378
KW
6179 my $names_ref;
6180
6181 if ($loose) {
6182 $names_ref = \%loose_names_ending_in_code_point;
6183 }
6184 else {
6185 return if $base !~ s/-$//;
6186 $names_ref = \%names_ending_in_code_point;
6187 }
99870f4d
KW
6188
6189 # Name must be one of the ones which has the code point in it.
8c32d378 6190 return if ! $names_ref->{$base};
99870f4d
KW
6191
6192 # Look through the list of ranges that apply to this name to see if
6193 # the code point is in one of them.
8c32d378
KW
6194 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
6195 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
6196 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
99870f4d
KW
6197
6198 # Here, the code point is in the range.
6199 return $code_point;
6200 }
6201
6202 # Here, looked like the name had a code point number in it, but
6203 # did not match one of the valid ones.
6204 return;
6205 }
6206
6207 sub code_point_to_name_special {
6208 my $code_point = shift;
6209
6210 # Returns the name of a code point if algorithmically determinable;
6211 # undef if not
6212END
668b3bfc
KW
6213 if ($has_hangul_syllables) {
6214 $pre_body .= << 'END';
99870f4d
KW
6215
6216 # If in the Hangul range, calculate the name based on Unicode's
6217 # algorithm
6218 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
6219 use integer;
6220 my $SIndex = $code_point - $SBase;
6221 my $L = $LBase + $SIndex / $NCount;
6222 my $V = $VBase + ($SIndex % $NCount) / $TCount;
6223 my $T = $TBase + $SIndex % $TCount;
03e1aa51 6224 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
99870f4d
KW
6225 $name .= $Jamo{$T} if $T != $TBase;
6226 return $name;
6227 }
6228END
668b3bfc
KW
6229 }
6230 $pre_body .= << 'END';
99870f4d
KW
6231
6232 # Look through list of these code points for one in range.
6233 foreach my $hash (@code_points_ending_in_code_point) {
6234 return if $code_point < $hash->{'low'};
6235 if ($code_point <= $hash->{'high'}) {
6236 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
6237 }
6238 }
6239 return; # None found
6240 }
6241} # End closure
6242
6243END
668b3bfc
KW
6244 } # End of has hangul or code point in name maps.
6245
6246 my $format = $self->format;
6247
6248 my $return = <<END;
6249# The name this swash is to be known by, with the format of the mappings in
6250# the main body of the table, and what all code points missing from this file
6251# map to.
6252\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6253END
ec2f0128
KW
6254 if ($specials_name) {
6255 $return .= <<END;
6256\$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6257END
6258 }
668b3bfc
KW
6259 my $default_map = $default_map{$addr};
6260 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$default_map';";
6261
6262 if ($default_map eq $CODE_POINT) {
6263 $return .= ' # code point maps to itself';
6264 }
6265 elsif ($default_map eq "") {
6266 $return .= ' # code point maps to the null string';
6267 }
6268 $return .= "\n";
6269
6270 $return .= $pre_body;
6271
6272 return $return;
6273 }
6274
6275 sub write {
6276 # Write the table to the file.
6277
6278 my $self = shift;
6279 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6280
6281 my $addr = do { no overloading; pack 'J', $self; };
6282
6283 # Clear the temporaries
6284 $has_hangul_syllables = 0;
6285 undef @multi_code_point_maps;
6286 undef %names_ending_in_code_point;
8c32d378 6287 undef %loose_names_ending_in_code_point;
668b3bfc 6288 undef @code_points_ending_in_code_point;
99870f4d
KW
6289
6290 # Calculate the format of the table if not already done.
f5817e0a 6291 my $format = $self->format;
668b3bfc
KW
6292 my $type = $self->property->type;
6293 my $default_map = $self->default_map;
99870f4d
KW
6294 if (! defined $format) {
6295 if ($type == $BINARY) {
6296
6297 # Don't bother checking the values, because we elsewhere
6298 # verify that a binary table has only 2 values.
6299 $format = $BINARY_FORMAT;
6300 }
6301 else {
6302 my @ranges = $self->_range_list->ranges;
6303
6304 # default an empty table based on its type and default map
6305 if (! @ranges) {
6306
6307 # But it turns out that the only one we can say is a
6308 # non-string (besides binary, handled above) is when the
6309 # table is a string and the default map is to a code point
6310 if ($type == $STRING && $default_map eq $CODE_POINT) {
6311 $format = $HEX_FORMAT;
6312 }
6313 else {
6314 $format = $STRING_FORMAT;
6315 }
6316 }
6317 else {
6318
6319 # Start with the most restrictive format, and as we find
6320 # something that doesn't fit with that, change to the next
6321 # most restrictive, and so on.
6322 $format = $DECIMAL_FORMAT;
6323 foreach my $range (@ranges) {
668b3bfc
KW
6324 next if $range->type != 0; # Non-normal ranges don't
6325 # affect the main body
99870f4d
KW
6326 my $map = $range->value;
6327 if ($map ne $default_map) {
6328 last if $format eq $STRING_FORMAT; # already at
6329 # least
6330 # restrictive
6331 $format = $INTEGER_FORMAT
6332 if $format eq $DECIMAL_FORMAT
6333 && $map !~ / ^ [0-9] $ /x;
6334 $format = $FLOAT_FORMAT
6335 if $format eq $INTEGER_FORMAT
6336 && $map !~ / ^ -? [0-9]+ $ /x;
6337 $format = $RATIONAL_FORMAT
6338 if $format eq $FLOAT_FORMAT
6339 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6340 $format = $HEX_FORMAT
6341 if $format eq $RATIONAL_FORMAT
6342 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
6343 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6344 && $map =~ /[^0-9A-F]/;
6345 }
6346 }
6347 }
6348 }
6349 } # end of calculating format
6350
668b3bfc 6351 if ($default_map eq $CODE_POINT
99870f4d 6352 && $format ne $HEX_FORMAT
668b3bfc
KW
6353 && ! defined $self->format) # manual settings are always
6354 # considered ok
99870f4d
KW
6355 {
6356 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6357 }
99870f4d 6358
668b3bfc 6359 $self->_set_format($format);
99870f4d 6360
0911a63d
KW
6361 # Core Perl has a different definition of mapping ranges than we do,
6362 # that is applicable mainly to mapping code points, so for tables
6363 # where it is possible that core Perl could be used to read it,
6364 # make it range size 1 to prevent possible confusion
6365 $self->set_range_size_1(1) if $format eq $HEX_FORMAT;
6366
99870f4d
KW
6367 return $self->SUPER::write(
6368 ($self->property == $block)
6369 ? 7 # block file needs more tab stops
6370 : 3,
668b3bfc 6371 $default_map); # don't write defaulteds
99870f4d
KW
6372 }
6373
6374 # Accessors for the underlying list that should fail if locked.
ea25a9b2 6375 for my $sub (qw(
99870f4d 6376 add_duplicate
ea25a9b2 6377 ))
99870f4d
KW
6378 {
6379 no strict "refs";
6380 *$sub = sub {
6381 use strict "refs";
6382 my $self = shift;
6383
6384 return if $self->carp_if_locked;
6385 return $self->_range_list->$sub(@_);
6386 }
6387 }
6388} # End closure for Map_Table
6389
6390package Match_Table;
6391use base '_Base_Table';
6392
6393# A Match table is one which is a list of all the code points that have
6394# the same property and property value, for use in \p{property=value}
6395# constructs in regular expressions. It adds very little data to the base
6396# structure, but many methods, as these lists can be combined in many ways to
6397# form new ones.
6398# There are only a few concepts added:
6399# 1) Equivalents and Relatedness.
6400# Two tables can match the identical code points, but have different names.
6401# This always happens when there is a perl single form extension
6402# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6403# tables are set to be related, with the Perl extension being a child, and
6404# the Unicode property being the parent.
6405#
6406# It may be that two tables match the identical code points and we don't
6407# know if they are related or not. This happens most frequently when the
6408# Block and Script properties have the exact range. But note that a
6409# revision to Unicode could add new code points to the script, which would
6410# now have to be in a different block (as the block was filled, or there
6411# would have been 'Unknown' script code points in it and they wouldn't have
6412# been identical). So we can't rely on any two properties from Unicode
6413# always matching the same code points from release to release, and thus
6414# these tables are considered coincidentally equivalent--not related. When
6415# two tables are unrelated but equivalent, one is arbitrarily chosen as the
6416# 'leader', and the others are 'equivalents'. This concept is useful
6417# to minimize the number of tables written out. Only one file is used for
6418# any identical set of code points, with entries in Heavy.pl mapping all
6419# the involved tables to it.
6420#
6421# Related tables will always be identical; we set them up to be so. Thus
6422# if the Unicode one is deprecated, the Perl one will be too. Not so for
6423# unrelated tables. Relatedness makes generating the documentation easier.
6424#
6425# 2) Conflicting. It may be that there will eventually be name clashes, with
6426# the same name meaning different things. For a while, there actually were
6427# conflicts, but they have so far been resolved by changing Perl's or
6428# Unicode's definitions to match the other, but when this code was written,
6429# it wasn't clear that that was what was going to happen. (Unicode changed
6430# because of protests during their beta period.) Name clashes are warned
6431# about during compilation, and the documentation. The generated tables
6432# are sane, free of name clashes, because the code suppresses the Perl
6433# version. But manual intervention to decide what the actual behavior
6434# should be may be required should this happen. The introductory comments
6435# have more to say about this.
6436
6437sub standardize { return main::standardize($_[0]); }
6438sub trace { return main::trace(@_); }
6439
6440
6441{ # Closure
6442
6443 main::setup_package();
6444
6445 my %leader;
6446 # The leader table of this one; initially $self.
6447 main::set_access('leader', \%leader, 'r');
6448
6449 my %equivalents;
6450 # An array of any tables that have this one as their leader
6451 main::set_access('equivalents', \%equivalents, 'readable_array');
6452
6453 my %parent;
6454 # The parent table to this one, initially $self. This allows us to
6455 # distinguish between equivalent tables that are related, and those which
6456 # may not be, but share the same output file because they match the exact
6457 # same set of code points in the current Unicode release.
6458 main::set_access('parent', \%parent, 'r');
6459
6460 my %children;
6461 # An array of any tables that have this one as their parent
6462 main::set_access('children', \%children, 'readable_array');
6463
6464 my %conflicting;
6465 # Array of any tables that would have the same name as this one with
6466 # a different meaning. This is used for the generated documentation.
6467 main::set_access('conflicting', \%conflicting, 'readable_array');
6468
6469 my %matches_all;
6470 # Set in the constructor for tables that are expected to match all code
6471 # points.
6472 main::set_access('matches_all', \%matches_all, 'r');
6473
a92d5c2e
KW
6474 my %complement;
6475 # Points to the complement that this table is expressed in terms of; 0 if
6476 # none.
6477 main::set_access('complement', \%complement, 'r', 's' );
6478
99870f4d
KW
6479 sub new {
6480 my $class = shift;
6481
6482 my %args = @_;
6483
6484 # The property for which this table is a listing of property values.
6485 my $property = delete $args{'_Property'};
6486
23e33b60
KW
6487 my $name = delete $args{'Name'};
6488 my $full_name = delete $args{'Full_Name'};
6489 $full_name = $name if ! defined $full_name;
6490
99870f4d
KW
6491 # Optional
6492 my $initialize = delete $args{'Initialize'};
6493 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 6494 my $format = delete $args{'Format'};
99870f4d
KW
6495 # Rest of parameters passed on.
6496
6497 my $range_list = Range_List->new(Initialize => $initialize,
6498 Owner => $property);
6499
23e33b60
KW
6500 my $complete = $full_name;
6501 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6502 # but this helps debug if it
6503 # does
6504 # The complete name for a match table includes it's property in a
6505 # compound form 'property=table', except if the property is the
6506 # pseudo-property, perl, in which case it is just the single form,
6507 # 'table' (If you change the '=' must also change the ':' in lots of
6508 # places in this program that assume an equal sign)
6509 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 6510
99870f4d 6511 my $self = $class->SUPER::new(%args,
23e33b60
KW
6512 Name => $name,
6513 Complete_Name => $complete,
6514 Full_Name => $full_name,
99870f4d
KW
6515 _Property => $property,
6516 _Range_List => $range_list,
f5817e0a 6517 Format => $EMPTY_FORMAT,
99870f4d 6518 );
ffe43484 6519 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6520
6521 $conflicting{$addr} = [ ];
6522 $equivalents{$addr} = [ ];
6523 $children{$addr} = [ ];
6524 $matches_all{$addr} = $matches_all;
6525 $leader{$addr} = $self;
6526 $parent{$addr} = $self;
a92d5c2e 6527 $complement{$addr} = 0;
99870f4d 6528
f5817e0a
KW
6529 if (defined $format && $format ne $EMPTY_FORMAT) {
6530 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6531 }
6532
99870f4d
KW
6533 return $self;
6534 }
6535
6536 # See this program's beginning comment block about overloading these.
6537 use overload
6538 fallback => 0,
6539 qw("") => "_operator_stringify",
6540 '=' => sub {
6541 my $self = shift;
6542
6543 return if $self->carp_if_locked;
6544 return $self;
6545 },
6546
6547 '+' => sub {
6548 my $self = shift;
6549 my $other = shift;
6550
6551 return $self->_range_list + $other;
6552 },
6553 '&' => sub {
6554 my $self = shift;
6555 my $other = shift;
6556
6557 return $self->_range_list & $other;
6558 },
6559 '+=' => sub {
6560 my $self = shift;
6561 my $other = shift;
6562
6563 return if $self->carp_if_locked;
6564
ffe43484 6565 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6566
6567 if (ref $other) {
6568
6569 # Change the range list of this table to be the
6570 # union of the two.
6571 $self->_set_range_list($self->_range_list
6572 + $other);
6573 }
6574 else { # $other is just a simple value
6575 $self->add_range($other, $other);
6576 }
6577 return $self;
6578 },
6579 '-' => sub { my $self = shift;
6580 my $other = shift;
6581 my $reversed = shift;
6582
6583 if ($reversed) {
6584 Carp::my_carp_bug("Can't cope with a "
6585 . __PACKAGE__
6586 . " being the first parameter in a '-'. Subtraction ignored.");
6587 return;
6588 }
6589
6590 return $self->_range_list - $other;
6591 },
6592 '~' => sub { my $self = shift;
6593 return ~ $self->_range_list;
6594 },
6595 ;
6596
6597 sub _operator_stringify {
6598 my $self = shift;
6599
23e33b60 6600 my $name = $self->complete_name;
99870f4d
KW
6601 return "Table '$name'";
6602 }
6603
6604 sub add_alias {
6605 # Add a synonym for this table. See the comments in the base class
6606
6607 my $self = shift;
6608 my $name = shift;
6609 # Rest of parameters passed on.
6610
6611 $self->SUPER::add_alias($name, $self, @_);
6612 return;
6613 }
6614
6615 sub add_conflicting {
6616 # Add the name of some other object to the list of ones that name
6617 # clash with this match table.
6618
6619 my $self = shift;
6620 my $conflicting_name = shift; # The name of the conflicting object
6621 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6622 my $conflicting_object = shift; # Optional, the conflicting object
6623 # itself. This is used to
6624 # disambiguate the text if the input
6625 # name is identical to any of the
6626 # aliases $self is known by.
6627 # Sometimes the conflicting object is
6628 # merely hypothetical, so this has to
6629 # be an optional parameter.
6630 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6631
ffe43484 6632 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6633
6634 # Check if the conflicting name is exactly the same as any existing
6635 # alias in this table (as long as there is a real object there to
6636 # disambiguate with).
6637 if (defined $conflicting_object) {
6638 foreach my $alias ($self->aliases) {
6639 if ($alias->name eq $conflicting_name) {
6640
6641 # Here, there is an exact match. This results in
6642 # ambiguous comments, so disambiguate by changing the
6643 # conflicting name to its object's complete equivalent.
6644 $conflicting_name = $conflicting_object->complete_name;
6645 last;
6646 }
6647 }
6648 }
6649
6650 # Convert to the \p{...} final name
6651 $conflicting_name = "\\$p" . "{$conflicting_name}";
6652
6653 # Only add once
6654 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6655
6656 push @{$conflicting{$addr}}, $conflicting_name;
6657
6658 return;
6659 }
6660
6505c6e2 6661 sub is_set_equivalent_to {
99870f4d
KW
6662 # Return boolean of whether or not the other object is a table of this
6663 # type and has been marked equivalent to this one.
6664
6665 my $self = shift;
6666 my $other = shift;
6667 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6668
6669 return 0 if ! defined $other; # Can happen for incomplete early
6670 # releases
6671 unless ($other->isa(__PACKAGE__)) {
6672 my $ref_other = ref $other;
6673 my $ref_self = ref $self;
6505c6e2 6674 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
99870f4d
KW
6675 return 0;
6676 }
6677
6678 # Two tables are equivalent if they have the same leader.
f998e60c 6679 no overloading;
051df77b 6680 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6681 return;
6682 }
6683
99870f4d
KW
6684 sub set_equivalent_to {
6685 # Set $self equivalent to the parameter table.
6686 # The required Related => 'x' parameter is a boolean indicating
6687 # whether these tables are related or not. If related, $other becomes
6688 # the 'parent' of $self; if unrelated it becomes the 'leader'
6689 #
6690 # Related tables share all characteristics except names; equivalents
6691 # not quite so many.
6692 # If they are related, one must be a perl extension. This is because
6693 # we can't guarantee that Unicode won't change one or the other in a
98dc9551 6694 # later release even if they are identical now.
99870f4d
KW
6695
6696 my $self = shift;
6697 my $other = shift;
6698
6699 my %args = @_;
6700 my $related = delete $args{'Related'};
6701
6702 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6703
6704 return if ! defined $other; # Keep on going; happens in some early
6705 # Unicode releases.
6706
6707 if (! defined $related) {
6708 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6709 $related = 0;
6710 }
6711
6712 # If already are equivalent, no need to re-do it; if subroutine
6713 # returns null, it found an error, also do nothing
6505c6e2 6714 my $are_equivalent = $self->is_set_equivalent_to($other);
99870f4d
KW
6715 return if ! defined $are_equivalent || $are_equivalent;
6716
ffe43484 6717 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6718 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d 6719
45e32b91
KW
6720 if ($related) {
6721 if ($current_leader->perl_extension) {
6722 if ($other->perl_extension) {
6723 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
6724 return;
6725 }
6726 } elsif (! $other->perl_extension) {
6727 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6728 $related = 0;
6729 }
6730 }
6731
6732 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
6733 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
6734 return;
99870f4d
KW
6735 }
6736
ffe43484
NC
6737 my $leader = do { no overloading; pack 'J', $current_leader; };
6738 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6739
6740 # Any tables that are equivalent to or children of this table must now
6741 # instead be equivalent to or (children) to the new leader (parent),
6742 # still equivalent. The equivalency includes their matches_all info,
6743 # and for related tables, their status
6744 # All related tables are of necessity equivalent, but the converse
6745 # isn't necessarily true
6746 my $status = $other->status;
6747 my $status_info = $other->status_info;
6748 my $matches_all = $matches_all{other_addr};
d867ccfb 6749 my $caseless_equivalent = $other->caseless_equivalent;
99870f4d
KW
6750 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6751 next if $table == $other;
6752 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6753
ffe43484 6754 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6755 $leader{$table_addr} = $other;
6756 $matches_all{$table_addr} = $matches_all;
6757 $self->_set_range_list($other->_range_list);
6758 push @{$equivalents{$other_addr}}, $table;
6759 if ($related) {
6760 $parent{$table_addr} = $other;
6761 push @{$children{$other_addr}}, $table;
6762 $table->set_status($status, $status_info);
d867ccfb 6763 $self->set_caseless_equivalent($caseless_equivalent);
99870f4d
KW
6764 }
6765 }
6766
6767 # Now that we've declared these to be equivalent, any changes to one
6768 # of the tables would invalidate that equivalency.
6769 $self->lock;
6770 $other->lock;
6771 return;
6772 }
6773
6774 sub add_range { # Add a range to the list for this table.
6775 my $self = shift;
6776 # Rest of parameters passed on
6777
6778 return if $self->carp_if_locked;
6779 return $self->_range_list->add_range(@_);
6780 }
6781
99870f4d
KW
6782 sub pre_body { # Does nothing for match tables.
6783 return
6784 }
6785
6786 sub append_to_body { # Does nothing for match tables.
6787 return
6788 }
6789
6790 sub write {
6791 my $self = shift;
6792 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6793
6794 return $self->SUPER::write(2); # 2 tab stops
6795 }
6796
6797 sub set_final_comment {
6798 # This creates a comment for the file that is to hold the match table
6799 # $self. It is somewhat convoluted to make the English read nicely,
6800 # but, heh, it's just a comment.
6801 # This should be called only with the leader match table of all the
6802 # ones that share the same file. It lists all such tables, ordered so
6803 # that related ones are together.
6804
bd9ebcfd
KW
6805 return unless $debugging_build;
6806
99870f4d
KW
6807 my $leader = shift; # Should only be called on the leader table of
6808 # an equivalent group
6809 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6810
ffe43484 6811 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6812
6813 if ($leader{$addr} != $leader) {
6814 Carp::my_carp_bug(<<END
6815set_final_comment() must be called on a leader table, which $leader is not.
6816It is equivalent to $leader{$addr}. No comment created
6817END
6818 );
6819 return;
6820 }
6821
6822 # Get the number of code points matched by each of the tables in this
6823 # file, and add underscores for clarity.
6824 my $count = $leader->count;
6825 my $string_count = main::clarify_number($count);
6826
6827 my $loose_count = 0; # how many aliases loosely matched
6828 my $compound_name = ""; # ? Are any names compound?, and if so, an
6829 # example
6830 my $properties_with_compound_names = 0; # count of these
6831
6832
6833 my %flags; # The status flags used in the file
6834 my $total_entries = 0; # number of entries written in the comment
6835 my $matches_comment = ""; # The portion of the comment about the
6836 # \p{}'s
6837 my @global_comments; # List of all the tables' comments that are
6838 # there before this routine was called.
6839
6840 # Get list of all the parent tables that are equivalent to this one
6841 # (including itself).
6842 my @parents = grep { $parent{main::objaddr $_} == $_ }
6843 main::uniques($leader, @{$equivalents{$addr}});
6844 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6845 # tables
6846
6847 for my $parent (@parents) {
6848
6849 my $property = $parent->property;
6850
6851 # Special case 'N' tables in properties with two match tables when
6852 # the other is a 'Y' one. These are likely to be binary tables,
6853 # but not necessarily. In either case, \P{} will match the
6854 # complement of \p{}, and so if something is a synonym of \p, the
6855 # complement of that something will be the synonym of \P. This
6856 # would be true of any property with just two match tables, not
6857 # just those whose values are Y and N; but that would require a
6858 # little extra work, and there are none such so far in Unicode.
6859 my $perl_p = 'p'; # which is it? \p{} or \P{}
6860 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6861
6862 if (scalar $property->tables == 2
6863 && $parent == $property->table('N')
6864 && defined (my $yes = $property->table('Y')))
6865 {
ffe43484 6866 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6867 @yes_perl_synonyms
6868 = grep { $_->property == $perl }
6869 main::uniques($yes,
6870 $parent{$yes_addr},
6871 $parent{$yes_addr}->children);
6872
6873 # But these synonyms are \P{} ,not \p{}
6874 $perl_p = 'P';
6875 }
6876
6877 my @description; # Will hold the table description
6878 my @note; # Will hold the table notes.
6879 my @conflicting; # Will hold the table conflicts.
6880
6881 # Look at the parent, any yes synonyms, and all the children
ffe43484 6882 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6883 for my $table ($parent,
6884 @yes_perl_synonyms,
f998e60c 6885 @{$children{$parent_addr}})
99870f4d 6886 {
ffe43484 6887 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6888 my $table_property = $table->property;
6889
6890 # Tables are separated by a blank line to create a grouping.
6891 $matches_comment .= "\n" if $matches_comment;
6892
6893 # The table is named based on the property and value
6894 # combination it is for, like script=greek. But there may be
6895 # a number of synonyms for each side, like 'sc' for 'script',
6896 # and 'grek' for 'greek'. Any combination of these is a valid
6897 # name for this table. In this case, there are three more,
6898 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6899 # listing all possible combinations in the comment, we make
6900 # sure that each synonym occurs at least once, and add
6901 # commentary that the other combinations are possible.
6902 my @property_aliases = $table_property->aliases;
6903 my @table_aliases = $table->aliases;
6904
6905 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6906
6907 # The alias lists above are already ordered in the order we
6908 # want to output them. To ensure that each synonym is listed,
6909 # we must use the max of the two numbers.
6910 my $listed_combos = main::max(scalar @table_aliases,
6911 scalar @property_aliases);
6912 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6913
6914 my $property_had_compound_name = 0;
6915
6916 for my $i (0 .. $listed_combos - 1) {
6917 $total_entries++;
6918
6919 # The current alias for the property is the next one on
6920 # the list, or if beyond the end, start over. Similarly
6921 # for the table (\p{prop=table})
6922 my $property_alias = $property_aliases
6923 [$i % @property_aliases]->name;
6924 my $table_alias_object = $table_aliases
6925 [$i % @table_aliases];
6926 my $table_alias = $table_alias_object->name;
6927 my $loose_match = $table_alias_object->loose_match;
6928
6929 if ($table_alias !~ /\D/) { # Clarify large numbers.
6930 $table_alias = main::clarify_number($table_alias)
6931 }
6932
6933 # Add a comment for this alias combination
6934 my $current_match_comment;
6935 if ($table_property == $perl) {
6936 $current_match_comment = "\\$perl_p"
6937 . "{$table_alias}";
6938 }
6939 else {
6940 $current_match_comment
6941 = "\\p{$property_alias=$table_alias}";
6942 $property_had_compound_name = 1;
6943 }
6944
6945 # Flag any abnormal status for this table.
6946 my $flag = $property->status
6947 || $table->status
6948 || $table_alias_object->status;
37e2e78e
KW
6949 if ($flag) {
6950 if ($flag ne $PLACEHOLDER) {
6951 $flags{$flag} = $status_past_participles{$flag};
6952 } else {
6953 $flags{$flag} = <<END;
6954a placeholder because it is not in Version $string_version of Unicode, but is
6955needed by the Perl core to work gracefully. Because it is not in this version
6956of Unicode, it will not be listed in $pod_file.pod
6957END
6958 }
6959 }
99870f4d
KW
6960
6961 $loose_count++;
6962
6963 # Pretty up the comment. Note the \b; it says don't make
6964 # this line a continuation.
6965 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6966 $flag,
6967 " " x 7,
6968 $current_match_comment);
6969 } # End of generating the entries for this table.
6970
6971 # Save these for output after this group of related tables.
6972 push @description, $table->description;
6973 push @note, $table->note;
6974 push @conflicting, $table->conflicting;
6975
37e2e78e
KW
6976 # And this for output after all the tables.
6977 push @global_comments, $table->comment;
6978
99870f4d
KW
6979 # Compute an alternate compound name using the final property
6980 # synonym and the first table synonym with a colon instead of
6981 # the equal sign used elsewhere.
6982 if ($property_had_compound_name) {
6983 $properties_with_compound_names ++;
6984 if (! $compound_name || @property_aliases > 1) {
6985 $compound_name = $property_aliases[-1]->name
6986 . ': '
6987 . $table_aliases[0]->name;
6988 }
6989 }
6990 } # End of looping through all children of this table
6991
6992 # Here have assembled in $matches_comment all the related tables
6993 # to the current parent (preceded by the same info for all the
6994 # previous parents). Put out information that applies to all of
6995 # the current family.
6996 if (@conflicting) {
6997
6998 # But output the conflicting information now, as it applies to
6999 # just this table.
7000 my $conflicting = join ", ", @conflicting;
7001 if ($conflicting) {
7002 $matches_comment .= <<END;
7003
7004 Note that contrary to what you might expect, the above is NOT the same as
7005END
7006 $matches_comment .= "any of: " if @conflicting > 1;
7007 $matches_comment .= "$conflicting\n";
7008 }
7009 }
7010 if (@description) {
7011 $matches_comment .= "\n Meaning: "
7012 . join('; ', @description)
7013 . "\n";
7014 }
7015 if (@note) {
7016 $matches_comment .= "\n Note: "
7017 . join("\n ", @note)
7018 . "\n";
7019 }
7020 } # End of looping through all tables
7021
7022
7023 my $code_points;
7024 my $match;
7025 my $any_of_these;
7026 if ($count == 1) {
7027 $match = 'matches';
7028 $code_points = 'single code point';
7029 }
7030 else {
7031 $match = 'match';
7032 $code_points = "$string_count code points";
7033 }
7034
7035 my $synonyms;
7036 my $entries;
7037 if ($total_entries <= 1) {
7038 $synonyms = "";
7039 $entries = 'entry';
7040 $any_of_these = 'this'
7041 }
7042 else {
7043 $synonyms = " any of the following regular expression constructs";
7044 $entries = 'entries';
7045 $any_of_these = 'any of these'
7046 }
7047
7048 my $comment = "";
7049 if ($has_unrelated) {
7050 $comment .= <<END;
7051This file is for tables that are not necessarily related: To conserve
7052resources, every table that matches the identical set of code points in this
7053version of Unicode uses this file. Each one is listed in a separate group
7054below. It could be that the tables will match the same set of code points in
7055other Unicode releases, or it could be purely coincidence that they happen to
7056be the same in Unicode $string_version, and hence may not in other versions.
7057
7058END
7059 }
7060
7061 if (%flags) {
7062 foreach my $flag (sort keys %flags) {
7063 $comment .= <<END;
37e2e78e 7064'$flag' below means that this form is $flags{$flag}.
99870f4d 7065END
37e2e78e
KW
7066 next if $flag eq $PLACEHOLDER;
7067 $comment .= "Consult $pod_file.pod\n";
99870f4d
KW
7068 }
7069 $comment .= "\n";
7070 }
7071
7072 $comment .= <<END;
7073This file returns the $code_points in Unicode Version $string_version that
7074$match$synonyms:
7075
7076$matches_comment
37e2e78e 7077$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
7078including if adding or subtracting white space, underscore, and hyphen
7079characters matters or doesn't matter, and other permissible syntactic
7080variants. Upper/lower case distinctions never matter.
7081END
7082
7083 if ($compound_name) {
7084 $comment .= <<END;
7085
7086A colon can be substituted for the equals sign, and
7087END
7088 if ($properties_with_compound_names > 1) {
7089 $comment .= <<END;
7090within each group above,
7091END
7092 }
7093 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7094
7095 # Note the \b below, it says don't make that line a continuation.
7096 $comment .= <<END;
7097anything to the left of the equals (or colon) can be combined with anything to
7098the right. Thus, for example,
7099$compound_name
7100\bis also valid.
7101END
7102 }
7103
7104 # And append any comment(s) from the actual tables. They are all
7105 # gathered here, so may not read all that well.
37e2e78e
KW
7106 if (@global_comments) {
7107 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7108 }
99870f4d
KW
7109
7110 if ($count) { # The format differs if no code points, and needs no
7111 # explanation in that case
7112 $comment.= <<END;
7113
7114The format of the lines of this file is:
7115END
7116 $comment.= <<END;
7117START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7118STOP is the ending point, or if omitted, the range has just one code point.
7119END
0c07e538 7120 if ($leader->output_range_counts) {
99870f4d
KW
7121 $comment .= <<END;
7122Numbers in comments in [brackets] indicate how many code points are in the
7123range.
7124END
7125 }
7126 }
7127
7128 $leader->set_comment(main::join_lines($comment));
7129 return;
7130 }
7131
7132 # Accessors for the underlying list
ea25a9b2 7133 for my $sub (qw(
99870f4d
KW
7134 get_valid_code_point
7135 get_invalid_code_point
ea25a9b2 7136 ))
99870f4d
KW
7137 {
7138 no strict "refs";
7139 *$sub = sub {
7140 use strict "refs";
7141 my $self = shift;
7142
7143 return $self->_range_list->$sub(@_);
7144 }
7145 }
7146} # End closure for Match_Table
7147
7148package Property;
7149
7150# The Property class represents a Unicode property, or the $perl
7151# pseudo-property. It contains a map table initialized empty at construction
7152# time, and for properties accessible through regular expressions, various
7153# match tables, created through the add_match_table() method, and referenced
7154# by the table('NAME') or tables() methods, the latter returning a list of all
7155# of the match tables. Otherwise table operations implicitly are for the map
7156# table.
7157#
7158# Most of the data in the property is actually about its map table, so it
7159# mostly just uses that table's accessors for most methods. The two could
7160# have been combined into one object, but for clarity because of their
7161# differing semantics, they have been kept separate. It could be argued that
7162# the 'file' and 'directory' fields should be kept with the map table.
7163#
7164# Each property has a type. This can be set in the constructor, or in the
7165# set_type accessor, but mostly it is figured out by the data. Every property
7166# starts with unknown type, overridden by a parameter to the constructor, or
7167# as match tables are added, or ranges added to the map table, the data is
7168# inspected, and the type changed. After the table is mostly or entirely
7169# filled, compute_type() should be called to finalize they analysis.
7170#
7171# There are very few operations defined. One can safely remove a range from
7172# the map table, and property_add_or_replace_non_nulls() adds the maps from another
7173# table to this one, replacing any in the intersection of the two.
7174
7175sub standardize { return main::standardize($_[0]); }
7176sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7177
7178{ # Closure
7179
7180 # This hash will contain as keys, all the aliases of all properties, and
7181 # as values, pointers to their respective property objects. This allows
7182 # quick look-up of a property from any of its names.
7183 my %alias_to_property_of;
7184
7185 sub dump_alias_to_property_of {
7186 # For debugging
7187
7188 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7189 return;
7190 }
7191
7192 sub property_ref {
7193 # This is a package subroutine, not called as a method.
7194 # If the single parameter is a literal '*' it returns a list of all
7195 # defined properties.
7196 # Otherwise, the single parameter is a name, and it returns a pointer
7197 # to the corresponding property object, or undef if none.
7198 #
7199 # Properties can have several different names. The 'standard' form of
7200 # each of them is stored in %alias_to_property_of as they are defined.
7201 # But it's possible that this subroutine will be called with some
7202 # variant, so if the initial lookup fails, it is repeated with the
98dc9551 7203 # standardized form of the input name. If found, besides returning the
99870f4d
KW
7204 # result, the input name is added to the list so future calls won't
7205 # have to do the conversion again.
7206
7207 my $name = shift;
7208
7209 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7210
7211 if (! defined $name) {
7212 Carp::my_carp_bug("Undefined input property. No action taken.");
7213 return;
7214 }
7215
7216 return main::uniques(values %alias_to_property_of) if $name eq '*';
7217
7218 # Return cached result if have it.
7219 my $result = $alias_to_property_of{$name};
7220 return $result if defined $result;
7221
7222 # Convert the input to standard form.
7223 my $standard_name = standardize($name);
7224
7225 $result = $alias_to_property_of{$standard_name};
7226 return unless defined $result; # Don't cache undefs
7227
7228 # Cache the result before returning it.
7229 $alias_to_property_of{$name} = $result;
7230 return $result;
7231 }
7232
7233
7234 main::setup_package();
7235
7236 my %map;
7237 # A pointer to the map table object for this property
7238 main::set_access('map', \%map);
7239
7240 my %full_name;
7241 # The property's full name. This is a duplicate of the copy kept in the
7242 # map table, but is needed because stringify needs it during
7243 # construction of the map table, and then would have a chicken before egg
7244 # problem.
7245 main::set_access('full_name', \%full_name, 'r');
7246
7247 my %table_ref;
7248 # This hash will contain as keys, all the aliases of any match tables
7249 # attached to this property, and as values, the pointers to their
7250 # respective tables. This allows quick look-up of a table from any of its
7251 # names.
7252 main::set_access('table_ref', \%table_ref);
7253
7254 my %type;
7255 # The type of the property, $ENUM, $BINARY, etc
7256 main::set_access('type', \%type, 'r');
7257
7258 my %file;
7259 # The filename where the map table will go (if actually written).
7260 # Normally defaulted, but can be overridden.
7261 main::set_access('file', \%file, 'r', 's');
7262
7263 my %directory;
7264 # The directory where the map table will go (if actually written).
7265 # Normally defaulted, but can be overridden.
7266 main::set_access('directory', \%directory, 's');
7267
7268 my %pseudo_map_type;
7269 # This is used to affect the calculation of the map types for all the
7270 # ranges in the table. It should be set to one of the values that signify
7271 # to alter the calculation.
7272 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7273
7274 my %has_only_code_point_maps;
7275 # A boolean used to help in computing the type of data in the map table.
7276 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7277
7278 my %unique_maps;
7279 # A list of the first few distinct mappings this property has. This is
7280 # used to disambiguate between binary and enum property types, so don't
7281 # have to keep more than three.
7282 main::set_access('unique_maps', \%unique_maps);
7283
56557540
KW
7284 my %pre_declared_maps;
7285 # A boolean that gives whether the input data should declare all the
7286 # tables used, or not. If the former, unknown ones raise a warning.
7287 main::set_access('pre_declared_maps',
7288 \%pre_declared_maps, 'r');
7289
99870f4d
KW
7290 sub new {
7291 # The only required parameter is the positionally first, name. All
7292 # other parameters are key => value pairs. See the documentation just
7293 # above for the meanings of the ones not passed directly on to the map
7294 # table constructor.
7295
7296 my $class = shift;
7297 my $name = shift || "";
7298
7299 my $self = property_ref($name);
7300 if (defined $self) {
7301 my $options_string = join ", ", @_;
7302 $options_string = ". Ignoring options $options_string" if $options_string;
7303 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7304 return $self;
7305 }
7306
7307 my %args = @_;
7308
7309 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 7310 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7311
7312 $directory{$addr} = delete $args{'Directory'};
7313 $file{$addr} = delete $args{'File'};
7314 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7315 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7316 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
56557540
KW
7317 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7318 # Starting in this release, property
7319 # values should be defined for all
7320 # properties, except those overriding this
7321 // $v_version ge v5.1.0;
99870f4d
KW
7322 # Rest of parameters passed on.
7323
7324 $has_only_code_point_maps{$addr} = 1;
7325 $table_ref{$addr} = { };
7326 $unique_maps{$addr} = { };
7327
7328 $map{$addr} = Map_Table->new($name,
7329 Full_Name => $full_name{$addr},
7330 _Alias_Hash => \%alias_to_property_of,
7331 _Property => $self,
7332 %args);
7333 return $self;
7334 }
7335
7336 # See this program's beginning comment block about overloading the copy
7337 # constructor. Few operations are defined on properties, but a couple are
7338 # useful. It is safe to take the inverse of a property, and to remove a
7339 # single code point from it.
7340 use overload
7341 fallback => 0,
7342 qw("") => "_operator_stringify",
7343 "." => \&main::_operator_dot,
7344 '==' => \&main::_operator_equal,
7345 '!=' => \&main::_operator_not_equal,
7346 '=' => sub { return shift },
7347 '-=' => "_minus_and_equal",
7348 ;
7349
7350 sub _operator_stringify {
7351 return "Property '" . shift->full_name . "'";
7352 }
7353
7354 sub _minus_and_equal {
7355 # Remove a single code point from the map table of a property.
7356
7357 my $self = shift;
7358 my $other = shift;
7359 my $reversed = shift;
7360 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7361
7362 if (ref $other) {
7363 Carp::my_carp_bug("Can't cope with a "
7364 . ref($other)
7365 . " argument to '-='. Subtraction ignored.");
7366 return $self;
7367 }
98dc9551 7368 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
99870f4d
KW
7369 Carp::my_carp_bug("Can't cope with a "
7370 . __PACKAGE__
7371 . " being the first parameter in a '-='. Subtraction ignored.");
7372 return $self;
7373 }
7374 else {
f998e60c 7375 no overloading;
051df77b 7376 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
7377 }
7378 return $self;
7379 }
7380
7381 sub add_match_table {
7382 # Add a new match table for this property, with name given by the
7383 # parameter. It returns a pointer to the table.
7384
7385 my $self = shift;
7386 my $name = shift;
7387 my %args = @_;
7388
ffe43484 7389 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7390
7391 my $table = $table_ref{$addr}{$name};
7392 my $standard_name = main::standardize($name);
7393 if (defined $table
7394 || (defined ($table = $table_ref{$addr}{$standard_name})))
7395 {
7396 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7397 $table_ref{$addr}{$name} = $table;
7398 return $table;
7399 }
7400 else {
7401
7402 # See if this is a perl extension, if not passed in.
7403 my $perl_extension = delete $args{'Perl_Extension'};
7404 $perl_extension
7405 = $self->perl_extension if ! defined $perl_extension;
7406
7407 $table = Match_Table->new(
7408 Name => $name,
7409 Perl_Extension => $perl_extension,
7410 _Alias_Hash => $table_ref{$addr},
7411 _Property => $self,
7412
7413 # gets property's status by default
7414 Status => $self->status,
7415 _Status_Info => $self->status_info,
7416 %args,
7417 Internal_Only_Warning => 1); # Override any
7418 # input param
7419 return unless defined $table;
7420 }
7421
7422 # Save the names for quick look up
7423 $table_ref{$addr}{$standard_name} = $table;
7424 $table_ref{$addr}{$name} = $table;
7425
7426 # Perhaps we can figure out the type of this property based on the
7427 # fact of adding this match table. First, string properties don't
7428 # have match tables; second, a binary property can't have 3 match
7429 # tables
7430 if ($type{$addr} == $UNKNOWN) {
7431 $type{$addr} = $NON_STRING;
7432 }
7433 elsif ($type{$addr} == $STRING) {
7434 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7435 $type{$addr} = $NON_STRING;
7436 }
7437 elsif ($type{$addr} != $ENUM) {
7438 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7439 && $type{$addr} == $BINARY)
7440 {
7441 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7442 $type{$addr} = $ENUM;
7443 }
7444 }
7445
7446 return $table;
7447 }
7448
4b9b0bc5
KW
7449 sub delete_match_table {
7450 # Delete the table referred to by $2 from the property $1.
7451
7452 my $self = shift;
7453 my $table_to_remove = shift;
7454 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7455
7456 my $addr = do { no overloading; pack 'J', $self; };
7457
7458 # Remove all names that refer to it.
7459 foreach my $key (keys %{$table_ref{$addr}}) {
7460 delete $table_ref{$addr}{$key}
7461 if $table_ref{$addr}{$key} == $table_to_remove;
7462 }
7463
7464 $table_to_remove->DESTROY;
7465 return;
7466 }
7467
99870f4d
KW
7468 sub table {
7469 # Return a pointer to the match table (with name given by the
7470 # parameter) associated with this property; undef if none.
7471
7472 my $self = shift;
7473 my $name = shift;
7474 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7475
ffe43484 7476 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7477
7478 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7479
7480 # If quick look-up failed, try again using the standard form of the
7481 # input name. If that succeeds, cache the result before returning so
7482 # won't have to standardize this input name again.
7483 my $standard_name = main::standardize($name);
7484 return unless defined $table_ref{$addr}{$standard_name};
7485
7486 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7487 return $table_ref{$addr}{$name};
7488 }
7489
7490 sub tables {
7491 # Return a list of pointers to all the match tables attached to this
7492 # property
7493
f998e60c 7494 no overloading;
051df77b 7495 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
7496 }
7497
7498 sub directory {
7499 # Returns the directory the map table for this property should be
7500 # output in. If a specific directory has been specified, that has
7501 # priority; 'undef' is returned if the type isn't defined;
7502 # or $map_directory for everything else.
7503
ffe43484 7504 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
7505
7506 return $directory{$addr} if defined $directory{$addr};
7507 return undef if $type{$addr} == $UNKNOWN;
7508 return $map_directory;
7509 }
7510
7511 sub swash_name {
7512 # Return the name that is used to both:
7513 # 1) Name the file that the map table is written to.
7514 # 2) The name of swash related stuff inside that file.
7515 # The reason for this is that the Perl core historically has used
7516 # certain names that aren't the same as the Unicode property names.
7517 # To continue using these, $file is hard-coded in this file for those,
7518 # but otherwise the standard name is used. This is different from the
7519 # external_name, so that the rest of the files, like in lib can use
7520 # the standard name always, without regard to historical precedent.
7521
7522 my $self = shift;
7523 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7524
ffe43484 7525 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7526
7527 return $file{$addr} if defined $file{$addr};
7528 return $map{$addr}->external_name;
7529 }
7530
7531 sub to_create_match_tables {
7532 # Returns a boolean as to whether or not match tables should be
7533 # created for this property.
7534
7535 my $self = shift;
7536 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7537
7538 # The whole point of this pseudo property is match tables.
7539 return 1 if $self == $perl;
7540
ffe43484 7541 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7542
7543 # Don't generate tables of code points that match the property values
7544 # of a string property. Such a list would most likely have many
7545 # property values, each with just one or very few code points mapping
7546 # to it.
7547 return 0 if $type{$addr} == $STRING;
7548
7549 # Don't generate anything for unimplemented properties.
7550 return 0 if grep { $self->complete_name eq $_ }
7551 @unimplemented_properties;
7552 # Otherwise, do.
7553 return 1;
7554 }
7555
7556 sub property_add_or_replace_non_nulls {
7557 # This adds the mappings in the property $other to $self. Non-null
7558 # mappings from $other override those in $self. It essentially merges
7559 # the two properties, with the second having priority except for null
7560 # mappings.
7561
7562 my $self = shift;
7563 my $other = shift;
7564 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7565
7566 if (! $other->isa(__PACKAGE__)) {
7567 Carp::my_carp_bug("$other should be a "
7568 . __PACKAGE__
7569 . ". Not a '"
7570 . ref($other)
7571 . "'. Not added;");
7572 return;
7573 }
7574
f998e60c 7575 no overloading;
051df77b 7576 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
7577 }
7578
7579 sub set_type {
7580 # Set the type of the property. Mostly this is figured out by the
7581 # data in the table. But this is used to set it explicitly. The
7582 # reason it is not a standard accessor is that when setting a binary
7583 # property, we need to make sure that all the true/false aliases are
7584 # present, as they were omitted in early Unicode releases.
7585
7586 my $self = shift;
7587 my $type = shift;
7588 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7589
7590 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
7591 Carp::my_carp("Unrecognized type '$type'. Type not set");
7592 return;
7593 }
7594
051df77b 7595 { no overloading; $type{pack 'J', $self} = $type; }
99870f4d
KW
7596 return if $type != $BINARY;
7597
7598 my $yes = $self->table('Y');
7599 $yes = $self->table('Yes') if ! defined $yes;
7600 $yes = $self->add_match_table('Y') if ! defined $yes;
7601 $yes->add_alias('Yes');
7602 $yes->add_alias('T');
7603 $yes->add_alias('True');
7604
7605 my $no = $self->table('N');
7606 $no = $self->table('No') if ! defined $no;
7607 $no = $self->add_match_table('N') if ! defined $no;
7608 $no->add_alias('No');
7609 $no->add_alias('F');
7610 $no->add_alias('False');
7611 return;
7612 }
7613
7614 sub add_map {
7615 # Add a map to the property's map table. This also keeps
7616 # track of the maps so that the property type can be determined from
7617 # its data.
7618
7619 my $self = shift;
7620 my $start = shift; # First code point in range
7621 my $end = shift; # Final code point in range
7622 my $map = shift; # What the range maps to.
7623 # Rest of parameters passed on.
7624
ffe43484 7625 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7626
7627 # If haven't the type of the property, gather information to figure it
7628 # out.
7629 if ($type{$addr} == $UNKNOWN) {
7630
7631 # If the map contains an interior blank or dash, or most other
7632 # nonword characters, it will be a string property. This
7633 # heuristic may actually miss some string properties. If so, they
7634 # may need to have explicit set_types called for them. This
7635 # happens in the Unihan properties.
7636 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7637 || $map =~ / [^\w.\/\ -] /x)
7638 {
7639 $self->set_type($STRING);
7640
7641 # $unique_maps is used for disambiguating between ENUM and
7642 # BINARY later; since we know the property is not going to be
7643 # one of those, no point in keeping the data around
7644 undef $unique_maps{$addr};
7645 }
7646 else {
7647
7648 # Not necessarily a string. The final decision has to be
7649 # deferred until all the data are in. We keep track of if all
7650 # the values are code points for that eventual decision.
7651 $has_only_code_point_maps{$addr} &=
7652 $map =~ / ^ $code_point_re $/x;
7653
7654 # For the purposes of disambiguating between binary and other
7655 # enumerations at the end, we keep track of the first three
7656 # distinct property values. Once we get to three, we know
7657 # it's not going to be binary, so no need to track more.
7658 if (scalar keys %{$unique_maps{$addr}} < 3) {
7659 $unique_maps{$addr}{main::standardize($map)} = 1;
7660 }
7661 }
7662 }
7663
7664 # Add the mapping by calling our map table's method
7665 return $map{$addr}->add_map($start, $end, $map, @_);
7666 }
7667
7668 sub compute_type {
7669 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7670 # should be called after the property is mostly filled with its maps.
7671 # We have been keeping track of what the property values have been,
7672 # and now have the necessary information to figure out the type.
7673
7674 my $self = shift;
7675 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7676
ffe43484 7677 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7678
7679 my $type = $type{$addr};
7680
7681 # If already have figured these out, no need to do so again, but we do
7682 # a double check on ENUMS to make sure that a string property hasn't
7683 # improperly been classified as an ENUM, so continue on with those.
7684 return if $type == $STRING || $type == $BINARY;
7685
7686 # If every map is to a code point, is a string property.
7687 if ($type == $UNKNOWN
7688 && ($has_only_code_point_maps{$addr}
7689 || (defined $map{$addr}->default_map
7690 && $map{$addr}->default_map eq "")))
7691 {
7692 $self->set_type($STRING);
7693 }
7694 else {
7695
7696 # Otherwise, it is to some sort of enumeration. (The case where
7697 # it is a Unicode miscellaneous property, and treated like a
7698 # string in this program is handled in add_map()). Distinguish
7699 # between binary and some other enumeration type. Of course, if
7700 # there are more than two values, it's not binary. But more
7701 # subtle is the test that the default mapping is defined means it
7702 # isn't binary. This in fact may change in the future if Unicode
7703 # changes the way its data is structured. But so far, no binary
7704 # properties ever have @missing lines for them, so the default map
7705 # isn't defined for them. The few properties that are two-valued
7706 # and aren't considered binary have the default map defined
7707 # starting in Unicode 5.0, when the @missing lines appeared; and
7708 # this program has special code to put in a default map for them
7709 # for earlier than 5.0 releases.
7710 if ($type == $ENUM
7711 || scalar keys %{$unique_maps{$addr}} > 2
7712 || defined $self->default_map)
7713 {
7714 my $tables = $self->tables;
7715 my $count = $self->count;
7716 if ($verbosity && $count > 500 && $tables/$count > .1) {
7717 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7718 }
7719 $self->set_type($ENUM);
7720 }
7721 else {
7722 $self->set_type($BINARY);
7723 }
7724 }
7725 undef $unique_maps{$addr}; # Garbage collect
7726 return;
7727 }
7728
7729 # Most of the accessors for a property actually apply to its map table.
7730 # Setup up accessor functions for those, referring to %map
ea25a9b2 7731 for my $sub (qw(
99870f4d
KW
7732 add_alias
7733 add_anomalous_entry
7734 add_comment
7735 add_conflicting
7736 add_description
7737 add_duplicate
7738 add_note
7739 aliases
7740 comment
7741 complete_name
2f7a8815 7742 containing_range
99870f4d
KW
7743 core_access
7744 count
7745 default_map
7746 delete_range
7747 description
7748 each_range
7749 external_name
7750 file_path
7751 format
7752 initialize
7753 inverse_list
7754 is_empty
7755 name
7756 note
7757 perl_extension
7758 property
7759 range_count
7760 ranges
7761 range_size_1
7762 reset_each_range
7763 set_comment
7764 set_core_access
7765 set_default_map
7766 set_file_path
7767 set_final_comment
7768 set_range_size_1
7769 set_status
7770 set_to_output_map
7771 short_name
7772 status
7773 status_info
7774 to_output_map
0a9dbafc 7775 type_of
99870f4d
KW
7776 value_of
7777 write
ea25a9b2 7778 ))
99870f4d
KW
7779 # 'property' above is for symmetry, so that one can take
7780 # the property of a property and get itself, and so don't
7781 # have to distinguish between properties and tables in
7782 # calling code
7783 {
7784 no strict "refs";
7785 *$sub = sub {
7786 use strict "refs";
7787 my $self = shift;
f998e60c 7788 no overloading;
051df77b 7789 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7790 }
7791 }
7792
7793
7794} # End closure
7795
7796package main;
7797
7798sub join_lines($) {
7799 # Returns lines of the input joined together, so that they can be folded
7800 # properly.
7801 # This causes continuation lines to be joined together into one long line
7802 # for folding. A continuation line is any line that doesn't begin with a
7803 # space or "\b" (the latter is stripped from the output). This is so
7804 # lines can be be in a HERE document so as to fit nicely in the terminal
7805 # width, but be joined together in one long line, and then folded with
7806 # indents, '#' prefixes, etc, properly handled.
7807 # A blank separates the joined lines except if there is a break; an extra
7808 # blank is inserted after a period ending a line.
7809
98dc9551 7810 # Initialize the return with the first line.
99870f4d
KW
7811 my ($return, @lines) = split "\n", shift;
7812
7813 # If the first line is null, it was an empty line, add the \n back in
7814 $return = "\n" if $return eq "";
7815
7816 # Now join the remainder of the physical lines.
7817 for my $line (@lines) {
7818
7819 # An empty line means wanted a blank line, so add two \n's to get that
7820 # effect, and go to the next line.
7821 if (length $line == 0) {
7822 $return .= "\n\n";
7823 next;
7824 }
7825
7826 # Look at the last character of what we have so far.
7827 my $previous_char = substr($return, -1, 1);
7828
7829 # And at the next char to be output.
7830 my $next_char = substr($line, 0, 1);
7831
7832 if ($previous_char ne "\n") {
7833
7834 # Here didn't end wth a nl. If the next char a blank or \b, it
7835 # means that here there is a break anyway. So add a nl to the
7836 # output.
7837 if ($next_char eq " " || $next_char eq "\b") {
7838 $previous_char = "\n";
7839 $return .= $previous_char;
7840 }
7841
7842 # Add an extra space after periods.
7843 $return .= " " if $previous_char eq '.';
7844 }
7845
7846 # Here $previous_char is still the latest character to be output. If
7847 # it isn't a nl, it means that the next line is to be a continuation
7848 # line, with a blank inserted between them.
7849 $return .= " " if $previous_char ne "\n";
7850
7851 # Get rid of any \b
7852 substr($line, 0, 1) = "" if $next_char eq "\b";
7853
7854 # And append this next line.
7855 $return .= $line;
7856 }
7857
7858 return $return;
7859}
7860
7861sub simple_fold($;$$$) {
7862 # Returns a string of the input (string or an array of strings) folded
7863 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7864 # a \n
7865 # This is tailored for the kind of text written by this program,
7866 # especially the pod file, which can have very long names with
7867 # underscores in the middle, or words like AbcDefgHij.... We allow
7868 # breaking in the middle of such constructs if the line won't fit
7869 # otherwise. The break in such cases will come either just after an
7870 # underscore, or just before one of the Capital letters.
7871
7872 local $to_trace = 0 if main::DEBUG;
7873
7874 my $line = shift;
7875 my $prefix = shift; # Optional string to prepend to each output
7876 # line
7877 $prefix = "" unless defined $prefix;
7878
7879 my $hanging_indent = shift; # Optional number of spaces to indent
7880 # continuation lines
7881 $hanging_indent = 0 unless $hanging_indent;
7882
7883 my $right_margin = shift; # Optional number of spaces to narrow the
7884 # total width by.
7885 $right_margin = 0 unless defined $right_margin;
7886
7887 # Call carp with the 'nofold' option to avoid it from trying to call us
7888 # recursively
7889 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7890
7891 # The space available doesn't include what's automatically prepended
7892 # to each line, or what's reserved on the right.
7893 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7894 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7895
7896 if (DEBUG && $hanging_indent >= $max) {
7897 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7898 $hanging_indent = 0;
7899 }
7900
7901 # First, split into the current physical lines.
7902 my @line;
7903 if (ref $line) { # Better be an array, because not bothering to
7904 # test
7905 foreach my $line (@{$line}) {
7906 push @line, split /\n/, $line;
7907 }
7908 }
7909 else {
7910 @line = split /\n/, $line;
7911 }
7912
7913 #local $to_trace = 1 if main::DEBUG;
7914 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7915
7916 # Look at each current physical line.
7917 for (my $i = 0; $i < @line; $i++) {
7918 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7919 #local $to_trace = 1 if main::DEBUG;
7920 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7921
7922 # Remove prefix, because will be added back anyway, don't want
7923 # doubled prefix
7924 $line[$i] =~ s/^$prefix//;
7925
7926 # Remove trailing space
7927 $line[$i] =~ s/\s+\Z//;
7928
7929 # If the line is too long, fold it.
7930 if (length $line[$i] > $max) {
7931 my $remainder;
7932
7933 # Here needs to fold. Save the leading space in the line for
7934 # later.
7935 $line[$i] =~ /^ ( \s* )/x;
7936 my $leading_space = $1;
7937 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7938
7939 # If character at final permissible position is white space,
7940 # fold there, which will delete that white space
7941 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7942 $remainder = substr($line[$i], $max);
7943 $line[$i] = substr($line[$i], 0, $max - 1);
7944 }
7945 else {
7946
7947 # Otherwise fold at an acceptable break char closest to
7948 # the max length. Look at just the maximal initial
7949 # segment of the line
7950 my $segment = substr($line[$i], 0, $max - 1);
7951 if ($segment =~
7952 /^ ( .{$hanging_indent} # Don't look before the
7953 # indent.
7954 \ * # Don't look in leading
7955 # blanks past the indent
7956 [^ ] .* # Find the right-most
7957 (?: # acceptable break:
7958 [ \s = ] # space or equal
7959 | - (?! [.0-9] ) # or non-unary minus.
7960 ) # $1 includes the character
7961 )/x)
7962 {
7963 # Split into the initial part that fits, and remaining
7964 # part of the input
7965 $remainder = substr($line[$i], length $1);
7966 $line[$i] = $1;
7967 trace $line[$i] if DEBUG && $to_trace;
7968 trace $remainder if DEBUG && $to_trace;
7969 }
7970
7971 # If didn't find a good breaking spot, see if there is a
7972 # not-so-good breaking spot. These are just after
7973 # underscores or where the case changes from lower to
7974 # upper. Use \a as a soft hyphen, but give up
7975 # and don't break the line if there is actually a \a
7976 # already in the input. We use an ascii character for the
7977 # soft-hyphen to avoid any attempt by miniperl to try to
7978 # access the files that this program is creating.
7979 elsif ($segment !~ /\a/
7980 && ($segment =~ s/_/_\a/g
7981 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7982 {
7983 # Here were able to find at least one place to insert
7984 # our substitute soft hyphen. Find the right-most one
7985 # and replace it by a real hyphen.
7986 trace $segment if DEBUG && $to_trace;
7987 substr($segment,
7988 rindex($segment, "\a"),
7989 1) = '-';
7990
7991 # Then remove the soft hyphen substitutes.
7992 $segment =~ s/\a//g;
7993 trace $segment if DEBUG && $to_trace;
7994
7995 # And split into the initial part that fits, and
7996 # remainder of the line
7997 my $pos = rindex($segment, '-');
7998 $remainder = substr($line[$i], $pos);
7999 trace $remainder if DEBUG && $to_trace;
8000 $line[$i] = substr($segment, 0, $pos + 1);
8001 }
8002 }
8003
8004 # Here we know if we can fold or not. If we can, $remainder
8005 # is what remains to be processed in the next iteration.
8006 if (defined $remainder) {
8007 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8008
8009 # Insert the folded remainder of the line as a new element
8010 # of the array. (It may still be too long, but we will
8011 # deal with that next time through the loop.) Omit any
8012 # leading space in the remainder.
8013 $remainder =~ s/^\s+//;
8014 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8015
8016 # But then indent by whichever is larger of:
8017 # 1) the leading space on the input line;
8018 # 2) the hanging indent.
8019 # This preserves indentation in the original line.
8020 my $lead = ($leading_space)
8021 ? length $leading_space
8022 : $hanging_indent;
8023 $lead = max($lead, $hanging_indent);
8024 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8025 }
8026 }
8027
8028 # Ready to output the line. Get rid of any trailing space
8029 # And prefix by the required $prefix passed in.
8030 $line[$i] =~ s/\s+$//;
8031 $line[$i] = "$prefix$line[$i]\n";
8032 } # End of looping through all the lines.
8033
8034 return join "", @line;
8035}
8036
8037sub property_ref { # Returns a reference to a property object.
8038 return Property::property_ref(@_);
8039}
8040
8041sub force_unlink ($) {
8042 my $filename = shift;
8043 return unless file_exists($filename);
8044 return if CORE::unlink($filename);
8045
8046 # We might need write permission
8047 chmod 0777, $filename;
8048 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8049 return;
8050}
8051
9218f1cf 8052sub write ($$@) {
9abe8df8
KW
8053 # Given a filename and references to arrays of lines, write the lines of
8054 # each array to the file
99870f4d
KW
8055 # Filename can be given as an arrayref of directory names
8056
9218f1cf 8057 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
99870f4d 8058
9abe8df8 8059 my $file = shift;
9218f1cf 8060 my $use_utf8 = shift;
99870f4d
KW
8061
8062 # Get into a single string if an array, and get rid of, in Unix terms, any
8063 # leading '.'
8064 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8065 $file = File::Spec->canonpath($file);
8066
8067 # If has directories, make sure that they all exist
8068 (undef, my $directories, undef) = File::Spec->splitpath($file);
8069 File::Path::mkpath($directories) if $directories && ! -d $directories;
8070
8071 push @files_actually_output, $file;
8072
99870f4d
KW
8073 force_unlink ($file);
8074
8075 my $OUT;
8076 if (not open $OUT, ">", $file) {
8077 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8078 return;
8079 }
430ada4c 8080
9218f1cf
KW
8081 binmode $OUT, ":utf8" if $use_utf8;
8082
9abe8df8
KW
8083 while (defined (my $lines_ref = shift)) {
8084 unless (@$lines_ref) {
8085 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8086 }
8087
8088 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8089 }
430ada4c
NC
8090 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8091
99870f4d
KW
8092 print "$file written.\n" if $verbosity >= $VERBOSE;
8093
99870f4d
KW
8094 return;
8095}
8096
8097
8098sub Standardize($) {
8099 # This converts the input name string into a standardized equivalent to
8100 # use internally.
8101
8102 my $name = shift;
8103 unless (defined $name) {
8104 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8105 return;
8106 }
8107
8108 # Remove any leading or trailing white space
8109 $name =~ s/^\s+//g;
8110 $name =~ s/\s+$//g;
8111
98dc9551 8112 # Convert interior white space and hyphens into underscores.
99870f4d
KW
8113 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8114
8115 # Capitalize the letter following an underscore, and convert a sequence of
8116 # multiple underscores to a single one
8117 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8118
8119 # And capitalize the first letter, but not for the special cjk ones.
8120 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8121 return $name;
8122}
8123
8124sub standardize ($) {
8125 # Returns a lower-cased standardized name, without underscores. This form
8126 # is chosen so that it can distinguish between any real versus superficial
8127 # Unicode name differences. It relies on the fact that Unicode doesn't
8128 # have interior underscores, white space, nor dashes in any
8129 # stricter-matched name. It should not be used on Unicode code point
8130 # names (the Name property), as they mostly, but not always follow these
8131 # rules.
8132
8133 my $name = Standardize(shift);
8134 return if !defined $name;
8135
8136 $name =~ s/ (?<= .) _ (?= . ) //xg;
8137 return lc $name;
8138}
8139
c85f591a
KW
8140sub utf8_heavy_name ($$) {
8141 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8142 # perhaps this function should be placed somewhere, like Heavy.pl so that
8143 # utf8_heavy can use it directly without duplicating code that can get
8144 # out-of sync.
8145
8146 my $table = shift;
8147 my $alias = shift;
8148 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8149
8150 my $property = $table->property;
8151 $property = ($property == $perl)
8152 ? "" # 'perl' is never explicitly stated
8153 : standardize($property->name) . '=';
8154 if ($alias->loose_match) {
8155 return $property . standardize($alias->name);
8156 }
8157 else {
8158 return lc ($property . $alias->name);
8159 }
8160
8161 return;
8162}
8163
99870f4d
KW
8164{ # Closure
8165
8166 my $indent_increment = " " x 2;
8167 my %already_output;
8168
8169 $main::simple_dumper_nesting = 0;
8170
8171 sub simple_dumper {
8172 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8173 # the real thing as we have to run under miniperl.
8174
8175 # It is designed so that on input it is at the beginning of a line,
8176 # and the final thing output in any call is a trailing ",\n".
8177
8178 my $item = shift;
8179 my $indent = shift;
8180 $indent = "" if ! defined $indent;
8181
8182 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8183
8184 # nesting level is localized, so that as the call stack pops, it goes
8185 # back to the prior value.
8186 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8187 undef %already_output if $main::simple_dumper_nesting == 0;
8188 $main::simple_dumper_nesting++;
8189 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8190
8191 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8192
8193 # Determine the indent for recursive calls.
8194 my $next_indent = $indent . $indent_increment;
8195
8196 my $output;
8197 if (! ref $item) {
8198
8199 # Dump of scalar: just output it in quotes if not a number. To do
8200 # so we must escape certain characters, and therefore need to
8201 # operate on a copy to avoid changing the original
8202 my $copy = $item;
8203 $copy = $UNDEF unless defined $copy;
8204
8205 # Quote non-numbers (numbers also have optional leading '-' and
8206 # fractions)
8207 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
8208
8209 # Escape apostrophe and backslash
8210 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8211 $copy = "'$copy'";
8212 }
8213 $output = "$indent$copy,\n";
8214 }
8215 else {
8216
8217 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 8218 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 8219 if (defined $already_output{$addr}) {
99870f4d
KW
8220 return "${indent}ALREADY OUTPUT: $item\n";
8221 }
f998e60c 8222 $already_output{$addr} = $item;
99870f4d
KW
8223
8224 if (ref $item eq 'ARRAY') {
8225 my $using_brackets;
8226 $output = $indent;
8227 if ($main::simple_dumper_nesting > 1) {
8228 $output .= '[';
8229 $using_brackets = 1;
8230 }
8231 else {
8232 $using_brackets = 0;
8233 }
8234
8235 # If the array is empty, put the closing bracket on the same
8236 # line. Otherwise, recursively add each array element
8237 if (@$item == 0) {
8238 $output .= " ";
8239 }
8240 else {
8241 $output .= "\n";
8242 for (my $i = 0; $i < @$item; $i++) {
8243
8244 # Indent array elements one level
8245 $output .= &simple_dumper($item->[$i], $next_indent);
8246 $output =~ s/\n$//; # Remove trailing nl so as to
8247 $output .= " # [$i]\n"; # add a comment giving the
8248 # array index
8249 }
8250 $output .= $indent; # Indent closing ']' to orig level
8251 }
8252 $output .= ']' if $using_brackets;
8253 $output .= ",\n";
8254 }
8255 elsif (ref $item eq 'HASH') {
8256 my $is_first_line;
8257 my $using_braces;
8258 my $body_indent;
8259
8260 # No surrounding braces at top level
8261 $output .= $indent;
8262 if ($main::simple_dumper_nesting > 1) {
8263 $output .= "{\n";
8264 $is_first_line = 0;
8265 $body_indent = $next_indent;
8266 $next_indent .= $indent_increment;
8267 $using_braces = 1;
8268 }
8269 else {
8270 $is_first_line = 1;
8271 $body_indent = $indent;
8272 $using_braces = 0;
8273 }
8274
8275 # Output hashes sorted alphabetically instead of apparently
8276 # random. Use caseless alphabetic sort
8277 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8278 {
8279 if ($is_first_line) {
8280 $is_first_line = 0;
8281 }
8282 else {
8283 $output .= "$body_indent";
8284 }
8285
8286 # The key must be a scalar, but this recursive call quotes
8287 # it
8288 $output .= &simple_dumper($key);
8289
8290 # And change the trailing comma and nl to the hash fat
8291 # comma for clarity, and so the value can be on the same
8292 # line
8293 $output =~ s/,\n$/ => /;
8294
8295 # Recursively call to get the value's dump.
8296 my $next = &simple_dumper($item->{$key}, $next_indent);
8297
8298 # If the value is all on one line, remove its indent, so
8299 # will follow the => immediately. If it takes more than
8300 # one line, start it on a new line.
8301 if ($next !~ /\n.*\n/) {
8302 $next =~ s/^ *//;
8303 }
8304 else {
8305 $output .= "\n";
8306 }
8307 $output .= $next;
8308 }
8309
8310 $output .= "$indent},\n" if $using_braces;
8311 }
8312 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8313 $output = $indent . ref($item) . "\n";
8314 # XXX see if blessed
8315 }
8316 elsif ($item->can('dump')) {
8317
8318 # By convention in this program, objects furnish a 'dump'
8319 # method. Since not doing any output at this level, just pass
8320 # on the input indent
8321 $output = $item->dump($indent);
8322 }
8323 else {
8324 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8325 }
8326 }
8327 return $output;
8328 }
8329}
8330
8331sub dump_inside_out {
8332 # Dump inside-out hashes in an object's state by converting them to a
8333 # regular hash and then calling simple_dumper on that.
8334
8335 my $object = shift;
8336 my $fields_ref = shift;
8337 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8338
ffe43484 8339 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
8340
8341 my %hash;
8342 foreach my $key (keys %$fields_ref) {
8343 $hash{$key} = $fields_ref->{$key}{$addr};
8344 }
8345
8346 return simple_dumper(\%hash, @_);
8347}
8348
8349sub _operator_dot {
8350 # Overloaded '.' method that is common to all packages. It uses the
8351 # package's stringify method.
8352
8353 my $self = shift;
8354 my $other = shift;
8355 my $reversed = shift;
8356 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8357
8358 $other = "" unless defined $other;
8359
8360 foreach my $which (\$self, \$other) {
8361 next unless ref $$which;
8362 if ($$which->can('_operator_stringify')) {
8363 $$which = $$which->_operator_stringify;
8364 }
8365 else {
8366 my $ref = ref $$which;
ffe43484 8367 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
8368 $$which = "$ref ($addr)";
8369 }
8370 }
8371 return ($reversed)
8372 ? "$other$self"
8373 : "$self$other";
8374}
8375
8376sub _operator_equal {
8377 # Generic overloaded '==' routine. To be equal, they must be the exact
8378 # same object
8379
8380 my $self = shift;
8381 my $other = shift;
8382
8383 return 0 unless defined $other;
8384 return 0 unless ref $other;
f998e60c 8385 no overloading;
2100aa98 8386 return $self == $other;
99870f4d
KW
8387}
8388
8389sub _operator_not_equal {
8390 my $self = shift;
8391 my $other = shift;
8392
8393 return ! _operator_equal($self, $other);
8394}
8395
8396sub process_PropertyAliases($) {
8397 # This reads in the PropertyAliases.txt file, which contains almost all
8398 # the character properties in Unicode and their equivalent aliases:
8399 # scf ; Simple_Case_Folding ; sfc
8400 #
8401 # Field 0 is the preferred short name for the property.
8402 # Field 1 is the full name.
8403 # Any succeeding ones are other accepted names.
8404
8405 my $file= shift;
8406 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8407
8408 # This whole file was non-existent in early releases, so use our own
8409 # internal one.
8410 $file->insert_lines(get_old_property_aliases())
8411 if ! -e 'PropertyAliases.txt';
8412
8413 # Add any cjk properties that may have been defined.
8414 $file->insert_lines(@cjk_properties);
8415
8416 while ($file->next_line) {
8417
8418 my @data = split /\s*;\s*/;
8419
8420 my $full = $data[1];
8421
8422 my $this = Property->new($data[0], Full_Name => $full);
8423
8424 # Start looking for more aliases after these two.
8425 for my $i (2 .. @data - 1) {
8426 $this->add_alias($data[$i]);
8427 }
8428
8429 }
8430 return;
8431}
8432
8433sub finish_property_setup {
8434 # Finishes setting up after PropertyAliases.
8435
8436 my $file = shift;
8437 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8438
8439 # This entry was missing from this file in earlier Unicode versions
8440 if (-e 'Jamo.txt') {
8441 my $jsn = property_ref('JSN');
8442 if (! defined $jsn) {
8443 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
8444 }
8445 }
8446
5f7264c7 8447 # This entry is still missing as of 6.0, perhaps because no short name for
99870f4d
KW
8448 # it.
8449 if (-e 'NameAliases.txt') {
8450 my $aliases = property_ref('Name_Alias');
8451 if (! defined $aliases) {
8452 $aliases = Property->new('Name_Alias');
8453 }
8454 }
8455
8456 # These are used so much, that we set globals for them.
8457 $gc = property_ref('General_Category');
8458 $block = property_ref('Block');
8459
8460 # Perl adds this alias.
8461 $gc->add_alias('Category');
8462
8463 # For backwards compatibility, these property files have particular names.
8464 my $upper = property_ref('Uppercase_Mapping');
8465 $upper->set_core_access('uc()');
8466 $upper->set_file('Upper'); # This is what utf8.c calls it
8467
8468 my $lower = property_ref('Lowercase_Mapping');
8469 $lower->set_core_access('lc()');
8470 $lower->set_file('Lower');
8471
8472 my $title = property_ref('Titlecase_Mapping');
8473 $title->set_core_access('ucfirst()');
8474 $title->set_file('Title');
8475
8476 my $fold = property_ref('Case_Folding');
8477 $fold->set_file('Fold') if defined $fold;
8478
2cd56239
KW
8479 # utf8.c has a different meaning for non range-size-1 for map properties
8480 # that this program doesn't currently handle; and even if it were changed
8481 # to do so, some other code may be using them expecting range size 1.
99870f4d
KW
8482 foreach my $property (qw {
8483 Case_Folding
8484 Lowercase_Mapping
8485 Titlecase_Mapping
8486 Uppercase_Mapping
8487 })
8488 {
8489 property_ref($property)->set_range_size_1(1);
8490 }
8491
8492 # These two properties aren't actually used in the core, but unfortunately
8493 # the names just above that are in the core interfere with these, so
8494 # choose different names. These aren't a problem unless the map tables
8495 # for these files get written out.
8496 my $lowercase = property_ref('Lowercase');
8497 $lowercase->set_file('IsLower') if defined $lowercase;
8498 my $uppercase = property_ref('Uppercase');
8499 $uppercase->set_file('IsUpper') if defined $uppercase;
8500
8501 # Set up the hard-coded default mappings, but only on properties defined
8502 # for this release
8503 foreach my $property (keys %default_mapping) {
8504 my $property_object = property_ref($property);
8505 next if ! defined $property_object;
8506 my $default_map = $default_mapping{$property};
8507 $property_object->set_default_map($default_map);
8508
8509 # A map of <code point> implies the property is string.
8510 if ($property_object->type == $UNKNOWN
8511 && $default_map eq $CODE_POINT)
8512 {
8513 $property_object->set_type($STRING);
8514 }
8515 }
8516
8517 # The following use the Multi_Default class to create objects for
8518 # defaults.
8519
8520 # Bidi class has a complicated default, but the derived file takes care of
8521 # the complications, leaving just 'L'.
8522 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
8523 property_ref('Bidi_Class')->set_default_map('L');
8524 }
8525 else {
8526 my $default;
8527
8528 # The derived file was introduced in 3.1.1. The values below are
8529 # taken from table 3-8, TUS 3.0
8530 my $default_R =
8531 'my $default = Range_List->new;
8532 $default->add_range(0x0590, 0x05FF);
8533 $default->add_range(0xFB1D, 0xFB4F);'
8534 ;
8535
8536 # The defaults apply only to unassigned characters
a67f160a 8537 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
8538
8539 if ($v_version lt v3.0.0) {
8540 $default = Multi_Default->new(R => $default_R, 'L');
8541 }
8542 else {
8543
8544 # AL apparently not introduced until 3.0: TUS 2.x references are
8545 # not on-line to check it out
8546 my $default_AL =
8547 'my $default = Range_List->new;
8548 $default->add_range(0x0600, 0x07BF);
8549 $default->add_range(0xFB50, 0xFDFF);
8550 $default->add_range(0xFE70, 0xFEFF);'
8551 ;
8552
8553 # Non-character code points introduced in this release; aren't AL
8554 if ($v_version ge 3.1.0) {
8555 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
8556 }
a67f160a 8557 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
8558 $default = Multi_Default->new(AL => $default_AL,
8559 R => $default_R,
8560 'L');
8561 }
8562 property_ref('Bidi_Class')->set_default_map($default);
8563 }
8564
8565 # Joining type has a complicated default, but the derived file takes care
8566 # of the complications, leaving just 'U' (or Non_Joining), except the file
8567 # is bad in 3.1.0
8568 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
8569 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
8570 property_ref('Joining_Type')->set_default_map('Non_Joining');
8571 }
8572 else {
8573
8574 # Otherwise, there are not one, but two possibilities for the
8575 # missing defaults: T and U.
8576 # The missing defaults that evaluate to T are given by:
8577 # T = Mn + Cf - ZWNJ - ZWJ
8578 # where Mn and Cf are the general category values. In other words,
8579 # any non-spacing mark or any format control character, except
8580 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8581 # WIDTH JOINER (joining type C).
8582 my $default = Multi_Default->new(
8583 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8584 'Non_Joining');
8585 property_ref('Joining_Type')->set_default_map($default);
8586 }
8587 }
8588
8589 # Line break has a complicated default in early releases. It is 'Unknown'
8590 # for non-assigned code points; 'AL' for assigned.
8591 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8592 my $lb = property_ref('Line_Break');
8593 if ($v_version gt 3.2.0) {
8594 $lb->set_default_map('Unknown');
8595 }
8596 else {
8597 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8598 'AL');
8599 $lb->set_default_map($default);
8600 }
8601
8602 # If has the URS property, make sure that the standard aliases are in
8603 # it, since not in the input tables in some versions.
8604 my $urs = property_ref('Unicode_Radical_Stroke');
8605 if (defined $urs) {
8606 $urs->add_alias('cjkRSUnicode');
8607 $urs->add_alias('kRSUnicode');
8608 }
8609 }
8610 return;
8611}
8612
8613sub get_old_property_aliases() {
8614 # Returns what would be in PropertyAliases.txt if it existed in very old
8615 # versions of Unicode. It was derived from the one in 3.2, and pared
8616 # down based on the data that was actually in the older releases.
8617 # An attempt was made to use the existence of files to mean inclusion or
8618 # not of various aliases, but if this was not sufficient, using version
8619 # numbers was resorted to.
8620
8621 my @return;
8622
8623 # These are to be used in all versions (though some are constructed by
8624 # this program if missing)
8625 push @return, split /\n/, <<'END';
8626bc ; Bidi_Class
8627Bidi_M ; Bidi_Mirrored
8628cf ; Case_Folding
8629ccc ; Canonical_Combining_Class
8630dm ; Decomposition_Mapping
8631dt ; Decomposition_Type
8632gc ; General_Category
8633isc ; ISO_Comment
8634lc ; Lowercase_Mapping
8635na ; Name
8636na1 ; Unicode_1_Name
8637nt ; Numeric_Type
8638nv ; Numeric_Value
8639sfc ; Simple_Case_Folding
8640slc ; Simple_Lowercase_Mapping
8641stc ; Simple_Titlecase_Mapping
8642suc ; Simple_Uppercase_Mapping
8643tc ; Titlecase_Mapping
8644uc ; Uppercase_Mapping
8645END
8646
8647 if (-e 'Blocks.txt') {
8648 push @return, "blk ; Block\n";
8649 }
8650 if (-e 'ArabicShaping.txt') {
8651 push @return, split /\n/, <<'END';
8652jg ; Joining_Group
8653jt ; Joining_Type
8654END
8655 }
8656 if (-e 'PropList.txt') {
8657
8658 # This first set is in the original old-style proplist.
8659 push @return, split /\n/, <<'END';
8660Alpha ; Alphabetic
8661Bidi_C ; Bidi_Control
8662Dash ; Dash
8663Dia ; Diacritic
8664Ext ; Extender
8665Hex ; Hex_Digit
8666Hyphen ; Hyphen
8667IDC ; ID_Continue
8668Ideo ; Ideographic
8669Join_C ; Join_Control
8670Math ; Math
8671QMark ; Quotation_Mark
8672Term ; Terminal_Punctuation
8673WSpace ; White_Space
8674END
8675 # The next sets were added later
8676 if ($v_version ge v3.0.0) {
8677 push @return, split /\n/, <<'END';
8678Upper ; Uppercase
8679Lower ; Lowercase
8680END
8681 }
8682 if ($v_version ge v3.0.1) {
8683 push @return, split /\n/, <<'END';
8684NChar ; Noncharacter_Code_Point
8685END
8686 }
8687 # The next sets were added in the new-style
8688 if ($v_version ge v3.1.0) {
8689 push @return, split /\n/, <<'END';
8690OAlpha ; Other_Alphabetic
8691OLower ; Other_Lowercase
8692OMath ; Other_Math
8693OUpper ; Other_Uppercase
8694END
8695 }
8696 if ($v_version ge v3.1.1) {
8697 push @return, "AHex ; ASCII_Hex_Digit\n";
8698 }
8699 }
8700 if (-e 'EastAsianWidth.txt') {
8701 push @return, "ea ; East_Asian_Width\n";
8702 }
8703 if (-e 'CompositionExclusions.txt') {
8704 push @return, "CE ; Composition_Exclusion\n";
8705 }
8706 if (-e 'LineBreak.txt') {
8707 push @return, "lb ; Line_Break\n";
8708 }
8709 if (-e 'BidiMirroring.txt') {
8710 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8711 }
8712 if (-e 'Scripts.txt') {
8713 push @return, "sc ; Script\n";
8714 }
8715 if (-e 'DNormalizationProps.txt') {
8716 push @return, split /\n/, <<'END';
8717Comp_Ex ; Full_Composition_Exclusion
8718FC_NFKC ; FC_NFKC_Closure
8719NFC_QC ; NFC_Quick_Check
8720NFD_QC ; NFD_Quick_Check
8721NFKC_QC ; NFKC_Quick_Check
8722NFKD_QC ; NFKD_Quick_Check
8723XO_NFC ; Expands_On_NFC
8724XO_NFD ; Expands_On_NFD
8725XO_NFKC ; Expands_On_NFKC
8726XO_NFKD ; Expands_On_NFKD
8727END
8728 }
8729 if (-e 'DCoreProperties.txt') {
8730 push @return, split /\n/, <<'END';
8731IDS ; ID_Start
8732XIDC ; XID_Continue
8733XIDS ; XID_Start
8734END
8735 # These can also appear in some versions of PropList.txt
8736 push @return, "Lower ; Lowercase\n"
8737 unless grep { $_ =~ /^Lower\b/} @return;
8738 push @return, "Upper ; Uppercase\n"
8739 unless grep { $_ =~ /^Upper\b/} @return;
8740 }
8741
8742 # This flag requires the DAge.txt file to be copied into the directory.
8743 if (DEBUG && $compare_versions) {
8744 push @return, 'age ; Age';
8745 }
8746
8747 return @return;
8748}
8749
8750sub process_PropValueAliases {
8751 # This file contains values that properties look like:
8752 # bc ; AL ; Arabic_Letter
8753 # blk; n/a ; Greek_And_Coptic ; Greek
8754 #
8755 # Field 0 is the property.
8756 # Field 1 is the short name of a property value or 'n/a' if no
8757 # short name exists;
8758 # Field 2 is the full property value name;
8759 # Any other fields are more synonyms for the property value.
8760 # Purely numeric property values are omitted from the file; as are some
8761 # others, fewer and fewer in later releases
8762
8763 # Entries for the ccc property have an extra field before the
8764 # abbreviation:
8765 # ccc; 0; NR ; Not_Reordered
8766 # It is the numeric value that the names are synonyms for.
8767
8768 # There are comment entries for values missing from this file:
8769 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8770 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8771
8772 my $file= shift;
8773 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8774
8775 # This whole file was non-existent in early releases, so use our own
8776 # internal one if necessary.
8777 if (! -e 'PropValueAliases.txt') {
8778 $file->insert_lines(get_old_property_value_aliases());
8779 }
8780
8781 # Add any explicit cjk values
8782 $file->insert_lines(@cjk_property_values);
8783
8784 # This line is used only for testing the code that checks for name
8785 # conflicts. There is a script Inherited, and when this line is executed
8786 # it causes there to be a name conflict with the 'Inherited' that this
8787 # program generates for this block property value
8788 #$file->insert_lines('blk; n/a; Herited');
8789
8790
8791 # Process each line of the file ...
8792 while ($file->next_line) {
8793
8794 my ($property, @data) = split /\s*;\s*/;
8795
8796 # The full name for the ccc property value is in field 2 of the
8797 # remaining ones; field 1 for all other properties. Swap ccc fields 1
8798 # and 2. (Rightmost splice removes field 2, returning it; left splice
8799 # inserts that into field 1, thus shifting former field 1 to field 2.)
8800 splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8801
8802 # If there is no short name, use the full one in element 1
8803 $data[0] = $data[1] if $data[0] eq "n/a";
8804
8805 # Earlier releases had the pseudo property 'qc' that should expand to
8806 # the ones that replace it below.
8807 if ($property eq 'qc') {
8808 if (lc $data[0] eq 'y') {
8809 $file->insert_lines('NFC_QC; Y ; Yes',
8810 'NFD_QC; Y ; Yes',
8811 'NFKC_QC; Y ; Yes',
8812 'NFKD_QC; Y ; Yes',
8813 );
8814 }
8815 elsif (lc $data[0] eq 'n') {
8816 $file->insert_lines('NFC_QC; N ; No',
8817 'NFD_QC; N ; No',
8818 'NFKC_QC; N ; No',
8819 'NFKD_QC; N ; No',
8820 );
8821 }
8822 elsif (lc $data[0] eq 'm') {
8823 $file->insert_lines('NFC_QC; M ; Maybe',
8824 'NFKC_QC; M ; Maybe',
8825 );
8826 }
8827 else {
8828 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8829 }
8830 next;
8831 }
8832
8833 # The first field is the short name, 2nd is the full one.
8834 my $property_object = property_ref($property);
8835 my $table = $property_object->add_match_table($data[0],
8836 Full_Name => $data[1]);
8837
8838 # Start looking for more aliases after these two.
8839 for my $i (2 .. @data - 1) {
8840 $table->add_alias($data[$i]);
8841 }
8842 } # End of looping through the file
8843
8844 # As noted in the comments early in the program, it generates tables for
8845 # the default values for all releases, even those for which the concept
8846 # didn't exist at the time. Here we add those if missing.
8847 my $age = property_ref('age');
8848 if (defined $age && ! defined $age->table('Unassigned')) {
8849 $age->add_match_table('Unassigned');
8850 }
8851 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8852 && ! defined $block->table('No_Block');
8853
8854
8855 # Now set the default mappings of the properties from the file. This is
8856 # done after the loop because a number of properties have only @missings
8857 # entries in the file, and may not show up until the end.
8858 my @defaults = $file->get_missings;
8859 foreach my $default_ref (@defaults) {
8860 my $default = $default_ref->[0];
8861 my $property = property_ref($default_ref->[1]);
8862 $property->set_default_map($default);
8863 }
8864 return;
8865}
8866
8867sub get_old_property_value_aliases () {
8868 # Returns what would be in PropValueAliases.txt if it existed in very old
8869 # versions of Unicode. It was derived from the one in 3.2, and pared
8870 # down. An attempt was made to use the existence of files to mean
8871 # inclusion or not of various aliases, but if this was not sufficient,
8872 # using version numbers was resorted to.
8873
8874 my @return = split /\n/, <<'END';
8875bc ; AN ; Arabic_Number
8876bc ; B ; Paragraph_Separator
8877bc ; CS ; Common_Separator
8878bc ; EN ; European_Number
8879bc ; ES ; European_Separator
8880bc ; ET ; European_Terminator
8881bc ; L ; Left_To_Right
8882bc ; ON ; Other_Neutral
8883bc ; R ; Right_To_Left
8884bc ; WS ; White_Space
8885
8886# The standard combining classes are very much different in v1, so only use
8887# ones that look right (not checked thoroughly)
8888ccc; 0; NR ; Not_Reordered
8889ccc; 1; OV ; Overlay
8890ccc; 7; NK ; Nukta
8891ccc; 8; KV ; Kana_Voicing
8892ccc; 9; VR ; Virama
8893ccc; 202; ATBL ; Attached_Below_Left
8894ccc; 216; ATAR ; Attached_Above_Right
8895ccc; 218; BL ; Below_Left
8896ccc; 220; B ; Below
8897ccc; 222; BR ; Below_Right
8898ccc; 224; L ; Left
8899ccc; 228; AL ; Above_Left
8900ccc; 230; A ; Above
8901ccc; 232; AR ; Above_Right
8902ccc; 234; DA ; Double_Above
8903
8904dt ; can ; canonical
8905dt ; enc ; circle
8906dt ; fin ; final
8907dt ; font ; font
8908dt ; fra ; fraction
8909dt ; init ; initial
8910dt ; iso ; isolated
8911dt ; med ; medial
8912dt ; n/a ; none
8913dt ; nb ; noBreak
8914dt ; sqr ; square
8915dt ; sub ; sub
8916dt ; sup ; super
8917
8918gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8919gc ; Cc ; Control
8920gc ; Cn ; Unassigned
8921gc ; Co ; Private_Use
8922gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8923gc ; LC ; Cased_Letter # Ll | Lt | Lu
8924gc ; Ll ; Lowercase_Letter
8925gc ; Lm ; Modifier_Letter
8926gc ; Lo ; Other_Letter
8927gc ; Lu ; Uppercase_Letter
8928gc ; M ; Mark # Mc | Me | Mn
8929gc ; Mc ; Spacing_Mark
8930gc ; Mn ; Nonspacing_Mark
8931gc ; N ; Number # Nd | Nl | No
8932gc ; Nd ; Decimal_Number
8933gc ; No ; Other_Number
8934gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8935gc ; Pd ; Dash_Punctuation
8936gc ; Pe ; Close_Punctuation
8937gc ; Po ; Other_Punctuation
8938gc ; Ps ; Open_Punctuation
8939gc ; S ; Symbol # Sc | Sk | Sm | So
8940gc ; Sc ; Currency_Symbol
8941gc ; Sm ; Math_Symbol
8942gc ; So ; Other_Symbol
8943gc ; Z ; Separator # Zl | Zp | Zs
8944gc ; Zl ; Line_Separator
8945gc ; Zp ; Paragraph_Separator
8946gc ; Zs ; Space_Separator
8947
8948nt ; de ; Decimal
8949nt ; di ; Digit
8950nt ; n/a ; None
8951nt ; nu ; Numeric
8952END
8953
8954 if (-e 'ArabicShaping.txt') {
8955 push @return, split /\n/, <<'END';
8956jg ; n/a ; AIN
8957jg ; n/a ; ALEF
8958jg ; n/a ; DAL
8959jg ; n/a ; GAF
8960jg ; n/a ; LAM
8961jg ; n/a ; MEEM
8962jg ; n/a ; NO_JOINING_GROUP
8963jg ; n/a ; NOON
8964jg ; n/a ; QAF
8965jg ; n/a ; SAD
8966jg ; n/a ; SEEN
8967jg ; n/a ; TAH
8968jg ; n/a ; WAW
8969
8970jt ; C ; Join_Causing
8971jt ; D ; Dual_Joining
8972jt ; L ; Left_Joining
8973jt ; R ; Right_Joining
8974jt ; U ; Non_Joining
8975jt ; T ; Transparent
8976END
8977 if ($v_version ge v3.0.0) {
8978 push @return, split /\n/, <<'END';
8979jg ; n/a ; ALAPH
8980jg ; n/a ; BEH
8981jg ; n/a ; BETH
8982jg ; n/a ; DALATH_RISH
8983jg ; n/a ; E
8984jg ; n/a ; FEH
8985jg ; n/a ; FINAL_SEMKATH
8986jg ; n/a ; GAMAL
8987jg ; n/a ; HAH
8988jg ; n/a ; HAMZA_ON_HEH_GOAL
8989jg ; n/a ; HE
8990jg ; n/a ; HEH
8991jg ; n/a ; HEH_GOAL
8992jg ; n/a ; HETH
8993jg ; n/a ; KAF
8994jg ; n/a ; KAPH
8995jg ; n/a ; KNOTTED_HEH
8996jg ; n/a ; LAMADH
8997jg ; n/a ; MIM
8998jg ; n/a ; NUN
8999jg ; n/a ; PE
9000jg ; n/a ; QAPH
9001jg ; n/a ; REH
9002jg ; n/a ; REVERSED_PE
9003jg ; n/a ; SADHE
9004jg ; n/a ; SEMKATH
9005jg ; n/a ; SHIN
9006jg ; n/a ; SWASH_KAF
9007jg ; n/a ; TAW
9008jg ; n/a ; TEH_MARBUTA
9009jg ; n/a ; TETH
9010jg ; n/a ; YEH
9011jg ; n/a ; YEH_BARREE
9012jg ; n/a ; YEH_WITH_TAIL
9013jg ; n/a ; YUDH
9014jg ; n/a ; YUDH_HE
9015jg ; n/a ; ZAIN
9016END
9017 }
9018 }
9019
9020
9021 if (-e 'EastAsianWidth.txt') {
9022 push @return, split /\n/, <<'END';
9023ea ; A ; Ambiguous
9024ea ; F ; Fullwidth
9025ea ; H ; Halfwidth
9026ea ; N ; Neutral
9027ea ; Na ; Narrow
9028ea ; W ; Wide
9029END
9030 }
9031
9032 if (-e 'LineBreak.txt') {
9033 push @return, split /\n/, <<'END';
9034lb ; AI ; Ambiguous
9035lb ; AL ; Alphabetic
9036lb ; B2 ; Break_Both
9037lb ; BA ; Break_After
9038lb ; BB ; Break_Before
9039lb ; BK ; Mandatory_Break
9040lb ; CB ; Contingent_Break
9041lb ; CL ; Close_Punctuation
9042lb ; CM ; Combining_Mark
9043lb ; CR ; Carriage_Return
9044lb ; EX ; Exclamation
9045lb ; GL ; Glue
9046lb ; HY ; Hyphen
9047lb ; ID ; Ideographic
9048lb ; IN ; Inseperable
9049lb ; IS ; Infix_Numeric
9050lb ; LF ; Line_Feed
9051lb ; NS ; Nonstarter
9052lb ; NU ; Numeric
9053lb ; OP ; Open_Punctuation
9054lb ; PO ; Postfix_Numeric
9055lb ; PR ; Prefix_Numeric
9056lb ; QU ; Quotation
9057lb ; SA ; Complex_Context
9058lb ; SG ; Surrogate
9059lb ; SP ; Space
9060lb ; SY ; Break_Symbols
9061lb ; XX ; Unknown
9062lb ; ZW ; ZWSpace
9063END
9064 }
9065
9066 if (-e 'DNormalizationProps.txt') {
9067 push @return, split /\n/, <<'END';
9068qc ; M ; Maybe
9069qc ; N ; No
9070qc ; Y ; Yes
9071END
9072 }
9073
9074 if (-e 'Scripts.txt') {
9075 push @return, split /\n/, <<'END';
9076sc ; Arab ; Arabic
9077sc ; Armn ; Armenian
9078sc ; Beng ; Bengali
9079sc ; Bopo ; Bopomofo
9080sc ; Cans ; Canadian_Aboriginal
9081sc ; Cher ; Cherokee
9082sc ; Cyrl ; Cyrillic
9083sc ; Deva ; Devanagari
9084sc ; Dsrt ; Deseret
9085sc ; Ethi ; Ethiopic
9086sc ; Geor ; Georgian
9087sc ; Goth ; Gothic
9088sc ; Grek ; Greek
9089sc ; Gujr ; Gujarati
9090sc ; Guru ; Gurmukhi
9091sc ; Hang ; Hangul
9092sc ; Hani ; Han
9093sc ; Hebr ; Hebrew
9094sc ; Hira ; Hiragana
9095sc ; Ital ; Old_Italic
9096sc ; Kana ; Katakana
9097sc ; Khmr ; Khmer
9098sc ; Knda ; Kannada
9099sc ; Laoo ; Lao
9100sc ; Latn ; Latin
9101sc ; Mlym ; Malayalam
9102sc ; Mong ; Mongolian
9103sc ; Mymr ; Myanmar
9104sc ; Ogam ; Ogham
9105sc ; Orya ; Oriya
9106sc ; Qaai ; Inherited
9107sc ; Runr ; Runic
9108sc ; Sinh ; Sinhala
9109sc ; Syrc ; Syriac
9110sc ; Taml ; Tamil
9111sc ; Telu ; Telugu
9112sc ; Thaa ; Thaana
9113sc ; Thai ; Thai
9114sc ; Tibt ; Tibetan
9115sc ; Yiii ; Yi
9116sc ; Zyyy ; Common
9117END
9118 }
9119
9120 if ($v_version ge v2.0.0) {
9121 push @return, split /\n/, <<'END';
9122dt ; com ; compat
9123dt ; nar ; narrow
9124dt ; sml ; small
9125dt ; vert ; vertical
9126dt ; wide ; wide
9127
9128gc ; Cf ; Format
9129gc ; Cs ; Surrogate
9130gc ; Lt ; Titlecase_Letter
9131gc ; Me ; Enclosing_Mark
9132gc ; Nl ; Letter_Number
9133gc ; Pc ; Connector_Punctuation
9134gc ; Sk ; Modifier_Symbol
9135END
9136 }
9137 if ($v_version ge v2.1.2) {
9138 push @return, "bc ; S ; Segment_Separator\n";
9139 }
9140 if ($v_version ge v2.1.5) {
9141 push @return, split /\n/, <<'END';
9142gc ; Pf ; Final_Punctuation
9143gc ; Pi ; Initial_Punctuation
9144END
9145 }
9146 if ($v_version ge v2.1.8) {
9147 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9148 }
9149
9150 if ($v_version ge v3.0.0) {
9151 push @return, split /\n/, <<'END';
9152bc ; AL ; Arabic_Letter
9153bc ; BN ; Boundary_Neutral
9154bc ; LRE ; Left_To_Right_Embedding
9155bc ; LRO ; Left_To_Right_Override
9156bc ; NSM ; Nonspacing_Mark
9157bc ; PDF ; Pop_Directional_Format
9158bc ; RLE ; Right_To_Left_Embedding
9159bc ; RLO ; Right_To_Left_Override
9160
9161ccc; 233; DB ; Double_Below
9162END
9163 }
9164
9165 if ($v_version ge v3.1.0) {
9166 push @return, "ccc; 226; R ; Right\n";
9167 }
9168
9169 return @return;
9170}
9171
b1c167a3
KW
9172sub output_perl_charnames_line ($$) {
9173
9174 # Output the entries in Perl_charnames specially, using 5 digits instead
9175 # of four. This makes the entries a constant length, and simplifies
9176 # charnames.pm which this table is for. Unicode can have 6 digit
9177 # ordinals, but they are all private use or noncharacters which do not
9178 # have names, so won't be in this table.
9179
73d9566f 9180 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
9181}
9182
99870f4d
KW
9183{ # Closure
9184 # This is used to store the range list of all the code points usable when
9185 # the little used $compare_versions feature is enabled.
9186 my $compare_versions_range_list;
9187
9188 sub process_generic_property_file {
9189 # This processes a file containing property mappings and puts them
9190 # into internal map tables. It should be used to handle any property
9191 # files that have mappings from a code point or range thereof to
9192 # something else. This means almost all the UCD .txt files.
9193 # each_line_handlers() should be set to adjust the lines of these
9194 # files, if necessary, to what this routine understands:
9195 #
9196 # 0374 ; NFD_QC; N
9197 # 003C..003E ; Math
9198 #
92f9d56c 9199 # the fields are: "codepoint-range ; property; map"
99870f4d
KW
9200 #
9201 # meaning the codepoints in the range all have the value 'map' under
9202 # 'property'.
98dc9551 9203 # Beginning and trailing white space in each field are not significant.
99870f4d
KW
9204 # Note there is not a trailing semi-colon in the above. A trailing
9205 # semi-colon means the map is a null-string. An omitted map, as
9206 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9207 # table syntax. (This could have been hidden from this routine by
9208 # doing it in the $file object, but that would require parsing of the
9209 # line there, so would have to parse it twice, or change the interface
9210 # to pass this an array. So not done.)
9211 #
9212 # The map field may begin with a sequence of commands that apply to
9213 # this range. Each such command begins and ends with $CMD_DELIM.
9214 # These are used to indicate, for example, that the mapping for a
9215 # range has a non-default type.
9216 #
9217 # This loops through the file, calling it's next_line() method, and
9218 # then taking the map and adding it to the property's table.
9219 # Complications arise because any number of properties can be in the
9220 # file, in any order, interspersed in any way. The first time a
9221 # property is seen, it gets information about that property and
f86864ac 9222 # caches it for quick retrieval later. It also normalizes the maps
5d7f7709
KW
9223 # so that only one of many synonyms is stored. The Unicode input
9224 # files do use some multiple synonyms.
99870f4d
KW
9225
9226 my $file = shift;
9227 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9228
9229 my %property_info; # To keep track of what properties
9230 # have already had entries in the
9231 # current file, and info about each,
9232 # so don't have to recompute.
9233 my $property_name; # property currently being worked on
9234 my $property_type; # and its type
9235 my $previous_property_name = ""; # name from last time through loop
9236 my $property_object; # pointer to the current property's
9237 # object
9238 my $property_addr; # the address of that object
9239 my $default_map; # the string that code points missing
9240 # from the file map to
9241 my $default_table; # For non-string properties, a
9242 # reference to the match table that
9243 # will contain the list of code
9244 # points that map to $default_map.
9245
9246 # Get the next real non-comment line
9247 LINE:
9248 while ($file->next_line) {
9249
9250 # Default replacement type; means that if parts of the range have
9251 # already been stored in our tables, the new map overrides them if
9252 # they differ more than cosmetically
9253 my $replace = $IF_NOT_EQUIVALENT;
9254 my $map_type; # Default type for the map of this range
9255
9256 #local $to_trace = 1 if main::DEBUG;
9257 trace $_ if main::DEBUG && $to_trace;
9258
9259 # Split the line into components
9260 my ($range, $property_name, $map, @remainder)
9261 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9262
9263 # If more or less on the line than we are expecting, warn and skip
9264 # the line
9265 if (@remainder) {
9266 $file->carp_bad_line('Extra fields');
9267 next LINE;
9268 }
9269 elsif ( ! defined $property_name) {
9270 $file->carp_bad_line('Missing property');
9271 next LINE;
9272 }
9273
9274 # Examine the range.
9275 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9276 {
9277 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9278 next LINE;
9279 }
9280 my $low = hex $1;
9281 my $high = (defined $2) ? hex $2 : $low;
9282
9283 # For the very specialized case of comparing two Unicode
9284 # versions...
9285 if (DEBUG && $compare_versions) {
9286 if ($property_name eq 'Age') {
9287
9288 # Only allow code points at least as old as the version
9289 # specified.
9290 my $age = pack "C*", split(/\./, $map); # v string
9291 next LINE if $age gt $compare_versions;
9292 }
9293 else {
9294
9295 # Again, we throw out code points younger than those of
9296 # the specified version. By now, the Age property is
9297 # populated. We use the intersection of each input range
9298 # with this property to find what code points in it are
9299 # valid. To do the intersection, we have to convert the
9300 # Age property map to a Range_list. We only have to do
9301 # this once.
9302 if (! defined $compare_versions_range_list) {
9303 my $age = property_ref('Age');
9304 if (! -e 'DAge.txt') {
9305 croak "Need to have 'DAge.txt' file to do version comparison";
9306 }
9307 elsif ($age->count == 0) {
9308 croak "The 'Age' table is empty, but its file exists";
9309 }
9310 $compare_versions_range_list
9311 = Range_List->new(Initialize => $age);
9312 }
9313
9314 # An undefined map is always 'Y'
9315 $map = 'Y' if ! defined $map;
9316
9317 # Calculate the intersection of the input range with the
9318 # code points that are known in the specified version
9319 my @ranges = ($compare_versions_range_list
9320 & Range->new($low, $high))->ranges;
9321
9322 # If the intersection is empty, throw away this range
9323 next LINE unless @ranges;
9324
9325 # Only examine the first range this time through the loop.
9326 my $this_range = shift @ranges;
9327
9328 # Put any remaining ranges in the queue to be processed
9329 # later. Note that there is unnecessary work here, as we
9330 # will do the intersection again for each of these ranges
9331 # during some future iteration of the LINE loop, but this
9332 # code is not used in production. The later intersections
9333 # are guaranteed to not splinter, so this will not become
9334 # an infinite loop.
9335 my $line = join ';', $property_name, $map;
9336 foreach my $range (@ranges) {
9337 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9338 $range->start,
9339 $range->end,
9340 $line));
9341 }
9342
9343 # And process the first range, like any other.
9344 $low = $this_range->start;
9345 $high = $this_range->end;
9346 }
9347 } # End of $compare_versions
9348
9349 # If changing to a new property, get the things constant per
9350 # property
9351 if ($previous_property_name ne $property_name) {
9352
9353 $property_object = property_ref($property_name);
9354 if (! defined $property_object) {
9355 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
9356 next LINE;
9357 }
051df77b 9358 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
9359
9360 # Defer changing names until have a line that is acceptable
9361 # (the 'next' statement above means is unacceptable)
9362 $previous_property_name = $property_name;
9363
9364 # If not the first time for this property, retrieve info about
9365 # it from the cache
9366 if (defined ($property_info{$property_addr}{'type'})) {
9367 $property_type = $property_info{$property_addr}{'type'};
9368 $default_map = $property_info{$property_addr}{'default'};
9369 $map_type
9370 = $property_info{$property_addr}{'pseudo_map_type'};
9371 $default_table
9372 = $property_info{$property_addr}{'default_table'};
9373 }
9374 else {
9375
9376 # Here, is the first time for this property. Set up the
9377 # cache.
9378 $property_type = $property_info{$property_addr}{'type'}
9379 = $property_object->type;
9380 $map_type
9381 = $property_info{$property_addr}{'pseudo_map_type'}
9382 = $property_object->pseudo_map_type;
9383
9384 # The Unicode files are set up so that if the map is not
9385 # defined, it is a binary property
9386 if (! defined $map && $property_type != $BINARY) {
9387 if ($property_type != $UNKNOWN
9388 && $property_type != $NON_STRING)
9389 {
9390 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
9391 }
9392 else {
9393 $property_object->set_type($BINARY);
9394 $property_type
9395 = $property_info{$property_addr}{'type'}
9396 = $BINARY;
9397 }
9398 }
9399
9400 # Get any @missings default for this property. This
9401 # should precede the first entry for the property in the
9402 # input file, and is located in a comment that has been
9403 # stored by the Input_file class until we access it here.
9404 # It's possible that there is more than one such line
9405 # waiting for us; collect them all, and parse
9406 my @missings_list = $file->get_missings
9407 if $file->has_missings_defaults;
9408 foreach my $default_ref (@missings_list) {
9409 my $default = $default_ref->[0];
ffe43484 9410 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
9411
9412 # For string properties, the default is just what the
9413 # file says, but non-string properties should already
9414 # have set up a table for the default property value;
9415 # use the table for these, so can resolve synonyms
9416 # later to a single standard one.
9417 if ($property_type == $STRING
9418 || $property_type == $UNKNOWN)
9419 {
9420 $property_info{$addr}{'missings'} = $default;
9421 }
9422 else {
9423 $property_info{$addr}{'missings'}
9424 = $property_object->table($default);
9425 }
9426 }
9427
9428 # Finished storing all the @missings defaults in the input
9429 # file so far. Get the one for the current property.
9430 my $missings = $property_info{$property_addr}{'missings'};
9431
9432 # But we likely have separately stored what the default
9433 # should be. (This is to accommodate versions of the
9434 # standard where the @missings lines are absent or
9435 # incomplete.) Hopefully the two will match. But check
9436 # it out.
9437 $default_map = $property_object->default_map;
9438
9439 # If the map is a ref, it means that the default won't be
9440 # processed until later, so undef it, so next few lines
9441 # will redefine it to something that nothing will match
9442 undef $default_map if ref $default_map;
9443
9444 # Create a $default_map if don't have one; maybe a dummy
9445 # that won't match anything.
9446 if (! defined $default_map) {
9447
9448 # Use any @missings line in the file.
9449 if (defined $missings) {
9450 if (ref $missings) {
9451 $default_map = $missings->full_name;
9452 $default_table = $missings;
9453 }
9454 else {
9455 $default_map = $missings;
9456 }
678f13d5 9457
99870f4d
KW
9458 # And store it with the property for outside use.
9459 $property_object->set_default_map($default_map);
9460 }
9461 else {
9462
9463 # Neither an @missings nor a default map. Create
9464 # a dummy one, so won't have to test definedness
9465 # in the main loop.
9466 $default_map = '_Perl This will never be in a file
9467 from Unicode';
9468 }
9469 }
9470
9471 # Here, we have $default_map defined, possibly in terms of
9472 # $missings, but maybe not, and possibly is a dummy one.
9473 if (defined $missings) {
9474
9475 # Make sure there is no conflict between the two.
9476 # $missings has priority.
9477 if (ref $missings) {
23e33b60
KW
9478 $default_table
9479 = $property_object->table($default_map);
99870f4d
KW
9480 if (! defined $default_table
9481 || $default_table != $missings)
9482 {
9483 if (! defined $default_table) {
9484 $default_table = $UNDEF;
9485 }
9486 $file->carp_bad_line(<<END
9487The \@missings line for $property_name in $file says that missings default to
9488$missings, but we expect it to be $default_table. $missings used.
9489END
9490 );
9491 $default_table = $missings;
9492 $default_map = $missings->full_name;
9493 }
9494 $property_info{$property_addr}{'default_table'}
9495 = $default_table;
9496 }
9497 elsif ($default_map ne $missings) {
9498 $file->carp_bad_line(<<END
9499The \@missings line for $property_name in $file says that missings default to
9500$missings, but we expect it to be $default_map. $missings used.
9501END
9502 );
9503 $default_map = $missings;
9504 }
9505 }
9506
9507 $property_info{$property_addr}{'default'}
9508 = $default_map;
9509
9510 # If haven't done so already, find the table corresponding
9511 # to this map for non-string properties.
9512 if (! defined $default_table
9513 && $property_type != $STRING
9514 && $property_type != $UNKNOWN)
9515 {
9516 $default_table = $property_info{$property_addr}
9517 {'default_table'}
9518 = $property_object->table($default_map);
9519 }
9520 } # End of is first time for this property
9521 } # End of switching properties.
9522
9523 # Ready to process the line.
9524 # The Unicode files are set up so that if the map is not defined,
9525 # it is a binary property with value 'Y'
9526 if (! defined $map) {
9527 $map = 'Y';
9528 }
9529 else {
9530
9531 # If the map begins with a special command to us (enclosed in
9532 # delimiters), extract the command(s).
a35d7f90
KW
9533 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
9534 my $command = $1;
9535 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
9536 $replace = $1;
99870f4d 9537 }
a35d7f90
KW
9538 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
9539 $map_type = $1;
9540 }
9541 else {
9542 $file->carp_bad_line("Unknown command line: '$1'");
9543 next LINE;
9544 }
9545 }
99870f4d
KW
9546 }
9547
9548 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
9549 {
9550
9551 # Here, we have a map to a particular code point, and the
9552 # default map is to a code point itself. If the range
9553 # includes the particular code point, change that portion of
9554 # the range to the default. This makes sure that in the final
9555 # table only the non-defaults are listed.
9556 my $decimal_map = hex $map;
9557 if ($low <= $decimal_map && $decimal_map <= $high) {
9558
9559 # If the range includes stuff before or after the map
9560 # we're changing, split it and process the split-off parts
9561 # later.
9562 if ($low < $decimal_map) {
9563 $file->insert_adjusted_lines(
9564 sprintf("%04X..%04X; %s; %s",
9565 $low,
9566 $decimal_map - 1,
9567 $property_name,
9568 $map));
9569 }
9570 if ($high > $decimal_map) {
9571 $file->insert_adjusted_lines(
9572 sprintf("%04X..%04X; %s; %s",
9573 $decimal_map + 1,
9574 $high,
9575 $property_name,
9576 $map));
9577 }
9578 $low = $high = $decimal_map;
9579 $map = $CODE_POINT;
9580 }
9581 }
9582
9583 # If we can tell that this is a synonym for the default map, use
9584 # the default one instead.
9585 if ($property_type != $STRING
9586 && $property_type != $UNKNOWN)
9587 {
9588 my $table = $property_object->table($map);
9589 if (defined $table && $table == $default_table) {
9590 $map = $default_map;
9591 }
9592 }
9593
9594 # And figure out the map type if not known.
9595 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9596 if ($map eq "") { # Nulls are always $NULL map type
9597 $map_type = $NULL;
9598 } # Otherwise, non-strings, and those that don't allow
9599 # $MULTI_CP, and those that aren't multiple code points are
9600 # 0
9601 elsif
9602 (($property_type != $STRING && $property_type != $UNKNOWN)
9603 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9604 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9605 {
9606 $map_type = 0;
9607 }
9608 else {
9609 $map_type = $MULTI_CP;
9610 }
9611 }
9612
9613 $property_object->add_map($low, $high,
9614 $map,
9615 Type => $map_type,
9616 Replace => $replace);
9617 } # End of loop through file's lines
9618
9619 return;
9620 }
9621}
9622
99870f4d
KW
9623{ # Closure for UnicodeData.txt handling
9624
9625 # This file was the first one in the UCD; its design leads to some
9626 # awkwardness in processing. Here is a sample line:
9627 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9628 # The fields in order are:
9629 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9630 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9631 my $CATEGORY = $i++; # category (e.g. "Lu")
9632 my $CCC = $i++; # Canonical combining class (e.g. "230")
9633 my $BIDI = $i++; # directional class (e.g. "L")
9634 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9635 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9636 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9637 # Dual-use in this program; see below
9638 my $NUMERIC = $i++; # numeric value
9639 my $MIRRORED = $i++; # ? mirrored
9640 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9641 my $COMMENT = $i++; # iso comment
9642 my $UPPER = $i++; # simple uppercase mapping
9643 my $LOWER = $i++; # simple lowercase mapping
9644 my $TITLE = $i++; # simple titlecase mapping
9645 my $input_field_count = $i;
9646
9647 # This routine in addition outputs these extra fields:
9648 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9649
9650 # These fields are modifications of ones above, and are usually
9651 # suppressed; they must come last, as for speed, the loop upper bound is
9652 # normally set to ignore them
9653 my $NAME = $i++; # This is the strict name field, not the one that
9654 # charnames uses.
9655 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9656 # by Unicode::Normalize
99870f4d
KW
9657 my $last_field = $i - 1;
9658
9659 # All these are read into an array for each line, with the indices defined
9660 # above. The empty fields in the example line above indicate that the
9661 # value is defaulted. The handler called for each line of the input
9662 # changes these to their defaults.
9663
9664 # Here are the official names of the properties, in a parallel array:
9665 my @field_names;
9666 $field_names[$BIDI] = 'Bidi_Class';
9667 $field_names[$CATEGORY] = 'General_Category';
9668 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9669 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9670 $field_names[$COMMENT] = 'ISO_Comment';
9671 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9672 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9673 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9674 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9675 $field_names[$NAME] = 'Name';
9676 $field_names[$NUMERIC] = 'Numeric_Value';
9677 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9678 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9679 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9680 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9681 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9682 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9683
28093d0e
KW
9684 # Some of these need a little more explanation:
9685 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9686 # property, but is used in calculating the Numeric_Type. Perl however,
9687 # creates a file from this field, so a Perl property is created from it.
9688 # Similarly, the Other_Digit field is used only for calculating the
9689 # Numeric_Type, and so it can be safely re-used as the place to store
9690 # the value for Numeric_Type; hence it is referred to as
9691 # $NUMERIC_TYPE_OTHER_DIGIT.
9692 # The input field named $PERL_DECOMPOSITION is a combination of both the
9693 # decomposition mapping and its type. Perl creates a file containing
9694 # exactly this field, so it is used for that. The two properties are
9695 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9696 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9697 # output it), as Perl doesn't use it directly.
9698 # The input field named here $CHARNAME is used to construct the
9699 # Perl_Charnames property, which is a combination of the Name property
9700 # (which the input field contains), and the Unicode_1_Name property, and
9701 # others from other files. Since, the strict Name property is not used
9702 # by Perl, this field is used for the table that Perl does use. The
9703 # strict Name property table is usually suppressed (unless the lists are
9704 # changed to output it), so it is accumulated in a separate field,
9705 # $NAME, which to save time is discarded unless the table is actually to
9706 # be output
99870f4d
KW
9707
9708 # This file is processed like most in this program. Control is passed to
9709 # process_generic_property_file() which calls filter_UnicodeData_line()
9710 # for each input line. This filter converts the input into line(s) that
9711 # process_generic_property_file() understands. There is also a setup
9712 # routine called before any of the file is processed, and a handler for
9713 # EOF processing, all in this closure.
9714
9715 # A huge speed-up occurred at the cost of some added complexity when these
9716 # routines were altered to buffer the outputs into ranges. Almost all the
9717 # lines of the input file apply to just one code point, and for most
9718 # properties, the map for the next code point up is the same as the
9719 # current one. So instead of creating a line for each property for each
9720 # input line, filter_UnicodeData_line() remembers what the previous map
9721 # of a property was, and doesn't generate a line to pass on until it has
9722 # to, as when the map changes; and that passed-on line encompasses the
9723 # whole contiguous range of code points that have the same map for that
9724 # property. This means a slight amount of extra setup, and having to
9725 # flush these buffers on EOF, testing if the maps have changed, plus
9726 # remembering state information in the closure. But it means a lot less
9727 # real time in not having to change the data base for each property on
9728 # each line.
9729
9730 # Another complication is that there are already a few ranges designated
9731 # in the input. There are two lines for each, with the same maps except
9732 # the code point and name on each line. This was actually the hardest
9733 # thing to design around. The code points in those ranges may actually
9734 # have real maps not given by these two lines. These maps will either
98dc9551 9735 # be algorithmically determinable, or in the extracted files furnished
99870f4d
KW
9736 # with the UCD. In the event of conflicts between these extracted files,
9737 # and this one, Unicode says that this one prevails. But it shouldn't
9738 # prevail for conflicts that occur in these ranges. The data from the
9739 # extracted files prevails in those cases. So, this program is structured
9740 # so that those files are processed first, storing maps. Then the other
9741 # files are processed, generally overwriting what the extracted files
9742 # stored. But just the range lines in this input file are processed
9743 # without overwriting. This is accomplished by adding a special string to
9744 # the lines output to tell process_generic_property_file() to turn off the
9745 # overwriting for just this one line.
9746 # A similar mechanism is used to tell it that the map is of a non-default
9747 # type.
9748
9749 sub setup_UnicodeData { # Called before any lines of the input are read
9750 my $file = shift;
9751 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9752
28093d0e
KW
9753 # Create a new property specially located that is a combination of the
9754 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9755 # Name_Alias properties. (The final duplicates elements of the
9756 # first.) A comment for it will later be constructed based on the
9757 # actual properties present and used
3e20195b 9758 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9759 Core_Access => '\N{...} and "use charnames"',
9760 Default_Map => "",
9761 Directory => File::Spec->curdir(),
9762 File => 'Name',
9763 Internal_Only_Warning => 1,
9764 Perl_Extension => 1,
b1c167a3 9765 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9766 Type => $STRING,
9767 );
9768
99870f4d 9769 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9770 Directory => File::Spec->curdir(),
99870f4d 9771 File => 'Decomposition',
a14f3cb1 9772 Format => $DECOMP_STRING_FORMAT,
99870f4d
KW
9773 Internal_Only_Warning => 1,
9774 Perl_Extension => 1,
9775 Default_Map => $CODE_POINT,
9776
0c07e538
KW
9777 # normalize.pm can't cope with these
9778 Output_Range_Counts => 0,
9779
99870f4d
KW
9780 # This is a specially formatted table
9781 # explicitly for normalize.pm, which
9782 # is expecting a particular format,
9783 # which means that mappings containing
9784 # multiple code points are in the main
9785 # body of the table
9786 Map_Type => $COMPUTE_NO_MULTI_CP,
9787 Type => $STRING,
9788 );
9789 $Perl_decomp->add_comment(join_lines(<<END
9790This mapping is a combination of the Unicode 'Decomposition_Type' and
9791'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9792identical to the official Unicode 'Decomposition_Mapping' property except for
9793two things:
9794 1) It omits the algorithmically determinable Hangul syllable decompositions,
9795which normalize.pm handles algorithmically.
9796 2) It contains the decomposition type as well. Non-canonical decompositions
9797begin with a word in angle brackets, like <super>, which denotes the
9798compatible decomposition type. If the map does not begin with the <angle
9799brackets>, the decomposition is canonical.
9800END
9801 ));
9802
9803 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9804 Default_Map => "",
9805 Perl_Extension => 1,
9806 File => 'Digit', # Trad. location
9807 Directory => $map_directory,
9808 Type => $STRING,
9809 Range_Size_1 => 1,
9810 );
9811 $Decimal_Digit->add_comment(join_lines(<<END
9812This file gives the mapping of all code points which represent a single
9813decimal digit [0-9] to their respective digits. For example, the code point
9814U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9815that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9816numerals.
9817END
9818 ));
9819
28093d0e
KW
9820 # These properties are not used for generating anything else, and are
9821 # usually not output. By making them last in the list, we can just
99870f4d 9822 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
9823 # generating a table(s) that is/are just going to get thrown away.
9824 if (! property_ref('Decomposition_Mapping')->to_output_map
9825 && ! property_ref('Name')->to_output_map)
9826 {
9827 $last_field = min($NAME, $DECOMP_MAP) - 1;
9828 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9829 $last_field = $DECOMP_MAP;
9830 } elsif (property_ref('Name')->to_output_map) {
9831 $last_field = $NAME;
99870f4d
KW
9832 }
9833 return;
9834 }
9835
9836 my $first_time = 1; # ? Is this the first line of the file
9837 my $in_range = 0; # ? Are we in one of the file's ranges
9838 my $previous_cp; # hex code point of previous line
9839 my $decimal_previous_cp = -1; # And its decimal equivalent
9840 my @start; # For each field, the current starting
9841 # code point in hex for the range
9842 # being accumulated.
9843 my @fields; # The input fields;
9844 my @previous_fields; # And those from the previous call
9845
9846 sub filter_UnicodeData_line {
9847 # Handle a single input line from UnicodeData.txt; see comments above
9848 # Conceptually this takes a single line from the file containing N
9849 # properties, and converts it into N lines with one property per line,
9850 # which is what the final handler expects. But there are
9851 # complications due to the quirkiness of the input file, and to save
9852 # time, it accumulates ranges where the property values don't change
9853 # and only emits lines when necessary. This is about an order of
9854 # magnitude fewer lines emitted.
9855
9856 my $file = shift;
9857 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9858
9859 # $_ contains the input line.
9860 # -1 in split means retain trailing null fields
9861 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9862
9863 #local $to_trace = 1 if main::DEBUG;
9864 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9865 if (@fields > $input_field_count) {
9866 $file->carp_bad_line('Extra fields');
9867 $_ = "";
9868 return;
9869 }
9870
9871 my $decimal_cp = hex $cp;
9872
9873 # We have to output all the buffered ranges when the next code point
9874 # is not exactly one after the previous one, which means there is a
9875 # gap in the ranges.
9876 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9877
9878 # The decomposition mapping field requires special handling. It looks
9879 # like either:
9880 #
9881 # <compat> 0032 0020
9882 # 0041 0300
9883 #
9884 # The decomposition type is enclosed in <brackets>; if missing, it
9885 # means the type is canonical. There are two decomposition mapping
9886 # tables: the one for use by Perl's normalize.pm has a special format
9887 # which is this field intact; the other, for general use is of
9888 # standard format. In either case we have to find the decomposition
9889 # type. Empty fields have None as their type, and map to the code
9890 # point itself
9891 if ($fields[$PERL_DECOMPOSITION] eq "") {
9892 $fields[$DECOMP_TYPE] = 'None';
9893 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9894 }
9895 else {
9896 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9897 =~ / < ( .+? ) > \s* ( .+ ) /x;
9898 if (! defined $fields[$DECOMP_TYPE]) {
9899 $fields[$DECOMP_TYPE] = 'Canonical';
9900 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9901 }
9902 else {
9903 $fields[$DECOMP_MAP] = $map;
9904 }
9905 }
9906
9907 # The 3 numeric fields also require special handling. The 2 digit
9908 # fields must be either empty or match the number field. This means
9909 # that if it is empty, they must be as well, and the numeric type is
9910 # None, and the numeric value is 'Nan'.
9911 # The decimal digit field must be empty or match the other digit
9912 # field. If the decimal digit field is non-empty, the code point is
9913 # a decimal digit, and the other two fields will have the same value.
9914 # If it is empty, but the other digit field is non-empty, the code
9915 # point is an 'other digit', and the number field will have the same
9916 # value as the other digit field. If the other digit field is empty,
9917 # but the number field is non-empty, the code point is a generic
9918 # numeric type.
9919 if ($fields[$NUMERIC] eq "") {
9920 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9921 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9922 ) {
9923 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9924 }
9925 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9926 $fields[$NUMERIC] = 'NaN';
9927 }
9928 else {
9929 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9930 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9931 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9932 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9933 }
9934 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9935 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9936 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9937 }
9938 else {
9939 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9940
9941 # Rationals require extra effort.
9942 register_fraction($fields[$NUMERIC])
9943 if $fields[$NUMERIC] =~ qr{/};
9944 }
9945 }
9946
9947 # For the properties that have empty fields in the file, and which
9948 # mean something different from empty, change them to that default.
9949 # Certain fields just haven't been empty so far in any Unicode
9950 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9951 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 9952 # the defaults; which are very unlikely to ever change.
99870f4d
KW
9953 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9954 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9955
9956 # UAX44 says that if title is empty, it is the same as whatever upper
9957 # is,
9958 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9959
9960 # There are a few pairs of lines like:
9961 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9962 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9963 # that define ranges. These should be processed after the fields are
9964 # adjusted above, as they may override some of them; but mostly what
28093d0e 9965 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
9966 # paired lines start with a '<', but this is also true of '<control>,
9967 # which isn't one of these special ones.
28093d0e 9968 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
9969
9970 # Some code points in this file have the pseudo-name
9971 # '<control>', but the official name for such ones is the null
28093d0e 9972 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 9973 $fields[$NAME] = "";
28093d0e 9974 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
9975
9976 # We had better not be in between range lines.
9977 if ($in_range) {
28093d0e 9978 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9979 $in_range = 0;
9980 }
9981 }
28093d0e 9982 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
9983
9984 # Here is a non-range line. We had better not be in between range
9985 # lines.
9986 if ($in_range) {
28093d0e 9987 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9988 $in_range = 0;
9989 }
edb80b88
KW
9990 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
9991
9992 # These are code points whose names end in their code points,
9993 # which means the names are algorithmically derivable from the
9994 # code points. To shorten the output Name file, the algorithm
9995 # for deriving these is placed in the file instead of each
9996 # code point, so they have map type $CP_IN_NAME
9997 $fields[$CHARNAME] = $CMD_DELIM
9998 . $MAP_TYPE_CMD
9999 . '='
10000 . $CP_IN_NAME
10001 . $CMD_DELIM
10002 . $fields[$CHARNAME];
10003 }
28093d0e 10004 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 10005 }
28093d0e
KW
10006 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10007 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
10008
10009 # Here we are at the beginning of a range pair.
10010 if ($in_range) {
28093d0e 10011 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
10012 }
10013 $in_range = 1;
10014
10015 # Because the properties in the range do not overwrite any already
10016 # in the db, we must flush the buffers of what's already there, so
10017 # they get handled in the normal scheme.
10018 $force_output = 1;
10019
10020 }
28093d0e
KW
10021 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10022 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
10023 $_ = "";
10024 return;
10025 }
10026 else { # Here, we are at the last line of a range pair.
10027
10028 if (! $in_range) {
28093d0e 10029 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
10030 $_ = "";
10031 return;
10032 }
10033 $in_range = 0;
10034
28093d0e
KW
10035 $fields[$NAME] = $fields[$CHARNAME];
10036
99870f4d
KW
10037 # Check that the input is valid: that the closing of the range is
10038 # the same as the beginning.
10039 foreach my $i (0 .. $last_field) {
10040 next if $fields[$i] eq $previous_fields[$i];
10041 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10042 }
10043
10044 # The processing differs depending on the type of range,
28093d0e
KW
10045 # determined by its $CHARNAME
10046 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
10047
10048 # Check that the data looks right.
10049 if ($decimal_previous_cp != $SBase) {
10050 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10051 }
10052 if ($decimal_cp != $SBase + $SCount - 1) {
10053 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10054 }
10055
10056 # The Hangul syllable range has a somewhat complicated name
10057 # generation algorithm. Each code point in it has a canonical
10058 # decomposition also computable by an algorithm. The
10059 # perl decomposition map table built from these is used only
10060 # by normalize.pm, which has the algorithm built in it, so the
10061 # decomposition maps are not needed, and are large, so are
10062 # omitted from it. If the full decomposition map table is to
10063 # be output, the decompositions are generated for it, in the
10064 # EOF handling code for this input file.
10065
10066 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10067
10068 # This range is stored in our internal structure with its
10069 # own map type, different from all others.
28093d0e
KW
10070 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10071 = $CMD_DELIM
99870f4d
KW
10072 . $MAP_TYPE_CMD
10073 . '='
10074 . $HANGUL_SYLLABLE
10075 . $CMD_DELIM
28093d0e 10076 . $fields[$CHARNAME];
99870f4d 10077 }
28093d0e 10078 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
10079
10080 # The name for these contains the code point itself, and all
10081 # are defined to have the same base name, regardless of what
10082 # is in the file. They are stored in our internal structure
10083 # with a map type of $CP_IN_NAME
28093d0e
KW
10084 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10085 = $CMD_DELIM
99870f4d
KW
10086 . $MAP_TYPE_CMD
10087 . '='
10088 . $CP_IN_NAME
10089 . $CMD_DELIM
10090 . 'CJK UNIFIED IDEOGRAPH';
10091
10092 }
10093 elsif ($fields[$CATEGORY] eq 'Co'
10094 || $fields[$CATEGORY] eq 'Cs')
10095 {
10096 # The names of all the code points in these ranges are set to
10097 # null, as there are no names for the private use and
10098 # surrogate code points.
10099
28093d0e 10100 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
10101 }
10102 else {
28093d0e 10103 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
10104 }
10105
10106 # The first line of the range caused everything else to be output,
10107 # and then its values were stored as the beginning values for the
10108 # next set of ranges, which this one ends. Now, for each value,
10109 # add a command to tell the handler that these values should not
10110 # replace any existing ones in our database.
10111 foreach my $i (0 .. $last_field) {
10112 $previous_fields[$i] = $CMD_DELIM
10113 . $REPLACE_CMD
10114 . '='
10115 . $NO
10116 . $CMD_DELIM
10117 . $previous_fields[$i];
10118 }
10119
10120 # And change things so it looks like the entire range has been
10121 # gone through with this being the final part of it. Adding the
10122 # command above to each field will cause this range to be flushed
10123 # during the next iteration, as it guaranteed that the stored
10124 # field won't match whatever value the next one has.
10125 $previous_cp = $cp;
10126 $decimal_previous_cp = $decimal_cp;
10127
10128 # We are now set up for the next iteration; so skip the remaining
10129 # code in this subroutine that does the same thing, but doesn't
10130 # know about these ranges.
10131 $_ = "";
c1739a4a 10132
99870f4d
KW
10133 return;
10134 }
10135
10136 # On the very first line, we fake it so the code below thinks there is
10137 # nothing to output, and initialize so that when it does get output it
10138 # uses the first line's values for the lowest part of the range.
10139 # (One could avoid this by using peek(), but then one would need to
10140 # know the adjustments done above and do the same ones in the setup
10141 # routine; not worth it)
10142 if ($first_time) {
10143 $first_time = 0;
10144 @previous_fields = @fields;
10145 @start = ($cp) x scalar @fields;
10146 $decimal_previous_cp = $decimal_cp - 1;
10147 }
10148
10149 # For each field, output the stored up ranges that this code point
10150 # doesn't fit in. Earlier we figured out if all ranges should be
10151 # terminated because of changing the replace or map type styles, or if
10152 # there is a gap between this new code point and the previous one, and
10153 # that is stored in $force_output. But even if those aren't true, we
10154 # need to output the range if this new code point's value for the
10155 # given property doesn't match the stored range's.
10156 #local $to_trace = 1 if main::DEBUG;
10157 foreach my $i (0 .. $last_field) {
10158 my $field = $fields[$i];
10159 if ($force_output || $field ne $previous_fields[$i]) {
10160
10161 # Flush the buffer of stored values.
10162 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10163
10164 # Start a new range with this code point and its value
10165 $start[$i] = $cp;
10166 $previous_fields[$i] = $field;
10167 }
10168 }
10169
10170 # Set the values for the next time.
10171 $previous_cp = $cp;
10172 $decimal_previous_cp = $decimal_cp;
10173
10174 # The input line has generated whatever adjusted lines are needed, and
10175 # should not be looked at further.
10176 $_ = "";
10177 return;
10178 }
10179
10180 sub EOF_UnicodeData {
10181 # Called upon EOF to flush the buffers, and create the Hangul
10182 # decomposition mappings if needed.
10183
10184 my $file = shift;
10185 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10186
10187 # Flush the buffers.
10188 foreach my $i (1 .. $last_field) {
10189 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10190 }
10191
10192 if (-e 'Jamo.txt') {
10193
10194 # The algorithm is published by Unicode, based on values in
10195 # Jamo.txt, (which should have been processed before this
10196 # subroutine), and the results left in %Jamo
10197 unless (%Jamo) {
10198 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10199 return;
10200 }
10201
10202 # If the full decomposition map table is being output, insert
10203 # into it the Hangul syllable mappings. This is to avoid having
10204 # to publish a subroutine in it to compute them. (which would
10205 # essentially be this code.) This uses the algorithm published by
10206 # Unicode.
10207 if (property_ref('Decomposition_Mapping')->to_output_map) {
10208 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10209 use integer;
10210 my $SIndex = $S - $SBase;
10211 my $L = $LBase + $SIndex / $NCount;
10212 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10213 my $T = $TBase + $SIndex % $TCount;
10214
10215 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10216 my $decomposition = sprintf("%04X %04X", $L, $V);
10217 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10218 $file->insert_adjusted_lines(
10219 sprintf("%04X; Decomposition_Mapping; %s",
10220 $S,
10221 $decomposition));
10222 }
10223 }
10224 }
10225
10226 return;
10227 }
10228
10229 sub filter_v1_ucd {
10230 # Fix UCD lines in version 1. This is probably overkill, but this
10231 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10232 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10233 # removed. This program retains them
10234 # 2) didn't include ranges, which it should have, and which are now
10235 # added in @corrected_lines below. It was hand populated by
10236 # taking the data from Version 2, verified by analyzing
10237 # DAge.txt.
10238 # 3) There is a syntax error in the entry for U+09F8 which could
10239 # cause problems for utf8_heavy, and so is changed. It's
10240 # numeric value was simply a minus sign, without any number.
10241 # (Eventually Unicode changed the code point to non-numeric.)
10242 # 4) The decomposition types often don't match later versions
10243 # exactly, and the whole syntax of that field is different; so
10244 # the syntax is changed as well as the types to their later
10245 # terminology. Otherwise normalize.pm would be very unhappy
10246 # 5) Many ccc classes are different. These are left intact.
10247 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
10248 # fields. These are unchanged because it doesn't really cause
10249 # problems for Perl.
10250 # 7) A number of code points, such as controls, don't have their
10251 # Unicode Version 1 Names in this file. These are unchanged.
10252
10253 my @corrected_lines = split /\n/, <<'END';
102544E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
102559FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10256E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10257F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10258F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10259FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10260END
10261
10262 my $file = shift;
10263 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10264
10265 #local $to_trace = 1 if main::DEBUG;
10266 trace $_ if main::DEBUG && $to_trace;
10267
10268 # -1 => retain trailing null fields
10269 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10270
10271 # At the first place that is wrong in the input, insert all the
10272 # corrections, replacing the wrong line.
10273 if ($code_point eq '4E00') {
10274 my @copy = @corrected_lines;
10275 $_ = shift @copy;
10276 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10277
10278 $file->insert_lines(@copy);
10279 }
10280
10281
10282 if ($fields[$NUMERIC] eq '-') {
10283 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10284 }
10285
10286 if ($fields[$PERL_DECOMPOSITION] ne "") {
10287
10288 # Several entries have this change to superscript 2 or 3 in the
10289 # middle. Convert these to the modern version, which is to use
10290 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10291 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10292 # 'HHHH HHHH 00B3 HHHH'.
10293 # It turns out that all of these that don't have another
10294 # decomposition defined at the beginning of the line have the
10295 # <square> decomposition in later releases.
10296 if ($code_point ne '00B2' && $code_point ne '00B3') {
10297 if ($fields[$PERL_DECOMPOSITION]
10298 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10299 {
10300 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10301 $fields[$PERL_DECOMPOSITION] = '<square> '
10302 . $fields[$PERL_DECOMPOSITION];
10303 }
10304 }
10305 }
10306
10307 # If is like '<+circled> 0052 <-circled>', convert to
10308 # '<circled> 0052'
10309 $fields[$PERL_DECOMPOSITION] =~
10310 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
10311
10312 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10313 $fields[$PERL_DECOMPOSITION] =~
10314 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10315 or $fields[$PERL_DECOMPOSITION] =~
10316 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10317 or $fields[$PERL_DECOMPOSITION] =~
10318 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10319 or $fields[$PERL_DECOMPOSITION] =~
10320 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10321
10322 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10323 $fields[$PERL_DECOMPOSITION] =~
10324 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10325
10326 # Change names to modern form.
10327 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10328 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10329 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10330 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10331
10332 # One entry has weird braces
10333 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10334 }
10335
10336 $_ = join ';', $code_point, @fields;
10337 trace $_ if main::DEBUG && $to_trace;
10338 return;
10339 }
10340
10341 sub filter_v2_1_5_ucd {
10342 # A dozen entries in this 2.1.5 file had the mirrored and numeric
10343 # columns swapped; These all had mirrored be 'N'. So if the numeric
10344 # column appears to be N, swap it back.
10345
10346 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10347 if ($fields[$NUMERIC] eq 'N') {
10348 $fields[$NUMERIC] = $fields[$MIRRORED];
10349 $fields[$MIRRORED] = 'N';
10350 $_ = join ';', $code_point, @fields;
10351 }
10352 return;
10353 }
3ffed8c2
KW
10354
10355 sub filter_v6_ucd {
10356
10357 # Unicode 6.0 co-opted the name BELL for U+1F514, so change the input
10358 # to pretend that U+0007 is ALERT instead, and for Perl 5.14, don't
10359 # allow the BELL name for U+1F514, so that the old usage can be
10360 # deprecated for one cycle.
10361
484741e1 10362 return if $_ !~ /^(?:0007|1F514|070F);/;
3ffed8c2
KW
10363
10364 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10365 if ($code_point eq '0007') {
0e429600 10366 $fields[$CHARNAME] = "ALERT";
3ffed8c2 10367 }
484741e1
KW
10368 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
10369 # http://www.unicode.org/versions/corrigendum8.html
10370 $fields[$BIDI] = "AL";
10371 }
10914c78 10372 elsif ($^V lt v5.17.0) { # For 5.18 will convert to use Unicode's name
3ffed8c2
KW
10373 $fields[$CHARNAME] = "";
10374 }
10375
10376 $_ = join ';', $code_point, @fields;
10377
10378 return;
10379 }
99870f4d
KW
10380} # End closure for UnicodeData
10381
37e2e78e
KW
10382sub process_GCB_test {
10383
10384 my $file = shift;
10385 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10386
10387 while ($file->next_line) {
10388 push @backslash_X_tests, $_;
10389 }
678f13d5 10390
37e2e78e
KW
10391 return;
10392}
10393
99870f4d
KW
10394sub process_NamedSequences {
10395 # NamedSequences.txt entries are just added to an array. Because these
10396 # don't look like the other tables, they have their own handler.
10397 # An example:
10398 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
10399 #
10400 # This just adds the sequence to an array for later handling
10401
99870f4d
KW
10402 my $file = shift;
10403 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10404
10405 while ($file->next_line) {
10406 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
10407 if (@remainder) {
10408 $file->carp_bad_line(
10409 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
10410 next;
10411 }
fb121860
KW
10412
10413 # Note single \t in keeping with special output format of
10414 # Perl_charnames. But it turns out that the code points don't have to
10415 # be 5 digits long, like the rest, based on the internal workings of
10416 # charnames.pm. This could be easily changed for consistency.
10417 push @named_sequences, "$sequence\t$name";
99870f4d
KW
10418 }
10419 return;
10420}
10421
10422{ # Closure
10423
10424 my $first_range;
10425
10426 sub filter_early_ea_lb {
10427 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
10428 # third field be the name of the code point, which can be ignored in
10429 # most cases. But it can be meaningful if it marks a range:
10430 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
10431 # 3400;W;<CJK Ideograph Extension A, First>
10432 #
10433 # We need to see the First in the example above to know it's a range.
10434 # They did not use the later range syntaxes. This routine changes it
10435 # to use the modern syntax.
10436 # $1 is the Input_file object.
10437
10438 my @fields = split /\s*;\s*/;
10439 if ($fields[2] =~ /^<.*, First>/) {
10440 $first_range = $fields[0];
10441 $_ = "";
10442 }
10443 elsif ($fields[2] =~ /^<.*, Last>/) {
10444 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
10445 }
10446 else {
10447 undef $first_range;
10448 $_ = "$fields[0]; $fields[1]";
10449 }
10450
10451 return;
10452 }
10453}
10454
10455sub filter_old_style_arabic_shaping {
10456 # Early versions used a different term for the later one.
10457
10458 my @fields = split /\s*;\s*/;
10459 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
10460 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
10461 $_ = join ';', @fields;
10462 return;
10463}
10464
10465sub filter_arabic_shaping_line {
10466 # ArabicShaping.txt has entries that look like:
10467 # 062A; TEH; D; BEH
10468 # The field containing 'TEH' is not used. The next field is Joining_Type
10469 # and the last is Joining_Group
10470 # This generates two lines to pass on, one for each property on the input
10471 # line.
10472
10473 my $file = shift;
10474 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10475
10476 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10477
10478 if (@fields > 4) {
10479 $file->carp_bad_line('Extra fields');
10480 $_ = "";
10481 return;
10482 }
10483
10484 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
10485 $_ = "$fields[0]; Joining_Type; $fields[2]";
10486
10487 return;
10488}
10489
d3fed3dd
KW
10490{ # Closure
10491 my $lc; # Table for lowercase mapping
10492 my $tc;
10493 my $uc;
10494
6c0259ad
KW
10495 sub setup_special_casing {
10496 # SpecialCasing.txt contains the non-simple case change mappings. The
10497 # simple ones are in UnicodeData.txt, which should already have been
10498 # read in to the full property data structures, so as to initialize
10499 # these with the simple ones. Then the SpecialCasing.txt entries
10500 # overwrite the ones which have different full mappings.
10501
10502 # This routine sees if the simple mappings are to be output, and if
10503 # so, copies what has already been put into the full mapping tables,
10504 # while they still contain only the simple mappings.
10505
10506 # The reason it is done this way is that the simple mappings are
10507 # probably not going to be output, so it saves work to initialize the
10508 # full tables with the simple mappings, and then overwrite those
10509 # relatively few entries in them that have different full mappings,
10510 # and thus skip the simple mapping tables altogether.
10511
10512 my $file= shift;
10513 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10514
6c0259ad
KW
10515 $lc = property_ref('lc');
10516 $tc = property_ref('tc');
10517 $uc = property_ref('uc');
10518
10519 # For each of the case change mappings...
10520 foreach my $case_table ($lc, $tc, $uc) {
10521 my $case = $case_table->name;
10522 my $full = property_ref($case);
10523 unless (defined $full && ! $full->is_empty) {
10524 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
10525 }
10526
10527 # The simple version's name in each mapping merely has an 's' in
10528 # front of the full one's
10529 my $simple = property_ref('s' . $case);
10530 $simple->initialize($full) if $simple->to_output_map();
10531
10532 my $simple_only = Property->new("_s$case",
10533 Type => $STRING,
10534 Default_Map => $CODE_POINT,
10535 Perl_Extension => 1,
10536 Description => "The simple mappings for $case for code points that have full mappings as well");
10537 $simple_only->set_to_output_map($INTERNAL_MAP);
10538 $simple_only->add_comment(join_lines( <<END
d3fed3dd
KW
10539This file is for UCD.pm so that it can construct simple mappings that would
10540otherwise be lost because they are overridden by full mappings.
10541END
6c0259ad
KW
10542 ));
10543 }
99870f4d 10544
6c0259ad
KW
10545 return;
10546 }
99870f4d 10547
6c0259ad
KW
10548 sub filter_special_casing_line {
10549 # Change the format of $_ from SpecialCasing.txt into something that
10550 # the generic handler understands. Each input line contains three
10551 # case mappings. This will generate three lines to pass to the
10552 # generic handler for each of those.
99870f4d 10553
6c0259ad
KW
10554 # The input syntax (after stripping comments and trailing white space
10555 # is like one of the following (with the final two being entries that
10556 # we ignore):
10557 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
10558 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
10559 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
10560 # Note the trailing semi-colon, unlike many of the input files. That
10561 # means that there will be an extra null field generated by the split
99870f4d 10562
6c0259ad
KW
10563 my $file = shift;
10564 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
99870f4d 10565
6c0259ad
KW
10566 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
10567 # fields
10568
10569 # field #4 is when this mapping is conditional. If any of these get
10570 # implemented, it would be by hard-coding in the casing functions in
10571 # the Perl core, not through tables. But if there is a new condition
10572 # we don't know about, output a warning. We know about all the
10573 # conditions through 6.0
10574 if ($fields[4] ne "") {
10575 my @conditions = split ' ', $fields[4];
10576 if ($conditions[0] ne 'tr' # We know that these languages have
10577 # conditions, and some are multiple
10578 && $conditions[0] ne 'az'
10579 && $conditions[0] ne 'lt'
10580
10581 # And, we know about a single condition Final_Sigma, but
10582 # nothing else.
10583 && ($v_version gt v5.2.0
10584 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
10585 {
10586 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
10587 }
10588 elsif ($conditions[0] ne 'Final_Sigma') {
99870f4d 10589
6c0259ad
KW
10590 # Don't print out a message for Final_Sigma, because we
10591 # have hard-coded handling for it. (But the standard
10592 # could change what the rule should be, but it wouldn't
10593 # show up here anyway.
99870f4d 10594
6c0259ad 10595 print "# SKIPPING Special Casing: $_\n"
99870f4d 10596 if $verbosity >= $VERBOSE;
6c0259ad
KW
10597 }
10598 $_ = "";
10599 return;
10600 }
10601 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
10602 $file->carp_bad_line('Extra fields');
10603 $_ = "";
10604 return;
99870f4d 10605 }
99870f4d 10606
6c0259ad
KW
10607 $_ = "$fields[0]; lc; $fields[1]";
10608 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
10609 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
99870f4d 10610
6c0259ad
KW
10611 # Copy any simple case change to the special tables constructed if
10612 # being overridden by a multi-character case change.
10613 if ($fields[1] ne $fields[0]
10614 && (my $value = $lc->value_of(hex $fields[0])) ne $CODE_POINT)
10615 {
10616 $file->insert_adjusted_lines("$fields[0]; _slc; $value");
10617 }
10618 if ($fields[2] ne $fields[0]
10619 && (my $value = $tc->value_of(hex $fields[0])) ne $CODE_POINT)
10620 {
10621 $file->insert_adjusted_lines("$fields[0]; _stc; $value");
10622 }
10623 if ($fields[3] ne $fields[0]
10624 && (my $value = $uc->value_of(hex $fields[0])) ne $CODE_POINT)
10625 {
10626 $file->insert_adjusted_lines("$fields[0]; _suc; $value");
10627 }
d3fed3dd 10628
6c0259ad
KW
10629 return;
10630 }
d3fed3dd 10631}
99870f4d
KW
10632
10633sub filter_old_style_case_folding {
10634 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 10635 # and later style. Different letters were used in the earlier.
99870f4d
KW
10636
10637 my $file = shift;
10638 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10639
10640 my @fields = split /\s*;\s*/;
10641 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10642 $fields[1] = 'I';
10643 }
10644 elsif ($fields[1] eq 'L') {
10645 $fields[1] = 'C'; # L => C always
10646 }
10647 elsif ($fields[1] eq 'E') {
10648 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10649 $fields[1] = 'F'
10650 }
10651 else {
10652 $fields[1] = 'C'
10653 }
10654 }
10655 else {
10656 $file->carp_bad_line("Expecting L or E in second field");
10657 $_ = "";
10658 return;
10659 }
10660 $_ = join("; ", @fields) . ';';
10661 return;
10662}
10663
10664{ # Closure for case folding
10665
10666 # Create the map for simple only if are going to output it, for otherwise
10667 # it takes no part in anything we do.
10668 my $to_output_simple;
10669
99870f4d
KW
10670 sub setup_case_folding($) {
10671 # Read in the case foldings in CaseFolding.txt. This handles both
10672 # simple and full case folding.
10673
10674 $to_output_simple
10675 = property_ref('Simple_Case_Folding')->to_output_map;
10676
10677 return;
10678 }
10679
10680 sub filter_case_folding_line {
10681 # Called for each line in CaseFolding.txt
10682 # Input lines look like:
10683 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10684 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10685 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10686 #
10687 # 'C' means that folding is the same for both simple and full
10688 # 'F' that it is only for full folding
10689 # 'S' that it is only for simple folding
10690 # 'T' is locale-dependent, and ignored
10691 # 'I' is a type of 'F' used in some early releases.
10692 # Note the trailing semi-colon, unlike many of the input files. That
10693 # means that there will be an extra null field generated by the split
10694 # below, which we ignore and hence is not an error.
10695
10696 my $file = shift;
10697 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10698
10699 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10700 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10701 $file->carp_bad_line('Extra fields');
10702 $_ = "";
10703 return;
10704 }
10705
10706 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10707 $_ = "";
10708 return;
10709 }
10710
10711 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
3c099872
KW
10712 # I are all full foldings; S is single-char. For S, there is always
10713 # an F entry, so we must allow multiple values for the same code
10714 # point. Fortunately this table doesn't need further manipulation
10715 # which would preclude using multiple-values. The S is now included
10716 # so that _swash_inversion_hash() is able to construct closures
10717 # without having to worry about F mappings.
10718 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
10719 $_ = "$range; Case_Folding; $CMD_DELIM$REPLACE_CMD=$MULTIPLE$CMD_DELIM$map";
99870f4d
KW
10720 }
10721 else {
10722 $_ = "";
3c099872 10723 $file->carp_bad_line('Expecting C F I S or T in second field');
99870f4d
KW
10724 }
10725
10726 # C and S are simple foldings, but simple case folding is not needed
10727 # unless we explicitly want its map table output.
10728 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10729 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10730 }
10731
99870f4d
KW
10732 return;
10733 }
10734
99870f4d
KW
10735} # End case fold closure
10736
10737sub filter_jamo_line {
10738 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10739 # from this file that is used in generating the Name property for Jamo
10740 # code points. But, it also is used to convert early versions' syntax
10741 # into the modern form. Here are two examples:
10742 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10743 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10744 #
10745 # The input is $_, the output is $_ filtered.
10746
10747 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10748
10749 # Let the caller handle unexpected input. In earlier versions, there was
10750 # a third field which is supposed to be a comment, but did not have a '#'
10751 # before it.
10752 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10753
10754 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10755 # beginning.
10756
10757 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10758 $fields[1] = 'R' if $fields[0] eq '1105';
10759
10760 # Add to structure so can generate Names from it.
10761 my $cp = hex $fields[0];
10762 my $short_name = $fields[1];
10763 $Jamo{$cp} = $short_name;
10764 if ($cp <= $LBase + $LCount) {
10765 $Jamo_L{$short_name} = $cp - $LBase;
10766 }
10767 elsif ($cp <= $VBase + $VCount) {
10768 $Jamo_V{$short_name} = $cp - $VBase;
10769 }
10770 elsif ($cp <= $TBase + $TCount) {
10771 $Jamo_T{$short_name} = $cp - $TBase;
10772 }
10773 else {
10774 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10775 }
10776
10777
10778 # Reassemble using just the first two fields to look like a typical
10779 # property file line
10780 $_ = "$fields[0]; $fields[1]";
10781
10782 return;
10783}
10784
99870f4d
KW
10785sub register_fraction($) {
10786 # This registers the input rational number so that it can be passed on to
10787 # utf8_heavy.pl, both in rational and floating forms.
10788
10789 my $rational = shift;
10790
10791 my $float = eval $rational;
10792 $nv_floating_to_rational{$float} = $rational;
10793 return;
10794}
10795
10796sub filter_numeric_value_line {
10797 # DNumValues contains lines of a different syntax than the typical
10798 # property file:
10799 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10800 #
10801 # This routine transforms $_ containing the anomalous syntax to the
10802 # typical, by filtering out the extra columns, and convert early version
10803 # decimal numbers to strings that look like rational numbers.
10804
10805 my $file = shift;
10806 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10807
10808 # Starting in 5.1, there is a rational field. Just use that, omitting the
10809 # extra columns. Otherwise convert the decimal number in the second field
10810 # to a rational, and omit extraneous columns.
10811 my @fields = split /\s*;\s*/, $_, -1;
10812 my $rational;
10813
10814 if ($v_version ge v5.1.0) {
10815 if (@fields != 4) {
10816 $file->carp_bad_line('Not 4 semi-colon separated fields');
10817 $_ = "";
10818 return;
10819 }
10820 $rational = $fields[3];
10821 $_ = join '; ', @fields[ 0, 3 ];
10822 }
10823 else {
10824
10825 # Here, is an older Unicode file, which has decimal numbers instead of
10826 # rationals in it. Use the fraction to calculate the denominator and
10827 # convert to rational.
10828
10829 if (@fields != 2 && @fields != 3) {
10830 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10831 $_ = "";
10832 return;
10833 }
10834
10835 my $codepoints = $fields[0];
10836 my $decimal = $fields[1];
10837 if ($decimal =~ s/\.0+$//) {
10838
10839 # Anything ending with a decimal followed by nothing but 0's is an
10840 # integer
10841 $_ = "$codepoints; $decimal";
10842 $rational = $decimal;
10843 }
10844 else {
10845
10846 my $denominator;
10847 if ($decimal =~ /\.50*$/) {
10848 $denominator = 2;
10849 }
10850
10851 # Here have the hardcoded repeating decimals in the fraction, and
10852 # the denominator they imply. There were only a few denominators
10853 # in the older Unicode versions of this file which this code
10854 # handles, so it is easy to convert them.
10855
10856 # The 4 is because of a round-off error in the Unicode 3.2 files
10857 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10858 $denominator = 3;
10859 }
10860 elsif ($decimal =~ /\.[27]50*$/) {
10861 $denominator = 4;
10862 }
10863 elsif ($decimal =~ /\.[2468]0*$/) {
10864 $denominator = 5;
10865 }
10866 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10867 $denominator = 6;
10868 }
10869 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10870 $denominator = 8;
10871 }
10872 if ($denominator) {
10873 my $sign = ($decimal < 0) ? "-" : "";
10874 my $numerator = int((abs($decimal) * $denominator) + .5);
10875 $rational = "$sign$numerator/$denominator";
10876 $_ = "$codepoints; $rational";
10877 }
10878 else {
10879 $file->carp_bad_line("Can't cope with number '$decimal'.");
10880 $_ = "";
10881 return;
10882 }
10883 }
10884 }
10885
10886 register_fraction($rational) if $rational =~ qr{/};
10887 return;
10888}
10889
10890{ # Closure
10891 my %unihan_properties;
10892 my $iicore;
10893
10894
10895 sub setup_unihan {
10896 # Do any special setup for Unihan properties.
10897
10898 # This property gives the wrong computed type, so override.
10899 my $usource = property_ref('kIRG_USource');
10900 $usource->set_type($STRING) if defined $usource;
10901
10902 # This property is to be considered binary, so change all the values
10903 # to Y.
10904 $iicore = property_ref('kIICore');
10905 if (defined $iicore) {
10906 $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10907
10908 # We have to change the default map, because the @missing line is
10909 # misleading, given that we are treating it as binary.
10910 $iicore->set_default_map('N');
10911 $iicore->set_type($BINARY);
10912 }
10913
10914 return;
10915 }
10916
10917 sub filter_unihan_line {
10918 # Change unihan db lines to look like the others in the db. Here is
10919 # an input sample:
10920 # U+341C kCangjie IEKN
10921
10922 # Tabs are used instead of semi-colons to separate fields; therefore
10923 # they may have semi-colons embedded in them. Change these to periods
10924 # so won't screw up the rest of the code.
10925 s/;/./g;
10926
10927 # Remove lines that don't look like ones we accept.
10928 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10929 $_ = "";
10930 return;
10931 }
10932
10933 # Extract the property, and save a reference to its object.
10934 my $property = $1;
10935 if (! exists $unihan_properties{$property}) {
10936 $unihan_properties{$property} = property_ref($property);
10937 }
10938
10939 # Don't do anything unless the property is one we're handling, which
10940 # we determine by seeing if there is an object defined for it or not
10941 if (! defined $unihan_properties{$property}) {
10942 $_ = "";
10943 return;
10944 }
10945
10946 # The iicore property is supposed to be a boolean, so convert to our
10947 # standard boolean form.
10948 if (defined $iicore && $unihan_properties{$property} == $iicore) {
10949 $_ =~ s/$property.*/$property\tY/
10950 }
10951
10952 # Convert the tab separators to our standard semi-colons, and convert
10953 # the U+HHHH notation to the rest of the standard's HHHH
10954 s/\t/;/g;
10955 s/\b U \+ (?= $code_point_re )//xg;
10956
10957 #local $to_trace = 1 if main::DEBUG;
10958 trace $_ if main::DEBUG && $to_trace;
10959
10960 return;
10961 }
10962}
10963
10964sub filter_blocks_lines {
10965 # In the Blocks.txt file, the names of the blocks don't quite match the
10966 # names given in PropertyValueAliases.txt, so this changes them so they
10967 # do match: Blanks and hyphens are changed into underscores. Also makes
10968 # early release versions look like later ones
10969 #
10970 # $_ is transformed to the correct value.
10971
10972 my $file = shift;
10973 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10974
10975 if ($v_version lt v3.2.0) {
10976 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10977 $_ = "";
10978 return;
10979 }
10980
10981 # Old versions used a different syntax to mark the range.
10982 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10983 }
10984
10985 my @fields = split /\s*;\s*/, $_, -1;
10986 if (@fields != 2) {
10987 $file->carp_bad_line("Expecting exactly two fields");
10988 $_ = "";
10989 return;
10990 }
10991
10992 # Change hyphens and blanks in the block name field only
10993 $fields[1] =~ s/[ -]/_/g;
10994 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
10995
10996 $_ = join("; ", @fields);
10997 return;
10998}
10999
11000{ # Closure
11001 my $current_property;
11002
11003 sub filter_old_style_proplist {
11004 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11005 # was in a completely different syntax. Ken Whistler of Unicode says
11006 # that it was something he used as an aid for his own purposes, but
11007 # was never an official part of the standard. However, comments in
11008 # DAge.txt indicate that non-character code points were available in
11009 # the UCD as of 3.1. It is unclear to me (khw) how they could be
11010 # there except through this file (but on the other hand, they first
11011 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
11012 # not. But the claim is that it was published as an aid to others who
11013 # might want some more information than was given in the official UCD
11014 # of the time. Many of the properties in it were incorporated into
11015 # the later PropList.txt, but some were not. This program uses this
11016 # early file to generate property tables that are otherwise not
11017 # accessible in the early UCD's, and most were probably not really
11018 # official at that time, so one could argue that it should be ignored,
11019 # and you can easily modify things to skip this. And there are bugs
11020 # in this file in various versions. (For example, the 2.1.9 version
11021 # removes from Alphabetic the CJK range starting at 4E00, and they
11022 # weren't added back in until 3.1.0.) Many of this file's properties
11023 # were later sanctioned, so this code generates tables for those
11024 # properties that aren't otherwise in the UCD of the time but
11025 # eventually did become official, and throws away the rest. Here is a
11026 # list of all the ones that are thrown away:
11027 # Bidi=* duplicates UnicodeData.txt
11028 # Combining never made into official property;
11029 # is \P{ccc=0}
11030 # Composite never made into official property.
11031 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11032 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11033 # Delimiter never made into official property;
11034 # removed in 3.0.1
11035 # Format Control never made into official property;
11036 # similar to gc=cf
11037 # High Surrogate duplicates Blocks.txt
11038 # Ignorable Control never made into official property;
11039 # similar to di=y
11040 # ISO Control duplicates UnicodeData.txt: gc=cc
11041 # Left of Pair never made into official property;
11042 # Line Separator duplicates UnicodeData.txt: gc=zl
11043 # Low Surrogate duplicates Blocks.txt
11044 # Non-break was actually listed as a property
11045 # in 3.2, but without any code
11046 # points. Unicode denies that this
11047 # was ever an official property
11048 # Non-spacing duplicate UnicodeData.txt: gc=mn
11049 # Numeric duplicates UnicodeData.txt: gc=cc
11050 # Paired Punctuation never made into official property;
11051 # appears to be gc=ps + gc=pe
11052 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11053 # Private Use duplicates UnicodeData.txt: gc=co
11054 # Private Use High Surrogate duplicates Blocks.txt
11055 # Punctuation duplicates UnicodeData.txt: gc=p
11056 # Space different definition than eventual
11057 # one.
11058 # Titlecase duplicates UnicodeData.txt: gc=lt
11059 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
98dc9551 11060 # Zero-width never made into official property;
99870f4d
KW
11061 # subset of gc=cf
11062 # Most of the properties have the same names in this file as in later
11063 # versions, but a couple do not.
11064 #
11065 # This subroutine filters $_, converting it from the old style into
11066 # the new style. Here's a sample of the old-style
11067 #
11068 # *******************************************
11069 #
11070 # Property dump for: 0x100000A0 (Join Control)
11071 #
11072 # 200C..200D (2 chars)
11073 #
11074 # In the example, the property is "Join Control". It is kept in this
11075 # closure between calls to the subroutine. The numbers beginning with
11076 # 0x were internal to Ken's program that generated this file.
11077
11078 # If this line contains the property name, extract it.
11079 if (/^Property dump for: [^(]*\((.*)\)/) {
11080 $_ = $1;
11081
11082 # Convert white space to underscores.
11083 s/ /_/g;
11084
11085 # Convert the few properties that don't have the same name as
11086 # their modern counterparts
11087 s/Identifier_Part/ID_Continue/
11088 or s/Not_a_Character/NChar/;
11089
11090 # If the name matches an existing property, use it.
11091 if (defined property_ref($_)) {
11092 trace "new property=", $_ if main::DEBUG && $to_trace;
11093 $current_property = $_;
11094 }
11095 else { # Otherwise discard it
11096 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11097 undef $current_property;
11098 }
11099 $_ = ""; # The property is saved for the next lines of the
11100 # file, but this defining line is of no further use,
11101 # so clear it so that the caller won't process it
11102 # further.
11103 }
11104 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11105
11106 # Here, the input line isn't a header defining a property for the
11107 # following section, and either we aren't in such a section, or
11108 # the line doesn't look like one that defines the code points in
11109 # such a section. Ignore this line.
11110 $_ = "";
11111 }
11112 else {
11113
11114 # Here, we have a line defining the code points for the current
11115 # stashed property. Anything starting with the first blank is
11116 # extraneous. Otherwise, it should look like a normal range to
11117 # the caller. Append the property name so that it looks just like
11118 # a modern PropList entry.
11119
11120 $_ =~ s/\s.*//;
11121 $_ .= "; $current_property";
11122 }
11123 trace $_ if main::DEBUG && $to_trace;
11124 return;
11125 }
11126} # End closure for old style proplist
11127
11128sub filter_old_style_normalization_lines {
11129 # For early releases of Unicode, the lines were like:
11130 # 74..2A76 ; NFKD_NO
11131 # For later releases this became:
11132 # 74..2A76 ; NFKD_QC; N
11133 # Filter $_ to look like those in later releases.
11134 # Similarly for MAYBEs
11135
11136 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11137
11138 # Also, the property FC_NFKC was abbreviated to FNC
11139 s/FNC/FC_NFKC/;
11140 return;
11141}
11142
82aed44a
KW
11143sub setup_script_extensions {
11144 # The Script_Extensions property starts out with a clone of the Script
11145 # property.
11146
11147 my $sc = property_ref("Script");
11148 my $scx = Property->new("scx", Full_Name => "Script_Extensions",
11149 Initialize => $sc,
11150 Default_Map => $sc->default_map,
11151 Pre_Declared_Maps => 0,
11152 );
11153 $scx->add_comment(join_lines( <<END
11154The values for code points that appear in one script are just the same as for
11155the 'Script' property. Likewise the values for those that appear in many
11156scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
11157values of code points that appear in a few scripts are a space separated list
11158of those scripts.
11159END
11160 ));
11161
11162 # Make the scx's tables and aliases for them the same as sc's
11163 foreach my $table ($sc->tables) {
11164 my $scx_table = $scx->add_match_table($table->name,
11165 Full_Name => $table->full_name);
11166 foreach my $alias ($table->aliases) {
11167 $scx_table->add_alias($alias->name);
11168 }
11169 }
11170}
11171
99870f4d
KW
11172sub finish_Unicode() {
11173 # This routine should be called after all the Unicode files have been read
11174 # in. It:
11175 # 1) Adds the mappings for code points missing from the files which have
11176 # defaults specified for them.
11177 # 2) At this this point all mappings are known, so it computes the type of
11178 # each property whose type hasn't been determined yet.
11179 # 3) Calculates all the regular expression match tables based on the
11180 # mappings.
11181 # 3) Calculates and adds the tables which are defined by Unicode, but
11182 # which aren't derived by them
11183
11184 # For each property, fill in any missing mappings, and calculate the re
11185 # match tables. If a property has more than one missing mapping, the
11186 # default is a reference to a data structure, and requires data from other
11187 # properties to resolve. The sort is used to cause these to be processed
11188 # last, after all the other properties have been calculated.
11189 # (Fortunately, the missing properties so far don't depend on each other.)
11190 foreach my $property
11191 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
11192 property_ref('*'))
11193 {
11194 # $perl has been defined, but isn't one of the Unicode properties that
11195 # need to be finished up.
11196 next if $property == $perl;
11197
11198 # Handle the properties that have more than one possible default
11199 if (ref $property->default_map) {
11200 my $default_map = $property->default_map;
11201
11202 # These properties have stored in the default_map:
11203 # One or more of:
11204 # 1) A default map which applies to all code points in a
11205 # certain class
11206 # 2) an expression which will evaluate to the list of code
11207 # points in that class
11208 # And
11209 # 3) the default map which applies to every other missing code
11210 # point.
11211 #
11212 # Go through each list.
11213 while (my ($default, $eval) = $default_map->get_next_defaults) {
11214
11215 # Get the class list, and intersect it with all the so-far
11216 # unspecified code points yielding all the code points
11217 # in the class that haven't been specified.
11218 my $list = eval $eval;
11219 if ($@) {
11220 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
11221 last;
11222 }
11223
11224 # Narrow down the list to just those code points we don't have
11225 # maps for yet.
11226 $list = $list & $property->inverse_list;
11227
11228 # Add mappings to the property for each code point in the list
11229 foreach my $range ($list->ranges) {
56343c78
KW
11230 $property->add_map($range->start, $range->end, $default,
11231 Replace => $CROAK);
99870f4d
KW
11232 }
11233 }
11234
11235 # All remaining code points have the other mapping. Set that up
11236 # so the normal single-default mapping code will work on them
11237 $property->set_default_map($default_map->other_default);
11238
11239 # And fall through to do that
11240 }
11241
11242 # We should have enough data now to compute the type of the property.
11243 $property->compute_type;
11244 my $property_type = $property->type;
11245
11246 next if ! $property->to_create_match_tables;
11247
11248 # Here want to create match tables for this property
11249
11250 # The Unicode db always (so far, and they claim into the future) have
11251 # the default for missing entries in binary properties be 'N' (unless
11252 # there is a '@missing' line that specifies otherwise)
11253 if ($property_type == $BINARY && ! defined $property->default_map) {
11254 $property->set_default_map('N');
11255 }
11256
11257 # Add any remaining code points to the mapping, using the default for
5d7f7709 11258 # missing code points.
99870f4d 11259 if (defined (my $default_map = $property->default_map)) {
1520492f 11260
f4c2a127
KW
11261 # Make sure there is a match table for the default
11262 my $default_table;
11263 if (! defined ($default_table = $property->table($default_map))) {
11264 $default_table = $property->add_match_table($default_map);
11265 }
11266
a92d5c2e
KW
11267 # And, if the property is binary, the default table will just
11268 # be the complement of the other table.
11269 if ($property_type == $BINARY) {
11270 my $non_default_table;
11271
11272 # Find the non-default table.
11273 for my $table ($property->tables) {
11274 next if $table == $default_table;
11275 $non_default_table = $table;
11276 }
11277 $default_table->set_complement($non_default_table);
11278 }
11279
e1759d04
KW
11280 # This fills in any missing values with the default. It's
11281 # tempting to save some time and memory in running this program
11282 # by skipping this step for binary tables where the default
11283 # is easily calculated. But it is needed for generating
11284 # the test file, and other changes would also be required to do
11285 # so.
1520492f
KW
11286 $property->add_map(0, $LAST_UNICODE_CODEPOINT,
11287 $default_map, Replace => $NO);
99870f4d
KW
11288 }
11289
11290 # Have all we need to populate the match tables.
11291 my $property_name = $property->name;
56557540 11292 my $maps_should_be_defined = $property->pre_declared_maps;
99870f4d
KW
11293 foreach my $range ($property->ranges) {
11294 my $map = $range->value;
11295 my $table = property_ref($property_name)->table($map);
11296 if (! defined $table) {
11297
11298 # Integral and rational property values are not necessarily
56557540
KW
11299 # defined in PropValueAliases, but whether all the other ones
11300 # should be depends on the property.
11301 if ($maps_should_be_defined
99870f4d
KW
11302 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
11303 {
11304 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
11305 }
11306 $table = property_ref($property_name)->add_match_table($map);
11307 }
11308
11309 $table->add_range($range->start, $range->end);
11310 }
11311
807807b7
KW
11312 # For Perl 5.6 compatibility, all properties matchable in regexes can
11313 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
11314 # But warn if this creates a conflict with a (new) Unicode property
11315 # name, although it appears that Unicode has made a decision never to
11316 # begin a property name with 'Is_', so this shouldn't happen.
99870f4d
KW
11317 foreach my $alias ($property->aliases) {
11318 my $Is_name = 'Is_' . $alias->name;
807807b7 11319 if (defined (my $pre_existing = property_ref($Is_name))) {
99870f4d 11320 Carp::my_carp(<<END
807807b7
KW
11321There is already an alias named $Is_name (from " . $pre_existing . "), so
11322creating one for $property won't work. This is bad news. If it is not too
11323late, get Unicode to back off. Otherwise go back to the old scheme (findable
11324from the git blame log for this area of the code that suppressed individual
11325aliases that conflict with the new Unicode names. Proceeding anyway.
99870f4d
KW
11326END
11327 );
99870f4d
KW
11328 }
11329 } # End of loop through aliases for this property
11330 } # End of loop through all Unicode properties.
11331
11332 # Fill in the mappings that Unicode doesn't completely furnish. First the
11333 # single letter major general categories. If Unicode were to start
11334 # delivering the values, this would be redundant, but better that than to
11335 # try to figure out if should skip and not get it right. Ths could happen
11336 # if a new major category were to be introduced, and the hard-coded test
11337 # wouldn't know about it.
11338 # This routine depends on the standard names for the general categories
11339 # being what it thinks they are, like 'Cn'. The major categories are the
11340 # union of all the general category tables which have the same first
11341 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
11342 foreach my $minor_table ($gc->tables) {
11343 my $minor_name = $minor_table->name;
11344 next if length $minor_name == 1;
11345 if (length $minor_name != 2) {
11346 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
11347 next;
11348 }
11349
11350 my $major_name = uc(substr($minor_name, 0, 1));
11351 my $major_table = $gc->table($major_name);
11352 $major_table += $minor_table;
11353 }
11354
11355 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
11356 # defines it as LC)
11357 my $LC = $gc->table('LC');
11358 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
11359 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
11360
11361
11362 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
11363 # deliver the correct values in it
11364 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
11365
11366 # Lt not in release 1.
a5c376b7
KW
11367 if (defined $gc->table('Lt')) {
11368 $LC += $gc->table('Lt');
11369 $gc->table('Lt')->set_caseless_equivalent($LC);
11370 }
99870f4d
KW
11371 }
11372 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
11373
a5c376b7
KW
11374 $gc->table('Ll')->set_caseless_equivalent($LC);
11375 $gc->table('Lu')->set_caseless_equivalent($LC);
11376
99870f4d 11377 my $Cs = $gc->table('Cs');
99870f4d
KW
11378
11379
11380 # Folding information was introduced later into Unicode data. To get
11381 # Perl's case ignore (/i) to work at all in releases that don't have
11382 # folding, use the best available alternative, which is lower casing.
11383 my $fold = property_ref('Simple_Case_Folding');
11384 if ($fold->is_empty) {
11385 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
11386 $fold->add_note(join_lines(<<END
11387WARNING: This table uses lower case as a substitute for missing fold
11388information
11389END
11390 ));
11391 }
11392
11393 # Multiple-character mapping was introduced later into Unicode data. If
11394 # missing, use the single-characters maps as best available alternative
11395 foreach my $map (qw { Uppercase_Mapping
11396 Lowercase_Mapping
11397 Titlecase_Mapping
11398 Case_Folding
11399 } ) {
11400 my $full = property_ref($map);
11401 if ($full->is_empty) {
11402 my $simple = property_ref('Simple_' . $map);
11403 $full->initialize($simple);
11404 $full->add_comment($simple->comment) if ($simple->comment);
11405 $full->add_note(join_lines(<<END
11406WARNING: This table uses simple mapping (single-character only) as a
11407substitute for missing multiple-character information
11408END
11409 ));
11410 }
11411 }
82aed44a
KW
11412
11413 # The Script_Extensions property started out as a clone of the Script
11414 # property. But processing its data file caused some elements to be
11415 # replaced with different data. (These elements were for the Common and
11416 # Inherited properties.) This data is a qw() list of all the scripts that
11417 # the code points in the given range are in. An example line is:
11418 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
11419 #
11420 # The code above has created a new match table named "Arab Syrc Thaa"
11421 # which contains 060C. (The cloned table started out with this code point
11422 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
11423 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
11424 # match table. This is repeated for all these tables and ranges. The map
11425 # data is retained in the map table for reference, but the spurious match
11426 # tables are deleted.
11427
11428 my $scx = property_ref("Script_Extensions");
11429 foreach my $table ($scx->tables) {
11430 next unless $table->name =~ /\s/; # Only the new tables have a space
11431 # in their names, and all do
11432 my @scripts = split /\s+/, $table->name;
11433 foreach my $script (@scripts) {
11434 my $script_table = $scx->table($script);
11435 $script_table += $table;
11436 }
11437 $scx->delete_match_table($table);
11438 }
11439
11440 return;
99870f4d
KW
11441}
11442
11443sub compile_perl() {
11444 # Create perl-defined tables. Almost all are part of the pseudo-property
11445 # named 'perl' internally to this program. Many of these are recommended
11446 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
11447 # on those found there.
11448 # Almost all of these are equivalent to some Unicode property.
11449 # A number of these properties have equivalents restricted to the ASCII
11450 # range, with their names prefaced by 'Posix', to signify that these match
11451 # what the Posix standard says they should match. A couple are
11452 # effectively this, but the name doesn't have 'Posix' in it because there
cbc24f92
KW
11453 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
11454 # to the full Unicode range, by our guesses as to what is appropriate.
99870f4d
KW
11455
11456 # 'Any' is all code points. As an error check, instead of just setting it
11457 # to be that, construct it to be the union of all the major categories
7fc6cb55 11458 $Any = $perl->add_match_table('Any',
99870f4d
KW
11459 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
11460 Matches_All => 1);
11461
11462 foreach my $major_table ($gc->tables) {
11463
11464 # Major categories are the ones with single letter names.
11465 next if length($major_table->name) != 1;
11466
11467 $Any += $major_table;
11468 }
11469
11470 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
11471 Carp::my_carp_bug("Generated highest code point ("
11472 . sprintf("%X", $Any->max)
11473 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
11474 }
11475 if ($Any->range_count != 1 || $Any->min != 0) {
11476 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
11477 }
11478
11479 $Any->add_alias('All');
11480
11481 # Assigned is the opposite of gc=unassigned
11482 my $Assigned = $perl->add_match_table('Assigned',
11483 Description => "All assigned code points",
11484 Initialize => ~ $gc->table('Unassigned'),
11485 );
11486
11487 # Our internal-only property should be treated as more than just a
11488 # synonym.
11489 $perl->add_match_table('_CombAbove')
11490 ->set_equivalent_to(property_ref('ccc')->table('Above'),
11491 Related => 1);
11492
11493 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
11494 if (defined $block) { # This is equivalent to the block if have it.
11495 my $Unicode_ASCII = $block->table('Basic_Latin');
11496 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
11497 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
11498 }
11499 }
11500
11501 # Very early releases didn't have blocks, so initialize ASCII ourselves if
11502 # necessary
11503 if ($ASCII->is_empty) {
11504 $ASCII->initialize([ 0..127 ]);
11505 }
11506
99870f4d
KW
11507 # Get the best available case definitions. Early Unicode versions didn't
11508 # have Uppercase and Lowercase defined, so use the general category
11509 # instead for them.
11510 my $Lower = $perl->add_match_table('Lower');
11511 my $Unicode_Lower = property_ref('Lowercase');
11512 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
11513 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
a5c376b7
KW
11514 $Unicode_Lower->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11515 $Unicode_Lower->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11516 $Lower->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11517
99870f4d
KW
11518 }
11519 else {
11520 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
11521 Related => 1);
11522 }
cbc24f92 11523 $Lower->add_alias('XPosixLower');
a5c376b7 11524 my $Posix_Lower = $perl->add_match_table("PosixLower",
ad5e8af1
KW
11525 Description => "[a-z]",
11526 Initialize => $Lower & $ASCII,
11527 );
99870f4d
KW
11528
11529 my $Upper = $perl->add_match_table('Upper');
11530 my $Unicode_Upper = property_ref('Uppercase');
11531 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
11532 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
a5c376b7
KW
11533 $Unicode_Upper->table('Y')->set_caseless_equivalent(property_ref('Cased')->table('Y'));
11534 $Unicode_Upper->table('N')->set_caseless_equivalent(property_ref('Cased')->table('N'));
11535 $Upper->set_caseless_equivalent(property_ref('Cased')->table('Y'));
99870f4d
KW
11536 }
11537 else {
11538 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
11539 Related => 1);
11540 }
cbc24f92 11541 $Upper->add_alias('XPosixUpper');
a5c376b7 11542 my $Posix_Upper = $perl->add_match_table("PosixUpper",
ad5e8af1
KW
11543 Description => "[A-Z]",
11544 Initialize => $Upper & $ASCII,
11545 );
99870f4d
KW
11546
11547 # Earliest releases didn't have title case. Initialize it to empty if not
11548 # otherwise present
4364919a
KW
11549 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
11550 Description => '(= \p{Gc=Lt})');
99870f4d 11551 my $lt = $gc->table('Lt');
a5c376b7
KW
11552
11553 # Earlier versions of mktables had this related to $lt since they have
11554 # identical code points, but their casefolds are not equivalent, and so
11555 # now must be kept as separate entities.
11556 $Title += $lt if defined $lt;
99870f4d
KW
11557
11558 # If this Unicode version doesn't have Cased, set up our own. From
11559 # Unicode 5.1: Definition D120: A character C is defined to be cased if
11560 # and only if C has the Lowercase or Uppercase property or has a
11561 # General_Category value of Titlecase_Letter.
a5c376b7
KW
11562 my $Unicode_Cased = property_ref('Cased');
11563 unless (defined $Unicode_Cased) {
99870f4d
KW
11564 my $cased = $perl->add_match_table('Cased',
11565 Initialize => $Lower + $Upper + $Title,
11566 Description => 'Uppercase or Lowercase or Titlecase',
11567 );
a5c376b7 11568 $Unicode_Cased = $cased;
99870f4d 11569 }
a5c376b7 11570 $Title->set_caseless_equivalent($Unicode_Cased->table('Y'));
99870f4d
KW
11571
11572 # Similarly, set up our own Case_Ignorable property if this Unicode
11573 # version doesn't have it. From Unicode 5.1: Definition D121: A character
11574 # C is defined to be case-ignorable if C has the value MidLetter or the
11575 # value MidNumLet for the Word_Break property or its General_Category is
11576 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
11577 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
11578
11579 # Perl has long had an internal-only alias for this property.
11580 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
11581 my $case_ignorable = property_ref('Case_Ignorable');
11582 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
11583 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
11584 Related => 1);
11585 }
11586 else {
11587
11588 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
11589
11590 # The following three properties are not in early releases
11591 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
11592 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
11593 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
11594
11595 # For versions 4.1 - 5.0, there is no MidNumLet property, and
11596 # correspondingly the case-ignorable definition lacks that one. For
11597 # 4.0, it appears that it was meant to be the same definition, but was
11598 # inadvertently omitted from the standard's text, so add it if the
11599 # property actually is there
11600 my $wb = property_ref('Word_Break');
11601 if (defined $wb) {
11602 my $midlet = $wb->table('MidLetter');
11603 $perl_case_ignorable += $midlet if defined $midlet;
11604 my $midnumlet = $wb->table('MidNumLet');
11605 $perl_case_ignorable += $midnumlet if defined $midnumlet;
11606 }
11607 else {
11608
11609 # In earlier versions of the standard, instead of the above two
11610 # properties , just the following characters were used:
11611 $perl_case_ignorable += 0x0027 # APOSTROPHE
11612 + 0x00AD # SOFT HYPHEN (SHY)
11613 + 0x2019; # RIGHT SINGLE QUOTATION MARK
11614 }
11615 }
11616
11617 # The remaining perl defined tables are mostly based on Unicode TR 18,
11618 # "Annex C: Compatibility Properties". All of these have two versions,
11619 # one whose name generally begins with Posix that is posix-compliant, and
11620 # one that matches Unicode characters beyond the Posix, ASCII range
11621
ad5e8af1 11622 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
11623
11624 # Alphabetic was not present in early releases
11625 my $Alphabetic = property_ref('Alphabetic');
11626 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
11627 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
11628 }
11629 else {
11630
11631 # For early releases, we don't get it exactly right. The below
11632 # includes more than it should, which in 5.2 terms is: L + Nl +
11633 # Other_Alphabetic. Other_Alphabetic contains many characters from
11634 # Mn and Mc. It's better to match more than we should, than less than
11635 # we should.
11636 $Alpha->initialize($gc->table('Letter')
11637 + $gc->table('Mn')
11638 + $gc->table('Mc'));
11639 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 11640 $Alpha->add_description('Alphabetic');
99870f4d 11641 }
cbc24f92 11642 $Alpha->add_alias('XPosixAlpha');
a5c376b7 11643 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
ad5e8af1
KW
11644 Description => "[A-Za-z]",
11645 Initialize => $Alpha & $ASCII,
11646 );
a5c376b7
KW
11647 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
11648 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
99870f4d
KW
11649
11650 my $Alnum = $perl->add_match_table('Alnum',
ad5e8af1 11651 Description => 'Alphabetic and (Decimal) Numeric',
99870f4d
KW
11652 Initialize => $Alpha + $gc->table('Decimal_Number'),
11653 );
cbc24f92 11654 $Alnum->add_alias('XPosixAlnum');
ad5e8af1
KW
11655 $perl->add_match_table("PosixAlnum",
11656 Description => "[A-Za-z0-9]",
11657 Initialize => $Alnum & $ASCII,
11658 );
99870f4d
KW
11659
11660 my $Word = $perl->add_match_table('Word',
d35dd6c6
KW
11661 Description => '\w, including beyond ASCII;'
11662 . ' = \p{Alnum} + \pM + \p{Pc}',
99870f4d
KW
11663 Initialize => $Alnum + $gc->table('Mark'),
11664 );
cbc24f92 11665 $Word->add_alias('XPosixWord');
99870f4d
KW
11666 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
11667 $Word += $Pc if defined $Pc;
11668
f38f76ae 11669 # This is a Perl extension, so the name doesn't begin with Posix.
cbc24f92 11670 my $PerlWord = $perl->add_match_table('PerlWord',
99870f4d
KW
11671 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
11672 Initialize => $Word & $ASCII,
11673 );
cbc24f92 11674 $PerlWord->add_alias('PosixWord');
99870f4d
KW
11675
11676 my $Blank = $perl->add_match_table('Blank',
11677 Description => '\h, Horizontal white space',
11678
11679 # 200B is Zero Width Space which is for line
11680 # break control, and was listed as
11681 # Space_Separator in early releases
11682 Initialize => $gc->table('Space_Separator')
11683 + 0x0009 # TAB
11684 - 0x200B, # ZWSP
11685 );
11686 $Blank->add_alias('HorizSpace'); # Another name for it.
cbc24f92 11687 $Blank->add_alias('XPosixBlank');
ad5e8af1
KW
11688 $perl->add_match_table("PosixBlank",
11689 Description => "\\t and ' '",
11690 Initialize => $Blank & $ASCII,
11691 );
99870f4d
KW
11692
11693 my $VertSpace = $perl->add_match_table('VertSpace',
11694 Description => '\v',
11695 Initialize => $gc->table('Line_Separator')
11696 + $gc->table('Paragraph_Separator')
11697 + 0x000A # LINE FEED
11698 + 0x000B # VERTICAL TAB
11699 + 0x000C # FORM FEED
11700 + 0x000D # CARRIAGE RETURN
11701 + 0x0085, # NEL
11702 );
11703 # No Posix equivalent for vertical space
11704
11705 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
11706 Description => '\s including beyond ASCII plus vertical tab',
11707 Initialize => $Blank + $VertSpace,
99870f4d 11708 );
cbc24f92 11709 $Space->add_alias('XPosixSpace');
ad5e8af1 11710 $perl->add_match_table("PosixSpace",
f38f76ae 11711 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
11712 Initialize => $Space & $ASCII,
11713 );
99870f4d
KW
11714
11715 # Perl's traditional space doesn't include Vertical Tab
cbc24f92 11716 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
99870f4d
KW
11717 Description => '\s, including beyond ASCII',
11718 Initialize => $Space - 0x000B,
11719 );
cbc24f92
KW
11720 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
11721 my $PerlSpace = $perl->add_match_table('PerlSpace',
de25ec47
KW
11722 Description => '\s, restricted to ASCII = [ \f\n\r\t]',
11723 Initialize => $XPerlSpace & $ASCII,
99870f4d
KW
11724 );
11725
cbc24f92 11726
99870f4d 11727 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 11728 Description => 'Control characters');
99870f4d 11729 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
cbc24f92 11730 $Cntrl->add_alias('XPosixCntrl');
ad5e8af1 11731 $perl->add_match_table("PosixCntrl",
f38f76ae 11732 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
11733 Initialize => $Cntrl & $ASCII,
11734 );
99870f4d
KW
11735
11736 # $controls is a temporary used to construct Graph.
11737 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11738 + $gc->table('Control'));
11739 # Cs not in release 1
11740 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11741
11742 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11743 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 11744 Description => 'Characters that are graphical',
99870f4d
KW
11745 Initialize => ~ ($Space + $controls),
11746 );
cbc24f92 11747 $Graph->add_alias('XPosixGraph');
ad5e8af1 11748 $perl->add_match_table("PosixGraph",
f38f76ae
KW
11749 Description =>
11750 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
11751 Initialize => $Graph & $ASCII,
11752 );
99870f4d 11753
3e20195b 11754 $print = $perl->add_match_table('Print',
ad5e8af1 11755 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 11756 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 11757 );
cbc24f92 11758 $print->add_alias('XPosixPrint');
ad5e8af1 11759 $perl->add_match_table("PosixPrint",
66fd7fd0 11760 Description =>
f38f76ae 11761 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 11762 Initialize => $print & $ASCII,
ad5e8af1 11763 );
99870f4d
KW
11764
11765 my $Punct = $perl->add_match_table('Punct');
11766 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11767
11768 # \p{punct} doesn't include the symbols, which posix does
cbc24f92
KW
11769 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
11770 Description => '\p{Punct} + ASCII-range \p{Symbol}',
11771 Initialize => $gc->table('Punctuation')
11772 + ($ASCII & $gc->table('Symbol')),
11773 );
99870f4d 11774 $perl->add_match_table('PosixPunct',
f38f76ae 11775 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
cbc24f92 11776 Initialize => $ASCII & $XPosixPunct,
ad5e8af1 11777 );
99870f4d
KW
11778
11779 my $Digit = $perl->add_match_table('Digit',
f3a73f6e 11780 Description => '[0-9] + all other decimal digits');
99870f4d 11781 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
cbc24f92 11782 $Digit->add_alias('XPosixDigit');
ad5e8af1
KW
11783 my $PosixDigit = $perl->add_match_table("PosixDigit",
11784 Description => '[0-9]',
11785 Initialize => $Digit & $ASCII,
11786 );
99870f4d 11787
eadadd41
KW
11788 # Hex_Digit was not present in first release
11789 my $Xdigit = $perl->add_match_table('XDigit');
cbc24f92 11790 $Xdigit->add_alias('XPosixXDigit');
eadadd41
KW
11791 my $Hex = property_ref('Hex_Digit');
11792 if (defined $Hex && ! $Hex->is_empty) {
11793 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
11794 }
11795 else {
eadadd41
KW
11796 # (Have to use hex instead of e.g. '0', because could be running on an
11797 # non-ASCII machine, and we want the Unicode (ASCII) values)
11798 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11799 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11800 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d 11801 }
4efcc33b
KW
11802
11803 # AHex was not present in early releases
11804 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
11805 my $AHex = property_ref('ASCII_Hex_Digit');
11806 if (defined $AHex && ! $AHex->is_empty) {
11807 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
11808 }
11809 else {
11810 $PosixXDigit->initialize($Xdigit & $ASCII);
11811 }
11812 $PosixXDigit->add_description('[0-9A-Fa-f]');
99870f4d 11813
99870f4d
KW
11814 my $dt = property_ref('Decomposition_Type');
11815 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11816 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11817 Perl_Extension => 1,
d57ccc9a 11818 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
11819 );
11820
11821 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11822 # than SD appeared, construct it ourselves, based on the first release SD
11823 # was in.
11824 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11825 my $soft_dotted = property_ref('Soft_Dotted');
11826 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11827 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11828 }
11829 else {
11830
11831 # This list came from 3.2 Soft_Dotted.
11832 $CanonDCIJ->initialize([ 0x0069,
11833 0x006A,
11834 0x012F,
11835 0x0268,
11836 0x0456,
11837 0x0458,
11838 0x1E2D,
11839 0x1ECB,
11840 ]);
11841 $CanonDCIJ = $CanonDCIJ & $Assigned;
11842 }
11843
f86864ac 11844 # These are used in Unicode's definition of \X
37e2e78e
KW
11845 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11846 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11847
99870f4d 11848 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 11849
678f13d5 11850 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
11851 # definition differs too much from the traditional Perl one to use.
11852 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11853
11854 # Note that assumes HST is defined; it came in an earlier release than
11855 # GCB. In the line below, two negatives means: yes hangul
11856 $begin += ~ property_ref('Hangul_Syllable_Type')
11857 ->table('Not_Applicable')
11858 + ~ ($gcb->table('Control')
11859 + $gcb->table('CR')
11860 + $gcb->table('LF'));
11861 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11862
11863 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11864 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
11865 }
11866 else { # Old definition, used on early releases.
f86864ac 11867 $extend += $gc->table('Mark')
37e2e78e
KW
11868 + 0x200C # ZWNJ
11869 + 0x200D; # ZWJ
11870 $begin += ~ $extend;
11871
11872 # Here we may have a release that has the regular grapheme cluster
11873 # defined, or a release that doesn't have anything defined.
11874 # We set things up so the Perl core degrades gracefully, possibly with
11875 # placeholders that match nothing.
11876
11877 if (! defined $gcb) {
11878 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11879 }
11880 my $hst = property_ref('HST');
11881 if (!defined $hst) {
11882 $hst = Property->new('HST', Status => $PLACEHOLDER);
11883 $hst->add_match_table('Not_Applicable',
11884 Initialize => $Any,
11885 Matches_All => 1);
11886 }
11887
11888 # On some releases, here we may not have the needed tables for the
11889 # perl core, in some releases we may.
11890 foreach my $name (qw{ L LV LVT T V prepend }) {
11891 my $table = $gcb->table($name);
11892 if (! defined $table) {
11893 $table = $gcb->add_match_table($name);
11894 push @tables_that_may_be_empty, $table->complete_name;
11895 }
11896
11897 # The HST property predates the GCB one, and has identical tables
11898 # for some of them, so use it if we can.
11899 if ($table->is_empty
11900 && defined $hst
11901 && defined $hst->table($name))
11902 {
11903 $table += $hst->table($name);
11904 }
11905 }
11906 }
11907
11908 # More GCB. If we found some hangul syllables, populate a combined
11909 # table.
11910 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11911 my $LV = $gcb->table('LV');
11912 if ($LV->is_empty) {
11913 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11914 } else {
11915 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11916 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
11917 }
11918
28093d0e 11919 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
11920 my @composition = ('Name', 'Unicode_1_Name');
11921
11922 if (@named_sequences) {
11923 push @composition, 'Named_Sequence';
11924 foreach my $sequence (@named_sequences) {
11925 $perl_charname->add_anomalous_entry($sequence);
11926 }
11927 }
11928
11929 my $alias_sentence = "";
11930 my $alias = property_ref('Name_Alias');
11931 if (defined $alias) {
11932 push @composition, 'Name_Alias';
11933 $alias->reset_each_range;
11934 while (my ($range) = $alias->each_range) {
11935 next if $range->value eq "";
11936 if ($range->start != $range->end) {
11937 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
11938 }
11939 $perl_charname->add_duplicate($range->start, $range->value);
11940 }
11941 $alias_sentence = <<END;
11942The Name_Alias property adds duplicate code point entries with a corrected
11943name. The original (less correct, but still valid) name will be physically
53d84487 11944last.
99870f4d
KW
11945END
11946 }
11947 my $comment;
11948 if (@composition <= 2) { # Always at least 2
11949 $comment = join " and ", @composition;
11950 }
11951 else {
11952 $comment = join ", ", @composition[0 .. scalar @composition - 2];
11953 $comment .= ", and $composition[-1]";
11954 }
11955
99870f4d
KW
11956 $perl_charname->add_comment(join_lines( <<END
11957This file is for charnames.pm. It is the union of the $comment properties.
11958Unicode_1_Name entries are used only for otherwise nameless code
11959points.
11960$alias_sentence
11961END
11962 ));
11963
11964 # The combining class property used by Perl's normalize.pm is not located
11965 # in the normal mapping directory; create a copy for it.
11966 my $ccc = property_ref('Canonical_Combining_Class');
11967 my $perl_ccc = Property->new('Perl_ccc',
11968 Default_Map => $ccc->default_map,
11969 Full_Name => 'Perl_Canonical_Combining_Class',
11970 Internal_Only_Warning => 1,
11971 Perl_Extension => 1,
11972 Pod_Entry =>0,
11973 Type => $ENUM,
11974 Initialize => $ccc,
11975 File => 'CombiningClass',
517956bf 11976 Directory => File::Spec->curdir(),
99870f4d 11977 );
8572ace0 11978 $perl_ccc->set_to_output_map($EXTERNAL_MAP);
99870f4d
KW
11979 $perl_ccc->add_comment(join_lines(<<END
11980This mapping is for normalize.pm. It is currently identical to the Unicode
11981Canonical_Combining_Class property.
11982END
11983 ));
11984
11985 # This one match table for it is needed for calculations on output
11986 my $default = $perl_ccc->add_match_table($ccc->default_map,
11987 Initialize => $ccc->table($ccc->default_map),
11988 Status => $SUPPRESSED);
11989
11990 # Construct the Present_In property from the Age property.
11991 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11992 my $default_map = $age->default_map;
11993 my $in = Property->new('In',
11994 Default_Map => $default_map,
11995 Full_Name => "Present_In",
11996 Internal_Only_Warning => 1,
11997 Perl_Extension => 1,
11998 Type => $ENUM,
11999 Initialize => $age,
12000 );
12001 $in->add_comment(join_lines(<<END
12002This file should not be used for any purpose. The values in this file are the
12003same as for $age, and not for what $in really means. This is because anything
12004defined in a given release should have multiple values: that release and all
12005higher ones. But only one value per code point can be represented in a table
12006like this.
12007END
12008 ));
12009
12010 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
12011 # lowest numbered (earliest) come first, with the non-numeric one
12012 # last.
12013 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
12014 ? 1
12015 : ($b->name !~ /^[\d.]*$/)
12016 ? -1
12017 : $a->name <=> $b->name
12018 } $age->tables;
12019
12020 # The Present_In property is the cumulative age properties. The first
12021 # one hence is identical to the first age one.
12022 my $previous_in = $in->add_match_table($first_age->name);
12023 $previous_in->set_equivalent_to($first_age, Related => 1);
12024
12025 my $description_start = "Code point's usage introduced in version ";
12026 $first_age->add_description($description_start . $first_age->name);
12027
98dc9551 12028 # To construct the accumulated values, for each of the age tables
99870f4d
KW
12029 # starting with the 2nd earliest, merge the earliest with it, to get
12030 # all those code points existing in the 2nd earliest. Repeat merging
12031 # the new 2nd earliest with the 3rd earliest to get all those existing
12032 # in the 3rd earliest, and so on.
12033 foreach my $current_age (@rest_ages) {
12034 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
12035
12036 my $current_in = $in->add_match_table(
12037 $current_age->name,
12038 Initialize => $current_age + $previous_in,
12039 Description => $description_start
12040 . $current_age->name
12041 . ' or earlier',
12042 );
12043 $previous_in = $current_in;
12044
12045 # Add clarifying material for the corresponding age file. This is
12046 # in part because of the confusing and contradictory information
12047 # given in the Standard's documentation itself, as of 5.2.
12048 $current_age->add_description(
12049 "Code point's usage was introduced in version "
12050 . $current_age->name);
12051 $current_age->add_note("See also $in");
12052
12053 }
12054
12055 # And finally the code points whose usages have yet to be decided are
12056 # the same in both properties. Note that permanently unassigned code
12057 # points actually have their usage assigned (as being permanently
12058 # unassigned), so that these tables are not the same as gc=cn.
12059 my $unassigned = $in->add_match_table($default_map);
12060 my $age_default = $age->table($default_map);
12061 $age_default->add_description(<<END
12062Code point's usage has not been assigned in any Unicode release thus far.
12063END
12064 );
12065 $unassigned->set_equivalent_to($age_default, Related => 1);
12066 }
12067
12068
12069 # Finished creating all the perl properties. All non-internal non-string
12070 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
12071 # an underscore.) These do not get a separate entry in the pod file
12072 foreach my $table ($perl->tables) {
12073 foreach my $alias ($table->aliases) {
12074 next if $alias->name =~ /^_/;
12075 $table->add_alias('Is_' . $alias->name,
12076 Pod_Entry => 0,
12077 Status => $alias->status,
12078 Externally_Ok => 0);
12079 }
12080 }
12081
c4019d52
KW
12082 # Here done with all the basic stuff. Ready to populate the information
12083 # about each character if annotating them.
558712cf 12084 if ($annotate) {
c4019d52
KW
12085
12086 # See comments at its declaration
12087 $annotate_ranges = Range_Map->new;
12088
12089 # This separates out the non-characters from the other unassigneds, so
12090 # can give different annotations for each.
12091 $unassigned_sans_noncharacters = Range_List->new(
12092 Initialize => $gc->table('Unassigned')
12093 & property_ref('Noncharacter_Code_Point')->table('N'));
12094
12095 for (my $i = 0; $i <= $LAST_UNICODE_CODEPOINT; $i++ ) {
12096 $i = populate_char_info($i); # Note sets $i so may cause skips
12097 }
12098 }
12099
99870f4d
KW
12100 return;
12101}
12102
12103sub add_perl_synonyms() {
12104 # A number of Unicode tables have Perl synonyms that are expressed in
12105 # the single-form, \p{name}. These are:
12106 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
12107 # \p{Is_Name} as synonyms
12108 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
12109 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
12110 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
12111 # conflict, \p{Value} and \p{Is_Value} as well
12112 #
12113 # This routine generates these synonyms, warning of any unexpected
12114 # conflicts.
12115
12116 # Construct the list of tables to get synonyms for. Start with all the
12117 # binary and the General_Category ones.
12118 my @tables = grep { $_->type == $BINARY } property_ref('*');
12119 push @tables, $gc->tables;
12120
12121 # If the version of Unicode includes the Script property, add its tables
12122 if (defined property_ref('Script')) {
12123 push @tables, property_ref('Script')->tables;
12124 }
12125
12126 # The Block tables are kept separate because they are treated differently.
12127 # And the earliest versions of Unicode didn't include them, so add only if
12128 # there are some.
12129 my @blocks;
12130 push @blocks, $block->tables if defined $block;
12131
12132 # Here, have the lists of tables constructed. Process blocks last so that
12133 # if there are name collisions with them, blocks have lowest priority.
12134 # Should there ever be other collisions, manual intervention would be
12135 # required. See the comments at the beginning of the program for a
12136 # possible way to handle those semi-automatically.
12137 foreach my $table (@tables, @blocks) {
12138
12139 # For non-binary properties, the synonym is just the name of the
12140 # table, like Greek, but for binary properties the synonym is the name
12141 # of the property, and means the code points in its 'Y' table.
12142 my $nominal = $table;
12143 my $nominal_property = $nominal->property;
12144 my $actual;
12145 if (! $nominal->isa('Property')) {
12146 $actual = $table;
12147 }
12148 else {
12149
12150 # Here is a binary property. Use the 'Y' table. Verify that is
12151 # there
12152 my $yes = $nominal->table('Y');
12153 unless (defined $yes) { # Must be defined, but is permissible to
12154 # be empty.
12155 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
12156 next;
12157 }
12158 $actual = $yes;
12159 }
12160
12161 foreach my $alias ($nominal->aliases) {
12162
12163 # Attempt to create a table in the perl directory for the
12164 # candidate table, using whatever aliases in it that don't
12165 # conflict. Also add non-conflicting aliases for all these
12166 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
12167 PREFIX:
12168 foreach my $prefix ("", 'Is_', 'In_') {
12169
12170 # Only Block properties can have added 'In_' aliases.
12171 next if $prefix eq 'In_' and $nominal_property != $block;
12172
12173 my $proposed_name = $prefix . $alias->name;
12174
12175 # No Is_Is, In_In, nor combinations thereof
12176 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
12177 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
12178
12179 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
12180
12181 # Get a reference to any existing table in the perl
12182 # directory with the desired name.
12183 my $pre_existing = $perl->table($proposed_name);
12184
12185 if (! defined $pre_existing) {
12186
12187 # No name collision, so ok to add the perl synonym.
12188
12189 my $make_pod_entry;
12190 my $externally_ok;
4cd1260a 12191 my $status = $alias->status;
99870f4d
KW
12192 if ($nominal_property == $block) {
12193
12194 # For block properties, the 'In' form is preferred for
12195 # external use; the pod file contains wild cards for
12196 # this and the 'Is' form so no entries for those; and
12197 # we don't want people using the name without the
12198 # 'In', so discourage that.
12199 if ($prefix eq "") {
12200 $make_pod_entry = 1;
12201 $status = $status || $DISCOURAGED;
12202 $externally_ok = 0;
12203 }
12204 elsif ($prefix eq 'In_') {
12205 $make_pod_entry = 0;
12206 $status = $status || $NORMAL;
12207 $externally_ok = 1;
12208 }
12209 else {
12210 $make_pod_entry = 0;
12211 $status = $status || $DISCOURAGED;
12212 $externally_ok = 0;
12213 }
12214 }
12215 elsif ($prefix ne "") {
12216
12217 # The 'Is' prefix is handled in the pod by a wild
12218 # card, and we won't use it for an external name
12219 $make_pod_entry = 0;
12220 $status = $status || $NORMAL;
12221 $externally_ok = 0;
12222 }
12223 else {
12224
12225 # Here, is an empty prefix, non block. This gets its
12226 # own pod entry and can be used for an external name.
12227 $make_pod_entry = 1;
12228 $status = $status || $NORMAL;
12229 $externally_ok = 1;
12230 }
12231
12232 # Here, there isn't a perl pre-existing table with the
12233 # name. Look through the list of equivalents of this
12234 # table to see if one is a perl table.
12235 foreach my $equivalent ($actual->leader->equivalents) {
12236 next if $equivalent->property != $perl;
12237
12238 # Here, have found a table for $perl. Add this alias
12239 # to it, and are done with this prefix.
12240 $equivalent->add_alias($proposed_name,
12241 Pod_Entry => $make_pod_entry,
12242 Status => $status,
12243 Externally_Ok => $externally_ok);
12244 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
12245 next PREFIX;
12246 }
12247
12248 # Here, $perl doesn't already have a table that is a
12249 # synonym for this property, add one.
12250 my $added_table = $perl->add_match_table($proposed_name,
12251 Pod_Entry => $make_pod_entry,
12252 Status => $status,
12253 Externally_Ok => $externally_ok);
12254 # And it will be related to the actual table, since it is
12255 # based on it.
12256 $added_table->set_equivalent_to($actual, Related => 1);
12257 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
12258 next;
12259 } # End of no pre-existing.
12260
12261 # Here, there is a pre-existing table that has the proposed
12262 # name. We could be in trouble, but not if this is just a
12263 # synonym for another table that we have already made a child
12264 # of the pre-existing one.
6505c6e2 12265 if ($pre_existing->is_set_equivalent_to($actual)) {
99870f4d
KW
12266 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
12267 $pre_existing->add_alias($proposed_name);
12268 next;
12269 }
12270
12271 # Here, there is a name collision, but it still could be ok if
12272 # the tables match the identical set of code points, in which
12273 # case, we can combine the names. Compare each table's code
12274 # point list to see if they are identical.
12275 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
12276 if ($pre_existing->matches_identically_to($actual)) {
12277
12278 # Here, they do match identically. Not a real conflict.
12279 # Make the perl version a child of the Unicode one, except
12280 # in the non-obvious case of where the perl name is
12281 # already a synonym of another Unicode property. (This is
12282 # excluded by the test for it being its own parent.) The
12283 # reason for this exclusion is that then the two Unicode
12284 # properties become related; and we don't really know if
12285 # they are or not. We generate documentation based on
12286 # relatedness, and this would be misleading. Code
12287 # later executed in the process will cause the tables to
12288 # be represented by a single file anyway, without making
12289 # it look in the pod like they are necessarily related.
12290 if ($pre_existing->parent == $pre_existing
12291 && ($pre_existing->property == $perl
12292 || $actual->property == $perl))
12293 {
12294 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
12295 $pre_existing->set_equivalent_to($actual, Related => 1);
12296 }
12297 elsif (main::DEBUG && $to_trace) {
12298 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
12299 trace $pre_existing->parent;
12300 }
12301 next PREFIX;
12302 }
12303
12304 # Here they didn't match identically, there is a real conflict
12305 # between our new name and a pre-existing property.
12306 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
12307 $pre_existing->add_conflicting($nominal->full_name,
12308 'p',
12309 $actual);
12310
12311 # Don't output a warning for aliases for the block
12312 # properties (unless they start with 'In_') as it is
12313 # expected that there will be conflicts and the block
12314 # form loses.
12315 if ($verbosity >= $NORMAL_VERBOSITY
12316 && ($actual->property != $block || $prefix eq 'In_'))
12317 {
12318 print simple_fold(join_lines(<<END
12319There is already an alias named $proposed_name (from " . $pre_existing . "),
12320so not creating this alias for " . $actual
12321END
12322 ), "", 4);
12323 }
12324
12325 # Keep track for documentation purposes.
12326 $has_In_conflicts++ if $prefix eq 'In_';
12327 $has_Is_conflicts++ if $prefix eq 'Is_';
12328 }
12329 }
12330 }
12331
12332 # There are some properties which have No and Yes (and N and Y) as
12333 # property values, but aren't binary, and could possibly be confused with
12334 # binary ones. So create caveats for them. There are tables that are
12335 # named 'No', and tables that are named 'N', but confusion is not likely
12336 # unless they are the same table. For example, N meaning Number or
12337 # Neutral is not likely to cause confusion, so don't add caveats to things
12338 # like them.
12339 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
12340 my $yes = $property->table('Yes');
12341 if (defined $yes) {
12342 my $y = $property->table('Y');
12343 if (defined $y && $yes == $y) {
12344 foreach my $alias ($property->aliases) {
12345 $yes->add_conflicting($alias->name);
12346 }
12347 }
12348 }
12349 my $no = $property->table('No');
12350 if (defined $no) {
12351 my $n = $property->table('N');
12352 if (defined $n && $no == $n) {
12353 foreach my $alias ($property->aliases) {
12354 $no->add_conflicting($alias->name, 'P');
12355 }
12356 }
12357 }
12358 }
12359
12360 return;
12361}
12362
12363sub register_file_for_name($$$) {
12364 # Given info about a table and a datafile that it should be associated
98dc9551 12365 # with, register that association
99870f4d
KW
12366
12367 my $table = shift;
12368 my $directory_ref = shift; # Array of the directory path for the file
e6ebc4c0 12369 my $file = shift; # The file name in the final directory.
99870f4d
KW
12370 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12371
12372 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
12373
12374 if ($table->isa('Property')) {
12375 $table->set_file_path(@$directory_ref, $file);
12376 push @map_properties, $table
12377 if $directory_ref->[0] eq $map_directory;
12378 return;
12379 }
12380
12381 # Do all of the work for all equivalent tables when called with the leader
12382 # table, so skip if isn't the leader.
12383 return if $table->leader != $table;
12384
a92d5c2e
KW
12385 # If this is a complement of another file, use that other file instead,
12386 # with a ! prepended to it.
12387 my $complement;
12388 if (($complement = $table->complement) != 0) {
12389 my @directories = $complement->file_path;
12390
12391 # This assumes that the 0th element is something like 'lib',
12392 # the 1th element the property name (in its own directory), like
12393 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
12394 # appended to it later.
12395 $directories[1] =~ s/^/!/;
12396 $file = pop @directories;
12397 $directory_ref =\@directories;
12398 }
12399
99870f4d
KW
12400 # Join all the file path components together, using slashes.
12401 my $full_filename = join('/', @$directory_ref, $file);
12402
12403 # All go in the same subdirectory of unicore
12404 if ($directory_ref->[0] ne $matches_directory) {
12405 Carp::my_carp("Unexpected directory in "
12406 . join('/', @{$directory_ref}, $file));
12407 }
12408
12409 # For this table and all its equivalents ...
12410 foreach my $table ($table, $table->equivalents) {
12411
12412 # Associate it with its file internally. Don't include the
12413 # $matches_directory first component
12414 $table->set_file_path(@$directory_ref, $file);
12415 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
12416
12417 my $property = $table->property;
12418 $property = ($property == $perl)
12419 ? "" # 'perl' is never explicitly stated
12420 : standardize($property->name) . '=';
12421
12422 my $deprecated = ($table->status eq $DEPRECATED)
12423 ? $table->status_info
12424 : "";
d867ccfb 12425 my $caseless_equivalent = $table->caseless_equivalent;
99870f4d
KW
12426
12427 # And for each of the table's aliases... This inner loop eventually
12428 # goes through all aliases in the UCD that we generate regex match
12429 # files for
12430 foreach my $alias ($table->aliases) {
c85f591a 12431 my $standard = utf8_heavy_name($table, $alias);
99870f4d
KW
12432
12433 # Generate an entry in either the loose or strict hashes, which
12434 # will translate the property and alias names combination into the
12435 # file where the table for them is stored.
99870f4d 12436 if ($alias->loose_match) {
99870f4d
KW
12437 if (exists $loose_to_file_of{$standard}) {
12438 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
12439 }
12440 else {
12441 $loose_to_file_of{$standard} = $sub_filename;
12442 }
12443 }
12444 else {
99870f4d
KW
12445 if (exists $stricter_to_file_of{$standard}) {
12446 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
12447 }
12448 else {
12449 $stricter_to_file_of{$standard} = $sub_filename;
12450
12451 # Tightly coupled with how utf8_heavy.pl works, for a
12452 # floating point number that is a whole number, get rid of
12453 # the trailing decimal point and 0's, so that utf8_heavy
12454 # will work. Also note that this assumes that such a
12455 # number is matched strictly; so if that were to change,
12456 # this would be wrong.
c85f591a 12457 if ((my $integer_name = $alias->name)
99870f4d
KW
12458 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
12459 {
12460 $stricter_to_file_of{$property . $integer_name}
12461 = $sub_filename;
12462 }
12463 }
12464 }
12465
12466 # Keep a list of the deprecated properties and their filenames
a92d5c2e 12467 if ($deprecated && $complement == 0) {
99870f4d
KW
12468 $utf8::why_deprecated{$sub_filename} = $deprecated;
12469 }
d867ccfb
KW
12470
12471 # And a substitute table, if any, for case-insensitive matching
12472 if ($caseless_equivalent != 0) {
12473 $caseless_equivalent_to{$standard} = $caseless_equivalent;
12474 }
99870f4d
KW
12475 }
12476 }
12477
12478 return;
12479}
12480
12481{ # Closure
12482 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
12483 # conflicts
12484 my %full_dir_name_of; # Full length names of directories used.
12485
12486 sub construct_filename($$$) {
12487 # Return a file name for a table, based on the table name, but perhaps
12488 # changed to get rid of non-portable characters in it, and to make
12489 # sure that it is unique on a file system that allows the names before
12490 # any period to be at most 8 characters (DOS). While we're at it
12491 # check and complain if there are any directory conflicts.
12492
12493 my $name = shift; # The name to start with
12494 my $mutable = shift; # Boolean: can it be changed? If no, but
12495 # yet it must be to work properly, a warning
12496 # is given
12497 my $directories_ref = shift; # A reference to an array containing the
12498 # path to the file, with each element one path
12499 # component. This is used because the same
12500 # name can be used in different directories.
12501 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12502
12503 my $warn = ! defined wantarray; # If true, then if the name is
12504 # changed, a warning is issued as well.
12505
12506 if (! defined $name) {
12507 Carp::my_carp("Undefined name in directory "
12508 . File::Spec->join(@$directories_ref)
12509 . ". '_' used");
12510 return '_';
12511 }
12512
12513 # Make sure that no directory names conflict with each other. Look at
12514 # each directory in the input file's path. If it is already in use,
12515 # assume it is correct, and is merely being re-used, but if we
12516 # truncate it to 8 characters, and find that there are two directories
12517 # that are the same for the first 8 characters, but differ after that,
12518 # then that is a problem.
12519 foreach my $directory (@$directories_ref) {
12520 my $short_dir = substr($directory, 0, 8);
12521 if (defined $full_dir_name_of{$short_dir}) {
12522 next if $full_dir_name_of{$short_dir} eq $directory;
12523 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
12524 }
12525 else {
12526 $full_dir_name_of{$short_dir} = $directory;
12527 }
12528 }
12529
12530 my $path = join '/', @$directories_ref;
12531 $path .= '/' if $path;
12532
12533 # Remove interior underscores.
12534 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
12535
12536 # Change any non-word character into an underscore, and truncate to 8.
12537 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
12538 substr($filename, 8) = "" if length($filename) > 8;
12539
12540 # Make sure the basename doesn't conflict with something we
12541 # might have already written. If we have, say,
12542 # InGreekExtended1
12543 # InGreekExtended2
12544 # they become
12545 # InGreekE
12546 # InGreek2
12547 my $warned = 0;
12548 while (my $num = $base_names{$path}{lc $filename}++) {
12549 $num++; # so basenames with numbers start with '2', which
12550 # just looks more natural.
12551
12552 # Want to append $num, but if it'll make the basename longer
12553 # than 8 characters, pre-truncate $filename so that the result
12554 # is acceptable.
12555 my $delta = length($filename) + length($num) - 8;
12556 if ($delta > 0) {
12557 substr($filename, -$delta) = $num;
12558 }
12559 else {
12560 $filename .= $num;
12561 }
12562 if ($warn && ! $warned) {
12563 $warned = 1;
12564 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
12565 }
12566 }
12567
12568 return $filename if $mutable;
12569
12570 # If not changeable, must return the input name, but warn if needed to
12571 # change it beyond shortening it.
12572 if ($name ne $filename
12573 && substr($name, 0, length($filename)) ne $filename) {
12574 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
12575 }
12576 return $name;
12577 }
12578}
12579
12580# The pod file contains a very large table. Many of the lines in that table
12581# would exceed a typical output window's size, and so need to be wrapped with
12582# a hanging indent to make them look good. The pod language is really
12583# insufficient here. There is no general construct to do that in pod, so it
12584# is done here by beginning each such line with a space to cause the result to
12585# be output without formatting, and doing all the formatting here. This leads
12586# to the result that if the eventual display window is too narrow it won't
12587# look good, and if the window is too wide, no advantage is taken of that
12588# extra width. A further complication is that the output may be indented by
12589# the formatter so that there is less space than expected. What I (khw) have
12590# done is to assume that that indent is a particular number of spaces based on
12591# what it is in my Linux system; people can always resize their windows if
12592# necessary, but this is obviously less than desirable, but the best that can
12593# be expected.
12594my $automatic_pod_indent = 8;
12595
12596# Try to format so that uses fewest lines, but few long left column entries
12597# slide into the right column. An experiment on 5.1 data yielded the
12598# following percentages that didn't cut into the other side along with the
12599# associated first-column widths
12600# 69% = 24
12601# 80% not too bad except for a few blocks
12602# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
12603# 95% = 37;
12604my $indent_info_column = 27; # 75% of lines didn't have overlap
12605
12606my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
12607 # The 3 is because of:
12608 # 1 for the leading space to tell the pod formatter to
12609 # output as-is
12610 # 1 for the flag
12611 # 1 for the space between the flag and the main data
12612
12613sub format_pod_line ($$$;$$) {
12614 # Take a pod line and return it, formatted properly
12615
12616 my $first_column_width = shift;
12617 my $entry = shift; # Contents of left column
12618 my $info = shift; # Contents of right column
12619
12620 my $status = shift || ""; # Any flag
12621
12622 my $loose_match = shift; # Boolean.
12623 $loose_match = 1 unless defined $loose_match;
12624
12625 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12626
12627 my $flags = "";
12628 $flags .= $STRICTER if ! $loose_match;
12629
12630 $flags .= $status if $status;
12631
12632 # There is a blank in the left column to cause the pod formatter to
12633 # output the line as-is.
12634 return sprintf " %-*s%-*s %s\n",
12635 # The first * in the format is replaced by this, the -1 is
12636 # to account for the leading blank. There isn't a
12637 # hard-coded blank after this to separate the flags from
12638 # the rest of the line, so that in the unlikely event that
12639 # multiple flags are shown on the same line, they both
12640 # will get displayed at the expense of that separation,
12641 # but since they are left justified, a blank will be
12642 # inserted in the normal case.
12643 $FILLER - 1,
12644 $flags,
12645
12646 # The other * in the format is replaced by this number to
12647 # cause the first main column to right fill with blanks.
12648 # The -1 is for the guaranteed blank following it.
12649 $first_column_width - $FILLER - 1,
12650 $entry,
12651 $info;
12652}
12653
12654my @zero_match_tables; # List of tables that have no matches in this release
12655
12656sub make_table_pod_entries($) {
12657 # This generates the entries for the pod file for a given table.
12658 # Also done at this time are any children tables. The output looks like:
12659 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
12660
12661 my $input_table = shift; # Table the entry is for
12662 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12663
12664 # Generate parent and all its children at the same time.
12665 return if $input_table->parent != $input_table;
12666
12667 my $property = $input_table->property;
12668 my $type = $property->type;
12669 my $full_name = $property->full_name;
12670
12671 my $count = $input_table->count;
12672 my $string_count = clarify_number($count);
12673 my $status = $input_table->status;
12674 my $status_info = $input_table->status_info;
56ca34ca 12675 my $caseless_equivalent = $input_table->caseless_equivalent;
99870f4d
KW
12676
12677 my $entry_for_first_table; # The entry for the first table output.
12678 # Almost certainly, it is the parent.
12679
12680 # For each related table (including itself), we will generate a pod entry
12681 # for each name each table goes by
12682 foreach my $table ($input_table, $input_table->children) {
12683
12684 # utf8_heavy.pl cannot deal with null string property values, so don't
12685 # output any.
12686 next if $table->name eq "";
12687
12688 # First, gather all the info that applies to this table as a whole.
12689
12690 push @zero_match_tables, $table if $count == 0;
12691
12692 my $table_property = $table->property;
12693
12694 # The short name has all the underscores removed, while the full name
12695 # retains them. Later, we decide whether to output a short synonym
12696 # for the full one, we need to compare apples to apples, so we use the
12697 # short name's length including underscores.
12698 my $table_property_short_name_length;
12699 my $table_property_short_name
12700 = $table_property->short_name(\$table_property_short_name_length);
12701 my $table_property_full_name = $table_property->full_name;
12702
12703 # Get how much savings there is in the short name over the full one
12704 # (delta will always be <= 0)
12705 my $table_property_short_delta = $table_property_short_name_length
12706 - length($table_property_full_name);
12707 my @table_description = $table->description;
12708 my @table_note = $table->note;
12709
12710 # Generate an entry for each alias in this table.
12711 my $entry_for_first_alias; # saves the first one encountered.
12712 foreach my $alias ($table->aliases) {
12713
12714 # Skip if not to go in pod.
12715 next unless $alias->make_pod_entry;
12716
12717 # Start gathering all the components for the entry
12718 my $name = $alias->name;
12719
12720 my $entry; # Holds the left column, may include extras
12721 my $entry_ref; # To refer to the left column's contents from
12722 # another entry; has no extras
12723
12724 # First the left column of the pod entry. Tables for the $perl
12725 # property always use the single form.
12726 if ($table_property == $perl) {
12727 $entry = "\\p{$name}";
12728 $entry_ref = "\\p{$name}";
12729 }
12730 else { # Compound form.
12731
12732 # Only generate one entry for all the aliases that mean true
12733 # or false in binary properties. Append a '*' to indicate
12734 # some are missing. (The heading comment notes this.)
12735 my $wild_card_mark;
12736 if ($type == $BINARY) {
12737 next if $name ne 'N' && $name ne 'Y';
12738 $wild_card_mark = '*';
12739 }
12740 else {
12741 $wild_card_mark = "";
12742 }
12743
12744 # Colon-space is used to give a little more space to be easier
12745 # to read;
12746 $entry = "\\p{"
12747 . $table_property_full_name
12748 . ": $name$wild_card_mark}";
12749
12750 # But for the reference to this entry, which will go in the
12751 # right column, where space is at a premium, use equals
12752 # without a space
12753 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
12754 }
12755
12756 # Then the right (info) column. This is stored as components of
12757 # an array for the moment, then joined into a string later. For
12758 # non-internal only properties, begin the info with the entry for
12759 # the first table we encountered (if any), as things are ordered
12760 # so that that one is the most descriptive. This leads to the
12761 # info column of an entry being a more descriptive version of the
12762 # name column
12763 my @info;
12764 if ($name =~ /^_/) {
12765 push @info,
12766 '(For internal use by Perl, not necessarily stable)';
12767 }
12768 elsif ($entry_for_first_alias) {
12769 push @info, $entry_for_first_alias;
12770 }
12771
12772 # If this entry is equivalent to another, add that to the info,
12773 # using the first such table we encountered
12774 if ($entry_for_first_table) {
12775 if (@info) {
12776 push @info, "(= $entry_for_first_table)";
12777 }
12778 else {
12779 push @info, $entry_for_first_table;
12780 }
12781 }
12782
12783 # If the name is a large integer, add an equivalent with an
12784 # exponent for better readability
12785 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12786 push @info, sprintf "(= %.1e)", $name
12787 }
12788
12789 my $parenthesized = "";
12790 if (! $entry_for_first_alias) {
12791
12792 # This is the first alias for the current table. The alias
12793 # array is ordered so that this is the fullest, most
12794 # descriptive alias, so it gets the fullest info. The other
12795 # aliases are mostly merely pointers to this one, using the
12796 # information already added above.
12797
12798 # Display any status message, but only on the parent table
12799 if ($status && ! $entry_for_first_table) {
12800 push @info, $status_info;
12801 }
12802
12803 # Put out any descriptive info
12804 if (@table_description || @table_note) {
12805 push @info, join "; ", @table_description, @table_note;
12806 }
12807
12808 # Look to see if there is a shorter name we can point people
12809 # at
12810 my $standard_name = standardize($name);
12811 my $short_name;
12812 my $proposed_short = $table->short_name;
12813 if (defined $proposed_short) {
12814 my $standard_short = standardize($proposed_short);
12815
12816 # If the short name is shorter than the standard one, or
12817 # even it it's not, but the combination of it and its
12818 # short property name (as in \p{prop=short} ($perl doesn't
12819 # have this form)) saves at least two characters, then,
12820 # cause it to be listed as a shorter synonym.
12821 if (length $standard_short < length $standard_name
12822 || ($table_property != $perl
12823 && (length($standard_short)
12824 - length($standard_name)
12825 + $table_property_short_delta) # (<= 0)
12826 < -2))
12827 {
12828 $short_name = $proposed_short;
12829 if ($table_property != $perl) {
12830 $short_name = $table_property_short_name
12831 . "=$short_name";
12832 }
12833 $short_name = "\\p{$short_name}";
12834 }
12835 }
12836
12837 # And if this is a compound form name, see if there is a
12838 # single form equivalent
12839 my $single_form;
12840 if ($table_property != $perl) {
12841
12842 # Special case the binary N tables, so that will print
12843 # \P{single}, but use the Y table values to populate
12844 # 'single', as we haven't populated the N table.
12845 my $test_table;
12846 my $p;
12847 if ($type == $BINARY
12848 && $input_table == $property->table('No'))
12849 {
12850 $test_table = $property->table('Yes');
12851 $p = 'P';
12852 }
12853 else {
12854 $test_table = $input_table;
12855 $p = 'p';
12856 }
12857
12858 # Look for a single form amongst all the children.
12859 foreach my $table ($test_table->children) {
12860 next if $table->property != $perl;
12861 my $proposed_name = $table->short_name;
12862 next if ! defined $proposed_name;
12863
12864 # Don't mention internal-only properties as a possible
12865 # single form synonym
12866 next if substr($proposed_name, 0, 1) eq '_';
12867
12868 $proposed_name = "\\$p\{$proposed_name}";
12869 if (! defined $single_form
12870 || length($proposed_name) < length $single_form)
12871 {
12872 $single_form = $proposed_name;
12873
12874 # The goal here is to find a single form; not the
12875 # shortest possible one. We've already found a
12876 # short name. So, stop at the first single form
12877 # found, which is likely to be closer to the
12878 # original.
12879 last;
12880 }
12881 }
12882 }
12883
12884 # Ouput both short and single in the same parenthesized
12885 # expression, but with only one of 'Single', 'Short' if there
12886 # are both items.
12887 if ($short_name || $single_form || $table->conflicting) {
99870f4d
KW
12888 $parenthesized .= "Short: $short_name" if $short_name;
12889 if ($short_name && $single_form) {
12890 $parenthesized .= ', ';
12891 }
12892 elsif ($single_form) {
12893 $parenthesized .= 'Single: ';
12894 }
12895 $parenthesized .= $single_form if $single_form;
12896 }
12897 }
12898
56ca34ca
KW
12899 if ($caseless_equivalent != 0) {
12900 $parenthesized .= '; ' if $parenthesized ne "";
12901 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
12902 }
12903
99870f4d
KW
12904
12905 # Warn if this property isn't the same as one that a
12906 # semi-casual user might expect. The other components of this
12907 # parenthesized structure are calculated only for the first entry
12908 # for this table, but the conflicting is deemed important enough
12909 # to go on every entry.
12910 my $conflicting = join " NOR ", $table->conflicting;
12911 if ($conflicting) {
e5228720 12912 $parenthesized .= '; ' if $parenthesized ne "";
99870f4d
KW
12913 $parenthesized .= "NOT $conflicting";
12914 }
99870f4d 12915
e5228720 12916 push @info, "($parenthesized)" if $parenthesized;
d57ccc9a 12917
0f88d393
KW
12918 if ($name =~ /_$/ && $alias->loose_match) {
12919 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
12920 }
12921
d57ccc9a
KW
12922 if ($table_property != $perl && $table->perl_extension) {
12923 push @info, '(Perl extension)';
12924 }
2cf724d4 12925 push @info, "($string_count)";
99870f4d
KW
12926
12927 # Now, we have both the entry and info so add them to the
12928 # list of all the properties.
12929 push @match_properties,
12930 format_pod_line($indent_info_column,
12931 $entry,
12932 join( " ", @info),
12933 $alias->status,
12934 $alias->loose_match);
12935
12936 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12937 } # End of looping through the aliases for this table.
12938
12939 if (! $entry_for_first_table) {
12940 $entry_for_first_table = $entry_for_first_alias;
12941 }
12942 } # End of looping through all the related tables
12943 return;
12944}
12945
12946sub pod_alphanumeric_sort {
12947 # Sort pod entries alphanumerically.
12948
99f78760
KW
12949 # The first few character columns are filler, plus the '\p{'; and get rid
12950 # of all the trailing stuff, starting with the trailing '}', so as to sort
12951 # on just 'Name=Value'
12952 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 12953 $a =~ s/}.*//;
99f78760 12954 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
12955 $b =~ s/}.*//;
12956
99f78760
KW
12957 # Determine if the two operands are both internal only or both not.
12958 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
12959 # should be the underscore that begins internal only
12960 my $a_is_internal = (substr($a, 0, 1) eq '_');
12961 my $b_is_internal = (substr($b, 0, 1) eq '_');
12962
12963 # Sort so the internals come last in the table instead of first (which the
12964 # leading underscore would otherwise indicate).
12965 if ($a_is_internal != $b_is_internal) {
12966 return 1 if $a_is_internal;
12967 return -1
12968 }
12969
99870f4d 12970 # Determine if the two operands are numeric property values or not.
99f78760 12971 # A numeric property will look like xyz: 3. But the number
99870f4d 12972 # can begin with an optional minus sign, and may have a
99f78760 12973 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
12974 # isn't numeric, use alphabetic sort.
12975 my ($a_initial, $a_number) =
99f78760 12976 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
12977 return $a cmp $b unless defined $a_number;
12978 my ($b_initial, $b_number) =
99f78760 12979 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
12980 return $a cmp $b unless defined $b_number;
12981
12982 # Here they are both numeric, but use alphabetic sort if the
12983 # initial parts don't match
12984 return $a cmp $b if $a_initial ne $b_initial;
12985
12986 # Convert rationals to floating for the comparison.
12987 $a_number = eval $a_number if $a_number =~ qr{/};
12988 $b_number = eval $b_number if $b_number =~ qr{/};
12989
12990 return $a_number <=> $b_number;
12991}
12992
12993sub make_pod () {
12994 # Create the .pod file. This generates the various subsections and then
12995 # combines them in one big HERE document.
12996
12997 return unless defined $pod_directory;
12998 print "Making pod file\n" if $verbosity >= $PROGRESS;
12999
13000 my $exception_message =
13001 '(Any exceptions are individually noted beginning with the word NOT.)';
13002 my @block_warning;
13003 if (-e 'Blocks.txt') {
13004
13005 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
13006 # if the global $has_In_conflicts indicates we have them.
13007 push @match_properties, format_pod_line($indent_info_column,
13008 '\p{In_*}',
13009 '\p{Block: *}'
13010 . (($has_In_conflicts)
13011 ? " $exception_message"
13012 : ""));
13013 @block_warning = << "END";
13014
77173124
KW
13015Matches in the Block property have shortcuts that begin with "In_". For
13016example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
13017backward compatibility, if there is no conflict with another shortcut, these
13018may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
13019are numerous such conflicting shortcuts. Use of these forms for Block is
13020discouraged, and are flagged as such, not only because of the potential
13021confusion as to what is meant, but also because a later release of Unicode may
13022preempt the shortcut, and your program would no longer be correct. Use the
13023"In_" form instead to avoid this, or even more clearly, use the compound form,
13024e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
13025about this.
99870f4d
KW
13026END
13027 }
77173124 13028 my $text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
99870f4d
KW
13029 $text = "$exception_message $text" if $has_Is_conflicts;
13030
13031 # And the 'Is_ line';
13032 push @match_properties, format_pod_line($indent_info_column,
13033 '\p{Is_*}',
13034 "\\p{*} $text");
13035
13036 # Sort the properties array for output. It is sorted alphabetically
13037 # except numerically for numeric properties, and only output unique lines.
13038 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
13039
13040 my $formatted_properties = simple_fold(\@match_properties,
13041 "",
13042 # indent succeeding lines by two extra
13043 # which looks better
13044 $indent_info_column + 2,
13045
13046 # shorten the line length by how much
13047 # the formatter indents, so the folded
13048 # line will fit in the space
13049 # presumably available
13050 $automatic_pod_indent);
13051 # Add column headings, indented to be a little more centered, but not
13052 # exactly
13053 $formatted_properties = format_pod_line($indent_info_column,
13054 ' NAME',
13055 ' INFO')
13056 . "\n"
13057 . $formatted_properties;
13058
13059 # Generate pod documentation lines for the tables that match nothing
13060 my $zero_matches;
13061 if (@zero_match_tables) {
13062 @zero_match_tables = uniques(@zero_match_tables);
13063 $zero_matches = join "\n\n",
13064 map { $_ = '=item \p{' . $_->complete_name . "}" }
13065 sort { $a->complete_name cmp $b->complete_name }
13066 uniques(@zero_match_tables);
13067
13068 $zero_matches = <<END;
13069
77173124 13070=head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
99870f4d
KW
13071
13072Unicode has some property-value pairs that currently don't match anything.
13073This happens generally either because they are obsolete, or for symmetry with
13074other forms, but no language has yet been encoded that uses them. In this
13075version of Unicode, the following match zero code points:
13076
13077=over 4
13078
13079$zero_matches
13080
13081=back
13082
13083END
13084 }
13085
13086 # Generate list of properties that we don't accept, grouped by the reasons
13087 # why. This is so only put out the 'why' once, and then list all the
13088 # properties that have that reason under it.
13089
13090 my %why_list; # The keys are the reasons; the values are lists of
13091 # properties that have the key as their reason
13092
13093 # For each property, add it to the list that are suppressed for its reason
13094 # The sort will cause the alphabetically first properties to be added to
13095 # each list first, so each list will be sorted.
13096 foreach my $property (sort keys %why_suppressed) {
13097 push @{$why_list{$why_suppressed{$property}}}, $property;
13098 }
13099
13100 # For each reason (sorted by the first property that has that reason)...
13101 my @bad_re_properties;
13102 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
13103 keys %why_list)
13104 {
13105 # Add to the output, all the properties that have that reason. Start
13106 # with an empty line.
13107 push @bad_re_properties, "\n\n";
13108
13109 my $has_item = 0; # Flag if actually output anything.
13110 foreach my $name (@{$why_list{$why}}) {
13111
13112 # Split compound names into $property and $table components
13113 my $property = $name;
13114 my $table;
13115 if ($property =~ / (.*) = (.*) /x) {
13116 $property = $1;
13117 $table = $2;
13118 }
13119
13120 # This release of Unicode may not have a property that is
13121 # suppressed, so don't reference a non-existent one.
13122 $property = property_ref($property);
13123 next if ! defined $property;
13124
13125 # And since this list is only for match tables, don't list the
13126 # ones that don't have match tables.
13127 next if ! $property->to_create_match_tables;
13128
13129 # Find any abbreviation, and turn it into a compound name if this
13130 # is a property=value pair.
13131 my $short_name = $property->name;
13132 $short_name .= '=' . $property->table($table)->name if $table;
13133
13134 # And add the property as an item for the reason.
13135 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
13136 $has_item = 1;
13137 }
13138
13139 # And add the reason under the list of properties, if such a list
13140 # actually got generated. Note that the header got added
13141 # unconditionally before. But pod ignores extra blank lines, so no
13142 # harm.
13143 push @bad_re_properties, "\n$why\n" if $has_item;
13144
13145 } # End of looping through each reason.
13146
13147 # Generate a list of the properties whose map table we output, from the
13148 # global @map_properties.
13149 my @map_tables_actually_output;
13150 my $info_indent = 20; # Left column is narrower than \p{} table.
13151 foreach my $property (@map_properties) {
13152
13153 # Get the path to the file; don't output any not in the standard
13154 # directory.
13155 my @path = $property->file_path;
13156 next if $path[0] ne $map_directory;
8572ace0
KW
13157
13158 # Don't mention map tables that are for internal-use only
13159 next if $property->to_output_map == $INTERNAL_MAP;
13160
99870f4d
KW
13161 shift @path; # Remove the standard name
13162
13163 my $file = join '/', @path; # In case is in sub directory
13164 my $info = $property->full_name;
13165 my $short_name = $property->name;
13166 if ($info ne $short_name) {
13167 $info .= " ($short_name)";
13168 }
13169 foreach my $more_info ($property->description,
13170 $property->note,
13171 $property->status_info)
13172 {
13173 next unless $more_info;
13174 $info =~ s/\.\Z//;
13175 $info .= ". $more_info";
13176 }
13177 push @map_tables_actually_output, format_pod_line($info_indent,
13178 $file,
13179 $info,
13180 $property->status);
13181 }
13182
13183 # Sort alphabetically, and fold for output
13184 @map_tables_actually_output = sort
13185 pod_alphanumeric_sort @map_tables_actually_output;
13186 @map_tables_actually_output
13187 = simple_fold(\@map_tables_actually_output,
13188 ' ',
13189 $info_indent,
13190 $automatic_pod_indent);
13191
13192 # Generate a list of the formats that can appear in the map tables.
13193 my @map_table_formats;
13194 foreach my $format (sort keys %map_table_formats) {
12916dad 13195 push @map_table_formats, " $format $map_table_formats{$format}\n";
99870f4d
KW
13196 }
13197
12916dad
MS
13198 local $" = "";
13199
99870f4d
KW
13200 # Everything is ready to assemble.
13201 my @OUT = << "END";
13202=begin comment
13203
13204$HEADER
13205
13206To change this file, edit $0 instead.
13207
13208=end comment
13209
13210=head1 NAME
13211
51f494cc 13212$pod_file - Index of Unicode Version $string_version properties in Perl
99870f4d
KW
13213
13214=head1 DESCRIPTION
13215
13216There are many properties in Unicode, and Perl provides access to almost all of
13217them, as well as some additional extensions and short-cut synonyms.
13218
13219And just about all of the few that aren't accessible through the Perl
77173124
KW
13220core are accessible through the modules: L<Unicode::Normalize> and
13221L<Unicode::UCD>, and for Unihan properties, via the CPAN module
13222L<Unicode::Unihan>.
99870f4d
KW
13223
13224This document merely lists all available properties and does not attempt to
13225explain what each property really means. There is a brief description of each
13226Perl extension. There is some detail about Blocks, Scripts, General_Category,
13227and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
13228Unicode properties, refer to the Unicode standard. A good starting place is
13229L<$unicode_reference_url>. More information on the Perl extensions is in
78bb419c 13230L<perlunicode/Other Properties>.
99870f4d
KW
13231
13232Note that you can define your own properties; see
13233L<perlunicode/"User-Defined Character Properties">.
13234
77173124 13235=head1 Properties accessible through C<\\p{}> and C<\\P{}>
99870f4d 13236
77173124
KW
13237The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
13238most of the Unicode character properties. The table below shows all these
13239constructs, both single and compound forms.
99870f4d
KW
13240
13241B<Compound forms> consist of two components, separated by an equals sign or a
13242colon. The first component is the property name, and the second component is
13243the particular value of the property to match against, for example,
77173124 13244C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
99870f4d
KW
13245whose Script property is Greek.
13246
77173124 13247B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
99870f4d 13248their equivalent compound forms. The table shows these equivalences. (In our
77173124 13249example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
99870f4d 13250There are also a few Perl-defined single forms that are not shortcuts for a
77173124 13251compound form. One such is C<\\p{Word}>. These are also listed in the table.
99870f4d
KW
13252
13253In parsing these constructs, Perl always ignores Upper/lower case differences
77173124
KW
13254everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
13255C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
13256the left brace completely changes the meaning of the construct, from "match"
13257(for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
13258for improved legibility.
99870f4d
KW
13259
13260Also, white space, hyphens, and underscores are also normally ignored
13261everywhere between the {braces}, and hence can be freely added or removed
13262even if the C</x> modifier hasn't been specified on the regular expression.
13263But $a_bold_stricter at the beginning of an entry in the table below
13264means that tighter (stricter) rules are used for that entry:
13265
13266=over 4
13267
77173124 13268=item Single form (C<\\p{name}>) tighter rules:
99870f4d
KW
13269
13270White space, hyphens, and underscores ARE significant
13271except for:
13272
13273=over 4
13274
13275=item * white space adjacent to a non-word character
13276
13277=item * underscores separating digits in numbers
13278
13279=back
13280
13281That means, for example, that you can freely add or remove white space
13282adjacent to (but within) the braces without affecting the meaning.
13283
77173124 13284=item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
99870f4d
KW
13285
13286The tighter rules given above for the single form apply to everything to the
13287right of the colon or equals; the looser rules still apply to everything to
13288the left.
13289
13290That means, for example, that you can freely add or remove white space
13291adjacent to (but within) the braces and the colon or equal sign.
13292
13293=back
13294
78bb419c
KW
13295Some properties are considered obsolete by Unicode, but still available.
13296There are several varieties of obsolescence:
99870f4d
KW
13297
13298=over 4
13299
13300=item Obsolete
13301
13302Properties marked with $a_bold_obsolete in the table are considered
99870f4d
KW
13303obsolete.
13304
13305=item Stabilized
13306
5f7264c7
KW
13307Obsolete properties may be stabilized. Such a determination does not indicate
13308that the property should or should not be used; instead it is a declaration
13309that the property will not be maintained nor extended for newly encoded
13310characters. Such properties are marked with $a_bold_stabilized in the
13311table.
99870f4d
KW
13312
13313=item Deprecated
13314
5f7264c7 13315An obsolete property may be deprecated, perhaps because its original intent
78bb419c
KW
13316has been replaced by another property, or because its specification was
13317somehow defective. This means that its use is strongly
99870f4d
KW
13318discouraged, so much so that a warning will be issued if used, unless the
13319regular expression is in the scope of a C<S<no warnings 'deprecated'>>
13320statement. $A_bold_deprecated flags each such entry in the table, and
13321the entry there for the longest, most descriptive version of the property will
13322give the reason it is deprecated, and perhaps advice. Perl may issue such a
13323warning, even for properties that aren't officially deprecated by Unicode,
13324when there used to be characters or code points that were matched by them, but
13325no longer. This is to warn you that your program may not work like it did on
13326earlier Unicode releases.
13327
13328A deprecated property may be made unavailable in a future Perl version, so it
13329is best to move away from them.
13330
13331=back
13332
13333Some Perl extensions are present for backwards compatibility and are
13334discouraged from being used, but not obsolete. $A_bold_discouraged
13335flags each such entry in the table.
13336
13337@block_warning
13338
77173124 13339The table below has two columns. The left column contains the C<\\p{}>
98dc9551 13340constructs to look up, possibly preceded by the flags mentioned above; and
99870f4d
KW
13341the right column contains information about them, like a description, or
13342synonyms. It shows both the single and compound forms for each property that
13343has them. If the left column is a short name for a property, the right column
13344will give its longer, more descriptive name; and if the left column is the
13345longest name, the right column will show any equivalent shortest name, in both
13346single and compound forms if applicable.
13347
13348The right column will also caution you if a property means something different
13349than what might normally be expected.
13350
d57ccc9a
KW
13351All single forms are Perl extensions; a few compound forms are as well, and
13352are noted as such.
13353
99870f4d
KW
13354Numbers in (parentheses) indicate the total number of code points matched by
13355the property. For emphasis, those properties that match no code points at all
13356are listed as well in a separate section following the table.
13357
56ca34ca
KW
13358Most properties match the same code points regardless of whether C<"/i">
13359case-insensitive matching is specified or not. But a few properties are
13360affected. These are shown with the notation
13361
13362 (/i= other_property)
13363
13364in the second column. Under case-insensitive matching they match the
13365same code pode points as the property "other_property".
13366
99870f4d 13367There is no description given for most non-Perl defined properties (See
77173124 13368L<$unicode_reference_url> for that).
d73e5302 13369
99870f4d
KW
13370For compactness, 'B<*>' is used as a wildcard instead of showing all possible
13371combinations. For example, entries like:
d73e5302 13372
99870f4d 13373 \\p{Gc: *} \\p{General_Category: *}
5beb625e 13374
99870f4d
KW
13375mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
13376for the latter is also valid for the former. Similarly,
5beb625e 13377
99870f4d 13378 \\p{Is_*} \\p{*}
5beb625e 13379
77173124
KW
13380means that if and only if, for example, C<\\p{Foo}> exists, then
13381C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
13382And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
13383C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
13384underscore.
5beb625e 13385
99870f4d
KW
13386Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
13387And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
13388'N*' to indicate this, and doesn't have separate entries for the other
13389possibilities. Note that not all properties which have values 'Yes' and 'No'
13390are binary, and they have all their values spelled out without using this wild
13391card, and a C<NOT> clause in their description that highlights their not being
13392binary. These also require the compound form to match them, whereas true
13393binary properties have both single and compound forms available.
5beb625e 13394
99870f4d
KW
13395Note that all non-essential underscores are removed in the display of the
13396short names below.
5beb625e 13397
99870f4d 13398B<Summary legend:>
5beb625e 13399
99870f4d 13400=over 4
cf25bb62 13401
21405004 13402=item Z<>B<*> is a wild-card
cf25bb62 13403
99870f4d
KW
13404=item B<(\\d+)> in the info column gives the number of code points matched by
13405this property.
cf25bb62 13406
99870f4d 13407=item B<$DEPRECATED> means this is deprecated.
cf25bb62 13408
99870f4d 13409=item B<$OBSOLETE> means this is obsolete.
cf25bb62 13410
99870f4d 13411=item B<$STABILIZED> means this is stabilized.
cf25bb62 13412
99870f4d 13413=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 13414
99870f4d 13415=item B<$DISCOURAGED> means use of this form is discouraged.
5beb625e 13416
99870f4d 13417=back
da7fcca4 13418
99870f4d 13419$formatted_properties
cf25bb62 13420
99870f4d 13421$zero_matches
cf25bb62 13422
99870f4d 13423=head1 Properties not accessible through \\p{} and \\P{}
cf25bb62 13424
99870f4d
KW
13425A few properties are accessible in Perl via various function calls only.
13426These are:
78bb419c 13427
99870f4d
KW
13428 Lowercase_Mapping lc() and lcfirst()
13429 Titlecase_Mapping ucfirst()
13430 Uppercase_Mapping uc()
12ac2576 13431
77173124 13432Case_Folding is accessible through the C</i> modifier in regular expressions.
cf25bb62 13433
77173124 13434The Name property is accessible through the C<\\N{}> interpolation in
99870f4d 13435double-quoted strings and regular expressions, but both usages require a C<use
fb121860
KW
13436charnames;> to be specified, which also contains related functions viacode(),
13437vianame(), and string_vianame().
cf25bb62 13438
99870f4d 13439=head1 Unicode regular expression properties that are NOT accepted by Perl
d2d499f5 13440
99870f4d
KW
13441Perl will generate an error for a few character properties in Unicode when
13442used in a regular expression. The non-Unihan ones are listed below, with the
13443reasons they are not accepted, perhaps with work-arounds. The short names for
13444the properties are listed enclosed in (parentheses).
ae6979a8 13445
99870f4d 13446=over 4
ae6979a8 13447
99870f4d 13448@bad_re_properties
a3a8c5f0 13449
99870f4d 13450=back
a3a8c5f0 13451
b7986f4f
KW
13452An installation can choose to allow any of these to be matched by downloading
13453the Unicode database from L<http://www.unicode.org/Public/> to
f3514a2f
KW
13454C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
13455controlling lists contained in the program
13456C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
13457(C<\%Config> is available from the Config module).
d73e5302 13458
99870f4d 13459=head1 Files in the I<To> directory (for serious hackers only)
12ac2576 13460
99870f4d
KW
13461All Unicode properties are really mappings (in the mathematical sense) from
13462code points to their respective values. As part of its build process,
13463Perl constructs tables containing these mappings for all properties that it
50b27e73 13464deals with. Some, but not all, of these are written out into files.
99870f4d 13465Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
77173124 13466(C<%Config> is available from the C<Config> module).
7ebf06b3 13467
50b27e73
KW
13468Perl reserves the right to change the format and even the existence of any of
13469those files without notice, except the ones that were in existence prior to
13470release 5.13. If those change, a deprecation cycle will be done first. These
13471are:
12ac2576 13472
99870f4d 13473@map_tables_actually_output
12ac2576 13474
ec2f0128
KW
13475Each of the files in this directory defines several hash entries to help
13476reading programs decipher it. One of them looks like this:
12ac2576 13477
99870f4d 13478 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
d73e5302 13479
77173124
KW
13480where "NAME" is a name to indicate the property. For backwards compatibility,
13481this is not necessarily the property's official Unicode name. (The "To" is
99870f4d
KW
13482also for backwards compatibility.) The hash entry gives the format of the
13483mapping fields of the table, currently one of the following:
d73e5302 13484
12916dad 13485@map_table_formats
d73e5302 13486
99870f4d
KW
13487This format applies only to the entries in the main body of the table.
13488Entries defined in hashes or ones that are missing from the list can have a
13489different format.
d73e5302 13490
ec2f0128 13491The value that the missing entries have is given by another SwashInfo hash
99870f4d 13492entry line; it looks like this:
d73e5302 13493
99870f4d 13494 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
d73e5302 13495
99870f4d 13496This example line says that any Unicode code points not explicitly listed in
77173124 13497the file have the value "NaN" under the property indicated by NAME. If the
99870f4d
KW
13498value is the special string C<< <code point> >>, it means that the value for
13499any missing code point is the code point itself. This happens, for example,
13500in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
77173124 13501character "A", are missing because the uppercase of "A" is itself.
d73e5302 13502
ec2f0128
KW
13503Finally, if the file contains a hash for special case entries, its name is
13504specified by an entry that looks like this:
13505
13506 \$utf8::SwashInfo{'ToNAME'}{'specials_name'} = 'utf8::ToSpecNAME';
13507
99870f4d 13508=head1 SEE ALSO
d73e5302 13509
99870f4d 13510L<$unicode_reference_url>
12ac2576 13511
99870f4d 13512L<perlrecharclass>
12ac2576 13513
99870f4d 13514L<perlunicode>
d73e5302 13515
99870f4d 13516END
d73e5302 13517
9218f1cf
KW
13518 # And write it. The 0 means no utf8.
13519 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
99870f4d
KW
13520 return;
13521}
d73e5302 13522
99870f4d
KW
13523sub make_Heavy () {
13524 # Create and write Heavy.pl, which passes info about the tables to
13525 # utf8_heavy.pl
12ac2576 13526
99870f4d
KW
13527 my @heavy = <<END;
13528$HEADER
13529$INTERNAL_ONLY
d73e5302 13530
99870f4d 13531# This file is for the use of utf8_heavy.pl
12ac2576 13532
99870f4d
KW
13533# Maps property names in loose standard form to its standard name
13534\%utf8::loose_property_name_of = (
13535END
cf25bb62 13536
99870f4d
KW
13537 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
13538 push @heavy, <<END;
13539);
12ac2576 13540
99870f4d
KW
13541# Maps property, table to file for those using stricter matching
13542\%utf8::stricter_to_file_of = (
13543END
13544 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
13545 push @heavy, <<END;
13546);
12ac2576 13547
99870f4d
KW
13548# Maps property, table to file for those using loose matching
13549\%utf8::loose_to_file_of = (
13550END
13551 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
13552 push @heavy, <<END;
13553);
12ac2576 13554
99870f4d
KW
13555# Maps floating point to fractional form
13556\%utf8::nv_floating_to_rational = (
13557END
13558 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
13559 push @heavy, <<END;
13560);
12ac2576 13561
99870f4d
KW
13562# If a floating point number doesn't have enough digits in it to get this
13563# close to a fraction, it isn't considered to be that fraction even if all the
13564# digits it does have match.
13565\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 13566
99870f4d
KW
13567# Deprecated tables to generate a warning for. The key is the file containing
13568# the table, so as to avoid duplication, as many property names can map to the
13569# file, but we only need one entry for all of them.
13570\%utf8::why_deprecated = (
13571END
12ac2576 13572
99870f4d
KW
13573 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
13574 push @heavy, <<END;
13575);
12ac2576 13576
d867ccfb
KW
13577# A few properties have different behavior under /i matching. This maps the
13578# those to substitute files to use under /i.
13579\%utf8::caseless_equivalent = (
13580END
13581
13582
13583 # We set the key to the file when we associated files with tables, but we
13584 # couldn't do the same for the value then, as we might not have the file
13585 # for the alternate table figured out at that time.
13586 foreach my $cased (keys %caseless_equivalent_to) {
13587 my @path = $caseless_equivalent_to{$cased}->file_path;
13588 my $path = join '/', @path[1, -1];
d867ccfb
KW
13589 $utf8::caseless_equivalent_to{$cased} = $path;
13590 }
13591 push @heavy, simple_dumper (\%utf8::caseless_equivalent_to, ' ' x 4);
13592 push @heavy, <<END;
13593);
13594
99870f4d
KW
135951;
13596END
12ac2576 13597
9218f1cf 13598 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
99870f4d 13599 return;
12ac2576
JP
13600}
13601
99870f4d
KW
13602sub write_all_tables() {
13603 # Write out all the tables generated by this program to files, as well as
13604 # the supporting data structures, pod file, and .t file.
13605
13606 my @writables; # List of tables that actually get written
13607 my %match_tables_to_write; # Used to collapse identical match tables
13608 # into one file. Each key is a hash function
13609 # result to partition tables into buckets.
13610 # Each value is an array of the tables that
13611 # fit in the bucket.
13612
13613 # For each property ...
13614 # (sort so that if there is an immutable file name, it has precedence, so
13615 # some other property can't come in and take over its file name. If b's
13616 # file name is defined, will return 1, meaning to take it first; don't
7fc6cb55
KW
13617 # care if both defined, as they had better be different anyway. And the
13618 # property named 'Perl' needs to be first (it doesn't have any immutable
13619 # file name) because empty properties are defined in terms of it's table
13620 # named 'Any'.)
99870f4d 13621 PROPERTY:
7fc6cb55
KW
13622 foreach my $property (sort { return -1 if $a == $perl;
13623 return 1 if $b == $perl;
13624 return defined $b->file
13625 } property_ref('*'))
13626 {
99870f4d
KW
13627 my $type = $property->type;
13628
13629 # And for each table for that property, starting with the mapping
13630 # table for it ...
13631 TABLE:
13632 foreach my $table($property,
13633
13634 # and all the match tables for it (if any), sorted so
13635 # the ones with the shortest associated file name come
13636 # first. The length sorting prevents problems of a
13637 # longer file taking a name that might have to be used
13638 # by a shorter one. The alphabetic sorting prevents
13639 # differences between releases
13640 sort { my $ext_a = $a->external_name;
13641 return 1 if ! defined $ext_a;
13642 my $ext_b = $b->external_name;
13643 return -1 if ! defined $ext_b;
a92d5c2e
KW
13644
13645 # But return the non-complement table before
13646 # the complement one, as the latter is defined
13647 # in terms of the former, and needs to have
13648 # the information for the former available.
13649 return 1 if $a->complement != 0;
13650 return -1 if $b->complement != 0;
13651
99870f4d
KW
13652 my $cmp = length $ext_a <=> length $ext_b;
13653
13654 # Return result if lengths not equal
13655 return $cmp if $cmp;
13656
13657 # Alphabetic if lengths equal
13658 return $ext_a cmp $ext_b
13659 } $property->tables
13660 )
13661 {
12ac2576 13662
99870f4d
KW
13663 # Here we have a table associated with a property. It could be
13664 # the map table (done first for each property), or one of the
13665 # other tables. Determine which type.
13666 my $is_property = $table->isa('Property');
13667
13668 my $name = $table->name;
13669 my $complete_name = $table->complete_name;
13670
13671 # See if should suppress the table if is empty, but warn if it
13672 # contains something.
13673 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
13674 keys %why_suppress_if_empty_warn_if_not;
13675
13676 # Calculate if this table should have any code points associated
13677 # with it or not.
13678 my $expected_empty =
13679
13680 # $perl should be empty, as well as properties that we just
13681 # don't do anything with
13682 ($is_property
13683 && ($table == $perl
13684 || grep { $complete_name eq $_ }
13685 @unimplemented_properties
13686 )
13687 )
13688
13689 # Match tables in properties we skipped populating should be
13690 # empty
13691 || (! $is_property && ! $property->to_create_match_tables)
13692
13693 # Tables and properties that are expected to have no code
13694 # points should be empty
13695 || $suppress_if_empty_warn_if_not
13696 ;
13697
13698 # Set a boolean if this table is the complement of an empty binary
13699 # table
13700 my $is_complement_of_empty_binary =
13701 $type == $BINARY &&
13702 (($table == $property->table('Y')
13703 && $property->table('N')->is_empty)
13704 || ($table == $property->table('N')
13705 && $property->table('Y')->is_empty));
13706
13707
13708 # Some tables should match everything
13709 my $expected_full =
13710 ($is_property)
13711 ? # All these types of map tables will be full because
13712 # they will have been populated with defaults
13713 ($type == $ENUM || $type == $BINARY)
13714
13715 : # A match table should match everything if its method
13716 # shows it should
13717 ($table->matches_all
13718
13719 # The complement of an empty binary table will match
13720 # everything
13721 || $is_complement_of_empty_binary
13722 )
13723 ;
13724
13725 if ($table->is_empty) {
13726
13727
13728 if ($suppress_if_empty_warn_if_not) {
13729 $table->set_status($SUPPRESSED,
13730 $why_suppress_if_empty_warn_if_not{$complete_name});
13731 }
12ac2576 13732
99870f4d
KW
13733 # Suppress expected empty tables.
13734 next TABLE if $expected_empty;
13735
13736 # And setup to later output a warning for those that aren't
13737 # known to be allowed to be empty. Don't do the warning if
13738 # this table is a child of another one to avoid duplicating
13739 # the warning that should come from the parent one.
13740 if (($table == $property || $table->parent == $table)
13741 && $table->status ne $SUPPRESSED
13742 && ! grep { $complete_name =~ /^$_$/ }
13743 @tables_that_may_be_empty)
13744 {
13745 push @unhandled_properties, "$table";
13746 }
7fc6cb55
KW
13747
13748 # An empty table is just the complement of everything.
13749 $table->set_complement($Any) if $table != $property;
99870f4d
KW
13750 }
13751 elsif ($expected_empty) {
13752 my $because = "";
13753 if ($suppress_if_empty_warn_if_not) {
13754 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
13755 }
12ac2576 13756
99870f4d
KW
13757 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
13758 }
12ac2576 13759
99870f4d
KW
13760 my $count = $table->count;
13761 if ($expected_full) {
13762 if ($count != $MAX_UNICODE_CODEPOINTS) {
13763 Carp::my_carp("$table matches only "
13764 . clarify_number($count)
13765 . " Unicode code points but should match "
13766 . clarify_number($MAX_UNICODE_CODEPOINTS)
13767 . " (off by "
13768 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
13769 . "). Proceeding anyway.");
13770 }
12ac2576 13771
99870f4d
KW
13772 # Here is expected to be full. If it is because it is the
13773 # complement of an (empty) binary table that is to be
13774 # suppressed, then suppress this one as well.
13775 if ($is_complement_of_empty_binary) {
13776 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
13777 my $opposing = $property->table($opposing_name);
13778 my $opposing_status = $opposing->status;
13779 if ($opposing_status) {
13780 $table->set_status($opposing_status,
13781 $opposing->status_info);
13782 }
13783 }
13784 }
13785 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
13786 if ($table == $property || $table->leader == $table) {
13787 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
13788 }
13789 }
d73e5302 13790
99870f4d
KW
13791 if ($table->status eq $SUPPRESSED) {
13792 if (! $is_property) {
13793 my @children = $table->children;
13794 foreach my $child (@children) {
13795 if ($child->status ne $SUPPRESSED) {
13796 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
13797 }
13798 }
13799 }
13800 next TABLE;
d73e5302 13801
99870f4d
KW
13802 }
13803 if (! $is_property) {
13804
13805 # Several things need to be done just once for each related
13806 # group of match tables. Do them on the parent.
13807 if ($table->parent == $table) {
13808
13809 # Add an entry in the pod file for the table; it also does
13810 # the children.
23e33b60 13811 make_table_pod_entries($table) if defined $pod_directory;
99870f4d
KW
13812
13813 # See if the the table matches identical code points with
13814 # something that has already been output. In that case,
13815 # no need to have two files with the same code points in
13816 # them. We use the table's hash() method to store these
13817 # in buckets, so that it is quite likely that if two
13818 # tables are in the same bucket they will be identical, so
13819 # don't have to compare tables frequently. The tables
13820 # have to have the same status to share a file, so add
13821 # this to the bucket hash. (The reason for this latter is
13822 # that Heavy.pl associates a status with a file.)
13823 my $hash = $table->hash . ';' . $table->status;
13824
13825 # Look at each table that is in the same bucket as this
13826 # one would be.
13827 foreach my $comparison (@{$match_tables_to_write{$hash}})
13828 {
13829 if ($table->matches_identically_to($comparison)) {
13830 $table->set_equivalent_to($comparison,
13831 Related => 0);
13832 next TABLE;
13833 }
13834 }
d73e5302 13835
99870f4d
KW
13836 # Here, not equivalent, add this table to the bucket.
13837 push @{$match_tables_to_write{$hash}}, $table;
13838 }
13839 }
13840 else {
13841
13842 # Here is the property itself.
13843 # Don't write out or make references to the $perl property
13844 next if $table == $perl;
13845
13846 if ($type != $STRING) {
13847
13848 # There is a mapping stored of the various synonyms to the
13849 # standardized name of the property for utf8_heavy.pl.
13850 # Also, the pod file contains entries of the form:
13851 # \p{alias: *} \p{full: *}
13852 # rather than show every possible combination of things.
13853
13854 my @property_aliases = $property->aliases;
13855
13856 # The full name of this property is stored by convention
13857 # first in the alias array
13858 my $full_property_name =
13859 '\p{' . $property_aliases[0]->name . ': *}';
13860 my $standard_property_name = standardize($table->name);
13861
13862 # For each synonym ...
13863 for my $i (0 .. @property_aliases - 1) {
13864 my $alias = $property_aliases[$i];
13865 my $alias_name = $alias->name;
13866 my $alias_standard = standardize($alias_name);
13867
13868 # Set the mapping for utf8_heavy of the alias to the
13869 # property
13870 if (exists ($loose_property_name_of{$alias_standard}))
13871 {
13872 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
13873 }
13874 else {
13875 $loose_property_name_of{$alias_standard}
13876 = $standard_property_name;
13877 }
13878
23e33b60
KW
13879 # Now for the pod entry for this alias. Skip if not
13880 # outputting a pod; skip the first one, which is the
13881 # full name so won't have an entry like: '\p{full: *}
13882 # \p{full: *}', and skip if don't want an entry for
13883 # this one.
13884 next if $i == 0
13885 || ! defined $pod_directory
13886 || ! $alias->make_pod_entry;
99870f4d 13887
d57ccc9a
KW
13888 my $rhs = $full_property_name;
13889 if ($property != $perl && $table->perl_extension) {
13890 $rhs .= ' (Perl extension)';
13891 }
99870f4d
KW
13892 push @match_properties,
13893 format_pod_line($indent_info_column,
13894 '\p{' . $alias->name . ': *}',
d57ccc9a 13895 $rhs,
99870f4d
KW
13896 $alias->status);
13897 }
13898 } # End of non-string-like property code
d73e5302 13899
d73e5302 13900
99870f4d
KW
13901 # Don't output a mapping file if not desired.
13902 next if ! $property->to_output_map;
13903 }
d73e5302 13904
99870f4d
KW
13905 # Here, we know we want to write out the table, but don't do it
13906 # yet because there may be other tables that come along and will
13907 # want to share the file, and the file's comments will change to
13908 # mention them. So save for later.
13909 push @writables, $table;
13910
13911 } # End of looping through the property and all its tables.
13912 } # End of looping through all properties.
13913
13914 # Now have all the tables that will have files written for them. Do it.
13915 foreach my $table (@writables) {
13916 my @directory;
13917 my $filename;
13918 my $property = $table->property;
13919 my $is_property = ($table == $property);
13920 if (! $is_property) {
13921
13922 # Match tables for the property go in lib/$subdirectory, which is
13923 # the property's name. Don't use the standard file name for this,
13924 # as may get an unfamiliar alias
13925 @directory = ($matches_directory, $property->external_name);
13926 }
13927 else {
d73e5302 13928
99870f4d
KW
13929 @directory = $table->directory;
13930 $filename = $table->file;
13931 }
d73e5302 13932
98dc9551 13933 # Use specified filename if available, or default to property's
99870f4d
KW
13934 # shortest name. We need an 8.3 safe filename (which means "an 8
13935 # safe" filename, since after the dot is only 'pl', which is < 3)
13936 # The 2nd parameter is if the filename shouldn't be changed, and
13937 # it shouldn't iff there is a hard-coded name for this table.
13938 $filename = construct_filename(
13939 $filename || $table->external_name,
13940 ! $filename, # mutable if no filename
13941 \@directory);
d73e5302 13942
99870f4d 13943 register_file_for_name($table, \@directory, $filename);
d73e5302 13944
99870f4d
KW
13945 # Only need to write one file when shared by more than one
13946 # property
a92d5c2e
KW
13947 next if ! $is_property
13948 && ($table->leader != $table || $table->complement != 0);
d73e5302 13949
99870f4d
KW
13950 # Construct a nice comment to add to the file
13951 $table->set_final_comment;
13952
13953 $table->write;
cf25bb62 13954 }
d73e5302 13955
d73e5302 13956
99870f4d
KW
13957 # Write out the pod file
13958 make_pod;
13959
13960 # And Heavy.pl
13961 make_Heavy;
d73e5302 13962
99870f4d
KW
13963 make_property_test_script() if $make_test_script;
13964 return;
cf25bb62 13965}
d73e5302 13966
99870f4d
KW
13967my @white_space_separators = ( # This used only for making the test script.
13968 "",
13969 ' ',
13970 "\t",
13971 ' '
13972 );
d73e5302 13973
99870f4d
KW
13974sub generate_separator($) {
13975 # This used only for making the test script. It generates the colon or
13976 # equal separator between the property and property value, with random
13977 # white space surrounding the separator
d73e5302 13978
99870f4d 13979 my $lhs = shift;
d73e5302 13980
99870f4d 13981 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 13982
99870f4d
KW
13983 # Choose space before and after randomly
13984 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
13985 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 13986
99870f4d
KW
13987 # And return the whole complex, half the time using a colon, half the
13988 # equals
13989 return $spaces_before
13990 . (rand() < 0.5) ? '=' : ':'
13991 . $spaces_after;
13992}
76ccdbe2 13993
430ada4c 13994sub generate_tests($$$$$) {
99870f4d
KW
13995 # This used only for making the test script. It generates test cases that
13996 # are expected to compile successfully in perl. Note that the lhs and
13997 # rhs are assumed to already be as randomized as the caller wants.
13998
99870f4d
KW
13999 my $lhs = shift; # The property: what's to the left of the colon
14000 # or equals separator
14001 my $rhs = shift; # The property value; what's to the right
14002 my $valid_code = shift; # A code point that's known to be in the
14003 # table given by lhs=rhs; undef if table is
14004 # empty
14005 my $invalid_code = shift; # A code point known to not be in the table;
14006 # undef if the table is all code points
14007 my $warning = shift;
14008
14009 # Get the colon or equal
14010 my $separator = generate_separator($lhs);
14011
14012 # The whole 'property=value'
14013 my $name = "$lhs$separator$rhs";
14014
430ada4c 14015 my @output;
99870f4d
KW
14016 # Create a complete set of tests, with complements.
14017 if (defined $valid_code) {
430ada4c
NC
14018 push @output, <<"EOC"
14019Expect(1, $valid_code, '\\p{$name}', $warning);
14020Expect(0, $valid_code, '\\p{^$name}', $warning);
14021Expect(0, $valid_code, '\\P{$name}', $warning);
14022Expect(1, $valid_code, '\\P{^$name}', $warning);
14023EOC
99870f4d
KW
14024 }
14025 if (defined $invalid_code) {
430ada4c
NC
14026 push @output, <<"EOC"
14027Expect(0, $invalid_code, '\\p{$name}', $warning);
14028Expect(1, $invalid_code, '\\p{^$name}', $warning);
14029Expect(1, $invalid_code, '\\P{$name}', $warning);
14030Expect(0, $invalid_code, '\\P{^$name}', $warning);
14031EOC
14032 }
14033 return @output;
99870f4d 14034}
cf25bb62 14035
430ada4c 14036sub generate_error($$$) {
99870f4d
KW
14037 # This used only for making the test script. It generates test cases that
14038 # are expected to not only not match, but to be syntax or similar errors
14039
99870f4d
KW
14040 my $lhs = shift; # The property: what's to the left of the
14041 # colon or equals separator
14042 my $rhs = shift; # The property value; what's to the right
14043 my $already_in_error = shift; # Boolean; if true it's known that the
14044 # unmodified lhs and rhs will cause an error.
14045 # This routine should not force another one
14046 # Get the colon or equal
14047 my $separator = generate_separator($lhs);
14048
14049 # Since this is an error only, don't bother to randomly decide whether to
14050 # put the error on the left or right side; and assume that the rhs is
14051 # loosely matched, again for convenience rather than rigor.
14052 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
14053
14054 my $property = $lhs . $separator . $rhs;
14055
430ada4c
NC
14056 return <<"EOC";
14057Error('\\p{$property}');
14058Error('\\P{$property}');
14059EOC
d73e5302
JH
14060}
14061
99870f4d
KW
14062# These are used only for making the test script
14063# XXX Maybe should also have a bad strict seps, which includes underscore.
14064
14065my @good_loose_seps = (
14066 " ",
14067 "-",
14068 "\t",
14069 "",
14070 "_",
14071 );
14072my @bad_loose_seps = (
14073 "/a/",
14074 ':=',
14075 );
14076
14077sub randomize_stricter_name {
14078 # This used only for making the test script. Take the input name and
14079 # return a randomized, but valid version of it under the stricter matching
14080 # rules.
14081
14082 my $name = shift;
14083 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14084
14085 # If the name looks like a number (integer, floating, or rational), do
14086 # some extra work
14087 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
14088 my $sign = $1;
14089 my $number = $2;
14090 my $separator = $3;
14091
14092 # If there isn't a sign, part of the time add a plus
14093 # Note: Not testing having any denominator having a minus sign
14094 if (! $sign) {
14095 $sign = '+' if rand() <= .3;
14096 }
14097
14098 # And add 0 or more leading zeros.
14099 $name = $sign . ('0' x int rand(10)) . $number;
14100
14101 if (defined $separator) {
14102 my $extra_zeros = '0' x int rand(10);
cf25bb62 14103
99870f4d
KW
14104 if ($separator eq '.') {
14105
14106 # Similarly, add 0 or more trailing zeros after a decimal
14107 # point
14108 $name .= $extra_zeros;
14109 }
14110 else {
14111
14112 # Or, leading zeros before the denominator
14113 $name =~ s,/,/$extra_zeros,;
14114 }
14115 }
cf25bb62 14116 }
d73e5302 14117
99870f4d
KW
14118 # For legibility of the test, only change the case of whole sections at a
14119 # time. To do this, first split into sections. The split returns the
14120 # delimiters
14121 my @sections;
14122 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
14123 trace $section if main::DEBUG && $to_trace;
14124
14125 if (length $section > 1 && $section !~ /\D/) {
14126
14127 # If the section is a sequence of digits, about half the time
14128 # randomly add underscores between some of them.
14129 if (rand() > .5) {
14130
14131 # Figure out how many underscores to add. max is 1 less than
14132 # the number of digits. (But add 1 at the end to make sure
14133 # result isn't 0, and compensate earlier by subtracting 2
14134 # instead of 1)
14135 my $num_underscores = int rand(length($section) - 2) + 1;
14136
14137 # And add them evenly throughout, for convenience, not rigor
14138 use integer;
14139 my $spacing = (length($section) - 1)/ $num_underscores;
14140 my $temp = $section;
14141 $section = "";
14142 for my $i (1 .. $num_underscores) {
14143 $section .= substr($temp, 0, $spacing, "") . '_';
14144 }
14145 $section .= $temp;
14146 }
14147 push @sections, $section;
14148 }
14149 else {
d73e5302 14150
99870f4d
KW
14151 # Here not a sequence of digits. Change the case of the section
14152 # randomly
14153 my $switch = int rand(4);
14154 if ($switch == 0) {
14155 push @sections, uc $section;
14156 }
14157 elsif ($switch == 1) {
14158 push @sections, lc $section;
14159 }
14160 elsif ($switch == 2) {
14161 push @sections, ucfirst $section;
14162 }
14163 else {
14164 push @sections, $section;
14165 }
14166 }
cf25bb62 14167 }
99870f4d
KW
14168 trace "returning", join "", @sections if main::DEBUG && $to_trace;
14169 return join "", @sections;
14170}
71d929cb 14171
99870f4d
KW
14172sub randomize_loose_name($;$) {
14173 # This used only for making the test script
71d929cb 14174
99870f4d
KW
14175 my $name = shift;
14176 my $want_error = shift; # if true, make an error
14177 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14178
14179 $name = randomize_stricter_name($name);
5beb625e
JH
14180
14181 my @parts;
99870f4d 14182 push @parts, $good_loose_seps[rand(@good_loose_seps)];
45376db6
KW
14183
14184 # Preserve trailing ones for the sake of not stripping the underscore from
14185 # 'L_'
14186 for my $part (split /[-\s_]+ (?= . )/, $name) {
5beb625e 14187 if (@parts) {
99870f4d
KW
14188 if ($want_error and rand() < 0.3) {
14189 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
14190 $want_error = 0;
14191 }
14192 else {
14193 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
14194 }
14195 }
99870f4d 14196 push @parts, $part;
5beb625e 14197 }
99870f4d
KW
14198 my $new = join("", @parts);
14199 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 14200
99870f4d 14201 if ($want_error) {
5beb625e 14202 if (rand() >= 0.5) {
99870f4d
KW
14203 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
14204 }
14205 else {
14206 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
14207 }
14208 }
14209 return $new;
14210}
14211
99870f4d
KW
14212# Used to make sure don't generate duplicate test cases.
14213my %test_generated;
5beb625e 14214
99870f4d
KW
14215sub make_property_test_script() {
14216 # This used only for making the test script
14217 # this written directly -- it's huge.
5beb625e 14218
99870f4d 14219 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 14220
99870f4d
KW
14221 # This uses randomness to test different possibilities without testing all
14222 # possibilities. To ensure repeatability, set the seed to 0. But if
14223 # tests are added, it will perturb all later ones in the .t file
14224 srand 0;
5beb625e 14225
3df51b85
KW
14226 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
14227
99870f4d
KW
14228 # Keep going down an order of magnitude
14229 # until find that adding this quantity to
14230 # 1 remains 1; but put an upper limit on
14231 # this so in case this algorithm doesn't
14232 # work properly on some platform, that we
14233 # won't loop forever.
14234 my $digits = 0;
14235 my $min_floating_slop = 1;
14236 while (1+ $min_floating_slop != 1
14237 && $digits++ < 50)
5beb625e 14238 {
99870f4d
KW
14239 my $next = $min_floating_slop / 10;
14240 last if $next == 0; # If underflows,
14241 # use previous one
14242 $min_floating_slop = $next;
5beb625e 14243 }
430ada4c
NC
14244
14245 # It doesn't matter whether the elements of this array contain single lines
14246 # or multiple lines. main::write doesn't count the lines.
14247 my @output;
99870f4d
KW
14248
14249 foreach my $property (property_ref('*')) {
14250 foreach my $table ($property->tables) {
14251
14252 # Find code points that match, and don't match this table.
14253 my $valid = $table->get_valid_code_point;
14254 my $invalid = $table->get_invalid_code_point;
14255 my $warning = ($table->status eq $DEPRECATED)
14256 ? "'deprecated'"
14257 : '""';
14258
14259 # Test each possible combination of the property's aliases with
14260 # the table's. If this gets to be too many, could do what is done
14261 # in the set_final_comment() for Tables
14262 my @table_aliases = $table->aliases;
14263 my @property_aliases = $table->property->aliases;
807807b7
KW
14264
14265 # Every property can be optionally be prefixed by 'Is_', so test
14266 # that those work, by creating such a new alias for each
14267 # pre-existing one.
14268 push @property_aliases, map { Alias->new("Is_" . $_->name,
14269 $_->loose_match,
14270 $_->make_pod_entry,
14271 $_->externally_ok,
14272 $_->status)
14273 } @property_aliases;
99870f4d
KW
14274 my $max = max(scalar @table_aliases, scalar @property_aliases);
14275 for my $j (0 .. $max - 1) {
14276
14277 # The current alias for property is the next one on the list,
14278 # or if beyond the end, start over. Similarly for table
14279 my $property_name
14280 = $property_aliases[$j % @property_aliases]->name;
14281
14282 $property_name = "" if $table->property == $perl;
14283 my $table_alias = $table_aliases[$j % @table_aliases];
14284 my $table_name = $table_alias->name;
14285 my $loose_match = $table_alias->loose_match;
14286
14287 # If the table doesn't have a file, any test for it is
14288 # already guaranteed to be in error
14289 my $already_error = ! $table->file_path;
14290
14291 # Generate error cases for this alias.
430ada4c
NC
14292 push @output, generate_error($property_name,
14293 $table_name,
14294 $already_error);
99870f4d
KW
14295
14296 # If the table is guaranteed to always generate an error,
14297 # quit now without generating success cases.
14298 next if $already_error;
14299
14300 # Now for the success cases.
14301 my $random;
14302 if ($loose_match) {
14303
14304 # For loose matching, create an extra test case for the
14305 # standard name.
14306 my $standard = standardize($table_name);
14307
14308 # $test_name should be a unique combination for each test
14309 # case; used just to avoid duplicate tests
14310 my $test_name = "$property_name=$standard";
14311
14312 # Don't output duplicate test cases.
14313 if (! exists $test_generated{$test_name}) {
14314 $test_generated{$test_name} = 1;
430ada4c
NC
14315 push @output, generate_tests($property_name,
14316 $standard,
14317 $valid,
14318 $invalid,
14319 $warning,
14320 );
5beb625e 14321 }
99870f4d
KW
14322 $random = randomize_loose_name($table_name)
14323 }
14324 else { # Stricter match
14325 $random = randomize_stricter_name($table_name);
99598c8c 14326 }
99598c8c 14327
99870f4d
KW
14328 # Now for the main test case for this alias.
14329 my $test_name = "$property_name=$random";
14330 if (! exists $test_generated{$test_name}) {
14331 $test_generated{$test_name} = 1;
430ada4c
NC
14332 push @output, generate_tests($property_name,
14333 $random,
14334 $valid,
14335 $invalid,
14336 $warning,
14337 );
99870f4d
KW
14338
14339 # If the name is a rational number, add tests for the
14340 # floating point equivalent.
14341 if ($table_name =~ qr{/}) {
14342
14343 # Calculate the float, and find just the fraction.
14344 my $float = eval $table_name;
14345 my ($whole, $fraction)
14346 = $float =~ / (.*) \. (.*) /x;
14347
14348 # Starting with one digit after the decimal point,
14349 # create a test for each possible precision (number of
14350 # digits past the decimal point) until well beyond the
14351 # native number found on this machine. (If we started
14352 # with 0 digits, it would be an integer, which could
14353 # well match an unrelated table)
14354 PLACE:
14355 for my $i (1 .. $min_floating_slop + 3) {
14356 my $table_name = sprintf("%.*f", $i, $float);
14357 if ($i < $MIN_FRACTION_LENGTH) {
14358
14359 # If the test case has fewer digits than the
14360 # minimum acceptable precision, it shouldn't
14361 # succeed, so we expect an error for it.
14362 # E.g., 2/3 = .7 at one decimal point, and we
14363 # shouldn't say it matches .7. We should make
14364 # it be .667 at least before agreeing that the
14365 # intent was to match 2/3. But at the
14366 # less-than- acceptable level of precision, it
14367 # might actually match an unrelated number.
14368 # So don't generate a test case if this
14369 # conflating is possible. In our example, we
14370 # don't want 2/3 matching 7/10, if there is
14371 # a 7/10 code point.
14372 for my $existing
14373 (keys %nv_floating_to_rational)
14374 {
14375 next PLACE
14376 if abs($table_name - $existing)
14377 < $MAX_FLOATING_SLOP;
14378 }
430ada4c
NC
14379 push @output, generate_error($property_name,
14380 $table_name,
14381 1 # 1 => already an error
14382 );
99870f4d
KW
14383 }
14384 else {
14385
14386 # Here the number of digits exceeds the
14387 # minimum we think is needed. So generate a
14388 # success test case for it.
430ada4c
NC
14389 push @output, generate_tests($property_name,
14390 $table_name,
14391 $valid,
14392 $invalid,
14393 $warning,
14394 );
99870f4d
KW
14395 }
14396 }
99598c8c
JH
14397 }
14398 }
99870f4d
KW
14399 }
14400 }
14401 }
37e2e78e 14402
9218f1cf
KW
14403 &write($t_path,
14404 0, # Not utf8;
14405 [<DATA>,
14406 @output,
14407 (map {"Test_X('$_');\n"} @backslash_X_tests),
14408 "Finished();\n"]);
99870f4d
KW
14409 return;
14410}
99598c8c 14411
99870f4d
KW
14412# This is a list of the input files and how to handle them. The files are
14413# processed in their order in this list. Some reordering is possible if
14414# desired, but the v0 files should be first, and the extracted before the
14415# others except DAge.txt (as data in an extracted file can be over-ridden by
14416# the non-extracted. Some other files depend on data derived from an earlier
14417# file, like UnicodeData requires data from Jamo, and the case changing and
14418# folding requires data from Unicode. Mostly, it safest to order by first
14419# version releases in (except the Jamo). DAge.txt is read before the
14420# extracted ones because of the rarely used feature $compare_versions. In the
14421# unlikely event that there were ever an extracted file that contained the Age
14422# property information, it would have to go in front of DAge.
14423#
14424# The version strings allow the program to know whether to expect a file or
14425# not, but if a file exists in the directory, it will be processed, even if it
14426# is in a version earlier than expected, so you can copy files from a later
14427# release into an earlier release's directory.
14428my @input_file_objects = (
14429 Input_file->new('PropertyAliases.txt', v0,
14430 Handler => \&process_PropertyAliases,
14431 ),
14432 Input_file->new(undef, v0, # No file associated with this
3df51b85 14433 Progress_Message => 'Finishing property setup',
99870f4d
KW
14434 Handler => \&finish_property_setup,
14435 ),
14436 Input_file->new('PropValueAliases.txt', v0,
14437 Handler => \&process_PropValueAliases,
14438 Has_Missings_Defaults => $NOT_IGNORED,
14439 ),
14440 Input_file->new('DAge.txt', v3.2.0,
14441 Has_Missings_Defaults => $NOT_IGNORED,
14442 Property => 'Age'
14443 ),
14444 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
14445 Property => 'General_Category',
14446 ),
14447 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
14448 Property => 'Canonical_Combining_Class',
14449 Has_Missings_Defaults => $NOT_IGNORED,
14450 ),
14451 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
14452 Property => 'Numeric_Type',
14453 Has_Missings_Defaults => $NOT_IGNORED,
14454 ),
14455 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
14456 Property => 'East_Asian_Width',
14457 Has_Missings_Defaults => $NOT_IGNORED,
14458 ),
14459 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
14460 Property => 'Line_Break',
14461 Has_Missings_Defaults => $NOT_IGNORED,
14462 ),
14463 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
14464 Property => 'Bidi_Class',
14465 Has_Missings_Defaults => $NOT_IGNORED,
14466 ),
14467 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
14468 Property => 'Decomposition_Type',
14469 Has_Missings_Defaults => $NOT_IGNORED,
14470 ),
14471 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
14472 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
14473 Property => 'Numeric_Value',
14474 Each_Line_Handler => \&filter_numeric_value_line,
14475 Has_Missings_Defaults => $NOT_IGNORED,
14476 ),
14477 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
14478 Property => 'Joining_Group',
14479 Has_Missings_Defaults => $NOT_IGNORED,
14480 ),
14481
14482 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
14483 Property => 'Joining_Type',
14484 Has_Missings_Defaults => $NOT_IGNORED,
14485 ),
14486 Input_file->new('Jamo.txt', v2.0.0,
14487 Property => 'Jamo_Short_Name',
14488 Each_Line_Handler => \&filter_jamo_line,
14489 ),
14490 Input_file->new('UnicodeData.txt', v1.1.5,
14491 Pre_Handler => \&setup_UnicodeData,
14492
14493 # We clean up this file for some early versions.
14494 Each_Line_Handler => [ (($v_version lt v2.0.0 )
14495 ? \&filter_v1_ucd
14496 : ($v_version eq v2.1.5)
14497 ? \&filter_v2_1_5_ucd
3ffed8c2
KW
14498
14499 # And for 5.14 Perls with 6.0,
14500 # have to also make changes
14501 : ($v_version ge v6.0.0)
14502 ? \&filter_v6_ucd
14503 : undef),
99870f4d
KW
14504
14505 # And the main filter
14506 \&filter_UnicodeData_line,
14507 ],
14508 EOF_Handler => \&EOF_UnicodeData,
14509 ),
14510 Input_file->new('ArabicShaping.txt', v2.0.0,
14511 Each_Line_Handler =>
14512 [ ($v_version lt 4.1.0)
14513 ? \&filter_old_style_arabic_shaping
14514 : undef,
14515 \&filter_arabic_shaping_line,
14516 ],
14517 Has_Missings_Defaults => $NOT_IGNORED,
14518 ),
14519 Input_file->new('Blocks.txt', v2.0.0,
14520 Property => 'Block',
14521 Has_Missings_Defaults => $NOT_IGNORED,
14522 Each_Line_Handler => \&filter_blocks_lines
14523 ),
14524 Input_file->new('PropList.txt', v2.0.0,
14525 Each_Line_Handler => (($v_version lt v3.1.0)
14526 ? \&filter_old_style_proplist
14527 : undef),
14528 ),
14529 Input_file->new('Unihan.txt', v2.0.0,
14530 Pre_Handler => \&setup_unihan,
14531 Optional => 1,
14532 Each_Line_Handler => \&filter_unihan_line,
14533 ),
14534 Input_file->new('SpecialCasing.txt', v2.1.8,
14535 Each_Line_Handler => \&filter_special_casing_line,
14536 Pre_Handler => \&setup_special_casing,
14537 ),
14538 Input_file->new(
14539 'LineBreak.txt', v3.0.0,
14540 Has_Missings_Defaults => $NOT_IGNORED,
14541 Property => 'Line_Break',
14542 # Early versions had problematic syntax
14543 Each_Line_Handler => (($v_version lt v3.1.0)
14544 ? \&filter_early_ea_lb
14545 : undef),
14546 ),
14547 Input_file->new('EastAsianWidth.txt', v3.0.0,
14548 Property => 'East_Asian_Width',
14549 Has_Missings_Defaults => $NOT_IGNORED,
14550 # Early versions had problematic syntax
14551 Each_Line_Handler => (($v_version lt v3.1.0)
14552 ? \&filter_early_ea_lb
14553 : undef),
14554 ),
14555 Input_file->new('CompositionExclusions.txt', v3.0.0,
14556 Property => 'Composition_Exclusion',
14557 ),
14558 Input_file->new('BidiMirroring.txt', v3.0.1,
14559 Property => 'Bidi_Mirroring_Glyph',
14560 ),
37e2e78e
KW
14561 Input_file->new("NormalizationTest.txt", v3.0.1,
14562 Skip => 1,
14563 ),
99870f4d
KW
14564 Input_file->new('CaseFolding.txt', v3.0.1,
14565 Pre_Handler => \&setup_case_folding,
14566 Each_Line_Handler =>
14567 [ ($v_version lt v3.1.0)
14568 ? \&filter_old_style_case_folding
14569 : undef,
14570 \&filter_case_folding_line
14571 ],
99870f4d
KW
14572 ),
14573 Input_file->new('DCoreProperties.txt', v3.1.0,
14574 # 5.2 changed this file
14575 Has_Missings_Defaults => (($v_version ge v5.2.0)
14576 ? $NOT_IGNORED
14577 : $NO_DEFAULTS),
14578 ),
14579 Input_file->new('Scripts.txt', v3.1.0,
14580 Property => 'Script',
14581 Has_Missings_Defaults => $NOT_IGNORED,
14582 ),
14583 Input_file->new('DNormalizationProps.txt', v3.1.0,
14584 Has_Missings_Defaults => $NOT_IGNORED,
14585 Each_Line_Handler => (($v_version lt v4.0.1)
14586 ? \&filter_old_style_normalization_lines
14587 : undef),
14588 ),
14589 Input_file->new('HangulSyllableType.txt', v4.0.0,
14590 Has_Missings_Defaults => $NOT_IGNORED,
14591 Property => 'Hangul_Syllable_Type'),
14592 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
14593 Property => 'Word_Break',
14594 Has_Missings_Defaults => $NOT_IGNORED,
14595 ),
14596 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
14597 Property => 'Grapheme_Cluster_Break',
14598 Has_Missings_Defaults => $NOT_IGNORED,
14599 ),
37e2e78e
KW
14600 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
14601 Handler => \&process_GCB_test,
14602 ),
14603 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
14604 Skip => 1,
14605 ),
14606 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
14607 Skip => 1,
14608 ),
14609 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
14610 Skip => 1,
14611 ),
99870f4d
KW
14612 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
14613 Property => 'Sentence_Break',
14614 Has_Missings_Defaults => $NOT_IGNORED,
14615 ),
14616 Input_file->new('NamedSequences.txt', v4.1.0,
14617 Handler => \&process_NamedSequences
14618 ),
14619 Input_file->new('NameAliases.txt', v5.0.0,
14620 Property => 'Name_Alias',
14621 ),
37e2e78e
KW
14622 Input_file->new("BidiTest.txt", v5.2.0,
14623 Skip => 1,
14624 ),
99870f4d
KW
14625 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
14626 Optional => 1,
14627 Each_Line_Handler => \&filter_unihan_line,
14628 ),
14629 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
14630 Optional => 1,
14631 Each_Line_Handler => \&filter_unihan_line,
14632 ),
14633 Input_file->new('UnihanIRGSources.txt', v5.2.0,
14634 Optional => 1,
14635 Pre_Handler => \&setup_unihan,
14636 Each_Line_Handler => \&filter_unihan_line,
14637 ),
14638 Input_file->new('UnihanNumericValues.txt', v5.2.0,
14639 Optional => 1,
14640 Each_Line_Handler => \&filter_unihan_line,
14641 ),
14642 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
14643 Optional => 1,
14644 Each_Line_Handler => \&filter_unihan_line,
14645 ),
14646 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
14647 Optional => 1,
14648 Each_Line_Handler => \&filter_unihan_line,
14649 ),
14650 Input_file->new('UnihanReadings.txt', v5.2.0,
14651 Optional => 1,
14652 Each_Line_Handler => \&filter_unihan_line,
14653 ),
14654 Input_file->new('UnihanVariants.txt', v5.2.0,
14655 Optional => 1,
14656 Each_Line_Handler => \&filter_unihan_line,
14657 ),
82aed44a
KW
14658 Input_file->new('ScriptExtensions.txt', v6.0.0,
14659 Property => 'Script_Extensions',
14660 Pre_Handler => \&setup_script_extensions,
14661 ),
99870f4d 14662);
99598c8c 14663
99870f4d
KW
14664# End of all the preliminaries.
14665# Do it...
99598c8c 14666
99870f4d
KW
14667if ($compare_versions) {
14668 Carp::my_carp(<<END
14669Warning. \$compare_versions is set. Output is not suitable for production
14670END
14671 );
14672}
99598c8c 14673
99870f4d
KW
14674# Put into %potential_files a list of all the files in the directory structure
14675# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 14676# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
14677my @ignored_files_full_names = map { File::Spec->rel2abs(
14678 internal_file_to_platform($_))
14679 } keys %ignored_files;
14680File::Find::find({
14681 wanted=>sub {
37e2e78e 14682 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 14683 my $full = lc(File::Spec->rel2abs($_));
99870f4d 14684 $potential_files{$full} = 1
37e2e78e 14685 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
14686 return;
14687 }
14688}, File::Spec->curdir());
99598c8c 14689
99870f4d 14690my @mktables_list_output_files;
cdcef19a 14691my $old_start_time = 0;
cf25bb62 14692
3644ba60
KW
14693if (! -e $file_list) {
14694 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
14695 $write_unchanged_files = 1;
14696} elsif ($write_unchanged_files) {
99870f4d
KW
14697 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
14698}
14699else {
14700 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
14701 my $file_handle;
23e33b60 14702 if (! open $file_handle, "<", $file_list) {
3644ba60 14703 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
14704 $glob_list = 1;
14705 }
14706 else {
14707 my @input;
14708
14709 # Read and parse mktables.lst, placing the results from the first part
14710 # into @input, and the second part into @mktables_list_output_files
14711 for my $list ( \@input, \@mktables_list_output_files ) {
14712 while (<$file_handle>) {
14713 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
14714 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
14715 $old_start_time = $1;
14716 }
99870f4d
KW
14717 next if /^ \s* (?: \# .* )? $/x;
14718 last if /^ =+ $/x;
14719 my ( $file ) = split /\t/;
14720 push @$list, $file;
cf25bb62 14721 }
99870f4d
KW
14722 @$list = uniques(@$list);
14723 next;
cf25bb62
JH
14724 }
14725
99870f4d
KW
14726 # Look through all the input files
14727 foreach my $input (@input) {
14728 next if $input eq 'version'; # Already have checked this.
cf25bb62 14729
99870f4d
KW
14730 # Ignore if doesn't exist. The checking about whether we care or
14731 # not is done via the Input_file object.
14732 next if ! file_exists($input);
5beb625e 14733
99870f4d
KW
14734 # The paths are stored with relative names, and with '/' as the
14735 # delimiter; convert to absolute on this machine
517956bf 14736 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
99870f4d 14737 $potential_files{$full} = 1
517956bf 14738 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 14739 }
5beb625e 14740 }
cf25bb62 14741
99870f4d
KW
14742 close $file_handle;
14743}
14744
14745if ($glob_list) {
14746
14747 # Here wants to process all .txt files in the directory structure.
14748 # Convert them to full path names. They are stored in the platform's
14749 # relative style
f86864ac
KW
14750 my @known_files;
14751 foreach my $object (@input_file_objects) {
14752 my $file = $object->file;
14753 next unless defined $file;
14754 push @known_files, File::Spec->rel2abs($file);
14755 }
99870f4d
KW
14756
14757 my @unknown_input_files;
14758 foreach my $file (keys %potential_files) {
517956bf 14759 next if grep { lc($file) eq lc($_) } @known_files;
99870f4d
KW
14760
14761 # Here, the file is unknown to us. Get relative path name
14762 $file = File::Spec->abs2rel($file);
14763 push @unknown_input_files, $file;
14764
14765 # What will happen is we create a data structure for it, and add it to
14766 # the list of input files to process. First get the subdirectories
14767 # into an array
14768 my (undef, $directories, undef) = File::Spec->splitpath($file);
14769 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14770 my @directories = File::Spec->splitdir($directories);
14771
14772 # If the file isn't extracted (meaning none of the directories is the
14773 # extracted one), just add it to the end of the list of inputs.
14774 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 14775 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
14776 }
14777 else {
14778
14779 # Here, the file is extracted. It needs to go ahead of most other
14780 # processing. Search for the first input file that isn't a
14781 # special required property (that is, find one whose first_release
14782 # is non-0), and isn't extracted. Also, the Age property file is
14783 # processed before the extracted ones, just in case
14784 # $compare_versions is set.
14785 for (my $i = 0; $i < @input_file_objects; $i++) {
14786 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
14787 && lc($input_file_objects[$i]->file) ne 'dage.txt'
14788 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 14789 {
99f78760 14790 splice @input_file_objects, $i, 0,
37e2e78e 14791 Input_file->new($file, v0);
99870f4d
KW
14792 last;
14793 }
cf25bb62 14794 }
99870f4d 14795
cf25bb62 14796 }
d2d499f5 14797 }
99870f4d 14798 if (@unknown_input_files) {
23e33b60 14799 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
14800
14801The following files are unknown as to how to handle. Assuming they are
14802typical property files. You'll know by later error messages if it worked or
14803not:
14804END
99f78760 14805 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
14806 }
14807} # End of looking through directory structure for more .txt files.
5beb625e 14808
99870f4d
KW
14809# Create the list of input files from the objects we have defined, plus
14810# version
14811my @input_files = 'version';
14812foreach my $object (@input_file_objects) {
14813 my $file = $object->file;
14814 next if ! defined $file; # Not all objects have files
14815 next if $object->optional && ! -e $file;
14816 push @input_files, $file;
14817}
5beb625e 14818
99870f4d
KW
14819if ( $verbosity >= $VERBOSE ) {
14820 print "Expecting ".scalar( @input_files )." input files. ",
14821 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
14822}
cf25bb62 14823
aeab6150
KW
14824# We set $most_recent to be the most recently changed input file, including
14825# this program itself (done much earlier in this file)
99870f4d 14826foreach my $in (@input_files) {
cdcef19a
KW
14827 next unless -e $in; # Keep going even if missing a file
14828 my $mod_time = (stat $in)[9];
aeab6150 14829 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
14830
14831 # See that the input files have distinct names, to warn someone if they
14832 # are adding a new one
14833 if ($make_list) {
14834 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
14835 $directories =~ s;/$;;; # Can have extraneous trailing '/'
14836 my @directories = File::Spec->splitdir($directories);
14837 my $base = $file =~ s/\.txt$//;
14838 construct_filename($file, 'mutable', \@directories);
cf25bb62 14839 }
99870f4d 14840}
cf25bb62 14841
dff6c046 14842my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 14843 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 14844 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 14845
99870f4d
KW
14846# Now we check to see if any output files are older than youngest, if
14847# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 14848if (! $rebuild) {
99870f4d
KW
14849 foreach my $out (@mktables_list_output_files) {
14850 if ( ! file_exists($out)) {
14851 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 14852 $rebuild = 1;
99870f4d
KW
14853 last;
14854 }
14855 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
14856 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
14857 if ( (stat $out)[9] <= $most_recent ) {
14858 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 14859 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 14860 $rebuild = 1;
99870f4d 14861 last;
cf25bb62 14862 }
cf25bb62 14863 }
99870f4d 14864}
d1d1cd7a 14865if (! $rebuild) {
1265e11f 14866 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
14867 exit(0);
14868}
14869print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 14870
99870f4d
KW
14871# Ready to do the major processing. First create the perl pseudo-property.
14872$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 14873
99870f4d
KW
14874# Process each input file
14875foreach my $file (@input_file_objects) {
14876 $file->run;
d2d499f5
JH
14877}
14878
99870f4d 14879# Finish the table generation.
c4051cc5 14880
99870f4d
KW
14881print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
14882finish_Unicode();
c4051cc5 14883
99870f4d
KW
14884print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
14885compile_perl();
c4051cc5 14886
99870f4d
KW
14887print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
14888add_perl_synonyms();
c4051cc5 14889
99870f4d
KW
14890print "Writing tables\n" if $verbosity >= $PROGRESS;
14891write_all_tables();
c4051cc5 14892
99870f4d
KW
14893# Write mktables.lst
14894if ( $file_list and $make_list ) {
c4051cc5 14895
99870f4d
KW
14896 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
14897 foreach my $file (@input_files, @files_actually_output) {
14898 my (undef, $directories, $file) = File::Spec->splitpath($file);
14899 my @directories = File::Spec->splitdir($directories);
14900 $file = join '/', @directories, $file;
14901 }
14902
14903 my $ofh;
14904 if (! open $ofh,">",$file_list) {
14905 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
14906 return
14907 }
14908 else {
cdcef19a 14909 my $localtime = localtime $start_time;
99870f4d
KW
14910 print $ofh <<"END";
14911#
14912# $file_list -- File list for $0.
97050450 14913#
cdcef19a 14914# Autogenerated starting on $start_time ($localtime)
97050450
YO
14915#
14916# - First section is input files
99870f4d 14917# ($0 itself is not listed but is automatically considered an input)
98dc9551 14918# - Section separator is /^=+\$/
97050450
YO
14919# - Second section is a list of output files.
14920# - Lines matching /^\\s*#/ are treated as comments
14921# which along with blank lines are ignored.
14922#
14923
14924# Input files:
14925
99870f4d
KW
14926END
14927 print $ofh "$_\n" for sort(@input_files);
14928 print $ofh "\n=================================\n# Output files:\n\n";
14929 print $ofh "$_\n" for sort @files_actually_output;
14930 print $ofh "\n# ",scalar(@input_files)," input files\n",
14931 "# ",scalar(@files_actually_output)+1," output files\n\n",
14932 "# End list\n";
14933 close $ofh
14934 or Carp::my_carp("Failed to close $ofh: $!");
14935
14936 print "Filelist has ",scalar(@input_files)," input files and ",
14937 scalar(@files_actually_output)+1," output files\n"
14938 if $verbosity >= $VERBOSE;
14939 }
14940}
14941
14942# Output these warnings unless -q explicitly specified.
c83dffeb 14943if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
99870f4d
KW
14944 if (@unhandled_properties) {
14945 print "\nProperties and tables that unexpectedly have no code points\n";
14946 foreach my $property (sort @unhandled_properties) {
14947 print $property, "\n";
14948 }
14949 }
14950
14951 if (%potential_files) {
14952 print "\nInput files that are not considered:\n";
14953 foreach my $file (sort keys %potential_files) {
14954 print File::Spec->abs2rel($file), "\n";
14955 }
14956 }
14957 print "\nAll done\n" if $verbosity >= $VERBOSE;
14958}
5beb625e 14959exit(0);
cf25bb62 14960
99870f4d 14961# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 14962__DATA__
99870f4d 14963
5beb625e
JH
14964use strict;
14965use warnings;
14966
66fd7fd0
KW
14967# If run outside the normal test suite on an ASCII platform, you can
14968# just create a latin1_to_native() function that just returns its
14969# inputs, because that's the only function used from test.pl
14970require "test.pl";
14971
37e2e78e
KW
14972# Test qr/\X/ and the \p{} regular expression constructs. This file is
14973# constructed by mktables from the tables it generates, so if mktables is
14974# buggy, this won't necessarily catch those bugs. Tests are generated for all
14975# feasible properties; a few aren't currently feasible; see
14976# is_code_point_usable() in mktables for details.
99870f4d
KW
14977
14978# Standard test packages are not used because this manipulates SIG_WARN. It
14979# exits 0 if every non-skipped test succeeded; -1 if any failed.
14980
5beb625e
JH
14981my $Tests = 0;
14982my $Fails = 0;
99870f4d 14983
99870f4d
KW
14984sub Expect($$$$) {
14985 my $expected = shift;
14986 my $ord = shift;
14987 my $regex = shift;
14988 my $warning_type = shift; # Type of warning message, like 'deprecated'
14989 # or empty if none
14990 my $line = (caller)[2];
66fd7fd0 14991 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 14992
99870f4d 14993 # Convert the code point to hex form
23e33b60 14994 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 14995
99870f4d 14996 my @tests = "";
5beb625e 14997
37e2e78e
KW
14998 # The first time through, use all warnings. If the input should generate
14999 # a warning, add another time through with them turned off
99870f4d
KW
15000 push @tests, "no warnings '$warning_type';" if $warning_type;
15001
15002 foreach my $no_warnings (@tests) {
15003
15004 # Store any warning messages instead of outputting them
15005 local $SIG{__WARN__} = $SIG{__WARN__};
15006 my $warning_message;
15007 $SIG{__WARN__} = sub { $warning_message = $_[0] };
15008
15009 $Tests++;
15010
15011 # A string eval is needed because of the 'no warnings'.
15012 # Assumes no parens in the regular expression
15013 my $result = eval "$no_warnings
15014 my \$RegObj = qr($regex);
15015 $string =~ \$RegObj ? 1 : 0";
15016 if (not defined $result) {
15017 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
15018 $Fails++;
15019 }
15020 elsif ($result ^ $expected) {
15021 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
15022 $Fails++;
15023 }
15024 elsif ($warning_message) {
15025 if (! $warning_type || ($warning_type && $no_warnings)) {
15026 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
15027 $Fails++;
15028 }
15029 else {
15030 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
15031 }
15032 }
15033 elsif ($warning_type && ! $no_warnings) {
15034 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
15035 $Fails++;
15036 }
15037 else {
15038 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
15039 }
5beb625e 15040 }
99870f4d 15041 return;
5beb625e 15042}
d73e5302 15043
99870f4d
KW
15044sub Error($) {
15045 my $regex = shift;
5beb625e 15046 $Tests++;
99870f4d 15047 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 15048 $Fails++;
99870f4d
KW
15049 my $line = (caller)[2];
15050 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 15051 }
99870f4d
KW
15052 else {
15053 my $line = (caller)[2];
15054 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
15055 }
15056 return;
5beb625e
JH
15057}
15058
37e2e78e
KW
15059# GCBTest.txt character that separates grapheme clusters
15060my $breakable_utf8 = my $breakable = chr(0xF7);
15061utf8::upgrade($breakable_utf8);
15062
15063# GCBTest.txt character that indicates that the adjoining code points are part
15064# of the same grapheme cluster
15065my $nobreak_utf8 = my $nobreak = chr(0xD7);
15066utf8::upgrade($nobreak_utf8);
15067
15068sub Test_X($) {
15069 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
15070 # Each such line is a sequence of code points given by their hex numbers,
15071 # separated by the two characters defined just before this subroutine that
15072 # indicate that either there can or cannot be a break between the adjacent
15073 # code points. If there isn't a break, that means the sequence forms an
15074 # extended grapheme cluster, which means that \X should match the whole
15075 # thing. If there is a break, \X should stop there. This is all
15076 # converted by this routine into a match:
15077 # $string =~ /(\X)/,
15078 # Each \X should match the next cluster; and that is what is checked.
15079
15080 my $template = shift;
15081
15082 my $line = (caller)[2];
15083
15084 # The line contains characters above the ASCII range, but in Latin1. It
15085 # may or may not be in utf8, and if it is, it may or may not know it. So,
15086 # convert these characters to 8 bits. If knows is in utf8, simply
15087 # downgrade.
15088 if (utf8::is_utf8($template)) {
15089 utf8::downgrade($template);
15090 } else {
15091
15092 # Otherwise, if it is in utf8, but doesn't know it, the next lines
15093 # convert the two problematic characters to their 8-bit equivalents.
15094 # If it isn't in utf8, they don't harm anything.
15095 use bytes;
15096 $template =~ s/$nobreak_utf8/$nobreak/g;
15097 $template =~ s/$breakable_utf8/$breakable/g;
15098 }
15099
15100 # Get rid of the leading and trailing breakables
15101 $template =~ s/^ \s* $breakable \s* //x;
15102 $template =~ s/ \s* $breakable \s* $ //x;
15103
15104 # And no-breaks become just a space.
15105 $template =~ s/ \s* $nobreak \s* / /xg;
15106
15107 # Split the input into segments that are breakable between them.
15108 my @segments = split /\s*$breakable\s*/, $template;
15109
15110 my $string = "";
15111 my $display_string = "";
15112 my @should_match;
15113 my @should_display;
15114
15115 # Convert the code point sequence in each segment into a Perl string of
15116 # characters
15117 foreach my $segment (@segments) {
15118 my @code_points = split /\s+/, $segment;
15119 my $this_string = "";
15120 my $this_display = "";
15121 foreach my $code_point (@code_points) {
66fd7fd0 15122 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
15123 $this_display .= "\\x{$code_point}";
15124 }
15125
15126 # The next cluster should match the string in this segment.
15127 push @should_match, $this_string;
15128 push @should_display, $this_display;
15129 $string .= $this_string;
15130 $display_string .= $this_display;
15131 }
15132
15133 # If a string can be represented in both non-ut8 and utf8, test both cases
15134 UPGRADE:
15135 for my $to_upgrade (0 .. 1) {
678f13d5 15136
37e2e78e
KW
15137 if ($to_upgrade) {
15138
15139 # If already in utf8, would just be a repeat
15140 next UPGRADE if utf8::is_utf8($string);
15141
15142 utf8::upgrade($string);
15143 }
15144
15145 # Finally, do the \X match.
15146 my @matches = $string =~ /(\X)/g;
15147
15148 # Look through each matched cluster to verify that it matches what we
15149 # expect.
15150 my $min = (@matches < @should_match) ? @matches : @should_match;
15151 for my $i (0 .. $min - 1) {
15152 $Tests++;
15153 if ($matches[$i] eq $should_match[$i]) {
15154 print "ok $Tests - ";
15155 if ($i == 0) {
15156 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
15157 } else {
15158 print "And \\X #", $i + 1,
15159 }
15160 print " correctly matched $should_display[$i]; line $line\n";
15161 } else {
15162 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
15163 unpack("U*", $matches[$i]));
15164 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
15165 $i + 1,
15166 " should have matched $should_display[$i]",
15167 " but instead matched $matches[$i]",
15168 ". Abandoning rest of line $line\n";
15169 next UPGRADE;
15170 }
15171 }
15172
15173 # And the number of matches should equal the number of expected matches.
15174 $Tests++;
15175 if (@matches == @should_match) {
15176 print "ok $Tests - Nothing was left over; line $line\n";
15177 } else {
15178 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
15179 }
15180 }
15181
15182 return;
15183}
15184
99870f4d 15185sub Finished() {
f86864ac 15186 print "1..$Tests\n";
99870f4d 15187 exit($Fails ? -1 : 0);
5beb625e 15188}
99870f4d
KW
15189
15190Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 15191Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
15192Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
15193Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 15194Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726