This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Unicode 1.5 only had 2**16 code points
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
9be22ca5
KW
7# There was an attempt when this was first rewritten to make it 5.8
8# compatible, but that has now been abandoned, and newer constructs are used
9# as convenient.
6c68572b 10
80f41172
JH
11# NOTE: this script can run quite slowly in older/slower systems.
12# It can also consume a lot of memory (128 MB or more), you may need
13# to raise your process resource limits (e.g. in bash, "ulimit -a"
14# to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
15
cdcef19a 16my $start_time;
98dc9551 17BEGIN { # Get the time the script started running; do it at compilation to
cdcef19a
KW
18 # get it as close as possible
19 $start_time= time;
20}
21
23e33b60 22require 5.010_001;
d73e5302 23use strict;
99870f4d 24use warnings;
cf25bb62 25use Carp;
bd9ebcfd 26use Config;
99870f4d
KW
27use File::Find;
28use File::Path;
d07a55ed 29use File::Spec;
99870f4d 30use Text::Tabs;
6b64c11c 31use re "/aa";
0dee21d6 32use feature 'state';
99870f4d
KW
33
34sub DEBUG () { 0 } # Set to 0 for production; 1 for development
bd9ebcfd 35my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
99870f4d 36
74cd47d0
KW
37sub NON_ASCII_PLATFORM { ord("A") != 65 }
38
99870f4d
KW
39##########################################################################
40#
41# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42# from the Unicode database files (lib/unicore/.../*.txt), It also generates
232ed87f 43# a pod file and .t files, depending on option parameters.
99870f4d
KW
44#
45# The structure of this file is:
46# First these introductory comments; then
47# code needed for everywhere, such as debugging stuff; then
48# code to handle input parameters; then
49# data structures likely to be of external interest (some of which depend on
50# the input parameters, so follows them; then
51# more data structures and subroutine and package (class) definitions; then
52# the small actual loop to process the input files and finish up; then
53# a __DATA__ section, for the .t tests
54#
232ed87f
KW
55# This program works on all releases of Unicode so far. The outputs have been
56# scrutinized most intently for release 5.1. The others have been checked for
57# somewhat more than just sanity. It can handle all non-provisional Unicode
58# character properties in those releases.
99870f4d 59#
99870f4d
KW
60# This program is mostly about Unicode character (or code point) properties.
61# A property describes some attribute or quality of a code point, like if it
62# is lowercase or not, its name, what version of Unicode it was first defined
63# in, or what its uppercase equivalent is. Unicode deals with these disparate
64# possibilities by making all properties into mappings from each code point
65# into some corresponding value. In the case of it being lowercase or not,
66# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67# property maps each Unicode code point to a single value, called a "property
232ed87f
KW
68# value". (Some more recently defined properties, map a code point to a set
69# of values.)
99870f4d
KW
70#
71# When using a property in a regular expression, what is desired isn't the
72# mapping of the code point to its property's value, but the reverse (or the
73# mathematical "inverse relation"): starting with the property value, "Does a
74# code point map to it?" These are written in a "compound" form:
75# \p{property=value}, e.g., \p{category=punctuation}. This program generates
76# files containing the lists of code points that map to each such regular
77# expression property value, one file per list
78#
79# There is also a single form shortcut that Perl adds for many of the commonly
80# used properties. This happens for all binary properties, plus script,
81# general_category, and block properties.
82#
83# Thus the outputs of this program are files. There are map files, mostly in
84# the 'To' directory; and there are list files for use in regular expression
85# matching, all in subdirectories of the 'lib' directory, with each
86# subdirectory being named for the property that the lists in it are for.
87# Bookkeeping, test, and documentation files are also generated.
88
89my $matches_directory = 'lib'; # Where match (\p{}) files go.
90my $map_directory = 'To'; # Where map files go.
91
92# DATA STRUCTURES
93#
94# The major data structures of this program are Property, of course, but also
95# Table. There are two kinds of tables, very similar to each other.
96# "Match_Table" is the data structure giving the list of code points that have
97# a particular property value, mentioned above. There is also a "Map_Table"
98# data structure which gives the property's mapping from code point to value.
99# There are two structures because the match tables need to be combined in
100# various ways, such as constructing unions, intersections, complements, etc.,
101# and the map ones don't. And there would be problems, perhaps subtle, if
102# a map table were inadvertently operated on in some of those ways.
103# The use of separate classes with operations defined on one but not the other
104# prevents accidentally confusing the two.
105#
106# At the heart of each table's data structure is a "Range_List", which is just
107# an ordered list of "Ranges", plus ancillary information, and methods to
108# operate on them. A Range is a compact way to store property information.
109# Each range has a starting code point, an ending code point, and a value that
110# is meant to apply to all the code points between the two end points,
111# inclusive. For a map table, this value is the property value for those
112# code points. Two such ranges could be written like this:
113# 0x41 .. 0x5A, 'Upper',
114# 0x61 .. 0x7A, 'Lower'
115#
116# Each range also has a type used as a convenience to classify the values.
117# Most ranges in this program will be Type 0, or normal, but there are some
118# ranges that have a non-zero type. These are used only in map tables, and
119# are for mappings that don't fit into the normal scheme of things. Mappings
120# that require a hash entry to communicate with utf8.c are one example;
121# another example is mappings for charnames.pm to use which indicate a name
232ed87f 122# that is algorithmically determinable from its code point (and the reverse).
99870f4d
KW
123# These are used to significantly compact these tables, instead of listing
124# each one of the tens of thousands individually.
125#
126# In a match table, the value of a range is irrelevant (and hence the type as
127# well, which will always be 0), and arbitrarily set to the null string.
128# Using the example above, there would be two match tables for those two
129# entries, one named Upper would contain the 0x41..0x5A range, and the other
130# named Lower would contain 0x61..0x7A.
131#
132# Actually, there are two types of range lists, "Range_Map" is the one
133# associated with map tables, and "Range_List" with match tables.
232ed87f
KW
134# Again, this is so that methods can be defined on one and not the others so
135# as to prevent operating on them in incorrect ways.
99870f4d
KW
136#
137# Eventually, most tables are written out to files to be read by utf8_heavy.pl
138# in the perl core. All tables could in theory be written, but some are
139# suppressed because there is no current practical use for them. It is easy
140# to change which get written by changing various lists that are near the top
141# of the actual code in this file. The table data structures contain enough
142# ancillary information to allow them to be treated as separate entities for
143# writing, such as the path to each one's file. There is a heading in each
144# map table that gives the format of its entries, and what the map is for all
145# the code points missing from it. (This allows tables to be more compact.)
678f13d5 146#
99870f4d
KW
147# The Property data structure contains one or more tables. All properties
148# contain a map table (except the $perl property which is a
149# pseudo-property containing only match tables), and any properties that
150# are usable in regular expression matches also contain various matching
151# tables, one for each value the property can have. A binary property can
152# have two values, True and False (or Y and N, which are preferred by Unicode
153# terminology). Thus each of these properties will have a map table that
154# takes every code point and maps it to Y or N (but having ranges cuts the
155# number of entries in that table way down), and two match tables, one
156# which has a list of all the code points that map to Y, and one for all the
232ed87f 157# code points that map to N. (For each binary property, a third table is also
99870f4d 158# generated for the pseudo Perl property. It contains the identical code
232ed87f
KW
159# points as the Y table, but can be written in regular expressions, not in the
160# compound form, but in a "single" form like \p{IsUppercase}.) Many
161# properties are binary, but some properties have several possible values,
162# some have many, and properties like Name have a different value for every
163# named code point. Those will not, unless the controlling lists are changed,
164# have their match tables written out. But all the ones which can be used in
165# regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
166# a property would have either its map table or its match tables written but
167# not both. Again, what gets written is controlled by lists which can easily
168# be changed. Starting in 5.14, advantage was taken of this, and all the map
169# tables needed to reconstruct the Unicode db are now written out, while
170# suppressing the Unicode .txt files that contain the data. Our tables are
171# much more compact than the .txt files, so a significant space savings was
172# achieved. Also, tables are not written out that are trivially derivable
173# from tables that do get written. So, there typically is no file containing
174# the code points not matched by a binary property (the table for \P{} versus
175# lowercase \p{}), since you just need to invert the True table to get the
176# False table.
177
178# Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
179# how many match tables there are and the content of the maps. This 'Type' is
c12f2655
KW
180# different than a range 'Type', so don't get confused by the two concepts
181# having the same name.
678f13d5 182#
99870f4d
KW
183# For information about the Unicode properties, see Unicode's UAX44 document:
184
185my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
186
187# As stated earlier, this program will work on any release of Unicode so far.
188# Most obvious problems in earlier data have NOT been corrected except when
be864b6c 189# necessary to make Perl or this program work reasonably, and to keep out
232ed87f
KW
190# potential security issues. For example, no folding information was given in
191# early releases, so this program substitutes lower case instead, just so that
192# a regular expression with the /i option will do something that actually
193# gives the right results in many cases. There are also a couple other
194# corrections for version 1.1.5, commented at the point they are made. As an
195# example of corrections that weren't made (but could be) is this statement
196# from DerivedAge.txt: "The supplementary private use code points and the
197# non-character code points were assigned in version 2.0, but not specifically
198# listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
199# it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
200# further down in these introductory comments.
99870f4d 201#
232ed87f
KW
202# This program works on all non-provisional properties as of the current
203# Unicode release, though the files for some are suppressed for various
204# reasons. You can change which are output by changing lists in this program.
678f13d5 205#
dc85bd38 206# The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
99870f4d
KW
207# loose matchings rules (from Unicode TR18):
208#
209# The recommended names for UCD properties and property values are in
210# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
211# [PropValue]. There are both abbreviated names and longer, more
212# descriptive names. It is strongly recommended that both names be
213# recognized, and that loose matching of property names be used,
214# whereby the case distinctions, whitespace, hyphens, and underbar
215# are ignored.
232ed87f 216#
99870f4d
KW
217# The program still allows Fuzzy to override its determination of if loose
218# matching should be used, but it isn't currently used, as it is no longer
219# needed; the calculations it makes are good enough.
678f13d5 220#
99870f4d
KW
221# SUMMARY OF HOW IT WORKS:
222#
223# Process arguments
224#
225# A list is constructed containing each input file that is to be processed
226#
227# Each file on the list is processed in a loop, using the associated handler
228# code for each:
229# The PropertyAliases.txt and PropValueAliases.txt files are processed
230# first. These files name the properties and property values.
231# Objects are created of all the property and property value names
232# that the rest of the input should expect, including all synonyms.
233# The other input files give mappings from properties to property
234# values. That is, they list code points and say what the mapping
235# is under the given property. Some files give the mappings for
236# just one property; and some for many. This program goes through
232ed87f
KW
237# each file and populates the properties and their map tables from
238# them. Some properties are listed in more than one file, and
239# Unicode has set up a precedence as to which has priority if there
240# is a conflict. Thus the order of processing matters, and this
241# program handles the conflict possibility by processing the
242# overriding input files last, so that if necessary they replace
243# earlier values.
99870f4d
KW
244# After this is all done, the program creates the property mappings not
245# furnished by Unicode, but derivable from what it does give.
246# The tables of code points that match each property value in each
247# property that is accessible by regular expressions are created.
248# The Perl-defined properties are created and populated. Many of these
249# require data determined from the earlier steps
250# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 251# and Unicode are reconciled and warned about.
99870f4d
KW
252# All the properties are written to files
253# Any other files are written, and final warnings issued.
678f13d5 254#
99870f4d
KW
255# For clarity, a number of operators have been overloaded to work on tables:
256# ~ means invert (take all characters not in the set). The more
257# conventional '!' is not used because of the possibility of confusing
258# it with the actual boolean operation.
259# + means union
260# - means subtraction
261# & means intersection
262# The precedence of these is the order listed. Parentheses should be
263# copiously used. These are not a general scheme. The operations aren't
264# defined for a number of things, deliberately, to avoid getting into trouble.
265# Operations are done on references and affect the underlying structures, so
266# that the copy constructors for them have been overloaded to not return a new
267# clone, but the input object itself.
678f13d5 268#
99870f4d
KW
269# The bool operator is deliberately not overloaded to avoid confusion with
270# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
271#
272# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
273#
274# This program needs to be able to run under miniperl. Therefore, it uses a
275# minimum of other modules, and hence implements some things itself that could
276# be gotten from CPAN
277#
278# This program uses inputs published by the Unicode Consortium. These can
279# change incompatibly between releases without the Perl maintainers realizing
280# it. Therefore this program is now designed to try to flag these. It looks
281# at the directories where the inputs are, and flags any unrecognized files.
282# It keeps track of all the properties in the files it handles, and flags any
283# that it doesn't know how to handle. It also flags any input lines that
284# don't match the expected syntax, among other checks.
285#
286# It is also designed so if a new input file matches one of the known
287# templates, one hopefully just needs to add it to a list to have it
288# processed.
289#
290# As mentioned earlier, some properties are given in more than one file. In
291# particular, the files in the extracted directory are supposedly just
292# reformattings of the others. But they contain information not easily
9be22ca5
KW
293# derivable from the other files, including results for Unihan (which isn't
294# usually available to this program) and for unassigned code points. They
678f13d5
KW
295# also have historically had errors or been incomplete. In an attempt to
296# create the best possible data, this program thus processes them first to
297# glean information missing from the other files; then processes those other
298# files to override any errors in the extracted ones. Much of the design was
299# driven by this need to store things and then possibly override them.
300#
301# It tries to keep fatal errors to a minimum, to generate something usable for
302# testing purposes. It always looks for files that could be inputs, and will
303# warn about any that it doesn't know how to handle (the -q option suppresses
304# the warning).
99870f4d 305#
678f13d5
KW
306# Why is there more than one type of range?
307# This simplified things. There are some very specialized code points that
308# have to be handled specially for output, such as Hangul syllable names.
309# By creating a range type (done late in the development process), it
310# allowed this to be stored with the range, and overridden by other input.
311# Originally these were stored in another data structure, and it became a
312# mess trying to decide if a second file that was for the same property was
313# overriding the earlier one or not.
314#
315# Why are there two kinds of tables, match and map?
316# (And there is a base class shared by the two as well.) As stated above,
317# they actually are for different things. Development proceeded much more
318# smoothly when I (khw) realized the distinction. Map tables are used to
319# give the property value for every code point (actually every code point
320# that doesn't map to a default value). Match tables are used for regular
321# expression matches, and are essentially the inverse mapping. Separating
322# the two allows more specialized methods, and error checks so that one
323# can't just take the intersection of two map tables, for example, as that
324# is nonsensical.
99870f4d 325#
232ed87f
KW
326# What about 'fate' and 'status'. The concept of a table's fate was created
327# late when it became clear that something more was needed. The difference
328# between this and 'status' is unclean, and could be improved if someone
329# wanted to spend the effort.
330#
23e33b60
KW
331# DEBUGGING
332#
678f13d5
KW
333# This program is written so it will run under miniperl. Occasionally changes
334# will cause an error where the backtrace doesn't work well under miniperl.
335# To diagnose the problem, you can instead run it under regular perl, if you
336# have one compiled.
337#
338# There is a good trace facility. To enable it, first sub DEBUG must be set
339# to return true. Then a line like
340#
341# local $to_trace = 1 if main::DEBUG;
342#
232ed87f
KW
343# can be added to enable tracing in its lexical scope (plus dynamic) or until
344# you insert another line:
678f13d5
KW
345#
346# local $to_trace = 0 if main::DEBUG;
347#
232ed87f 348# To actually trace, use a line like "trace $a, @b, %c, ...;
678f13d5
KW
349#
350# Some of the more complex subroutines already have trace statements in them.
351# Permanent trace statements should be like:
352#
353# trace ... if main::DEBUG && $to_trace;
354#
355# If there is just one or a few files that you're debugging, you can easily
356# cause most everything else to be skipped. Change the line
357#
358# my $debug_skip = 0;
359#
360# to 1, and every file whose object is in @input_file_objects and doesn't have
232ed87f
KW
361# a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
362# Jamo.txt or UnicodeData.txt will likely cause fatal errors.
678f13d5 363#
b4a0206c 364# To compare the output tables, it may be useful to specify the -annotate
97a8a595
KW
365# flag. (As of this writing, this can't be done on a clean workspace, due to
366# requirements in Text::Tabs used in this option; so first run mktables
367# without this option.) This option adds comment lines to each table, one for
368# each non-algorithmically named character giving, currently its code point,
369# name, and graphic representation if printable (and you have a font that
370# knows about it). This makes it easier to see what the particular code
371# points are in each output table. Non-named code points are annotated with a
372# description of their status, and contiguous ones with the same description
373# will be output as a range rather than individually. Algorithmically named
374# characters are also output as ranges, except when there are just a few
375# contiguous ones.
c4019d52 376#
99870f4d
KW
377# FUTURE ISSUES
378#
379# The program would break if Unicode were to change its names so that
380# interior white space, underscores, or dashes differences were significant
381# within property and property value names.
382#
383# It might be easier to use the xml versions of the UCD if this program ever
384# would need heavy revision, and the ability to handle old versions was not
385# required.
386#
387# There is the potential for name collisions, in that Perl has chosen names
388# that Unicode could decide it also likes. There have been such collisions in
389# the past, with mostly Perl deciding to adopt the Unicode definition of the
390# name. However in the 5.2 Unicode beta testing, there were a number of such
391# collisions, which were withdrawn before the final release, because of Perl's
392# and other's protests. These all involved new properties which began with
393# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
394# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
395# Unicode document, so they are unlikely to be used by Unicode for another
396# purpose. However, they might try something beginning with 'In', or use any
397# of the other Perl-defined properties. This program will warn you of name
398# collisions, and refuse to generate tables with them, but manual intervention
399# will be required in this event. One scheme that could be implemented, if
400# necessary, would be to have this program generate another file, or add a
401# field to mktables.lst that gives the date of first definition of a property.
402# Each new release of Unicode would use that file as a basis for the next
403# iteration. And the Perl synonym addition code could sort based on the age
404# of the property, so older properties get priority, and newer ones that clash
405# would be refused; hence existing code would not be impacted, and some other
406# synonym would have to be used for the new property. This is ugly, and
407# manual intervention would certainly be easier to do in the short run; lets
408# hope it never comes to this.
678f13d5 409#
99870f4d
KW
410# A NOTE ON UNIHAN
411#
b9e57660
KW
412# This program can generate tables from the Unihan database. But that db
413# isn't normally available, so it is marked as optional. Prior to version
414# 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
415# was split into 8 different files, all beginning with the letters 'Unihan'.
416# If you plunk those files down into the directory mktables ($0) is in, this
417# program will read them and automatically create tables for the properties
418# from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
419# plus any you add to the @cjk_properties array and the @cjk_property_values
420# array, being sure to add necessary '# @missings' lines to the latter. For
421# Unicode versions earlier than 5.2, most of the Unihan properties are not
422# listed at all in PropertyAliases nor PropValueAliases. This program assumes
423# for these early releases that you want the properties that are specified in
424# the 5.2 release.
99870f4d
KW
425#
426# You may need to adjust the entries to suit your purposes. setup_unihan(),
427# and filter_unihan_line() are the functions where this is done. This program
428# already does some adjusting to make the lines look more like the rest of the
429# Unicode DB; You can see what that is in filter_unihan_line()
430#
431# There is a bug in the 3.2 data file in which some values for the
432# kPrimaryNumeric property have commas and an unexpected comment. A filter
9be22ca5
KW
433# could be added to correct these; or for a particular installation, the
434# Unihan.txt file could be edited to fix them.
99870f4d 435#
678f13d5
KW
436# HOW TO ADD A FILE TO BE PROCESSED
437#
438# A new file from Unicode needs to have an object constructed for it in
439# @input_file_objects, probably at the end or at the end of the extracted
440# ones. The program should warn you if its name will clash with others on
441# restrictive file systems, like DOS. If so, figure out a better name, and
442# add lines to the README.perl file giving that. If the file is a character
232ed87f 443# property, it should be in the format that Unicode has implicitly
678f13d5
KW
444# standardized for such files for the more recently introduced ones.
445# If so, the Input_file constructor for @input_file_objects can just be the
446# file name and release it first appeared in. If not, then it should be
447# possible to construct an each_line_handler() to massage the line into the
448# standardized form.
449#
450# For non-character properties, more code will be needed. You can look at
451# the existing entries for clues.
452#
453# UNICODE VERSIONS NOTES
454#
455# The Unicode UCD has had a number of errors in it over the versions. And
456# these remain, by policy, in the standard for that version. Therefore it is
457# risky to correct them, because code may be expecting the error. So this
458# program doesn't generally make changes, unless the error breaks the Perl
459# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
460# for U+1105, which causes real problems for the algorithms for Jamo
461# calculations, so it is changed here.
462#
463# But it isn't so clear cut as to what to do about concepts that are
464# introduced in a later release; should they extend back to earlier releases
465# where the concept just didn't exist? It was easier to do this than to not,
466# so that's what was done. For example, the default value for code points not
467# in the files for various properties was probably undefined until changed by
468# some version. No_Block for blocks is such an example. This program will
469# assign No_Block even in Unicode versions that didn't have it. This has the
470# benefit that code being written doesn't have to special case earlier
471# versions; and the detriment that it doesn't match the Standard precisely for
472# the affected versions.
473#
474# Here are some observations about some of the issues in early versions:
475#
232ed87f
KW
476# Prior to version 3.0, there were 3 character decompositions. These are not
477# handled by Unicode::Normalize, nor will it compile when presented a version
478# that has them. However, you can trivially get it to compile by simply
479# ignoring those decompositions, by changing the croak to a carp. At the time
9be22ca5
KW
480# of this writing, the line (in cpan/Unicode-Normalize/Normalize.pm or
481# cpan/Unicode-Normalize/mkheader) reads
232ed87f
KW
482#
483# croak("Weird Canonical Decomposition of U+$h");
484#
28807e1d 485# Simply comment it out. It will compile, but will not know about any three
9be22ca5 486# character decompositions.
232ed87f
KW
487
488# The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
489# that the reason is that the CJK block starting at 4E00 was removed from
490# PropList, and was not put back in until 3.1.0. The Perl extension (the
491# single property name \p{alpha}) has the correct values. But the compound
492# form is simply not generated until 3.1, as it can be argued that prior to
493# this release, this was not an official property. The comments for
494# filter_old_style_proplist() give more details.
678f13d5
KW
495#
496# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
497# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
498# reason is that 3.2 introduced U+205F=medium math space, which was not
499# classed as white space, but Perl figured out that it should have been. 4.0
500# reclassified it correctly.
501#
502# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
232ed87f
KW
503# this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
504# became 202, and ATBL was left with no code points, as all the ones that
505# mapped to 202 stayed mapped to 202. Thus if your program used the numeric
506# name for the class, it would not have been affected, but if it used the
507# mnemonic, it would have been.
678f13d5 508#
9be22ca5 509# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
678f13d5
KW
510# points which eventually came to have this script property value, instead
511# mapped to "Unknown". But in the next release all these code points were
512# moved to \p{sc=common} instead.
9be22ca5
KW
513
514# The tests furnished by Unicode for testing WordBreak and SentenceBreak
515# generate errors in 5.0 and earlier.
99870f4d
KW
516#
517# The default for missing code points for BidiClass is complicated. Starting
518# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
519# tries to do the best it can for earlier releases. It is done in
520# process_PropertyAliases()
521#
232ed87f
KW
522# In version 2.1.2, the entry in UnicodeData.txt:
523# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
524# should instead be
525# 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
526# Without this change, there are casing problems for this character.
527#
7803ad2d
KW
528# Search for $string_compare_versions to see how to compare changes to
529# properties between Unicode versions
530#
99870f4d
KW
531##############################################################################
532
533my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
534 # and errors
535my $MAX_LINE_WIDTH = 78;
536
537# Debugging aid to skip most files so as to not be distracted by them when
538# concentrating on the ones being debugged. Add
539# non_skip => 1,
540# to the constructor for those files you want processed when you set this.
541# Files with a first version number of 0 are special: they are always
c12f2655
KW
542# processed regardless of the state of this flag. Generally, Jamo.txt and
543# UnicodeData.txt must not be skipped if you want this program to not die
544# before normal completion.
99870f4d
KW
545my $debug_skip = 0;
546
e9c4b4f8
KW
547
548# Normally these are suppressed.
549my $write_Unicode_deprecated_tables = 0;
550
99870f4d
KW
551# Set to 1 to enable tracing.
552our $to_trace = 0;
553
554{ # Closure for trace: debugging aid
555 my $print_caller = 1; # ? Include calling subroutine name
556 my $main_with_colon = 'main::';
557 my $main_colon_length = length($main_with_colon);
558
559 sub trace {
560 return unless $to_trace; # Do nothing if global flag not set
561
562 my @input = @_;
563
564 local $DB::trace = 0;
565 $DB::trace = 0; # Quiet 'used only once' message
566
567 my $line_number;
568
569 # Loop looking up the stack to get the first non-trace caller
570 my $caller_line;
571 my $caller_name;
572 my $i = 0;
573 do {
574 $line_number = $caller_line;
575 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
576 $caller = $main_with_colon unless defined $caller;
577
578 $caller_name = $caller;
579
580 # get rid of pkg
581 $caller_name =~ s/.*:://;
582 if (substr($caller_name, 0, $main_colon_length)
583 eq $main_with_colon)
584 {
585 $caller_name = substr($caller_name, $main_colon_length);
586 }
587
588 } until ($caller_name ne 'trace');
589
590 # If the stack was empty, we were called from the top level
591 $caller_name = 'main' if ($caller_name eq ""
592 || $caller_name eq 'trace');
593
594 my $output = "";
9346f59a 595 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
99870f4d 596 foreach my $string (@input) {
99870f4d
KW
597 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
598 $output .= simple_dumper($string);
599 }
600 else {
601 $string = "$string" if ref $string;
602 $string = $UNDEF unless defined $string;
603 chomp $string;
604 $string = '""' if $string eq "";
605 $output .= " " if $output ne ""
606 && $string ne ""
607 && substr($output, -1, 1) ne " "
608 && substr($string, 0, 1) ne " ";
609 $output .= $string;
610 }
611 }
612
99f78760
KW
613 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
614 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
615 print STDERR $output, "\n";
616 return;
617 }
618}
619
620# This is for a rarely used development feature that allows you to compare two
621# versions of the Unicode standard without having to deal with changes caused
9be22ca5 622# by the code points introduced in the later version. You probably also want
8f21889c
KW
623# to use the -annotate option when using this. Run this program on a unicore
624# containing the starting release you want to compare. Save that output
625# structrue. Then, switching to a unicore with the ending release, change the
626# 0 in the $string_compare_versions definition just below to a string
627# containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
628# to the starting release. This program will then compile, but throw away all
629# code points introduced after the starting release. Finally use a diff tool
630# to compare the two directory structures. They include only the code points
631# common to both releases, and you can see the changes caused just by the
632# underlying release semantic changes. For versions earlier than 3.2, you
633# must copy a version of DAge.txt into the directory.
c12f2655 634my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
99870f4d
KW
635my $compare_versions = DEBUG
636 && $string_compare_versions
637 && pack "C*", split /\./, $string_compare_versions;
638
639sub uniques {
640 # Returns non-duplicated input values. From "Perl Best Practices:
641 # Encapsulated Cleverness". p. 455 in first edition.
642
643 my %seen;
0e407844
NC
644 # Arguably this breaks encapsulation, if the goal is to permit multiple
645 # distinct objects to stringify to the same value, and be interchangeable.
646 # However, for this program, no two objects stringify identically, and all
647 # lists passed to this function are either objects or strings. So this
648 # doesn't affect correctness, but it does give a couple of percent speedup.
649 no overloading;
99870f4d
KW
650 return grep { ! $seen{$_}++ } @_;
651}
652
653$0 = File::Spec->canonpath($0);
654
655my $make_test_script = 0; # ? Should we output a test script
6b5ab373 656my $make_norm_test_script = 0; # ? Should we output a normalization test script
99870f4d
KW
657my $write_unchanged_files = 0; # ? Should we update the output files even if
658 # we don't think they have changed
659my $use_directory = ""; # ? Should we chdir somewhere.
660my $pod_directory; # input directory to store the pod file.
661my $pod_file = 'perluniprops';
662my $t_path; # Path to the .t test file
663my $file_list = 'mktables.lst'; # File to store input and output file names.
664 # This is used to speed up the build, by not
665 # executing the main body of the program if
666 # nothing on the list has changed since the
667 # previous build
668my $make_list = 1; # ? Should we write $file_list. Set to always
669 # make a list so that when the pumpking is
670 # preparing a release, s/he won't have to do
671 # special things
672my $glob_list = 0; # ? Should we try to include unknown .txt files
673 # in the input.
bd9ebcfd
KW
674my $output_range_counts = $debugging_build; # ? Should we include the number
675 # of code points in ranges in
676 # the output
558712cf 677my $annotate = 0; # ? Should character names be in the output
9ef2b94f 678
99870f4d
KW
679# Verbosity levels; 0 is quiet
680my $NORMAL_VERBOSITY = 1;
681my $PROGRESS = 2;
682my $VERBOSE = 3;
683
684my $verbosity = $NORMAL_VERBOSITY;
685
0458fbc1
KW
686# Stored in mktables.lst so that if this program is called with different
687# options, will regenerate even if the files otherwise look like they're
688# up-to-date.
689my $command_line_arguments = join " ", @ARGV;
690
99870f4d
KW
691# Process arguments
692while (@ARGV) {
cf25bb62
JH
693 my $arg = shift @ARGV;
694 if ($arg eq '-v') {
99870f4d
KW
695 $verbosity = $VERBOSE;
696 }
697 elsif ($arg eq '-p') {
698 $verbosity = $PROGRESS;
699 $| = 1; # Flush buffers as we go.
700 }
701 elsif ($arg eq '-q') {
702 $verbosity = 0;
703 }
704 elsif ($arg eq '-w') {
705 $write_unchanged_files = 1; # update the files even if havent changed
706 }
707 elsif ($arg eq '-check') {
6ae7e459
YO
708 my $this = shift @ARGV;
709 my $ok = shift @ARGV;
710 if ($this ne $ok) {
711 print "Skipping as check params are not the same.\n";
712 exit(0);
713 }
00a8df5c 714 }
99870f4d
KW
715 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
716 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
717 }
3df51b85
KW
718 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
719 {
99870f4d 720 $make_test_script = 1;
99870f4d 721 }
6b5ab373
KW
722 elsif ($arg eq '-makenormtest')
723 {
724 $make_norm_test_script = 1;
725 }
99870f4d
KW
726 elsif ($arg eq '-makelist') {
727 $make_list = 1;
728 }
729 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
730 -d $use_directory or croak "Unknown directory '$use_directory'";
731 }
732 elsif ($arg eq '-L') {
733
734 # Existence not tested until have chdir'd
735 $file_list = shift;
736 }
737 elsif ($arg eq '-globlist') {
738 $glob_list = 1;
739 }
740 elsif ($arg eq '-c') {
741 $output_range_counts = ! $output_range_counts
742 }
b4a0206c 743 elsif ($arg eq '-annotate') {
558712cf 744 $annotate = 1;
bd9ebcfd
KW
745 $debugging_build = 1;
746 $output_range_counts = 1;
9ef2b94f 747 }
99870f4d
KW
748 else {
749 my $with_c = 'with';
750 $with_c .= 'out' if $output_range_counts; # Complements the state
751 croak <<END;
752usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
753 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
754 [-check A B ]
755 -c : Output comments $with_c number of code points in ranges
756 -q : Quiet Mode: Only output serious warnings.
757 -p : Set verbosity level to normal plus show progress.
758 -v : Set Verbosity level high: Show progress and non-serious
759 warnings
760 -w : Write files regardless
761 -C dir : Change to this directory before proceeding. All relative paths
762 except those specified by the -P and -T options will be done
763 with respect to this directory.
764 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 765 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
766 -L filelist : Use alternate 'filelist' instead of standard one
767 -globlist : Take as input all non-Test *.txt files in current and sub
768 directories
3df51b85
KW
769 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
770 overrides -T
99870f4d 771 -makelist : Rewrite the file list $file_list based on current setup
b4a0206c 772 -annotate : Output an annotation for each character in the table files;
97a8a595
KW
773 useful for debugging mktables, looking at diffs; but is slow
774 and memory intensive
99870f4d
KW
775 -check A B : Executes $0 only if A and B are the same
776END
777 }
778}
779
780# Stores the most-recently changed file. If none have changed, can skip the
781# build
aeab6150 782my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
783
784# Change directories now, because need to read 'version' early.
785if ($use_directory) {
3df51b85 786 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
787 $pod_directory = File::Spec->rel2abs($pod_directory);
788 }
3df51b85 789 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 790 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 791 }
99870f4d 792 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 793 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 794 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 795 }
3df51b85 796 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 797 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 798 }
00a8df5c
YO
799}
800
99870f4d
KW
801# Get Unicode version into regular and v-string. This is done now because
802# various tables below get populated based on it. These tables are populated
803# here to be near the top of the file, and so easily seeable by those needing
804# to modify things.
805open my $VERSION, "<", "version"
806 or croak "$0: can't open required file 'version': $!\n";
807my $string_version = <$VERSION>;
808close $VERSION;
809chomp $string_version;
810my $v_version = pack "C*", split /\./, $string_version; # v string
811
8f21889c
KW
812my $unicode_version = ($compare_versions)
813 ? ( "$string_compare_versions (using "
814 . "$string_version rules)")
815 : $string_version;
816
99870f4d
KW
817# The following are the complete names of properties with property values that
818# are known to not match any code points in some versions of Unicode, but that
819# may change in the future so they should be matchable, hence an empty file is
820# generated for them.
a9c9e371
KW
821my @tables_that_may_be_empty;
822push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
823 if $v_version lt v6.3.0;
99870f4d
KW
824push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
825push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
826push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
827 if $v_version ge v4.1.0;
82aed44a
KW
828push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
829 if $v_version ge v6.0.0;
f583b44c
KW
830push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
831 if $v_version ge v6.1.0;
1e958ea9
KW
832push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
833 if $v_version ge v6.2.0;
99870f4d
KW
834
835# The lists below are hashes, so the key is the item in the list, and the
836# value is the reason why it is in the list. This makes generation of
837# documentation easier.
838
839my %why_suppressed; # No file generated for these.
840
841# Files aren't generated for empty extraneous properties. This is arguable.
842# Extraneous properties generally come about because a property is no longer
843# used in a newer version of Unicode. If we generated a file without code
844# points, programs that used to work on that property will still execute
845# without errors. It just won't ever match (or will always match, with \P{}).
846# This means that the logic is now likely wrong. I (khw) think its better to
847# find this out by getting an error message. Just move them to the table
848# above to change this behavior
849my %why_suppress_if_empty_warn_if_not = (
850
851 # It is the only property that has ever officially been removed from the
852 # Standard. The database never contained any code points for it.
853 'Special_Case_Condition' => 'Obsolete',
854
855 # Apparently never official, but there were code points in some versions of
856 # old-style PropList.txt
857 'Non_Break' => 'Obsolete',
858);
859
860# These would normally go in the warn table just above, but they were changed
861# a long time before this program was written, so warnings about them are
862# moot.
863if ($v_version gt v3.2.0) {
864 push @tables_that_may_be_empty,
865 'Canonical_Combining_Class=Attached_Below_Left'
866}
867
9be22ca5
KW
868# Enum values for to_output_map() method in the Map_Table package. (0 is don't
869# output)
272501f6
KW
870my $EXTERNAL_MAP = 1;
871my $INTERNAL_MAP = 2;
ce712c88 872my $OUTPUT_ADJUSTED = 3;
272501f6 873
fcf1973c
KW
874# To override computed values for writing the map tables for these properties.
875# The default for enum map tables is to write them out, so that the Unicode
876# .txt files can be removed, but all the data to compute any property value
877# for any code point is available in a more compact form.
878my %global_to_output_map = (
879 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
c12f2655
KW
880 # get stuck supporting it if things change. Since it is a STRING
881 # property, it normally would be listed in the pod, but INTERNAL_MAP
882 # suppresses that.
fcf1973c
KW
883 Unicode_1_Name => $INTERNAL_MAP,
884
885 Present_In => 0, # Suppress, as easily computed from Age
74cd47d0
KW
886 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is
887 # retained, but needed for
888 # non-ASCII
53d34b6c
KW
889
890 # Suppress, as mapping can be found instead from the
891 # Perl_Decomposition_Mapping file
892 Decomposition_Type => 0,
fcf1973c
KW
893);
894
99870f4d
KW
895# There are several types of obsolete properties defined by Unicode. These
896# must be hand-edited for every new Unicode release.
897my %why_deprecated; # Generates a deprecated warning message if used.
898my %why_stabilized; # Documentation only
899my %why_obsolete; # Documentation only
900
901{ # Closure
8364f9eb 902 my $simple = 'Perl uses the more complete version';
99870f4d
KW
903 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
904
905 my $other_properties = 'other properties';
906 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
5d294d41 907 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
99870f4d
KW
908
909 %why_deprecated = (
5f7264c7 910 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
99870f4d
KW
911 'Jamo_Short_Name' => $contributory,
912 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
913 'Other_Alphabetic' => $contributory,
914 'Other_Default_Ignorable_Code_Point' => $contributory,
915 'Other_Grapheme_Extend' => $contributory,
916 'Other_ID_Continue' => $contributory,
917 'Other_ID_Start' => $contributory,
918 'Other_Lowercase' => $contributory,
919 'Other_Math' => $contributory,
920 'Other_Uppercase' => $contributory,
e22aaf5c
KW
921 'Expands_On_NFC' => $why_no_expand,
922 'Expands_On_NFD' => $why_no_expand,
923 'Expands_On_NFKC' => $why_no_expand,
924 'Expands_On_NFKD' => $why_no_expand,
99870f4d
KW
925 );
926
927 %why_suppressed = (
5f7264c7 928 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
99870f4d
KW
929 # contains the same information, but without the algorithmically
930 # determinable Hangul syllables'. This file is not published, so it's
931 # existence is not noted in the comment.
12fee290 932 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
99870f4d 933
5f8d1a89
KW
934 # Don't suppress ISO_Comment, as otherwise special handling is needed
935 # to differentiate between it and gc=c, which can be written as 'isc',
936 # which is the same characters as ISO_Comment's short name.
99870f4d 937
12fee290 938 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
e0b29447 939
12fee290
KW
940 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
941 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
942 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
943 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
99870f4d 944
dac6f618 945 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
99870f4d
KW
946 );
947
1704a0ea
KW
948 foreach my $property (
949
950 # The following are suppressed because they were made contributory
951 # or deprecated by Unicode before Perl ever thought about
952 # supporting them.
953 'Jamo_Short_Name',
954 'Grapheme_Link',
955 'Expands_On_NFC',
956 'Expands_On_NFD',
957 'Expands_On_NFKC',
958 'Expands_On_NFKD',
959
960 # The following are suppressed because they have been marked
961 # as deprecated for a sufficient amount of time
962 'Other_Alphabetic',
963 'Other_Default_Ignorable_Code_Point',
964 'Other_Grapheme_Extend',
965 'Other_ID_Continue',
966 'Other_ID_Start',
967 'Other_Lowercase',
968 'Other_Math',
969 'Other_Uppercase',
e22aaf5c 970 ) {
99870f4d
KW
971 $why_suppressed{$property} = $why_deprecated{$property};
972 }
cf25bb62 973
99870f4d
KW
974 # Customize the message for all the 'Other_' properties
975 foreach my $property (keys %why_deprecated) {
976 next if (my $main_property = $property) !~ s/^Other_//;
977 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
978 }
979}
980
e9c4b4f8
KW
981if ($write_Unicode_deprecated_tables) {
982 foreach my $property (keys %why_suppressed) {
983 delete $why_suppressed{$property} if $property =~
984 / ^ Other | Grapheme /x;
985 }
986}
987
99870f4d
KW
988if ($v_version ge 4.0.0) {
989 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
5f7264c7
KW
990 if ($v_version ge 6.0.0) {
991 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
992 }
99870f4d 993}
5f7264c7 994if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
99870f4d 995 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
5f7264c7 996 if ($v_version ge 6.0.0) {
63f74647 997 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
5f7264c7 998 }
99870f4d
KW
999}
1000
1001# Probably obsolete forever
1002if ($v_version ge v4.1.0) {
82aed44a
KW
1003 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1004}
1005if ($v_version ge v6.0.0) {
caa75395 1006 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
2b352efd 1007 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
99870f4d
KW
1008}
1009
1010# This program can create files for enumerated-like properties, such as
1011# 'Numeric_Type'. This file would be the same format as for a string
1012# property, with a mapping from code point to its value, so you could look up,
1013# for example, the script a code point is in. But no one so far wants this
1014# mapping, or they have found another way to get it since this is a new
1015# feature. So no file is generated except if it is in this list.
1016my @output_mapped_properties = split "\n", <<END;
1017END
1018
b9e57660
KW
1019# If you want more Unihan properties than the default, you need to add them to
1020# these arrays. Depending on the property type, @missing lines might have to
1021# be added to the second array. A sample entry would be (including the '#'):
1022# @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
99870f4d 1023my @cjk_properties = split "\n", <<'END';
99870f4d 1024END
99870f4d 1025my @cjk_property_values = split "\n", <<'END';
99870f4d
KW
1026END
1027
1028# The input files don't list every code point. Those not listed are to be
1029# defaulted to some value. Below are hard-coded what those values are for
1030# non-binary properties as of 5.1. Starting in 5.0, there are
caa75395 1031# machine-parsable comment lines in the files that give the defaults; so this
99870f4d
KW
1032# list shouldn't have to be extended. The claim is that all missing entries
1033# for binary properties will default to 'N'. Unicode tried to change that in
1034# 5.2, but the beta period produced enough protest that they backed off.
1035#
1036# The defaults for the fields that appear in UnicodeData.txt in this hash must
1037# be in the form that it expects. The others may be synonyms.
1038my $CODE_POINT = '<code point>';
1039my %default_mapping = (
1040 Age => "Unassigned",
1041 # Bidi_Class => Complicated; set in code
1042 Bidi_Mirroring_Glyph => "",
1043 Block => 'No_Block',
1044 Canonical_Combining_Class => 0,
1045 Case_Folding => $CODE_POINT,
1046 Decomposition_Mapping => $CODE_POINT,
1047 Decomposition_Type => 'None',
1048 East_Asian_Width => "Neutral",
1049 FC_NFKC_Closure => $CODE_POINT,
ec2cd995 1050 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
99870f4d
KW
1051 Grapheme_Cluster_Break => 'Other',
1052 Hangul_Syllable_Type => 'NA',
1053 ISO_Comment => "",
1054 Jamo_Short_Name => "",
1055 Joining_Group => "No_Joining_Group",
1056 # Joining_Type => Complicated; set in code
1057 kIICore => 'N', # Is converted to binary
1058 #Line_Break => Complicated; set in code
1059 Lowercase_Mapping => $CODE_POINT,
1060 Name => "",
1061 Name_Alias => "",
1062 NFC_QC => 'Yes',
1063 NFD_QC => 'Yes',
1064 NFKC_QC => 'Yes',
1065 NFKD_QC => 'Yes',
1066 Numeric_Type => 'None',
1067 Numeric_Value => 'NaN',
1068 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1069 Sentence_Break => 'Other',
1070 Simple_Case_Folding => $CODE_POINT,
1071 Simple_Lowercase_Mapping => $CODE_POINT,
1072 Simple_Titlecase_Mapping => $CODE_POINT,
1073 Simple_Uppercase_Mapping => $CODE_POINT,
1074 Titlecase_Mapping => $CODE_POINT,
1075 Unicode_1_Name => "",
1076 Unicode_Radical_Stroke => "",
1077 Uppercase_Mapping => $CODE_POINT,
1078 Word_Break => 'Other',
1079);
1080
678f13d5 1081### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1082
1083my $HEADER=<<"EOF";
1084# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85 1085# This file is machine-generated by $0 from the Unicode
8f21889c 1086# database, Version $unicode_version. Any changes made here will be lost!
cf25bb62
JH
1087EOF
1088
126c3d4e 1089my $INTERNAL_ONLY_HEADER = <<"EOF";
99870f4d
KW
1090
1091# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
fac53429
KW
1092# This file is for internal use by core Perl only. The format and even the
1093# name or existence of this file are subject to change without notice. Don't
d9ae3878
KW
1094# use it directly. Use Unicode::UCD to access the Unicode character data
1095# base.
99870f4d
KW
1096EOF
1097
1098my $DEVELOPMENT_ONLY=<<"EOF";
1099# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1100# This file contains information artificially constrained to code points
1101# present in Unicode release $string_compare_versions.
1102# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1103# not be used for production.
b6922eda
KW
1104
1105EOF
1106
346a20cf
KW
1107my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1108 ? "10FFFF"
1109 : "FFFF";
6189eadc
KW
1110my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1111my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
99870f4d 1112
2d88a86a
KW
1113# We work with above-Unicode code points, up to UV_MAX. But when you get
1114# that high, above IV_MAX, some operations don't work, and you can easily get
1115# overflow. Therefore for internal use, we use a much smaller number,
1116# translating it to UV_MAX only for output. The exact number is immaterial
1117# (all Unicode code points are treated exactly the same), but the algorithm
1118# requires it to be at least 2 * $MAX_UNICODE_CODEPOINTS + 1;
1119my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1120my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1121my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1122
1123my $MAX_PLATFORM_CODEPOINT = ~0;
1124
99870f4d
KW
1125# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1126# two must be 10; if there are 5, the first must not be a 0. Written this way
92199589
KW
1127# to decrease backtracking. The first regex allows the code point to be at
1128# the end of a word, but to work properly, the word shouldn't end with a valid
1129# hex character. The second one won't match a code point at the end of a
1130# word, and doesn't have the run-on issue
8c32d378
KW
1131my $run_on_code_point_re =
1132 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1133my $code_point_re = qr/\b$run_on_code_point_re/;
99870f4d
KW
1134
1135# This matches the beginning of the line in the Unicode db files that give the
1136# defaults for code points not listed (i.e., missing) in the file. The code
1137# depends on this ending with a semi-colon, so it can assume it is a valid
1138# field when the line is split() by semi-colons
346a20cf 1139my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
99870f4d
KW
1140
1141# Property types. Unicode has more types, but these are sufficient for our
1142# purposes.
1143my $UNKNOWN = -1; # initialized to illegal value
1144my $NON_STRING = 1; # Either binary or enum
1145my $BINARY = 2;
06f26c45
KW
1146my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1147 # tables, additional true and false tables are
1148 # generated so that false is anything matching the
1149 # default value, and true is everything else.
1150my $ENUM = 4; # Include catalog
1151my $STRING = 5; # Anything else: string or misc
99870f4d
KW
1152
1153# Some input files have lines that give default values for code points not
1154# contained in the file. Sometimes these should be ignored.
1155my $NO_DEFAULTS = 0; # Must evaluate to false
1156my $NOT_IGNORED = 1;
1157my $IGNORED = 2;
1158
1159# Range types. Each range has a type. Most ranges are type 0, for normal,
1160# and will appear in the main body of the tables in the output files, but
1161# there are other types of ranges as well, listed below, that are specially
1162# handled. There are pseudo-types as well that will never be stored as a
1163# type, but will affect the calculation of the type.
1164
1165# 0 is for normal, non-specials
1166my $MULTI_CP = 1; # Sequence of more than code point
1167my $HANGUL_SYLLABLE = 2;
1168my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1169my $NULL = 4; # The map is to the null string; utf8.c can't
1170 # handle these, nor is there an accepted syntax
1171 # for them in \p{} constructs
f86864ac 1172my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1173 # otherwise be $MULTI_CP type are instead type 0
1174
1175# process_generic_property_file() can accept certain overrides in its input.
1176# Each of these must begin AND end with $CMD_DELIM.
1177my $CMD_DELIM = "\a";
1178my $REPLACE_CMD = 'replace'; # Override the Replace
1179my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1180
1181my $NO = 0;
1182my $YES = 1;
1183
1184# Values for the Replace argument to add_range.
1185# $NO # Don't replace; add only the code points not
1186 # already present.
1187my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1188 # the comments at the subroutine definition.
1189my $UNCONDITIONALLY = 2; # Replace without conditions.
9470941f 1190my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
99870f4d 1191 # already there
7f4b1e25
KW
1192my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1193 # already there
1194my $CROAK = 6; # Die with an error if is already there
99870f4d
KW
1195
1196# Flags to give property statuses. The phrases are to remind maintainers that
1197# if the flag is changed, the indefinite article referring to it in the
1198# documentation may need to be as well.
1199my $NORMAL = "";
99870f4d
KW
1200my $DEPRECATED = 'D';
1201my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1202my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1203my $DISCOURAGED = 'X';
1204my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1205my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1206my $STRICTER = 'T';
1207my $a_bold_stricter = "a 'B<$STRICTER>'";
1208my $A_bold_stricter = "A 'B<$STRICTER>'";
1209my $STABILIZED = 'S';
1210my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1211my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1212my $OBSOLETE = 'O';
1213my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1214my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1215
5c6d9947
KW
1216# Aliases can also have an extra status:
1217my $INTERNAL_ALIAS = 'P';
1218
99870f4d
KW
1219my %status_past_participles = (
1220 $DISCOURAGED => 'discouraged',
99870f4d
KW
1221 $STABILIZED => 'stabilized',
1222 $OBSOLETE => 'obsolete',
37e2e78e 1223 $DEPRECATED => 'deprecated',
5c6d9947 1224 $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
99870f4d
KW
1225);
1226
395dfc19
KW
1227# Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1228# externally documented.
301ba948 1229my $ORDINARY = 0; # The normal fate.
395dfc19
KW
1230my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1231 # but there is a file written that can be used to
1232 # reconstruct this table
3cdaf629 1233my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
301ba948 1234 # for Perl's internal use only
277b7b16
KW
1235my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl.
1236 # Is for backwards compatibility for applications that
1237 # read the file directly, so it's format is
1238 # unchangeable.
1239my $SUPPRESSED = 4; # The file for this table is not written out, and as a
3cdaf629
KW
1240 # result, we don't bother to do many computations on
1241 # it.
277b7b16 1242my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the
3cdaf629
KW
1243 # computations anyway, as the values are needed for
1244 # things to work. This happens when we have Perl
1245 # extensions that depend on Unicode tables that
1246 # wouldn't normally be in a given Unicode version.
301ba948 1247
f5817e0a
KW
1248# The format of the values of the tables:
1249my $EMPTY_FORMAT = "";
99870f4d
KW
1250my $BINARY_FORMAT = 'b';
1251my $DECIMAL_FORMAT = 'd';
1252my $FLOAT_FORMAT = 'f';
1253my $INTEGER_FORMAT = 'i';
1254my $HEX_FORMAT = 'x';
1255my $RATIONAL_FORMAT = 'r';
1256my $STRING_FORMAT = 's';
d11155ec 1257my $ADJUST_FORMAT = 'a';
24303724 1258my $HEX_ADJUST_FORMAT = 'ax';
a14f3cb1 1259my $DECOMP_STRING_FORMAT = 'c';
c3ff2976 1260my $STRING_WHITE_SPACE_LIST = 'sw';
99870f4d
KW
1261
1262my %map_table_formats = (
1263 $BINARY_FORMAT => 'binary',
1264 $DECIMAL_FORMAT => 'single decimal digit',
1265 $FLOAT_FORMAT => 'floating point number',
1266 $INTEGER_FORMAT => 'integer',
add63c13 1267 $HEX_FORMAT => 'non-negative hex whole number; a code point',
99870f4d 1268 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1269 $STRING_FORMAT => 'string',
d11155ec 1270 $ADJUST_FORMAT => 'some entries need adjustment',
24303724 1271 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
92f9d56c 1272 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
c3ff2976 1273 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
99870f4d
KW
1274);
1275
1276# Unicode didn't put such derived files in a separate directory at first.
1277my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1278my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1279my $AUXILIARY = 'auxiliary';
1280
3854b4b8
KW
1281# Hashes and arrays that will eventually go into Heavy.pl for the use of
1282# utf8_heavy.pl and into UCD.pl for the use of UCD.pm
99870f4d
KW
1283my %loose_to_file_of; # loosely maps table names to their respective
1284 # files
1285my %stricter_to_file_of; # same; but for stricter mapping.
315bfd4e 1286my %loose_property_to_file_of; # Maps a loose property name to its map file
e1dc048d 1287my %strict_property_to_file_of; # Same, but strict
3854b4b8
KW
1288my @inline_definitions = "V0"; # Each element gives a definition of a unique
1289 # inversion list. When a definition is inlined,
1290 # its value in the hash it's in (one of the two
1291 # defined just above) will include an index into
1292 # this array. The 0th element is initialized to
9be22ca5 1293 # the definition for a zero length inversion list
89cf10cc
KW
1294my %file_to_swash_name; # Maps the file name to its corresponding key name
1295 # in the hash %utf8::SwashInfo
99870f4d
KW
1296my %nv_floating_to_rational; # maps numeric values floating point numbers to
1297 # their rational equivalent
c12f2655
KW
1298my %loose_property_name_of; # Loosely maps (non_string) property names to
1299 # standard form
e1dc048d
KW
1300my %strict_property_name_of; # Strictly maps (non_string) property names to
1301 # standard form
86a52d1e 1302my %string_property_loose_to_name; # Same, for string properties.
c15fda25
KW
1303my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1304 # the property name in standard loose form, and
1305 # 'value' is the default value for that property,
1306 # also in standard loose form.
9e4a1e86
KW
1307my %loose_to_standard_value; # loosely maps table names to the canonical
1308 # alias for them
2df7880f
KW
1309my %ambiguous_names; # keys are alias names (in standard form) that
1310 # have more than one possible meaning.
d963b40d
KW
1311my %combination_property; # keys are alias names (in standard form) that
1312 # have both a map table, and a binary one that
1313 # yields true for all non-null maps.
5d1df013
KW
1314my %prop_aliases; # Keys are standard property name; values are each
1315 # one's aliases
1e863613
KW
1316my %prop_value_aliases; # Keys of top level are standard property name;
1317 # values are keys to another hash, Each one is
1318 # one of the property's values, in standard form.
1319 # The values are that prop-val's aliases.
b9e57660 1320my %skipped_files; # List of files that we skip
2df7880f 1321my %ucd_pod; # Holds entries that will go into the UCD section of the pod
99870f4d 1322
d867ccfb
KW
1323# Most properties are immune to caseless matching, otherwise you would get
1324# nonsensical results, as properties are a function of a code point, not
1325# everything that is caselessly equivalent to that code point. For example,
1326# Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1327# be true because 's' and 'S' are equivalent caselessly. However,
1328# traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1329# extend that concept to those very few properties that are like this. Each
1330# such property will match the full range caselessly. They are hard-coded in
1331# the program; it's not worth trying to make it general as it's extremely
1332# unlikely that they will ever change.
1333my %caseless_equivalent_to;
1334
53adf6a2
KW
1335# This is the range of characters that were in Release 1 of Unicode, and
1336# removed in Release 2 (replaced with the current Hangul syllables starting at
1337# U+AC00). The range was reused starting in Release 3 for other purposes.
1338my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1339my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1340
99870f4d
KW
1341# These constants names and values were taken from the Unicode standard,
1342# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1343# syllables. The '_string' versions are so generated tables can retain the
1344# hex format, which is the more familiar value
1345my $SBase_string = "0xAC00";
1346my $SBase = CORE::hex $SBase_string;
1347my $LBase_string = "0x1100";
1348my $LBase = CORE::hex $LBase_string;
1349my $VBase_string = "0x1161";
1350my $VBase = CORE::hex $VBase_string;
1351my $TBase_string = "0x11A7";
1352my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1353my $SCount = 11172;
1354my $LCount = 19;
1355my $VCount = 21;
1356my $TCount = 28;
1357my $NCount = $VCount * $TCount;
1358
1359# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1360# with the above published constants.
1361my %Jamo;
1362my %Jamo_L; # Leading consonants
1363my %Jamo_V; # Vowels
1364my %Jamo_T; # Trailing consonants
1365
bb1dd3da
KW
1366# For code points whose name contains its ordinal as a '-ABCD' suffix.
1367# The key is the base name of the code point, and the value is an
1368# array giving all the ranges that use this base name. Each range
1369# is actually a hash giving the 'low' and 'high' values of it.
1370my %names_ending_in_code_point;
1371my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1372 # removed from the names
1373# Inverse mapping. The list of ranges that have these kinds of
1374# names. Each element contains the low, high, and base names in an
1375# anonymous hash.
1376my @code_points_ending_in_code_point;
1377
6b5ab373
KW
1378# To hold Unicode's normalization test suite
1379my @normalization_tests;
1380
bb1dd3da
KW
1381# Boolean: does this Unicode version have the hangul syllables, and are we
1382# writing out a table for them?
1383my $has_hangul_syllables = 0;
1384
1385# Does this Unicode version have code points whose names end in their
1386# respective code points, and are we writing out a table for them? 0 for no;
1387# otherwise points to first property that a table is needed for them, so that
1388# if multiple tables are needed, we don't create duplicates
1389my $needing_code_points_ending_in_code_point = 0;
1390
37e2e78e 1391my @backslash_X_tests; # List of tests read in for testing \X
6b659339 1392my @LB_tests; # List of tests read in for testing \b{lb}
06ae2722 1393my @SB_tests; # List of tests read in for testing \b{sb}
ae3bb8ea 1394my @WB_tests; # List of tests read in for testing \b{wb}
99870f4d
KW
1395my @unhandled_properties; # Will contain a list of properties found in
1396 # the input that we didn't process.
f86864ac 1397my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1398 # listed in the pod
1399my @map_properties; # Properties that get map files written
1400my @named_sequences; # NamedSequences.txt contents.
1401my %potential_files; # Generated list of all .txt files in the directory
1402 # structure so we can warn if something is being
1403 # ignored.
a0204bb1
KW
1404my @missing_early_files; # Generated list of absent files that we need to
1405 # proceed in compiling this early Unicode version
99870f4d
KW
1406my @files_actually_output; # List of files we generated.
1407my @more_Names; # Some code point names are compound; this is used
1408 # to store the extra components of them.
1409my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1410 # the minimum before we consider it equivalent to a
1411 # candidate rational
1412my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1413
1414# These store references to certain commonly used property objects
1254636b 1415my $age;
3c88a801 1416my $ccc;
99870f4d
KW
1417my $gc;
1418my $perl;
1419my $block;
3e20195b
KW
1420my $perl_charname;
1421my $print;
ac7dbdb6 1422my $All;
bc0c431d 1423my $Assigned; # All assigned characters in this Unicode release
e47e66b9
KW
1424my $DI; # Default_Ignorable_Code_Point property
1425my $NChar; # Noncharacter_Code_Point property
359523e2 1426my $script;
99870f4d
KW
1427
1428# Are there conflicting names because of beginning with 'In_', or 'Is_'
1429my $has_In_conflicts = 0;
1430my $has_Is_conflicts = 0;
1431
1432sub internal_file_to_platform ($) {
1433 # Convert our file paths which have '/' separators to those of the
1434 # platform.
1435
1436 my $file = shift;
1437 return undef unless defined $file;
1438
1439 return File::Spec->join(split '/', $file);
d07a55ed 1440}
5beb625e 1441
99870f4d
KW
1442sub file_exists ($) { # platform independent '-e'. This program internally
1443 # uses slash as a path separator.
1444 my $file = shift;
1445 return 0 if ! defined $file;
1446 return -e internal_file_to_platform($file);
1447}
5beb625e 1448
99870f4d 1449sub objaddr($) {
23e33b60
KW
1450 # Returns the address of the blessed input object.
1451 # It doesn't check for blessedness because that would do a string eval
1452 # every call, and the program is structured so that this is never called
1453 # for a non-blessed object.
99870f4d 1454
23e33b60 1455 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1456
1457 # Numifying a ref gives its address.
051df77b 1458 return pack 'J', $_[0];
99870f4d
KW
1459}
1460
558712cf 1461# These are used only if $annotate is true.
c4019d52
KW
1462# The entire range of Unicode characters is examined to populate these
1463# after all the input has been processed. But most can be skipped, as they
1464# have the same descriptive phrases, such as being unassigned
1465my @viacode; # Contains the 1 million character names
4b06b8de 1466my @age; # And their ages ("" if none)
c4019d52
KW
1467my @printable; # boolean: And are those characters printable?
1468my @annotate_char_type; # Contains a type of those characters, specifically
1469 # for the purposes of annotation.
1470my $annotate_ranges; # A map of ranges of code points that have the same
98dc9551 1471 # name for the purposes of annotation. They map to the
c4019d52
KW
1472 # upper edge of the range, so that the end point can
1473 # be immediately found. This is used to skip ahead to
1474 # the end of a range, and avoid processing each
1475 # individual code point in it.
1476my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1477 # characters, but excluding those which are
1478 # also noncharacter code points
1479
1480# The annotation types are an extension of the regular range types, though
1481# some of the latter are folded into one. Make the new types negative to
1482# avoid conflicting with the regular types
1483my $SURROGATE_TYPE = -1;
1484my $UNASSIGNED_TYPE = -2;
1485my $PRIVATE_USE_TYPE = -3;
1486my $NONCHARACTER_TYPE = -4;
1487my $CONTROL_TYPE = -5;
2d88a86a
KW
1488my $ABOVE_UNICODE_TYPE = -6;
1489my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program
c4019d52
KW
1490
1491sub populate_char_info ($) {
558712cf 1492 # Used only with the $annotate option. Populates the arrays with the
c4019d52
KW
1493 # input code point's info that are needed for outputting more detailed
1494 # comments. If calling context wants a return, it is the end point of
1495 # any contiguous range of characters that share essentially the same info
1496
1497 my $i = shift;
1498 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1499
1500 $viacode[$i] = $perl_charname->value_of($i) || "";
1254636b
KW
1501 $age[$i] = (defined $age)
1502 ? (($age->value_of($i) =~ / ^ \d \. \d $ /x)
1503 ? $age->value_of($i)
1504 : "")
1505 : "";
c4019d52
KW
1506
1507 # A character is generally printable if Unicode says it is,
1508 # but below we make sure that most Unicode general category 'C' types
1509 # aren't.
1510 $printable[$i] = $print->contains($i);
1511
1254636b
KW
1512 # But the characters in this range were removed in v2.0 and replaced by
1513 # different ones later. Modern fonts will be for the replacement
1514 # characters, so suppress printing them.
1515 if (($v_version lt v2.0
1516 || ($compare_versions && $compare_versions lt v2.0))
1517 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1518 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1519 {
1520 $printable[$i] = 0;
1521 }
1522
c4019d52
KW
1523 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1524
1525 # Only these two regular types are treated specially for annotations
1526 # purposes
1527 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1528 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1529
1530 # Give a generic name to all code points that don't have a real name.
1531 # We output ranges, if applicable, for these. Also calculate the end
1532 # point of the range.
1533 my $end;
1534 if (! $viacode[$i]) {
2d88a86a
KW
1535 if ($i > $MAX_UNICODE_CODEPOINT) {
1536 $viacode[$i] = 'Above-Unicode';
1537 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1538 $printable[$i] = 0;
1539 $end = $MAX_WORKING_CODEPOINT;
1540 }
1541 elsif ($gc-> table('Private_use')->contains($i)) {
c4019d52
KW
1542 $viacode[$i] = 'Private Use';
1543 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1544 $printable[$i] = 0;
1545 $end = $gc->table('Private_Use')->containing_range($i)->end;
1546 }
e47e66b9 1547 elsif ($NChar->contains($i)) {
c4019d52
KW
1548 $viacode[$i] = 'Noncharacter';
1549 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1550 $printable[$i] = 0;
e47e66b9 1551 $end = $NChar->containing_range($i)->end;
c4019d52
KW
1552 }
1553 elsif ($gc-> table('Control')->contains($i)) {
1254636b
KW
1554 my $name_ref = property_ref('Name_Alias');
1555 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1556 $viacode[$i] = (defined $name_ref)
1557 ? $name_ref->value_of($i)
1558 : 'Control';
c4019d52
KW
1559 $annotate_char_type[$i] = $CONTROL_TYPE;
1560 $printable[$i] = 0;
c4019d52
KW
1561 }
1562 elsif ($gc-> table('Unassigned')->contains($i)) {
c4019d52
KW
1563 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1564 $printable[$i] = 0;
1254636b
KW
1565 $viacode[$i] = 'Unassigned';
1566
349edad4 1567 if (defined $block) { # No blocks in earliest releases
1254636b 1568 $viacode[$i] .= ', block=' . $block-> value_of($i);
1d025d66 1569 $end = $gc-> table('Unassigned')->containing_range($i)->end;
c4019d52 1570
bf06c733
KW
1571 # Because we name the unassigned by the blocks they are in, it
1572 # can't go past the end of that block, and it also can't go
1573 # past the unassigned range it is in. The special table makes
1574 # sure that the non-characters, which are unassigned, are
1575 # separated out.
1576 $end = min($block->containing_range($i)->end,
1577 $unassigned_sans_noncharacters->
1578 containing_range($i)->end);
1d025d66 1579 }
1254636b
KW
1580 else {
1581 $end = $i + 1;
1582 while ($unassigned_sans_noncharacters->contains($end)) {
1583 $end++;
1584 }
1585 $end--;
1586 }
1d025d66 1587 }
35e2b416 1588 elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1d025d66
KW
1589 $viacode[$i] = 'Surrogate';
1590 $annotate_char_type[$i] = $SURROGATE_TYPE;
1591 $printable[$i] = 0;
1592 $end = $gc->table('Surrogate')->containing_range($i)->end;
13ca76ff
KW
1593 }
1594 else {
1595 Carp::my_carp_bug("Can't figure out how to annotate "
1596 . sprintf("U+%04X", $i)
1597 . ". Proceeding anyway.");
c4019d52
KW
1598 $viacode[$i] = 'UNKNOWN';
1599 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1600 $printable[$i] = 0;
1601 }
1602 }
1603
1604 # Here, has a name, but if it's one in which the code point number is
1605 # appended to the name, do that.
1606 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1607 $viacode[$i] .= sprintf("-%04X", $i);
4b06b8de 1608
4b06b8de 1609 my $limit = $perl_charname->containing_range($i)->end;
1254636b
KW
1610 if (defined $age) {
1611 # Do all these as groups of the same age, instead of individually,
1612 # because their names are so meaningless, and there are typically
1613 # large quantities of them.
1614 $end = $i + 1;
1615 while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1616 $end++;
1617 }
1618 $end--;
1619 }
1620 else {
1621 $end = $limit;
4b06b8de 1622 }
c4019d52
KW
1623 }
1624
1625 # And here, has a name, but if it's a hangul syllable one, replace it with
1626 # the correct name from the Unicode algorithm
1627 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1628 use integer;
1629 my $SIndex = $i - $SBase;
1630 my $L = $LBase + $SIndex / $NCount;
1631 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1632 my $T = $TBase + $SIndex % $TCount;
1633 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1634 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1635 $end = $perl_charname->containing_range($i)->end;
1636 }
1637
1638 return if ! defined wantarray;
1639 return $i if ! defined $end; # If not a range, return the input
1640
1641 # Save this whole range so can find the end point quickly
1642 $annotate_ranges->add_map($i, $end, $end);
1643
1644 return $end;
1645}
1646
23e33b60
KW
1647# Commented code below should work on Perl 5.8.
1648## This 'require' doesn't necessarily work in miniperl, and even if it does,
1649## the native perl version of it (which is what would operate under miniperl)
1650## is extremely slow, as it does a string eval every call.
7e017d6d 1651#my $has_fast_scalar_util = $^X !~ /miniperl/
23e33b60
KW
1652# && defined eval "require Scalar::Util";
1653#
1654#sub objaddr($) {
1655# # Returns the address of the blessed input object. Uses the XS version if
1656# # available. It doesn't check for blessedness because that would do a
1657# # string eval every call, and the program is structured so that this is
1658# # never called for a non-blessed object.
1659#
1660# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1661#
1662# # Check at least that is a ref.
1663# my $pkg = ref($_[0]) or return undef;
1664#
1665# # Change to a fake package to defeat any overloaded stringify
1666# bless $_[0], 'main::Fake';
1667#
1668# # Numifying a ref gives its address.
051df77b 1669# my $addr = pack 'J', $_[0];
23e33b60
KW
1670#
1671# # Return to original class
1672# bless $_[0], $pkg;
1673# return $addr;
1674#}
1675
99870f4d
KW
1676sub max ($$) {
1677 my $a = shift;
1678 my $b = shift;
1679 return $a if $a >= $b;
1680 return $b;
1681}
1682
1683sub min ($$) {
1684 my $a = shift;
1685 my $b = shift;
1686 return $a if $a <= $b;
1687 return $b;
1688}
1689
1690sub clarify_number ($) {
1691 # This returns the input number with underscores inserted every 3 digits
1692 # in large (5 digits or more) numbers. Input must be entirely digits, not
1693 # checked.
1694
1695 my $number = shift;
1696 my $pos = length($number) - 3;
1697 return $number if $pos <= 1;
1698 while ($pos > 0) {
1699 substr($number, $pos, 0) = '_';
1700 $pos -= 3;
5beb625e 1701 }
99870f4d 1702 return $number;
99598c8c
JH
1703}
1704
731cb813
KW
1705sub clarify_code_point_count ($) {
1706 # This is like clarify_number(), but the input is assumed to be a count of
1707 # code points, rather than a generic number.
1708
2d88a86a
KW
1709 my $append = "";
1710
1711 my $number = shift;
1712 if ($number > $MAX_UNICODE_CODEPOINTS) {
1713 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1714 return "All above-Unicode code points" if $number == 0;
1715 $append = " + all above-Unicode code points";
1716 }
1717 return clarify_number($number) . $append;
731cb813 1718}
12ac2576 1719
99870f4d 1720package Carp;
7ebf06b3 1721
99870f4d
KW
1722# These routines give a uniform treatment of messages in this program. They
1723# are placed in the Carp package to cause the stack trace to not include them,
1724# although an alternative would be to use another package and set @CARP_NOT
1725# for it.
12ac2576 1726
99870f4d 1727our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1728
99f78760
KW
1729# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1730# and overload trying to load Scalar:Util under miniperl. See
1731# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1732undef $overload::VERSION;
1733
99870f4d
KW
1734sub my_carp {
1735 my $message = shift || "";
1736 my $nofold = shift || 0;
7ebf06b3 1737
99870f4d
KW
1738 if ($message) {
1739 $message = main::join_lines($message);
1740 $message =~ s/^$0: *//; # Remove initial program name
1741 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1742 $message = "\n$0: $message;";
12ac2576 1743
99870f4d
KW
1744 # Fold the message with program name, semi-colon end punctuation
1745 # (which looks good with the message that carp appends to it), and a
1746 # hanging indent for continuation lines.
1747 $message = main::simple_fold($message, "", 4) unless $nofold;
1748 $message =~ s/\n$//; # Remove the trailing nl so what carp
1749 # appends is to the same line
1750 }
12ac2576 1751
99870f4d 1752 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1753
99870f4d
KW
1754 carp $message;
1755 return;
1756}
7ebf06b3 1757
99870f4d
KW
1758sub my_carp_bug {
1759 # This is called when it is clear that the problem is caused by a bug in
1760 # this program.
7ebf06b3 1761
99870f4d
KW
1762 my $message = shift;
1763 $message =~ s/^$0: *//;
1764 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1765 carp $message;
1766 return;
1767}
7ebf06b3 1768
99870f4d
KW
1769sub carp_too_few_args {
1770 if (@_ != 2) {
1771 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1772 return;
12ac2576 1773 }
7ebf06b3 1774
99870f4d
KW
1775 my $args_ref = shift;
1776 my $count = shift;
7ebf06b3 1777
99870f4d
KW
1778 my_carp_bug("Need at least $count arguments to "
1779 . (caller 1)[3]
1780 . ". Instead got: '"
1781 . join ', ', @$args_ref
1782 . "'. No action taken.");
1783 return;
12ac2576
JP
1784}
1785
99870f4d
KW
1786sub carp_extra_args {
1787 my $args_ref = shift;
1788 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1789
99870f4d
KW
1790 unless (ref $args_ref) {
1791 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1792 return;
1793 }
1794 my ($package, $file, $line) = caller;
1795 my $subroutine = (caller 1)[3];
cf25bb62 1796
99870f4d
KW
1797 my $list;
1798 if (ref $args_ref eq 'HASH') {
1799 foreach my $key (keys %$args_ref) {
1800 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1801 }
99870f4d 1802 $list = join ', ', each %{$args_ref};
cf25bb62 1803 }
99870f4d
KW
1804 elsif (ref $args_ref eq 'ARRAY') {
1805 foreach my $arg (@$args_ref) {
1806 $arg = $UNDEF unless defined $arg;
1807 }
1808 $list = join ', ', @$args_ref;
1809 }
1810 else {
1811 my_carp_bug("Can't cope with ref "
1812 . ref($args_ref)
1813 . " . argument to 'carp_extra_args'. Not checking arguments.");
1814 return;
1815 }
1816
1817 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1818 return;
d73e5302
JH
1819}
1820
99870f4d
KW
1821package main;
1822
1823{ # Closure
1824
1825 # This program uses the inside-out method for objects, as recommended in
1b6b3fa9
KW
1826 # "Perl Best Practices". (This is the best solution still, since this has
1827 # to run under miniperl.) This closure aids in generating those. There
99870f4d
KW
1828 # are two routines. setup_package() is called once per package to set
1829 # things up, and then set_access() is called for each hash representing a
1830 # field in the object. These routines arrange for the object to be
1831 # properly destroyed when no longer used, and for standard accessor
1832 # functions to be generated. If you need more complex accessors, just
1833 # write your own and leave those accesses out of the call to set_access().
1834 # More details below.
1835
1836 my %constructor_fields; # fields that are to be used in constructors; see
1837 # below
1838
1839 # The values of this hash will be the package names as keys to other
1840 # hashes containing the name of each field in the package as keys, and
1841 # references to their respective hashes as values.
1842 my %package_fields;
1843
1844 sub setup_package {
1845 # Sets up the package, creating standard DESTROY and dump methods
1846 # (unless already defined). The dump method is used in debugging by
1847 # simple_dumper().
1848 # The optional parameters are:
1849 # a) a reference to a hash, that gets populated by later
1850 # set_access() calls with one of the accesses being
1851 # 'constructor'. The caller can then refer to this, but it is
1852 # not otherwise used by these two routines.
1853 # b) a reference to a callback routine to call during destruction
1854 # of the object, before any fields are actually destroyed
1855
1856 my %args = @_;
1857 my $constructor_ref = delete $args{'Constructor_Fields'};
1858 my $destroy_callback = delete $args{'Destroy_Callback'};
1859 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1860
1861 my %fields;
1862 my $package = (caller)[0];
1863
1864 $package_fields{$package} = \%fields;
1865 $constructor_fields{$package} = $constructor_ref;
1866
1867 unless ($package->can('DESTROY')) {
1868 my $destroy_name = "${package}::DESTROY";
1869 no strict "refs";
1870
1871 # Use typeglob to give the anonymous subroutine the name we want
1872 *$destroy_name = sub {
1873 my $self = shift;
ffe43484 1874 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1875
1876 $self->$destroy_callback if $destroy_callback;
1877 foreach my $field (keys %{$package_fields{$package}}) {
1878 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1879 delete $package_fields{$package}{$field}{$addr};
1880 }
1881 return;
1882 }
1883 }
1884
1885 unless ($package->can('dump')) {
1886 my $dump_name = "${package}::dump";
1887 no strict "refs";
1888 *$dump_name = sub {
1889 my $self = shift;
1890 return dump_inside_out($self, $package_fields{$package}, @_);
1891 }
1892 }
1893 return;
1894 }
1895
1896 sub set_access {
1897 # Arrange for the input field to be garbage collected when no longer
1898 # needed. Also, creates standard accessor functions for the field
1899 # based on the optional parameters-- none if none of these parameters:
1900 # 'addable' creates an 'add_NAME()' accessor function.
1901 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1902 # function.
1903 # 'settable' creates a 'set_NAME()' accessor function.
1904 # 'constructor' doesn't create an accessor function, but adds the
1905 # field to the hash that was previously passed to
1906 # setup_package();
1907 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1908 # 'add' etc. all mean 'addable'.
1909 # The read accessor function will work on both array and scalar
1910 # values. If another accessor in the parameter list is 'a', the read
1911 # access assumes an array. You can also force it to be array access
1912 # by specifying 'readable_array' instead of 'readable'
1913 #
1914 # A sort-of 'protected' access can be set-up by preceding the addable,
1915 # readable or settable with some initial portion of 'protected_' (but,
1916 # the underscore is required), like 'p_a', 'pro_set', etc. The
1917 # "protection" is only by convention. All that happens is that the
1918 # accessor functions' names begin with an underscore. So instead of
1919 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1920 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1921 # end of each package, and then storing the __LINE__ ranges and
1922 # checking them on every accessor. But that is way overkill.)
1923
1924 # We create anonymous subroutines as the accessors and then use
1925 # typeglobs to assign them to the proper package and name
1926
1927 my $name = shift; # Name of the field
1928 my $field = shift; # Reference to the inside-out hash containing the
1929 # field
1930
1931 my $package = (caller)[0];
1932
1933 if (! exists $package_fields{$package}) {
1934 croak "$0: Must call 'setup_package' before 'set_access'";
1935 }
d73e5302 1936
99870f4d
KW
1937 # Stash the field so DESTROY can get it.
1938 $package_fields{$package}{$name} = $field;
cf25bb62 1939
99870f4d
KW
1940 # Remaining arguments are the accessors. For each...
1941 foreach my $access (@_) {
1942 my $access = lc $access;
cf25bb62 1943
99870f4d 1944 my $protected = "";
cf25bb62 1945
99870f4d
KW
1946 # Match the input as far as it goes.
1947 if ($access =~ /^(p[^_]*)_/) {
1948 $protected = $1;
1949 if (substr('protected_', 0, length $protected)
1950 eq $protected)
1951 {
1952
1953 # Add 1 for the underscore not included in $protected
1954 $access = substr($access, length($protected) + 1);
1955 $protected = '_';
1956 }
1957 else {
1958 $protected = "";
1959 }
1960 }
1961
1962 if (substr('addable', 0, length $access) eq $access) {
1963 my $subname = "${package}::${protected}add_$name";
1964 no strict "refs";
1965
1966 # add_ accessor. Don't add if already there, which we
1967 # determine using 'eq' for scalars and '==' otherwise.
1968 *$subname = sub {
1969 use strict "refs";
1970 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1971 my $self = shift;
1972 my $value = shift;
ffe43484 1973 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1974 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1975 if (ref $value) {
f998e60c 1976 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1977 }
1978 else {
f998e60c 1979 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1980 }
f998e60c 1981 push @{$field->{$addr}}, $value;
99870f4d
KW
1982 return;
1983 }
1984 }
1985 elsif (substr('constructor', 0, length $access) eq $access) {
1986 if ($protected) {
1987 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1988 }
1989 else {
1990 $constructor_fields{$package}{$name} = $field;
1991 }
1992 }
1993 elsif (substr('readable_array', 0, length $access) eq $access) {
1994
1995 # Here has read access. If one of the other parameters for
1996 # access is array, or this one specifies array (by being more
1997 # than just 'readable_'), then create a subroutine that
1998 # assumes the data is an array. Otherwise just a scalar
1999 my $subname = "${package}::${protected}$name";
2000 if (grep { /^a/i } @_
2001 or length($access) > length('readable_'))
2002 {
2003 no strict "refs";
2004 *$subname = sub {
2005 use strict "refs";
23e33b60 2006 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 2007 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
2008 if (ref $field->{$addr} ne 'ARRAY') {
2009 my $type = ref $field->{$addr};
2010 $type = 'scalar' unless $type;
2011 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
2012 return;
2013 }
2014 return scalar @{$field->{$addr}} unless wantarray;
2015
2016 # Make a copy; had problems with caller modifying the
2017 # original otherwise
2018 my @return = @{$field->{$addr}};
2019 return @return;
2020 }
2021 }
2022 else {
2023
2024 # Here not an array value, a simpler function.
2025 no strict "refs";
2026 *$subname = sub {
2027 use strict "refs";
23e33b60 2028 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 2029 no overloading;
051df77b 2030 return $field->{pack 'J', $_[0]};
99870f4d
KW
2031 }
2032 }
2033 }
2034 elsif (substr('settable', 0, length $access) eq $access) {
2035 my $subname = "${package}::${protected}set_$name";
2036 no strict "refs";
2037 *$subname = sub {
2038 use strict "refs";
23e33b60
KW
2039 if (main::DEBUG) {
2040 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2041 Carp::carp_extra_args(\@_) if @_ > 2;
2042 }
2043 # $self is $_[0]; $value is $_[1]
f998e60c 2044 no overloading;
051df77b 2045 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
2046 return;
2047 }
2048 }
2049 else {
2050 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2051 }
cf25bb62 2052 }
99870f4d 2053 return;
cf25bb62 2054 }
99870f4d
KW
2055}
2056
2057package Input_file;
2058
2059# All input files use this object, which stores various attributes about them,
2060# and provides for convenient, uniform handling. The run method wraps the
2061# processing. It handles all the bookkeeping of opening, reading, and closing
2062# the file, returning only significant input lines.
2063#
2064# Each object gets a handler which processes the body of the file, and is
74cd47d0
KW
2065# called by run(). All character property files must use the generic,
2066# default handler, which has code scrubbed to handle things you might not
2067# expect, including automatic EBCDIC handling. For files that don't deal with
2068# mapping code points to a property value, such as test files,
2069# PropertyAliases, PropValueAliases, and named sequences, you can override the
2070# handler to be a custom one. Such a handler should basically be a
2071# while(next_line()) {...} loop.
99870f4d
KW
2072#
2073# You can also set up handlers to
40045c2d 2074# 0) call during object construction time, after everything else is done
537124e4 2075# 1) call before the first line is read, for pre processing
83b68635
KW
2076# 2) call to adjust each line of the input before the main handler gets
2077# them. This can be automatically generated, if appropriately simple
2078# enough, by specifiying a Properties parameter in the constructor.
99870f4d 2079# 3) call upon EOF before the main handler exits its loop
537124e4 2080# 4) call at the end, for post processing
99870f4d
KW
2081#
2082# $_ is used to store the input line, and is to be filtered by the
2083# each_line_handler()s. So, if the format of the line is not in the desired
2084# format for the main handler, these are used to do that adjusting. They can
2085# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
3848e5b7
KW
2086# so the $_ output of one is used as the input to the next. The eof handler
2087# is also stackable, but none of the others are, but could easily be changed
2088# to be so.
99870f4d 2089#
a0204bb1
KW
2090# Some properties are used by the Perl core but aren't defined until later
2091# Unicode releases. The perl interpreter would have problems working when
2092# compiled with an earlier Unicode version that doesn't have them, so we need
2093# to define them somehow for those releases. The 'Early' constructor
2094# parameter can be used to automatically handle this. It is essentially
2095# ignored if the Unicode version being compiled has a data file for this
2096# property. Either code to execute or a file to read can be specified.
2097# Details are at the %early definition.
2098#
99870f4d
KW
2099# Most of the handlers can call insert_lines() or insert_adjusted_lines()
2100# which insert the parameters as lines to be processed before the next input
3848e5b7 2101# file line is read. This allows the EOF handler(s) to flush buffers, for
99870f4d
KW
2102# example. The difference between the two routines is that the lines inserted
2103# by insert_lines() are subjected to the each_line_handler()s. (So if you
9be22ca5
KW
2104# called it from such a handler, you would get infinite recursion without some
2105# mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go
2106# directly to the main handler without any adjustments. If the
2107# post-processing handler calls any of these, there will be no effect. Some
2108# error checking for these conditions could be added, but it hasn't been done.
99870f4d
KW
2109#
2110# carp_bad_line() should be called to warn of bad input lines, which clears $_
2111# to prevent further processing of the line. This routine will output the
2112# message as a warning once, and then keep a count of the lines that have the
2113# same message, and output that count at the end of the file's processing.
2114# This keeps the number of messages down to a manageable amount.
2115#
2116# get_missings() should be called to retrieve any @missing input lines.
2117# Messages will be raised if this isn't done if the options aren't to ignore
2118# missings.
2119
2120sub trace { return main::trace(@_); }
2121
99870f4d
KW
2122{ # Closure
2123 # Keep track of fields that are to be put into the constructor.
2124 my %constructor_fields;
2125
2126 main::setup_package(Constructor_Fields => \%constructor_fields);
2127
2128 my %file; # Input file name, required
2129 main::set_access('file', \%file, qw{ c r });
2130
2131 my %first_released; # Unicode version file was first released in, required
2132 main::set_access('first_released', \%first_released, qw{ c r });
2133
2134 my %handler; # Subroutine to process the input file, defaults to
2135 # 'process_generic_property_file'
2136 main::set_access('handler', \%handler, qw{ c });
2137
2138 my %property;
2139 # name of property this file is for. defaults to none, meaning not
2140 # applicable, or is otherwise determinable, for example, from each line.
696609bf 2141 main::set_access('property', \%property, qw{ c r });
99870f4d
KW
2142
2143 my %optional;
3274d896
KW
2144 # This is either an unsigned number, or a list of property names. In the
2145 # former case, if it is non-zero, it means the file is optional, so if the
2146 # file is absent, no warning about that is output. In the latter case, it
2147 # is a list of properties that the file (exclusively) defines. If the
2148 # file is present, tables for those properties will be produced; if
2149 # absent, none will, even if they are listed elsewhere (namely
2150 # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2151 # and no warnings will be raised about them not being available. (And no
2152 # warning about the file itself will be raised.)
2153 main::set_access('optional', \%optional, qw{ c readable_array } );
99870f4d
KW
2154
2155 my %non_skip;
2156 # This is used for debugging, to skip processing of all but a few input
2157 # files. Add 'non_skip => 1' to the constructor for those files you want
2158 # processed when you set the $debug_skip global.
2159 main::set_access('non_skip', \%non_skip, 'c');
2160
37e2e78e 2161 my %skip;
0dee21d6
KW
2162 # This is used to skip processing of this input file (semi-) permanently.
2163 # The value should be the reason the file is being skipped. It is used
2164 # for files that we aren't planning to process anytime soon, but want to
2165 # allow to be in the directory and be checked for their names not
2166 # conflicting with any other files on a DOS 8.3 name filesystem, but to
2167 # not otherwise be processed, and to not raise a warning about not being
2168 # handled. In the constructor call, any value that evaluates to a numeric
2169 # 0 or undef means don't skip. Any other value is a string giving the
2170 # reason it is being skippped, and this will appear in generated pod.
2171 # However, an empty string reason will suppress the pod entry.
2172 # Internally, calls that evaluate to numeric 0 are changed into undef to
2173 # distinguish them from an empty string call.
2174 main::set_access('skip', \%skip, 'c', 'r');
37e2e78e 2175
99870f4d
KW
2176 my %each_line_handler;
2177 # list of subroutines to look at and filter each non-comment line in the
2178 # file. defaults to none. The subroutines are called in order, each is
2179 # to adjust $_ for the next one, and the final one adjusts it for
2180 # 'handler'
2181 main::set_access('each_line_handler', \%each_line_handler, 'c');
2182
83b68635
KW
2183 my %properties; # Optional ordered list of the properties that occur in each
2184 # meaningful line of the input file. If present, an appropriate
2185 # each_line_handler() is automatically generated and pushed onto the stack
2186 # of such handlers. This is useful when a file contains multiple
2187 # proerties per line, but no other special considerations are necessary.
2188 # The special value "<ignored>" means to discard the corresponding input
2189 # field.
2190 # Any @missing lines in the file should also match this syntax; no such
2191 # files exist as of 6.3. But if it happens in a future release, the code
2192 # could be expanded to properly parse them.
2193 main::set_access('properties', \%properties, qw{ c r });
2194
99870f4d
KW
2195 my %has_missings_defaults;
2196 # ? Are there lines in the file giving default values for code points
2197 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2198 # the norm, but IGNORED means it has such lines, but the handler doesn't
2199 # use them. Having these three states allows us to catch changes to the
83b68635
KW
2200 # UCD that this program should track. XXX This could be expanded to
2201 # specify the syntax for such lines, like %properties above.
99870f4d
KW
2202 main::set_access('has_missings_defaults',
2203 \%has_missings_defaults, qw{ c r });
2204
40045c2d
KW
2205 my %construction_time_handler;
2206 # Subroutine to call at the end of the new method. If undef, no such
2207 # handler is called.
2208 main::set_access('construction_time_handler',
2209 \%construction_time_handler, qw{ c });
2210
99870f4d
KW
2211 my %pre_handler;
2212 # Subroutine to call before doing anything else in the file. If undef, no
2213 # such handler is called.
2214 main::set_access('pre_handler', \%pre_handler, qw{ c });
2215
2216 my %eof_handler;
3848e5b7 2217 # Subroutines to call upon getting an EOF on the input file, but before
99870f4d
KW
2218 # that is returned to the main handler. This is to allow buffers to be
2219 # flushed. The handler is expected to call insert_lines() or
2220 # insert_adjusted() with the buffered material
3848e5b7 2221 main::set_access('eof_handler', \%eof_handler, qw{ c });
99870f4d
KW
2222
2223 my %post_handler;
2224 # Subroutine to call after all the lines of the file are read in and
9be22ca5
KW
2225 # processed. If undef, no such handler is called. Note that this cannot
2226 # add lines to be processed; instead use eof_handler
99870f4d
KW
2227 main::set_access('post_handler', \%post_handler, qw{ c });
2228
2229 my %progress_message;
2230 # Message to print to display progress in lieu of the standard one
2231 main::set_access('progress_message', \%progress_message, qw{ c });
2232
2233 my %handle;
2234 # cache open file handle, internal. Is undef if file hasn't been
2235 # processed at all, empty if has;
2236 main::set_access('handle', \%handle);
2237
2238 my %added_lines;
2239 # cache of lines added virtually to the file, internal
2240 main::set_access('added_lines', \%added_lines);
2241
74cd47d0
KW
2242 my %remapped_lines;
2243 # cache of lines added virtually to the file, internal
2244 main::set_access('remapped_lines', \%remapped_lines);
2245
99870f4d
KW
2246 my %errors;
2247 # cache of errors found, internal
2248 main::set_access('errors', \%errors);
2249
2250 my %missings;
2251 # storage of '@missing' defaults lines
2252 main::set_access('missings', \%missings);
2253
a0204bb1
KW
2254 my %early;
2255 # Used for properties that must be defined (for Perl's purposes) on
2256 # versions of Unicode earlier than Unicode itself defines them. The
2257 # parameter is an array (it would be better to be a hash, but not worth
2258 # bothering about due to its rare use).
2259 #
2260 # The first element is either a code reference to call when in a release
2261 # earlier than the Unicode file is available in, or it is an alternate
2262 # file to use instead of the non-existent one. This file must have been
2263 # plunked down in the same directory as mktables. Should you be compiling
2264 # on a release that needs such a file, mktables will abort the
2265 # compilation, and tell you where to get the necessary file(s), and what
2266 # name(s) to use to store them as.
2267 # In the case of specifying an alternate file, the array must contain two
2268 # further elements:
2269 #
2270 # [1] is the name of the property that will be generated by this file.
2271 # The class automatically takes the input file and excludes any code
2272 # points in it that were not assigned in the Unicode version being
2273 # compiled. It then uses this result to define the property in the given
2274 # version. Since the property doesn't actually exist in the Unicode
2275 # version being compiled, this should be a name accessible only by core
2276 # perl. If it is the same name as the regular property, the constructor
2277 # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2278 # get output, and so will be unusable by non-core code. Otherwise it gets
2279 # marked as $INTERNAL_ONLY.
2280 #
2281 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2282 # the Hangul syllables in that release (which were ripped out in version
2283 # 2) for the given property . (Hence it is ignored except when compiling
2284 # version 1. You only get one value that applies to all of them, which
2285 # may not be the actual reality, but probably nobody cares anyway for
2286 # these obsolete characters.)
2287 #
3d36fe12
KW
2288 # [3] if present is the default value for the property to assign for code
2289 # points not given in the input. If not present, the default from the
2290 # normal property is used
2291 #
2292 # [4] if present must be the string 'ONLY_EARLY'. Normally, when
cbdbe9d4
KW
2293 # compiling Unicode versions that don't invoke the early handling, the
2294 # name in [1] is added as an alias to the property name used for these.
2295 # This parameter says to not do this.
2296 #
a0204bb1
KW
2297 # Not all files can be handled in the above way, and so the code ref
2298 # alternative is available. It can do whatever it needs to. The other
2299 # array elements are optional in this case, and the code is free to use or
2300 # ignore them if they are present.
2301 #
2302 # Internally, the constructor unshifts a 0 or 1 onto this array to
2303 # indicate if an early alternative is actually being used or not. This
2304 # makes for easier testing later on.
2305 main::set_access('early', \%early, 'c');
2306
0dee21d6
KW
2307 my %required_even_in_debug_skip;
2308 # debug_skip is used to speed up compilation during debugging by skipping
2309 # processing files that are not needed for the task at hand. However,
2310 # some files pretty much can never be skipped, and this is used to specify
2311 # that this is one of them. In order to skip this file, the call to the
2312 # constructor must be edited to comment out this parameter.
2313 main::set_access('required_even_in_debug_skip',
2314 \%required_even_in_debug_skip, 'c');
2315
d8062573
KW
2316 my %withdrawn;
2317 # Some files get removed from the Unicode DB. This is a version object
2318 # giving the first release without this file.
2319 main::set_access('withdrawn', \%withdrawn, 'c');
2320
0dee21d6
KW
2321 my %in_this_release;
2322 # Calculated value from %first_released and %withdrawn. Are we compiling
2323 # a Unicode release which includes this file?
2324 main::set_access('in_this_release', \%in_this_release);
2325
74cd47d0
KW
2326 sub _next_line;
2327 sub _next_line_with_remapped_range;
2328
99870f4d
KW
2329 sub new {
2330 my $class = shift;
2331
2332 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 2333 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2334
2335 # Set defaults
2336 $handler{$addr} = \&main::process_generic_property_file;
2337 $non_skip{$addr} = 0;
0dee21d6 2338 $skip{$addr} = undef;
99870f4d
KW
2339 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2340 $handle{$addr} = undef;
2341 $added_lines{$addr} = [ ];
74cd47d0 2342 $remapped_lines{$addr} = [ ];
99870f4d 2343 $each_line_handler{$addr} = [ ];
3848e5b7 2344 $eof_handler{$addr} = [ ];
99870f4d
KW
2345 $errors{$addr} = { };
2346 $missings{$addr} = [ ];
a0204bb1 2347 $early{$addr} = [ ];
3274d896 2348 $optional{$addr} = [ ];
99870f4d
KW
2349
2350 # Two positional parameters.
99f78760 2351 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
2352 $file{$addr} = main::internal_file_to_platform(shift);
2353 $first_released{$addr} = shift;
2354
2355 # The rest of the arguments are key => value pairs
2356 # %constructor_fields has been set up earlier to list all possible
2357 # ones. Either set or push, depending on how the default has been set
2358 # up just above.
2359 my %args = @_;
2360 foreach my $key (keys %args) {
2361 my $argument = $args{$key};
2362
2363 # Note that the fields are the lower case of the constructor keys
2364 my $hash = $constructor_fields{lc $key};
2365 if (! defined $hash) {
2366 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2367 next;
2368 }
2369 if (ref $hash->{$addr} eq 'ARRAY') {
2370 if (ref $argument eq 'ARRAY') {
2371 foreach my $argument (@{$argument}) {
2372 next if ! defined $argument;
2373 push @{$hash->{$addr}}, $argument;
2374 }
2375 }
2376 else {
2377 push @{$hash->{$addr}}, $argument if defined $argument;
2378 }
2379 }
2380 else {
2381 $hash->{$addr} = $argument;
2382 }
2383 delete $args{$key};
2384 };
2385
0dee21d6
KW
2386 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2387
2388 # Convert 0 (meaning don't skip) to undef
2389 undef $skip{$addr} unless $skip{$addr};
2390
3274d896
KW
2391 # Handle the case where this file is optional
2392 my $pod_message_for_non_existent_optional = "";
2393 if ($optional{$addr}->@*) {
2394
2395 # First element is the pod message
2396 $pod_message_for_non_existent_optional
2397 = shift $optional{$addr}->@*;
2398 # Convert a 0 'Optional' argument to an empty list to make later
2399 # code more concise.
2400 if ( $optional{$addr}->@*
2401 && $optional{$addr}->@* == 1
2402 && $optional{$addr}[0] ne ""
2403 && $optional{$addr}[0] !~ /\D/
2404 && $optional{$addr}[0] == 0)
2405 {
2406 $optional{$addr} = [ ];
2407 }
2408 else { # But if the only element doesn't evaluate to 0, make sure
2409 # that this file is indeed considered optional below.
2410 unshift $optional{$addr}->@*, 1;
2411 }
2412 }
2413
0dee21d6 2414 my $progress;
a0204bb1
KW
2415 my $function_instead_of_file = 0;
2416
3d36fe12
KW
2417 if ($early{$addr}->@* > 4 && $early{$addr}[4] ne 'ONLY_EARLY') {
2418 Carp::my_carp_bug("If present, element [4] in 'Early => [ ... ]'"
cbdbe9d4
KW
2419 . " must be the string 'ONLY_EARLY'");
2420 }
2421
a0204bb1
KW
2422 # If we are compiling a Unicode release earlier than the file became
2423 # available, the constructor may have supplied a substitute
2424 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2425
2426 # Yes, we have a substitute, that we will use; mark it so
2427 unshift $early{$addr}->@*, 1;
2428
2429 # See the definition of %early for what the array elements mean.
2430 # If we have a property this defines, create a table and default
2431 # map for it now (at essentially compile time), so that it will be
2432 # available for the whole of run time. (We will want to add this
2433 # name as an alias when we are using the official property name;
2434 # but this must be deferred until run(), because at construction
2435 # time the official names have yet to be defined.)
2436 if ($early{$addr}[2]) {
2437 my $fate = ($property{$addr}
2438 && $property{$addr} eq $early{$addr}[2])
2439 ? $PLACEHOLDER
2440 : $INTERNAL_ONLY;
2441 my $prop_object = Property->new($early{$addr}[2],
2442 Fate => $fate,
2443 Perl_Extension => 1,
2444 );
2445
3d36fe12
KW
2446 # If not specified by the constructor, use the default mapping
2447 # for the regular property for this substitute one.
2448 if ($early{$addr}[3]) {
2449 $prop_object->set_default_map($early{$addr}[3]);
2450 }
2451 elsif ( defined $property{$addr}
2452 && defined $default_mapping{$property{$addr}})
a0204bb1
KW
2453 {
2454 $prop_object
2455 ->set_default_map($default_mapping{$property{$addr}});
2456 }
2457 }
2458
2459 if (ref $early{$addr}[1] eq 'CODE') {
2460 $function_instead_of_file = 1;
2461
2462 # If the first element of the array is a code ref, the others
2463 # are optional.
2464 $handler{$addr} = $early{$addr}[1];
2465 $property{$addr} = $early{$addr}[2]
2466 if defined $early{$addr}[2];
2467 $progress = "substitute $file{$addr}";
2468
2469 undef $file{$addr};
2470 }
2471 else { # Specifying a substitute file
2472
2473 if (! main::file_exists($early{$addr}[1])) {
2474
2475 # If we don't see the substitute file, generate an error
2476 # message giving the needed things, and add it to the list
2477 # of such to output before actual processing happens
2478 # (hence the user finds out all of them in one run).
ce8d64d9
KW
2479 # Instead of creating a general method for NameAliases,
2480 # hard-code it here, as there is unlikely to ever be a
2481 # second one which needs special handling.
2482 my $string_version = ($file{$addr} eq "NameAliases.txt")
2483 ? 'at least 6.1 (the later, the better)'
2484 : sprintf "%vd", $first_released{$addr};
a0204bb1
KW
2485 push @missing_early_files, <<END;
2486'$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2487END
2488 ;
2489 return;
2490 }
2491 $progress = $early{$addr}[1];
2492 $progress .= ", substituting for $file{$addr}" if $file{$addr};
2493 $file{$addr} = $early{$addr}[1];
2494 $property{$addr} = $early{$addr}[2];
0dee21d6 2495
a0204bb1
KW
2496 # Ignore code points not in the version being compiled
2497 push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2498
2499 if ( $v_version lt v2.0 # Hanguls in this release ...
3d36fe12 2500 && defined $early{$addr}[4]) # ... need special treatment
a0204bb1
KW
2501 {
2502 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2503 }
2504 }
2505
2506 # And this substitute is valid for all releases.
2507 $first_released{$addr} = v0;
2508 }
2509 else { # Normal behavior
0dee21d6 2510 $progress = $file{$addr};
a0204bb1 2511 unshift $early{$addr}->@*, 0; # No substitute
99870f4d
KW
2512 }
2513
0dee21d6
KW
2514 my $file = $file{$addr};
2515 $progress_message{$addr} = "Processing $progress"
2516 unless $progress_message{$addr};
2517
d8062573
KW
2518 # A file should be there if it is within the window of versions for
2519 # which Unicode supplies it
2520 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2521 $in_this_release{$addr} = 0;
2522 $skip{$addr} = "";
2523 }
2524 else {
0dee21d6
KW
2525 $in_this_release{$addr} = $first_released{$addr} le $v_version;
2526
a0204bb1
KW
2527 # Check that the file for this object (possibly using a substitute
2528 # for early releases) exists or we have a function alternative
2529 if ( ! $function_instead_of_file
2530 && ! main::file_exists($file))
0dee21d6
KW
2531 {
2532 # Here there is nothing available for this release. This is
2533 # fine if we aren't expecting anything in this release.
2534 if (! $in_this_release{$addr}) {
2535 $skip{$addr} = ""; # Don't remark since we expected
2536 # nothing and got nothing
2537 }
3274d896 2538 elsif ($optional{$addr}->@*) {
0dee21d6 2539
3274d896
KW
2540 # Here the file is optional in this release; Use the
2541 # passed in text to document this case in the pod.
2542 $skip{$addr} = $pod_message_for_non_existent_optional;
0dee21d6
KW
2543 }
2544 elsif ( $in_this_release{$addr}
2545 && ! defined $skip{$addr}
2546 && defined $file)
2547 { # Doesn't exist but should.
2548 $skip{$addr} = "'$file' not found. Possibly Big problems";
2549 Carp::my_carp($skip{$addr});
2550 }
2551 }
2552 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2553 {
2554
2555 # The file exists; if not skipped for another reason, and we are
2556 # skipping most everything during debugging builds, use that as
2557 # the skip reason.
2558 $skip{$addr} = '$debug_skip is on'
2559 }
d8062573 2560 }
0dee21d6
KW
2561
2562 if ( ! $debug_skip
2563 && $non_skip{$addr}
2564 && ! $required_even_in_debug_skip{$addr}
2565 && $verbosity)
2566 {
2567 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
a3a8c5f0 2568 }
99870f4d 2569
0dee21d6 2570 # Here, we have figured out if we will be skipping this file or not.
3274d896
KW
2571 # If so, we add any single property it defines to any passed in
2572 # optional property list. These will be dealt with at run time.
0dee21d6 2573 if (defined $skip{$addr}) {
3274d896
KW
2574 if ($property{$addr}) {
2575 push $optional{$addr}->@*, $property{$addr};
2576 }
2577 } # Otherwise, are going to process the file.
0dee21d6
KW
2578 elsif ($property{$addr}) {
2579
2580 # If the file has a property defined in the constructor for it, it
2581 # means that the property is not listed in the file's entries. So
2582 # add a handler (to the list of line handlers) to insert the
2583 # property name into the lines, to provide a uniform interface to
2584 # the final processing subroutine.
2585 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
09ca89ce 2586 }
83b68635
KW
2587 elsif ($properties{$addr}) {
2588
9be22ca5
KW
2589 # Similarly, there may be more than one property represented on
2590 # each line, with no clue but the constructor input what those
2591 # might be. Add a handler for each line in the input so that it
2592 # creates a separate input line for each property in those input
2593 # lines, thus making them suitable to handle generically.
83b68635
KW
2594
2595 push @{$each_line_handler{$addr}},
2596 sub {
2597 my $file = shift;
2598 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2599
2600 my @fields = split /\s*;\s*/, $_, -1;
2601
2602 if (@fields - 1 > @{$properties{$addr}}) {
2603 $file->carp_bad_line('Extra fields');
2604 $_ = "";
2605 return;
2606 }
2607 my $range = shift @fields; # 0th element is always the
2608 # range
2609
2610 # The next fields in the input line correspond
2611 # respectively to the stored properties.
2612 for my $i (0 .. @{$properties{$addr}} - 1) {
2613 my $property_name = $properties{$addr}[$i];
2614 next if $property_name eq '<ignored>';
2615 $file->insert_adjusted_lines(
2616 "$range; $property_name; $fields[$i]");
2617 }
2618 $_ = "";
2619
2620 return;
2621 };
2622 }
37e2e78e 2623
9be22ca5 2624 { # On non-ascii platforms, we use a special pre-handler
74cd47d0
KW
2625 no strict;
2626 no warnings 'once';
2627 *next_line = (main::NON_ASCII_PLATFORM)
2628 ? *_next_line_with_remapped_range
2629 : *_next_line;
2630 }
2631
40045c2d
KW
2632 &{$construction_time_handler{$addr}}($self)
2633 if $construction_time_handler{$addr};
2634
99870f4d 2635 return $self;
d73e5302
JH
2636 }
2637
cf25bb62 2638
99870f4d
KW
2639 use overload
2640 fallback => 0,
2641 qw("") => "_operator_stringify",
2642 "." => \&main::_operator_dot,
1285127e 2643 ".=" => \&main::_operator_dot_equal,
99870f4d 2644 ;
cf25bb62 2645
99870f4d
KW
2646 sub _operator_stringify {
2647 my $self = shift;
cf25bb62 2648
99870f4d 2649 return __PACKAGE__ . " object for " . $self->file;
d73e5302 2650 }
d73e5302 2651
99870f4d
KW
2652 sub run {
2653 # Process the input object $self. This opens and closes the file and
2654 # calls all the handlers for it. Currently, this can only be called
3848e5b7 2655 # once per file, as it destroy's the EOF handlers
d73e5302 2656
0dee21d6 2657 # flag to make sure extracted files are processed early
f8b69004 2658 state $seen_non_extracted = 0;
0dee21d6 2659
99870f4d
KW
2660 my $self = shift;
2661 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 2662
ffe43484 2663 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 2664
99870f4d 2665 my $file = $file{$addr};
d73e5302 2666
0dee21d6
KW
2667 if (! $file) {
2668 $handle{$addr} = 'pretend_is_open';
99870f4d
KW
2669 }
2670 else {
f8b69004 2671 if ($seen_non_extracted) {
9be22ca5
KW
2672 if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2673 # case of the file's name
2674 {
1675ea0d 2675 Carp::my_carp_bug(main::join_lines(<<END
99f78760 2676$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
2677anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2678have subtle problems
2679END
2680 ));
2681 }
2682 }
2683 elsif ($EXTRACTED_DIR
0dee21d6
KW
2684
2685 # We only do this check for generic property files
2686 && $handler{$addr} == \&main::process_generic_property_file
2687
f8b69004 2688 && $file !~ /$EXTRACTED/i)
99870f4d
KW
2689 {
2690 # We don't set this (by the 'if' above) if we have no
2691 # extracted directory, so if running on an early version,
2692 # this test won't work. Not worth worrying about.
f8b69004 2693 $seen_non_extracted = 1;
99870f4d
KW
2694 }
2695
9be22ca5 2696 # Mark the file as having being processed, and warn if it
99870f4d
KW
2697 # isn't a file we are expecting. As we process the files,
2698 # they are deleted from the hash, so any that remain at the
2699 # end of the program are files that we didn't process.
517956bf 2700 my $fkey = File::Spec->rel2abs($file);
0dee21d6 2701 my $exists = delete $potential_files{lc($fkey)};
faf3cf6b 2702
0dee21d6
KW
2703 Carp::my_carp("Was not expecting '$file'.")
2704 if $exists && ! $in_this_release{$addr};
2705
a0204bb1
KW
2706 # If there is special handling for compiling Unicode releases
2707 # earlier than the first one in which Unicode defines this
2708 # property ...
2709 if ($early{$addr}->@* > 1) {
2710
2711 # Mark as processed any substitute file that would be used in
2712 # such a release
2713 $fkey = File::Spec->rel2abs($early{$addr}[1]);
2714 delete $potential_files{lc($fkey)};
2715
2716 # As commented in the constructor code, when using the
2717 # official property, we still have to allow the publicly
2718 # inaccessible early name so that the core code which uses it
2719 # will work regardless.
2720 if (! $early{$addr}[0] && $early{$addr}->@* > 2) {
2721 my $early_property_name = $early{$addr}[2];
cbdbe9d4
KW
2722 if ( $property{$addr} ne $early_property_name
2723 && $early{$addr}->@* < 5)
2724 {
a0204bb1
KW
2725 main::property_ref($property{$addr})
2726 ->add_alias($early_property_name);
2727 }
2728 }
2729 }
2730
0dee21d6
KW
2731 # We may be skipping this file ...
2732 if (defined $skip{$addr}) {
2733
2734 # If the file isn't supposed to be in this release, there is
2735 # nothing to do
2736 if ($in_this_release{$addr}) {
2737
2738 # But otherwise, we may print a message
2739 if ($debug_skip) {
2740 print STDERR "Skipping input file '$file'",
2741 " because '$skip{$addr}'\n";
2742 }
2743
2744 # And add it to the list of skipped files, which is later
2745 # used to make the pod
2746 $skipped_files{$file} = $skip{$addr};
3274d896
KW
2747
2748 # The 'optional' list contains properties that are also to
2749 # be skipped along with the file. (There may also be
2750 # digits which are just placeholders to make sure it isn't
2751 # an empty list
2752 foreach my $property ($optional{$addr}->@*) {
2753 next unless $property =~ /\D/;
2754 my $prop_object = main::property_ref($property);
2755 next unless defined $prop_object;
2756 $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2757 }
0dee21d6 2758 }
99870f4d 2759
37e2e78e
KW
2760 return;
2761 }
2762
9be22ca5
KW
2763 # Here, we are going to process the file. Open it, converting the
2764 # slashes used in this program into the proper form for the OS
99870f4d
KW
2765 my $file_handle;
2766 if (not open $file_handle, "<", $file) {
2767 Carp::my_carp("Can't open $file. Skipping: $!");
d3664eb9 2768 return;
99870f4d
KW
2769 }
2770 $handle{$addr} = $file_handle; # Cache the open file handle
9e65c3f4 2771
0dee21d6
KW
2772 # If possible, make sure that the file is the correct version.
2773 # (This data isn't available on early Unicode releases or in
2774 # UnicodeData.txt.) We don't do this check if we are using a
2775 # substitute file instead of the official one (though the code
2776 # could be extended to do so).
2777 if ($in_this_release{$addr}
a0204bb1 2778 && ! $early{$addr}[0]
0dee21d6
KW
2779 && lc($file) ne 'unicodedata.txt')
2780 {
96f226dc 2781 if ($file !~ /^Unihan/i) {
0dee21d6
KW
2782
2783 # The non-Unihan files started getting version numbers in
2784 # 3.2, but some files in 4.0 are unchanged from 3.2, and
2785 # marked as 3.2. 4.0.1 is the first version where there
2786 # are no files marked as being from less than 4.0, though
2787 # some are marked as 4.0. In versions after that, the
2788 # numbers are correct.
2789 if ($v_version ge v4.0.1) {
2790 $_ = <$file_handle>; # The version number is in the
2791 # very first line
2792 if ($_ !~ / - $string_version \. /x) {
2793 chomp;
2794 $_ =~ s/^#\s*//;
2795
2796 # 4.0.1 had some valid files that weren't updated.
2797 if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
2798 die Carp::my_carp("File '$file' is version "
2799 . "'$_'. It should be "
2800 . "version $string_version");
2801 }
2802 }
cafe9cf0 2803 }
96f226dc 2804 }
0dee21d6
KW
2805 elsif ($v_version ge v6.0.0) { # Unihan
2806
2807 # Unihan files didn't get accurate version numbers until
2808 # 6.0. The version is somewhere in the first comment
2809 # block
96f226dc
KW
2810 while (<$file_handle>) {
2811 if ($_ !~ /^#/) {
0dee21d6
KW
2812 Carp::my_carp_bug("Could not find the expected "
2813 . "version info in file '$file'");
96f226dc
KW
2814 last;
2815 }
2816 chomp;
2817 $_ =~ s/^#\s*//;
2818 next if $_ !~ / version: /x;
2819 last if $_ =~ /$string_version/;
0dee21d6
KW
2820 die Carp::my_carp("File '$file' is version "
2821 . "'$_'. It should be "
2822 . "version $string_version");
96f226dc
KW
2823 }
2824 }
9e65c3f4 2825 }
99870f4d
KW
2826 }
2827
0dee21d6 2828 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
99870f4d
KW
2829
2830 # Call any special handler for before the file.
2831 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2832
2833 # Then the main handler
2834 &{$handler{$addr}}($self);
2835
2836 # Then any special post-file handler.
2837 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2838
2839 # If any errors have been accumulated, output the counts (as the first
2840 # error message in each class was output when it was encountered).
2841 if ($errors{$addr}) {
2842 my $total = 0;
2843 my $types = 0;
2844 foreach my $error (keys %{$errors{$addr}}) {
2845 $total += $errors{$addr}->{$error};
2846 delete $errors{$addr}->{$error};
2847 $types++;
2848 }
2849 if ($total > 1) {
2850 my $message
2851 = "A total of $total lines had errors in $file. ";
2852
2853 $message .= ($types == 1)
2854 ? '(Only the first one was displayed.)'
2855 : '(Only the first of each type was displayed.)';
2856 Carp::my_carp($message);
2857 }
2858 }
2859
2860 if (@{$missings{$addr}}) {
2861 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2862 }
2863
2864 # If a real file handle, close it.
2865 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2866 ref $handle{$addr};
2867 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2868 # the file, as opposed to undef
2869 return;
2870 }
2871
74cd47d0 2872 sub _next_line {
99870f4d
KW
2873 # Sets $_ to be the next logical input line, if any. Returns non-zero
2874 # if such a line exists. 'logical' means that any lines that have
2875 # been added via insert_lines() will be returned in $_ before the file
2876 # is read again.
2877
2878 my $self = shift;
2879 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2880
ffe43484 2881 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2882
2883 # Here the file is open (or if the handle is not a ref, is an open
2884 # 'virtual' file). Get the next line; any inserted lines get priority
2885 # over the file itself.
2886 my $adjusted;
2887
2888 LINE:
2889 while (1) { # Loop until find non-comment, non-empty line
2890 #local $to_trace = 1 if main::DEBUG;
2891 my $inserted_ref = shift @{$added_lines{$addr}};
2892 if (defined $inserted_ref) {
2893 ($adjusted, $_) = @{$inserted_ref};
2894 trace $adjusted, $_ if main::DEBUG && $to_trace;
2895 return 1 if $adjusted;
2896 }
2897 else {
2898 last if ! ref $handle{$addr}; # Don't read unless is real file
2899 last if ! defined ($_ = readline $handle{$addr});
2900 }
2901 chomp;
2902 trace $_ if main::DEBUG && $to_trace;
2903
2904 # See if this line is the comment line that defines what property
2905 # value that code points that are not listed in the file should
2906 # have. The format or existence of these lines is not guaranteed
2907 # by Unicode since they are comments, but the documentation says
2908 # that this was added for machine-readability, so probably won't
2909 # change. This works starting in Unicode Version 5.0. They look
2910 # like:
2911 #
2912 # @missing: 0000..10FFFF; Not_Reordered
2913 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2914 # @missing: 0000..10FFFF; ; NaN
2915 #
2916 # Save the line for a later get_missings() call.
2917 if (/$missing_defaults_prefix/) {
2918 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2919 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2920 }
2921 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2922 my @defaults = split /\s* ; \s*/x, $_;
2923
2924 # The first field is the @missing, which ends in a
2925 # semi-colon, so can safely shift.
2926 shift @defaults;
2927
2928 # Some of these lines may have empty field placeholders
2929 # which get in the way. An example is:
2930 # @missing: 0000..10FFFF; ; NaN
2931 # Remove them. Process starting from the top so the
2932 # splice doesn't affect things still to be looked at.
2933 for (my $i = @defaults - 1; $i >= 0; $i--) {
2934 next if $defaults[$i] ne "";
2935 splice @defaults, $i, 1;
2936 }
2937
2938 # What's left should be just the property (maybe) and the
2939 # default. Having only one element means it doesn't have
2940 # the property.
2941 my $default;
2942 my $property;
2943 if (@defaults >= 1) {
2944 if (@defaults == 1) {
2945 $default = $defaults[0];
2946 }
2947 else {
2948 $property = $defaults[0];
2949 $default = $defaults[1];
2950 }
2951 }
2952
2953 if (@defaults < 1
2954 || @defaults > 2
2955 || ($default =~ /^</
2956 && $default !~ /^<code *point>$/i
09f8d0ac
KW
2957 && $default !~ /^<none>$/i
2958 && $default !~ /^<script>$/i))
99870f4d
KW
2959 {
2960 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2961 }
2962 else {
2963
2964 # If the property is missing from the line, it should
2965 # be the one for the whole file
2966 $property = $property{$addr} if ! defined $property;
2967
2968 # Change <none> to the null string, which is what it
2969 # really means. If the default is the code point
2970 # itself, set it to <code point>, which is what
2971 # Unicode uses (but sometimes they've forgotten the
2972 # space)
2973 if ($default =~ /^<none>$/i) {
2974 $default = "";
2975 }
2976 elsif ($default =~ /^<code *point>$/i) {
2977 $default = $CODE_POINT;
2978 }
09f8d0ac
KW
2979 elsif ($default =~ /^<script>$/i) {
2980
2981 # Special case this one. Currently is from
2982 # ScriptExtensions.txt, and means for all unlisted
2983 # code points, use their Script property values.
2984 # For the code points not listed in that file, the
2985 # default value is 'Unknown'.
2986 $default = "Unknown";
2987 }
99870f4d
KW
2988
2989 # Store them as a sub-arrays with both components.
2990 push @{$missings{$addr}}, [ $default, $property ];
2991 }
2992 }
2993
2994 # There is nothing for the caller to process on this comment
2995 # line.
2996 next;
2997 }
2998
2999 # Remove comments and trailing space, and skip this line if the
3000 # result is empty
3001 s/#.*//;
3002 s/\s+$//;
3003 next if /^$/;
3004
3005 # Call any handlers for this line, and skip further processing of
3006 # the line if the handler sets the line to null.
3007 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3008 &{$sub_ref}($self);
3009 next LINE if /^$/;
3010 }
3011
3012 # Here the line is ok. return success.
3013 return 1;
3014 } # End of looping through lines.
3015
3848e5b7 3016 # If there are EOF handlers, call each (only once) and if it generates
99870f4d 3017 # more lines to process go back in the loop to handle them.
3848e5b7
KW
3018 while ($eof_handler{$addr}->@*) {
3019 &{$eof_handler{$addr}[0]}($self);
3020 shift $eof_handler{$addr}->@*; # Currently only get one shot at it.
99870f4d
KW
3021 goto LINE if $added_lines{$addr};
3022 }
3023
3024 # Return failure -- no more lines.
3025 return 0;
3026
3027 }
3028
74cd47d0
KW
3029 sub _next_line_with_remapped_range {
3030 my $self = shift;
3031 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3032
3033 # like _next_line(), but for use on non-ASCII platforms. It sets $_
3034 # to be the next logical input line, if any. Returns non-zero if such
3035 # a line exists. 'logical' means that any lines that have been added
3036 # via insert_lines() will be returned in $_ before the file is read
3037 # again.
3038 #
3039 # The difference from _next_line() is that this remaps the Unicode
3040 # code points in the input to those of the native platform. Each
3041 # input line contains a single code point, or a single contiguous
3042 # range of them This routine splits each range into its individual
3043 # code points and caches them. It returns the cached values,
3044 # translated into their native equivalents, one at a time, for each
3045 # call, before reading the next line. Since native values can only be
3046 # a single byte wide, no translation is needed for code points above
3047 # 0xFF, and ranges that are entirely above that number are not split.
3048 # If an input line contains the range 254-1000, it would be split into
3049 # three elements: 254, 255, and 256-1000. (The downstream table
3050 # insertion code will sort and coalesce the individual code points
3051 # into appropriate ranges.)
3052
3053 my $addr = do { no overloading; pack 'J', $self; };
3054
3055 while (1) {
3056
3057 # Look in cache before reading the next line. Return any cached
3058 # value, translated
3059 my $inserted = shift @{$remapped_lines{$addr}};
3060 if (defined $inserted) {
3061 trace $inserted if main::DEBUG && $to_trace;
3062 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3063 trace $_ if main::DEBUG && $to_trace;
3064 return 1;
3065 }
3066
3067 # Get the next line.
3068 return 0 unless _next_line($self);
3069
3070 # If there is a special handler for it, return the line,
3071 # untranslated. This should happen only for files that are
3072 # special, not being code-point related, such as property names.
3073 return 1 if $handler{$addr}
3074 != \&main::process_generic_property_file;
3075
3076 my ($range, $property_name, $map, @remainder)
3077 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3078
3079 if (@remainder
3080 || ! defined $property_name
3081 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3082 {
3083 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored");
3084 }
3085
3086 my $low = hex $1;
3087 my $high = (defined $2) ? hex $2 : $low;
3088
3089 # If the input maps the range to another code point, remap the
3090 # target if it is between 0 and 255.
3091 my $tail;
3092 if (defined $map) {
3093 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3094 $tail = "$property_name; $map";
3095 $_ = "$range; $tail";
3096 }
3097 else {
3098 $tail = $property_name;
3099 }
3100
3101 # If entire range is above 255, just return it, unchanged (except
3102 # any mapped-to code point, already changed above)
3103 return 1 if $low > 255;
3104
3105 # Cache an entry for every code point < 255. For those in the
3106 # range above 255, return a dummy entry for just that portion of
3107 # the range. Note that this will be out-of-order, but that is not
3108 # a problem.
3109 foreach my $code_point ($low .. $high) {
3110 if ($code_point > 255) {
3111 $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3112 return 1;
3113 }
3114 push @{$remapped_lines{$addr}}, "$code_point; $tail";
3115 }
3116 } # End of looping through lines.
3117
3118 # NOTREACHED
3119 }
3120
99870f4d
KW
3121# Not currently used, not fully tested.
3122# sub peek {
f67a5002 3123# # Non-destructive lookahead one non-adjusted, non-comment, non-blank
99870f4d
KW
3124# # record. Not callable from an each_line_handler(), nor does it call
3125# # an each_line_handler() on the line.
3126#
3127# my $self = shift;
ffe43484 3128# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3129#
3130# foreach my $inserted_ref (@{$added_lines{$addr}}) {
3131# my ($adjusted, $line) = @{$inserted_ref};
3132# next if $adjusted;
3133#
3134# # Remove comments and trailing space, and return a non-empty
3135# # resulting line
3136# $line =~ s/#.*//;
3137# $line =~ s/\s+$//;
3138# return $line if $line ne "";
3139# }
3140#
3141# return if ! ref $handle{$addr}; # Don't read unless is real file
3142# while (1) { # Loop until find non-comment, non-empty line
3143# local $to_trace = 1 if main::DEBUG;
3144# trace $_ if main::DEBUG && $to_trace;
3145# return if ! defined (my $line = readline $handle{$addr});
3146# chomp $line;
3147# push @{$added_lines{$addr}}, [ 0, $line ];
3148#
3149# $line =~ s/#.*//;
3150# $line =~ s/\s+$//;
3151# return $line if $line ne "";
3152# }
3153#
3154# return;
3155# }
3156
3157
3158 sub insert_lines {
3159 # Lines can be inserted so that it looks like they were in the input
3160 # file at the place it was when this routine is called. See also
3161 # insert_adjusted_lines(). Lines inserted via this routine go through
3162 # any each_line_handler()
3163
3164 my $self = shift;
3165
3166 # Each inserted line is an array, with the first element being 0 to
3167 # indicate that this line hasn't been adjusted, and needs to be
3168 # processed.
f998e60c 3169 no overloading;
051df77b 3170 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
3171 return;
3172 }
3173
3174 sub insert_adjusted_lines {
3175 # Lines can be inserted so that it looks like they were in the input
3176 # file at the place it was when this routine is called. See also
3177 # insert_lines(). Lines inserted via this routine are already fully
3178 # adjusted, ready to be processed; each_line_handler()s handlers will
3179 # not be called. This means this is not a completely general
3180 # facility, as only the last each_line_handler on the stack should
3181 # call this. It could be made more general, by passing to each of the
3182 # line_handlers their position on the stack, which they would pass on
3183 # to this routine, and that would replace the boolean first element in
3184 # the anonymous array pushed here, so that the next_line routine could
3185 # use that to call only those handlers whose index is after it on the
3186 # stack. But this is overkill for what is needed now.
3187
3188 my $self = shift;
3189 trace $_[0] if main::DEBUG && $to_trace;
3190
3191 # Each inserted line is an array, with the first element being 1 to
3192 # indicate that this line has been adjusted
f998e60c 3193 no overloading;
051df77b 3194 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
3195 return;
3196 }
3197
3198 sub get_missings {
3199 # Returns the stored up @missings lines' values, and clears the list.
3200 # The values are in an array, consisting of the default in the first
3201 # element, and the property in the 2nd. However, since these lines
3202 # can be stacked up, the return is an array of all these arrays.
3203
3204 my $self = shift;
3205 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3206
ffe43484 3207 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3208
3209 # If not accepting a list return, just return the first one.
3210 return shift @{$missings{$addr}} unless wantarray;
3211
3212 my @return = @{$missings{$addr}};
3213 undef @{$missings{$addr}};
3214 return @return;
3215 }
3216
a0204bb1
KW
3217 sub _exclude_unassigned {
3218
3219 # Takes the range in $_ and excludes code points that aren't assigned
3220 # in this release
3221
3222 state $skip_inserted_count = 0;
3223
3224 # Ignore recursive calls.
3225 if ($skip_inserted_count) {
3226 $skip_inserted_count--;
3227 return;
3228 }
3229
3230 # Find what code points are assigned in this release
3231 main::calculate_Assigned() if ! defined $Assigned;
3232
3233 my $self = shift;
3234 my $addr = do { no overloading; pack 'J', $self; };
3235 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3236
3237 my ($range, @remainder)
3238 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3239
3240 # Examine the range.
3241 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3242 {
3243 my $low = hex $1;
3244 my $high = (defined $2) ? hex $2 : $low;
3245
3246 # Split the range into subranges of just those code points in it
3247 # that are assigned.
3248 my @ranges = (Range_List->new(Initialize
3249 => Range->new($low, $high)) & $Assigned)->ranges;
3250
3251 # Do nothing if nothing in the original range is assigned in this
3252 # release; handle normally if everything is in this release.
3253 if (! @ranges) {
3254 $_ = "";
3255 }
3256 elsif (@ranges != 1) {
3257
3258 # Here, some code points in the original range aren't in this
3259 # release; @ranges gives the ones that are. Create fake input
3260 # lines for each of the ranges, and set things up so that when
3261 # this routine is called on that fake input, it will do
3262 # nothing.
3263 $skip_inserted_count = @ranges;
3264 my $remainder = join ";", @remainder;
3265 for my $range (@ranges) {
3266 $self->insert_lines(sprintf("%04X..%04X;%s",
3267 $range->start, $range->end, $remainder));
3268 }
3269 $_ = ""; # The original range is now defunct.
3270 }
3271 }
3272
3273 return;
3274 }
3275
3276 sub _fixup_obsolete_hanguls {
3277
3278 # This is called only when compiling Unicode version 1. All Unicode
3279 # data for subsequent releases assumes that the code points that were
3280 # Hangul syllables in this release only are something else, so if
3281 # using such data, we have to override it
3282
3283 my $self = shift;
3284 my $addr = do { no overloading; pack 'J', $self; };
3285 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3286
3287 my $object = main::property_ref($property{$addr});
53adf6a2
KW
3288 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3289 $FINAL_REMOVED_HANGUL_SYLLABLE,
a0204bb1
KW
3290 $early{$addr}[3], # Passed-in value for these
3291 Replace => $UNCONDITIONALLY);
3292 }
3293
99870f4d
KW
3294 sub _insert_property_into_line {
3295 # Add a property field to $_, if this file requires it.
3296
f998e60c 3297 my $self = shift;
ffe43484 3298 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 3299 my $property = $property{$addr};
99870f4d
KW
3300 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3301
3302 $_ =~ s/(;|$)/; $property$1/;
3303 return;
3304 }
3305
3306 sub carp_bad_line {
3307 # Output consistent error messages, using either a generic one, or the
3308 # one given by the optional parameter. To avoid gazillions of the
3309 # same message in case the syntax of a file is way off, this routine
3310 # only outputs the first instance of each message, incrementing a
3311 # count so the totals can be output at the end of the file.
3312
3313 my $self = shift;
3314 my $message = shift;
3315 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3316
ffe43484 3317 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3318
3319 $message = 'Unexpected line' unless $message;
3320
3321 # No trailing punctuation so as to fit with our addenda.
3322 $message =~ s/[.:;,]$//;
3323
3324 # If haven't seen this exact message before, output it now. Otherwise
3325 # increment the count of how many times it has occurred
3326 unless ($errors{$addr}->{$message}) {
3327 Carp::my_carp("$message in '$_' in "
f998e60c 3328 . $file{$addr}
99870f4d
KW
3329 . " at line $.. Skipping this line;");
3330 $errors{$addr}->{$message} = 1;
3331 }
3332 else {
3333 $errors{$addr}->{$message}++;
3334 }
3335
3336 # Clear the line to prevent any further (meaningful) processing of it.
3337 $_ = "";
3338
3339 return;
3340 }
3341} # End closure
3342
3343package Multi_Default;
3344
3345# Certain properties in early versions of Unicode had more than one possible
3346# default for code points missing from the files. In these cases, one
3347# default applies to everything left over after all the others are applied,
3348# and for each of the others, there is a description of which class of code
3349# points applies to it. This object helps implement this by storing the
3350# defaults, and for all but that final default, an eval string that generates
3351# the class that it applies to.
3352
3353
3354{ # Closure
3355
3356 main::setup_package();
3357
3358 my %class_defaults;
3359 # The defaults structure for the classes
3360 main::set_access('class_defaults', \%class_defaults);
3361
3362 my %other_default;
3363 # The default that applies to everything left over.
3364 main::set_access('other_default', \%other_default, 'r');
3365
3366
3367 sub new {
3368 # The constructor is called with default => eval pairs, terminated by
3369 # the left-over default. e.g.
3370 # Multi_Default->new(
3371 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3372 # - 0x200D',
3373 # 'R' => 'some other expression that evaluates to code points',
3374 # .
3375 # .
3376 # .
3377 # 'U'));
68c3923f
KW
3378 # It is best to leave the final value be the one that matches the
3379 # above-Unicode code points.
99870f4d
KW
3380
3381 my $class = shift;
3382
3383 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 3384 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3385
3386 while (@_ > 1) {
3387 my $default = shift;
3388 my $eval = shift;
3389 $class_defaults{$addr}->{$default} = $eval;
3390 }
3391
3392 $other_default{$addr} = shift;
3393
3394 return $self;
3395 }
3396
3397 sub get_next_defaults {
3398 # Iterates and returns the next class of defaults.
3399 my $self = shift;
3400 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3401
ffe43484 3402 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3403
3404 return each %{$class_defaults{$addr}};
3405 }
3406}
3407
3408package Alias;
3409
3410# An alias is one of the names that a table goes by. This class defines them
3411# including some attributes. Everything is currently setup in the
3412# constructor.
3413
3414
3415{ # Closure
3416
3417 main::setup_package();
3418
3419 my %name;
3420 main::set_access('name', \%name, 'r');
3421
3422 my %loose_match;
c12f2655 3423 # Should this name match loosely or not.
99870f4d
KW
3424 main::set_access('loose_match', \%loose_match, 'r');
3425
33e96e72
KW
3426 my %make_re_pod_entry;
3427 # Some aliases should not get their own entries in the re section of the
3428 # pod, because they are covered by a wild-card, and some we want to
3429 # discourage use of. Binary
f82fe4ba 3430 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
99870f4d 3431
fd1e3e84
KW
3432 my %ucd;
3433 # Is this documented to be accessible via Unicode::UCD
3434 main::set_access('ucd', \%ucd, 'r', 's');
3435
99870f4d
KW
3436 my %status;
3437 # Aliases have a status, like deprecated, or even suppressed (which means
3438 # they don't appear in documentation). Enum
3439 main::set_access('status', \%status, 'r');
3440
0eac1e20 3441 my %ok_as_filename;
99870f4d
KW
3442 # Similarly, some aliases should not be considered as usable ones for
3443 # external use, such as file names, or we don't want documentation to
3444 # recommend them. Boolean
0eac1e20 3445 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
99870f4d
KW
3446
3447 sub new {
3448 my $class = shift;
3449
3450 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 3451 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3452
3453 $name{$addr} = shift;
3454 $loose_match{$addr} = shift;
33e96e72 3455 $make_re_pod_entry{$addr} = shift;
0eac1e20 3456 $ok_as_filename{$addr} = shift;
99870f4d 3457 $status{$addr} = shift;
fd1e3e84 3458 $ucd{$addr} = shift;
99870f4d
KW
3459
3460 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3461
3462 # Null names are never ok externally
0eac1e20 3463 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
99870f4d
KW
3464
3465 return $self;
3466 }
3467}
3468
3469package Range;
3470
3471# A range is the basic unit for storing code points, and is described in the
3472# comments at the beginning of the program. Each range has a starting code
3473# point; an ending code point (not less than the starting one); a value
3474# that applies to every code point in between the two end-points, inclusive;
3475# and an enum type that applies to the value. The type is for the user's
3476# convenience, and has no meaning here, except that a non-zero type is
3477# considered to not obey the normal Unicode rules for having standard forms.
3478#
3479# The same structure is used for both map and match tables, even though in the
3480# latter, the value (and hence type) is irrelevant and could be used as a
3481# comment. In map tables, the value is what all the code points in the range
3482# map to. Type 0 values have the standardized version of the value stored as
3483# well, so as to not have to recalculate it a lot.
3484
3485sub trace { return main::trace(@_); }
3486
3487{ # Closure
3488
3489 main::setup_package();
3490
3491 my %start;
3492 main::set_access('start', \%start, 'r', 's');
3493
3494 my %end;
3495 main::set_access('end', \%end, 'r', 's');
3496
3497 my %value;
3498 main::set_access('value', \%value, 'r');
3499
3500 my %type;
3501 main::set_access('type', \%type, 'r');
3502
3503 my %standard_form;
3504 # The value in internal standard form. Defined only if the type is 0.
3505 main::set_access('standard_form', \%standard_form);
3506
3507 # Note that if these fields change, the dump() method should as well
3508
3509 sub new {
3510 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3511 my $class = shift;
3512
3513 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 3514 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3515
3516 $start{$addr} = shift;
3517 $end{$addr} = shift;
3518
3519 my %args = @_;
3520
3521 my $value = delete $args{'Value'}; # Can be 0
3522 $value = "" unless defined $value;
3523 $value{$addr} = $value;
3524
3525 $type{$addr} = delete $args{'Type'} || 0;
3526
3527 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3528
99870f4d
KW
3529 return $self;
3530 }
3531
3532 use overload
3533 fallback => 0,
3534 qw("") => "_operator_stringify",
3535 "." => \&main::_operator_dot,
1285127e 3536 ".=" => \&main::_operator_dot_equal,
99870f4d
KW
3537 ;
3538
3539 sub _operator_stringify {
3540 my $self = shift;
ffe43484 3541 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3542
3543 # Output it like '0041..0065 (value)'
3544 my $return = sprintf("%04X", $start{$addr})
3545 . '..'
3546 . sprintf("%04X", $end{$addr});
3547 my $value = $value{$addr};
3548 my $type = $type{$addr};
3549 $return .= ' (';
3550 $return .= "$value";
3551 $return .= ", Type=$type" if $type != 0;
3552 $return .= ')';
3553
3554 return $return;
3555 }
3556
3557 sub standard_form {
c292d35a
NC
3558 # Calculate the standard form only if needed, and cache the result.
3559 # The standard form is the value itself if the type is special.
3560 # This represents a considerable CPU and memory saving - at the time
3561 # of writing there are 368676 non-special objects, but the standard
3562 # form is only requested for 22047 of them - ie about 6%.
99870f4d
KW
3563
3564 my $self = shift;
3565 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3566
ffe43484 3567 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3568
3569 return $standard_form{$addr} if defined $standard_form{$addr};
c292d35a
NC
3570
3571 my $value = $value{$addr};
3572 return $value if $type{$addr};
3573 return $standard_form{$addr} = main::standardize($value);
99870f4d
KW
3574 }
3575
3576 sub dump {
3577 # Human, not machine readable. For machine readable, comment out this
3578 # entire routine and let the standard one take effect.
3579 my $self = shift;
3580 my $indent = shift;
3581 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3582
ffe43484 3583 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3584
3585 my $return = $indent
3586 . sprintf("%04X", $start{$addr})
3587 . '..'
3588 . sprintf("%04X", $end{$addr})
3589 . " '$value{$addr}';";
3590 if (! defined $standard_form{$addr}) {
3591 $return .= "(type=$type{$addr})";
3592 }
3593 elsif ($standard_form{$addr} ne $value{$addr}) {
3594 $return .= "(standard '$standard_form{$addr}')";
3595 }
3596 return $return;
3597 }
3598} # End closure
3599
3600package _Range_List_Base;
3601
3602# Base class for range lists. A range list is simply an ordered list of
3603# ranges, so that the ranges with the lowest starting numbers are first in it.
3604#
3605# When a new range is added that is adjacent to an existing range that has the
3606# same value and type, it merges with it to form a larger range.
3607#
3608# Ranges generally do not overlap, except that there can be multiple entries
3609# of single code point ranges. This is because of NameAliases.txt.
3610#
3611# In this program, there is a standard value such that if two different
3612# values, have the same standard value, they are considered equivalent. This
3613# value was chosen so that it gives correct results on Unicode data
3614
3615# There are a number of methods to manipulate range lists, and some operators
3616# are overloaded to handle them.
3617
99870f4d
KW
3618sub trace { return main::trace(@_); }
3619
3620{ # Closure
3621
3622 our $addr;
3623
5b348b71
KW
3624 # Max is initialized to a negative value that isn't adjacent to 0, for
3625 # simpler tests
3626 my $max_init = -2;
3627
99870f4d
KW
3628 main::setup_package();
3629
3630 my %ranges;
3631 # The list of ranges
3632 main::set_access('ranges', \%ranges, 'readable_array');
3633
3634 my %max;
3635 # The highest code point in the list. This was originally a method, but
3636 # actual measurements said it was used a lot.
3637 main::set_access('max', \%max, 'r');
3638
3639 my %each_range_iterator;
3640 # Iterator position for each_range()
3641 main::set_access('each_range_iterator', \%each_range_iterator);
3642
3643 my %owner_name_of;
3644 # Name of parent this is attached to, if any. Solely for better error
3645 # messages.
3646 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3647
3648 my %_search_ranges_cache;
3649 # A cache of the previous result from _search_ranges(), for better
3650 # performance
3651 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3652
3653 sub new {
3654 my $class = shift;
3655 my %args = @_;
3656
3657 # Optional initialization data for the range list.
3658 my $initialize = delete $args{'Initialize'};
3659
3660 my $self;
3661
3662 # Use _union() to initialize. _union() returns an object of this
3663 # class, which means that it will call this constructor recursively.
3664 # But it won't have this $initialize parameter so that it won't
3665 # infinitely loop on this.
3666 return _union($class, $initialize, %args) if defined $initialize;
3667
3668 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 3669 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3670
3671 # Optional parent object, only for debug info.
3672 $owner_name_of{$addr} = delete $args{'Owner'};
3673 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3674
3675 # Stringify, in case it is an object.
3676 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3677
3678 # This is used only for error messages, and so a colon is added
3679 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3680
3681 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3682
5b348b71 3683 $max{$addr} = $max_init;
99870f4d
KW
3684
3685 $_search_ranges_cache{$addr} = 0;
3686 $ranges{$addr} = [];
3687
3688 return $self;
3689 }
3690
3691 use overload
3692 fallback => 0,
3693 qw("") => "_operator_stringify",
3694 "." => \&main::_operator_dot,
1285127e 3695 ".=" => \&main::_operator_dot_equal,
99870f4d
KW
3696 ;
3697
3698 sub _operator_stringify {
3699 my $self = shift;
ffe43484 3700 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3701
3702 return "Range_List attached to '$owner_name_of{$addr}'"