This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Save reference to two commonly used tables
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
23e33b60 19require 5.010_001;
d73e5302 20use strict;
99870f4d 21use warnings;
cf25bb62 22use Carp;
99870f4d
KW
23use File::Find;
24use File::Path;
d07a55ed 25use File::Spec;
99870f4d
KW
26use Text::Tabs;
27
28sub DEBUG () { 0 } # Set to 0 for production; 1 for development
29
30##########################################################################
31#
32# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
33# from the Unicode database files (lib/unicore/.../*.txt), It also generates
34# a pod file and a .t file
35#
36# The structure of this file is:
37# First these introductory comments; then
38# code needed for everywhere, such as debugging stuff; then
39# code to handle input parameters; then
40# data structures likely to be of external interest (some of which depend on
41# the input parameters, so follows them; then
42# more data structures and subroutine and package (class) definitions; then
43# the small actual loop to process the input files and finish up; then
44# a __DATA__ section, for the .t tests
45#
46# This program works on all releases of Unicode through at least 5.2. The
47# outputs have been scrutinized most intently for release 5.1. The others
48# have been checked for somewhat more than just sanity. It can handle all
49# existing Unicode character properties in those releases.
50#
99870f4d
KW
51# This program is mostly about Unicode character (or code point) properties.
52# A property describes some attribute or quality of a code point, like if it
53# is lowercase or not, its name, what version of Unicode it was first defined
54# in, or what its uppercase equivalent is. Unicode deals with these disparate
55# possibilities by making all properties into mappings from each code point
56# into some corresponding value. In the case of it being lowercase or not,
57# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
58# property maps each Unicode code point to a single value, called a "property
59# value". (Hence each Unicode property is a true mathematical function with
60# exactly one value per code point.)
61#
62# When using a property in a regular expression, what is desired isn't the
63# mapping of the code point to its property's value, but the reverse (or the
64# mathematical "inverse relation"): starting with the property value, "Does a
65# code point map to it?" These are written in a "compound" form:
66# \p{property=value}, e.g., \p{category=punctuation}. This program generates
67# files containing the lists of code points that map to each such regular
68# expression property value, one file per list
69#
70# There is also a single form shortcut that Perl adds for many of the commonly
71# used properties. This happens for all binary properties, plus script,
72# general_category, and block properties.
73#
74# Thus the outputs of this program are files. There are map files, mostly in
75# the 'To' directory; and there are list files for use in regular expression
76# matching, all in subdirectories of the 'lib' directory, with each
77# subdirectory being named for the property that the lists in it are for.
78# Bookkeeping, test, and documentation files are also generated.
79
80my $matches_directory = 'lib'; # Where match (\p{}) files go.
81my $map_directory = 'To'; # Where map files go.
82
83# DATA STRUCTURES
84#
85# The major data structures of this program are Property, of course, but also
86# Table. There are two kinds of tables, very similar to each other.
87# "Match_Table" is the data structure giving the list of code points that have
88# a particular property value, mentioned above. There is also a "Map_Table"
89# data structure which gives the property's mapping from code point to value.
90# There are two structures because the match tables need to be combined in
91# various ways, such as constructing unions, intersections, complements, etc.,
92# and the map ones don't. And there would be problems, perhaps subtle, if
93# a map table were inadvertently operated on in some of those ways.
94# The use of separate classes with operations defined on one but not the other
95# prevents accidentally confusing the two.
96#
97# At the heart of each table's data structure is a "Range_List", which is just
98# an ordered list of "Ranges", plus ancillary information, and methods to
99# operate on them. A Range is a compact way to store property information.
100# Each range has a starting code point, an ending code point, and a value that
101# is meant to apply to all the code points between the two end points,
102# inclusive. For a map table, this value is the property value for those
103# code points. Two such ranges could be written like this:
104# 0x41 .. 0x5A, 'Upper',
105# 0x61 .. 0x7A, 'Lower'
106#
107# Each range also has a type used as a convenience to classify the values.
108# Most ranges in this program will be Type 0, or normal, but there are some
109# ranges that have a non-zero type. These are used only in map tables, and
110# are for mappings that don't fit into the normal scheme of things. Mappings
111# that require a hash entry to communicate with utf8.c are one example;
112# another example is mappings for charnames.pm to use which indicate a name
113# that is algorithmically determinable from its code point (and vice-versa).
114# These are used to significantly compact these tables, instead of listing
115# each one of the tens of thousands individually.
116#
117# In a match table, the value of a range is irrelevant (and hence the type as
118# well, which will always be 0), and arbitrarily set to the null string.
119# Using the example above, there would be two match tables for those two
120# entries, one named Upper would contain the 0x41..0x5A range, and the other
121# named Lower would contain 0x61..0x7A.
122#
123# Actually, there are two types of range lists, "Range_Map" is the one
124# associated with map tables, and "Range_List" with match tables.
125# Again, this is so that methods can be defined on one and not the other so as
126# to prevent operating on them in incorrect ways.
127#
128# Eventually, most tables are written out to files to be read by utf8_heavy.pl
129# in the perl core. All tables could in theory be written, but some are
130# suppressed because there is no current practical use for them. It is easy
131# to change which get written by changing various lists that are near the top
132# of the actual code in this file. The table data structures contain enough
133# ancillary information to allow them to be treated as separate entities for
134# writing, such as the path to each one's file. There is a heading in each
135# map table that gives the format of its entries, and what the map is for all
136# the code points missing from it. (This allows tables to be more compact.)
678f13d5 137#
99870f4d
KW
138# The Property data structure contains one or more tables. All properties
139# contain a map table (except the $perl property which is a
140# pseudo-property containing only match tables), and any properties that
141# are usable in regular expression matches also contain various matching
142# tables, one for each value the property can have. A binary property can
143# have two values, True and False (or Y and N, which are preferred by Unicode
144# terminology). Thus each of these properties will have a map table that
145# takes every code point and maps it to Y or N (but having ranges cuts the
146# number of entries in that table way down), and two match tables, one
147# which has a list of all the code points that map to Y, and one for all the
148# code points that map to N. (For each of these, a third table is also
149# generated for the pseudo Perl property. It contains the identical code
150# points as the Y table, but can be written, not in the compound form, but in
151# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
152# properties have several possible values, some have many, and properties like
153# Name have a different value for every named code point. Those will not,
154# unless the controlling lists are changed, have their match tables written
155# out. But all the ones which can be used in regular expression \p{} and \P{}
156# constructs will. Generally a property will have either its map table or its
157# match tables written but not both. Again, what gets written is controlled
158# by lists which can easily be changed.
678f13d5 159#
99870f4d
KW
160# For information about the Unicode properties, see Unicode's UAX44 document:
161
162my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
163
164# As stated earlier, this program will work on any release of Unicode so far.
165# Most obvious problems in earlier data have NOT been corrected except when
166# necessary to make Perl or this program work reasonably. For example, no
167# folding information was given in early releases, so this program uses the
168# substitute of lower case, just so that a regular expression with the /i
169# option will do something that actually gives the right results in many
170# cases. There are also a couple other corrections for version 1.1.5,
171# commented at the point they are made. As an example of corrections that
172# weren't made (but could be) is this statement from DerivedAge.txt: "The
173# supplementary private use code points and the non-character code points were
174# assigned in version 2.0, but not specifically listed in the UCD until
175# versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0)
176# More information on Unicode version glitches is further down in these
177# introductory comments.
178#
179# This program works on all properties as of 5.2, though the files for some
678f13d5
KW
180# are suppressed from apparent lack of demand for them. You can change which
181# are output by changing lists in this program.
182#
99870f4d
KW
183# The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
184# loose matchings rules (from Unicode TR18):
185#
186# The recommended names for UCD properties and property values are in
187# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
188# [PropValue]. There are both abbreviated names and longer, more
189# descriptive names. It is strongly recommended that both names be
190# recognized, and that loose matching of property names be used,
191# whereby the case distinctions, whitespace, hyphens, and underbar
192# are ignored.
193# The program still allows Fuzzy to override its determination of if loose
194# matching should be used, but it isn't currently used, as it is no longer
195# needed; the calculations it makes are good enough.
678f13d5 196#
99870f4d
KW
197# SUMMARY OF HOW IT WORKS:
198#
199# Process arguments
200#
201# A list is constructed containing each input file that is to be processed
202#
203# Each file on the list is processed in a loop, using the associated handler
204# code for each:
205# The PropertyAliases.txt and PropValueAliases.txt files are processed
206# first. These files name the properties and property values.
207# Objects are created of all the property and property value names
208# that the rest of the input should expect, including all synonyms.
209# The other input files give mappings from properties to property
210# values. That is, they list code points and say what the mapping
211# is under the given property. Some files give the mappings for
212# just one property; and some for many. This program goes through
213# each file and populates the properties from them. Some properties
214# are listed in more than one file, and Unicode has set up a
215# precedence as to which has priority if there is a conflict. Thus
216# the order of processing matters, and this program handles the
217# conflict possibility by processing the overriding input files
218# last, so that if necessary they replace earlier values.
219# After this is all done, the program creates the property mappings not
220# furnished by Unicode, but derivable from what it does give.
221# The tables of code points that match each property value in each
222# property that is accessible by regular expressions are created.
223# The Perl-defined properties are created and populated. Many of these
224# require data determined from the earlier steps
225# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 226# and Unicode are reconciled and warned about.
99870f4d
KW
227# All the properties are written to files
228# Any other files are written, and final warnings issued.
678f13d5 229#
99870f4d
KW
230# For clarity, a number of operators have been overloaded to work on tables:
231# ~ means invert (take all characters not in the set). The more
232# conventional '!' is not used because of the possibility of confusing
233# it with the actual boolean operation.
234# + means union
235# - means subtraction
236# & means intersection
237# The precedence of these is the order listed. Parentheses should be
238# copiously used. These are not a general scheme. The operations aren't
239# defined for a number of things, deliberately, to avoid getting into trouble.
240# Operations are done on references and affect the underlying structures, so
241# that the copy constructors for them have been overloaded to not return a new
242# clone, but the input object itself.
678f13d5 243#
99870f4d
KW
244# The bool operator is deliberately not overloaded to avoid confusion with
245# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
246#
247# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
248#
249# This program needs to be able to run under miniperl. Therefore, it uses a
250# minimum of other modules, and hence implements some things itself that could
251# be gotten from CPAN
252#
253# This program uses inputs published by the Unicode Consortium. These can
254# change incompatibly between releases without the Perl maintainers realizing
255# it. Therefore this program is now designed to try to flag these. It looks
256# at the directories where the inputs are, and flags any unrecognized files.
257# It keeps track of all the properties in the files it handles, and flags any
258# that it doesn't know how to handle. It also flags any input lines that
259# don't match the expected syntax, among other checks.
260#
261# It is also designed so if a new input file matches one of the known
262# templates, one hopefully just needs to add it to a list to have it
263# processed.
264#
265# As mentioned earlier, some properties are given in more than one file. In
266# particular, the files in the extracted directory are supposedly just
267# reformattings of the others. But they contain information not easily
268# derivable from the other files, including results for Unihan, which this
269# program doesn't ordinarily look at, and for unassigned code points. They
270# also have historically had errors or been incomplete. In an attempt to
271# create the best possible data, this program thus processes them first to
272# glean information missing from the other files; then processes those other
273# files to override any errors in the extracted ones. Much of the design was
274# driven by this need to store things and then possibly override them.
275#
276# It tries to keep fatal errors to a minimum, to generate something usable for
277# testing purposes. It always looks for files that could be inputs, and will
278# warn about any that it doesn't know how to handle (the -q option suppresses
279# the warning).
99870f4d
KW
280#
281# Why have files written out for binary 'N' matches?
282# For binary properties, if you know the mapping for either Y or N; the
678f13d5
KW
283# other is trivial to construct, so could be done at Perl run-time by just
284# complementing the result, instead of having a file for it. That is, if
285# someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
286# not need a file. The problem is communicating to Perl that a given
287# property is binary. Perl can't figure it out from looking at the N (or
288# No), as some non-binary properties have these as property values. So
289# rather than inventing a way to communicate this info back to the core,
290# which would have required changes there as well, it was simpler just to
291# add the extra tables.
292#
293# Why is there more than one type of range?
294# This simplified things. There are some very specialized code points that
295# have to be handled specially for output, such as Hangul syllable names.
296# By creating a range type (done late in the development process), it
297# allowed this to be stored with the range, and overridden by other input.
298# Originally these were stored in another data structure, and it became a
299# mess trying to decide if a second file that was for the same property was
300# overriding the earlier one or not.
301#
302# Why are there two kinds of tables, match and map?
303# (And there is a base class shared by the two as well.) As stated above,
304# they actually are for different things. Development proceeded much more
305# smoothly when I (khw) realized the distinction. Map tables are used to
306# give the property value for every code point (actually every code point
307# that doesn't map to a default value). Match tables are used for regular
308# expression matches, and are essentially the inverse mapping. Separating
309# the two allows more specialized methods, and error checks so that one
310# can't just take the intersection of two map tables, for example, as that
311# is nonsensical.
99870f4d
KW
312#
313# There are no match tables generated for matches of the null string. These
c1739a4a 314# would look like qr/\p{JSN=}/ currently without modifying the regex code.
678f13d5
KW
315# Perhaps something like them could be added if necessary. The JSN does have
316# a real code point U+110B that maps to the null string, but it is a
317# contributory property, and therefore not output by default. And it's easily
318# handled so far by making the null string the default where it is a
319# possibility.
99870f4d 320#
23e33b60
KW
321# DEBUGGING
322#
678f13d5
KW
323# This program is written so it will run under miniperl. Occasionally changes
324# will cause an error where the backtrace doesn't work well under miniperl.
325# To diagnose the problem, you can instead run it under regular perl, if you
326# have one compiled.
327#
328# There is a good trace facility. To enable it, first sub DEBUG must be set
329# to return true. Then a line like
330#
331# local $to_trace = 1 if main::DEBUG;
332#
333# can be added to enable tracing in its lexical scope or until you insert
334# another line:
335#
336# local $to_trace = 0 if main::DEBUG;
337#
338# then use a line like "trace $a, @b, %c, ...;
339#
340# Some of the more complex subroutines already have trace statements in them.
341# Permanent trace statements should be like:
342#
343# trace ... if main::DEBUG && $to_trace;
344#
345# If there is just one or a few files that you're debugging, you can easily
346# cause most everything else to be skipped. Change the line
347#
348# my $debug_skip = 0;
349#
350# to 1, and every file whose object is in @input_file_objects and doesn't have
351# a, 'non_skip => 1,' in its constructor will be skipped.
352#
99870f4d
KW
353# FUTURE ISSUES
354#
355# The program would break if Unicode were to change its names so that
356# interior white space, underscores, or dashes differences were significant
357# within property and property value names.
358#
359# It might be easier to use the xml versions of the UCD if this program ever
360# would need heavy revision, and the ability to handle old versions was not
361# required.
362#
363# There is the potential for name collisions, in that Perl has chosen names
364# that Unicode could decide it also likes. There have been such collisions in
365# the past, with mostly Perl deciding to adopt the Unicode definition of the
366# name. However in the 5.2 Unicode beta testing, there were a number of such
367# collisions, which were withdrawn before the final release, because of Perl's
368# and other's protests. These all involved new properties which began with
369# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
370# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
371# Unicode document, so they are unlikely to be used by Unicode for another
372# purpose. However, they might try something beginning with 'In', or use any
373# of the other Perl-defined properties. This program will warn you of name
374# collisions, and refuse to generate tables with them, but manual intervention
375# will be required in this event. One scheme that could be implemented, if
376# necessary, would be to have this program generate another file, or add a
377# field to mktables.lst that gives the date of first definition of a property.
378# Each new release of Unicode would use that file as a basis for the next
379# iteration. And the Perl synonym addition code could sort based on the age
380# of the property, so older properties get priority, and newer ones that clash
381# would be refused; hence existing code would not be impacted, and some other
382# synonym would have to be used for the new property. This is ugly, and
383# manual intervention would certainly be easier to do in the short run; lets
384# hope it never comes to this.
678f13d5 385#
99870f4d
KW
386# A NOTE ON UNIHAN
387#
388# This program can generate tables from the Unihan database. But it doesn't
389# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
390# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
391# database was split into 8 different files, all beginning with the letters
392# 'Unihan'. This program will read those file(s) if present, but it needs to
393# know which of the many properties in the file(s) should have tables created
394# for them. It will create tables for any properties listed in
395# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
396# @cjk_properties array and the @cjk_property_values array. Thus, if a
397# property you want is not in those files of the release you are building
398# against, you must add it to those two arrays. Starting in 4.0, the
399# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
400# is present in the directory, a table will be generated for that property.
401# In 5.2, several more properties were added. For your convenience, the two
402# arrays are initialized with all the 5.2 listed properties that are also in
403# earlier releases. But these are commented out. You can just uncomment the
404# ones you want, or use them as a template for adding entries for other
405# properties.
406#
407# You may need to adjust the entries to suit your purposes. setup_unihan(),
408# and filter_unihan_line() are the functions where this is done. This program
409# already does some adjusting to make the lines look more like the rest of the
410# Unicode DB; You can see what that is in filter_unihan_line()
411#
412# There is a bug in the 3.2 data file in which some values for the
413# kPrimaryNumeric property have commas and an unexpected comment. A filter
414# could be added for these; or for a particular installation, the Unihan.txt
415# file could be edited to fix them.
99870f4d 416#
678f13d5
KW
417# HOW TO ADD A FILE TO BE PROCESSED
418#
419# A new file from Unicode needs to have an object constructed for it in
420# @input_file_objects, probably at the end or at the end of the extracted
421# ones. The program should warn you if its name will clash with others on
422# restrictive file systems, like DOS. If so, figure out a better name, and
423# add lines to the README.perl file giving that. If the file is a character
424# property, it should be in the format that Unicode has by default
425# standardized for such files for the more recently introduced ones.
426# If so, the Input_file constructor for @input_file_objects can just be the
427# file name and release it first appeared in. If not, then it should be
428# possible to construct an each_line_handler() to massage the line into the
429# standardized form.
430#
431# For non-character properties, more code will be needed. You can look at
432# the existing entries for clues.
433#
434# UNICODE VERSIONS NOTES
435#
436# The Unicode UCD has had a number of errors in it over the versions. And
437# these remain, by policy, in the standard for that version. Therefore it is
438# risky to correct them, because code may be expecting the error. So this
439# program doesn't generally make changes, unless the error breaks the Perl
440# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
441# for U+1105, which causes real problems for the algorithms for Jamo
442# calculations, so it is changed here.
443#
444# But it isn't so clear cut as to what to do about concepts that are
445# introduced in a later release; should they extend back to earlier releases
446# where the concept just didn't exist? It was easier to do this than to not,
447# so that's what was done. For example, the default value for code points not
448# in the files for various properties was probably undefined until changed by
449# some version. No_Block for blocks is such an example. This program will
450# assign No_Block even in Unicode versions that didn't have it. This has the
451# benefit that code being written doesn't have to special case earlier
452# versions; and the detriment that it doesn't match the Standard precisely for
453# the affected versions.
454#
455# Here are some observations about some of the issues in early versions:
456#
457# The number of code points in \p{alpha} halve in 2.1.9. It turns out that
458# the reason is that the CJK block starting at 4E00 was removed from PropList,
459# and was not put back in until 3.1.0
460#
461# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
462# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
463# reason is that 3.2 introduced U+205F=medium math space, which was not
464# classed as white space, but Perl figured out that it should have been. 4.0
465# reclassified it correctly.
466#
467# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
468# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
469# was left with no code points, as all the ones that mapped to 202 stayed
470# mapped to 202. Thus if your program used the numeric name for the class,
471# it would not have been affected, but if it used the mnemonic, it would have
472# been.
473#
474# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
475# points which eventually came to have this script property value, instead
476# mapped to "Unknown". But in the next release all these code points were
477# moved to \p{sc=common} instead.
99870f4d
KW
478#
479# The default for missing code points for BidiClass is complicated. Starting
480# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
481# tries to do the best it can for earlier releases. It is done in
482# process_PropertyAliases()
483#
484##############################################################################
485
486my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
487 # and errors
488my $MAX_LINE_WIDTH = 78;
489
490# Debugging aid to skip most files so as to not be distracted by them when
491# concentrating on the ones being debugged. Add
492# non_skip => 1,
493# to the constructor for those files you want processed when you set this.
494# Files with a first version number of 0 are special: they are always
495# processed regardless of the state of this flag.
496my $debug_skip = 0;
497
498# Set to 1 to enable tracing.
499our $to_trace = 0;
500
501{ # Closure for trace: debugging aid
502 my $print_caller = 1; # ? Include calling subroutine name
503 my $main_with_colon = 'main::';
504 my $main_colon_length = length($main_with_colon);
505
506 sub trace {
507 return unless $to_trace; # Do nothing if global flag not set
508
509 my @input = @_;
510
511 local $DB::trace = 0;
512 $DB::trace = 0; # Quiet 'used only once' message
513
514 my $line_number;
515
516 # Loop looking up the stack to get the first non-trace caller
517 my $caller_line;
518 my $caller_name;
519 my $i = 0;
520 do {
521 $line_number = $caller_line;
522 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
523 $caller = $main_with_colon unless defined $caller;
524
525 $caller_name = $caller;
526
527 # get rid of pkg
528 $caller_name =~ s/.*:://;
529 if (substr($caller_name, 0, $main_colon_length)
530 eq $main_with_colon)
531 {
532 $caller_name = substr($caller_name, $main_colon_length);
533 }
534
535 } until ($caller_name ne 'trace');
536
537 # If the stack was empty, we were called from the top level
538 $caller_name = 'main' if ($caller_name eq ""
539 || $caller_name eq 'trace');
540
541 my $output = "";
542 foreach my $string (@input) {
543 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
544 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
545 $output .= simple_dumper($string);
546 }
547 else {
548 $string = "$string" if ref $string;
549 $string = $UNDEF unless defined $string;
550 chomp $string;
551 $string = '""' if $string eq "";
552 $output .= " " if $output ne ""
553 && $string ne ""
554 && substr($output, -1, 1) ne " "
555 && substr($string, 0, 1) ne " ";
556 $output .= $string;
557 }
558 }
559
99f78760
KW
560 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
561 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
562 print STDERR $output, "\n";
563 return;
564 }
565}
566
567# This is for a rarely used development feature that allows you to compare two
568# versions of the Unicode standard without having to deal with changes caused
569# by the code points introduced in the later verson. Change the 0 to a SINGLE
570# dotted Unicode release number (e.g. 2.1). Only code points introduced in
571# that release and earlier will be used; later ones are thrown away. You use
572# the version number of the earliest one you want to compare; then run this
573# program on directory structures containing each release, and compare the
574# outputs. These outputs will therefore include only the code points common
575# to both releases, and you can see the changes caused just by the underlying
576# release semantic changes. For versions earlier than 3.2, you must copy a
577# version of DAge.txt into the directory.
578my $string_compare_versions = DEBUG && 0; # e.g., v2.1;
579my $compare_versions = DEBUG
580 && $string_compare_versions
581 && pack "C*", split /\./, $string_compare_versions;
582
583sub uniques {
584 # Returns non-duplicated input values. From "Perl Best Practices:
585 # Encapsulated Cleverness". p. 455 in first edition.
586
587 my %seen;
0e407844
NC
588 # Arguably this breaks encapsulation, if the goal is to permit multiple
589 # distinct objects to stringify to the same value, and be interchangeable.
590 # However, for this program, no two objects stringify identically, and all
591 # lists passed to this function are either objects or strings. So this
592 # doesn't affect correctness, but it does give a couple of percent speedup.
593 no overloading;
99870f4d
KW
594 return grep { ! $seen{$_}++ } @_;
595}
596
597$0 = File::Spec->canonpath($0);
598
599my $make_test_script = 0; # ? Should we output a test script
600my $write_unchanged_files = 0; # ? Should we update the output files even if
601 # we don't think they have changed
602my $use_directory = ""; # ? Should we chdir somewhere.
603my $pod_directory; # input directory to store the pod file.
604my $pod_file = 'perluniprops';
605my $t_path; # Path to the .t test file
606my $file_list = 'mktables.lst'; # File to store input and output file names.
607 # This is used to speed up the build, by not
608 # executing the main body of the program if
609 # nothing on the list has changed since the
610 # previous build
611my $make_list = 1; # ? Should we write $file_list. Set to always
612 # make a list so that when the pumpking is
613 # preparing a release, s/he won't have to do
614 # special things
615my $glob_list = 0; # ? Should we try to include unknown .txt files
616 # in the input.
617my $output_range_counts = 1; # ? Should we include the number of code points
618 # in ranges in the output
9ef2b94f
KW
619my $output_names = 0; # ? Should character names be in the output
620my @viacode; # Contains the 1 million character names, if
621 # $output_names is true
622
99870f4d
KW
623# Verbosity levels; 0 is quiet
624my $NORMAL_VERBOSITY = 1;
625my $PROGRESS = 2;
626my $VERBOSE = 3;
627
628my $verbosity = $NORMAL_VERBOSITY;
629
630# Process arguments
631while (@ARGV) {
cf25bb62
JH
632 my $arg = shift @ARGV;
633 if ($arg eq '-v') {
99870f4d
KW
634 $verbosity = $VERBOSE;
635 }
636 elsif ($arg eq '-p') {
637 $verbosity = $PROGRESS;
638 $| = 1; # Flush buffers as we go.
639 }
640 elsif ($arg eq '-q') {
641 $verbosity = 0;
642 }
643 elsif ($arg eq '-w') {
644 $write_unchanged_files = 1; # update the files even if havent changed
645 }
646 elsif ($arg eq '-check') {
6ae7e459
YO
647 my $this = shift @ARGV;
648 my $ok = shift @ARGV;
649 if ($this ne $ok) {
650 print "Skipping as check params are not the same.\n";
651 exit(0);
652 }
00a8df5c 653 }
99870f4d
KW
654 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
655 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
656 }
3df51b85
KW
657 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
658 {
99870f4d 659 $make_test_script = 1;
99870f4d
KW
660 }
661 elsif ($arg eq '-makelist') {
662 $make_list = 1;
663 }
664 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
665 -d $use_directory or croak "Unknown directory '$use_directory'";
666 }
667 elsif ($arg eq '-L') {
668
669 # Existence not tested until have chdir'd
670 $file_list = shift;
671 }
672 elsif ($arg eq '-globlist') {
673 $glob_list = 1;
674 }
675 elsif ($arg eq '-c') {
676 $output_range_counts = ! $output_range_counts
677 }
9ef2b94f
KW
678 elsif ($arg eq '-output_names') {
679 $output_names = 1;
680 }
99870f4d
KW
681 else {
682 my $with_c = 'with';
683 $with_c .= 'out' if $output_range_counts; # Complements the state
684 croak <<END;
685usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
686 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
687 [-check A B ]
688 -c : Output comments $with_c number of code points in ranges
689 -q : Quiet Mode: Only output serious warnings.
690 -p : Set verbosity level to normal plus show progress.
691 -v : Set Verbosity level high: Show progress and non-serious
692 warnings
693 -w : Write files regardless
694 -C dir : Change to this directory before proceeding. All relative paths
695 except those specified by the -P and -T options will be done
696 with respect to this directory.
697 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 698 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
699 -L filelist : Use alternate 'filelist' instead of standard one
700 -globlist : Take as input all non-Test *.txt files in current and sub
701 directories
3df51b85
KW
702 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
703 overrides -T
99870f4d 704 -makelist : Rewrite the file list $file_list based on current setup
9ef2b94f
KW
705 -output_names : Output each character's name in the table files; useful for
706 doing what-ifs, looking at diffs; is slow, memory intensive,
707 resulting tables are usable but very large.
99870f4d
KW
708 -check A B : Executes $0 only if A and B are the same
709END
710 }
711}
712
713# Stores the most-recently changed file. If none have changed, can skip the
714# build
715my $youngest = -M $0; # Do this before the chdir!
716
717# Change directories now, because need to read 'version' early.
718if ($use_directory) {
3df51b85 719 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
720 $pod_directory = File::Spec->rel2abs($pod_directory);
721 }
3df51b85 722 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 723 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 724 }
99870f4d 725 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 726 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 727 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 728 }
3df51b85 729 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 730 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 731 }
00a8df5c
YO
732}
733
99870f4d
KW
734# Get Unicode version into regular and v-string. This is done now because
735# various tables below get populated based on it. These tables are populated
736# here to be near the top of the file, and so easily seeable by those needing
737# to modify things.
738open my $VERSION, "<", "version"
739 or croak "$0: can't open required file 'version': $!\n";
740my $string_version = <$VERSION>;
741close $VERSION;
742chomp $string_version;
743my $v_version = pack "C*", split /\./, $string_version; # v string
744
745# The following are the complete names of properties with property values that
746# are known to not match any code points in some versions of Unicode, but that
747# may change in the future so they should be matchable, hence an empty file is
748# generated for them.
749my @tables_that_may_be_empty = (
750 'Joining_Type=Left_Joining',
751 );
752push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
753push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
754push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
755 if $v_version ge v4.1.0;
756
757# The lists below are hashes, so the key is the item in the list, and the
758# value is the reason why it is in the list. This makes generation of
759# documentation easier.
760
761my %why_suppressed; # No file generated for these.
762
763# Files aren't generated for empty extraneous properties. This is arguable.
764# Extraneous properties generally come about because a property is no longer
765# used in a newer version of Unicode. If we generated a file without code
766# points, programs that used to work on that property will still execute
767# without errors. It just won't ever match (or will always match, with \P{}).
768# This means that the logic is now likely wrong. I (khw) think its better to
769# find this out by getting an error message. Just move them to the table
770# above to change this behavior
771my %why_suppress_if_empty_warn_if_not = (
772
773 # It is the only property that has ever officially been removed from the
774 # Standard. The database never contained any code points for it.
775 'Special_Case_Condition' => 'Obsolete',
776
777 # Apparently never official, but there were code points in some versions of
778 # old-style PropList.txt
779 'Non_Break' => 'Obsolete',
780);
781
782# These would normally go in the warn table just above, but they were changed
783# a long time before this program was written, so warnings about them are
784# moot.
785if ($v_version gt v3.2.0) {
786 push @tables_that_may_be_empty,
787 'Canonical_Combining_Class=Attached_Below_Left'
788}
789
790# These are listed in the Property aliases file in 5.2, but Unihan is ignored
791# unless explicitly added.
792if ($v_version ge v5.2.0) {
793 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 794 foreach my $table (qw (
99870f4d
KW
795 kAccountingNumeric
796 kOtherNumeric
797 kPrimaryNumeric
798 kCompatibilityVariant
799 kIICore
800 kIRG_GSource
801 kIRG_HSource
802 kIRG_JSource
803 kIRG_KPSource
804 kIRG_MSource
805 kIRG_KSource
806 kIRG_TSource
807 kIRG_USource
808 kIRG_VSource
809 kRSUnicode
ea25a9b2 810 ))
99870f4d
KW
811 {
812 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
813 }
ca12659b
NC
814}
815
99870f4d
KW
816# Properties that this program ignores.
817my @unimplemented_properties = (
818'Unicode_Radical_Stroke' # Remove if changing to handle this one.
819);
d73e5302 820
99870f4d
KW
821# There are several types of obsolete properties defined by Unicode. These
822# must be hand-edited for every new Unicode release.
823my %why_deprecated; # Generates a deprecated warning message if used.
824my %why_stabilized; # Documentation only
825my %why_obsolete; # Documentation only
826
827{ # Closure
828 my $simple = 'Perl uses the more complete version of this property';
829 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
830
831 my $other_properties = 'other properties';
832 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
833 my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
834
835 %why_deprecated = (
836 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead',
837 'Jamo_Short_Name' => $contributory,
838 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
839 'Other_Alphabetic' => $contributory,
840 'Other_Default_Ignorable_Code_Point' => $contributory,
841 'Other_Grapheme_Extend' => $contributory,
842 'Other_ID_Continue' => $contributory,
843 'Other_ID_Start' => $contributory,
844 'Other_Lowercase' => $contributory,
845 'Other_Math' => $contributory,
846 'Other_Uppercase' => $contributory,
847 );
848
849 %why_suppressed = (
850 # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
851 # contains the same information, but without the algorithmically
852 # determinable Hangul syllables'. This file is not published, so it's
853 # existence is not noted in the comment.
854 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
855
856 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
857 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames",
858
859 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
860 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
861 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
862 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
863
864 'Name' => "Accessible via 'use charnames;'",
865 'Name_Alias' => "Accessible via 'use charnames;'",
866
867 # These are sort of jumping the gun; deprecation is proposed for
868 # Unicode version 6.0, but they have never been exposed by Perl, and
869 # likely are soon to be deprecated, so best not to expose them.
870 FC_NFKC_Closure => 'Use NFKC_Casefold instead',
871 Expands_On_NFC => $why_no_expand,
872 Expands_On_NFD => $why_no_expand,
873 Expands_On_NFKC => $why_no_expand,
874 Expands_On_NFKD => $why_no_expand,
875 );
876
877 # The following are suppressed because they were made contributory or
878 # deprecated by Unicode before Perl ever thought about supporting them.
879 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
880 $why_suppressed{$property} = $why_deprecated{$property};
881 }
cf25bb62 882
99870f4d
KW
883 # Customize the message for all the 'Other_' properties
884 foreach my $property (keys %why_deprecated) {
885 next if (my $main_property = $property) !~ s/^Other_//;
886 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
887 }
888}
889
890if ($v_version ge 4.0.0) {
891 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
892}
893if ($v_version ge 5.2.0) {
894 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
895}
896
897# Probably obsolete forever
898if ($v_version ge v4.1.0) {
899 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common"';
900}
901
902# This program can create files for enumerated-like properties, such as
903# 'Numeric_Type'. This file would be the same format as for a string
904# property, with a mapping from code point to its value, so you could look up,
905# for example, the script a code point is in. But no one so far wants this
906# mapping, or they have found another way to get it since this is a new
907# feature. So no file is generated except if it is in this list.
908my @output_mapped_properties = split "\n", <<END;
909END
910
911# If you are using the Unihan database, you need to add the properties that
912# you want to extract from it to this table. For your convenience, the
913# properties in the 5.2 PropertyAliases.txt file are listed, commented out
914my @cjk_properties = split "\n", <<'END';
915#cjkAccountingNumeric; kAccountingNumeric
916#cjkOtherNumeric; kOtherNumeric
917#cjkPrimaryNumeric; kPrimaryNumeric
918#cjkCompatibilityVariant; kCompatibilityVariant
919#cjkIICore ; kIICore
920#cjkIRG_GSource; kIRG_GSource
921#cjkIRG_HSource; kIRG_HSource
922#cjkIRG_JSource; kIRG_JSource
923#cjkIRG_KPSource; kIRG_KPSource
924#cjkIRG_KSource; kIRG_KSource
925#cjkIRG_TSource; kIRG_TSource
926#cjkIRG_USource; kIRG_USource
927#cjkIRG_VSource; kIRG_VSource
928#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
929END
930
931# Similarly for the property values. For your convenience, the lines in the
932# 5.2 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
933# '#' marks
934my @cjk_property_values = split "\n", <<'END';
935## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
936## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
937## @missing: 0000..10FFFF; cjkIICore; <none>
938## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
939## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
940## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
941## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
942## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
943## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
944## @missing: 0000..10FFFF; cjkIRG_USource; <none>
945## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
946## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
947## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
948## @missing: 0000..10FFFF; cjkRSUnicode; <none>
949END
950
951# The input files don't list every code point. Those not listed are to be
952# defaulted to some value. Below are hard-coded what those values are for
953# non-binary properties as of 5.1. Starting in 5.0, there are
954# machine-parsable comment lines in the files the give the defaults; so this
955# list shouldn't have to be extended. The claim is that all missing entries
956# for binary properties will default to 'N'. Unicode tried to change that in
957# 5.2, but the beta period produced enough protest that they backed off.
958#
959# The defaults for the fields that appear in UnicodeData.txt in this hash must
960# be in the form that it expects. The others may be synonyms.
961my $CODE_POINT = '<code point>';
962my %default_mapping = (
963 Age => "Unassigned",
964 # Bidi_Class => Complicated; set in code
965 Bidi_Mirroring_Glyph => "",
966 Block => 'No_Block',
967 Canonical_Combining_Class => 0,
968 Case_Folding => $CODE_POINT,
969 Decomposition_Mapping => $CODE_POINT,
970 Decomposition_Type => 'None',
971 East_Asian_Width => "Neutral",
972 FC_NFKC_Closure => $CODE_POINT,
973 General_Category => 'Cn',
974 Grapheme_Cluster_Break => 'Other',
975 Hangul_Syllable_Type => 'NA',
976 ISO_Comment => "",
977 Jamo_Short_Name => "",
978 Joining_Group => "No_Joining_Group",
979 # Joining_Type => Complicated; set in code
980 kIICore => 'N', # Is converted to binary
981 #Line_Break => Complicated; set in code
982 Lowercase_Mapping => $CODE_POINT,
983 Name => "",
984 Name_Alias => "",
985 NFC_QC => 'Yes',
986 NFD_QC => 'Yes',
987 NFKC_QC => 'Yes',
988 NFKD_QC => 'Yes',
989 Numeric_Type => 'None',
990 Numeric_Value => 'NaN',
991 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
992 Sentence_Break => 'Other',
993 Simple_Case_Folding => $CODE_POINT,
994 Simple_Lowercase_Mapping => $CODE_POINT,
995 Simple_Titlecase_Mapping => $CODE_POINT,
996 Simple_Uppercase_Mapping => $CODE_POINT,
997 Titlecase_Mapping => $CODE_POINT,
998 Unicode_1_Name => "",
999 Unicode_Radical_Stroke => "",
1000 Uppercase_Mapping => $CODE_POINT,
1001 Word_Break => 'Other',
1002);
1003
1004# Below are files that Unicode furnishes, but this program ignores, and why
1005my %ignored_files = (
1006 'CJKRadicals.txt' => 'Unihan data',
1007 'Index.txt' => 'An index, not actual data',
1008 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1009 'NamesList.txt' => 'Just adds commentary',
1010 'NormalizationCorrections.txt' => 'Data is already in other files.',
1011 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1012 'ReadMe.txt' => 'Just comments',
1013 'README.TXT' => 'Just comments',
1014 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
1015);
1016
678f13d5 1017### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1018
1019my $HEADER=<<"EOF";
1020# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1021# This file is machine-generated by $0 from the Unicode
1022# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1023EOF
1024
b6922eda 1025my $INTERNAL_ONLY=<<"EOF";
99870f4d
KW
1026
1027# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
b6922eda 1028# This file is for internal use by the Perl program only. The format and even
99870f4d
KW
1029# the name or existence of this file are subject to change without notice.
1030# Don't use it directly.
1031EOF
1032
1033my $DEVELOPMENT_ONLY=<<"EOF";
1034# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1035# This file contains information artificially constrained to code points
1036# present in Unicode release $string_compare_versions.
1037# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1038# not be used for production.
b6922eda
KW
1039
1040EOF
1041
99870f4d
KW
1042my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1043my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1044my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1045
1046# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1047# two must be 10; if there are 5, the first must not be a 0. Written this way
1048# to decrease backtracking
1049my $code_point_re =
1050 qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1051
1052# This matches the beginning of the line in the Unicode db files that give the
1053# defaults for code points not listed (i.e., missing) in the file. The code
1054# depends on this ending with a semi-colon, so it can assume it is a valid
1055# field when the line is split() by semi-colons
1056my $missing_defaults_prefix =
1057 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1058
1059# Property types. Unicode has more types, but these are sufficient for our
1060# purposes.
1061my $UNKNOWN = -1; # initialized to illegal value
1062my $NON_STRING = 1; # Either binary or enum
1063my $BINARY = 2;
1064my $ENUM = 3; # Include catalog
1065my $STRING = 4; # Anything else: string or misc
1066
1067# Some input files have lines that give default values for code points not
1068# contained in the file. Sometimes these should be ignored.
1069my $NO_DEFAULTS = 0; # Must evaluate to false
1070my $NOT_IGNORED = 1;
1071my $IGNORED = 2;
1072
1073# Range types. Each range has a type. Most ranges are type 0, for normal,
1074# and will appear in the main body of the tables in the output files, but
1075# there are other types of ranges as well, listed below, that are specially
1076# handled. There are pseudo-types as well that will never be stored as a
1077# type, but will affect the calculation of the type.
1078
1079# 0 is for normal, non-specials
1080my $MULTI_CP = 1; # Sequence of more than code point
1081my $HANGUL_SYLLABLE = 2;
1082my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1083my $NULL = 4; # The map is to the null string; utf8.c can't
1084 # handle these, nor is there an accepted syntax
1085 # for them in \p{} constructs
f86864ac 1086my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1087 # otherwise be $MULTI_CP type are instead type 0
1088
1089# process_generic_property_file() can accept certain overrides in its input.
1090# Each of these must begin AND end with $CMD_DELIM.
1091my $CMD_DELIM = "\a";
1092my $REPLACE_CMD = 'replace'; # Override the Replace
1093my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1094
1095my $NO = 0;
1096my $YES = 1;
1097
1098# Values for the Replace argument to add_range.
1099# $NO # Don't replace; add only the code points not
1100 # already present.
1101my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1102 # the comments at the subroutine definition.
1103my $UNCONDITIONALLY = 2; # Replace without conditions.
1104my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1105 # already there
1106
1107# Flags to give property statuses. The phrases are to remind maintainers that
1108# if the flag is changed, the indefinite article referring to it in the
1109# documentation may need to be as well.
1110my $NORMAL = "";
1111my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1112 # it is suppressed
37e2e78e 1113my $PLACEHOLDER = 'P'; # Implies no pod entry generated
99870f4d
KW
1114my $DEPRECATED = 'D';
1115my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1116my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1117my $DISCOURAGED = 'X';
1118my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1119my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1120my $STRICTER = 'T';
1121my $a_bold_stricter = "a 'B<$STRICTER>'";
1122my $A_bold_stricter = "A 'B<$STRICTER>'";
1123my $STABILIZED = 'S';
1124my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1125my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1126my $OBSOLETE = 'O';
1127my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1128my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1129
1130my %status_past_participles = (
1131 $DISCOURAGED => 'discouraged',
1132 $SUPPRESSED => 'should never be generated',
1133 $STABILIZED => 'stabilized',
1134 $OBSOLETE => 'obsolete',
37e2e78e 1135 $DEPRECATED => 'deprecated',
99870f4d
KW
1136);
1137
1138# The format of the values of the map tables:
1139my $BINARY_FORMAT = 'b';
1140my $DECIMAL_FORMAT = 'd';
1141my $FLOAT_FORMAT = 'f';
1142my $INTEGER_FORMAT = 'i';
1143my $HEX_FORMAT = 'x';
1144my $RATIONAL_FORMAT = 'r';
1145my $STRING_FORMAT = 's';
1146
1147my %map_table_formats = (
1148 $BINARY_FORMAT => 'binary',
1149 $DECIMAL_FORMAT => 'single decimal digit',
1150 $FLOAT_FORMAT => 'floating point number',
1151 $INTEGER_FORMAT => 'integer',
1152 $HEX_FORMAT => 'positive hex whole number; a code point',
1153 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1154 $STRING_FORMAT => 'arbitrary string',
1155);
1156
1157# Unicode didn't put such derived files in a separate directory at first.
1158my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1159my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1160my $AUXILIARY = 'auxiliary';
1161
1162# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1163my %loose_to_file_of; # loosely maps table names to their respective
1164 # files
1165my %stricter_to_file_of; # same; but for stricter mapping.
1166my %nv_floating_to_rational; # maps numeric values floating point numbers to
1167 # their rational equivalent
1168my %loose_property_name_of; # Loosely maps property names to standard form
1169
1170# These constants names and values were taken from the Unicode standard,
1171# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1172# syllables. The '_string' versions are so generated tables can retain the
1173# hex format, which is the more familiar value
1174my $SBase_string = "0xAC00";
1175my $SBase = CORE::hex $SBase_string;
1176my $LBase_string = "0x1100";
1177my $LBase = CORE::hex $LBase_string;
1178my $VBase_string = "0x1161";
1179my $VBase = CORE::hex $VBase_string;
1180my $TBase_string = "0x11A7";
1181my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1182my $SCount = 11172;
1183my $LCount = 19;
1184my $VCount = 21;
1185my $TCount = 28;
1186my $NCount = $VCount * $TCount;
1187
1188# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1189# with the above published constants.
1190my %Jamo;
1191my %Jamo_L; # Leading consonants
1192my %Jamo_V; # Vowels
1193my %Jamo_T; # Trailing consonants
1194
37e2e78e 1195my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1196my @unhandled_properties; # Will contain a list of properties found in
1197 # the input that we didn't process.
f86864ac 1198my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1199 # listed in the pod
1200my @map_properties; # Properties that get map files written
1201my @named_sequences; # NamedSequences.txt contents.
1202my %potential_files; # Generated list of all .txt files in the directory
1203 # structure so we can warn if something is being
1204 # ignored.
1205my @files_actually_output; # List of files we generated.
1206my @more_Names; # Some code point names are compound; this is used
1207 # to store the extra components of them.
1208my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1209 # the minimum before we consider it equivalent to a
1210 # candidate rational
1211my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1212
1213# These store references to certain commonly used property objects
1214my $gc;
1215my $perl;
1216my $block;
3e20195b
KW
1217my $perl_charname;
1218my $print;
99870f4d
KW
1219
1220# Are there conflicting names because of beginning with 'In_', or 'Is_'
1221my $has_In_conflicts = 0;
1222my $has_Is_conflicts = 0;
1223
1224sub internal_file_to_platform ($) {
1225 # Convert our file paths which have '/' separators to those of the
1226 # platform.
1227
1228 my $file = shift;
1229 return undef unless defined $file;
1230
1231 return File::Spec->join(split '/', $file);
d07a55ed 1232}
5beb625e 1233
99870f4d
KW
1234sub file_exists ($) { # platform independent '-e'. This program internally
1235 # uses slash as a path separator.
1236 my $file = shift;
1237 return 0 if ! defined $file;
1238 return -e internal_file_to_platform($file);
1239}
5beb625e 1240
99870f4d 1241sub objaddr($) {
23e33b60
KW
1242 # Returns the address of the blessed input object.
1243 # It doesn't check for blessedness because that would do a string eval
1244 # every call, and the program is structured so that this is never called
1245 # for a non-blessed object.
99870f4d 1246
23e33b60 1247 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1248
1249 # Numifying a ref gives its address.
051df77b 1250 return pack 'J', $_[0];
99870f4d
KW
1251}
1252
23e33b60
KW
1253# Commented code below should work on Perl 5.8.
1254## This 'require' doesn't necessarily work in miniperl, and even if it does,
1255## the native perl version of it (which is what would operate under miniperl)
1256## is extremely slow, as it does a string eval every call.
1257#my $has_fast_scalar_util = $\18 !~ /miniperl/
1258# && defined eval "require Scalar::Util";
1259#
1260#sub objaddr($) {
1261# # Returns the address of the blessed input object. Uses the XS version if
1262# # available. It doesn't check for blessedness because that would do a
1263# # string eval every call, and the program is structured so that this is
1264# # never called for a non-blessed object.
1265#
1266# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1267#
1268# # Check at least that is a ref.
1269# my $pkg = ref($_[0]) or return undef;
1270#
1271# # Change to a fake package to defeat any overloaded stringify
1272# bless $_[0], 'main::Fake';
1273#
1274# # Numifying a ref gives its address.
051df77b 1275# my $addr = pack 'J', $_[0];
23e33b60
KW
1276#
1277# # Return to original class
1278# bless $_[0], $pkg;
1279# return $addr;
1280#}
1281
99870f4d
KW
1282sub max ($$) {
1283 my $a = shift;
1284 my $b = shift;
1285 return $a if $a >= $b;
1286 return $b;
1287}
1288
1289sub min ($$) {
1290 my $a = shift;
1291 my $b = shift;
1292 return $a if $a <= $b;
1293 return $b;
1294}
1295
1296sub clarify_number ($) {
1297 # This returns the input number with underscores inserted every 3 digits
1298 # in large (5 digits or more) numbers. Input must be entirely digits, not
1299 # checked.
1300
1301 my $number = shift;
1302 my $pos = length($number) - 3;
1303 return $number if $pos <= 1;
1304 while ($pos > 0) {
1305 substr($number, $pos, 0) = '_';
1306 $pos -= 3;
5beb625e 1307 }
99870f4d 1308 return $number;
99598c8c
JH
1309}
1310
12ac2576 1311
99870f4d 1312package Carp;
7ebf06b3 1313
99870f4d
KW
1314# These routines give a uniform treatment of messages in this program. They
1315# are placed in the Carp package to cause the stack trace to not include them,
1316# although an alternative would be to use another package and set @CARP_NOT
1317# for it.
12ac2576 1318
99870f4d 1319our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1320
99f78760
KW
1321# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1322# and overload trying to load Scalar:Util under miniperl. See
1323# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1324undef $overload::VERSION;
1325
99870f4d
KW
1326sub my_carp {
1327 my $message = shift || "";
1328 my $nofold = shift || 0;
7ebf06b3 1329
99870f4d
KW
1330 if ($message) {
1331 $message = main::join_lines($message);
1332 $message =~ s/^$0: *//; # Remove initial program name
1333 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1334 $message = "\n$0: $message;";
12ac2576 1335
99870f4d
KW
1336 # Fold the message with program name, semi-colon end punctuation
1337 # (which looks good with the message that carp appends to it), and a
1338 # hanging indent for continuation lines.
1339 $message = main::simple_fold($message, "", 4) unless $nofold;
1340 $message =~ s/\n$//; # Remove the trailing nl so what carp
1341 # appends is to the same line
1342 }
12ac2576 1343
99870f4d 1344 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1345
99870f4d
KW
1346 carp $message;
1347 return;
1348}
7ebf06b3 1349
99870f4d
KW
1350sub my_carp_bug {
1351 # This is called when it is clear that the problem is caused by a bug in
1352 # this program.
7ebf06b3 1353
99870f4d
KW
1354 my $message = shift;
1355 $message =~ s/^$0: *//;
1356 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1357 carp $message;
1358 return;
1359}
7ebf06b3 1360
99870f4d
KW
1361sub carp_too_few_args {
1362 if (@_ != 2) {
1363 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1364 return;
12ac2576 1365 }
7ebf06b3 1366
99870f4d
KW
1367 my $args_ref = shift;
1368 my $count = shift;
7ebf06b3 1369
99870f4d
KW
1370 my_carp_bug("Need at least $count arguments to "
1371 . (caller 1)[3]
1372 . ". Instead got: '"
1373 . join ', ', @$args_ref
1374 . "'. No action taken.");
1375 return;
12ac2576
JP
1376}
1377
99870f4d
KW
1378sub carp_extra_args {
1379 my $args_ref = shift;
1380 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1381
99870f4d
KW
1382 unless (ref $args_ref) {
1383 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1384 return;
1385 }
1386 my ($package, $file, $line) = caller;
1387 my $subroutine = (caller 1)[3];
cf25bb62 1388
99870f4d
KW
1389 my $list;
1390 if (ref $args_ref eq 'HASH') {
1391 foreach my $key (keys %$args_ref) {
1392 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1393 }
99870f4d 1394 $list = join ', ', each %{$args_ref};
cf25bb62 1395 }
99870f4d
KW
1396 elsif (ref $args_ref eq 'ARRAY') {
1397 foreach my $arg (@$args_ref) {
1398 $arg = $UNDEF unless defined $arg;
1399 }
1400 $list = join ', ', @$args_ref;
1401 }
1402 else {
1403 my_carp_bug("Can't cope with ref "
1404 . ref($args_ref)
1405 . " . argument to 'carp_extra_args'. Not checking arguments.");
1406 return;
1407 }
1408
1409 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1410 return;
d73e5302
JH
1411}
1412
99870f4d
KW
1413package main;
1414
1415{ # Closure
1416
1417 # This program uses the inside-out method for objects, as recommended in
1418 # "Perl Best Practices". This closure aids in generating those. There
1419 # are two routines. setup_package() is called once per package to set
1420 # things up, and then set_access() is called for each hash representing a
1421 # field in the object. These routines arrange for the object to be
1422 # properly destroyed when no longer used, and for standard accessor
1423 # functions to be generated. If you need more complex accessors, just
1424 # write your own and leave those accesses out of the call to set_access().
1425 # More details below.
1426
1427 my %constructor_fields; # fields that are to be used in constructors; see
1428 # below
1429
1430 # The values of this hash will be the package names as keys to other
1431 # hashes containing the name of each field in the package as keys, and
1432 # references to their respective hashes as values.
1433 my %package_fields;
1434
1435 sub setup_package {
1436 # Sets up the package, creating standard DESTROY and dump methods
1437 # (unless already defined). The dump method is used in debugging by
1438 # simple_dumper().
1439 # The optional parameters are:
1440 # a) a reference to a hash, that gets populated by later
1441 # set_access() calls with one of the accesses being
1442 # 'constructor'. The caller can then refer to this, but it is
1443 # not otherwise used by these two routines.
1444 # b) a reference to a callback routine to call during destruction
1445 # of the object, before any fields are actually destroyed
1446
1447 my %args = @_;
1448 my $constructor_ref = delete $args{'Constructor_Fields'};
1449 my $destroy_callback = delete $args{'Destroy_Callback'};
1450 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1451
1452 my %fields;
1453 my $package = (caller)[0];
1454
1455 $package_fields{$package} = \%fields;
1456 $constructor_fields{$package} = $constructor_ref;
1457
1458 unless ($package->can('DESTROY')) {
1459 my $destroy_name = "${package}::DESTROY";
1460 no strict "refs";
1461
1462 # Use typeglob to give the anonymous subroutine the name we want
1463 *$destroy_name = sub {
1464 my $self = shift;
ffe43484 1465 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1466
1467 $self->$destroy_callback if $destroy_callback;
1468 foreach my $field (keys %{$package_fields{$package}}) {
1469 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1470 delete $package_fields{$package}{$field}{$addr};
1471 }
1472 return;
1473 }
1474 }
1475
1476 unless ($package->can('dump')) {
1477 my $dump_name = "${package}::dump";
1478 no strict "refs";
1479 *$dump_name = sub {
1480 my $self = shift;
1481 return dump_inside_out($self, $package_fields{$package}, @_);
1482 }
1483 }
1484 return;
1485 }
1486
1487 sub set_access {
1488 # Arrange for the input field to be garbage collected when no longer
1489 # needed. Also, creates standard accessor functions for the field
1490 # based on the optional parameters-- none if none of these parameters:
1491 # 'addable' creates an 'add_NAME()' accessor function.
1492 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1493 # function.
1494 # 'settable' creates a 'set_NAME()' accessor function.
1495 # 'constructor' doesn't create an accessor function, but adds the
1496 # field to the hash that was previously passed to
1497 # setup_package();
1498 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1499 # 'add' etc. all mean 'addable'.
1500 # The read accessor function will work on both array and scalar
1501 # values. If another accessor in the parameter list is 'a', the read
1502 # access assumes an array. You can also force it to be array access
1503 # by specifying 'readable_array' instead of 'readable'
1504 #
1505 # A sort-of 'protected' access can be set-up by preceding the addable,
1506 # readable or settable with some initial portion of 'protected_' (but,
1507 # the underscore is required), like 'p_a', 'pro_set', etc. The
1508 # "protection" is only by convention. All that happens is that the
1509 # accessor functions' names begin with an underscore. So instead of
1510 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1511 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1512 # end of each package, and then storing the __LINE__ ranges and
1513 # checking them on every accessor. But that is way overkill.)
1514
1515 # We create anonymous subroutines as the accessors and then use
1516 # typeglobs to assign them to the proper package and name
1517
1518 my $name = shift; # Name of the field
1519 my $field = shift; # Reference to the inside-out hash containing the
1520 # field
1521
1522 my $package = (caller)[0];
1523
1524 if (! exists $package_fields{$package}) {
1525 croak "$0: Must call 'setup_package' before 'set_access'";
1526 }
d73e5302 1527
99870f4d
KW
1528 # Stash the field so DESTROY can get it.
1529 $package_fields{$package}{$name} = $field;
cf25bb62 1530
99870f4d
KW
1531 # Remaining arguments are the accessors. For each...
1532 foreach my $access (@_) {
1533 my $access = lc $access;
cf25bb62 1534
99870f4d 1535 my $protected = "";
cf25bb62 1536
99870f4d
KW
1537 # Match the input as far as it goes.
1538 if ($access =~ /^(p[^_]*)_/) {
1539 $protected = $1;
1540 if (substr('protected_', 0, length $protected)
1541 eq $protected)
1542 {
1543
1544 # Add 1 for the underscore not included in $protected
1545 $access = substr($access, length($protected) + 1);
1546 $protected = '_';
1547 }
1548 else {
1549 $protected = "";
1550 }
1551 }
1552
1553 if (substr('addable', 0, length $access) eq $access) {
1554 my $subname = "${package}::${protected}add_$name";
1555 no strict "refs";
1556
1557 # add_ accessor. Don't add if already there, which we
1558 # determine using 'eq' for scalars and '==' otherwise.
1559 *$subname = sub {
1560 use strict "refs";
1561 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1562 my $self = shift;
1563 my $value = shift;
ffe43484 1564 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1565 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1566 if (ref $value) {
f998e60c 1567 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1568 }
1569 else {
f998e60c 1570 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1571 }
f998e60c 1572 push @{$field->{$addr}}, $value;
99870f4d
KW
1573 return;
1574 }
1575 }
1576 elsif (substr('constructor', 0, length $access) eq $access) {
1577 if ($protected) {
1578 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1579 }
1580 else {
1581 $constructor_fields{$package}{$name} = $field;
1582 }
1583 }
1584 elsif (substr('readable_array', 0, length $access) eq $access) {
1585
1586 # Here has read access. If one of the other parameters for
1587 # access is array, or this one specifies array (by being more
1588 # than just 'readable_'), then create a subroutine that
1589 # assumes the data is an array. Otherwise just a scalar
1590 my $subname = "${package}::${protected}$name";
1591 if (grep { /^a/i } @_
1592 or length($access) > length('readable_'))
1593 {
1594 no strict "refs";
1595 *$subname = sub {
1596 use strict "refs";
23e33b60 1597 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1598 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1599 if (ref $field->{$addr} ne 'ARRAY') {
1600 my $type = ref $field->{$addr};
1601 $type = 'scalar' unless $type;
1602 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1603 return;
1604 }
1605 return scalar @{$field->{$addr}} unless wantarray;
1606
1607 # Make a copy; had problems with caller modifying the
1608 # original otherwise
1609 my @return = @{$field->{$addr}};
1610 return @return;
1611 }
1612 }
1613 else {
1614
1615 # Here not an array value, a simpler function.
1616 no strict "refs";
1617 *$subname = sub {
1618 use strict "refs";
23e33b60 1619 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1620 no overloading;
051df77b 1621 return $field->{pack 'J', $_[0]};
99870f4d
KW
1622 }
1623 }
1624 }
1625 elsif (substr('settable', 0, length $access) eq $access) {
1626 my $subname = "${package}::${protected}set_$name";
1627 no strict "refs";
1628 *$subname = sub {
1629 use strict "refs";
23e33b60
KW
1630 if (main::DEBUG) {
1631 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1632 Carp::carp_extra_args(\@_) if @_ > 2;
1633 }
1634 # $self is $_[0]; $value is $_[1]
f998e60c 1635 no overloading;
051df77b 1636 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1637 return;
1638 }
1639 }
1640 else {
1641 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1642 }
cf25bb62 1643 }
99870f4d 1644 return;
cf25bb62 1645 }
99870f4d
KW
1646}
1647
1648package Input_file;
1649
1650# All input files use this object, which stores various attributes about them,
1651# and provides for convenient, uniform handling. The run method wraps the
1652# processing. It handles all the bookkeeping of opening, reading, and closing
1653# the file, returning only significant input lines.
1654#
1655# Each object gets a handler which processes the body of the file, and is
1656# called by run(). Most should use the generic, default handler, which has
1657# code scrubbed to handle things you might not expect. A handler should
1658# basically be a while(next_line()) {...} loop.
1659#
1660# You can also set up handlers to
1661# 1) call before the first line is read for pre processing
1662# 2) call to adjust each line of the input before the main handler gets them
1663# 3) call upon EOF before the main handler exits its loop
1664# 4) call at the end for post processing
1665#
1666# $_ is used to store the input line, and is to be filtered by the
1667# each_line_handler()s. So, if the format of the line is not in the desired
1668# format for the main handler, these are used to do that adjusting. They can
1669# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1670# so the $_ output of one is used as the input to the next. None of the other
1671# handlers are stackable, but could easily be changed to be so.
1672#
1673# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1674# which insert the parameters as lines to be processed before the next input
1675# file line is read. This allows the EOF handler to flush buffers, for
1676# example. The difference between the two routines is that the lines inserted
1677# by insert_lines() are subjected to the each_line_handler()s. (So if you
1678# called it from such a handler, you would get infinite recursion.) Lines
1679# inserted by insert_adjusted_lines() go directly to the main handler without
1680# any adjustments. If the post-processing handler calls any of these, there
1681# will be no effect. Some error checking for these conditions could be added,
1682# but it hasn't been done.
1683#
1684# carp_bad_line() should be called to warn of bad input lines, which clears $_
1685# to prevent further processing of the line. This routine will output the
1686# message as a warning once, and then keep a count of the lines that have the
1687# same message, and output that count at the end of the file's processing.
1688# This keeps the number of messages down to a manageable amount.
1689#
1690# get_missings() should be called to retrieve any @missing input lines.
1691# Messages will be raised if this isn't done if the options aren't to ignore
1692# missings.
1693
1694sub trace { return main::trace(@_); }
1695
99870f4d
KW
1696{ # Closure
1697 # Keep track of fields that are to be put into the constructor.
1698 my %constructor_fields;
1699
1700 main::setup_package(Constructor_Fields => \%constructor_fields);
1701
1702 my %file; # Input file name, required
1703 main::set_access('file', \%file, qw{ c r });
1704
1705 my %first_released; # Unicode version file was first released in, required
1706 main::set_access('first_released', \%first_released, qw{ c r });
1707
1708 my %handler; # Subroutine to process the input file, defaults to
1709 # 'process_generic_property_file'
1710 main::set_access('handler', \%handler, qw{ c });
1711
1712 my %property;
1713 # name of property this file is for. defaults to none, meaning not
1714 # applicable, or is otherwise determinable, for example, from each line.
1715 main::set_access('property', \%property, qw{ c });
1716
1717 my %optional;
1718 # If this is true, the file is optional. If not present, no warning is
1719 # output. If it is present, the string given by this parameter is
1720 # evaluated, and if false the file is not processed.
1721 main::set_access('optional', \%optional, 'c', 'r');
1722
1723 my %non_skip;
1724 # This is used for debugging, to skip processing of all but a few input
1725 # files. Add 'non_skip => 1' to the constructor for those files you want
1726 # processed when you set the $debug_skip global.
1727 main::set_access('non_skip', \%non_skip, 'c');
1728
37e2e78e
KW
1729 my %skip;
1730 # This is used to skip processing of this input file semi-permanently.
1731 # It is used for files that we aren't planning to process anytime soon,
1732 # but want to allow to be in the directory and not raise a message that we
1733 # are not handling. Mostly for test files. This is in contrast to the
1734 # non_skip element, which is supposed to be used very temporarily for
1735 # debugging. Sets 'optional' to 1
1736 main::set_access('skip', \%skip, 'c');
1737
99870f4d
KW
1738 my %each_line_handler;
1739 # list of subroutines to look at and filter each non-comment line in the
1740 # file. defaults to none. The subroutines are called in order, each is
1741 # to adjust $_ for the next one, and the final one adjusts it for
1742 # 'handler'
1743 main::set_access('each_line_handler', \%each_line_handler, 'c');
1744
1745 my %has_missings_defaults;
1746 # ? Are there lines in the file giving default values for code points
1747 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1748 # the norm, but IGNORED means it has such lines, but the handler doesn't
1749 # use them. Having these three states allows us to catch changes to the
1750 # UCD that this program should track
1751 main::set_access('has_missings_defaults',
1752 \%has_missings_defaults, qw{ c r });
1753
1754 my %pre_handler;
1755 # Subroutine to call before doing anything else in the file. If undef, no
1756 # such handler is called.
1757 main::set_access('pre_handler', \%pre_handler, qw{ c });
1758
1759 my %eof_handler;
1760 # Subroutine to call upon getting an EOF on the input file, but before
1761 # that is returned to the main handler. This is to allow buffers to be
1762 # flushed. The handler is expected to call insert_lines() or
1763 # insert_adjusted() with the buffered material
1764 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1765
1766 my %post_handler;
1767 # Subroutine to call after all the lines of the file are read in and
1768 # processed. If undef, no such handler is called.
1769 main::set_access('post_handler', \%post_handler, qw{ c });
1770
1771 my %progress_message;
1772 # Message to print to display progress in lieu of the standard one
1773 main::set_access('progress_message', \%progress_message, qw{ c });
1774
1775 my %handle;
1776 # cache open file handle, internal. Is undef if file hasn't been
1777 # processed at all, empty if has;
1778 main::set_access('handle', \%handle);
1779
1780 my %added_lines;
1781 # cache of lines added virtually to the file, internal
1782 main::set_access('added_lines', \%added_lines);
1783
1784 my %errors;
1785 # cache of errors found, internal
1786 main::set_access('errors', \%errors);
1787
1788 my %missings;
1789 # storage of '@missing' defaults lines
1790 main::set_access('missings', \%missings);
1791
1792 sub new {
1793 my $class = shift;
1794
1795 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 1796 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1797
1798 # Set defaults
1799 $handler{$addr} = \&main::process_generic_property_file;
1800 $non_skip{$addr} = 0;
37e2e78e 1801 $skip{$addr} = 0;
99870f4d
KW
1802 $has_missings_defaults{$addr} = $NO_DEFAULTS;
1803 $handle{$addr} = undef;
1804 $added_lines{$addr} = [ ];
1805 $each_line_handler{$addr} = [ ];
1806 $errors{$addr} = { };
1807 $missings{$addr} = [ ];
1808
1809 # Two positional parameters.
99f78760 1810 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
1811 $file{$addr} = main::internal_file_to_platform(shift);
1812 $first_released{$addr} = shift;
1813
1814 # The rest of the arguments are key => value pairs
1815 # %constructor_fields has been set up earlier to list all possible
1816 # ones. Either set or push, depending on how the default has been set
1817 # up just above.
1818 my %args = @_;
1819 foreach my $key (keys %args) {
1820 my $argument = $args{$key};
1821
1822 # Note that the fields are the lower case of the constructor keys
1823 my $hash = $constructor_fields{lc $key};
1824 if (! defined $hash) {
1825 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
1826 next;
1827 }
1828 if (ref $hash->{$addr} eq 'ARRAY') {
1829 if (ref $argument eq 'ARRAY') {
1830 foreach my $argument (@{$argument}) {
1831 next if ! defined $argument;
1832 push @{$hash->{$addr}}, $argument;
1833 }
1834 }
1835 else {
1836 push @{$hash->{$addr}}, $argument if defined $argument;
1837 }
1838 }
1839 else {
1840 $hash->{$addr} = $argument;
1841 }
1842 delete $args{$key};
1843 };
1844
1845 # If the file has a property for it, it means that the property is not
1846 # listed in the file's entries. So add a handler to the list of line
1847 # handlers to insert the property name into the lines, to provide a
1848 # uniform interface to the final processing subroutine.
1849 # the final code doesn't have to worry about that.
1850 if ($property{$addr}) {
1851 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1852 }
1853
1854 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1855 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 1856 }
99870f4d 1857
37e2e78e
KW
1858 $optional{$addr} = 1 if $skip{$addr};
1859
99870f4d 1860 return $self;
d73e5302
JH
1861 }
1862
cf25bb62 1863
99870f4d
KW
1864 use overload
1865 fallback => 0,
1866 qw("") => "_operator_stringify",
1867 "." => \&main::_operator_dot,
1868 ;
cf25bb62 1869
99870f4d
KW
1870 sub _operator_stringify {
1871 my $self = shift;
cf25bb62 1872
99870f4d 1873 return __PACKAGE__ . " object for " . $self->file;
d73e5302 1874 }
d73e5302 1875
99870f4d
KW
1876 # flag to make sure extracted files are processed early
1877 my $seen_non_extracted_non_age = 0;
d73e5302 1878
99870f4d
KW
1879 sub run {
1880 # Process the input object $self. This opens and closes the file and
1881 # calls all the handlers for it. Currently, this can only be called
1882 # once per file, as it destroy's the EOF handler
d73e5302 1883
99870f4d
KW
1884 my $self = shift;
1885 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 1886
ffe43484 1887 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 1888
99870f4d 1889 my $file = $file{$addr};
d73e5302 1890
99870f4d
KW
1891 # Don't process if not expecting this file (because released later
1892 # than this Unicode version), and isn't there. This means if someone
1893 # copies it into an earlier version's directory, we will go ahead and
1894 # process it.
1895 return if $first_released{$addr} gt $v_version && ! -e $file;
1896
1897 # If in debugging mode and this file doesn't have the non-skip
1898 # flag set, and isn't one of the critical files, skip it.
1899 if ($debug_skip
1900 && $first_released{$addr} ne v0
1901 && ! $non_skip{$addr})
1902 {
1903 print "Skipping $file in debugging\n" if $verbosity;
1904 return;
1905 }
1906
1907 # File could be optional
37e2e78e 1908 if ($optional{$addr}) {
99870f4d
KW
1909 return unless -e $file;
1910 my $result = eval $optional{$addr};
1911 if (! defined $result) {
1912 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
1913 return;
1914 }
1915 if (! $result) {
1916 if ($verbosity) {
1917 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1918 }
1919 return;
1920 }
1921 }
1922
1923 if (! defined $file || ! -e $file) {
1924
1925 # If the file doesn't exist, see if have internal data for it
1926 # (based on first_released being 0).
1927 if ($first_released{$addr} eq v0) {
1928 $handle{$addr} = 'pretend_is_open';
1929 }
1930 else {
1931 if (! $optional{$addr} # File could be optional
1932 && $v_version ge $first_released{$addr})
1933 {
1934 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1935 }
1936 return;
1937 }
1938 }
1939 else {
1940
37e2e78e
KW
1941 # Here, the file exists. Some platforms may change the case of
1942 # its name
99870f4d 1943 if ($seen_non_extracted_non_age) {
517956bf 1944 if ($file =~ /$EXTRACTED/i) {
99870f4d 1945 Carp::my_carp_bug(join_lines(<<END
99f78760 1946$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
1947anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
1948have subtle problems
1949END
1950 ));
1951 }
1952 }
1953 elsif ($EXTRACTED_DIR
1954 && $first_released{$addr} ne v0
517956bf
CB
1955 && $file !~ /$EXTRACTED/i
1956 && lc($file) ne 'dage.txt')
99870f4d
KW
1957 {
1958 # We don't set this (by the 'if' above) if we have no
1959 # extracted directory, so if running on an early version,
1960 # this test won't work. Not worth worrying about.
1961 $seen_non_extracted_non_age = 1;
1962 }
1963
1964 # And mark the file as having being processed, and warn if it
1965 # isn't a file we are expecting. As we process the files,
1966 # they are deleted from the hash, so any that remain at the
1967 # end of the program are files that we didn't process.
517956bf
CB
1968 my $fkey = File::Spec->rel2abs($file);
1969 my $expecting = delete $potential_files{$fkey};
1970 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
678f13d5
KW
1971 Carp::my_carp("Was not expecting '$file'.") if
1972 ! $expecting
99870f4d
KW
1973 && ! defined $handle{$addr};
1974
37e2e78e
KW
1975 # Having deleted from expected files, we can quit if not to do
1976 # anything. Don't print progress unless really want verbosity
1977 if ($skip{$addr}) {
1978 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
1979 return;
1980 }
1981
99870f4d
KW
1982 # Open the file, converting the slashes used in this program
1983 # into the proper form for the OS
1984 my $file_handle;
1985 if (not open $file_handle, "<", $file) {
1986 Carp::my_carp("Can't open $file. Skipping: $!");
1987 return 0;
1988 }
1989 $handle{$addr} = $file_handle; # Cache the open file handle
1990 }
1991
1992 if ($verbosity >= $PROGRESS) {
1993 if ($progress_message{$addr}) {
1994 print "$progress_message{$addr}\n";
1995 }
1996 else {
1997 # If using a virtual file, say so.
1998 print "Processing ", (-e $file)
1999 ? $file
2000 : "substitute $file",
2001 "\n";
2002 }
2003 }
2004
2005
2006 # Call any special handler for before the file.
2007 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2008
2009 # Then the main handler
2010 &{$handler{$addr}}($self);
2011
2012 # Then any special post-file handler.
2013 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2014
2015 # If any errors have been accumulated, output the counts (as the first
2016 # error message in each class was output when it was encountered).
2017 if ($errors{$addr}) {
2018 my $total = 0;
2019 my $types = 0;
2020 foreach my $error (keys %{$errors{$addr}}) {
2021 $total += $errors{$addr}->{$error};
2022 delete $errors{$addr}->{$error};
2023 $types++;
2024 }
2025 if ($total > 1) {
2026 my $message
2027 = "A total of $total lines had errors in $file. ";
2028
2029 $message .= ($types == 1)
2030 ? '(Only the first one was displayed.)'
2031 : '(Only the first of each type was displayed.)';
2032 Carp::my_carp($message);
2033 }
2034 }
2035
2036 if (@{$missings{$addr}}) {
2037 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2038 }
2039
2040 # If a real file handle, close it.
2041 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2042 ref $handle{$addr};
2043 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2044 # the file, as opposed to undef
2045 return;
2046 }
2047
2048 sub next_line {
2049 # Sets $_ to be the next logical input line, if any. Returns non-zero
2050 # if such a line exists. 'logical' means that any lines that have
2051 # been added via insert_lines() will be returned in $_ before the file
2052 # is read again.
2053
2054 my $self = shift;
2055 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2056
ffe43484 2057 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2058
2059 # Here the file is open (or if the handle is not a ref, is an open
2060 # 'virtual' file). Get the next line; any inserted lines get priority
2061 # over the file itself.
2062 my $adjusted;
2063
2064 LINE:
2065 while (1) { # Loop until find non-comment, non-empty line
2066 #local $to_trace = 1 if main::DEBUG;
2067 my $inserted_ref = shift @{$added_lines{$addr}};
2068 if (defined $inserted_ref) {
2069 ($adjusted, $_) = @{$inserted_ref};
2070 trace $adjusted, $_ if main::DEBUG && $to_trace;
2071 return 1 if $adjusted;
2072 }
2073 else {
2074 last if ! ref $handle{$addr}; # Don't read unless is real file
2075 last if ! defined ($_ = readline $handle{$addr});
2076 }
2077 chomp;
2078 trace $_ if main::DEBUG && $to_trace;
2079
2080 # See if this line is the comment line that defines what property
2081 # value that code points that are not listed in the file should
2082 # have. The format or existence of these lines is not guaranteed
2083 # by Unicode since they are comments, but the documentation says
2084 # that this was added for machine-readability, so probably won't
2085 # change. This works starting in Unicode Version 5.0. They look
2086 # like:
2087 #
2088 # @missing: 0000..10FFFF; Not_Reordered
2089 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2090 # @missing: 0000..10FFFF; ; NaN
2091 #
2092 # Save the line for a later get_missings() call.
2093 if (/$missing_defaults_prefix/) {
2094 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2095 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2096 }
2097 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2098 my @defaults = split /\s* ; \s*/x, $_;
2099
2100 # The first field is the @missing, which ends in a
2101 # semi-colon, so can safely shift.
2102 shift @defaults;
2103
2104 # Some of these lines may have empty field placeholders
2105 # which get in the way. An example is:
2106 # @missing: 0000..10FFFF; ; NaN
2107 # Remove them. Process starting from the top so the
2108 # splice doesn't affect things still to be looked at.
2109 for (my $i = @defaults - 1; $i >= 0; $i--) {
2110 next if $defaults[$i] ne "";
2111 splice @defaults, $i, 1;
2112 }
2113
2114 # What's left should be just the property (maybe) and the
2115 # default. Having only one element means it doesn't have
2116 # the property.
2117 my $default;
2118 my $property;
2119 if (@defaults >= 1) {
2120 if (@defaults == 1) {
2121 $default = $defaults[0];
2122 }
2123 else {
2124 $property = $defaults[0];
2125 $default = $defaults[1];
2126 }
2127 }
2128
2129 if (@defaults < 1
2130 || @defaults > 2
2131 || ($default =~ /^</
2132 && $default !~ /^<code *point>$/i
2133 && $default !~ /^<none>$/i))
2134 {
2135 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2136 }
2137 else {
2138
2139 # If the property is missing from the line, it should
2140 # be the one for the whole file
2141 $property = $property{$addr} if ! defined $property;
2142
2143 # Change <none> to the null string, which is what it
2144 # really means. If the default is the code point
2145 # itself, set it to <code point>, which is what
2146 # Unicode uses (but sometimes they've forgotten the
2147 # space)
2148 if ($default =~ /^<none>$/i) {
2149 $default = "";
2150 }
2151 elsif ($default =~ /^<code *point>$/i) {
2152 $default = $CODE_POINT;
2153 }
2154
2155 # Store them as a sub-arrays with both components.
2156 push @{$missings{$addr}}, [ $default, $property ];
2157 }
2158 }
2159
2160 # There is nothing for the caller to process on this comment
2161 # line.
2162 next;
2163 }
2164
2165 # Remove comments and trailing space, and skip this line if the
2166 # result is empty
2167 s/#.*//;
2168 s/\s+$//;
2169 next if /^$/;
2170
2171 # Call any handlers for this line, and skip further processing of
2172 # the line if the handler sets the line to null.
2173 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2174 &{$sub_ref}($self);
2175 next LINE if /^$/;
2176 }
2177
2178 # Here the line is ok. return success.
2179 return 1;
2180 } # End of looping through lines.
2181
2182 # If there is an EOF handler, call it (only once) and if it generates
2183 # more lines to process go back in the loop to handle them.
2184 if ($eof_handler{$addr}) {
2185 &{$eof_handler{$addr}}($self);
2186 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2187 goto LINE if $added_lines{$addr};
2188 }
2189
2190 # Return failure -- no more lines.
2191 return 0;
2192
2193 }
2194
2195# Not currently used, not fully tested.
2196# sub peek {
2197# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2198# # record. Not callable from an each_line_handler(), nor does it call
2199# # an each_line_handler() on the line.
2200#
2201# my $self = shift;
ffe43484 2202# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2203#
2204# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2205# my ($adjusted, $line) = @{$inserted_ref};
2206# next if $adjusted;
2207#
2208# # Remove comments and trailing space, and return a non-empty
2209# # resulting line
2210# $line =~ s/#.*//;
2211# $line =~ s/\s+$//;
2212# return $line if $line ne "";
2213# }
2214#
2215# return if ! ref $handle{$addr}; # Don't read unless is real file
2216# while (1) { # Loop until find non-comment, non-empty line
2217# local $to_trace = 1 if main::DEBUG;
2218# trace $_ if main::DEBUG && $to_trace;
2219# return if ! defined (my $line = readline $handle{$addr});
2220# chomp $line;
2221# push @{$added_lines{$addr}}, [ 0, $line ];
2222#
2223# $line =~ s/#.*//;
2224# $line =~ s/\s+$//;
2225# return $line if $line ne "";
2226# }
2227#
2228# return;
2229# }
2230
2231
2232 sub insert_lines {
2233 # Lines can be inserted so that it looks like they were in the input
2234 # file at the place it was when this routine is called. See also
2235 # insert_adjusted_lines(). Lines inserted via this routine go through
2236 # any each_line_handler()
2237
2238 my $self = shift;
2239
2240 # Each inserted line is an array, with the first element being 0 to
2241 # indicate that this line hasn't been adjusted, and needs to be
2242 # processed.
f998e60c 2243 no overloading;
051df77b 2244 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2245 return;
2246 }
2247
2248 sub insert_adjusted_lines {
2249 # Lines can be inserted so that it looks like they were in the input
2250 # file at the place it was when this routine is called. See also
2251 # insert_lines(). Lines inserted via this routine are already fully
2252 # adjusted, ready to be processed; each_line_handler()s handlers will
2253 # not be called. This means this is not a completely general
2254 # facility, as only the last each_line_handler on the stack should
2255 # call this. It could be made more general, by passing to each of the
2256 # line_handlers their position on the stack, which they would pass on
2257 # to this routine, and that would replace the boolean first element in
2258 # the anonymous array pushed here, so that the next_line routine could
2259 # use that to call only those handlers whose index is after it on the
2260 # stack. But this is overkill for what is needed now.
2261
2262 my $self = shift;
2263 trace $_[0] if main::DEBUG && $to_trace;
2264
2265 # Each inserted line is an array, with the first element being 1 to
2266 # indicate that this line has been adjusted
f998e60c 2267 no overloading;
051df77b 2268 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2269 return;
2270 }
2271
2272 sub get_missings {
2273 # Returns the stored up @missings lines' values, and clears the list.
2274 # The values are in an array, consisting of the default in the first
2275 # element, and the property in the 2nd. However, since these lines
2276 # can be stacked up, the return is an array of all these arrays.
2277
2278 my $self = shift;
2279 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2280
ffe43484 2281 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2282
2283 # If not accepting a list return, just return the first one.
2284 return shift @{$missings{$addr}} unless wantarray;
2285
2286 my @return = @{$missings{$addr}};
2287 undef @{$missings{$addr}};
2288 return @return;
2289 }
2290
2291 sub _insert_property_into_line {
2292 # Add a property field to $_, if this file requires it.
2293
f998e60c 2294 my $self = shift;
ffe43484 2295 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2296 my $property = $property{$addr};
99870f4d
KW
2297 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2298
2299 $_ =~ s/(;|$)/; $property$1/;
2300 return;
2301 }
2302
2303 sub carp_bad_line {
2304 # Output consistent error messages, using either a generic one, or the
2305 # one given by the optional parameter. To avoid gazillions of the
2306 # same message in case the syntax of a file is way off, this routine
2307 # only outputs the first instance of each message, incrementing a
2308 # count so the totals can be output at the end of the file.
2309
2310 my $self = shift;
2311 my $message = shift;
2312 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2313
ffe43484 2314 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2315
2316 $message = 'Unexpected line' unless $message;
2317
2318 # No trailing punctuation so as to fit with our addenda.
2319 $message =~ s/[.:;,]$//;
2320
2321 # If haven't seen this exact message before, output it now. Otherwise
2322 # increment the count of how many times it has occurred
2323 unless ($errors{$addr}->{$message}) {
2324 Carp::my_carp("$message in '$_' in "
f998e60c 2325 . $file{$addr}
99870f4d
KW
2326 . " at line $.. Skipping this line;");
2327 $errors{$addr}->{$message} = 1;
2328 }
2329 else {
2330 $errors{$addr}->{$message}++;
2331 }
2332
2333 # Clear the line to prevent any further (meaningful) processing of it.
2334 $_ = "";
2335
2336 return;
2337 }
2338} # End closure
2339
2340package Multi_Default;
2341
2342# Certain properties in early versions of Unicode had more than one possible
2343# default for code points missing from the files. In these cases, one
2344# default applies to everything left over after all the others are applied,
2345# and for each of the others, there is a description of which class of code
2346# points applies to it. This object helps implement this by storing the
2347# defaults, and for all but that final default, an eval string that generates
2348# the class that it applies to.
2349
2350
2351{ # Closure
2352
2353 main::setup_package();
2354
2355 my %class_defaults;
2356 # The defaults structure for the classes
2357 main::set_access('class_defaults', \%class_defaults);
2358
2359 my %other_default;
2360 # The default that applies to everything left over.
2361 main::set_access('other_default', \%other_default, 'r');
2362
2363
2364 sub new {
2365 # The constructor is called with default => eval pairs, terminated by
2366 # the left-over default. e.g.
2367 # Multi_Default->new(
2368 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2369 # - 0x200D',
2370 # 'R' => 'some other expression that evaluates to code points',
2371 # .
2372 # .
2373 # .
2374 # 'U'));
2375
2376 my $class = shift;
2377
2378 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2379 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2380
2381 while (@_ > 1) {
2382 my $default = shift;
2383 my $eval = shift;
2384 $class_defaults{$addr}->{$default} = $eval;
2385 }
2386
2387 $other_default{$addr} = shift;
2388
2389 return $self;
2390 }
2391
2392 sub get_next_defaults {
2393 # Iterates and returns the next class of defaults.
2394 my $self = shift;
2395 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2396
ffe43484 2397 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2398
2399 return each %{$class_defaults{$addr}};
2400 }
2401}
2402
2403package Alias;
2404
2405# An alias is one of the names that a table goes by. This class defines them
2406# including some attributes. Everything is currently setup in the
2407# constructor.
2408
2409
2410{ # Closure
2411
2412 main::setup_package();
2413
2414 my %name;
2415 main::set_access('name', \%name, 'r');
2416
2417 my %loose_match;
2418 # Determined by the constructor code if this name should match loosely or
2419 # not. The constructor parameters can override this, but it isn't fully
2420 # implemented, as should have ability to override Unicode one's via
2421 # something like a set_loose_match()
2422 main::set_access('loose_match', \%loose_match, 'r');
2423
2424 my %make_pod_entry;
2425 # Some aliases should not get their own entries because they are covered
2426 # by a wild-card, and some we want to discourage use of. Binary
2427 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2428
2429 my %status;
2430 # Aliases have a status, like deprecated, or even suppressed (which means
2431 # they don't appear in documentation). Enum
2432 main::set_access('status', \%status, 'r');
2433
2434 my %externally_ok;
2435 # Similarly, some aliases should not be considered as usable ones for
2436 # external use, such as file names, or we don't want documentation to
2437 # recommend them. Boolean
2438 main::set_access('externally_ok', \%externally_ok, 'r');
2439
2440 sub new {
2441 my $class = shift;
2442
2443 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2444 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2445
2446 $name{$addr} = shift;
2447 $loose_match{$addr} = shift;
2448 $make_pod_entry{$addr} = shift;
2449 $externally_ok{$addr} = shift;
2450 $status{$addr} = shift;
2451
2452 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2453
2454 # Null names are never ok externally
2455 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2456
2457 return $self;
2458 }
2459}
2460
2461package Range;
2462
2463# A range is the basic unit for storing code points, and is described in the
2464# comments at the beginning of the program. Each range has a starting code
2465# point; an ending code point (not less than the starting one); a value
2466# that applies to every code point in between the two end-points, inclusive;
2467# and an enum type that applies to the value. The type is for the user's
2468# convenience, and has no meaning here, except that a non-zero type is
2469# considered to not obey the normal Unicode rules for having standard forms.
2470#
2471# The same structure is used for both map and match tables, even though in the
2472# latter, the value (and hence type) is irrelevant and could be used as a
2473# comment. In map tables, the value is what all the code points in the range
2474# map to. Type 0 values have the standardized version of the value stored as
2475# well, so as to not have to recalculate it a lot.
2476
2477sub trace { return main::trace(@_); }
2478
2479{ # Closure
2480
2481 main::setup_package();
2482
2483 my %start;
2484 main::set_access('start', \%start, 'r', 's');
2485
2486 my %end;
2487 main::set_access('end', \%end, 'r', 's');
2488
2489 my %value;
2490 main::set_access('value', \%value, 'r');
2491
2492 my %type;
2493 main::set_access('type', \%type, 'r');
2494
2495 my %standard_form;
2496 # The value in internal standard form. Defined only if the type is 0.
2497 main::set_access('standard_form', \%standard_form);
2498
2499 # Note that if these fields change, the dump() method should as well
2500
2501 sub new {
2502 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2503 my $class = shift;
2504
2505 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2506 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2507
2508 $start{$addr} = shift;
2509 $end{$addr} = shift;
2510
2511 my %args = @_;
2512
2513 my $value = delete $args{'Value'}; # Can be 0
2514 $value = "" unless defined $value;
2515 $value{$addr} = $value;
2516
2517 $type{$addr} = delete $args{'Type'} || 0;
2518
2519 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2520
2521 if (! $type{$addr}) {
2522 $standard_form{$addr} = main::standardize($value);
2523 }
2524
2525 return $self;
2526 }
2527
2528 use overload
2529 fallback => 0,
2530 qw("") => "_operator_stringify",
2531 "." => \&main::_operator_dot,
2532 ;
2533
2534 sub _operator_stringify {
2535 my $self = shift;
ffe43484 2536 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2537
2538 # Output it like '0041..0065 (value)'
2539 my $return = sprintf("%04X", $start{$addr})
2540 . '..'
2541 . sprintf("%04X", $end{$addr});
2542 my $value = $value{$addr};
2543 my $type = $type{$addr};
2544 $return .= ' (';
2545 $return .= "$value";
2546 $return .= ", Type=$type" if $type != 0;
2547 $return .= ')';
2548
2549 return $return;
2550 }
2551
2552 sub standard_form {
2553 # The standard form is the value itself if the standard form is
2554 # undefined (that is if the value is special)
2555
2556 my $self = shift;
2557 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2558
ffe43484 2559 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2560
2561 return $standard_form{$addr} if defined $standard_form{$addr};
2562 return $value{$addr};
2563 }
2564
2565 sub dump {
2566 # Human, not machine readable. For machine readable, comment out this
2567 # entire routine and let the standard one take effect.
2568 my $self = shift;
2569 my $indent = shift;
2570 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2571
ffe43484 2572 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2573
2574 my $return = $indent
2575 . sprintf("%04X", $start{$addr})
2576 . '..'
2577 . sprintf("%04X", $end{$addr})
2578 . " '$value{$addr}';";
2579 if (! defined $standard_form{$addr}) {
2580 $return .= "(type=$type{$addr})";
2581 }
2582 elsif ($standard_form{$addr} ne $value{$addr}) {
2583 $return .= "(standard '$standard_form{$addr}')";
2584 }
2585 return $return;
2586 }
2587} # End closure
2588
2589package _Range_List_Base;
2590
2591# Base class for range lists. A range list is simply an ordered list of
2592# ranges, so that the ranges with the lowest starting numbers are first in it.
2593#
2594# When a new range is added that is adjacent to an existing range that has the
2595# same value and type, it merges with it to form a larger range.
2596#
2597# Ranges generally do not overlap, except that there can be multiple entries
2598# of single code point ranges. This is because of NameAliases.txt.
2599#
2600# In this program, there is a standard value such that if two different
2601# values, have the same standard value, they are considered equivalent. This
2602# value was chosen so that it gives correct results on Unicode data
2603
2604# There are a number of methods to manipulate range lists, and some operators
2605# are overloaded to handle them.
2606
99870f4d
KW
2607sub trace { return main::trace(@_); }
2608
2609{ # Closure
2610
2611 our $addr;
2612
2613 main::setup_package();
2614
2615 my %ranges;
2616 # The list of ranges
2617 main::set_access('ranges', \%ranges, 'readable_array');
2618
2619 my %max;
2620 # The highest code point in the list. This was originally a method, but
2621 # actual measurements said it was used a lot.
2622 main::set_access('max', \%max, 'r');
2623
2624 my %each_range_iterator;
2625 # Iterator position for each_range()
2626 main::set_access('each_range_iterator', \%each_range_iterator);
2627
2628 my %owner_name_of;
2629 # Name of parent this is attached to, if any. Solely for better error
2630 # messages.
2631 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2632
2633 my %_search_ranges_cache;
2634 # A cache of the previous result from _search_ranges(), for better
2635 # performance
2636 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2637
2638 sub new {
2639 my $class = shift;
2640 my %args = @_;
2641
2642 # Optional initialization data for the range list.
2643 my $initialize = delete $args{'Initialize'};
2644
2645 my $self;
2646
2647 # Use _union() to initialize. _union() returns an object of this
2648 # class, which means that it will call this constructor recursively.
2649 # But it won't have this $initialize parameter so that it won't
2650 # infinitely loop on this.
2651 return _union($class, $initialize, %args) if defined $initialize;
2652
2653 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2654 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2655
2656 # Optional parent object, only for debug info.
2657 $owner_name_of{$addr} = delete $args{'Owner'};
2658 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2659
2660 # Stringify, in case it is an object.
2661 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2662
2663 # This is used only for error messages, and so a colon is added
2664 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2665
2666 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2667
2668 # Max is initialized to a negative value that isn't adjacent to 0,
2669 # for simpler tests
2670 $max{$addr} = -2;
2671
2672 $_search_ranges_cache{$addr} = 0;
2673 $ranges{$addr} = [];
2674
2675 return $self;
2676 }
2677
2678 use overload
2679 fallback => 0,
2680 qw("") => "_operator_stringify",
2681 "." => \&main::_operator_dot,
2682 ;
2683
2684 sub _operator_stringify {
2685 my $self = shift;
ffe43484 2686 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2687
2688 return "Range_List attached to '$owner_name_of{$addr}'"
2689 if $owner_name_of{$addr};
2690 return "anonymous Range_List " . \$self;
2691 }
2692
2693 sub _union {
2694 # Returns the union of the input code points. It can be called as
2695 # either a constructor or a method. If called as a method, the result
2696 # will be a new() instance of the calling object, containing the union
2697 # of that object with the other parameter's code points; if called as
2698 # a constructor, the first parameter gives the class the new object
2699 # should be, and the second parameter gives the code points to go into
2700 # it.
2701 # In either case, there are two parameters looked at by this routine;
2702 # any additional parameters are passed to the new() constructor.
2703 #
2704 # The code points can come in the form of some object that contains
2705 # ranges, and has a conventionally named method to access them; or
2706 # they can be an array of individual code points (as integers); or
2707 # just a single code point.
2708 #
2709 # If they are ranges, this routine doesn't make any effort to preserve
2710 # the range values of one input over the other. Therefore this base
2711 # class should not allow _union to be called from other than
2712 # initialization code, so as to prevent two tables from being added
2713 # together where the range values matter. The general form of this
2714 # routine therefore belongs in a derived class, but it was moved here
2715 # to avoid duplication of code. The failure to overload this in this
2716 # class keeps it safe.
2717 #
2718
2719 my $self;
2720 my @args; # Arguments to pass to the constructor
2721
2722 my $class = shift;
2723
2724 # If a method call, will start the union with the object itself, and
2725 # the class of the new object will be the same as self.
2726 if (ref $class) {
2727 $self = $class;
2728 $class = ref $self;
2729 push @args, $self;
2730 }
2731
2732 # Add the other required parameter.
2733 push @args, shift;
2734 # Rest of parameters are passed on to the constructor
2735
2736 # Accumulate all records from both lists.
2737 my @records;
2738 for my $arg (@args) {
2739 #local $to_trace = 0 if main::DEBUG;
2740 trace "argument = $arg" if main::DEBUG && $to_trace;
2741 if (! defined $arg) {
2742 my $message = "";
2743 if (defined $self) {
f998e60c 2744 no overloading;
051df77b 2745 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2746 }
2747 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2748 return;
2749 }
2750 $arg = [ $arg ] if ! ref $arg;
2751 my $type = ref $arg;
2752 if ($type eq 'ARRAY') {
2753 foreach my $element (@$arg) {
2754 push @records, Range->new($element, $element);
2755 }
2756 }
2757 elsif ($arg->isa('Range')) {
2758 push @records, $arg;
2759 }
2760 elsif ($arg->can('ranges')) {
2761 push @records, $arg->ranges;
2762 }
2763 else {
2764 my $message = "";
2765 if (defined $self) {
f998e60c 2766 no overloading;
051df77b 2767 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2768 }
2769 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2770 return;
2771 }
2772 }
2773
2774 # Sort with the range containing the lowest ordinal first, but if
2775 # two ranges start at the same code point, sort with the bigger range
2776 # of the two first, because it takes fewer cycles.
2777 @records = sort { ($a->start <=> $b->start)
2778 or
2779 # if b is shorter than a, b->end will be
2780 # less than a->end, and we want to select
2781 # a, so want to return -1
2782 ($b->end <=> $a->end)
2783 } @records;
2784
2785 my $new = $class->new(@_);
2786
2787 # Fold in records so long as they add new information.
2788 for my $set (@records) {
2789 my $start = $set->start;
2790 my $end = $set->end;
2791 my $value = $set->value;
2792 if ($start > $new->max) {
2793 $new->_add_delete('+', $start, $end, $value);
2794 }
2795 elsif ($end > $new->max) {
2796 $new->_add_delete('+', $new->max +1, $end, $value);
2797 }
2798 }
2799
2800 return $new;
2801 }
2802
2803 sub range_count { # Return the number of ranges in the range list
2804 my $self = shift;
2805 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2806
f998e60c 2807 no overloading;
051df77b 2808 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
2809 }
2810
2811 sub min {
2812 # Returns the minimum code point currently in the range list, or if
2813 # the range list is empty, 2 beyond the max possible. This is a
2814 # method because used so rarely, that not worth saving between calls,
2815 # and having to worry about changing it as ranges are added and
2816 # deleted.
2817
2818 my $self = shift;
2819 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2820
ffe43484 2821 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2822
2823 # If the range list is empty, return a large value that isn't adjacent
2824 # to any that could be in the range list, for simpler tests
2825 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2826 return $ranges{$addr}->[0]->start;
2827 }
2828
2829 sub contains {
2830 # Boolean: Is argument in the range list? If so returns $i such that:
2831 # range[$i]->end < $codepoint <= range[$i+1]->end
2832 # which is one beyond what you want; this is so that the 0th range
2833 # doesn't return false
2834 my $self = shift;
2835 my $codepoint = shift;
2836 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2837
99870f4d
KW
2838 my $i = $self->_search_ranges($codepoint);
2839 return 0 unless defined $i;
2840
2841 # The search returns $i, such that
2842 # range[$i-1]->end < $codepoint <= range[$i]->end
2843 # So is in the table if and only iff it is at least the start position
2844 # of range $i.
f998e60c 2845 no overloading;
051df77b 2846 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
2847 return $i + 1;
2848 }
2849
2850 sub value_of {
2851 # Returns the value associated with the code point, undef if none
2852
2853 my $self = shift;
2854 my $codepoint = shift;
2855 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2856
99870f4d
KW
2857 my $i = $self->contains($codepoint);
2858 return unless $i;
2859
2860 # contains() returns 1 beyond where we should look
f998e60c 2861 no overloading;
051df77b 2862 return $ranges{pack 'J', $self}->[$i-1]->value;
99870f4d
KW
2863 }
2864
2865 sub _search_ranges {
2866 # Find the range in the list which contains a code point, or where it
2867 # should go if were to add it. That is, it returns $i, such that:
2868 # range[$i-1]->end < $codepoint <= range[$i]->end
2869 # Returns undef if no such $i is possible (e.g. at end of table), or
2870 # if there is an error.
2871
2872 my $self = shift;
2873 my $code_point = shift;
2874 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2875
ffe43484 2876 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2877
2878 return if $code_point > $max{$addr};
2879 my $r = $ranges{$addr}; # The current list of ranges
2880 my $range_list_size = scalar @$r;
2881 my $i;
2882
2883 use integer; # want integer division
2884
2885 # Use the cached result as the starting guess for this one, because,
2886 # an experiment on 5.1 showed that 90% of the time the cache was the
2887 # same as the result on the next call (and 7% it was one less).
2888 $i = $_search_ranges_cache{$addr};
2889 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
2890 # from an intervening deletion
2891 #local $to_trace = 1 if main::DEBUG;
2892 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2893 return $i if $code_point <= $r->[$i]->end
2894 && ($i == 0 || $r->[$i-1]->end < $code_point);
2895
2896 # Here the cache doesn't yield the correct $i. Try adding 1.
2897 if ($i < $range_list_size - 1
2898 && $r->[$i]->end < $code_point &&
2899 $code_point <= $r->[$i+1]->end)
2900 {
2901 $i++;
2902 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2903 $_search_ranges_cache{$addr} = $i;
2904 return $i;
2905 }
2906
2907 # Here, adding 1 also didn't work. We do a binary search to
2908 # find the correct position, starting with current $i
2909 my $lower = 0;
2910 my $upper = $range_list_size - 1;
2911 while (1) {
2912 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2913
2914 if ($code_point <= $r->[$i]->end) {
2915
2916 # Here we have met the upper constraint. We can quit if we
2917 # also meet the lower one.
2918 last if $i == 0 || $r->[$i-1]->end < $code_point;
2919
2920 $upper = $i; # Still too high.
2921
2922 }
2923 else {
2924
2925 # Here, $r[$i]->end < $code_point, so look higher up.
2926 $lower = $i;
2927 }
2928
2929 # Split search domain in half to try again.
2930 my $temp = ($upper + $lower) / 2;
2931
2932 # No point in continuing unless $i changes for next time
2933 # in the loop.
2934 if ($temp == $i) {
2935
2936 # We can't reach the highest element because of the averaging.
2937 # So if one below the upper edge, force it there and try one
2938 # more time.
2939 if ($i == $range_list_size - 2) {
2940
2941 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2942 $i = $range_list_size - 1;
2943
2944 # Change $lower as well so if fails next time through,
2945 # taking the average will yield the same $i, and we will
2946 # quit with the error message just below.
2947 $lower = $i;
2948 next;
2949 }
2950 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
2951 return;
2952 }
2953 $i = $temp;
2954 } # End of while loop
2955
2956 if (main::DEBUG && $to_trace) {
2957 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2958 trace "i= [ $i ]", $r->[$i];
2959 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2960 }
2961
2962 # Here we have found the offset. Cache it as a starting point for the
2963 # next call.
2964 $_search_ranges_cache{$addr} = $i;
2965 return $i;
2966 }
2967
2968 sub _add_delete {
2969 # Add, replace or delete ranges to or from a list. The $type
2970 # parameter gives which:
2971 # '+' => insert or replace a range, returning a list of any changed
2972 # ranges.
2973 # '-' => delete a range, returning a list of any deleted ranges.
2974 #
2975 # The next three parameters give respectively the start, end, and
2976 # value associated with the range. 'value' should be null unless the
2977 # operation is '+';
2978 #
2979 # The range list is kept sorted so that the range with the lowest
2980 # starting position is first in the list, and generally, adjacent
c1739a4a 2981 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
2982 # exceptions below).
2983 #
c1739a4a 2984 # There are more parameters; all are key => value pairs:
99870f4d
KW
2985 # Type gives the type of the value. It is only valid for '+'.
2986 # All ranges have types; if this parameter is omitted, 0 is
2987 # assumed. Ranges with type 0 are assumed to obey the
2988 # Unicode rules for casing, etc; ranges with other types are
2989 # not. Otherwise, the type is arbitrary, for the caller's
2990 # convenience, and looked at only by this routine to keep
2991 # adjacent ranges of different types from being merged into
2992 # a single larger range, and when Replace =>
2993 # $IF_NOT_EQUIVALENT is specified (see just below).
2994 # Replace determines what to do if the range list already contains
2995 # ranges which coincide with all or portions of the input
2996 # range. It is only valid for '+':
2997 # => $NO means that the new value is not to replace
2998 # any existing ones, but any empty gaps of the
2999 # range list coinciding with the input range
3000 # will be filled in with the new value.
3001 # => $UNCONDITIONALLY means to replace the existing values with
3002 # this one unconditionally. However, if the
3003 # new and old values are identical, the
3004 # replacement is skipped to save cycles
3005 # => $IF_NOT_EQUIVALENT means to replace the existing values
3006 # with this one if they are not equivalent.
3007 # Ranges are equivalent if their types are the
c1739a4a 3008 # same, and they are the same string; or if
99870f4d
KW
3009 # both are type 0 ranges, if their Unicode
3010 # standard forms are identical. In this last
3011 # case, the routine chooses the more "modern"
3012 # one to use. This is because some of the
3013 # older files are formatted with values that
3014 # are, for example, ALL CAPs, whereas the
3015 # derived files have a more modern style,
3016 # which looks better. By looking for this
3017 # style when the pre-existing and replacement
3018 # standard forms are the same, we can move to
3019 # the modern style
3020 # => $MULTIPLE means that if this range duplicates an
3021 # existing one, but has a different value,
3022 # don't replace the existing one, but insert
3023 # this, one so that the same range can occur
3024 # multiple times.
3025 # => anything else is the same as => $IF_NOT_EQUIVALENT
3026 #
c1739a4a
KW
3027 # "same value" means identical for non-type-0 ranges, and it means
3028 # having the same standard forms for type-0 ranges.
99870f4d
KW
3029
3030 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3031
3032 my $self = shift;
3033 my $operation = shift; # '+' for add/replace; '-' for delete;
3034 my $start = shift;
3035 my $end = shift;
3036 my $value = shift;
3037
3038 my %args = @_;
3039
3040 $value = "" if not defined $value; # warning: $value can be "0"
3041
3042 my $replace = delete $args{'Replace'};
3043 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3044
3045 my $type = delete $args{'Type'};
3046 $type = 0 unless defined $type;
3047
3048 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3049
ffe43484 3050 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3051
3052 if ($operation ne '+' && $operation ne '-') {
3053 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3054 return;
3055 }
3056 unless (defined $start && defined $end) {
3057 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3058 return;
3059 }
3060 unless ($end >= $start) {
3061 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3062 return;
3063 }
3064 #local $to_trace = 1 if main::DEBUG;
3065
3066 if ($operation eq '-') {
3067 if ($replace != $IF_NOT_EQUIVALENT) {
3068 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3069 $replace = $IF_NOT_EQUIVALENT;
3070 }
3071 if ($type) {
3072 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3073 $type = 0;
3074 }
3075 if ($value ne "") {
3076 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3077 $value = "";
3078 }
3079 }
3080
3081 my $r = $ranges{$addr}; # The current list of ranges
3082 my $range_list_size = scalar @$r; # And its size
3083 my $max = $max{$addr}; # The current high code point in
3084 # the list of ranges
3085
3086 # Do a special case requiring fewer machine cycles when the new range
3087 # starts after the current highest point. The Unicode input data is
3088 # structured so this is common.
3089 if ($start > $max) {
3090
3091 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3092 return if $operation eq '-'; # Deleting a non-existing range is a
3093 # no-op
3094
3095 # If the new range doesn't logically extend the current final one
3096 # in the range list, create a new range at the end of the range
3097 # list. (max cleverly is initialized to a negative number not
3098 # adjacent to 0 if the range list is empty, so even adding a range
3099 # to an empty range list starting at 0 will have this 'if'
3100 # succeed.)
3101 if ($start > $max + 1 # non-adjacent means can't extend.
3102 || @{$r}[-1]->value ne $value # values differ, can't extend.
3103 || @{$r}[-1]->type != $type # types differ, can't extend.
3104 ) {
3105 push @$r, Range->new($start, $end,
3106 Value => $value,
3107 Type => $type);
3108 }
3109 else {
3110
3111 # Here, the new range starts just after the current highest in
3112 # the range list, and they have the same type and value.
3113 # Extend the current range to incorporate the new one.
3114 @{$r}[-1]->set_end($end);
3115 }
3116
3117 # This becomes the new maximum.
3118 $max{$addr} = $end;
3119
3120 return;
3121 }
3122 #local $to_trace = 0 if main::DEBUG;
3123
3124 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3125
3126 # Here, the input range isn't after the whole rest of the range list.
3127 # Most likely 'splice' will be needed. The rest of the routine finds
3128 # the needed splice parameters, and if necessary, does the splice.
3129 # First, find the offset parameter needed by the splice function for
3130 # the input range. Note that the input range may span multiple
3131 # existing ones, but we'll worry about that later. For now, just find
3132 # the beginning. If the input range is to be inserted starting in a
3133 # position not currently in the range list, it must (obviously) come
3134 # just after the range below it, and just before the range above it.
3135 # Slightly less obviously, it will occupy the position currently
3136 # occupied by the range that is to come after it. More formally, we
3137 # are looking for the position, $i, in the array of ranges, such that:
3138 #
3139 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3140 #
3141 # (The ordered relationships within existing ranges are also shown in
3142 # the equation above). However, if the start of the input range is
3143 # within an existing range, the splice offset should point to that
3144 # existing range's position in the list; that is $i satisfies a
3145 # somewhat different equation, namely:
3146 #
3147 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3148 #
3149 # More briefly, $start can come before or after r[$i]->start, and at
3150 # this point, we don't know which it will be. However, these
3151 # two equations share these constraints:
3152 #
3153 # r[$i-1]->end < $start <= r[$i]->end
3154 #
3155 # And that is good enough to find $i.
3156
3157 my $i = $self->_search_ranges($start);
3158 if (! defined $i) {
3159 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3160 return;
3161 }
3162
3163 # The search function returns $i such that:
3164 #
3165 # r[$i-1]->end < $start <= r[$i]->end
3166 #
3167 # That means that $i points to the first range in the range list
3168 # that could possibly be affected by this operation. We still don't
3169 # know if the start of the input range is within r[$i], or if it
3170 # points to empty space between r[$i-1] and r[$i].
3171 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3172
3173 # Special case the insertion of data that is not to replace any
3174 # existing data.
3175 if ($replace == $NO) { # If $NO, has to be operation '+'
3176 #local $to_trace = 1 if main::DEBUG;
3177 trace "Doesn't replace" if main::DEBUG && $to_trace;
3178
3179 # Here, the new range is to take effect only on those code points
3180 # that aren't already in an existing range. This can be done by
3181 # looking through the existing range list and finding the gaps in
3182 # the ranges that this new range affects, and then calling this
3183 # function recursively on each of those gaps, leaving untouched
3184 # anything already in the list. Gather up a list of the changed
3185 # gaps first so that changes to the internal state as new ranges
3186 # are added won't be a problem.
3187 my @gap_list;
3188
3189 # First, if the starting point of the input range is outside an
3190 # existing one, there is a gap from there to the beginning of the
3191 # existing range -- add a span to fill the part that this new
3192 # range occupies
3193 if ($start < $r->[$i]->start) {
3194 push @gap_list, Range->new($start,
3195 main::min($end,
3196 $r->[$i]->start - 1),
3197 Type => $type);
3198 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3199 }
3200
3201 # Then look through the range list for other gaps until we reach
3202 # the highest range affected by the input one.
3203 my $j;
3204 for ($j = $i+1; $j < $range_list_size; $j++) {
3205 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3206 last if $end < $r->[$j]->start;
3207
3208 # If there is a gap between when this range starts and the
3209 # previous one ends, add a span to fill it. Note that just
3210 # because there are two ranges doesn't mean there is a
3211 # non-zero gap between them. It could be that they have
3212 # different values or types
3213 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3214 push @gap_list,
3215 Range->new($r->[$j-1]->end + 1,
3216 $r->[$j]->start - 1,
3217 Type => $type);
3218 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3219 }
3220 }
3221
3222 # Here, we have either found an existing range in the range list,
3223 # beyond the area affected by the input one, or we fell off the
3224 # end of the loop because the input range affects the whole rest
3225 # of the range list. In either case, $j is 1 higher than the
3226 # highest affected range. If $j == $i, it means that there are no
3227 # affected ranges, that the entire insertion is in the gap between
3228 # r[$i-1], and r[$i], which we already have taken care of before
3229 # the loop.
3230 # On the other hand, if there are affected ranges, it might be
3231 # that there is a gap that needs filling after the final such
3232 # range to the end of the input range
3233 if ($r->[$j-1]->end < $end) {
3234 push @gap_list, Range->new(main::max($start,
3235 $r->[$j-1]->end + 1),
3236 $end,
3237 Type => $type);
3238 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3239 }
3240
3241 # Call recursively to fill in all the gaps.
3242 foreach my $gap (@gap_list) {
3243 $self->_add_delete($operation,
3244 $gap->start,
3245 $gap->end,
3246 $value,
3247 Type => $type);
3248 }
3249
3250 return;
3251 }
3252
3253 # Here, we have taken care of the case where $replace is $NO, which
3254 # means that whatever action we now take is done unconditionally. It
3255 # still could be that this call will result in a no-op, if duplicates
3256 # aren't allowed, and we are inserting a range that merely duplicates
3257 # data already in the range list; or also if deleting a non-existent
3258 # range.
3259 # $i still points to the first potential affected range. Now find the
3260 # highest range affected, which will determine the length parameter to
3261 # splice. (The input range can span multiple existing ones.) While
3262 # we are looking through the range list, see also if this is an
3263 # insertion that will change the values of at least one of the
3264 # affected ranges. We don't need to do this check unless this is an
3265 # insertion of non-multiples, and also since this is a boolean, we
3266 # don't need to do it if have already determined that it will make a
3267 # change; just unconditionally change them. $cdm is created to be 1
3268 # if either of these is true. (The 'c' in the name comes from below)
3269 my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3270 my $j; # This will point to the highest affected range
3271
3272 # For non-zero types, the standard form is the value itself;
3273 my $standard_form = ($type) ? $value : main::standardize($value);
3274
3275 for ($j = $i; $j < $range_list_size; $j++) {
3276 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3277
3278 # If find a range that it doesn't overlap into, we can stop
3279 # searching
3280 last if $end < $r->[$j]->start;
3281
3282 # Here, overlaps the range at $j. If the value's don't match,
3283 # and this is supposedly an insertion, it becomes a change
3284 # instead. This is what the 'c' stands for in $cdm.
3285 if (! $cdm) {
3286 if ($r->[$j]->standard_form ne $standard_form) {
3287 $cdm = 1;
3288 }
3289 else {
3290
3291 # Here, the two values are essentially the same. If the
3292 # two are actually identical, replacing wouldn't change
3293 # anything so skip it.
3294 my $pre_existing = $r->[$j]->value;
3295 if ($pre_existing ne $value) {
3296
3297 # Here the new and old standardized values are the
3298 # same, but the non-standardized values aren't. If
3299 # replacing unconditionally, then replace
3300 if( $replace == $UNCONDITIONALLY) {
3301 $cdm = 1;
3302 }
3303 else {
3304
3305 # Here, are replacing conditionally. Decide to
3306 # replace or not based on which appears to look
3307 # the "nicest". If one is mixed case and the
3308 # other isn't, choose the mixed case one.
3309 my $new_mixed = $value =~ /[A-Z]/
3310 && $value =~ /[a-z]/;
3311 my $old_mixed = $pre_existing =~ /[A-Z]/
3312 && $pre_existing =~ /[a-z]/;
3313
3314 if ($old_mixed != $new_mixed) {
3315 $cdm = 1 if $new_mixed;
3316 if (main::DEBUG && $to_trace) {
3317 if ($cdm) {
3318 trace "Replacing $pre_existing with $value";
3319 }
3320 else {
3321 trace "Retaining $pre_existing over $value";
3322 }
3323 }
3324 }
3325 else {
3326
3327 # Here casing wasn't different between the two.
3328 # If one has hyphens or underscores and the
3329 # other doesn't, choose the one with the
3330 # punctuation.
3331 my $new_punct = $value =~ /[-_]/;
3332 my $old_punct = $pre_existing =~ /[-_]/;
3333
3334 if ($old_punct != $new_punct) {
3335 $cdm = 1 if $new_punct;
3336 if (main::DEBUG && $to_trace) {
3337 if ($cdm) {
3338 trace "Replacing $pre_existing with $value";
3339 }
3340 else {
3341 trace "Retaining $pre_existing over $value";
3342 }
3343 }
3344 } # else existing one is just as "good";
3345 # retain it to save cycles.
3346 }
3347 }
3348 }
3349 }
3350 }
3351 } # End of loop looking for highest affected range.
3352
3353 # Here, $j points to one beyond the highest range that this insertion
3354 # affects (hence to beyond the range list if that range is the final
3355 # one in the range list).
3356
3357 # The splice length is all the affected ranges. Get it before
3358 # subtracting, for efficiency, so we don't have to later add 1.
3359 my $length = $j - $i;
3360
3361 $j--; # $j now points to the highest affected range.
3362 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3363
3364 # If inserting a multiple record, this is where it goes, after all the
3365 # existing ones for this range. This implies an insertion, and no
3366 # change to any existing ranges. Note that $j can be -1 if this new
3367 # range doesn't actually duplicate any existing, and comes at the
3368 # beginning of the list, in which case we can handle it like any other
3369 # insertion, and is easier to do so.
3370 if ($replace == $MULTIPLE && $j >= 0) {
3371
3372 # This restriction could be remedied with a little extra work, but
3373 # it won't hopefully ever be necessary
3374 if ($r->[$j]->start != $r->[$j]->end) {
3375 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken.");
3376 return;
3377 }
3378
3379 # Don't add an exact duplicate, as it isn't really a multiple
3380 return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3381
3382 trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3383 my @return = splice @$r,
3384 $j+1,
3385 0,
3386 Range->new($start,
3387 $end,
3388 Value => $value,
3389 Type => $type);
3390 if (main::DEBUG && $to_trace) {
3391 trace "After splice:";
3392 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3393 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3394 trace "j =[", $j, "]", $r->[$j] if $j >= 0;
3395 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3396 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3397 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3398 }
3399 return @return;
3400 }
3401
3402 # Here, have taken care of $NO and $MULTIPLE replaces.
3403 # $j points to the highest affected range. But it can be < $i or even
3404 # -1. These happen only if the insertion is entirely in the gap
3405 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3406 # above exited first time through with $end < $r->[$i]->start. (And
3407 # then we subtracted one from j) This implies also that $start <
3408 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3409 # $start, so the entire input range is in the gap.
3410 if ($j < $i) {
3411
3412 # Here the entire input range is in the gap before $i.
3413
3414 if (main::DEBUG && $to_trace) {
3415 if ($i) {
3416 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3417 }
3418 else {
3419 trace "Entire range is before $r->[$i]";
3420 }
3421 }
3422 return if $operation ne '+'; # Deletion of a non-existent range is
3423 # a no-op
3424 }
3425 else {
3426
3427 # Here the entire input range is not in the gap before $i. There
3428 # is an affected one, and $j points to the highest such one.
3429
3430 # At this point, here is the situation:
3431 # This is not an insertion of a multiple, nor of tentative ($NO)
3432 # data.
3433 # $i points to the first element in the current range list that
3434 # may be affected by this operation. In fact, we know
3435 # that the range at $i is affected because we are in
3436 # the else branch of this 'if'
3437 # $j points to the highest affected range.
3438 # In other words,
3439 # r[$i-1]->end < $start <= r[$i]->end
3440 # And:
3441 # r[$i-1]->end < $start <= $end <= r[$j]->end
3442 #
3443 # Also:
3444 # $cdm is a boolean which is set true if and only if this is a
3445 # change or deletion (multiple was handled above). In
3446 # other words, it could be renamed to be just $cd.
3447
3448 # We now have enough information to decide if this call is a no-op
3449 # or not. It is a no-op if it is a deletion of a non-existent
3450 # range, or an insertion of already existing data.
3451
3452 if (main::DEBUG && $to_trace && ! $cdm
3453 && $i == $j
3454 && $start >= $r->[$i]->start)
3455 {
3456 trace "no-op";
3457 }
3458 return if ! $cdm # change or delete => not no-op
3459 && $i == $j # more than one affected range => not no-op
3460
3461 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3462 # Further, $start and/or $end is >= r[$i]->start
3463 # The test below hence guarantees that
3464 # r[$i]->start < $start <= $end <= r[$i]->end
3465 # This means the input range is contained entirely in
3466 # the one at $i, so is a no-op
3467 && $start >= $r->[$i]->start;
3468 }
3469
3470 # Here, we know that some action will have to be taken. We have
3471 # calculated the offset and length (though adjustments may be needed)
3472 # for the splice. Now start constructing the replacement list.
3473 my @replacement;
3474 my $splice_start = $i;
3475
3476 my $extends_below;
3477 my $extends_above;
3478
3479 # See if should extend any adjacent ranges.
3480 if ($operation eq '-') { # Don't extend deletions
3481 $extends_below = $extends_above = 0;
3482 }
3483 else { # Here, should extend any adjacent ranges. See if there are
3484 # any.
3485 $extends_below = ($i > 0
3486 # can't extend unless adjacent
3487 && $r->[$i-1]->end == $start -1
3488 # can't extend unless are same standard value
3489 && $r->[$i-1]->standard_form eq $standard_form
3490 # can't extend unless share type
3491 && $r->[$i-1]->type == $type);
3492 $extends_above = ($j+1 < $range_list_size
3493 && $r->[$j+1]->start == $end +1
3494 && $r->[$j+1]->standard_form eq $standard_form
3495 && $r->[$j-1]->type == $type);
3496 }
3497 if ($extends_below && $extends_above) { # Adds to both
3498 $splice_start--; # start replace at element below
3499 $length += 2; # will replace on both sides
3500 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3501
3502 # The result will fill in any gap, replacing both sides, and
3503 # create one large range.
3504 @replacement = Range->new($r->[$i-1]->start,
3505 $r->[$j+1]->end,
3506 Value => $value,
3507 Type => $type);
3508 }
3509 else {
3510
3511 # Here we know that the result won't just be the conglomeration of
3512 # a new range with both its adjacent neighbors. But it could
3513 # extend one of them.
3514
3515 if ($extends_below) {
3516
3517 # Here the new element adds to the one below, but not to the
3518 # one above. If inserting, and only to that one range, can
3519 # just change its ending to include the new one.
3520 if ($length == 0 && ! $cdm) {
3521 $r->[$i-1]->set_end($end);
3522 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3523 return;
3524 }
3525 else {
3526 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3527 $splice_start--; # start replace at element below
3528 $length++; # will replace the element below
3529 $start = $r->[$i-1]->start;
3530 }
3531 }
3532 elsif ($extends_above) {
3533
3534 # Here the new element adds to the one above, but not below.
3535 # Mirror the code above
3536 if ($length == 0 && ! $cdm) {
3537 $r->[$j+1]->set_start($start);
3538 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3539 return;
3540 }
3541 else {
3542 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3543 $length++; # will replace the element above
3544 $end = $r->[$j+1]->end;
3545 }
3546 }
3547
3548 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3549
3550 # Finally, here we know there will have to be a splice.
3551 # If the change or delete affects only the highest portion of the
3552 # first affected range, the range will have to be split. The
3553 # splice will remove the whole range, but will replace it by a new
3554 # range containing just the unaffected part. So, in this case,
3555 # add to the replacement list just this unaffected portion.
3556 if (! $extends_below
3557 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3558 {
3559 push @replacement,
3560 Range->new($r->[$i]->start,
3561 $start - 1,
3562 Value => $r->[$i]->value,
3563 Type => $r->[$i]->type);
3564 }
3565
3566 # In the case of an insert or change, but not a delete, we have to
3567 # put in the new stuff; this comes next.
3568 if ($operation eq '+') {
3569 push @replacement, Range->new($start,
3570 $end,
3571 Value => $value,
3572 Type => $type);
3573 }
3574
3575 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3576 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3577
3578 # And finally, if we're changing or deleting only a portion of the
3579 # highest affected range, it must be split, as the lowest one was.
3580 if (! $extends_above
3581 && $j >= 0 # Remember that j can be -1 if before first
3582 # current element
3583 && $end >= $r->[$j]->start
3584 && $end < $r->[$j]->end)
3585 {
3586 push @replacement,
3587 Range->new($end + 1,
3588 $r->[$j]->end,
3589 Value => $r->[$j]->value,
3590 Type => $r->[$j]->type);
3591 }
3592 }
3593
3594 # And do the splice, as calculated above
3595 if (main::DEBUG && $to_trace) {
3596 trace "replacing $length element(s) at $i with ";
3597 foreach my $replacement (@replacement) {
3598 trace " $replacement";
3599 }
3600 trace "Before splice:";
3601 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3602 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3603 trace "i =[", $i, "]", $r->[$i];
3604 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3605 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3606 }
3607
3608 my @return = splice @$r, $splice_start, $length, @replacement;
3609
3610 if (main::DEBUG && $to_trace) {
3611 trace "After splice:";
3612 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3613 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3614 trace "i =[", $i, "]", $r->[$i];
3615 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3616 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3617 trace "removed @return";
3618 }
3619
3620 # An actual deletion could have changed the maximum in the list.
3621 # There was no deletion if the splice didn't return something, but
3622 # otherwise recalculate it. This is done too rarely to worry about
3623 # performance.
3624 if ($operation eq '-' && @return) {
3625 $max{$addr} = $r->[-1]->end;
3626 }
3627 return @return;
3628 }
3629
3630 sub reset_each_range { # reset the iterator for each_range();
3631 my $self = shift;
3632 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3633
f998e60c 3634 no overloading;
051df77b 3635 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
3636 return;
3637 }
3638
3639 sub each_range {
3640 # Iterate over each range in a range list. Results are undefined if
3641 # the range list is changed during the iteration.
3642
3643 my $self = shift;
3644 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3645
ffe43484 3646 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3647
3648 return if $self->is_empty;
3649
3650 $each_range_iterator{$addr} = -1
3651 if ! defined $each_range_iterator{$addr};
3652 $each_range_iterator{$addr}++;
3653 return $ranges{$addr}->[$each_range_iterator{$addr}]
3654 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3655 undef $each_range_iterator{$addr};
3656 return;
3657 }
3658
3659 sub count { # Returns count of code points in range list
3660 my $self = shift;
3661 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3662
ffe43484 3663 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3664
3665 my $count = 0;
3666 foreach my $range (@{$ranges{$addr}}) {
3667 $count += $range->end - $range->start + 1;
3668 }
3669 return $count;
3670 }
3671
3672 sub delete_range { # Delete a range
3673 my $self = shift;
3674 my $start = shift;
3675 my $end = shift;
3676
3677 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3678
3679 return $self->_add_delete('-', $start, $end, "");
3680 }
3681
3682 sub is_empty { # Returns boolean as to if a range list is empty
3683 my $self = shift;
3684 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3685
f998e60c 3686 no overloading;
051df77b 3687 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
3688 }
3689
3690 sub hash {
3691 # Quickly returns a scalar suitable for separating tables into
3692 # buckets, i.e. it is a hash function of the contents of a table, so
3693 # there are relatively few conflicts.
3694
3695 my $self = shift;
3696 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3697
ffe43484 3698 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3699
3700 # These are quickly computable. Return looks like 'min..max;count'
3701 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3702 }
3703} # End closure for _Range_List_Base
3704
3705package Range_List;
3706use base '_Range_List_Base';
3707
3708# A Range_List is a range list for match tables; i.e. the range values are
3709# not significant. Thus a number of operations can be safely added to it,
3710# such as inversion, intersection. Note that union is also an unsafe
3711# operation when range values are cared about, and that method is in the base
3712# class, not here. But things are set up so that that method is callable only
3713# during initialization. Only in this derived class, is there an operation
3714# that combines two tables. A Range_Map can thus be used to initialize a
3715# Range_List, and its mappings will be in the list, but are not significant to
3716# this class.
3717
3718sub trace { return main::trace(@_); }
3719
3720{ # Closure
3721
3722 use overload
3723 fallback => 0,
3724 '+' => sub { my $self = shift;
3725 my $other = shift;
3726
3727 return $self->_union($other)
3728 },
3729 '&' => sub { my $self = shift;
3730 my $other = shift;
3731
3732 return $self->_intersect($other, 0);
3733 },
3734 '~' => "_invert",
3735 '-' => "_subtract",
3736 ;
3737
3738 sub _invert {
3739 # Returns a new Range_List that gives all code points not in $self.
3740
3741 my $self = shift;
3742
3743 my $new = Range_List->new;
3744
3745 # Go through each range in the table, finding the gaps between them
3746 my $max = -1; # Set so no gap before range beginning at 0
3747 for my $range ($self->ranges) {
3748 my $start = $range->start;
3749 my $end = $range->end;
3750
3751 # If there is a gap before this range, the inverse will contain
3752 # that gap.
3753 if ($start > $max + 1) {
3754 $new->add_range($max + 1, $start - 1);
3755 }
3756 $max = $end;
3757 }
3758
3759 # And finally, add the gap from the end of the table to the max
3760 # possible code point
3761 if ($max < $LAST_UNICODE_CODEPOINT) {
3762 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3763 }
3764 return $new;
3765 }
3766
3767 sub _subtract {
3768 # Returns a new Range_List with the argument deleted from it. The
3769 # argument can be a single code point, a range, or something that has
3770 # a range, with the _range_list() method on it returning them
3771
3772 my $self = shift;
3773 my $other = shift;
3774 my $reversed = shift;
3775 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3776
3777 if ($reversed) {
3778 Carp::my_carp_bug("Can't cope with a "
3779 . __PACKAGE__
3780 . " being the second parameter in a '-'. Subtraction ignored.");
3781 return $self;
3782 }
3783
3784 my $new = Range_List->new(Initialize => $self);
3785
3786 if (! ref $other) { # Single code point
3787 $new->delete_range($other, $other);
3788 }
3789 elsif ($other->isa('Range')) {
3790 $new->delete_range($other->start, $other->end);
3791 }
3792 elsif ($other->can('_range_list')) {
3793 foreach my $range ($other->_range_list->ranges) {
3794 $new->delete_range($range->start, $range->end);
3795 }
3796 }
3797 else {
3798 Carp::my_carp_bug("Can't cope with a "
3799 . ref($other)
3800 . " argument to '-'. Subtraction ignored."
3801 );
3802 return $self;
3803 }
3804
3805 return $new;
3806 }
3807
3808 sub _intersect {
3809 # Returns either a boolean giving whether the two inputs' range lists
3810 # intersect (overlap), or a new Range_List containing the intersection
3811 # of the two lists. The optional final parameter being true indicates
3812 # to do the check instead of the intersection.
3813
3814 my $a_object = shift;
3815 my $b_object = shift;
3816 my $check_if_overlapping = shift;
3817 $check_if_overlapping = 0 unless defined $check_if_overlapping;
3818 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3819
3820 if (! defined $b_object) {
3821 my $message = "";
3822 $message .= $a_object->_owner_name_of if defined $a_object;
3823 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
3824 return;
3825 }
3826
3827 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
3828 # Thus the intersection could be much more simply be written:
3829 # return ~(~$a_object + ~$b_object);
3830 # But, this is slower, and when taking the inverse of a large
3831 # range_size_1 table, back when such tables were always stored that
3832 # way, it became prohibitively slow, hence the code was changed to the
3833 # below
3834
3835 if ($b_object->isa('Range')) {
3836 $b_object = Range_List->new(Initialize => $b_object,
3837 Owner => $a_object->_owner_name_of);
3838 }
3839 $b_object = $b_object->_range_list if $b_object->can('_range_list');
3840
3841 my @a_ranges = $a_object->ranges;
3842 my @b_ranges = $b_object->ranges;
3843
3844 #local $to_trace = 1 if main::DEBUG;
3845 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3846
3847 # Start with the first range in each list
3848 my $a_i = 0;
3849 my $range_a = $a_ranges[$a_i];
3850 my $b_i = 0;
3851 my $range_b = $b_ranges[$b_i];
3852
3853 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3854 if ! $check_if_overlapping;
3855
3856 # If either list is empty, there is no intersection and no overlap
3857 if (! defined $range_a || ! defined $range_b) {
3858 return $check_if_overlapping ? 0 : $new;
3859 }
3860 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3861
3862 # Otherwise, must calculate the intersection/overlap. Start with the
3863 # very first code point in each list
3864 my $a = $range_a->start;
3865 my $b = $range_b->start;
3866
3867 # Loop through all the ranges of each list; in each iteration, $a and
3868 # $b are the current code points in their respective lists
3869 while (1) {
3870
3871 # If $a and $b are the same code point, ...
3872 if ($a == $b) {
3873
3874 # it means the lists overlap. If just checking for overlap
3875 # know the answer now,
3876 return 1 if $check_if_overlapping;
3877
3878 # The intersection includes this code point plus anything else
3879 # common to both current ranges.
3880 my $start = $a;
3881 my $end = main::min($range_a->end, $range_b->end);
3882 if (! $check_if_overlapping) {
3883 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3884 $new->add_range($start, $end);
3885 }
3886
3887 # Skip ahead to the end of the current intersect
3888 $a = $b = $end;
3889
3890 # If the current intersect ends at the end of either range (as
3891 # it must for at least one of them), the next possible one
3892 # will be the beginning code point in it's list's next range.
3893 if ($a == $range_a->end) {
3894 $range_a = $a_ranges[++$a_i];
3895 last unless defined $range_a;
3896 $a = $range_a->start;
3897 }
3898 if ($b == $range_b->end) {
3899 $range_b = $b_ranges[++$b_i];
3900 last unless defined $range_b;
3901 $b = $range_b->start;
3902 }
3903
3904 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3905 }
3906 elsif ($a < $b) {
3907
3908 # Not equal, but if the range containing $a encompasses $b,
3909 # change $a to be the middle of the range where it does equal
3910 # $b, so the next iteration will get the intersection
3911 if ($range_a->end >= $b) {
3912 $a = $b;
3913 }
3914 else {
3915
3916 # Here, the current range containing $a is entirely below
3917 # $b. Go try to find a range that could contain $b.
3918 $a_i = $a_object->_search_ranges($b);
3919
3920 # If no range found, quit.
3921 last unless defined $a_i;
3922
3923 # The search returns $a_i, such that
3924 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3925 # Set $a to the beginning of this new range, and repeat.
3926 $range_a = $a_ranges[$a_i];
3927 $a = $range_a->start;
3928 }
3929 }
3930 else { # Here, $b < $a.
3931
3932 # Mirror image code to the leg just above
3933 if ($range_b->end >= $a) {
3934 $b = $a;
3935 }
3936 else {
3937 $b_i = $b_object->_search_ranges($a);
3938 last unless defined $b_i;
3939 $range_b = $b_ranges[$b_i];
3940 $b = $range_b->start;
3941 }
3942 }
3943 } # End of looping through ranges.
3944
3945 # Intersection fully computed, or now know that there is no overlap
3946 return $check_if_overlapping ? 0 : $new;
3947 }
3948
3949 sub overlaps {
3950 # Returns boolean giving whether the two arguments overlap somewhere
3951
3952 my $self = shift;
3953 my $other = shift;
3954 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3955
3956 return $self->_intersect($other, 1);
3957 }
3958
3959 sub add_range {
3960 # Add a range to the list.
3961
3962 my $self = shift;
3963 my $start = shift;
3964 my $end = shift;
3965 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3966
3967 return $self->_add_delete('+', $start, $end, "");
3968 }
3969
99870f4d
KW
3970 sub is_code_point_usable {
3971 # This used only for making the test script. See if the input
3972 # proposed trial code point is one that Perl will handle. If second
3973 # parameter is 0, it won't select some code points for various
3974 # reasons, noted below.
3975
3976 my $code = shift;
3977 my $try_hard = shift;
3978 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3979
3980 return 0 if $code < 0; # Never use a negative
3981
99870f4d
KW
3982 # shun null. I'm (khw) not sure why this was done, but NULL would be
3983 # the character very frequently used.
3984 return $try_hard if $code == 0x0000;
3985
3986 return 0 if $try_hard; # XXX Temporary until fix utf8.c
3987
3988 # shun non-character code points.
3989 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
3990 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
3991
3992 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
3993 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
3994
3995 return 1;
3996 }
3997
3998 sub get_valid_code_point {
3999 # Return a code point that's part of the range list. Returns nothing
4000 # if the table is empty or we can't find a suitable code point. This
4001 # used only for making the test script.
4002
4003 my $self = shift;
4004 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4005
ffe43484 4006 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4007
4008 # On first pass, don't choose less desirable code points; if no good
4009 # one is found, repeat, allowing a less desirable one to be selected.
4010 for my $try_hard (0, 1) {
4011
4012 # Look through all the ranges for a usable code point.
4013 for my $set ($self->ranges) {
4014
4015 # Try the edge cases first, starting with the end point of the
4016 # range.
4017 my $end = $set->end;
4018 return $end if is_code_point_usable($end, $try_hard);
4019
4020 # End point didn't, work. Start at the beginning and try
4021 # every one until find one that does work.
4022 for my $trial ($set->start .. $end - 1) {
4023 return $trial if is_code_point_usable($trial, $try_hard);
4024 }
4025 }
4026 }
4027 return (); # If none found, give up.
4028 }
4029
4030 sub get_invalid_code_point {
4031 # Return a code point that's not part of the table. Returns nothing
4032 # if the table covers all code points or a suitable code point can't
4033 # be found. This used only for making the test script.
4034
4035 my $self = shift;
4036 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4037
4038 # Just find a valid code point of the inverse, if any.
4039 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4040 }
4041} # end closure for Range_List
4042
4043package Range_Map;
4044use base '_Range_List_Base';
4045
4046# A Range_Map is a range list in which the range values (called maps) are
4047# significant, and hence shouldn't be manipulated by our other code, which
4048# could be ambiguous or lose things. For example, in taking the union of two
4049# lists, which share code points, but which have differing values, which one
4050# has precedence in the union?
4051# It turns out that these operations aren't really necessary for map tables,
4052# and so this class was created to make sure they aren't accidentally
4053# applied to them.
4054
4055{ # Closure
4056
4057 sub add_map {
4058 # Add a range containing a mapping value to the list
4059
4060 my $self = shift;
4061 # Rest of parameters passed on
4062
4063 return $self->_add_delete('+', @_);
4064 }
4065
4066 sub add_duplicate {
4067 # Adds entry to a range list which can duplicate an existing entry
4068
4069 my $self = shift;
4070 my $code_point = shift;
4071 my $value = shift;
4072 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4073
4074 return $self->add_map($code_point, $code_point,
4075 $value, Replace => $MULTIPLE);
4076 }
4077} # End of closure for package Range_Map
4078
4079package _Base_Table;
4080
4081# A table is the basic data structure that gets written out into a file for
4082# use by the Perl core. This is the abstract base class implementing the
4083# common elements from the derived ones. A list of the methods to be
4084# furnished by an implementing class is just after the constructor.
4085
4086sub standardize { return main::standardize($_[0]); }
4087sub trace { return main::trace(@_); }
4088
4089{ # Closure
4090
4091 main::setup_package();
4092
4093 my %range_list;
4094 # Object containing the ranges of the table.
4095 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4096
4097 my %full_name;
4098 # The full table name.
4099 main::set_access('full_name', \%full_name, 'r');
4100
4101 my %name;
4102 # The table name, almost always shorter
4103 main::set_access('name', \%name, 'r');
4104
4105 my %short_name;
4106 # The shortest of all the aliases for this table, with underscores removed
4107 main::set_access('short_name', \%short_name);
4108
4109 my %nominal_short_name_length;
4110 # The length of short_name before removing underscores
4111 main::set_access('nominal_short_name_length',
4112 \%nominal_short_name_length);
4113
23e33b60
KW
4114 my %complete_name;
4115 # The complete name, including property.
4116 main::set_access('complete_name', \%complete_name, 'r');
4117
99870f4d
KW
4118 my %property;
4119 # Parent property this table is attached to.
4120 main::set_access('property', \%property, 'r');
4121
4122 my %aliases;
4123 # Ordered list of aliases of the table's name. The first ones in the list
4124 # are output first in comments
4125 main::set_access('aliases', \%aliases, 'readable_array');
4126
4127 my %comment;
4128 # A comment associated with the table for human readers of the files
4129 main::set_access('comment', \%comment, 's');
4130
4131 my %description;
4132 # A comment giving a short description of the table's meaning for human
4133 # readers of the files.
4134 main::set_access('description', \%description, 'readable_array');
4135
4136 my %note;
4137 # A comment giving a short note about the table for human readers of the
4138 # files.
4139 main::set_access('note', \%note, 'readable_array');
4140
4141 my %internal_only;
4142 # Boolean; if set means any file that contains this table is marked as for
4143 # internal-only use.
4144 main::set_access('internal_only', \%internal_only);
4145
4146 my %find_table_from_alias;
4147 # The parent property passes this pointer to a hash which this class adds
4148 # all its aliases to, so that the parent can quickly take an alias and
4149 # find this table.
4150 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4151
4152 my %locked;
4153 # After this table is made equivalent to another one; we shouldn't go
4154 # changing the contents because that could mean it's no longer equivalent
4155 main::set_access('locked', \%locked, 'r');
4156
4157 my %file_path;
4158 # This gives the final path to the file containing the table. Each
4159 # directory in the path is an element in the array
4160 main::set_access('file_path', \%file_path, 'readable_array');
4161
4162 my %status;
4163 # What is the table's status, normal, $OBSOLETE, etc. Enum
4164 main::set_access('status', \%status, 'r');
4165
4166 my %status_info;
4167 # A comment about its being obsolete, or whatever non normal status it has
4168 main::set_access('status_info', \%status_info, 'r');
4169
4170 my %range_size_1;
4171 # Is the table to be output with each range only a single code point?
4172 # This is done to avoid breaking existing code that may have come to rely
4173 # on this behavior in previous versions of this program.)
4174 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4175
4176 my %perl_extension;
4177 # A boolean set iff this table is a Perl extension to the Unicode
4178 # standard.
4179 main::set_access('perl_extension', \%perl_extension, 'r');
4180
0c07e538
KW
4181 my %output_range_counts;
4182 # A boolean set iff this table is to have comments written in the
4183 # output file that contain the number of code points in the range.
4184 # The constructor can override the global flag of the same name.
4185 main::set_access('output_range_counts', \%output_range_counts, 'r');
4186
99870f4d
KW
4187 sub new {
4188 # All arguments are key => value pairs, which you can see below, most
4189 # of which match fields documented above. Otherwise: Pod_Entry,
4190 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4191 # documented in the Alias package
4192
4193 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4194
4195 my $class = shift;
4196
4197 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4198 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4199
4200 my %args = @_;
4201
4202 $name{$addr} = delete $args{'Name'};
4203 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4204 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4205 my $complete_name = $complete_name{$addr}
4206 = delete $args{'Complete_Name'};
99870f4d 4207 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
0c07e538 4208 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4209 $property{$addr} = delete $args{'_Property'};
4210 $range_list{$addr} = delete $args{'_Range_List'};
4211 $status{$addr} = delete $args{'Status'} || $NORMAL;
4212 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4213 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
9ef2b94f 4214 $range_size_1{$addr} = 1 if $output_names; # Make sure 1 name per line
99870f4d
KW
4215
4216 my $description = delete $args{'Description'};
4217 my $externally_ok = delete $args{'Externally_Ok'};
4218 my $loose_match = delete $args{'Fuzzy'};
4219 my $note = delete $args{'Note'};
4220 my $make_pod_entry = delete $args{'Pod_Entry'};
37e2e78e 4221 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4222
4223 # Shouldn't have any left over
4224 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4225
4226 # Can't use || above because conceivably the name could be 0, and
4227 # can't use // operator in case this program gets used in Perl 5.8
4228 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4229 $output_range_counts{$addr} = $output_range_counts if
4230 ! defined $output_range_counts{$addr};
99870f4d
KW
4231
4232 $aliases{$addr} = [ ];
4233 $comment{$addr} = [ ];
4234 $description{$addr} = [ ];
4235 $note{$addr} = [ ];
4236 $file_path{$addr} = [ ];
4237 $locked{$addr} = "";
4238
4239 push @{$description{$addr}}, $description if $description;
4240 push @{$note{$addr}}, $note if $note;
4241
37e2e78e
KW
4242 if ($status{$addr} eq $PLACEHOLDER) {
4243
4244 # A placeholder table doesn't get documented, is a perl extension,
4245 # and quite likely will be empty
4246 $make_pod_entry = 0 if ! defined $make_pod_entry;
4247 $perl_extension = 1 if ! defined $perl_extension;
4248 push @tables_that_may_be_empty, $complete_name{$addr};
4249 }
4250 elsif (! $status{$addr}) {
4251
4252 # If hasn't set its status already, see if it is on one of the
4253 # lists of properties or tables that have particular statuses; if
4254 # not, is normal. The lists are prioritized so the most serious
4255 # ones are checked first
ec11e5f4
KW
4256 if (exists $why_suppressed{$complete_name}
4257 # Don't suppress if overriden
4258 && ! grep { $_ eq $complete_name{$addr} }
4259 @output_mapped_properties)
4260 {
99870f4d
KW
4261 $status{$addr} = $SUPPRESSED;
4262 }
4263 elsif (exists $why_deprecated{$complete_name}) {
4264 $status{$addr} = $DEPRECATED;
4265 }
4266 elsif (exists $why_stabilized{$complete_name}) {
4267 $status{$addr} = $STABILIZED;
4268 }
4269 elsif (exists $why_obsolete{$complete_name}) {
4270 $status{$addr} = $OBSOLETE;
4271 }
4272
4273 # Existence above doesn't necessarily mean there is a message
4274 # associated with it. Use the most serious message.
4275 if ($status{$addr}) {
4276 if ($why_suppressed{$complete_name}) {
4277 $status_info{$addr}
4278 = $why_suppressed{$complete_name};
4279 }
4280 elsif ($why_deprecated{$complete_name}) {
4281 $status_info{$addr}
4282 = $why_deprecated{$complete_name};
4283 }
4284 elsif ($why_stabilized{$complete_name}) {
4285 $status_info{$addr}
4286 = $why_stabilized{$complete_name};
4287 }
4288 elsif ($why_obsolete{$complete_name}) {
4289 $status_info{$addr}
4290 = $why_obsolete{$complete_name};
4291 }
4292 }
4293 }
4294
37e2e78e
KW
4295 $perl_extension{$addr} = $perl_extension || 0;
4296
99870f4d
KW
4297 # By convention what typically gets printed only or first is what's
4298 # first in the list, so put the full name there for good output
4299 # clarity. Other routines rely on the full name being first on the
4300 # list
4301 $self->add_alias($full_name{$addr},
4302 Externally_Ok => $externally_ok,
4303 Fuzzy => $loose_match,
4304 Pod_Entry => $make_pod_entry,
4305 Status => $status{$addr},
4306 );
4307
4308 # Then comes the other name, if meaningfully different.
4309 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4310 $self->add_alias($name{$addr},
4311 Externally_Ok => $externally_ok,
4312 Fuzzy => $loose_match,
4313 Pod_Entry => $make_pod_entry,
4314 Status => $status{$addr},
4315 );
4316 }
4317
4318 return $self;
4319 }
4320
4321 # Here are the methods that are required to be defined by any derived
4322 # class
ea25a9b2 4323 for my $sub (qw(
99870f4d 4324 append_to_body
99870f4d 4325 pre_body
ea25a9b2 4326 ))
99870f4d
KW
4327 # append_to_body and pre_body are called in the write() method
4328 # to add stuff after the main body of the table, but before
4329 # its close; and to prepend stuff before the beginning of the
4330 # table.
99870f4d
KW
4331 {
4332 no strict "refs";
4333 *$sub = sub {
4334 Carp::my_carp_bug( __LINE__
4335 . ": Must create method '$sub()' for "
4336 . ref shift);
4337 return;
4338 }
4339 }
4340
4341 use overload
4342 fallback => 0,
4343 "." => \&main::_operator_dot,
4344 '!=' => \&main::_operator_not_equal,
4345 '==' => \&main::_operator_equal,
4346 ;
4347
4348 sub ranges {
4349 # Returns the array of ranges associated with this table.
4350
f998e60c 4351 no overloading;
051df77b 4352 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4353 }
4354
4355 sub add_alias {
4356 # Add a synonym for this table.
4357
4358 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4359
4360 my $self = shift;
4361 my $name = shift; # The name to add.
4362 my $pointer = shift; # What the alias hash should point to. For
4363 # map tables, this is the parent property;
4364 # for match tables, it is the table itself.
4365
4366 my %args = @_;
4367 my $loose_match = delete $args{'Fuzzy'};
4368
4369 my $make_pod_entry = delete $args{'Pod_Entry'};
4370 $make_pod_entry = $YES unless defined $make_pod_entry;
4371
4372 my $externally_ok = delete $args{'Externally_Ok'};
4373 $externally_ok = 1 unless defined $externally_ok;
4374
4375 my $status = delete $args{'Status'};
4376 $status = $NORMAL unless defined $status;
4377
4378 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4379
4380 # Capitalize the first letter of the alias unless it is one of the CJK
4381 # ones which specifically begins with a lower 'k'. Do this because
4382 # Unicode has varied whether they capitalize first letters or not, and
4383 # have later changed their minds and capitalized them, but not the
4384 # other way around. So do it always and avoid changes from release to
4385 # release
4386 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4387
ffe43484 4388 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4389
4390 # Figure out if should be loosely matched if not already specified.
4391 if (! defined $loose_match) {
4392
4393 # Is a loose_match if isn't null, and doesn't begin with an
4394 # underscore and isn't just a number
4395 if ($name ne ""
4396 && substr($name, 0, 1) ne '_'
4397 && $name !~ qr{^[0-9_.+-/]+$})
4398 {
4399 $loose_match = 1;
4400 }
4401 else {
4402 $loose_match = 0;
4403 }
4404 }
4405
4406 # If this alias has already been defined, do nothing.
4407 return if defined $find_table_from_alias{$addr}->{$name};
4408
4409 # That includes if it is standardly equivalent to an existing alias,
4410 # in which case, add this name to the list, so won't have to search
4411 # for it again.
4412 my $standard_name = main::standardize($name);
4413 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4414 $find_table_from_alias{$addr}->{$name}
4415 = $find_table_from_alias{$addr}->{$standard_name};
4416 return;
4417 }
4418
4419 # Set the index hash for this alias for future quick reference.
4420 $find_table_from_alias{$addr}->{$name} = $pointer;
4421 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4422 local $to_trace = 0 if main::DEBUG;
4423 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4424 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4425
4426
4427 # Put the new alias at the end of the list of aliases unless the final
4428 # element begins with an underscore (meaning it is for internal perl
4429 # use) or is all numeric, in which case, put the new one before that
4430 # one. This floats any all-numeric or underscore-beginning aliases to
4431 # the end. This is done so that they are listed last in output lists,
4432 # to encourage the user to use a better name (either more descriptive
4433 # or not an internal-only one) instead. This ordering is relied on
4434 # implicitly elsewhere in this program, like in short_name()
4435 my $list = $aliases{$addr};
4436 my $insert_position = (@$list == 0
4437 || (substr($list->[-1]->name, 0, 1) ne '_'
4438 && $list->[-1]->name =~ /\D/))
4439 ? @$list
4440 : @$list - 1;
4441 splice @$list,
4442 $insert_position,
4443 0,
4444 Alias->new($name, $loose_match, $make_pod_entry,
4445 $externally_ok, $status);
4446
4447 # This name may be shorter than any existing ones, so clear the cache
4448 # of the shortest, so will have to be recalculated.
f998e60c 4449 no overloading;
051df77b 4450 undef $short_name{pack 'J', $self};
99870f4d
KW
4451 return;
4452 }
4453
4454 sub short_name {
4455 # Returns a name suitable for use as the base part of a file name.
4456 # That is, shorter wins. It can return undef if there is no suitable
4457 # name. The name has all non-essential underscores removed.
4458
4459 # The optional second parameter is a reference to a scalar in which
4460 # this routine will store the length the returned name had before the
4461 # underscores were removed, or undef if the return is undef.
4462
4463 # The shortest name can change if new aliases are added. So using
4464 # this should be deferred until after all these are added. The code
4465 # that does that should clear this one's cache.
4466 # Any name with alphabetics is preferred over an all numeric one, even
4467 # if longer.
4468
4469 my $self = shift;
4470 my $nominal_length_ptr = shift;
4471 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4472
ffe43484 4473 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4474
4475 # For efficiency, don't recalculate, but this means that adding new
4476 # aliases could change what the shortest is, so the code that does
4477 # that needs to undef this.
4478 if (defined $short_name{$addr}) {
4479 if ($nominal_length_ptr) {
4480 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4481 }
4482 return $short_name{$addr};
4483 }
4484
4485 # Look at each alias
4486 foreach my $alias ($self->aliases()) {
4487
4488 # Don't use an alias that isn't ok to use for an external name.
4489 next if ! $alias->externally_ok;
4490
4491 my $name = main::Standardize($alias->name);
4492 trace $self, $name if main::DEBUG && $to_trace;
4493
4494 # Take the first one, or a shorter one that isn't numeric. This
4495 # relies on numeric aliases always being last in the array
4496 # returned by aliases(). Any alpha one will have precedence.
4497 if (! defined $short_name{$addr}
4498 || ($name =~ /\D/
4499 && length($name) < length($short_name{$addr})))
4500 {
4501 # Remove interior underscores.
4502 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4503
4504 $nominal_short_name_length{$addr} = length $name;
4505 }
4506 }
4507
4508 # If no suitable external name return undef
4509 if (! defined $short_name{$addr}) {
4510 $$nominal_length_ptr = undef if $nominal_length_ptr;
4511 return;
4512 }
4513
4514 # Don't allow a null external name.
4515 if ($short_name{$addr} eq "") {
4516 $short_name{$addr} = '_';
4517 $nominal_short_name_length{$addr} = 1;
4518 }
4519
4520 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4521
4522 if ($nominal_length_ptr) {
4523 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4524 }
4525 return $short_name{$addr};
4526 }
4527
4528 sub external_name {
4529 # Returns the external name that this table should be known by. This
4530 # is usually the short_name, but not if the short_name is undefined.
4531
4532 my $self = shift;
4533 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4534
4535 my $short = $self->short_name;
4536 return $short if defined $short;
4537
4538 return '_';
4539 }
4540
4541 sub add_description { # Adds the parameter as a short description.
4542
4543 my $self = shift;
4544 my $description = shift;
4545 chomp $description;
4546 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4547
f998e60c 4548 no overloading;
051df77b 4549 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
4550
4551 return;
4552 }
4553
4554 sub add_note { # Adds the parameter as a short note.
4555
4556 my $self = shift;
4557 my $note = shift;
4558 chomp $note;
4559 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4560
f998e60c 4561 no overloading;
051df77b 4562 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
4563
4564 return;
4565 }
4566
4567 sub add_comment { # Adds the parameter as a comment.
4568
4569 my $self = shift;
4570 my $comment = shift;
4571 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4572
4573 chomp $comment;
f998e60c
KW
4574
4575 no overloading;
051df77b 4576 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
4577
4578 return;
4579 }
4580
4581 sub comment {
4582 # Return the current comment for this table. If called in list
4583 # context, returns the array of comments. In scalar, returns a string
4584 # of each element joined together with a period ending each.
4585
4586 my $self = shift;
4587 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4588
ffe43484 4589 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 4590 my @list = @{$comment{$addr}};
99870f4d
KW
4591 return @list if wantarray;
4592 my $return = "";
4593 foreach my $sentence (@list) {
4594 $return .= '. ' if $return;
4595 $return .= $sentence;
4596 $return =~ s/\.$//;
4597 }
4598 $return .= '.' if $return;
4599 return $return;
4600 }
4601
4602 sub initialize {
4603 # Initialize the table with the argument which is any valid
4604 # initialization for range lists.
4605
4606 my $self = shift;
ffe43484 4607 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4608 my $initialization = shift;
4609 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4610
4611 # Replace the current range list with a new one of the same exact
4612 # type.
f998e60c
KW
4613 my $class = ref $range_list{$addr};
4614 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
4615 Initialize => $initialization);
4616 return;
4617
4618 }
4619
4620 sub header {
4621 # The header that is output for the table in the file it is written
4622 # in.
4623
4624 my $self = shift;
4625 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4626
4627 my $return = "";
4628 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4629 $return .= $HEADER;
f998e60c 4630 no overloading;
051df77b 4631 $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
99870f4d
KW
4632 return $return;
4633 }
4634
4635 sub write {
4636 # Write a representation of the table to its file.
4637
4638 my $self = shift;
4639 my $tab_stops = shift; # The number of tab stops over to put any
4640 # comment.
4641 my $suppress_value = shift; # Optional, if the value associated with
4642 # a range equals this one, don't write
4643 # the range
4644 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4645
ffe43484 4646 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4647
4648 # Start with the header
4649 my @OUT = $self->header;
4650
4651 # Then the comments
4652 push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4653 if $comment{$addr};
4654
4655 # Then any pre-body stuff.
4656 my $pre_body = $self->pre_body;
4657 push @OUT, $pre_body, "\n" if $pre_body;
4658
4659 # The main body looks like a 'here' document
4660 push @OUT, "return <<'END';\n";
4661
4662 if ($range_list{$addr}->is_empty) {
4663
4664 # This is a kludge for empty tables to silence a warning in
4665 # utf8.c, which can't really deal with empty tables, but it can
4666 # deal with a table that matches nothing, as the inverse of 'Any'
4667 # does.
4668 push @OUT, "!utf8::IsAny\n";
4669 }
4670 else {
4671 my $range_size_1 = $range_size_1{$addr};
4672
4673 # Output each range as part of the here document.
4674 for my $set ($range_list{$addr}->ranges) {
4675 my $start = $set->start;
4676 my $end = $set->end;
4677 my $value = $set->value;
4678
4679 # Don't output ranges whose value is the one to suppress
4680 next if defined $suppress_value && $value eq $suppress_value;
4681
4682 # If has or wants a single point range output
4683 if ($start == $end || $range_size_1) {
b1c167a3
KW
4684 if (ref $range_size_1 eq 'CODE') {
4685 for my $i ($start .. $end) {
4686 push @OUT, &$range_size_1($i, $value);
4687 }
4688 }
4689 else {
4690 for my $i ($start .. $end) {
4691 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4692 if ($output_names) {
4693 if (! defined $viacode[$i]) {
4694 $viacode[$i] =
4695 Property::property_ref('Perl_Charnames')
4696 ->value_of($i)
4697 || "";
4698 }
4699 $OUT[-1] =~ s/\n/\t# $viacode[$i]\n/;
9ef2b94f 4700 }
9ef2b94f 4701 }
99870f4d
KW
4702 }
4703 }
4704 else {
4705 push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4706
4707 # Add a comment with the size of the range, if requested.
4708 # Expand Tabs to make sure they all start in the same
4709 # column, and then unexpand to use mostly tabs.
0c07e538 4710 if (! $output_range_counts{$addr}) {
99870f4d
KW
4711 $OUT[-1] .= "\n";
4712 }
4713 else {
4714 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4715 my $count = main::clarify_number($end - $start + 1);
4716 use integer;
4717
4718 my $width = $tab_stops * 8 - 1;
4719 $OUT[-1] = sprintf("%-*s # [%s]\n",
4720 $width,
4721 $OUT[-1],
4722 $count);
4723 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4724 }
4725 }
4726 } # End of loop through all the table's ranges
4727 }
4728
4729 # Add anything that goes after the main body, but within the here
4730 # document,
4731 my $append_to_body = $self->append_to_body;
4732 push @OUT, $append_to_body if $append_to_body;
4733
4734 # And finish the here document.
4735 push @OUT, "END\n";
4736
4737 # All these files have a .pl suffix
4738 $file_path{$addr}->[-1] .= '.pl';
4739
4740 main::write($file_path{$addr}, \@OUT);
4741 return;
4742 }
4743
4744 sub set_status { # Set the table's status
4745 my $self = shift;
4746 my $status = shift; # The status enum value
4747 my $info = shift; # Any message associated with it.
4748 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4749
ffe43484 4750 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4751
4752 $status{$addr} = $status;
4753 $status_info{$addr} = $info;
4754 return;
4755 }
4756
4757 sub lock {
4758 # Don't allow changes to the table from now on. This stores a stack
4759 # trace of where it was called, so that later attempts to modify it
4760 # can immediately show where it got locked.
4761
4762 my $self = shift;
4763 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4764
ffe43484 4765 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4766
4767 $locked{$addr} = "";
4768
4769 my $line = (caller(0))[2];
4770 my $i = 1;
4771
4772 # Accumulate the stack trace
4773 while (1) {
4774 my ($pkg, $file, $caller_line, $caller) = caller $i++;
4775
4776 last unless defined $caller;
4777
4778 $locked{$addr} .= " called from $caller() at line $line\n";
4779 $line = $caller_line;
4780 }
4781 $locked{$addr} .= " called from main at line $line\n";
4782
4783 return;
4784 }
4785
4786 sub carp_if_locked {
4787 # Return whether a table is locked or not, and, by the way, complain
4788 # if is locked
4789
4790 my $self = shift;
4791 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4792
ffe43484 4793 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4794
4795 return 0 if ! $locked{$addr};
4796 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4797 return 1;
4798 }
4799
4800 sub set_file_path { # Set the final directory path for this table
4801 my $self = shift;
4802 # Rest of parameters passed on
4803
f998e60c 4804 no overloading;
051df77b 4805 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
4806 return
4807 }
4808
4809 # Accessors for the range list stored in this table. First for
4810 # unconditional
ea25a9b2 4811 for my $sub (qw(
99870f4d
KW
4812 contains
4813 count
4814 each_range
4815 hash
4816 is_empty
4817 max
4818 min
4819 range_count
4820 reset_each_range
4821 value_of
ea25a9b2 4822 ))
99870f4d
KW
4823 {
4824 no strict "refs";
4825 *$sub = sub {
4826 use strict "refs";
4827 my $self = shift;
f998e60c 4828 no overloading;
051df77b 4829 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
4830 }
4831 }
4832
4833 # Then for ones that should fail if locked
ea25a9b2 4834 for my $sub (qw(
99870f4d 4835 delete_range
ea25a9b2 4836 ))
99870f4d
KW
4837 {
4838 no strict "refs";
4839 *$sub = sub {
4840 use strict "refs";
4841 my $self = shift;
4842
4843 return if $self->carp_if_locked;
f998e60c 4844 no overloading;
051df77b 4845 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
4846 }
4847 }
4848
4849} # End closure
4850
4851package Map_Table;
4852use base '_Base_Table';
4853
4854# A Map Table is a table that contains the mappings from code points to
4855# values. There are two weird cases:
4856# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4857# are written in the table's file at the end of the table nonetheless. It
4858# requires specially constructed code to handle these; utf8.c can not read
4859# these in, so they should not go in $map_directory. As of this writing,
4860# the only case that these happen is for named sequences used in
4861# charnames.pm. But this code doesn't enforce any syntax on these, so
4862# something else could come along that uses it.
4863# 2) Specials are anything that doesn't fit syntactically into the body of the
4864# table. The ranges for these have a map type of non-zero. The code below
4865# knows about and handles each possible type. In most cases, these are
4866# written as part of the header.
4867#
4868# A map table deliberately can't be manipulated at will unlike match tables.
4869# This is because of the ambiguities having to do with what to do with
4870# overlapping code points. And there just isn't a need for those things;
4871# what one wants to do is just query, add, replace, or delete mappings, plus
4872# write the final result.
4873# However, there is a method to get the list of possible ranges that aren't in
4874# this table to use for defaulting missing code point mappings. And,
4875# map_add_or_replace_non_nulls() does allow one to add another table to this
4876# one, but it is clearly very specialized, and defined that the other's
4877# non-null values replace this one's if there is any overlap.
4878
4879sub trace { return main::trace(@_); }
4880
4881{ # Closure
4882
4883 main::setup_package();
4884
4885 my %default_map;
4886 # Many input files omit some entries; this gives what the mapping for the
4887 # missing entries should be
4888 main::set_access('default_map', \%default_map, 'r');
4889
4890 my %anomalous_entries;
4891 # Things that go in the body of the table which don't fit the normal
4892 # scheme of things, like having a range. Not much can be done with these
4893 # once there except to output them. This was created to handle named
4894 # sequences.
4895 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4896 main::set_access('anomalous_entries', # Append singular, read plural
4897 \%anomalous_entries,
4898 'readable_array');
4899
4900 my %format;
4901 # The format of the entries of the table. This is calculated from the
4902 # data in the table (or passed in the constructor). This is an enum e.g.,
4903 # $STRING_FORMAT
4904 main::set_access('format', \%format);
4905
4906 my %core_access;
4907 # This is a string, solely for documentation, indicating how one can get
4908 # access to this property via the Perl core.
4909 main::set_access('core_access', \%core_access, 'r', 's');
4910
4911 my %has_specials;
4912 # Boolean set when non-zero map-type ranges are added to this table,
4913 # which happens in only a few tables. This is purely for performance, to
4914 # avoid having to search through every table upon output, so if all the
4915 # non-zero maps got deleted before output, this would remain set, and the
4916 # only penalty would be performance. Currently, most map tables that get
4917 # output have specials in them, so this doesn't help that much anyway.
4918 main::set_access('has_specials', \%has_specials);
4919
4920 my %to_output_map;
4921 # Boolean as to whether or not to write out this map table
4922 main::set_access('to_output_map', \%to_output_map, 's');
4923
4924
4925 sub new {
4926 my $class = shift;
4927 my $name = shift;
4928
4929 my %args = @_;
4930
4931 # Optional initialization data for the table.
4932 my $initialize = delete $args{'Initialize'};
4933
4934 my $core_access = delete $args{'Core_Access'};
4935 my $default_map = delete $args{'Default_Map'};
4936 my $format = delete $args{'Format'};
4937 my $property = delete $args{'_Property'};
23e33b60 4938 my $full_name = delete $args{'Full_Name'};
99870f4d
KW
4939 # Rest of parameters passed on
4940
4941 my $range_list = Range_Map->new(Owner => $property);
4942
4943 my $self = $class->SUPER::new(
4944 Name => $name,
23e33b60
KW
4945 Complete_Name => $full_name,
4946 Full_Name => $full_name,
99870f4d
KW
4947 _Property => $property,
4948 _Range_List => $range_list,
4949 %args);
4950
ffe43484 4951 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4952
4953 $anomalous_entries{$addr} = [];
4954 $core_access{$addr} = $core_access;
4955 $default_map{$addr} = $default_map;
4956 $format{$addr} = $format;
4957
4958 $self->initialize($initialize) if defined $initialize;
4959
4960 return $self;
4961 }
4962
4963 use overload
4964 fallback => 0,
4965 qw("") => "_operator_stringify",
4966 ;
4967
4968 sub _operator_stringify {
4969 my $self = shift;
4970
4971 my $name = $self->property->full_name;
4972 $name = '""' if $name eq "";
4973 return "Map table for Property '$name'";
4974 }
4975
99870f4d
KW
4976 sub add_alias {
4977 # Add a synonym for this table (which means the property itself)
4978 my $self = shift;
4979 my $name = shift;
4980 # Rest of parameters passed on.
4981
4982 $self->SUPER::add_alias($name, $self->property, @_);
4983 return;
4984 }
4985
4986 sub add_map {
4987 # Add a range of code points to the list of specially-handled code
4988 # points. $MULTI_CP is assumed if the type of special is not passed
4989 # in.
4990
4991 my $self = shift;
4992 my $lower = shift;
4993 my $upper = shift;
4994 my $string = shift;
4995 my %args = @_;
4996
4997 my $type = delete $args{'Type'} || 0;
4998 # Rest of parameters passed on
4999
5000 # Can't change the table if locked.
5001 return if $self->carp_if_locked;
5002
ffe43484 5003 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5004
5005 $has_specials{$addr} = 1 if $type;
5006
5007 $self->_range_list->add_map($lower, $upper,
5008 $string,
5009 @_,
5010 Type => $type);
5011 return;
5012 }
5013
5014 sub append_to_body {
5015 # Adds to the written HERE document of the table's body any anomalous
5016 # entries in the table..
5017
5018 my $self = shift;
5019 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5020
ffe43484 5021 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5022
5023 return "" unless @{$anomalous_entries{$addr}};
5024 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5025 }
5026
5027 sub map_add_or_replace_non_nulls {
5028 # This adds the mappings in the table $other to $self. Non-null
5029 # mappings from $other override those in $self. It essentially merges
5030 # the two tables, with the second having priority except for null
5031 # mappings.
5032
5033 my $self = shift;
5034 my $other = shift;
5035 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5036
5037 return if $self->carp_if_locked;
5038
5039 if (! $other->isa(__PACKAGE__)) {
5040 Carp::my_carp_bug("$other should be a "
5041 . __PACKAGE__
5042 . ". Not a '"
5043 . ref($other)
5044 . "'. Not added;");
5045 return;
5046 }
5047
ffe43484
NC
5048 my $addr = do { no overloading; pack 'J', $self; };
5049 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5050
5051 local $to_trace = 0 if main::DEBUG;
5052
5053 my $self_range_list = $self->_range_list;
5054 my $other_range_list = $other->_range_list;
5055 foreach my $range ($other_range_list->ranges) {
5056 my $value = $range->value;
5057 next if $value eq "";
5058 $self_range_list->_add_delete('+',
5059 $range->start,
5060 $range->end,
5061 $value,
5062 Type => $range->type,
5063 Replace => $UNCONDITIONALLY);
5064 }
5065
5066 # Copy the specials information from the other table to $self
5067 if ($has_specials{$other_addr}) {
5068 $has_specials{$addr} = 1;
5069 }
5070
5071 return;
5072 }
5073
5074 sub set_default_map {
5075 # Define what code points that are missing from the input files should
5076 # map to
5077
5078 my $self = shift;
5079 my $map = shift;
5080 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5081
ffe43484 5082 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5083
5084 # Convert the input to the standard equivalent, if any (won't have any
5085 # for $STRING properties)
5086 my $standard = $self->_find_table_from_alias->{$map};
5087 $map = $standard->name if defined $standard;
5088
5089 # Warn if there already is a non-equivalent default map for this
5090 # property. Note that a default map can be a ref, which means that
5091 # what it actually means is delayed until later in the program, and it
5092 # IS permissible to override it here without a message.
5093 my $default_map = $default_map{$addr};
5094 if (defined $default_map
5095 && ! ref($default_map)
5096 && $default_map ne $map
5097 && main::Standardize($map) ne $default_map)
5098 {
5099 my $property = $self->property;
5100 my $map_table = $property->table($map);
5101 my $default_table = $property->table($default_map);
5102 if (defined $map_table
5103 && defined $default_table
5104 && $map_table != $default_table)
5105 {
5106 Carp::my_carp("Changing the default mapping for "
5107 . $property
5108 . " from $default_map to $map'");
5109 }
5110 }
5111
5112 $default_map{$addr} = $map;
5113
5114 # Don't also create any missing table for this map at this point,
5115 # because if we did, it could get done before the main table add is
5116 # done for PropValueAliases.txt; instead the caller will have to make
5117 # sure it exists, if desired.
5118 return;
5119 }
5120
5121 sub to_output_map {
5122 # Returns boolean: should we write this map table?
5123
5124 my $self = shift;
5125 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5126
ffe43484 5127 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5128
5129 # If overridden, use that
5130 return $to_output_map{$addr} if defined $to_output_map{$addr};
5131
5132 my $full_name = $self->full_name;
5133
5134 # If table says to output, do so; if says to suppress it, do do.
5135 return 1 if grep { $_ eq $full_name } @output_mapped_properties;
5136 return 0 if $self->status eq $SUPPRESSED;
5137
5138 my $type = $self->property->type;
5139
5140 # Don't want to output binary map tables even for debugging.
5141 return 0 if $type == $BINARY;
5142
5143 # But do want to output string ones.
5144 return 1 if $type == $STRING;
5145
5146 # Otherwise is an $ENUM, don't output it
5147 return 0;
5148 }
5149
5150 sub inverse_list {
5151 # Returns a Range_List that is gaps of the current table. That is,
5152 # the inversion
5153
5154 my $self = shift;
5155 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5156
5157 my $current = Range_List->new(Initialize => $self->_range_list,
5158 Owner => $self->property);
5159 return ~ $current;
5160 }
5161
5162 sub set_final_comment {
5163 # Just before output, create the comment that heads the file
5164 # containing this table.
5165
5166 my $self = shift;
5167 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5168
5169 # No sense generating a comment if aren't going to write it out.
5170 return if ! $self->to_output_map;
5171
ffe43484 5172 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5173
5174 my $property = $self->property;
5175
5176 # Get all the possible names for this property. Don't use any that
5177 # aren't ok for use in a file name, etc. This is perhaps causing that
5178 # flag to do double duty, and may have to be changed in the future to
5179 # have our own flag for just this purpose; but it works now to exclude
5180 # Perl generated synonyms from the lists for properties, where the
5181 # name is always the proper Unicode one.
5182 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5183
5184 my $count = $self->count;
5185 my $default_map = $default_map{$addr};
5186
5187 # The ranges that map to the default aren't output, so subtract that
5188 # to get those actually output. A property with matching tables
5189 # already has the information calculated.
5190 if ($property->type != $STRING) {
5191 $count -= $property->table($default_map)->count;
5192 }
5193 elsif (defined $default_map) {
5194
5195 # But for $STRING properties, must calculate now. Subtract the
5196 # count from each range that maps to the default.
5197 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5198 if ($range->value eq $default_map) {
5199 $count -= $range->end +1 - $range->start;
5200 }
5201 }
5202
5203 }
5204
5205 # Get a string version of $count with underscores in large numbers,
5206 # for clarity.
5207 my $string_count = main::clarify_number($count);
5208
5209 my $code_points = ($count == 1)
5210 ? 'single code point'
5211 : "$string_count code points";
5212
5213 my $mapping;
5214 my $these_mappings;
5215 my $are;
5216 if (@property_aliases <= 1) {
5217 $mapping = 'mapping';
5218 $these_mappings = 'this mapping';
5219 $are = 'is'
5220 }
5221 else {
5222 $mapping = 'synonymous mappings';
5223 $these_mappings = 'these mappings';
5224 $are = 'are'
5225 }
5226 my $cp;
5227 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5228 $cp = "any code point in Unicode Version $string_version";
5229 }
5230 else {
5231 my $map_to;
5232 if ($default_map eq "") {
5233 $map_to = 'the null string';
5234 }
5235 elsif ($default_map eq $CODE_POINT) {
5236 $map_to = "itself";
5237 }
5238 else {
5239 $map_to = "'$default_map'";
5240 }
5241 if ($count == 1) {
5242 $cp = "the single code point";
5243 }
5244 else {
5245 $cp = "one of the $code_points";
5246 }
5247 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5248 }
5249
5250 my $comment = "";
5251
5252 my $status = $self->status;
5253 if ($status) {
5254 my $warn = uc $status_past_participles{$status};
5255 $comment .= <<END;
5256
5257!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5258 All property or property=value combinations contained in this file are $warn.
5259 See $unicode_reference_url for what this means.
5260
5261END
5262 }
5263 $comment .= "This file returns the $mapping:\n";
5264
5265 for my $i (0 .. @property_aliases - 1) {
5266 $comment .= sprintf("%-8s%s\n",
5267 " ",
5268 $property_aliases[$i]->name . '(cp)'
5269 );
5270 }
5271 $comment .=
5272 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5273
5274 my $access = $core_access{$addr};
5275 if ($access) {
5276 $comment .= "accessible through the Perl core via $access.";
5277 }
5278 else {
5279 $comment .= "not accessible through the Perl core directly.";
5280 }
5281
5282 # And append any commentary already set from the actual property.
5283 $comment .= "\n\n" . $self->comment if $self->comment;
5284 if ($self->description) {
5285 $comment .= "\n\n" . join " ", $self->description;
5286 }
5287 if ($self->note) {
5288 $comment .= "\n\n" . join " ", $self->note;
5289 }
5290 $comment .= "\n";
5291
5292 if (! $self->perl_extension) {
5293 $comment .= <<END;
5294
5295For information about what this property really means, see:
5296$unicode_reference_url
5297END
5298 }
5299
5300 if ($count) { # Format differs for empty table
5301 $comment.= "\nThe format of the ";
5302 if ($self->range_size_1) {
5303 $comment.= <<END;
5304main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5305is in hex; MAPPING is what CODE_POINT maps to.
5306END
5307 }
5308 else {
5309
5310 # There are tables which end up only having one element per
5311 # range, but it is not worth keeping track of for making just
5312 # this comment a little better.
5313 $comment.= <<END;
5314non-comment portions of the main body of lines of this file is:
5315START\\tSTOP\\tMAPPING where START is the starting code point of the
5316range, in hex; STOP is the ending point, or if omitted, the range has just one
5317code point; MAPPING is what each code point between START and STOP maps to.
5318END
0c07e538 5319 if ($self->output_range_counts) {
99870f4d
KW
5320 $comment .= <<END;
5321Numbers in comments in [brackets] indicate how many code points are in the
5322range (omitted when the range is a single code point or if the mapping is to
5323the null string).
5324END
5325 }
5326 }
5327 }
5328 $self->set_comment(main::join_lines($comment));
5329 return;
5330 }
5331
5332 my %swash_keys; # Makes sure don't duplicate swash names.
5333
5334 sub pre_body {
5335 # Returns the string that should be output in the file before the main
5336 # body of this table. This includes some hash entries identifying the
5337 # format of the body, and what the single value should be for all
5338 # ranges missing from it. It also includes any code points which have
5339 # map_types that don't go in the main table.
5340
5341 my $self = shift;
5342 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5343
ffe43484 5344 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5345
5346 my $name = $self->property->swash_name;
5347
5348 if (defined $swash_keys{$name}) {
5349 Carp::my_carp(join_lines(<<END
5350Already created a swash name '$name' for $swash_keys{$name}. This means that
5351the same name desired for $self shouldn't be used. Bad News. This must be
5352fixed before production use, but proceeding anyway
5353END
5354 ));
5355 }
5356 $swash_keys{$name} = "$self";
5357
5358 my $default_map = $default_map{$addr};
5359
5360 my $pre_body = "";
5361 if ($has_specials{$addr}) {
5362
5363 # Here, some maps with non-zero type have been added to the table.
5364 # Go through the table and handle each of them. None will appear
5365 # in the body of the table, so delete each one as we go. The
5366 # code point count has already been calculated, so ok to delete
5367 # now.
5368
5369 my @multi_code_point_maps;
5370 my $has_hangul_syllables = 0;
5371
5372 # The key is the base name of the code point, and the value is an
5373 # array giving all the ranges that use this base name. Each range
5374 # is actually a hash giving the 'low' and 'high' values of it.
5375 my %names_ending_in_code_point;
5376
5377 # Inverse mapping. The list of ranges that have these kinds of
5378 # names. Each element contains the low, high, and base names in a
5379 # hash.
5380 my @code_points_ending_in_code_point;
5381
5382 my $range_map = $self->_range_list;
5383 foreach my $range ($range_map->ranges) {
5384 next unless $range->type != 0;
5385 my $low = $range->start;
5386 my $high = $range->end;
5387 my $map = $range->value;
5388 my $type = $range->type;
5389
5390 # No need to output the range if it maps to the default. And
5391 # the write method won't output it either, so no need to
5392 # delete it to keep it from being output, and is faster to
5393 # skip than to delete anyway.
5394 next if $map eq $default_map;
5395
5396 # Delete the range to keep write() from trying to output it
5397 $range_map->delete_range($low, $high);
5398
5399 # Switch based on the map type...
5400 if ($type == $HANGUL_SYLLABLE) {
5401
5402 # These are entirely algorithmically determinable based on
5403 # some constants furnished by Unicode; for now, just set a
5404 # flag to indicate that have them. Below we will output
5405 # the code that does the algorithm.
5406 $has_hangul_syllables = 1;
5407 }
5408 elsif ($type == $CP_IN_NAME) {
5409
5410 # If the name ends in the code point it represents, are
5411 # also algorithmically determinable, but need information
5412 # about the map to do so. Both the map and its inverse
5413 # are stored in data structures output in the file.
5414 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5415 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5416
5417 push @code_points_ending_in_code_point, { low => $low,
5418 high => $high,
5419 name => $map
5420 };
5421 }
5422 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5423
5424 # Multi-code point maps and null string maps have an entry
5425 # for each code point in the range. They use the same
5426 # output format.
5427 for my $code_point ($low .. $high) {
5428
5429 # The pack() below can't cope with surrogates.
5430 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5431 Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
5432 next;
5433 }
5434
5435 # Generate the hash entries for these in the form that
5436 # utf8.c understands.
5437 my $tostr = "";
5438 foreach my $to (split " ", $map) {
5439 if ($to !~ /^$code_point_re$/) {
5440 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5441 next;
5442 }
5443 $tostr .= sprintf "\\x{%s}", $to;
5444 }
5445
5446 # I (khw) have never waded through this line to
5447 # understand it well enough to comment it.
5448 my $utf8 = sprintf(qq["%s" => "$tostr",],
5449 join("", map { sprintf "\\x%02X", $_ }
5450 unpack("U0C*", pack("U", $code_point))));
5451
5452 # Add a comment so that a human reader can more easily
5453 # see what's going on.
5454 push @multi_code_point_maps,
5455 sprintf("%-45s # U+%04X => %s", $utf8,
5456 $code_point,
5457 $map);
5458 }
5459 }
5460 else {
5461 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Using type 0 instead");
5462 $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5463 }
5464 } # End of loop through all ranges
5465
5466 # Here have gone through the whole file. If actually generated
5467 # anything for each map type, add its respective header and
5468 # trailer
5469 if (@multi_code_point_maps) {
5470 $pre_body .= <<END;
5471
5472# Some code points require special handling because their mappings are each to
5473# multiple code points. These do not appear in the main body, but are defined
5474# in the hash below.
5475
76591e2b
KW
5476# Each key is the string of N bytes that together make up the UTF-8 encoding
5477# for the code point. (i.e. the same as looking at the code point's UTF-8
5478# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
99870f4d
KW
5479%utf8::ToSpec$name = (
5480END
5481 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5482 }
5483
5484 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5485
5486 # Convert these structures to output format.
5487 my $code_points_ending_in_code_point =
5488 main::simple_dumper(\@code_points_ending_in_code_point,
5489 ' ' x 8);
5490 my $names = main::simple_dumper(\%names_ending_in_code_point,
5491 ' ' x 8);
5492
5493 # Do the same with the Hangul names,
5494 my $jamo;
5495 my $jamo_l;
5496 my $jamo_v;
5497 my $jamo_t;
5498 my $jamo_re;
5499 if ($has_hangul_syllables) {
5500
5501 # Construct a regular expression of all the possible
5502 # combinations of the Hangul syllables.
5503 my @L_re; # Leading consonants
5504 for my $i ($LBase .. $LBase + $LCount - 1) {
5505 push @L_re, $Jamo{$i}
5506 }
5507 my @V_re; # Middle vowels
5508 for my $i ($VBase .. $VBase + $VCount - 1) {
5509 push @V_re, $Jamo{$i}
5510 }
5511 my @T_re; # Trailing consonants
5512 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5513 push @T_re, $Jamo{$i}
5514 }
5515
5516 # The whole re is made up of the L V T combination.
5517 $jamo_re = '('
5518 . join ('|', sort @L_re)
5519 . ')('
5520 . join ('|', sort @V_re)
5521 . ')('
5522 . join ('|', sort @T_re)
5523 . ')?';
5524
5525 # These hashes needed by the algorithm were generated
5526 # during reading of the Jamo.txt file
5527 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5528 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5529 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5530 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5531 }
5532
5533 $pre_body .= <<END;
5534
5535# To achieve significant memory savings when this file is read in,
5536# algorithmically derivable code points are omitted from the main body below.
5537# Instead, the following routines can be used to translate between name and
5538# code point and vice versa
5539
5540{ # Closure
5541
5542 # Matches legal code point. 4-6 hex numbers, If there are 6, the
5543 # first two must be '10'; if there are 5, the first must not be a '0'.
5544 my \$code_point_re = qr/$code_point_re/;
5545
5546 # In the following hash, the keys are the bases of names which includes
5547 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
5548 # of each key is another hash which is used to get the low and high ends
5549 # for each range of code points that apply to the name
5550 my %names_ending_in_code_point = (
5551$names
5552 );
5553
5554 # And the following array gives the inverse mapping from code points to
5555 # names. Lowest code points are first
5556 my \@code_points_ending_in_code_point = (
5557$code_points_ending_in_code_point
5558 );
5559END
5560 # Earlier releases didn't have Jamos. No sense outputting
5561 # them unless will be used.
5562 if ($has_hangul_syllables) {
5563 $pre_body .= <<END;
5564
5565 # Convert from code point to Jamo short name for use in composing Hangul
5566 # syllable names
5567 my %Jamo = (
5568$jamo
5569 );
5570
5571 # Leading consonant (can be null)
5572 my %Jamo_L = (
5573$jamo_l
5574 );
5575
5576 # Vowel
5577 my %Jamo_V = (
5578$jamo_v
5579 );
5580
5581 # Optional trailing consonant
5582 my %Jamo_T = (
5583$jamo_t
5584 );
5585
5586 # Computed re that splits up a Hangul name into LVT or LV syllables
5587 my \$syllable_re = qr/$jamo_re/;
5588
5589 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5590 my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5591
5592 # These constants names and values were taken from the Unicode standard,
5593 # version 5.1, section 3.12. They are used in conjunction with Hangul
5594 # syllables
6e5a209b
KW
5595 my \$SBase = $SBase_string;
5596 my \$LBase = $LBase_string;
5597 my \$VBase = $VBase_string;
5598 my \$TBase = $TBase_string;
5599 my \$SCount = $SCount;
5600 my \$LCount = $LCount;
5601 my \$VCount = $VCount;
5602 my \$TCount = $TCount;
99870f4d
KW
5603 my \$NCount = \$VCount * \$TCount;
5604END
5605 } # End of has Jamos
5606
5607 $pre_body .= << 'END';
5608
5609 sub name_to_code_point_special {
5610 my $name = shift;
5611
5612 # Returns undef if not one of the specially handled names; otherwise
5613 # returns the code point equivalent to the input name
5614END
5615 if ($has_hangul_syllables) {
5616 $pre_body .= << 'END';
5617
5618 if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5619 $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5620 return if $name !~ qr/^$syllable_re$/;
5621 my $L = $Jamo_L{$1};
5622 my $V = $Jamo_V{$2};
5623 my $T = (defined $3) ? $Jamo_T{$3} : 0;
5624 return ($L * $VCount + $V) * $TCount + $T + $SBase;
5625 }
5626END
5627 }
5628 $pre_body .= << 'END';
5629
5630 # Name must end in '-code_point' for this to handle.
5631 if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5632 return;
5633 }
5634
5635 my $base = $1;
5636 my $code_point = CORE::hex $2;
5637
5638 # Name must be one of the ones which has the code point in it.
5639 return if ! $names_ending_in_code_point{$base};
5640
5641 # Look through the list of ranges that apply to this name to see if
5642 # the code point is in one of them.
5643 for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5644 return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5645 next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5646
5647 # Here, the code point is in the range.
5648 return $code_point;
5649 }
5650
5651 # Here, looked like the name had a code point number in it, but
5652 # did not match one of the valid ones.
5653 return;
5654 }
5655
5656 sub code_point_to_name_special {
5657 my $code_point = shift;
5658
5659 # Returns the name of a code point if algorithmically determinable;
5660 # undef if not
5661END
5662 if ($has_hangul_syllables) {
5663 $pre_body .= << 'END';
5664
5665 # If in the Hangul range, calculate the name based on Unicode's
5666 # algorithm
5667 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5668 use integer;
5669 my $SIndex = $code_point - $SBase;
5670 my $L = $LBase + $SIndex / $NCount;
5671 my $V = $VBase + ($SIndex % $NCount) / $TCount;
5672 my $T = $TBase + $SIndex % $TCount;
03e1aa51 5673 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
99870f4d
KW
5674 $name .= $Jamo{$T} if $T != $TBase;
5675 return $name;
5676 }
5677END
5678 }
5679 $pre_body .= << 'END';
5680
5681 # Look through list of these code points for one in range.
5682 foreach my $hash (@code_points_ending_in_code_point) {
5683 return if $code_point < $hash->{'low'};
5684 if ($code_point <= $hash->{'high'}) {
5685 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5686 }
5687 }
5688 return; # None found
5689 }
5690} # End closure
5691
5692END
5693 } # End of has hangul or code point in name maps.
5694 } # End of has specials
5695
5696 # Calculate the format of the table if not already done.
5697 my $format = $format{$addr};
5698 my $property = $self->property;
5699 my $type = $property->type;
5700 if (! defined $format) {
5701 if ($type == $BINARY) {
5702
5703 # Don't bother checking the values, because we elsewhere
5704 # verify that a binary table has only 2 values.
5705 $format = $BINARY_FORMAT;
5706 }
5707 else {
5708 my @ranges = $self->_range_list->ranges;
5709
5710 # default an empty table based on its type and default map
5711 if (! @ranges) {
5712
5713 # But it turns out that the only one we can say is a
5714 # non-string (besides binary, handled above) is when the
5715 # table is a string and the default map is to a code point
5716 if ($type == $STRING && $default_map eq $CODE_POINT) {
5717 $format = $HEX_FORMAT;
5718 }
5719 else {
5720 $format = $STRING_FORMAT;
5721 }
5722 }
5723 else {
5724
5725 # Start with the most restrictive format, and as we find
5726 # something that doesn't fit with that, change to the next
5727 # most restrictive, and so on.
5728 $format = $DECIMAL_FORMAT;
5729 foreach my $range (@ranges) {
5730 my $map = $range->value;
5731 if ($map ne $default_map) {
5732 last if $format eq $STRING_FORMAT; # already at
5733 # least
5734 # restrictive
5735 $format = $INTEGER_FORMAT
5736 if $format eq $DECIMAL_FORMAT
5737 && $map !~ / ^ [0-9] $ /x;
5738 $format = $FLOAT_FORMAT
5739 if $format eq $INTEGER_FORMAT
5740 && $map !~ / ^ -? [0-9]+ $ /x;
5741 $format = $RATIONAL_FORMAT
5742 if $format eq $FLOAT_FORMAT
5743 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5744 $format = $HEX_FORMAT
5745 if $format eq $RATIONAL_FORMAT
5746 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5747 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5748 && $map =~ /[^0-9A-F]/;
5749 }
5750 }
5751 }
5752 }
5753 } # end of calculating format
5754
5755 my $return = <<END;
5756# The name this swash is to be known by, with the format of the mappings in
5757# the main body of the table, and what all code points missing from this file
5758# map to.
5759\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5760END
5761 my $missing = $default_map;
5762 if ($missing eq $CODE_POINT
5763 && $format ne $HEX_FORMAT
5764 && ! defined $format{$addr}) # Is expected if was manually set
5765 {
5766 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5767 }
5768 $format{$addr} = $format;
5769 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5770 if ($missing eq $CODE_POINT) {
5771 $return .= ' # code point maps to itself';
5772 }
5773 elsif ($missing eq "") {
5774 $return .= ' # code point maps to the null string';
5775 }
5776 $return .= "\n";
5777
5778 $return .= $pre_body;
5779
5780 return $return;
5781 }
5782
5783 sub write {
5784 # Write the table to the file.
5785
5786 my $self = shift;
5787 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5788
ffe43484 5789 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5790
5791 return $self->SUPER::write(
5792 ($self->property == $block)
5793 ? 7 # block file needs more tab stops
5794 : 3,
5795 $default_map{$addr}); # don't write defaulteds
5796 }
5797
5798 # Accessors for the underlying list that should fail if locked.
ea25a9b2 5799 for my $sub (qw(
99870f4d 5800 add_duplicate
ea25a9b2 5801 ))
99870f4d
KW
5802 {
5803 no strict "refs";
5804 *$sub = sub {
5805 use strict "refs";
5806 my $self = shift;
5807
5808 return if $self->carp_if_locked;
5809 return $self->_range_list->$sub(@_);
5810 }
5811 }
5812} # End closure for Map_Table
5813
5814package Match_Table;
5815use base '_Base_Table';
5816
5817# A Match table is one which is a list of all the code points that have
5818# the same property and property value, for use in \p{property=value}
5819# constructs in regular expressions. It adds very little data to the base
5820# structure, but many methods, as these lists can be combined in many ways to
5821# form new ones.
5822# There are only a few concepts added:
5823# 1) Equivalents and Relatedness.
5824# Two tables can match the identical code points, but have different names.
5825# This always happens when there is a perl single form extension
5826# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
5827# tables are set to be related, with the Perl extension being a child, and
5828# the Unicode property being the parent.
5829#
5830# It may be that two tables match the identical code points and we don't
5831# know if they are related or not. This happens most frequently when the
5832# Block and Script properties have the exact range. But note that a
5833# revision to Unicode could add new code points to the script, which would
5834# now have to be in a different block (as the block was filled, or there
5835# would have been 'Unknown' script code points in it and they wouldn't have
5836# been identical). So we can't rely on any two properties from Unicode
5837# always matching the same code points from release to release, and thus
5838# these tables are considered coincidentally equivalent--not related. When
5839# two tables are unrelated but equivalent, one is arbitrarily chosen as the
5840# 'leader', and the others are 'equivalents'. This concept is useful
5841# to minimize the number of tables written out. Only one file is used for
5842# any identical set of code points, with entries in Heavy.pl mapping all
5843# the involved tables to it.
5844#
5845# Related tables will always be identical; we set them up to be so. Thus
5846# if the Unicode one is deprecated, the Perl one will be too. Not so for
5847# unrelated tables. Relatedness makes generating the documentation easier.
5848#
5849# 2) Conflicting. It may be that there will eventually be name clashes, with
5850# the same name meaning different things. For a while, there actually were
5851# conflicts, but they have so far been resolved by changing Perl's or
5852# Unicode's definitions to match the other, but when this code was written,
5853# it wasn't clear that that was what was going to happen. (Unicode changed
5854# because of protests during their beta period.) Name clashes are warned
5855# about during compilation, and the documentation. The generated tables
5856# are sane, free of name clashes, because the code suppresses the Perl
5857# version. But manual intervention to decide what the actual behavior
5858# should be may be required should this happen. The introductory comments
5859# have more to say about this.
5860
5861sub standardize { return main::standardize($_[0]); }
5862sub trace { return main::trace(@_); }
5863
5864
5865{ # Closure
5866
5867 main::setup_package();
5868
5869 my %leader;
5870 # The leader table of this one; initially $self.
5871 main::set_access('leader', \%leader, 'r');
5872
5873 my %equivalents;
5874 # An array of any tables that have this one as their leader
5875 main::set_access('equivalents', \%equivalents, 'readable_array');
5876
5877 my %parent;
5878 # The parent table to this one, initially $self. This allows us to
5879 # distinguish between equivalent tables that are related, and those which
5880 # may not be, but share the same output file because they match the exact
5881 # same set of code points in the current Unicode release.
5882 main::set_access('parent', \%parent, 'r');
5883
5884 my %children;
5885 # An array of any tables that have this one as their parent
5886 main::set_access('children', \%children, 'readable_array');
5887
5888 my %conflicting;
5889 # Array of any tables that would have the same name as this one with
5890 # a different meaning. This is used for the generated documentation.
5891 main::set_access('conflicting', \%conflicting, 'readable_array');
5892
5893 my %matches_all;
5894 # Set in the constructor for tables that are expected to match all code
5895 # points.
5896 main::set_access('matches_all', \%matches_all, 'r');
5897
5898 sub new {
5899 my $class = shift;
5900
5901 my %args = @_;
5902
5903 # The property for which this table is a listing of property values.
5904 my $property = delete $args{'_Property'};
5905
23e33b60
KW
5906 my $name = delete $args{'Name'};
5907 my $full_name = delete $args{'Full_Name'};
5908 $full_name = $name if ! defined $full_name;
5909
99870f4d
KW
5910 # Optional
5911 my $initialize = delete $args{'Initialize'};
5912 my $matches_all = delete $args{'Matches_All'} || 0;
5913 # Rest of parameters passed on.
5914
5915 my $range_list = Range_List->new(Initialize => $initialize,
5916 Owner => $property);
5917
23e33b60
KW
5918 my $complete = $full_name;
5919 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
5920 # but this helps debug if it
5921 # does
5922 # The complete name for a match table includes it's property in a
5923 # compound form 'property=table', except if the property is the
5924 # pseudo-property, perl, in which case it is just the single form,
5925 # 'table' (If you change the '=' must also change the ':' in lots of
5926 # places in this program that assume an equal sign)
5927 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 5928
99870f4d 5929 my $self = $class->SUPER::new(%args,
23e33b60
KW
5930 Name => $name,
5931 Complete_Name => $complete,
5932 Full_Name => $full_name,
99870f4d
KW
5933 _Property => $property,
5934 _Range_List => $range_list,
5935 );
ffe43484 5936 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5937
5938 $conflicting{$addr} = [ ];
5939 $equivalents{$addr} = [ ];
5940 $children{$addr} = [ ];
5941 $matches_all{$addr} = $matches_all;
5942 $leader{$addr} = $self;
5943 $parent{$addr} = $self;
5944
5945 return $self;
5946 }
5947
5948 # See this program's beginning comment block about overloading these.
5949 use overload
5950 fallback => 0,
5951 qw("") => "_operator_stringify",
5952 '=' => sub {
5953 my $self = shift;
5954
5955 return if $self->carp_if_locked;
5956 return $self;
5957 },
5958
5959 '+' => sub {
5960 my $self = shift;
5961 my $other = shift;
5962
5963 return $self->_range_list + $other;
5964 },
5965 '&' => sub {
5966 my $self = shift;
5967 my $other = shift;
5968
5969 return $self->_range_list & $other;
5970 },
5971 '+=' => sub {
5972 my $self = shift;
5973 my $other = shift;
5974
5975 return if $self->carp_if_locked;
5976
ffe43484 5977 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5978
5979 if (ref $other) {
5980
5981 # Change the range list of this table to be the
5982 # union of the two.
5983 $self->_set_range_list($self->_range_list
5984 + $other);
5985 }
5986 else { # $other is just a simple value
5987 $self->add_range($other, $other);
5988 }
5989 return $self;
5990 },
5991 '-' => sub { my $self = shift;
5992 my $other = shift;
5993 my $reversed = shift;
5994
5995 if ($reversed) {
5996 Carp::my_carp_bug("Can't cope with a "
5997 . __PACKAGE__
5998 . " being the first parameter in a '-'. Subtraction ignored.");
5999 return;
6000 }
6001
6002 return $self->_range_list - $other;
6003 },
6004 '~' => sub { my $self = shift;
6005 return ~ $self->_range_list;
6006 },
6007 ;
6008
6009 sub _operator_stringify {
6010 my $self = shift;
6011
23e33b60 6012 my $name = $self->complete_name;
99870f4d
KW
6013 return "Table '$name'";
6014 }
6015
6016 sub add_alias {
6017 # Add a synonym for this table. See the comments in the base class
6018
6019 my $self = shift;
6020 my $name = shift;
6021 # Rest of parameters passed on.
6022
6023 $self->SUPER::add_alias($name, $self, @_);
6024 return;
6025 }
6026
6027 sub add_conflicting {
6028 # Add the name of some other object to the list of ones that name
6029 # clash with this match table.
6030
6031 my $self = shift;
6032 my $conflicting_name = shift; # The name of the conflicting object
6033 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6034 my $conflicting_object = shift; # Optional, the conflicting object
6035 # itself. This is used to
6036 # disambiguate the text if the input
6037 # name is identical to any of the
6038 # aliases $self is known by.
6039 # Sometimes the conflicting object is
6040 # merely hypothetical, so this has to
6041 # be an optional parameter.
6042 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6043
ffe43484 6044 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6045
6046 # Check if the conflicting name is exactly the same as any existing
6047 # alias in this table (as long as there is a real object there to
6048 # disambiguate with).
6049 if (defined $conflicting_object) {
6050 foreach my $alias ($self->aliases) {
6051 if ($alias->name eq $conflicting_name) {
6052
6053 # Here, there is an exact match. This results in
6054 # ambiguous comments, so disambiguate by changing the
6055 # conflicting name to its object's complete equivalent.
6056 $conflicting_name = $conflicting_object->complete_name;
6057 last;
6058 }
6059 }
6060 }
6061
6062 # Convert to the \p{...} final name
6063 $conflicting_name = "\\$p" . "{$conflicting_name}";
6064
6065 # Only add once
6066 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6067
6068 push @{$conflicting{$addr}}, $conflicting_name;
6069
6070 return;
6071 }
6072
6073 sub is_equivalent_to {
6074 # Return boolean of whether or not the other object is a table of this
6075 # type and has been marked equivalent to this one.
6076
6077 my $self = shift;
6078 my $other = shift;
6079 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6080
6081 return 0 if ! defined $other; # Can happen for incomplete early
6082 # releases
6083 unless ($other->isa(__PACKAGE__)) {
6084 my $ref_other = ref $other;
6085 my $ref_self = ref $self;
6086 Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6087 return 0;
6088 }
6089
6090 # Two tables are equivalent if they have the same leader.
f998e60c 6091 no overloading;
051df77b 6092 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6093 return;
6094 }
6095
6096 sub matches_identically_to {
6097 # Return a boolean as to whether or not two tables match identical
6098 # sets of code points.
6099
6100 my $self = shift;
6101 my $other = shift;
6102 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6103
6104 unless ($other->isa(__PACKAGE__)) {
6105 my $ref_other = ref $other;
6106 my $ref_self = ref $self;
6107 Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6108 return 0;
6109 }
6110
6111 # These are ordered in increasing real time to figure out (at least
6112 # until a patch changes that and doesn't change this)
6113 return 0 if $self->max != $other->max;
6114 return 0 if $self->min != $other->min;
6115 return 0 if $self->range_count != $other->range_count;
6116 return 0 if $self->count != $other->count;
6117
6118 # Here they could be identical because all the tests above passed.
6119 # The loop below is somewhat simpler since we know they have the same
6120 # number of elements. Compare range by range, until reach the end or
6121 # find something that differs.
6122 my @a_ranges = $self->_range_list->ranges;
6123 my @b_ranges = $other->_range_list->ranges;
6124 for my $i (0 .. @a_ranges - 1) {
6125 my $a = $a_ranges[$i];
6126 my $b = $b_ranges[$i];
6127 trace "self $a; other $b" if main::DEBUG && $to_trace;
6128 return 0 if $a->start != $b->start || $a->end != $b->end;
6129 }
6130 return 1;
6131 }
6132
6133 sub set_equivalent_to {
6134 # Set $self equivalent to the parameter table.
6135 # The required Related => 'x' parameter is a boolean indicating
6136 # whether these tables are related or not. If related, $other becomes
6137 # the 'parent' of $self; if unrelated it becomes the 'leader'
6138 #
6139 # Related tables share all characteristics except names; equivalents
6140 # not quite so many.
6141 # If they are related, one must be a perl extension. This is because
6142 # we can't guarantee that Unicode won't change one or the other in a
6143 # later release even if they are idential now.
6144
6145 my $self = shift;
6146 my $other = shift;
6147
6148 my %args = @_;
6149 my $related = delete $args{'Related'};
6150
6151 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6152
6153 return if ! defined $other; # Keep on going; happens in some early
6154 # Unicode releases.
6155
6156 if (! defined $related) {
6157 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6158 $related = 0;
6159 }
6160
6161 # If already are equivalent, no need to re-do it; if subroutine
6162 # returns null, it found an error, also do nothing
6163 my $are_equivalent = $self->is_equivalent_to($other);
6164 return if ! defined $are_equivalent || $are_equivalent;
6165
ffe43484 6166 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6167 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d
KW
6168
6169 if ($related &&
6170 ! $other->perl_extension
6171 && ! $current_leader->perl_extension)
6172 {
6173 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6174 $related = 0;
6175 }
6176
ffe43484
NC
6177 my $leader = do { no overloading; pack 'J', $current_leader; };
6178 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6179
6180 # Any tables that are equivalent to or children of this table must now
6181 # instead be equivalent to or (children) to the new leader (parent),
6182 # still equivalent. The equivalency includes their matches_all info,
6183 # and for related tables, their status
6184 # All related tables are of necessity equivalent, but the converse
6185 # isn't necessarily true
6186 my $status = $other->status;
6187 my $status_info = $other->status_info;
6188 my $matches_all = $matches_all{other_addr};
6189 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6190 next if $table == $other;
6191 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6192
ffe43484 6193 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6194 $leader{$table_addr} = $other;
6195 $matches_all{$table_addr} = $matches_all;
6196 $self->_set_range_list($other->_range_list);
6197 push @{$equivalents{$other_addr}}, $table;
6198 if ($related) {
6199 $parent{$table_addr} = $other;
6200 push @{$children{$other_addr}}, $table;
6201 $table->set_status($status, $status_info);
6202 }
6203 }
6204
6205 # Now that we've declared these to be equivalent, any changes to one
6206 # of the tables would invalidate that equivalency.
6207 $self->lock;
6208 $other->lock;
6209 return;
6210 }
6211
6212 sub add_range { # Add a range to the list for this table.
6213 my $self = shift;
6214 # Rest of parameters passed on
6215
6216 return if $self->carp_if_locked;
6217 return $self->_range_list->add_range(@_);
6218 }
6219
99870f4d
KW
6220 sub pre_body { # Does nothing for match tables.
6221 return
6222 }
6223
6224 sub append_to_body { # Does nothing for match tables.
6225 return
6226 }
6227
6228 sub write {
6229 my $self = shift;
6230 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6231
6232 return $self->SUPER::write(2); # 2 tab stops
6233 }
6234
6235 sub set_final_comment {
6236 # This creates a comment for the file that is to hold the match table
6237 # $self. It is somewhat convoluted to make the English read nicely,
6238 # but, heh, it's just a comment.
6239 # This should be called only with the leader match table of all the
6240 # ones that share the same file. It lists all such tables, ordered so
6241 # that related ones are together.
6242
6243 my $leader = shift; # Should only be called on the leader table of
6244 # an equivalent group
6245 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6246
ffe43484 6247 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6248
6249 if ($leader{$addr} != $leader) {
6250 Carp::my_carp_bug(<<END
6251set_final_comment() must be called on a leader table, which $leader is not.
6252It is equivalent to $leader{$addr}. No comment created
6253END
6254 );
6255 return;
6256 }
6257
6258 # Get the number of code points matched by each of the tables in this
6259 # file, and add underscores for clarity.
6260 my $count = $leader->count;
6261 my $string_count = main::clarify_number($count);
6262
6263 my $loose_count = 0; # how many aliases loosely matched
6264 my $compound_name = ""; # ? Are any names compound?, and if so, an
6265 # example
6266 my $properties_with_compound_names = 0; # count of these
6267
6268
6269 my %flags; # The status flags used in the file
6270 my $total_entries = 0; # number of entries written in the comment
6271 my $matches_comment = ""; # The portion of the comment about the
6272 # \p{}'s
6273 my @global_comments; # List of all the tables' comments that are
6274 # there before this routine was called.
6275
6276 # Get list of all the parent tables that are equivalent to this one
6277 # (including itself).
6278 my @parents = grep { $parent{main::objaddr $_} == $_ }
6279 main::uniques($leader, @{$equivalents{$addr}});
6280 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6281 # tables
6282
6283 for my $parent (@parents) {
6284
6285 my $property = $parent->property;
6286
6287 # Special case 'N' tables in properties with two match tables when
6288 # the other is a 'Y' one. These are likely to be binary tables,
6289 # but not necessarily. In either case, \P{} will match the
6290 # complement of \p{}, and so if something is a synonym of \p, the
6291 # complement of that something will be the synonym of \P. This
6292 # would be true of any property with just two match tables, not
6293 # just those whose values are Y and N; but that would require a
6294 # little extra work, and there are none such so far in Unicode.
6295 my $perl_p = 'p'; # which is it? \p{} or \P{}
6296 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6297
6298 if (scalar $property->tables == 2
6299 && $parent == $property->table('N')
6300 && defined (my $yes = $property->table('Y')))
6301 {
ffe43484 6302 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6303 @yes_perl_synonyms
6304 = grep { $_->property == $perl }
6305 main::uniques($yes,
6306 $parent{$yes_addr},
6307 $parent{$yes_addr}->children);
6308
6309 # But these synonyms are \P{} ,not \p{}
6310 $perl_p = 'P';
6311 }
6312
6313 my @description; # Will hold the table description
6314 my @note; # Will hold the table notes.
6315 my @conflicting; # Will hold the table conflicts.
6316
6317 # Look at the parent, any yes synonyms, and all the children
ffe43484 6318 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6319 for my $table ($parent,
6320 @yes_perl_synonyms,
f998e60c 6321 @{$children{$parent_addr}})
99870f4d 6322 {
ffe43484 6323 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6324 my $table_property = $table->property;
6325
6326 # Tables are separated by a blank line to create a grouping.
6327 $matches_comment .= "\n" if $matches_comment;
6328
6329 # The table is named based on the property and value
6330 # combination it is for, like script=greek. But there may be
6331 # a number of synonyms for each side, like 'sc' for 'script',
6332 # and 'grek' for 'greek'. Any combination of these is a valid
6333 # name for this table. In this case, there are three more,
6334 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6335 # listing all possible combinations in the comment, we make
6336 # sure that each synonym occurs at least once, and add
6337 # commentary that the other combinations are possible.
6338 my @property_aliases = $table_property->aliases;
6339 my @table_aliases = $table->aliases;
6340
6341 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6342
6343 # The alias lists above are already ordered in the order we
6344 # want to output them. To ensure that each synonym is listed,
6345 # we must use the max of the two numbers.
6346 my $listed_combos = main::max(scalar @table_aliases,
6347 scalar @property_aliases);
6348 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6349
6350 my $property_had_compound_name = 0;
6351
6352 for my $i (0 .. $listed_combos - 1) {
6353 $total_entries++;
6354
6355 # The current alias for the property is the next one on
6356 # the list, or if beyond the end, start over. Similarly
6357 # for the table (\p{prop=table})
6358 my $property_alias = $property_aliases
6359 [$i % @property_aliases]->name;
6360 my $table_alias_object = $table_aliases
6361 [$i % @table_aliases];
6362 my $table_alias = $table_alias_object->name;
6363 my $loose_match = $table_alias_object->loose_match;
6364
6365 if ($table_alias !~ /\D/) { # Clarify large numbers.
6366 $table_alias = main::clarify_number($table_alias)
6367 }
6368
6369 # Add a comment for this alias combination
6370 my $current_match_comment;
6371 if ($table_property == $perl) {
6372 $current_match_comment = "\\$perl_p"
6373 . "{$table_alias}";
6374 }
6375 else {
6376 $current_match_comment
6377 = "\\p{$property_alias=$table_alias}";
6378 $property_had_compound_name = 1;
6379 }
6380
6381 # Flag any abnormal status for this table.
6382 my $flag = $property->status
6383 || $table->status
6384 || $table_alias_object->status;
37e2e78e
KW
6385 if ($flag) {
6386 if ($flag ne $PLACEHOLDER) {
6387 $flags{$flag} = $status_past_participles{$flag};
6388 } else {
6389 $flags{$flag} = <<END;
6390a placeholder because it is not in Version $string_version of Unicode, but is
6391needed by the Perl core to work gracefully. Because it is not in this version
6392of Unicode, it will not be listed in $pod_file.pod
6393END
6394 }
6395 }
99870f4d
KW
6396
6397 $loose_count++;
6398
6399 # Pretty up the comment. Note the \b; it says don't make
6400 # this line a continuation.
6401 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6402 $flag,
6403 " " x 7,
6404 $current_match_comment);
6405 } # End of generating the entries for this table.
6406
6407 # Save these for output after this group of related tables.
6408 push @description, $table->description;
6409 push @note, $table->note;
6410 push @conflicting, $table->conflicting;
6411
37e2e78e
KW
6412 # And this for output after all the tables.
6413 push @global_comments, $table->comment;
6414
99870f4d
KW
6415 # Compute an alternate compound name using the final property
6416 # synonym and the first table synonym with a colon instead of
6417 # the equal sign used elsewhere.
6418 if ($property_had_compound_name) {
6419 $properties_with_compound_names ++;
6420 if (! $compound_name || @property_aliases > 1) {
6421 $compound_name = $property_aliases[-1]->name
6422 . ': '
6423 . $table_aliases[0]->name;
6424 }
6425 }
6426 } # End of looping through all children of this table
6427
6428 # Here have assembled in $matches_comment all the related tables
6429 # to the current parent (preceded by the same info for all the
6430 # previous parents). Put out information that applies to all of
6431 # the current family.
6432 if (@conflicting) {
6433
6434 # But output the conflicting information now, as it applies to
6435 # just this table.
6436 my $conflicting = join ", ", @conflicting;
6437 if ($conflicting) {
6438 $matches_comment .= <<END;
6439
6440 Note that contrary to what you might expect, the above is NOT the same as
6441END
6442 $matches_comment .= "any of: " if @conflicting > 1;
6443 $matches_comment .= "$conflicting\n";
6444 }
6445 }
6446 if (@description) {
6447 $matches_comment .= "\n Meaning: "
6448 . join('; ', @description)
6449 . "\n";
6450 }
6451 if (@note) {
6452 $matches_comment .= "\n Note: "
6453 . join("\n ", @note)
6454 . "\n";
6455 }
6456 } # End of looping through all tables
6457
6458
6459 my $code_points;
6460 my $match;
6461 my $any_of_these;
6462 if ($count == 1) {
6463 $match = 'matches';
6464 $code_points = 'single code point';
6465 }
6466 else {
6467 $match = 'match';
6468 $code_points = "$string_count code points";
6469 }
6470
6471 my $synonyms;
6472 my $entries;
6473 if ($total_entries <= 1) {
6474 $synonyms = "";
6475 $entries = 'entry';
6476 $any_of_these = 'this'
6477 }
6478 else {
6479 $synonyms = " any of the following regular expression constructs";
6480 $entries = 'entries';
6481 $any_of_these = 'any of these'
6482 }
6483
6484 my $comment = "";
6485 if ($has_unrelated) {
6486 $comment .= <<END;
6487This file is for tables that are not necessarily related: To conserve
6488resources, every table that matches the identical set of code points in this
6489version of Unicode uses this file. Each one is listed in a separate group
6490below. It could be that the tables will match the same set of code points in
6491other Unicode releases, or it could be purely coincidence that they happen to
6492be the same in Unicode $string_version, and hence may not in other versions.
6493
6494END
6495 }
6496
6497 if (%flags) {
6498 foreach my $flag (sort keys %flags) {
6499 $comment .= <<END;
37e2e78e 6500'$flag' below means that this form is $flags{$flag}.
99870f4d 6501END
37e2e78e
KW
6502 next if $flag eq $PLACEHOLDER;
6503 $comment .= "Consult $pod_file.pod\n";
99870f4d
KW
6504 }
6505 $comment .= "\n";
6506 }
6507
6508 $comment .= <<END;
6509This file returns the $code_points in Unicode Version $string_version that
6510$match$synonyms:
6511
6512$matches_comment
37e2e78e 6513$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
6514including if adding or subtracting white space, underscore, and hyphen
6515characters matters or doesn't matter, and other permissible syntactic
6516variants. Upper/lower case distinctions never matter.
6517END
6518
6519 if ($compound_name) {
6520 $comment .= <<END;
6521
6522A colon can be substituted for the equals sign, and
6523END
6524 if ($properties_with_compound_names > 1) {
6525 $comment .= <<END;
6526within each group above,
6527END
6528 }
6529 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6530
6531 # Note the \b below, it says don't make that line a continuation.
6532 $comment .= <<END;
6533anything to the left of the equals (or colon) can be combined with anything to
6534the right. Thus, for example,
6535$compound_name
6536\bis also valid.
6537END
6538 }
6539
6540 # And append any comment(s) from the actual tables. They are all
6541 # gathered here, so may not read all that well.
37e2e78e
KW
6542 if (@global_comments) {
6543 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
6544 }
99870f4d
KW
6545
6546 if ($count) { # The format differs if no code points, and needs no
6547 # explanation in that case
6548 $comment.= <<END;
6549
6550The format of the lines of this file is:
6551END
6552 $comment.= <<END;
6553START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6554STOP is the ending point, or if omitted, the range has just one code point.
6555END
0c07e538 6556 if ($leader->output_range_counts) {
99870f4d
KW
6557 $comment .= <<END;
6558Numbers in comments in [brackets] indicate how many code points are in the
6559range.
6560END
6561 }
6562 }
6563
6564 $leader->set_comment(main::join_lines($comment));
6565 return;
6566 }
6567
6568 # Accessors for the underlying list
ea25a9b2 6569 for my $sub (qw(
99870f4d
KW
6570 get_valid_code_point
6571 get_invalid_code_point
ea25a9b2 6572 ))
99870f4d
KW
6573 {
6574 no strict "refs";
6575 *$sub = sub {
6576 use strict "refs";
6577 my $self = shift;
6578
6579 return $self->_range_list->$sub(@_);
6580 }
6581 }
6582} # End closure for Match_Table
6583
6584package Property;
6585
6586# The Property class represents a Unicode property, or the $perl
6587# pseudo-property. It contains a map table initialized empty at construction
6588# time, and for properties accessible through regular expressions, various
6589# match tables, created through the add_match_table() method, and referenced
6590# by the table('NAME') or tables() methods, the latter returning a list of all
6591# of the match tables. Otherwise table operations implicitly are for the map
6592# table.
6593#
6594# Most of the data in the property is actually about its map table, so it
6595# mostly just uses that table's accessors for most methods. The two could
6596# have been combined into one object, but for clarity because of their
6597# differing semantics, they have been kept separate. It could be argued that
6598# the 'file' and 'directory' fields should be kept with the map table.
6599#
6600# Each property has a type. This can be set in the constructor, or in the
6601# set_type accessor, but mostly it is figured out by the data. Every property
6602# starts with unknown type, overridden by a parameter to the constructor, or
6603# as match tables are added, or ranges added to the map table, the data is
6604# inspected, and the type changed. After the table is mostly or entirely
6605# filled, compute_type() should be called to finalize they analysis.
6606#
6607# There are very few operations defined. One can safely remove a range from
6608# the map table, and property_add_or_replace_non_nulls() adds the maps from another
6609# table to this one, replacing any in the intersection of the two.
6610
6611sub standardize { return main::standardize($_[0]); }
6612sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6613
6614{ # Closure
6615
6616 # This hash will contain as keys, all the aliases of all properties, and
6617 # as values, pointers to their respective property objects. This allows
6618 # quick look-up of a property from any of its names.
6619 my %alias_to_property_of;
6620
6621 sub dump_alias_to_property_of {
6622 # For debugging
6623
6624 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6625 return;
6626 }
6627
6628 sub property_ref {
6629 # This is a package subroutine, not called as a method.
6630 # If the single parameter is a literal '*' it returns a list of all
6631 # defined properties.
6632 # Otherwise, the single parameter is a name, and it returns a pointer
6633 # to the corresponding property object, or undef if none.
6634 #
6635 # Properties can have several different names. The 'standard' form of
6636 # each of them is stored in %alias_to_property_of as they are defined.
6637 # But it's possible that this subroutine will be called with some
6638 # variant, so if the initial lookup fails, it is repeated with the
6639 # standarized form of the input name. If found, besides returning the
6640 # result, the input name is added to the list so future calls won't
6641 # have to do the conversion again.
6642
6643 my $name = shift;
6644
6645 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6646
6647 if (! defined $name) {
6648 Carp::my_carp_bug("Undefined input property. No action taken.");
6649 return;
6650 }
6651
6652 return main::uniques(values %alias_to_property_of) if $name eq '*';
6653
6654 # Return cached result if have it.
6655 my $result = $alias_to_property_of{$name};
6656 return $result if defined $result;
6657
6658 # Convert the input to standard form.
6659 my $standard_name = standardize($name);
6660
6661 $result = $alias_to_property_of{$standard_name};
6662 return unless defined $result; # Don't cache undefs
6663
6664 # Cache the result before returning it.
6665 $alias_to_property_of{$name} = $result;
6666 return $result;
6667 }
6668
6669
6670 main::setup_package();
6671
6672 my %map;
6673 # A pointer to the map table object for this property
6674 main::set_access('map', \%map);
6675
6676 my %full_name;
6677 # The property's full name. This is a duplicate of the copy kept in the
6678 # map table, but is needed because stringify needs it during
6679 # construction of the map table, and then would have a chicken before egg
6680 # problem.
6681 main::set_access('full_name', \%full_name, 'r');
6682
6683 my %table_ref;
6684 # This hash will contain as keys, all the aliases of any match tables
6685 # attached to this property, and as values, the pointers to their
6686 # respective tables. This allows quick look-up of a table from any of its
6687 # names.
6688 main::set_access('table_ref', \%table_ref);
6689
6690 my %type;
6691 # The type of the property, $ENUM, $BINARY, etc
6692 main::set_access('type', \%type, 'r');
6693
6694 my %file;
6695 # The filename where the map table will go (if actually written).
6696 # Normally defaulted, but can be overridden.
6697 main::set_access('file', \%file, 'r', 's');
6698
6699 my %directory;
6700 # The directory where the map table will go (if actually written).
6701 # Normally defaulted, but can be overridden.
6702 main::set_access('directory', \%directory, 's');
6703
6704 my %pseudo_map_type;
6705 # This is used to affect the calculation of the map types for all the
6706 # ranges in the table. It should be set to one of the values that signify
6707 # to alter the calculation.
6708 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6709
6710 my %has_only_code_point_maps;
6711 # A boolean used to help in computing the type of data in the map table.
6712 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6713
6714 my %unique_maps;
6715 # A list of the first few distinct mappings this property has. This is
6716 # used to disambiguate between binary and enum property types, so don't
6717 # have to keep more than three.
6718 main::set_access('unique_maps', \%unique_maps);
6719
6720 sub new {
6721 # The only required parameter is the positionally first, name. All
6722 # other parameters are key => value pairs. See the documentation just
6723 # above for the meanings of the ones not passed directly on to the map
6724 # table constructor.
6725
6726 my $class = shift;
6727 my $name = shift || "";
6728
6729 my $self = property_ref($name);
6730 if (defined $self) {
6731 my $options_string = join ", ", @_;
6732 $options_string = ". Ignoring options $options_string" if $options_string;
6733 Carp::my_carp("$self is already in use. Using existing one$options_string;");
6734 return $self;
6735 }
6736
6737 my %args = @_;
6738
6739 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 6740 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6741
6742 $directory{$addr} = delete $args{'Directory'};
6743 $file{$addr} = delete $args{'File'};
6744 $full_name{$addr} = delete $args{'Full_Name'} || $name;
6745 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6746 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6747 # Rest of parameters passed on.
6748
6749 $has_only_code_point_maps{$addr} = 1;
6750 $table_ref{$addr} = { };
6751 $unique_maps{$addr} = { };
6752
6753 $map{$addr} = Map_Table->new($name,
6754 Full_Name => $full_name{$addr},
6755 _Alias_Hash => \%alias_to_property_of,
6756 _Property => $self,
6757 %args);
6758 return $self;
6759 }
6760
6761 # See this program's beginning comment block about overloading the copy
6762 # constructor. Few operations are defined on properties, but a couple are
6763 # useful. It is safe to take the inverse of a property, and to remove a
6764 # single code point from it.
6765 use overload
6766 fallback => 0,
6767 qw("") => "_operator_stringify",
6768 "." => \&main::_operator_dot,
6769 '==' => \&main::_operator_equal,
6770 '!=' => \&main::_operator_not_equal,
6771 '=' => sub { return shift },
6772 '-=' => "_minus_and_equal",
6773 ;
6774
6775 sub _operator_stringify {
6776 return "Property '" . shift->full_name . "'";
6777 }
6778
6779 sub _minus_and_equal {
6780 # Remove a single code point from the map table of a property.
6781
6782 my $self = shift;
6783 my $other = shift;
6784 my $reversed = shift;
6785 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6786
6787 if (ref $other) {
6788 Carp::my_carp_bug("Can't cope with a "
6789 . ref($other)
6790 . " argument to '-='. Subtraction ignored.");
6791 return $self;
6792 }
6793 elsif ($reversed) { # Shouldnt happen in a -=, but just in case
6794 Carp::my_carp_bug("Can't cope with a "
6795 . __PACKAGE__
6796 . " being the first parameter in a '-='. Subtraction ignored.");
6797 return $self;
6798 }
6799 else {
f998e60c 6800 no overloading;
051df77b 6801 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
6802 }
6803 return $self;
6804 }
6805
6806 sub add_match_table {
6807 # Add a new match table for this property, with name given by the
6808 # parameter. It returns a pointer to the table.
6809
6810 my $self = shift;
6811 my $name = shift;
6812 my %args = @_;
6813
ffe43484 6814 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6815
6816 my $table = $table_ref{$addr}{$name};
6817 my $standard_name = main::standardize($name);
6818 if (defined $table
6819 || (defined ($table = $table_ref{$addr}{$standard_name})))
6820 {
6821 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
6822 $table_ref{$addr}{$name} = $table;
6823 return $table;
6824 }
6825 else {
6826
6827 # See if this is a perl extension, if not passed in.
6828 my $perl_extension = delete $args{'Perl_Extension'};
6829 $perl_extension
6830 = $self->perl_extension if ! defined $perl_extension;
6831
6832 $table = Match_Table->new(
6833 Name => $name,
6834 Perl_Extension => $perl_extension,
6835 _Alias_Hash => $table_ref{$addr},
6836 _Property => $self,
6837
6838 # gets property's status by default
6839 Status => $self->status,
6840 _Status_Info => $self->status_info,
6841 %args,
6842 Internal_Only_Warning => 1); # Override any
6843 # input param
6844 return unless defined $table;
6845 }
6846
6847 # Save the names for quick look up
6848 $table_ref{$addr}{$standard_name} = $table;
6849 $table_ref{$addr}{$name} = $table;
6850
6851 # Perhaps we can figure out the type of this property based on the
6852 # fact of adding this match table. First, string properties don't
6853 # have match tables; second, a binary property can't have 3 match
6854 # tables
6855 if ($type{$addr} == $UNKNOWN) {
6856 $type{$addr} = $NON_STRING;
6857 }
6858 elsif ($type{$addr} == $STRING) {
6859 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
6860 $type{$addr} = $NON_STRING;
6861 }
6862 elsif ($type{$addr} != $ENUM) {
6863 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6864 && $type{$addr} == $BINARY)
6865 {
6866 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
6867 $type{$addr} = $ENUM;
6868 }
6869 }
6870
6871 return $table;
6872 }
6873
6874 sub table {
6875 # Return a pointer to the match table (with name given by the
6876 # parameter) associated with this property; undef if none.
6877
6878 my $self = shift;
6879 my $name = shift;
6880 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6881
ffe43484 6882 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6883
6884 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6885
6886 # If quick look-up failed, try again using the standard form of the
6887 # input name. If that succeeds, cache the result before returning so
6888 # won't have to standardize this input name again.
6889 my $standard_name = main::standardize($name);
6890 return unless defined $table_ref{$addr}{$standard_name};
6891
6892 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6893 return $table_ref{$addr}{$name};
6894 }
6895
6896 sub tables {
6897 # Return a list of pointers to all the match tables attached to this
6898 # property
6899
f998e60c 6900 no overloading;
051df77b 6901 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
6902 }
6903
6904 sub directory {
6905 # Returns the directory the map table for this property should be
6906 # output in. If a specific directory has been specified, that has
6907 # priority; 'undef' is returned if the type isn't defined;
6908 # or $map_directory for everything else.
6909
ffe43484 6910 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
6911
6912 return $directory{$addr} if defined $directory{$addr};
6913 return undef if $type{$addr} == $UNKNOWN;
6914 return $map_directory;
6915 }
6916
6917 sub swash_name {
6918 # Return the name that is used to both:
6919 # 1) Name the file that the map table is written to.
6920 # 2) The name of swash related stuff inside that file.
6921 # The reason for this is that the Perl core historically has used
6922 # certain names that aren't the same as the Unicode property names.
6923 # To continue using these, $file is hard-coded in this file for those,
6924 # but otherwise the standard name is used. This is different from the
6925 # external_name, so that the rest of the files, like in lib can use
6926 # the standard name always, without regard to historical precedent.
6927
6928 my $self = shift;
6929 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6930
ffe43484 6931 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6932
6933 return $file{$addr} if defined $file{$addr};
6934 return $map{$addr}->external_name;
6935 }
6936
6937 sub to_create_match_tables {
6938 # Returns a boolean as to whether or not match tables should be
6939 # created for this property.
6940
6941 my $self = shift;
6942 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6943
6944 # The whole point of this pseudo property is match tables.
6945 return 1 if $self == $perl;
6946
ffe43484 6947 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6948
6949 # Don't generate tables of code points that match the property values
6950 # of a string property. Such a list would most likely have many
6951 # property values, each with just one or very few code points mapping
6952 # to it.
6953 return 0 if $type{$addr} == $STRING;
6954
6955 # Don't generate anything for unimplemented properties.
6956 return 0 if grep { $self->complete_name eq $_ }
6957 @unimplemented_properties;
6958 # Otherwise, do.
6959 return 1;
6960 }
6961
6962 sub property_add_or_replace_non_nulls {
6963 # This adds the mappings in the property $other to $self. Non-null
6964 # mappings from $other override those in $self. It essentially merges
6965 # the two properties, with the second having priority except for null
6966 # mappings.
6967
6968 my $self = shift;
6969 my $other = shift;
6970 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6971
6972 if (! $other->isa(__PACKAGE__)) {
6973 Carp::my_carp_bug("$other should be a "
6974 . __PACKAGE__
6975 . ". Not a '"
6976 . ref($other)
6977 . "'. Not added;");
6978 return;
6979 }
6980
f998e60c 6981 no overloading;
051df77b 6982 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
6983 }
6984
6985 sub set_type {
6986 # Set the type of the property. Mostly this is figured out by the
6987 # data in the table. But this is used to set it explicitly. The
6988 # reason it is not a standard accessor is that when setting a binary
6989 # property, we need to make sure that all the true/false aliases are
6990 # present, as they were omitted in early Unicode releases.
6991
6992 my $self = shift;
6993 my $type = shift;
6994 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6995
6996 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
6997 Carp::my_carp("Unrecognized type '$type'. Type not set");
6998 return;
6999 }
7000
051df77b 7001 { no overloading; $type{pack 'J', $self} = $type; }
99870f4d
KW
7002 return if $type != $BINARY;
7003
7004 my $yes = $self->table('Y');
7005 $yes = $self->table('Yes') if ! defined $yes;
7006 $yes = $self->add_match_table('Y') if ! defined $yes;
7007 $yes->add_alias('Yes');
7008 $yes->add_alias('T');
7009 $yes->add_alias('True');
7010
7011 my $no = $self->table('N');
7012 $no = $self->table('No') if ! defined $no;
7013 $no = $self->add_match_table('N') if ! defined $no;
7014 $no->add_alias('No');
7015 $no->add_alias('F');
7016 $no->add_alias('False');
7017 return;
7018 }
7019
7020 sub add_map {
7021 # Add a map to the property's map table. This also keeps
7022 # track of the maps so that the property type can be determined from
7023 # its data.
7024
7025 my $self = shift;
7026 my $start = shift; # First code point in range
7027 my $end = shift; # Final code point in range
7028 my $map = shift; # What the range maps to.
7029 # Rest of parameters passed on.
7030
ffe43484 7031 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7032
7033 # If haven't the type of the property, gather information to figure it
7034 # out.
7035 if ($type{$addr} == $UNKNOWN) {
7036
7037 # If the map contains an interior blank or dash, or most other
7038 # nonword characters, it will be a string property. This
7039 # heuristic may actually miss some string properties. If so, they
7040 # may need to have explicit set_types called for them. This
7041 # happens in the Unihan properties.
7042 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7043 || $map =~ / [^\w.\/\ -] /x)
7044 {
7045 $self->set_type($STRING);
7046
7047 # $unique_maps is used for disambiguating between ENUM and
7048 # BINARY later; since we know the property is not going to be
7049 # one of those, no point in keeping the data around
7050 undef $unique_maps{$addr};
7051 }
7052 else {
7053
7054 # Not necessarily a string. The final decision has to be
7055 # deferred until all the data are in. We keep track of if all
7056 # the values are code points for that eventual decision.
7057 $has_only_code_point_maps{$addr} &=
7058 $map =~ / ^ $code_point_re $/x;
7059
7060 # For the purposes of disambiguating between binary and other
7061 # enumerations at the end, we keep track of the first three
7062 # distinct property values. Once we get to three, we know
7063 # it's not going to be binary, so no need to track more.
7064 if (scalar keys %{$unique_maps{$addr}} < 3) {
7065 $unique_maps{$addr}{main::standardize($map)} = 1;
7066 }
7067 }
7068 }
7069
7070 # Add the mapping by calling our map table's method
7071 return $map{$addr}->add_map($start, $end, $map, @_);
7072 }
7073
7074 sub compute_type {
7075 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7076 # should be called after the property is mostly filled with its maps.
7077 # We have been keeping track of what the property values have been,
7078 # and now have the necessary information to figure out the type.
7079
7080 my $self = shift;
7081 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7082
ffe43484 7083 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7084
7085 my $type = $type{$addr};
7086
7087 # If already have figured these out, no need to do so again, but we do
7088 # a double check on ENUMS to make sure that a string property hasn't
7089 # improperly been classified as an ENUM, so continue on with those.
7090 return if $type == $STRING || $type == $BINARY;
7091
7092 # If every map is to a code point, is a string property.
7093 if ($type == $UNKNOWN
7094 && ($has_only_code_point_maps{$addr}
7095 || (defined $map{$addr}->default_map
7096 && $map{$addr}->default_map eq "")))
7097 {
7098 $self->set_type($STRING);
7099 }
7100 else {
7101
7102 # Otherwise, it is to some sort of enumeration. (The case where
7103 # it is a Unicode miscellaneous property, and treated like a
7104 # string in this program is handled in add_map()). Distinguish
7105 # between binary and some other enumeration type. Of course, if
7106 # there are more than two values, it's not binary. But more
7107 # subtle is the test that the default mapping is defined means it
7108 # isn't binary. This in fact may change in the future if Unicode
7109 # changes the way its data is structured. But so far, no binary
7110 # properties ever have @missing lines for them, so the default map
7111 # isn't defined for them. The few properties that are two-valued
7112 # and aren't considered binary have the default map defined
7113 # starting in Unicode 5.0, when the @missing lines appeared; and
7114 # this program has special code to put in a default map for them
7115 # for earlier than 5.0 releases.
7116 if ($type == $ENUM
7117 || scalar keys %{$unique_maps{$addr}} > 2
7118 || defined $self->default_map)
7119 {
7120 my $tables = $self->tables;
7121 my $count = $self->count;
7122 if ($verbosity && $count > 500 && $tables/$count > .1) {
7123 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7124 }
7125 $self->set_type($ENUM);
7126 }
7127 else {
7128 $self->set_type($BINARY);
7129 }
7130 }
7131 undef $unique_maps{$addr}; # Garbage collect
7132 return;
7133 }
7134
7135 # Most of the accessors for a property actually apply to its map table.
7136 # Setup up accessor functions for those, referring to %map
ea25a9b2 7137 for my $sub (qw(
99870f4d
KW
7138 add_alias
7139 add_anomalous_entry
7140 add_comment
7141 add_conflicting
7142 add_description
7143 add_duplicate
7144 add_note
7145 aliases
7146 comment
7147 complete_name
7148 core_access
7149 count
7150 default_map
7151 delete_range
7152 description
7153 each_range
7154 external_name
7155 file_path
7156 format
7157 initialize
7158 inverse_list
7159 is_empty
7160 name
7161 note
7162 perl_extension
7163 property
7164 range_count
7165 ranges
7166 range_size_1
7167 reset_each_range
7168 set_comment
7169 set_core_access
7170 set_default_map
7171 set_file_path
7172 set_final_comment
7173 set_range_size_1
7174 set_status
7175 set_to_output_map
7176 short_name
7177 status
7178 status_info
7179 to_output_map
7180 value_of
7181 write
ea25a9b2 7182 ))
99870f4d
KW
7183 # 'property' above is for symmetry, so that one can take
7184 # the property of a property and get itself, and so don't
7185 # have to distinguish between properties and tables in
7186 # calling code
7187 {
7188 no strict "refs";
7189 *$sub = sub {
7190 use strict "refs";
7191 my $self = shift;
f998e60c 7192 no overloading;
051df77b 7193 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7194 }
7195 }
7196
7197
7198} # End closure
7199
7200package main;
7201
7202sub join_lines($) {
7203 # Returns lines of the input joined together, so that they can be folded
7204 # properly.
7205 # This causes continuation lines to be joined together into one long line
7206 # for folding. A continuation line is any line that doesn't begin with a
7207 # space or "\b" (the latter is stripped from the output). This is so
7208 # lines can be be in a HERE document so as to fit nicely in the terminal
7209 # width, but be joined together in one long line, and then folded with
7210 # indents, '#' prefixes, etc, properly handled.
7211 # A blank separates the joined lines except if there is a break; an extra
7212 # blank is inserted after a period ending a line.
7213
7214 # Intialize the return with the first line.
7215 my ($return, @lines) = split "\n", shift;
7216
7217 # If the first line is null, it was an empty line, add the \n back in
7218 $return = "\n" if $return eq "";
7219
7220 # Now join the remainder of the physical lines.
7221 for my $line (@lines) {
7222
7223 # An empty line means wanted a blank line, so add two \n's to get that
7224 # effect, and go to the next line.
7225 if (length $line == 0) {
7226 $return .= "\n\n";
7227 next;
7228 }
7229
7230 # Look at the last character of what we have so far.
7231 my $previous_char = substr($return, -1, 1);
7232
7233 # And at the next char to be output.
7234 my $next_char = substr($line, 0, 1);
7235
7236 if ($previous_char ne "\n") {
7237
7238 # Here didn't end wth a nl. If the next char a blank or \b, it
7239 # means that here there is a break anyway. So add a nl to the
7240 # output.
7241 if ($next_char eq " " || $next_char eq "\b") {
7242 $previous_char = "\n";
7243 $return .= $previous_char;
7244 }
7245
7246 # Add an extra space after periods.
7247 $return .= " " if $previous_char eq '.';
7248 }
7249
7250 # Here $previous_char is still the latest character to be output. If
7251 # it isn't a nl, it means that the next line is to be a continuation
7252 # line, with a blank inserted between them.
7253 $return .= " " if $previous_char ne "\n";
7254
7255 # Get rid of any \b
7256 substr($line, 0, 1) = "" if $next_char eq "\b";
7257
7258 # And append this next line.
7259 $return .= $line;
7260 }
7261
7262 return $return;
7263}
7264
7265sub simple_fold($;$$$) {
7266 # Returns a string of the input (string or an array of strings) folded
7267 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7268 # a \n
7269 # This is tailored for the kind of text written by this program,
7270 # especially the pod file, which can have very long names with
7271 # underscores in the middle, or words like AbcDefgHij.... We allow
7272 # breaking in the middle of such constructs if the line won't fit
7273 # otherwise. The break in such cases will come either just after an
7274 # underscore, or just before one of the Capital letters.
7275
7276 local $to_trace = 0 if main::DEBUG;
7277
7278 my $line = shift;
7279 my $prefix = shift; # Optional string to prepend to each output
7280 # line
7281 $prefix = "" unless defined $prefix;
7282
7283 my $hanging_indent = shift; # Optional number of spaces to indent
7284 # continuation lines
7285 $hanging_indent = 0 unless $hanging_indent;
7286
7287 my $right_margin = shift; # Optional number of spaces to narrow the
7288 # total width by.
7289 $right_margin = 0 unless defined $right_margin;
7290
7291 # Call carp with the 'nofold' option to avoid it from trying to call us
7292 # recursively
7293 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7294
7295 # The space available doesn't include what's automatically prepended
7296 # to each line, or what's reserved on the right.
7297 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7298 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7299
7300 if (DEBUG && $hanging_indent >= $max) {
7301 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7302 $hanging_indent = 0;
7303 }
7304
7305 # First, split into the current physical lines.
7306 my @line;
7307 if (ref $line) { # Better be an array, because not bothering to
7308 # test
7309 foreach my $line (@{$line}) {
7310 push @line, split /\n/, $line;
7311 }
7312 }
7313 else {
7314 @line = split /\n/, $line;
7315 }
7316
7317 #local $to_trace = 1 if main::DEBUG;
7318 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7319
7320 # Look at each current physical line.
7321 for (my $i = 0; $i < @line; $i++) {
7322 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7323 #local $to_trace = 1 if main::DEBUG;
7324 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7325
7326 # Remove prefix, because will be added back anyway, don't want
7327 # doubled prefix
7328 $line[$i] =~ s/^$prefix//;
7329
7330 # Remove trailing space
7331 $line[$i] =~ s/\s+\Z//;
7332
7333 # If the line is too long, fold it.
7334 if (length $line[$i] > $max) {
7335 my $remainder;
7336
7337 # Here needs to fold. Save the leading space in the line for
7338 # later.
7339 $line[$i] =~ /^ ( \s* )/x;
7340 my $leading_space = $1;
7341 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7342
7343 # If character at final permissible position is white space,
7344 # fold there, which will delete that white space
7345 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7346 $remainder = substr($line[$i], $max);
7347 $line[$i] = substr($line[$i], 0, $max - 1);
7348 }
7349 else {
7350
7351 # Otherwise fold at an acceptable break char closest to
7352 # the max length. Look at just the maximal initial
7353 # segment of the line
7354 my $segment = substr($line[$i], 0, $max - 1);
7355 if ($segment =~
7356 /^ ( .{$hanging_indent} # Don't look before the
7357 # indent.
7358 \ * # Don't look in leading
7359 # blanks past the indent
7360 [^ ] .* # Find the right-most
7361 (?: # acceptable break:
7362 [ \s = ] # space or equal
7363 | - (?! [.0-9] ) # or non-unary minus.
7364 ) # $1 includes the character
7365 )/x)
7366 {
7367 # Split into the initial part that fits, and remaining
7368 # part of the input
7369 $remainder = substr($line[$i], length $1);
7370 $line[$i] = $1;
7371 trace $line[$i] if DEBUG && $to_trace;
7372 trace $remainder if DEBUG && $to_trace;
7373 }
7374
7375 # If didn't find a good breaking spot, see if there is a
7376 # not-so-good breaking spot. These are just after
7377 # underscores or where the case changes from lower to
7378 # upper. Use \a as a soft hyphen, but give up
7379 # and don't break the line if there is actually a \a
7380 # already in the input. We use an ascii character for the
7381 # soft-hyphen to avoid any attempt by miniperl to try to
7382 # access the files that this program is creating.
7383 elsif ($segment !~ /\a/
7384 && ($segment =~ s/_/_\a/g
7385 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7386 {
7387 # Here were able to find at least one place to insert
7388 # our substitute soft hyphen. Find the right-most one
7389 # and replace it by a real hyphen.
7390 trace $segment if DEBUG && $to_trace;
7391 substr($segment,
7392 rindex($segment, "\a"),
7393 1) = '-';
7394
7395 # Then remove the soft hyphen substitutes.
7396 $segment =~ s/\a//g;
7397 trace $segment if DEBUG && $to_trace;
7398
7399 # And split into the initial part that fits, and
7400 # remainder of the line
7401 my $pos = rindex($segment, '-');
7402 $remainder = substr($line[$i], $pos);
7403 trace $remainder if DEBUG && $to_trace;
7404 $line[$i] = substr($segment, 0, $pos + 1);
7405 }
7406 }
7407
7408 # Here we know if we can fold or not. If we can, $remainder
7409 # is what remains to be processed in the next iteration.
7410 if (defined $remainder) {
7411 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7412
7413 # Insert the folded remainder of the line as a new element
7414 # of the array. (It may still be too long, but we will
7415 # deal with that next time through the loop.) Omit any
7416 # leading space in the remainder.
7417 $remainder =~ s/^\s+//;
7418 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7419
7420 # But then indent by whichever is larger of:
7421 # 1) the leading space on the input line;
7422 # 2) the hanging indent.
7423 # This preserves indentation in the original line.
7424 my $lead = ($leading_space)
7425 ? length $leading_space
7426 : $hanging_indent;
7427 $lead = max($lead, $hanging_indent);
7428 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7429 }
7430 }
7431
7432 # Ready to output the line. Get rid of any trailing space
7433 # And prefix by the required $prefix passed in.
7434 $line[$i] =~ s/\s+$//;
7435 $line[$i] = "$prefix$line[$i]\n";
7436 } # End of looping through all the lines.
7437
7438 return join "", @line;
7439}
7440
7441sub property_ref { # Returns a reference to a property object.
7442 return Property::property_ref(@_);
7443}
7444
7445sub force_unlink ($) {
7446 my $filename = shift;
7447 return unless file_exists($filename);
7448 return if CORE::unlink($filename);
7449
7450 # We might need write permission
7451 chmod 0777, $filename;
7452 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
7453 return;
7454}
7455
7456sub write ($\@) {
7457 # Given a filename and a reference to an array of lines, write the lines
7458 # to the file
7459 # Filename can be given as an arrayref of directory names
7460
7461 my $file = shift;
7462 my $lines_ref = shift;
7463 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7464
7465 if (! defined $lines_ref) {
7466 Carp::my_carp("Missing lines to write parameter for $file. Writing skipped;");
7467 return;
7468 }
7469
7470 # Get into a single string if an array, and get rid of, in Unix terms, any
7471 # leading '.'
7472 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7473 $file = File::Spec->canonpath($file);
7474
7475 # If has directories, make sure that they all exist
7476 (undef, my $directories, undef) = File::Spec->splitpath($file);
7477 File::Path::mkpath($directories) if $directories && ! -d $directories;
7478
7479 push @files_actually_output, $file;
7480
430ada4c 7481 unless (@$lines_ref) {
99870f4d
KW
7482 Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7483 }
7484
7485 force_unlink ($file);
7486
7487 my $OUT;
7488 if (not open $OUT, ">", $file) {
7489 Carp::my_carp("can't open $file for output. Skipping this file: $!");
7490 return;
7491 }
430ada4c
NC
7492
7493 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
7494 close $OUT or die Carp::my_carp("close '$file' failed: $!");
7495
99870f4d
KW
7496 print "$file written.\n" if $verbosity >= $VERBOSE;
7497
99870f4d
KW
7498 return;
7499}
7500
7501
7502sub Standardize($) {
7503 # This converts the input name string into a standardized equivalent to
7504 # use internally.
7505
7506 my $name = shift;
7507 unless (defined $name) {
7508 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
7509 return;
7510 }
7511
7512 # Remove any leading or trailing white space
7513 $name =~ s/^\s+//g;
7514 $name =~ s/\s+$//g;
7515
7516 # Convert interior white space and hypens into underscores.
7517 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7518
7519 # Capitalize the letter following an underscore, and convert a sequence of
7520 # multiple underscores to a single one
7521 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7522
7523 # And capitalize the first letter, but not for the special cjk ones.
7524 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7525 return $name;
7526}
7527
7528sub standardize ($) {
7529 # Returns a lower-cased standardized name, without underscores. This form
7530 # is chosen so that it can distinguish between any real versus superficial
7531 # Unicode name differences. It relies on the fact that Unicode doesn't
7532 # have interior underscores, white space, nor dashes in any
7533 # stricter-matched name. It should not be used on Unicode code point
7534 # names (the Name property), as they mostly, but not always follow these
7535 # rules.
7536
7537 my $name = Standardize(shift);
7538 return if !defined $name;
7539
7540 $name =~ s/ (?<= .) _ (?= . ) //xg;
7541 return lc $name;
7542}
7543
7544{ # Closure
7545
7546 my $indent_increment = " " x 2;
7547 my %already_output;
7548
7549 $main::simple_dumper_nesting = 0;
7550
7551 sub simple_dumper {
7552 # Like Simple Data::Dumper. Good enough for our needs. We can't use
7553 # the real thing as we have to run under miniperl.
7554
7555 # It is designed so that on input it is at the beginning of a line,
7556 # and the final thing output in any call is a trailing ",\n".
7557
7558 my $item = shift;
7559 my $indent = shift;
7560 $indent = "" if ! defined $indent;
7561
7562 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7563
7564 # nesting level is localized, so that as the call stack pops, it goes
7565 # back to the prior value.
7566 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7567 undef %already_output if $main::simple_dumper_nesting == 0;
7568 $main::simple_dumper_nesting++;
7569 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7570
7571 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7572
7573 # Determine the indent for recursive calls.
7574 my $next_indent = $indent . $indent_increment;
7575
7576 my $output;
7577 if (! ref $item) {
7578
7579 # Dump of scalar: just output it in quotes if not a number. To do
7580 # so we must escape certain characters, and therefore need to
7581 # operate on a copy to avoid changing the original
7582 my $copy = $item;
7583 $copy = $UNDEF unless defined $copy;
7584
7585 # Quote non-numbers (numbers also have optional leading '-' and
7586 # fractions)
7587 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7588
7589 # Escape apostrophe and backslash
7590 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7591 $copy = "'$copy'";
7592 }
7593 $output = "$indent$copy,\n";
7594 }
7595 else {
7596
7597 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 7598 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 7599 if (defined $already_output{$addr}) {
99870f4d
KW
7600 return "${indent}ALREADY OUTPUT: $item\n";
7601 }
f998e60c 7602 $already_output{$addr} = $item;
99870f4d
KW
7603
7604 if (ref $item eq 'ARRAY') {
7605 my $using_brackets;
7606 $output = $indent;
7607 if ($main::simple_dumper_nesting > 1) {
7608 $output .= '[';
7609 $using_brackets = 1;
7610 }
7611 else {
7612 $using_brackets = 0;
7613 }
7614
7615 # If the array is empty, put the closing bracket on the same
7616 # line. Otherwise, recursively add each array element
7617 if (@$item == 0) {
7618 $output .= " ";
7619 }
7620 else {
7621 $output .= "\n";
7622 for (my $i = 0; $i < @$item; $i++) {
7623
7624 # Indent array elements one level
7625 $output .= &simple_dumper($item->[$i], $next_indent);
7626 $output =~ s/\n$//; # Remove trailing nl so as to
7627 $output .= " # [$i]\n"; # add a comment giving the
7628 # array index
7629 }
7630 $output .= $indent; # Indent closing ']' to orig level
7631 }
7632 $output .= ']' if $using_brackets;
7633 $output .= ",\n";
7634 }
7635 elsif (ref $item eq 'HASH') {
7636 my $is_first_line;
7637 my $using_braces;
7638 my $body_indent;
7639
7640 # No surrounding braces at top level
7641 $output .= $indent;
7642 if ($main::simple_dumper_nesting > 1) {
7643 $output .= "{\n";
7644 $is_first_line = 0;
7645 $body_indent = $next_indent;
7646 $next_indent .= $indent_increment;
7647 $using_braces = 1;
7648 }
7649 else {
7650 $is_first_line = 1;
7651 $body_indent = $indent;
7652 $using_braces = 0;
7653 }
7654
7655 # Output hashes sorted alphabetically instead of apparently
7656 # random. Use caseless alphabetic sort
7657 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7658 {
7659 if ($is_first_line) {
7660 $is_first_line = 0;
7661 }
7662 else {
7663 $output .= "$body_indent";
7664 }
7665
7666 # The key must be a scalar, but this recursive call quotes
7667 # it
7668 $output .= &simple_dumper($key);
7669
7670 # And change the trailing comma and nl to the hash fat
7671 # comma for clarity, and so the value can be on the same
7672 # line
7673 $output =~ s/,\n$/ => /;
7674
7675 # Recursively call to get the value's dump.
7676 my $next = &simple_dumper($item->{$key}, $next_indent);
7677
7678 # If the value is all on one line, remove its indent, so
7679 # will follow the => immediately. If it takes more than
7680 # one line, start it on a new line.
7681 if ($next !~ /\n.*\n/) {
7682 $next =~ s/^ *//;
7683 }
7684 else {
7685 $output .= "\n";
7686 }
7687 $output .= $next;
7688 }
7689
7690 $output .= "$indent},\n" if $using_braces;
7691 }
7692 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7693 $output = $indent . ref($item) . "\n";
7694 # XXX see if blessed
7695 }
7696 elsif ($item->can('dump')) {
7697
7698 # By convention in this program, objects furnish a 'dump'
7699 # method. Since not doing any output at this level, just pass
7700 # on the input indent
7701 $output = $item->dump($indent);
7702 }
7703 else {
7704 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
7705 }
7706 }
7707 return $output;
7708 }
7709}
7710
7711sub dump_inside_out {
7712 # Dump inside-out hashes in an object's state by converting them to a
7713 # regular hash and then calling simple_dumper on that.
7714
7715 my $object = shift;
7716 my $fields_ref = shift;
7717 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7718
ffe43484 7719 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
7720
7721 my %hash;
7722 foreach my $key (keys %$fields_ref) {
7723 $hash{$key} = $fields_ref->{$key}{$addr};
7724 }
7725
7726 return simple_dumper(\%hash, @_);
7727}
7728
7729sub _operator_dot {
7730 # Overloaded '.' method that is common to all packages. It uses the
7731 # package's stringify method.
7732
7733 my $self = shift;
7734 my $other = shift;
7735 my $reversed = shift;
7736 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7737
7738 $other = "" unless defined $other;
7739
7740 foreach my $which (\$self, \$other) {
7741 next unless ref $$which;
7742 if ($$which->can('_operator_stringify')) {
7743 $$which = $$which->_operator_stringify;
7744 }
7745 else {
7746 my $ref = ref $$which;
ffe43484 7747 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
7748 $$which = "$ref ($addr)";
7749 }
7750 }
7751 return ($reversed)
7752 ? "$other$self"
7753 : "$self$other";
7754}
7755
7756sub _operator_equal {
7757 # Generic overloaded '==' routine. To be equal, they must be the exact
7758 # same object
7759
7760 my $self = shift;
7761 my $other = shift;
7762
7763 return 0 unless defined $other;
7764 return 0 unless ref $other;
f998e60c 7765 no overloading;
2100aa98 7766 return $self == $other;
99870f4d
KW
7767}
7768
7769sub _operator_not_equal {
7770 my $self = shift;
7771 my $other = shift;
7772
7773 return ! _operator_equal($self, $other);
7774}
7775
7776sub process_PropertyAliases($) {
7777 # This reads in the PropertyAliases.txt file, which contains almost all
7778 # the character properties in Unicode and their equivalent aliases:
7779 # scf ; Simple_Case_Folding ; sfc
7780 #
7781 # Field 0 is the preferred short name for the property.
7782 # Field 1 is the full name.
7783 # Any succeeding ones are other accepted names.
7784
7785 my $file= shift;
7786 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7787
7788 # This whole file was non-existent in early releases, so use our own
7789 # internal one.
7790 $file->insert_lines(get_old_property_aliases())
7791 if ! -e 'PropertyAliases.txt';
7792
7793 # Add any cjk properties that may have been defined.
7794 $file->insert_lines(@cjk_properties);
7795
7796 while ($file->next_line) {
7797
7798 my @data = split /\s*;\s*/;
7799
7800 my $full = $data[1];
7801
7802 my $this = Property->new($data[0], Full_Name => $full);
7803
7804 # Start looking for more aliases after these two.
7805 for my $i (2 .. @data - 1) {
7806 $this->add_alias($data[$i]);
7807 }
7808
7809 }
7810 return;
7811}
7812
7813sub finish_property_setup {
7814 # Finishes setting up after PropertyAliases.
7815
7816 my $file = shift;
7817 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7818
7819 # This entry was missing from this file in earlier Unicode versions
7820 if (-e 'Jamo.txt') {
7821 my $jsn = property_ref('JSN');
7822 if (! defined $jsn) {
7823 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7824 }
7825 }
7826
7827 # This entry is still missing as of 5.2, perhaps because no short name for
7828 # it.
7829 if (-e 'NameAliases.txt') {
7830 my $aliases = property_ref('Name_Alias');
7831 if (! defined $aliases) {
7832 $aliases = Property->new('Name_Alias');
7833 }
7834 }
7835
7836 # These are used so much, that we set globals for them.
7837 $gc = property_ref('General_Category');
7838 $block = property_ref('Block');
7839
7840 # Perl adds this alias.
7841 $gc->add_alias('Category');
7842
7843 # For backwards compatibility, these property files have particular names.
7844 my $upper = property_ref('Uppercase_Mapping');
7845 $upper->set_core_access('uc()');
7846 $upper->set_file('Upper'); # This is what utf8.c calls it
7847
7848 my $lower = property_ref('Lowercase_Mapping');
7849 $lower->set_core_access('lc()');
7850 $lower->set_file('Lower');
7851
7852 my $title = property_ref('Titlecase_Mapping');
7853 $title->set_core_access('ucfirst()');
7854 $title->set_file('Title');
7855
7856 my $fold = property_ref('Case_Folding');
7857 $fold->set_file('Fold') if defined $fold;
7858
7859 # utf8.c can't currently cope with non range-size-1 for these, and even if
7860 # it were changed to do so, someone else may be using them, expecting the
7861 # old style
7862 foreach my $property (qw {
7863 Case_Folding
7864 Lowercase_Mapping
7865 Titlecase_Mapping
7866 Uppercase_Mapping
7867 })
7868 {
7869 property_ref($property)->set_range_size_1(1);
7870 }
7871
7872 # These two properties aren't actually used in the core, but unfortunately
7873 # the names just above that are in the core interfere with these, so
7874 # choose different names. These aren't a problem unless the map tables
7875 # for these files get written out.
7876 my $lowercase = property_ref('Lowercase');
7877 $lowercase->set_file('IsLower') if defined $lowercase;
7878 my $uppercase = property_ref('Uppercase');
7879 $uppercase->set_file('IsUpper') if defined $uppercase;
7880
7881 # Set up the hard-coded default mappings, but only on properties defined
7882 # for this release
7883 foreach my $property (keys %default_mapping) {
7884 my $property_object = property_ref($property);
7885 next if ! defined $property_object;
7886 my $default_map = $default_mapping{$property};
7887 $property_object->set_default_map($default_map);
7888
7889 # A map of <code point> implies the property is string.
7890 if ($property_object->type == $UNKNOWN
7891 && $default_map eq $CODE_POINT)
7892 {
7893 $property_object->set_type($STRING);
7894 }
7895 }
7896
7897 # The following use the Multi_Default class to create objects for
7898 # defaults.
7899
7900 # Bidi class has a complicated default, but the derived file takes care of
7901 # the complications, leaving just 'L'.
7902 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7903 property_ref('Bidi_Class')->set_default_map('L');
7904 }
7905 else {
7906 my $default;
7907
7908 # The derived file was introduced in 3.1.1. The values below are
7909 # taken from table 3-8, TUS 3.0
7910 my $default_R =
7911 'my $default = Range_List->new;
7912 $default->add_range(0x0590, 0x05FF);
7913 $default->add_range(0xFB1D, 0xFB4F);'
7914 ;
7915
7916 # The defaults apply only to unassigned characters
a67f160a 7917 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
7918
7919 if ($v_version lt v3.0.0) {
7920 $default = Multi_Default->new(R => $default_R, 'L');
7921 }
7922 else {
7923
7924 # AL apparently not introduced until 3.0: TUS 2.x references are
7925 # not on-line to check it out
7926 my $default_AL =
7927 'my $default = Range_List->new;
7928 $default->add_range(0x0600, 0x07BF);
7929 $default->add_range(0xFB50, 0xFDFF);
7930 $default->add_range(0xFE70, 0xFEFF);'
7931 ;
7932
7933 # Non-character code points introduced in this release; aren't AL
7934 if ($v_version ge 3.1.0) {
7935 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7936 }
a67f160a 7937 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
7938 $default = Multi_Default->new(AL => $default_AL,
7939 R => $default_R,
7940 'L');
7941 }
7942 property_ref('Bidi_Class')->set_default_map($default);
7943 }
7944
7945 # Joining type has a complicated default, but the derived file takes care
7946 # of the complications, leaving just 'U' (or Non_Joining), except the file
7947 # is bad in 3.1.0
7948 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7949 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7950 property_ref('Joining_Type')->set_default_map('Non_Joining');
7951 }
7952 else {
7953
7954 # Otherwise, there are not one, but two possibilities for the
7955 # missing defaults: T and U.
7956 # The missing defaults that evaluate to T are given by:
7957 # T = Mn + Cf - ZWNJ - ZWJ
7958 # where Mn and Cf are the general category values. In other words,
7959 # any non-spacing mark or any format control character, except
7960 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
7961 # WIDTH JOINER (joining type C).
7962 my $default = Multi_Default->new(
7963 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
7964 'Non_Joining');
7965 property_ref('Joining_Type')->set_default_map($default);
7966 }
7967 }
7968
7969 # Line break has a complicated default in early releases. It is 'Unknown'
7970 # for non-assigned code points; 'AL' for assigned.
7971 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
7972 my $lb = property_ref('Line_Break');
7973 if ($v_version gt 3.2.0) {
7974 $lb->set_default_map('Unknown');
7975 }
7976 else {
7977 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
7978 'AL');
7979 $lb->set_default_map($default);
7980 }
7981
7982 # If has the URS property, make sure that the standard aliases are in
7983 # it, since not in the input tables in some versions.
7984 my $urs = property_ref('Unicode_Radical_Stroke');
7985 if (defined $urs) {
7986 $urs->add_alias('cjkRSUnicode');
7987 $urs->add_alias('kRSUnicode');
7988 }
7989 }
7990 return;
7991}
7992
7993sub get_old_property_aliases() {
7994 # Returns what would be in PropertyAliases.txt if it existed in very old
7995 # versions of Unicode. It was derived from the one in 3.2, and pared
7996 # down based on the data that was actually in the older releases.
7997 # An attempt was made to use the existence of files to mean inclusion or
7998 # not of various aliases, but if this was not sufficient, using version
7999 # numbers was resorted to.
8000
8001 my @return;
8002
8003 # These are to be used in all versions (though some are constructed by
8004 # this program if missing)
8005 push @return, split /\n/, <<'END';
8006bc ; Bidi_Class
8007Bidi_M ; Bidi_Mirrored
8008cf ; Case_Folding
8009ccc ; Canonical_Combining_Class
8010dm ; Decomposition_Mapping
8011dt ; Decomposition_Type
8012gc ; General_Category
8013isc ; ISO_Comment
8014lc ; Lowercase_Mapping
8015na ; Name
8016na1 ; Unicode_1_Name
8017nt ; Numeric_Type
8018nv ; Numeric_Value
8019sfc ; Simple_Case_Folding
8020slc ; Simple_Lowercase_Mapping
8021stc ; Simple_Titlecase_Mapping
8022suc ; Simple_Uppercase_Mapping
8023tc ; Titlecase_Mapping
8024uc ; Uppercase_Mapping
8025END
8026
8027 if (-e 'Blocks.txt') {
8028 push @return, "blk ; Block\n";
8029 }
8030 if (-e 'ArabicShaping.txt') {
8031 push @return, split /\n/, <<'END';
8032jg ; Joining_Group
8033jt ; Joining_Type
8034END
8035 }
8036 if (-e 'PropList.txt') {
8037
8038 # This first set is in the original old-style proplist.
8039 push @return, split /\n/, <<'END';
8040Alpha ; Alphabetic
8041Bidi_C ; Bidi_Control
8042Dash ; Dash
8043Dia ; Diacritic
8044Ext ; Extender
8045Hex ; Hex_Digit
8046Hyphen ; Hyphen
8047IDC ; ID_Continue
8048Ideo ; Ideographic
8049Join_C ; Join_Control
8050Math ; Math
8051QMark ; Quotation_Mark
8052Term ; Terminal_Punctuation
8053WSpace ; White_Space
8054END
8055 # The next sets were added later
8056 if ($v_version ge v3.0.0) {
8057 push @return, split /\n/, <<'END';
8058Upper ; Uppercase
8059Lower ; Lowercase
8060END
8061 }
8062 if ($v_version ge v3.0.1) {
8063 push @return, split /\n/, <<'END';
8064NChar ; Noncharacter_Code_Point
8065END
8066 }
8067 # The next sets were added in the new-style
8068 if ($v_version ge v3.1.0) {
8069 push @return, split /\n/, <<'END';
8070OAlpha ; Other_Alphabetic
8071OLower ; Other_Lowercase
8072OMath ; Other_Math
8073OUpper ; Other_Uppercase
8074END
8075 }
8076 if ($v_version ge v3.1.1) {
8077 push @return, "AHex ; ASCII_Hex_Digit\n";
8078 }
8079 }
8080 if (-e 'EastAsianWidth.txt') {
8081 push @return, "ea ; East_Asian_Width\n";
8082 }
8083 if (-e 'CompositionExclusions.txt') {
8084 push @return, "CE ; Composition_Exclusion\n";
8085 }
8086 if (-e 'LineBreak.txt') {
8087 push @return, "lb ; Line_Break\n";
8088 }
8089 if (-e 'BidiMirroring.txt') {
8090 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8091 }
8092 if (-e 'Scripts.txt') {
8093 push @return, "sc ; Script\n";
8094 }
8095 if (-e 'DNormalizationProps.txt') {
8096 push @return, split /\n/, <<'END';
8097Comp_Ex ; Full_Composition_Exclusion
8098FC_NFKC ; FC_NFKC_Closure
8099NFC_QC ; NFC_Quick_Check
8100NFD_QC ; NFD_Quick_Check
8101NFKC_QC ; NFKC_Quick_Check
8102NFKD_QC ; NFKD_Quick_Check
8103XO_NFC ; Expands_On_NFC
8104XO_NFD ; Expands_On_NFD
8105XO_NFKC ; Expands_On_NFKC
8106XO_NFKD ; Expands_On_NFKD
8107END
8108 }
8109 if (-e 'DCoreProperties.txt') {
8110 push @return, split /\n/, <<'END';
8111IDS ; ID_Start
8112XIDC ; XID_Continue
8113XIDS ; XID_Start
8114END
8115 # These can also appear in some versions of PropList.txt
8116 push @return, "Lower ; Lowercase\n"
8117 unless grep { $_ =~ /^Lower\b/} @return;
8118 push @return, "Upper ; Uppercase\n"
8119 unless grep { $_ =~ /^Upper\b/} @return;
8120 }
8121
8122 # This flag requires the DAge.txt file to be copied into the directory.
8123 if (DEBUG && $compare_versions) {
8124 push @return, 'age ; Age';
8125 }
8126
8127 return @return;
8128}
8129
8130sub process_PropValueAliases {
8131 # This file contains values that properties look like:
8132 # bc ; AL ; Arabic_Letter
8133 # blk; n/a ; Greek_And_Coptic ; Greek
8134 #
8135 # Field 0 is the property.
8136 # Field 1 is the short name of a property value or 'n/a' if no
8137 # short name exists;
8138 # Field 2 is the full property value name;
8139 # Any other fields are more synonyms for the property value.
8140 # Purely numeric property values are omitted from the file; as are some
8141 # others, fewer and fewer in later releases
8142
8143 # Entries for the ccc property have an extra field before the
8144 # abbreviation:
8145 # ccc; 0; NR ; Not_Reordered
8146 # It is the numeric value that the names are synonyms for.
8147
8148 # There are comment entries for values missing from this file:
8149 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8150 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8151
8152 my $file= shift;
8153 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8154
8155 # This whole file was non-existent in early releases, so use our own
8156 # internal one if necessary.
8157 if (! -e 'PropValueAliases.txt') {
8158 $file->insert_lines(get_old_property_value_aliases());
8159 }
8160
8161 # Add any explicit cjk values
8162 $file->insert_lines(@cjk_property_values);
8163
8164 # This line is used only for testing the code that checks for name
8165 # conflicts. There is a script Inherited, and when this line is executed
8166 # it causes there to be a name conflict with the 'Inherited' that this
8167 # program generates for this block property value
8168 #$file->insert_lines('blk; n/a; Herited');
8169
8170
8171 # Process each line of the file ...
8172 while ($file->next_line) {
8173
8174 my ($property, @data) = split /\s*;\s*/;
8175
8176 # The full name for the ccc property value is in field 2 of the
8177 # remaining ones; field 1 for all other properties. Swap ccc fields 1
8178 # and 2. (Rightmost splice removes field 2, returning it; left splice
8179 # inserts that into field 1, thus shifting former field 1 to field 2.)
8180 splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8181
8182 # If there is no short name, use the full one in element 1
8183 $data[0] = $data[1] if $data[0] eq "n/a";
8184
8185 # Earlier releases had the pseudo property 'qc' that should expand to
8186 # the ones that replace it below.
8187 if ($property eq 'qc') {
8188 if (lc $data[0] eq 'y') {
8189 $file->insert_lines('NFC_QC; Y ; Yes',
8190 'NFD_QC; Y ; Yes',
8191 'NFKC_QC; Y ; Yes',
8192 'NFKD_QC; Y ; Yes',
8193 );
8194 }
8195 elsif (lc $data[0] eq 'n') {
8196 $file->insert_lines('NFC_QC; N ; No',
8197 'NFD_QC; N ; No',
8198 'NFKC_QC; N ; No',
8199 'NFKD_QC; N ; No',
8200 );
8201 }
8202 elsif (lc $data[0] eq 'm') {
8203 $file->insert_lines('NFC_QC; M ; Maybe',
8204 'NFKC_QC; M ; Maybe',
8205 );
8206 }
8207 else {
8208 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8209 }
8210 next;
8211 }
8212
8213 # The first field is the short name, 2nd is the full one.
8214 my $property_object = property_ref($property);
8215 my $table = $property_object->add_match_table($data[0],
8216 Full_Name => $data[1]);
8217
8218 # Start looking for more aliases after these two.
8219 for my $i (2 .. @data - 1) {
8220 $table->add_alias($data[$i]);
8221 }
8222 } # End of looping through the file
8223
8224 # As noted in the comments early in the program, it generates tables for
8225 # the default values for all releases, even those for which the concept
8226 # didn't exist at the time. Here we add those if missing.
8227 my $age = property_ref('age');
8228 if (defined $age && ! defined $age->table('Unassigned')) {
8229 $age->add_match_table('Unassigned');
8230 }
8231 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8232 && ! defined $block->table('No_Block');
8233
8234
8235 # Now set the default mappings of the properties from the file. This is
8236 # done after the loop because a number of properties have only @missings
8237 # entries in the file, and may not show up until the end.
8238 my @defaults = $file->get_missings;
8239 foreach my $default_ref (@defaults) {
8240 my $default = $default_ref->[0];
8241 my $property = property_ref($default_ref->[1]);
8242 $property->set_default_map($default);
8243 }
8244 return;
8245}
8246
8247sub get_old_property_value_aliases () {
8248 # Returns what would be in PropValueAliases.txt if it existed in very old
8249 # versions of Unicode. It was derived from the one in 3.2, and pared
8250 # down. An attempt was made to use the existence of files to mean
8251 # inclusion or not of various aliases, but if this was not sufficient,
8252 # using version numbers was resorted to.
8253
8254 my @return = split /\n/, <<'END';
8255bc ; AN ; Arabic_Number
8256bc ; B ; Paragraph_Separator
8257bc ; CS ; Common_Separator
8258bc ; EN ; European_Number
8259bc ; ES ; European_Separator
8260bc ; ET ; European_Terminator
8261bc ; L ; Left_To_Right
8262bc ; ON ; Other_Neutral
8263bc ; R ; Right_To_Left
8264bc ; WS ; White_Space
8265
8266# The standard combining classes are very much different in v1, so only use
8267# ones that look right (not checked thoroughly)
8268ccc; 0; NR ; Not_Reordered
8269ccc; 1; OV ; Overlay
8270ccc; 7; NK ; Nukta
8271ccc; 8; KV ; Kana_Voicing
8272ccc; 9; VR ; Virama
8273ccc; 202; ATBL ; Attached_Below_Left
8274ccc; 216; ATAR ; Attached_Above_Right
8275ccc; 218; BL ; Below_Left
8276ccc; 220; B ; Below
8277ccc; 222; BR ; Below_Right
8278ccc; 224; L ; Left
8279ccc; 228; AL ; Above_Left
8280ccc; 230; A ; Above
8281ccc; 232; AR ; Above_Right
8282ccc; 234; DA ; Double_Above
8283
8284dt ; can ; canonical
8285dt ; enc ; circle
8286dt ; fin ; final
8287dt ; font ; font
8288dt ; fra ; fraction
8289dt ; init ; initial
8290dt ; iso ; isolated
8291dt ; med ; medial
8292dt ; n/a ; none
8293dt ; nb ; noBreak
8294dt ; sqr ; square
8295dt ; sub ; sub
8296dt ; sup ; super
8297
8298gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8299gc ; Cc ; Control
8300gc ; Cn ; Unassigned
8301gc ; Co ; Private_Use
8302gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8303gc ; LC ; Cased_Letter # Ll | Lt | Lu
8304gc ; Ll ; Lowercase_Letter
8305gc ; Lm ; Modifier_Letter
8306gc ; Lo ; Other_Letter
8307gc ; Lu ; Uppercase_Letter
8308gc ; M ; Mark # Mc | Me | Mn
8309gc ; Mc ; Spacing_Mark
8310gc ; Mn ; Nonspacing_Mark
8311gc ; N ; Number # Nd | Nl | No
8312gc ; Nd ; Decimal_Number
8313gc ; No ; Other_Number
8314gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8315gc ; Pd ; Dash_Punctuation
8316gc ; Pe ; Close_Punctuation
8317gc ; Po ; Other_Punctuation
8318gc ; Ps ; Open_Punctuation
8319gc ; S ; Symbol # Sc | Sk | Sm | So
8320gc ; Sc ; Currency_Symbol
8321gc ; Sm ; Math_Symbol
8322gc ; So ; Other_Symbol
8323gc ; Z ; Separator # Zl | Zp | Zs
8324gc ; Zl ; Line_Separator
8325gc ; Zp ; Paragraph_Separator
8326gc ; Zs ; Space_Separator
8327
8328nt ; de ; Decimal
8329nt ; di ; Digit
8330nt ; n/a ; None
8331nt ; nu ; Numeric
8332END
8333
8334 if (-e 'ArabicShaping.txt') {
8335 push @return, split /\n/, <<'END';
8336jg ; n/a ; AIN
8337jg ; n/a ; ALEF
8338jg ; n/a ; DAL
8339jg ; n/a ; GAF
8340jg ; n/a ; LAM
8341jg ; n/a ; MEEM
8342jg ; n/a ; NO_JOINING_GROUP
8343jg ; n/a ; NOON
8344jg ; n/a ; QAF
8345jg ; n/a ; SAD
8346jg ; n/a ; SEEN
8347jg ; n/a ; TAH
8348jg ; n/a ; WAW
8349
8350jt ; C ; Join_Causing
8351jt ; D ; Dual_Joining
8352jt ; L ; Left_Joining
8353jt ; R ; Right_Joining
8354jt ; U ; Non_Joining
8355jt ; T ; Transparent
8356END
8357 if ($v_version ge v3.0.0) {
8358 push @return, split /\n/, <<'END';
8359jg ; n/a ; ALAPH
8360jg ; n/a ; BEH
8361jg ; n/a ; BETH
8362jg ; n/a ; DALATH_RISH
8363jg ; n/a ; E
8364jg ; n/a ; FEH
8365jg ; n/a ; FINAL_SEMKATH
8366jg ; n/a ; GAMAL
8367jg ; n/a ; HAH
8368jg ; n/a ; HAMZA_ON_HEH_GOAL
8369jg ; n/a ; HE
8370jg ; n/a ; HEH
8371jg ; n/a ; HEH_GOAL
8372jg ; n/a ; HETH
8373jg ; n/a ; KAF
8374jg ; n/a ; KAPH
8375jg ; n/a ; KNOTTED_HEH
8376jg ; n/a ; LAMADH
8377jg ; n/a ; MIM
8378jg ; n/a ; NUN
8379jg ; n/a ; PE
8380jg ; n/a ; QAPH
8381jg ; n/a ; REH
8382jg ; n/a ; REVERSED_PE
8383jg ; n/a ; SADHE
8384jg ; n/a ; SEMKATH
8385jg ; n/a ; SHIN
8386jg ; n/a ; SWASH_KAF
8387jg ; n/a ; TAW
8388jg ; n/a ; TEH_MARBUTA
8389jg ; n/a ; TETH
8390jg ; n/a ; YEH
8391jg ; n/a ; YEH_BARREE
8392jg ; n/a ; YEH_WITH_TAIL
8393jg ; n/a ; YUDH
8394jg ; n/a ; YUDH_HE
8395jg ; n/a ; ZAIN
8396END
8397 }
8398 }
8399
8400
8401 if (-e 'EastAsianWidth.txt') {
8402 push @return, split /\n/, <<'END';
8403ea ; A ; Ambiguous
8404ea ; F ; Fullwidth
8405ea ; H ; Halfwidth
8406ea ; N ; Neutral
8407ea ; Na ; Narrow
8408ea ; W ; Wide
8409END
8410 }
8411
8412 if (-e 'LineBreak.txt') {
8413 push @return, split /\n/, <<'END';
8414lb ; AI ; Ambiguous
8415lb ; AL ; Alphabetic
8416lb ; B2 ; Break_Both
8417lb ; BA ; Break_After
8418lb ; BB ; Break_Before
8419lb ; BK ; Mandatory_Break
8420lb ; CB ; Contingent_Break
8421lb ; CL ; Close_Punctuation
8422lb ; CM ; Combining_Mark
8423lb ; CR ; Carriage_Return
8424lb ; EX ; Exclamation
8425lb ; GL ; Glue
8426lb ; HY ; Hyphen
8427lb ; ID ; Ideographic
8428lb ; IN ; Inseperable
8429lb ; IS ; Infix_Numeric
8430lb ; LF ; Line_Feed
8431lb ; NS ; Nonstarter
8432lb ; NU ; Numeric
8433lb ; OP ; Open_Punctuation
8434lb ; PO ; Postfix_Numeric
8435lb ; PR ; Prefix_Numeric
8436lb ; QU ; Quotation
8437lb ; SA ; Complex_Context
8438lb ; SG ; Surrogate
8439lb ; SP ; Space
8440lb ; SY ; Break_Symbols
8441lb ; XX ; Unknown
8442lb ; ZW ; ZWSpace
8443END
8444 }
8445
8446 if (-e 'DNormalizationProps.txt') {
8447 push @return, split /\n/, <<'END';
8448qc ; M ; Maybe
8449qc ; N ; No
8450qc ; Y ; Yes
8451END
8452 }
8453
8454 if (-e 'Scripts.txt') {
8455 push @return, split /\n/, <<'END';
8456sc ; Arab ; Arabic
8457sc ; Armn ; Armenian
8458sc ; Beng ; Bengali
8459sc ; Bopo ; Bopomofo
8460sc ; Cans ; Canadian_Aboriginal
8461sc ; Cher ; Cherokee
8462sc ; Cyrl ; Cyrillic
8463sc ; Deva ; Devanagari
8464sc ; Dsrt ; Deseret
8465sc ; Ethi ; Ethiopic
8466sc ; Geor ; Georgian
8467sc ; Goth ; Gothic
8468sc ; Grek ; Greek
8469sc ; Gujr ; Gujarati
8470sc ; Guru ; Gurmukhi
8471sc ; Hang ; Hangul
8472sc ; Hani ; Han
8473sc ; Hebr ; Hebrew
8474sc ; Hira ; Hiragana
8475sc ; Ital ; Old_Italic
8476sc ; Kana ; Katakana
8477sc ; Khmr ; Khmer
8478sc ; Knda ; Kannada
8479sc ; Laoo ; Lao
8480sc ; Latn ; Latin
8481sc ; Mlym ; Malayalam
8482sc ; Mong ; Mongolian
8483sc ; Mymr ; Myanmar
8484sc ; Ogam ; Ogham
8485sc ; Orya ; Oriya
8486sc ; Qaai ; Inherited
8487sc ; Runr ; Runic
8488sc ; Sinh ; Sinhala
8489sc ; Syrc ; Syriac
8490sc ; Taml ; Tamil
8491sc ; Telu ; Telugu
8492sc ; Thaa ; Thaana
8493sc ; Thai ; Thai
8494sc ; Tibt ; Tibetan
8495sc ; Yiii ; Yi
8496sc ; Zyyy ; Common
8497END
8498 }
8499
8500 if ($v_version ge v2.0.0) {
8501 push @return, split /\n/, <<'END';
8502dt ; com ; compat
8503dt ; nar ; narrow
8504dt ; sml ; small
8505dt ; vert ; vertical
8506dt ; wide ; wide
8507
8508gc ; Cf ; Format
8509gc ; Cs ; Surrogate
8510gc ; Lt ; Titlecase_Letter
8511gc ; Me ; Enclosing_Mark
8512gc ; Nl ; Letter_Number
8513gc ; Pc ; Connector_Punctuation
8514gc ; Sk ; Modifier_Symbol
8515END
8516 }
8517 if ($v_version ge v2.1.2) {
8518 push @return, "bc ; S ; Segment_Separator\n";
8519 }
8520 if ($v_version ge v2.1.5) {
8521 push @return, split /\n/, <<'END';
8522gc ; Pf ; Final_Punctuation
8523gc ; Pi ; Initial_Punctuation
8524END
8525 }
8526 if ($v_version ge v2.1.8) {
8527 push @return, "ccc; 240; IS ; Iota_Subscript\n";
8528 }
8529
8530 if ($v_version ge v3.0.0) {
8531 push @return, split /\n/, <<'END';
8532bc ; AL ; Arabic_Letter
8533bc ; BN ; Boundary_Neutral
8534bc ; LRE ; Left_To_Right_Embedding
8535bc ; LRO ; Left_To_Right_Override
8536bc ; NSM ; Nonspacing_Mark
8537bc ; PDF ; Pop_Directional_Format
8538bc ; RLE ; Right_To_Left_Embedding
8539bc ; RLO ; Right_To_Left_Override
8540
8541ccc; 233; DB ; Double_Below
8542END
8543 }
8544
8545 if ($v_version ge v3.1.0) {
8546 push @return, "ccc; 226; R ; Right\n";
8547 }
8548
8549 return @return;
8550}
8551
b1c167a3
KW
8552sub output_perl_charnames_line ($$) {
8553
8554 # Output the entries in Perl_charnames specially, using 5 digits instead
8555 # of four. This makes the entries a constant length, and simplifies
8556 # charnames.pm which this table is for. Unicode can have 6 digit
8557 # ordinals, but they are all private use or noncharacters which do not
8558 # have names, so won't be in this table.
8559
73d9566f 8560 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
8561}
8562
99870f4d
KW
8563{ # Closure
8564 # This is used to store the range list of all the code points usable when
8565 # the little used $compare_versions feature is enabled.
8566 my $compare_versions_range_list;
8567
8568 sub process_generic_property_file {
8569 # This processes a file containing property mappings and puts them
8570 # into internal map tables. It should be used to handle any property
8571 # files that have mappings from a code point or range thereof to
8572 # something else. This means almost all the UCD .txt files.
8573 # each_line_handlers() should be set to adjust the lines of these
8574 # files, if necessary, to what this routine understands:
8575 #
8576 # 0374 ; NFD_QC; N
8577 # 003C..003E ; Math
8578 #
8579 # the fields are: "codepoint range ; property; map"
8580 #
8581 # meaning the codepoints in the range all have the value 'map' under
8582 # 'property'.
8583 # Beginning and trailing white space in each field are not signficant.
8584 # Note there is not a trailing semi-colon in the above. A trailing
8585 # semi-colon means the map is a null-string. An omitted map, as
8586 # opposed to a null-string, is assumed to be 'Y', based on Unicode
8587 # table syntax. (This could have been hidden from this routine by
8588 # doing it in the $file object, but that would require parsing of the
8589 # line there, so would have to parse it twice, or change the interface
8590 # to pass this an array. So not done.)
8591 #
8592 # The map field may begin with a sequence of commands that apply to
8593 # this range. Each such command begins and ends with $CMD_DELIM.
8594 # These are used to indicate, for example, that the mapping for a
8595 # range has a non-default type.
8596 #
8597 # This loops through the file, calling it's next_line() method, and
8598 # then taking the map and adding it to the property's table.
8599 # Complications arise because any number of properties can be in the
8600 # file, in any order, interspersed in any way. The first time a
8601 # property is seen, it gets information about that property and
f86864ac 8602 # caches it for quick retrieval later. It also normalizes the maps
99870f4d
KW
8603 # so that only one of many synonym is stored. The Unicode input files
8604 # do use some multiple synonyms.
8605
8606 my $file = shift;
8607 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8608
8609 my %property_info; # To keep track of what properties
8610 # have already had entries in the
8611 # current file, and info about each,
8612 # so don't have to recompute.
8613 my $property_name; # property currently being worked on
8614 my $property_type; # and its type
8615 my $previous_property_name = ""; # name from last time through loop
8616 my $property_object; # pointer to the current property's
8617 # object
8618 my $property_addr; # the address of that object
8619 my $default_map; # the string that code points missing
8620 # from the file map to
8621 my $default_table; # For non-string properties, a
8622 # reference to the match table that
8623 # will contain the list of code
8624 # points that map to $default_map.
8625
8626 # Get the next real non-comment line
8627 LINE:
8628 while ($file->next_line) {
8629
8630 # Default replacement type; means that if parts of the range have
8631 # already been stored in our tables, the new map overrides them if
8632 # they differ more than cosmetically
8633 my $replace = $IF_NOT_EQUIVALENT;
8634 my $map_type; # Default type for the map of this range
8635
8636 #local $to_trace = 1 if main::DEBUG;
8637 trace $_ if main::DEBUG && $to_trace;
8638
8639 # Split the line into components
8640 my ($range, $property_name, $map, @remainder)
8641 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8642
8643 # If more or less on the line than we are expecting, warn and skip
8644 # the line
8645 if (@remainder) {
8646 $file->carp_bad_line('Extra fields');
8647 next LINE;
8648 }
8649 elsif ( ! defined $property_name) {
8650 $file->carp_bad_line('Missing property');
8651 next LINE;
8652 }
8653
8654 # Examine the range.
8655 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8656 {
8657 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8658 next LINE;
8659 }
8660 my $low = hex $1;
8661 my $high = (defined $2) ? hex $2 : $low;
8662
8663 # For the very specialized case of comparing two Unicode
8664 # versions...
8665 if (DEBUG && $compare_versions) {
8666 if ($property_name eq 'Age') {
8667
8668 # Only allow code points at least as old as the version
8669 # specified.
8670 my $age = pack "C*", split(/\./, $map); # v string
8671 next LINE if $age gt $compare_versions;
8672 }
8673 else {
8674
8675 # Again, we throw out code points younger than those of
8676 # the specified version. By now, the Age property is
8677 # populated. We use the intersection of each input range
8678 # with this property to find what code points in it are
8679 # valid. To do the intersection, we have to convert the
8680 # Age property map to a Range_list. We only have to do
8681 # this once.
8682 if (! defined $compare_versions_range_list) {
8683 my $age = property_ref('Age');
8684 if (! -e 'DAge.txt') {
8685 croak "Need to have 'DAge.txt' file to do version comparison";
8686 }
8687 elsif ($age->count == 0) {
8688 croak "The 'Age' table is empty, but its file exists";
8689 }
8690 $compare_versions_range_list
8691 = Range_List->new(Initialize => $age);
8692 }
8693
8694 # An undefined map is always 'Y'
8695 $map = 'Y' if ! defined $map;
8696
8697 # Calculate the intersection of the input range with the
8698 # code points that are known in the specified version
8699 my @ranges = ($compare_versions_range_list
8700 & Range->new($low, $high))->ranges;
8701
8702 # If the intersection is empty, throw away this range
8703 next LINE unless @ranges;
8704
8705 # Only examine the first range this time through the loop.
8706 my $this_range = shift @ranges;
8707
8708 # Put any remaining ranges in the queue to be processed
8709 # later. Note that there is unnecessary work here, as we
8710 # will do the intersection again for each of these ranges
8711 # during some future iteration of the LINE loop, but this
8712 # code is not used in production. The later intersections
8713 # are guaranteed to not splinter, so this will not become
8714 # an infinite loop.
8715 my $line = join ';', $property_name, $map;
8716 foreach my $range (@ranges) {
8717 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8718 $range->start,
8719 $range->end,
8720 $line));
8721 }
8722
8723 # And process the first range, like any other.
8724 $low = $this_range->start;
8725 $high = $this_range->end;
8726 }
8727 } # End of $compare_versions
8728
8729 # If changing to a new property, get the things constant per
8730 # property
8731 if ($previous_property_name ne $property_name) {
8732
8733 $property_object = property_ref($property_name);
8734 if (! defined $property_object) {
8735 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
8736 next LINE;
8737 }
051df77b 8738 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
8739
8740 # Defer changing names until have a line that is acceptable
8741 # (the 'next' statement above means is unacceptable)
8742 $previous_property_name = $property_name;
8743
8744 # If not the first time for this property, retrieve info about
8745 # it from the cache
8746 if (defined ($property_info{$property_addr}{'type'})) {
8747 $property_type = $property_info{$property_addr}{'type'};
8748 $default_map = $property_info{$property_addr}{'default'};
8749 $map_type
8750 = $property_info{$property_addr}{'pseudo_map_type'};
8751 $default_table
8752 = $property_info{$property_addr}{'default_table'};
8753 }
8754 else {
8755
8756 # Here, is the first time for this property. Set up the
8757 # cache.
8758 $property_type = $property_info{$property_addr}{'type'}
8759 = $property_object->type;
8760 $map_type
8761 = $property_info{$property_addr}{'pseudo_map_type'}
8762 = $property_object->pseudo_map_type;
8763
8764 # The Unicode files are set up so that if the map is not
8765 # defined, it is a binary property
8766 if (! defined $map && $property_type != $BINARY) {
8767 if ($property_type != $UNKNOWN
8768 && $property_type != $NON_STRING)
8769 {
8770 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
8771 }
8772 else {
8773 $property_object->set_type($BINARY);
8774 $property_type
8775 = $property_info{$property_addr}{'type'}
8776 = $BINARY;
8777 }
8778 }
8779
8780 # Get any @missings default for this property. This
8781 # should precede the first entry for the property in the
8782 # input file, and is located in a comment that has been
8783 # stored by the Input_file class until we access it here.
8784 # It's possible that there is more than one such line
8785 # waiting for us; collect them all, and parse
8786 my @missings_list = $file->get_missings
8787 if $file->has_missings_defaults;
8788 foreach my $default_ref (@missings_list) {
8789 my $default = $default_ref->[0];
ffe43484 8790 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
8791
8792 # For string properties, the default is just what the
8793 # file says, but non-string properties should already
8794 # have set up a table for the default property value;
8795 # use the table for these, so can resolve synonyms
8796 # later to a single standard one.
8797 if ($property_type == $STRING
8798 || $property_type == $UNKNOWN)
8799 {
8800 $property_info{$addr}{'missings'} = $default;
8801 }
8802 else {
8803 $property_info{$addr}{'missings'}
8804 = $property_object->table($default);
8805 }
8806 }
8807
8808 # Finished storing all the @missings defaults in the input
8809 # file so far. Get the one for the current property.
8810 my $missings = $property_info{$property_addr}{'missings'};
8811
8812 # But we likely have separately stored what the default
8813 # should be. (This is to accommodate versions of the
8814 # standard where the @missings lines are absent or
8815 # incomplete.) Hopefully the two will match. But check
8816 # it out.
8817 $default_map = $property_object->default_map;
8818
8819 # If the map is a ref, it means that the default won't be
8820 # processed until later, so undef it, so next few lines
8821 # will redefine it to something that nothing will match
8822 undef $default_map if ref $default_map;
8823
8824 # Create a $default_map if don't have one; maybe a dummy
8825 # that won't match anything.
8826 if (! defined $default_map) {
8827
8828 # Use any @missings line in the file.
8829 if (defined $missings) {
8830 if (ref $missings) {
8831 $default_map = $missings->full_name;
8832 $default_table = $missings;
8833 }
8834 else {
8835 $default_map = $missings;
8836 }
678f13d5 8837
99870f4d
KW
8838 # And store it with the property for outside use.
8839 $property_object->set_default_map($default_map);
8840 }
8841 else {
8842
8843 # Neither an @missings nor a default map. Create
8844 # a dummy one, so won't have to test definedness
8845 # in the main loop.
8846 $default_map = '_Perl This will never be in a file
8847 from Unicode';
8848 }
8849 }
8850
8851 # Here, we have $default_map defined, possibly in terms of
8852 # $missings, but maybe not, and possibly is a dummy one.
8853 if (defined $missings) {
8854
8855 # Make sure there is no conflict between the two.
8856 # $missings has priority.
8857 if (ref $missings) {
23e33b60
KW
8858 $default_table
8859 = $property_object->table($default_map);
99870f4d
KW
8860 if (! defined $default_table
8861 || $default_table != $missings)
8862 {
8863 if (! defined $default_table) {
8864 $default_table = $UNDEF;
8865 }
8866 $file->carp_bad_line(<<END
8867The \@missings line for $property_name in $file says that missings default to
8868$missings, but we expect it to be $default_table. $missings used.
8869END
8870 );
8871 $default_table = $missings;
8872 $default_map = $missings->full_name;
8873 }
8874 $property_info{$property_addr}{'default_table'}
8875 = $default_table;
8876 }
8877 elsif ($default_map ne $missings) {
8878 $file->carp_bad_line(<<END
8879The \@missings line for $property_name in $file says that missings default to
8880$missings, but we expect it to be $default_map. $missings used.
8881END
8882 );
8883 $default_map = $missings;
8884 }
8885 }
8886
8887 $property_info{$property_addr}{'default'}
8888 = $default_map;
8889
8890 # If haven't done so already, find the table corresponding
8891 # to this map for non-string properties.
8892 if (! defined $default_table
8893 && $property_type != $STRING
8894 && $property_type != $UNKNOWN)
8895 {
8896 $default_table = $property_info{$property_addr}
8897 {'default_table'}
8898 = $property_object->table($default_map);
8899 }
8900 } # End of is first time for this property
8901 } # End of switching properties.
8902
8903 # Ready to process the line.
8904 # The Unicode files are set up so that if the map is not defined,
8905 # it is a binary property with value 'Y'
8906 if (! defined $map) {
8907 $map = 'Y';
8908 }
8909 else {
8910
8911 # If the map begins with a special command to us (enclosed in
8912 # delimiters), extract the command(s).
8913 if (substr($map, 0, 1) eq $CMD_DELIM) {
8914 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8915 my $command = $1;
8916 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
8917 $replace = $1;
8918 }
8919 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
8920 $map_type = $1;
8921 }
8922 else {
8923 $file->carp_bad_line("Unknown command line: '$1'");
8924 next LINE;
8925 }
8926 }
8927 }
8928 }
8929
8930 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8931 {
8932
8933 # Here, we have a map to a particular code point, and the
8934 # default map is to a code point itself. If the range
8935 # includes the particular code point, change that portion of
8936 # the range to the default. This makes sure that in the final
8937 # table only the non-defaults are listed.
8938 my $decimal_map = hex $map;
8939 if ($low <= $decimal_map && $decimal_map <= $high) {
8940
8941 # If the range includes stuff before or after the map
8942 # we're changing, split it and process the split-off parts
8943 # later.
8944 if ($low < $decimal_map) {
8945 $file->insert_adjusted_lines(
8946 sprintf("%04X..%04X; %s; %s",
8947 $low,
8948 $decimal_map - 1,
8949 $property_name,
8950 $map));
8951 }
8952 if ($high > $decimal_map) {
8953 $file->insert_adjusted_lines(
8954 sprintf("%04X..%04X; %s; %s",
8955 $decimal_map + 1,
8956 $high,
8957 $property_name,
8958 $map));
8959 }
8960 $low = $high = $decimal_map;
8961 $map = $CODE_POINT;
8962 }
8963 }
8964
8965 # If we can tell that this is a synonym for the default map, use
8966 # the default one instead.
8967 if ($property_type != $STRING
8968 && $property_type != $UNKNOWN)
8969 {
8970 my $table = $property_object->table($map);
8971 if (defined $table && $table == $default_table) {
8972 $map = $default_map;
8973 }
8974 }
8975
8976 # And figure out the map type if not known.
8977 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
8978 if ($map eq "") { # Nulls are always $NULL map type
8979 $map_type = $NULL;
8980 } # Otherwise, non-strings, and those that don't allow
8981 # $MULTI_CP, and those that aren't multiple code points are
8982 # 0
8983 elsif
8984 (($property_type != $STRING && $property_type != $UNKNOWN)
8985 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
8986 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
8987 {
8988 $map_type = 0;
8989 }
8990 else {
8991 $map_type = $MULTI_CP;
8992 }
8993 }
8994
8995 $property_object->add_map($low, $high,
8996 $map,
8997 Type => $map_type,
8998 Replace => $replace);
8999 } # End of loop through file's lines
9000
9001 return;
9002 }
9003}
9004
99870f4d
KW
9005{ # Closure for UnicodeData.txt handling
9006
9007 # This file was the first one in the UCD; its design leads to some
9008 # awkwardness in processing. Here is a sample line:
9009 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9010 # The fields in order are:
9011 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9012 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9013 my $CATEGORY = $i++; # category (e.g. "Lu")
9014 my $CCC = $i++; # Canonical combining class (e.g. "230")
9015 my $BIDI = $i++; # directional class (e.g. "L")
9016 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9017 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9018 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9019 # Dual-use in this program; see below
9020 my $NUMERIC = $i++; # numeric value
9021 my $MIRRORED = $i++; # ? mirrored
9022 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9023 my $COMMENT = $i++; # iso comment
9024 my $UPPER = $i++; # simple uppercase mapping
9025 my $LOWER = $i++; # simple lowercase mapping
9026 my $TITLE = $i++; # simple titlecase mapping
9027 my $input_field_count = $i;
9028
9029 # This routine in addition outputs these extra fields:
9030 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9031
9032 # These fields are modifications of ones above, and are usually
9033 # suppressed; they must come last, as for speed, the loop upper bound is
9034 # normally set to ignore them
9035 my $NAME = $i++; # This is the strict name field, not the one that
9036 # charnames uses.
9037 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9038 # by Unicode::Normalize
99870f4d
KW
9039 my $last_field = $i - 1;
9040
9041 # All these are read into an array for each line, with the indices defined
9042 # above. The empty fields in the example line above indicate that the
9043 # value is defaulted. The handler called for each line of the input
9044 # changes these to their defaults.
9045
9046 # Here are the official names of the properties, in a parallel array:
9047 my @field_names;
9048 $field_names[$BIDI] = 'Bidi_Class';
9049 $field_names[$CATEGORY] = 'General_Category';
9050 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9051 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9052 $field_names[$COMMENT] = 'ISO_Comment';
9053 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9054 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9055 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9056 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9057 $field_names[$NAME] = 'Name';
9058 $field_names[$NUMERIC] = 'Numeric_Value';
9059 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9060 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9061 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9062 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9063 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9064 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9065
28093d0e
KW
9066 # Some of these need a little more explanation:
9067 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9068 # property, but is used in calculating the Numeric_Type. Perl however,
9069 # creates a file from this field, so a Perl property is created from it.
9070 # Similarly, the Other_Digit field is used only for calculating the
9071 # Numeric_Type, and so it can be safely re-used as the place to store
9072 # the value for Numeric_Type; hence it is referred to as
9073 # $NUMERIC_TYPE_OTHER_DIGIT.
9074 # The input field named $PERL_DECOMPOSITION is a combination of both the
9075 # decomposition mapping and its type. Perl creates a file containing
9076 # exactly this field, so it is used for that. The two properties are
9077 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9078 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9079 # output it), as Perl doesn't use it directly.
9080 # The input field named here $CHARNAME is used to construct the
9081 # Perl_Charnames property, which is a combination of the Name property
9082 # (which the input field contains), and the Unicode_1_Name property, and
9083 # others from other files. Since, the strict Name property is not used
9084 # by Perl, this field is used for the table that Perl does use. The
9085 # strict Name property table is usually suppressed (unless the lists are
9086 # changed to output it), so it is accumulated in a separate field,
9087 # $NAME, which to save time is discarded unless the table is actually to
9088 # be output
99870f4d
KW
9089
9090 # This file is processed like most in this program. Control is passed to
9091 # process_generic_property_file() which calls filter_UnicodeData_line()
9092 # for each input line. This filter converts the input into line(s) that
9093 # process_generic_property_file() understands. There is also a setup
9094 # routine called before any of the file is processed, and a handler for
9095 # EOF processing, all in this closure.
9096
9097 # A huge speed-up occurred at the cost of some added complexity when these
9098 # routines were altered to buffer the outputs into ranges. Almost all the
9099 # lines of the input file apply to just one code point, and for most
9100 # properties, the map for the next code point up is the same as the
9101 # current one. So instead of creating a line for each property for each
9102 # input line, filter_UnicodeData_line() remembers what the previous map
9103 # of a property was, and doesn't generate a line to pass on until it has
9104 # to, as when the map changes; and that passed-on line encompasses the
9105 # whole contiguous range of code points that have the same map for that
9106 # property. This means a slight amount of extra setup, and having to
9107 # flush these buffers on EOF, testing if the maps have changed, plus
9108 # remembering state information in the closure. But it means a lot less
9109 # real time in not having to change the data base for each property on
9110 # each line.
9111
9112 # Another complication is that there are already a few ranges designated
9113 # in the input. There are two lines for each, with the same maps except
9114 # the code point and name on each line. This was actually the hardest
9115 # thing to design around. The code points in those ranges may actually
9116 # have real maps not given by these two lines. These maps will either
9117 # be algorthimically determinable, or in the extracted files furnished
9118 # with the UCD. In the event of conflicts between these extracted files,
9119 # and this one, Unicode says that this one prevails. But it shouldn't
9120 # prevail for conflicts that occur in these ranges. The data from the
9121 # extracted files prevails in those cases. So, this program is structured
9122 # so that those files are processed first, storing maps. Then the other
9123 # files are processed, generally overwriting what the extracted files
9124 # stored. But just the range lines in this input file are processed
9125 # without overwriting. This is accomplished by adding a special string to
9126 # the lines output to tell process_generic_property_file() to turn off the
9127 # overwriting for just this one line.
9128 # A similar mechanism is used to tell it that the map is of a non-default
9129 # type.
9130
9131 sub setup_UnicodeData { # Called before any lines of the input are read
9132 my $file = shift;
9133 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9134
28093d0e
KW
9135 # Create a new property specially located that is a combination of the
9136 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9137 # Name_Alias properties. (The final duplicates elements of the
9138 # first.) A comment for it will later be constructed based on the
9139 # actual properties present and used
3e20195b 9140 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9141 Core_Access => '\N{...} and "use charnames"',
9142 Default_Map => "",
9143 Directory => File::Spec->curdir(),
9144 File => 'Name',
9145 Internal_Only_Warning => 1,
9146 Perl_Extension => 1,
b1c167a3 9147 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9148 Type => $STRING,
9149 );
9150
99870f4d 9151 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9152 Directory => File::Spec->curdir(),
99870f4d
KW
9153 File => 'Decomposition',
9154 Format => $STRING_FORMAT,
9155 Internal_Only_Warning => 1,
9156 Perl_Extension => 1,
9157 Default_Map => $CODE_POINT,
9158
0c07e538
KW
9159 # normalize.pm can't cope with these
9160 Output_Range_Counts => 0,
9161
99870f4d
KW
9162 # This is a specially formatted table
9163 # explicitly for normalize.pm, which
9164 # is expecting a particular format,
9165 # which means that mappings containing
9166 # multiple code points are in the main
9167 # body of the table
9168 Map_Type => $COMPUTE_NO_MULTI_CP,
9169 Type => $STRING,
9170 );
9171 $Perl_decomp->add_comment(join_lines(<<END
9172This mapping is a combination of the Unicode 'Decomposition_Type' and
9173'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9174identical to the official Unicode 'Decomposition_Mapping' property except for
9175two things:
9176 1) It omits the algorithmically determinable Hangul syllable decompositions,
9177which normalize.pm handles algorithmically.
9178 2) It contains the decomposition type as well. Non-canonical decompositions
9179begin with a word in angle brackets, like <super>, which denotes the
9180compatible decomposition type. If the map does not begin with the <angle
9181brackets>, the decomposition is canonical.
9182END
9183 ));
9184
9185 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9186 Default_Map => "",
9187 Perl_Extension => 1,
9188 File => 'Digit', # Trad. location
9189 Directory => $map_directory,
9190 Type => $STRING,
9191 Range_Size_1 => 1,
9192 );
9193 $Decimal_Digit->add_comment(join_lines(<<END
9194This file gives the mapping of all code points which represent a single
9195decimal digit [0-9] to their respective digits. For example, the code point
9196U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9197that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9198numerals.
9199END
9200 ));
9201
28093d0e
KW
9202 # These properties are not used for generating anything else, and are
9203 # usually not output. By making them last in the list, we can just
99870f4d 9204 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
9205 # generating a table(s) that is/are just going to get thrown away.
9206 if (! property_ref('Decomposition_Mapping')->to_output_map
9207 && ! property_ref('Name')->to_output_map)
9208 {
9209 $last_field = min($NAME, $DECOMP_MAP) - 1;
9210 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9211 $last_field = $DECOMP_MAP;
9212 } elsif (property_ref('Name')->to_output_map) {
9213 $last_field = $NAME;
99870f4d
KW
9214 }
9215 return;
9216 }
9217
9218 my $first_time = 1; # ? Is this the first line of the file
9219 my $in_range = 0; # ? Are we in one of the file's ranges
9220 my $previous_cp; # hex code point of previous line
9221 my $decimal_previous_cp = -1; # And its decimal equivalent
9222 my @start; # For each field, the current starting
9223 # code point in hex for the range
9224 # being accumulated.
9225 my @fields; # The input fields;
9226 my @previous_fields; # And those from the previous call
9227
9228 sub filter_UnicodeData_line {
9229 # Handle a single input line from UnicodeData.txt; see comments above
9230 # Conceptually this takes a single line from the file containing N
9231 # properties, and converts it into N lines with one property per line,
9232 # which is what the final handler expects. But there are
9233 # complications due to the quirkiness of the input file, and to save
9234 # time, it accumulates ranges where the property values don't change
9235 # and only emits lines when necessary. This is about an order of
9236 # magnitude fewer lines emitted.
9237
9238 my $file = shift;
9239 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9240
9241 # $_ contains the input line.
9242 # -1 in split means retain trailing null fields
9243 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9244
9245 #local $to_trace = 1 if main::DEBUG;
9246 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9247 if (@fields > $input_field_count) {
9248 $file->carp_bad_line('Extra fields');
9249 $_ = "";
9250 return;
9251 }
9252
9253 my $decimal_cp = hex $cp;
9254
9255 # We have to output all the buffered ranges when the next code point
9256 # is not exactly one after the previous one, which means there is a
9257 # gap in the ranges.
9258 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9259
9260 # The decomposition mapping field requires special handling. It looks
9261 # like either:
9262 #
9263 # <compat> 0032 0020
9264 # 0041 0300
9265 #
9266 # The decomposition type is enclosed in <brackets>; if missing, it
9267 # means the type is canonical. There are two decomposition mapping
9268 # tables: the one for use by Perl's normalize.pm has a special format
9269 # which is this field intact; the other, for general use is of
9270 # standard format. In either case we have to find the decomposition
9271 # type. Empty fields have None as their type, and map to the code
9272 # point itself
9273 if ($fields[$PERL_DECOMPOSITION] eq "") {
9274 $fields[$DECOMP_TYPE] = 'None';
9275 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9276 }
9277 else {
9278 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9279 =~ / < ( .+? ) > \s* ( .+ ) /x;
9280 if (! defined $fields[$DECOMP_TYPE]) {
9281 $fields[$DECOMP_TYPE] = 'Canonical';
9282 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9283 }
9284 else {
9285 $fields[$DECOMP_MAP] = $map;
9286 }
9287 }
9288
9289 # The 3 numeric fields also require special handling. The 2 digit
9290 # fields must be either empty or match the number field. This means
9291 # that if it is empty, they must be as well, and the numeric type is
9292 # None, and the numeric value is 'Nan'.
9293 # The decimal digit field must be empty or match the other digit
9294 # field. If the decimal digit field is non-empty, the code point is
9295 # a decimal digit, and the other two fields will have the same value.
9296 # If it is empty, but the other digit field is non-empty, the code
9297 # point is an 'other digit', and the number field will have the same
9298 # value as the other digit field. If the other digit field is empty,
9299 # but the number field is non-empty, the code point is a generic
9300 # numeric type.
9301 if ($fields[$NUMERIC] eq "") {
9302 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9303 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9304 ) {
9305 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9306 }
9307 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9308 $fields[$NUMERIC] = 'NaN';
9309 }
9310 else {
9311 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9312 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9313 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9314 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9315 }
9316 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9317 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9318 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9319 }
9320 else {
9321 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9322
9323 # Rationals require extra effort.
9324 register_fraction($fields[$NUMERIC])
9325 if $fields[$NUMERIC] =~ qr{/};
9326 }
9327 }
9328
9329 # For the properties that have empty fields in the file, and which
9330 # mean something different from empty, change them to that default.
9331 # Certain fields just haven't been empty so far in any Unicode
9332 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9333 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 9334 # the defaults; which are very unlikely to ever change.
99870f4d
KW
9335 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9336 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9337
9338 # UAX44 says that if title is empty, it is the same as whatever upper
9339 # is,
9340 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9341
9342 # There are a few pairs of lines like:
9343 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9344 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9345 # that define ranges. These should be processed after the fields are
9346 # adjusted above, as they may override some of them; but mostly what
28093d0e 9347 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
9348 # paired lines start with a '<', but this is also true of '<control>,
9349 # which isn't one of these special ones.
28093d0e 9350 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
9351
9352 # Some code points in this file have the pseudo-name
9353 # '<control>', but the official name for such ones is the null
28093d0e 9354 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 9355 $fields[$NAME] = "";
28093d0e 9356 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
9357
9358 # We had better not be in between range lines.
9359 if ($in_range) {
28093d0e 9360 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9361 $in_range = 0;
9362 }
9363 }
28093d0e 9364 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
9365
9366 # Here is a non-range line. We had better not be in between range
9367 # lines.
9368 if ($in_range) {
28093d0e 9369 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9370 $in_range = 0;
9371 }
edb80b88
KW
9372 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
9373
9374 # These are code points whose names end in their code points,
9375 # which means the names are algorithmically derivable from the
9376 # code points. To shorten the output Name file, the algorithm
9377 # for deriving these is placed in the file instead of each
9378 # code point, so they have map type $CP_IN_NAME
9379 $fields[$CHARNAME] = $CMD_DELIM
9380 . $MAP_TYPE_CMD
9381 . '='
9382 . $CP_IN_NAME
9383 . $CMD_DELIM
9384 . $fields[$CHARNAME];
9385 }
28093d0e 9386 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 9387 }
28093d0e
KW
9388 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
9389 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
9390
9391 # Here we are at the beginning of a range pair.
9392 if ($in_range) {
28093d0e 9393 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9394 }
9395 $in_range = 1;
9396
9397 # Because the properties in the range do not overwrite any already
9398 # in the db, we must flush the buffers of what's already there, so
9399 # they get handled in the normal scheme.
9400 $force_output = 1;
9401
9402 }
28093d0e
KW
9403 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
9404 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
9405 $_ = "";
9406 return;
9407 }
9408 else { # Here, we are at the last line of a range pair.
9409
9410 if (! $in_range) {
28093d0e 9411 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
9412 $_ = "";
9413 return;
9414 }
9415 $in_range = 0;
9416
28093d0e
KW
9417 $fields[$NAME] = $fields[$CHARNAME];
9418
99870f4d
KW
9419 # Check that the input is valid: that the closing of the range is
9420 # the same as the beginning.
9421 foreach my $i (0 .. $last_field) {
9422 next if $fields[$i] eq $previous_fields[$i];
9423 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
9424 }
9425
9426 # The processing differs depending on the type of range,
28093d0e
KW
9427 # determined by its $CHARNAME
9428 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
9429
9430 # Check that the data looks right.
9431 if ($decimal_previous_cp != $SBase) {
9432 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
9433 }
9434 if ($decimal_cp != $SBase + $SCount - 1) {
9435 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
9436 }
9437
9438 # The Hangul syllable range has a somewhat complicated name
9439 # generation algorithm. Each code point in it has a canonical
9440 # decomposition also computable by an algorithm. The
9441 # perl decomposition map table built from these is used only
9442 # by normalize.pm, which has the algorithm built in it, so the
9443 # decomposition maps are not needed, and are large, so are
9444 # omitted from it. If the full decomposition map table is to
9445 # be output, the decompositions are generated for it, in the
9446 # EOF handling code for this input file.
9447
9448 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9449
9450 # This range is stored in our internal structure with its
9451 # own map type, different from all others.
28093d0e
KW
9452 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9453 = $CMD_DELIM
99870f4d
KW
9454 . $MAP_TYPE_CMD
9455 . '='
9456 . $HANGUL_SYLLABLE
9457 . $CMD_DELIM
28093d0e 9458 . $fields[$CHARNAME];
99870f4d 9459 }
28093d0e 9460 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
9461
9462 # The name for these contains the code point itself, and all
9463 # are defined to have the same base name, regardless of what
9464 # is in the file. They are stored in our internal structure
9465 # with a map type of $CP_IN_NAME
28093d0e
KW
9466 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9467 = $CMD_DELIM
99870f4d
KW
9468 . $MAP_TYPE_CMD
9469 . '='
9470 . $CP_IN_NAME
9471 . $CMD_DELIM
9472 . 'CJK UNIFIED IDEOGRAPH';
9473
9474 }
9475 elsif ($fields[$CATEGORY] eq 'Co'
9476 || $fields[$CATEGORY] eq 'Cs')
9477 {
9478 # The names of all the code points in these ranges are set to
9479 # null, as there are no names for the private use and
9480 # surrogate code points.
9481
28093d0e 9482 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
9483 }
9484 else {
28093d0e 9485 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
9486 }
9487
9488 # The first line of the range caused everything else to be output,
9489 # and then its values were stored as the beginning values for the
9490 # next set of ranges, which this one ends. Now, for each value,
9491 # add a command to tell the handler that these values should not
9492 # replace any existing ones in our database.
9493 foreach my $i (0 .. $last_field) {
9494 $previous_fields[$i] = $CMD_DELIM
9495 . $REPLACE_CMD
9496 . '='
9497 . $NO
9498 . $CMD_DELIM
9499 . $previous_fields[$i];
9500 }
9501
9502 # And change things so it looks like the entire range has been
9503 # gone through with this being the final part of it. Adding the
9504 # command above to each field will cause this range to be flushed
9505 # during the next iteration, as it guaranteed that the stored
9506 # field won't match whatever value the next one has.
9507 $previous_cp = $cp;
9508 $decimal_previous_cp = $decimal_cp;
9509
9510 # We are now set up for the next iteration; so skip the remaining
9511 # code in this subroutine that does the same thing, but doesn't
9512 # know about these ranges.
9513 $_ = "";
c1739a4a 9514
99870f4d
KW
9515 return;
9516 }
9517
9518 # On the very first line, we fake it so the code below thinks there is
9519 # nothing to output, and initialize so that when it does get output it
9520 # uses the first line's values for the lowest part of the range.
9521 # (One could avoid this by using peek(), but then one would need to
9522 # know the adjustments done above and do the same ones in the setup
9523 # routine; not worth it)
9524 if ($first_time) {
9525 $first_time = 0;
9526 @previous_fields = @fields;
9527 @start = ($cp) x scalar @fields;
9528 $decimal_previous_cp = $decimal_cp - 1;
9529 }
9530
9531 # For each field, output the stored up ranges that this code point
9532 # doesn't fit in. Earlier we figured out if all ranges should be
9533 # terminated because of changing the replace or map type styles, or if
9534 # there is a gap between this new code point and the previous one, and
9535 # that is stored in $force_output. But even if those aren't true, we
9536 # need to output the range if this new code point's value for the
9537 # given property doesn't match the stored range's.
9538 #local $to_trace = 1 if main::DEBUG;
9539 foreach my $i (0 .. $last_field) {
9540 my $field = $fields[$i];
9541 if ($force_output || $field ne $previous_fields[$i]) {
9542
9543 # Flush the buffer of stored values.
9544 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9545
9546 # Start a new range with this code point and its value
9547 $start[$i] = $cp;
9548 $previous_fields[$i] = $field;
9549 }
9550 }
9551
9552 # Set the values for the next time.
9553 $previous_cp = $cp;
9554 $decimal_previous_cp = $decimal_cp;
9555
9556 # The input line has generated whatever adjusted lines are needed, and
9557 # should not be looked at further.
9558 $_ = "";
9559 return;
9560 }
9561
9562 sub EOF_UnicodeData {
9563 # Called upon EOF to flush the buffers, and create the Hangul
9564 # decomposition mappings if needed.
9565
9566 my $file = shift;
9567 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9568
9569 # Flush the buffers.
9570 foreach my $i (1 .. $last_field) {
9571 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9572 }
9573
9574 if (-e 'Jamo.txt') {
9575
9576 # The algorithm is published by Unicode, based on values in
9577 # Jamo.txt, (which should have been processed before this
9578 # subroutine), and the results left in %Jamo
9579 unless (%Jamo) {
9580 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
9581 return;
9582 }
9583
9584 # If the full decomposition map table is being output, insert
9585 # into it the Hangul syllable mappings. This is to avoid having
9586 # to publish a subroutine in it to compute them. (which would
9587 # essentially be this code.) This uses the algorithm published by
9588 # Unicode.
9589 if (property_ref('Decomposition_Mapping')->to_output_map) {
28093d0e 9590 local $to_trace = 1 if main::DEBUG;
99870f4d
KW
9591 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9592 use integer;
9593 my $SIndex = $S - $SBase;
9594 my $L = $LBase + $SIndex / $NCount;
9595 my $V = $VBase + ($SIndex % $NCount) / $TCount;
9596 my $T = $TBase + $SIndex % $TCount;
9597
9598 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9599 my $decomposition = sprintf("%04X %04X", $L, $V);
9600 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9601 $file->insert_adjusted_lines(
9602 sprintf("%04X; Decomposition_Mapping; %s",
9603 $S,
9604 $decomposition));
9605 }
9606 }
9607 }
9608
9609 return;
9610 }
9611
9612 sub filter_v1_ucd {
9613 # Fix UCD lines in version 1. This is probably overkill, but this
9614 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
9615 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
9616 # removed. This program retains them
9617 # 2) didn't include ranges, which it should have, and which are now
9618 # added in @corrected_lines below. It was hand populated by
9619 # taking the data from Version 2, verified by analyzing
9620 # DAge.txt.
9621 # 3) There is a syntax error in the entry for U+09F8 which could
9622 # cause problems for utf8_heavy, and so is changed. It's
9623 # numeric value was simply a minus sign, without any number.
9624 # (Eventually Unicode changed the code point to non-numeric.)
9625 # 4) The decomposition types often don't match later versions
9626 # exactly, and the whole syntax of that field is different; so
9627 # the syntax is changed as well as the types to their later
9628 # terminology. Otherwise normalize.pm would be very unhappy
9629 # 5) Many ccc classes are different. These are left intact.
9630 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
9631 # fields. These are unchanged because it doesn't really cause
9632 # problems for Perl.
9633 # 7) A number of code points, such as controls, don't have their
9634 # Unicode Version 1 Names in this file. These are unchanged.
9635
9636 my @corrected_lines = split /\n/, <<'END';
96374E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
96389FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9639E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9640F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9641F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9642FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9643END
9644
9645 my $file = shift;
9646 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9647
9648 #local $to_trace = 1 if main::DEBUG;
9649 trace $_ if main::DEBUG && $to_trace;
9650
9651 # -1 => retain trailing null fields
9652 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9653
9654 # At the first place that is wrong in the input, insert all the
9655 # corrections, replacing the wrong line.
9656 if ($code_point eq '4E00') {
9657 my @copy = @corrected_lines;
9658 $_ = shift @copy;
9659 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9660
9661 $file->insert_lines(@copy);
9662 }
9663
9664
9665 if ($fields[$NUMERIC] eq '-') {
9666 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
9667 }
9668
9669 if ($fields[$PERL_DECOMPOSITION] ne "") {
9670
9671 # Several entries have this change to superscript 2 or 3 in the
9672 # middle. Convert these to the modern version, which is to use
9673 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9674 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9675 # 'HHHH HHHH 00B3 HHHH'.
9676 # It turns out that all of these that don't have another
9677 # decomposition defined at the beginning of the line have the
9678 # <square> decomposition in later releases.
9679 if ($code_point ne '00B2' && $code_point ne '00B3') {
9680 if ($fields[$PERL_DECOMPOSITION]
9681 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9682 {
9683 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9684 $fields[$PERL_DECOMPOSITION] = '<square> '
9685 . $fields[$PERL_DECOMPOSITION];
9686 }
9687 }
9688 }
9689
9690 # If is like '<+circled> 0052 <-circled>', convert to
9691 # '<circled> 0052'
9692 $fields[$PERL_DECOMPOSITION] =~
9693 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9694
9695 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9696 $fields[$PERL_DECOMPOSITION] =~
9697 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9698 or $fields[$PERL_DECOMPOSITION] =~
9699 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9700 or $fields[$PERL_DECOMPOSITION] =~
9701 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9702 or $fields[$PERL_DECOMPOSITION] =~
9703 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9704
9705 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
9706 $fields[$PERL_DECOMPOSITION] =~
9707 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9708
9709 # Change names to modern form.
9710 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9711 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9712 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9713 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9714
9715 # One entry has weird braces
9716 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9717 }
9718
9719 $_ = join ';', $code_point, @fields;
9720 trace $_ if main::DEBUG && $to_trace;
9721 return;
9722 }
9723
9724 sub filter_v2_1_5_ucd {
9725 # A dozen entries in this 2.1.5 file had the mirrored and numeric
9726 # columns swapped; These all had mirrored be 'N'. So if the numeric
9727 # column appears to be N, swap it back.
9728
9729 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9730 if ($fields[$NUMERIC] eq 'N') {
9731 $fields[$NUMERIC] = $fields[$MIRRORED];
9732 $fields[$MIRRORED] = 'N';
9733 $_ = join ';', $code_point, @fields;
9734 }
9735 return;
9736 }
9737} # End closure for UnicodeData
9738
37e2e78e
KW
9739sub process_GCB_test {
9740
9741 my $file = shift;
9742 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9743
9744 while ($file->next_line) {
9745 push @backslash_X_tests, $_;
9746 }
678f13d5 9747
37e2e78e
KW
9748 return;
9749}
9750
99870f4d
KW
9751sub process_NamedSequences {
9752 # NamedSequences.txt entries are just added to an array. Because these
9753 # don't look like the other tables, they have their own handler.
9754 # An example:
9755 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9756 #
9757 # This just adds the sequence to an array for later handling
9758
99870f4d
KW
9759 my $file = shift;
9760 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9761
9762 while ($file->next_line) {
9763 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9764 if (@remainder) {
9765 $file->carp_bad_line(
9766 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9767 next;
9768 }
fb121860
KW
9769
9770 # Note single \t in keeping with special output format of
9771 # Perl_charnames. But it turns out that the code points don't have to
9772 # be 5 digits long, like the rest, based on the internal workings of
9773 # charnames.pm. This could be easily changed for consistency.
9774 push @named_sequences, "$sequence\t$name";
99870f4d
KW
9775 }
9776 return;
9777}
9778
9779{ # Closure
9780
9781 my $first_range;
9782
9783 sub filter_early_ea_lb {
9784 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
9785 # third field be the name of the code point, which can be ignored in
9786 # most cases. But it can be meaningful if it marks a range:
9787 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9788 # 3400;W;<CJK Ideograph Extension A, First>
9789 #
9790 # We need to see the First in the example above to know it's a range.
9791 # They did not use the later range syntaxes. This routine changes it
9792 # to use the modern syntax.
9793 # $1 is the Input_file object.
9794
9795 my @fields = split /\s*;\s*/;
9796 if ($fields[2] =~ /^<.*, First>/) {
9797 $first_range = $fields[0];
9798 $_ = "";
9799 }
9800 elsif ($fields[2] =~ /^<.*, Last>/) {
9801 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9802 }
9803 else {
9804 undef $first_range;
9805 $_ = "$fields[0]; $fields[1]";
9806 }
9807
9808 return;
9809 }
9810}
9811
9812sub filter_old_style_arabic_shaping {
9813 # Early versions used a different term for the later one.
9814
9815 my @fields = split /\s*;\s*/;
9816 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9817 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
9818 $_ = join ';', @fields;
9819 return;
9820}
9821
9822sub filter_arabic_shaping_line {
9823 # ArabicShaping.txt has entries that look like:
9824 # 062A; TEH; D; BEH
9825 # The field containing 'TEH' is not used. The next field is Joining_Type
9826 # and the last is Joining_Group
9827 # This generates two lines to pass on, one for each property on the input
9828 # line.
9829
9830 my $file = shift;
9831 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9832
9833 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9834
9835 if (@fields > 4) {
9836 $file->carp_bad_line('Extra fields');
9837 $_ = "";
9838 return;
9839 }
9840
9841 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9842 $_ = "$fields[0]; Joining_Type; $fields[2]";
9843
9844 return;
9845}
9846
9847sub setup_special_casing {
9848 # SpecialCasing.txt contains the non-simple case change mappings. The
959ce5bf
KW
9849 # simple ones are in UnicodeData.txt, which should already have been read
9850 # in to the full property data structures, so as to initialize these with
9851 # the simple ones. Then the SpecialCasing.txt entries overwrite the ones
9852 # which have different full mappings.
9853
9854 # This routine sees if the simple mappings are to be output, and if so,
9855 # copies what has already been put into the full mapping tables, while
9856 # they still contain only the simple mappings.
9857
9858 # The reason it is done this way is that the simple mappings are probably
9859 # not going to be output, so it saves work to initialize the full tables
9860 # with the simple mappings, and then overwrite those relatively few
9861 # entries in them that have different full mappings, and thus skip the
9862 # simple mapping tables altogether.
99870f4d
KW
9863
9864 my $file= shift;
9865 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9866
9867 # For each of the case change mappings...
9868 foreach my $case ('lc', 'tc', 'uc') {
959ce5bf
KW
9869 my $full = property_ref($case);
9870 unless (defined $full && ! $full->is_empty) {
9871 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
9872 }
99870f4d
KW
9873
9874 # The simple version's name in each mapping merely has an 's' in front
9875 # of the full one's
9876 my $simple = property_ref('s' . $case);
d7078fb7 9877 $simple->initialize($full) if $simple->to_output_map();
99870f4d
KW
9878 }
9879
9880 return;
9881}
9882
9883sub filter_special_casing_line {
9884 # Change the format of $_ from SpecialCasing.txt into something that the
9885 # generic handler understands. Each input line contains three case
9886 # mappings. This will generate three lines to pass to the generic handler
9887 # for each of those.
9888
9889 # The input syntax (after stripping comments and trailing white space is
9890 # like one of the following (with the final two being entries that we
9891 # ignore):
9892 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9893 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9894 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9895 # Note the trailing semi-colon, unlike many of the input files. That
9896 # means that there will be an extra null field generated by the split
9897
9898 my $file = shift;
9899 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9900
9901 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9902
9903 # field #4 is when this mapping is conditional. If any of these get
9904 # implemented, it would be by hard-coding in the casing functions in the
9905 # Perl core, not through tables. But if there is a new condition we don't
9906 # know about, output a warning. We know about all the conditions through
9907 # 5.2
9908 if ($fields[4] ne "") {
9909 my @conditions = split ' ', $fields[4];
9910 if ($conditions[0] ne 'tr' # We know that these languages have
9911 # conditions, and some are multiple
9912 && $conditions[0] ne 'az'
9913 && $conditions[0] ne 'lt'
9914
9915 # And, we know about a single condition Final_Sigma, but
9916 # nothing else.
9917 && ($v_version gt v5.2.0
9918 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9919 {
9920 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9921 }
9922 elsif ($conditions[0] ne 'Final_Sigma') {
9923
9924 # Don't print out a message for Final_Sigma, because we have
9925 # hard-coded handling for it. (But the standard could change
9926 # what the rule should be, but it wouldn't show up here
9927 # anyway.
9928
9929 print "# SKIPPING Special Casing: $_\n"
9930 if $verbosity >= $VERBOSE;
9931 }
9932 $_ = "";
9933 return;
9934 }
9935 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9936 $file->carp_bad_line('Extra fields');
9937 $_ = "";
9938 return;
9939 }
9940
9941 $_ = "$fields[0]; lc; $fields[1]";
9942 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9943 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9944
9945 return;
9946}
9947
9948sub filter_old_style_case_folding {
9949 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 9950 # and later style. Different letters were used in the earlier.
99870f4d
KW
9951
9952 my $file = shift;
9953 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9954
9955 my @fields = split /\s*;\s*/;
9956 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
9957 $fields[1] = 'I';
9958 }
9959 elsif ($fields[1] eq 'L') {
9960 $fields[1] = 'C'; # L => C always
9961 }
9962 elsif ($fields[1] eq 'E') {
9963 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
9964 $fields[1] = 'F'
9965 }
9966 else {
9967 $fields[1] = 'C'
9968 }
9969 }
9970 else {
9971 $file->carp_bad_line("Expecting L or E in second field");
9972 $_ = "";
9973 return;
9974 }
9975 $_ = join("; ", @fields) . ';';
9976 return;
9977}
9978
9979{ # Closure for case folding
9980
9981 # Create the map for simple only if are going to output it, for otherwise
9982 # it takes no part in anything we do.
9983 my $to_output_simple;
9984
9985 # These are experimental, perhaps will need these to pass to regcomp.c to
9986 # handle the cases where for example the Kelvin sign character folds to k,
9987 # and in regcomp, we need to know which of the characters can have a
9988 # non-latin1 char fold to it, so it doesn't do the optimizations it might
9989 # otherwise.
9990 my @latin1_singly_folded;
9991 my @latin1_folded;
9992
9993 sub setup_case_folding($) {
9994 # Read in the case foldings in CaseFolding.txt. This handles both
9995 # simple and full case folding.
9996
9997 $to_output_simple
9998 = property_ref('Simple_Case_Folding')->to_output_map;
9999
10000 return;
10001 }
10002
10003 sub filter_case_folding_line {
10004 # Called for each line in CaseFolding.txt
10005 # Input lines look like:
10006 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10007 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10008 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10009 #
10010 # 'C' means that folding is the same for both simple and full
10011 # 'F' that it is only for full folding
10012 # 'S' that it is only for simple folding
10013 # 'T' is locale-dependent, and ignored
10014 # 'I' is a type of 'F' used in some early releases.
10015 # Note the trailing semi-colon, unlike many of the input files. That
10016 # means that there will be an extra null field generated by the split
10017 # below, which we ignore and hence is not an error.
10018
10019 my $file = shift;
10020 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10021
10022 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10023 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10024 $file->carp_bad_line('Extra fields');
10025 $_ = "";
10026 return;
10027 }
10028
10029 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10030 $_ = "";
10031 return;
10032 }
10033
10034 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
10035 # I are all full foldings
10036 if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
10037 $_ = "$range; Case_Folding; $map";
10038 }
10039 else {
10040 $_ = "";
10041 if ($type ne 'S') {
10042 $file->carp_bad_line('Expecting C F I S or T in second field');
10043 return;
10044 }
10045 }
10046
10047 # C and S are simple foldings, but simple case folding is not needed
10048 # unless we explicitly want its map table output.
10049 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10050 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10051 }
10052
10053 # Experimental, see comment above
10054 if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
10055 my @folded = split ' ', $map;
10056 if (hex $folded[0] < 256 && @folded == 1) {
10057 push @latin1_singly_folded, hex $folded[0];
10058 }
10059 foreach my $folded (@folded) {
10060 push @latin1_folded, hex $folded if hex $folded < 256;
10061 }
10062 }
10063
10064 return;
10065 }
10066
10067 sub post_fold {
10068 # Experimental, see comment above
10069 return;
10070
10071 #local $to_trace = 1 if main::DEBUG;
10072 @latin1_singly_folded = uniques(@latin1_singly_folded);
10073 @latin1_folded = uniques(@latin1_folded);
10074 trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
10075 trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
10076 return;
10077 }
10078} # End case fold closure
10079
10080sub filter_jamo_line {
10081 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10082 # from this file that is used in generating the Name property for Jamo
10083 # code points. But, it also is used to convert early versions' syntax
10084 # into the modern form. Here are two examples:
10085 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10086 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10087 #
10088 # The input is $_, the output is $_ filtered.
10089
10090 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10091
10092 # Let the caller handle unexpected input. In earlier versions, there was
10093 # a third field which is supposed to be a comment, but did not have a '#'
10094 # before it.
10095 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10096
10097 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10098 # beginning.
10099
10100 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10101 $fields[1] = 'R' if $fields[0] eq '1105';
10102
10103 # Add to structure so can generate Names from it.
10104 my $cp = hex $fields[0];
10105 my $short_name = $fields[1];
10106 $Jamo{$cp} = $short_name;
10107 if ($cp <= $LBase + $LCount) {
10108 $Jamo_L{$short_name} = $cp - $LBase;
10109 }
10110 elsif ($cp <= $VBase + $VCount) {
10111 $Jamo_V{$short_name} = $cp - $VBase;
10112 }
10113 elsif ($cp <= $TBase + $TCount) {
10114 $Jamo_T{$short_name} = $cp - $TBase;
10115 }
10116 else {
10117 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10118 }
10119
10120
10121 # Reassemble using just the first two fields to look like a typical
10122 # property file line
10123 $_ = "$fields[0]; $fields[1]";
10124
10125 return;
10126}
10127
99870f4d
KW
10128sub register_fraction($) {
10129 # This registers the input rational number so that it can be passed on to
10130 # utf8_heavy.pl, both in rational and floating forms.
10131
10132 my $rational = shift;
10133
10134 my $float = eval $rational;
10135 $nv_floating_to_rational{$float} = $rational;
10136 return;
10137}
10138
10139sub filter_numeric_value_line {
10140 # DNumValues contains lines of a different syntax than the typical
10141 # property file:
10142 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10143 #
10144 # This routine transforms $_ containing the anomalous syntax to the
10145 # typical, by filtering out the extra columns, and convert early version
10146 # decimal numbers to strings that look like rational numbers.
10147
10148 my $file = shift;
10149 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10150
10151 # Starting in 5.1, there is a rational field. Just use that, omitting the
10152 # extra columns. Otherwise convert the decimal number in the second field
10153 # to a rational, and omit extraneous columns.
10154 my @fields = split /\s*;\s*/, $_, -1;
10155 my $rational;
10156
10157 if ($v_version ge v5.1.0) {
10158 if (@fields != 4) {
10159 $file->carp_bad_line('Not 4 semi-colon separated fields');
10160 $_ = "";
10161 return;
10162 }
10163 $rational = $fields[3];
10164 $_ = join '; ', @fields[ 0, 3 ];
10165 }
10166 else {
10167
10168 # Here, is an older Unicode file, which has decimal numbers instead of
10169 # rationals in it. Use the fraction to calculate the denominator and
10170 # convert to rational.
10171
10172 if (@fields != 2 && @fields != 3) {
10173 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10174 $_ = "";
10175 return;
10176 }
10177
10178 my $codepoints = $fields[0];
10179 my $decimal = $fields[1];
10180 if ($decimal =~ s/\.0+$//) {
10181
10182 # Anything ending with a decimal followed by nothing but 0's is an
10183 # integer
10184 $_ = "$codepoints; $decimal";
10185 $rational = $decimal;
10186 }
10187 else {
10188
10189 my $denominator;
10190 if ($decimal =~ /\.50*$/) {
10191 $denominator = 2;
10192 }
10193
10194 # Here have the hardcoded repeating decimals in the fraction, and
10195 # the denominator they imply. There were only a few denominators
10196 # in the older Unicode versions of this file which this code
10197 # handles, so it is easy to convert them.
10198
10199 # The 4 is because of a round-off error in the Unicode 3.2 files
10200 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10201 $denominator = 3;
10202 }
10203 elsif ($decimal =~ /\.[27]50*$/) {
10204 $denominator = 4;
10205 }
10206 elsif ($decimal =~ /\.[2468]0*$/) {
10207 $denominator = 5;
10208 }
10209 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10210 $denominator = 6;
10211 }
10212 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10213 $denominator = 8;
10214 }
10215 if ($denominator) {
10216 my $sign = ($decimal < 0) ? "-" : "";
10217 my $numerator = int((abs($decimal) * $denominator) + .5);
10218 $rational = "$sign$numerator/$denominator";
10219 $_ = "$codepoints; $rational";
10220 }
10221 else {
10222 $file->carp_bad_line("Can't cope with number '$decimal'.");
10223 $_ = "";
10224 return;
10225 }
10226 }
10227 }
10228
10229 register_fraction($rational) if $rational =~ qr{/};
10230 return;
10231}
10232
10233{ # Closure
10234 my %unihan_properties;
10235 my $iicore;
10236
10237
10238 sub setup_unihan {
10239 # Do any special setup for Unihan properties.
10240
10241 # This property gives the wrong computed type, so override.
10242 my $usource = property_ref('kIRG_USource');
10243 $usource->set_type($STRING) if defined $usource;
10244
10245 # This property is to be considered binary, so change all the values
10246 # to Y.
10247 $iicore = property_ref('kIICore');
10248 if (defined $iicore) {
10249 $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10250
10251 # We have to change the default map, because the @missing line is
10252 # misleading, given that we are treating it as binary.
10253 $iicore->set_default_map('N');
10254 $iicore->set_type($BINARY);
10255 }
10256
10257 return;
10258 }
10259
10260 sub filter_unihan_line {
10261 # Change unihan db lines to look like the others in the db. Here is
10262 # an input sample:
10263 # U+341C kCangjie IEKN
10264
10265 # Tabs are used instead of semi-colons to separate fields; therefore
10266 # they may have semi-colons embedded in them. Change these to periods
10267 # so won't screw up the rest of the code.
10268 s/;/./g;
10269
10270 # Remove lines that don't look like ones we accept.
10271 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10272 $_ = "";
10273 return;
10274 }
10275
10276 # Extract the property, and save a reference to its object.
10277 my $property = $1;
10278 if (! exists $unihan_properties{$property}) {
10279 $unihan_properties{$property} = property_ref($property);
10280 }
10281
10282 # Don't do anything unless the property is one we're handling, which
10283 # we determine by seeing if there is an object defined for it or not
10284 if (! defined $unihan_properties{$property}) {
10285 $_ = "";
10286 return;
10287 }
10288
10289 # The iicore property is supposed to be a boolean, so convert to our
10290 # standard boolean form.
10291 if (defined $iicore && $unihan_properties{$property} == $iicore) {
10292 $_ =~ s/$property.*/$property\tY/
10293 }
10294
10295 # Convert the tab separators to our standard semi-colons, and convert
10296 # the U+HHHH notation to the rest of the standard's HHHH
10297 s/\t/;/g;
10298 s/\b U \+ (?= $code_point_re )//xg;
10299
10300 #local $to_trace = 1 if main::DEBUG;
10301 trace $_ if main::DEBUG && $to_trace;
10302
10303 return;
10304 }
10305}
10306
10307sub filter_blocks_lines {
10308 # In the Blocks.txt file, the names of the blocks don't quite match the
10309 # names given in PropertyValueAliases.txt, so this changes them so they
10310 # do match: Blanks and hyphens are changed into underscores. Also makes
10311 # early release versions look like later ones
10312 #
10313 # $_ is transformed to the correct value.
10314
10315 my $file = shift;
10316 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10317
10318 if ($v_version lt v3.2.0) {
10319 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10320 $_ = "";
10321 return;
10322 }
10323
10324 # Old versions used a different syntax to mark the range.
10325 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10326 }
10327
10328 my @fields = split /\s*;\s*/, $_, -1;
10329 if (@fields != 2) {
10330 $file->carp_bad_line("Expecting exactly two fields");
10331 $_ = "";
10332 return;
10333 }
10334
10335 # Change hyphens and blanks in the block name field only
10336 $fields[1] =~ s/[ -]/_/g;
10337 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
10338
10339 $_ = join("; ", @fields);
10340 return;
10341}
10342
10343{ # Closure
10344 my $current_property;
10345
10346 sub filter_old_style_proplist {
10347 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
10348 # was in a completely different syntax. Ken Whistler of Unicode says
10349 # that it was something he used as an aid for his own purposes, but
10350 # was never an official part of the standard. However, comments in
10351 # DAge.txt indicate that non-character code points were available in
10352 # the UCD as of 3.1. It is unclear to me (khw) how they could be
10353 # there except through this file (but on the other hand, they first
10354 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10355 # not. But the claim is that it was published as an aid to others who
10356 # might want some more information than was given in the official UCD
10357 # of the time. Many of the properties in it were incorporated into
10358 # the later PropList.txt, but some were not. This program uses this
10359 # early file to generate property tables that are otherwise not
10360 # accessible in the early UCD's, and most were probably not really
10361 # official at that time, so one could argue that it should be ignored,
10362 # and you can easily modify things to skip this. And there are bugs
10363 # in this file in various versions. (For example, the 2.1.9 version
10364 # removes from Alphabetic the CJK range starting at 4E00, and they
10365 # weren't added back in until 3.1.0.) Many of this file's properties
10366 # were later sanctioned, so this code generates tables for those
10367 # properties that aren't otherwise in the UCD of the time but
10368 # eventually did become official, and throws away the rest. Here is a
10369 # list of all the ones that are thrown away:
10370 # Bidi=* duplicates UnicodeData.txt
10371 # Combining never made into official property;
10372 # is \P{ccc=0}
10373 # Composite never made into official property.
10374 # Currency Symbol duplicates UnicodeData.txt: gc=sc
10375 # Decimal Digit duplicates UnicodeData.txt: gc=nd
10376 # Delimiter never made into official property;
10377 # removed in 3.0.1
10378 # Format Control never made into official property;
10379 # similar to gc=cf
10380 # High Surrogate duplicates Blocks.txt
10381 # Ignorable Control never made into official property;
10382 # similar to di=y
10383 # ISO Control duplicates UnicodeData.txt: gc=cc
10384 # Left of Pair never made into official property;
10385 # Line Separator duplicates UnicodeData.txt: gc=zl
10386 # Low Surrogate duplicates Blocks.txt
10387 # Non-break was actually listed as a property
10388 # in 3.2, but without any code
10389 # points. Unicode denies that this
10390 # was ever an official property
10391 # Non-spacing duplicate UnicodeData.txt: gc=mn
10392 # Numeric duplicates UnicodeData.txt: gc=cc
10393 # Paired Punctuation never made into official property;
10394 # appears to be gc=ps + gc=pe
10395 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
10396 # Private Use duplicates UnicodeData.txt: gc=co
10397 # Private Use High Surrogate duplicates Blocks.txt
10398 # Punctuation duplicates UnicodeData.txt: gc=p
10399 # Space different definition than eventual
10400 # one.
10401 # Titlecase duplicates UnicodeData.txt: gc=lt
10402 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
10403 # Zero-width never made into offical property;
10404 # subset of gc=cf
10405 # Most of the properties have the same names in this file as in later
10406 # versions, but a couple do not.
10407 #
10408 # This subroutine filters $_, converting it from the old style into
10409 # the new style. Here's a sample of the old-style
10410 #
10411 # *******************************************
10412 #
10413 # Property dump for: 0x100000A0 (Join Control)
10414 #
10415 # 200C..200D (2 chars)
10416 #
10417 # In the example, the property is "Join Control". It is kept in this
10418 # closure between calls to the subroutine. The numbers beginning with
10419 # 0x were internal to Ken's program that generated this file.
10420
10421 # If this line contains the property name, extract it.
10422 if (/^Property dump for: [^(]*\((.*)\)/) {
10423 $_ = $1;
10424
10425 # Convert white space to underscores.
10426 s/ /_/g;
10427
10428 # Convert the few properties that don't have the same name as
10429 # their modern counterparts
10430 s/Identifier_Part/ID_Continue/
10431 or s/Not_a_Character/NChar/;
10432
10433 # If the name matches an existing property, use it.
10434 if (defined property_ref($_)) {
10435 trace "new property=", $_ if main::DEBUG && $to_trace;
10436 $current_property = $_;
10437 }
10438 else { # Otherwise discard it
10439 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10440 undef $current_property;
10441 }
10442 $_ = ""; # The property is saved for the next lines of the
10443 # file, but this defining line is of no further use,
10444 # so clear it so that the caller won't process it
10445 # further.
10446 }
10447 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10448
10449 # Here, the input line isn't a header defining a property for the
10450 # following section, and either we aren't in such a section, or
10451 # the line doesn't look like one that defines the code points in
10452 # such a section. Ignore this line.
10453 $_ = "";
10454 }
10455 else {
10456
10457 # Here, we have a line defining the code points for the current
10458 # stashed property. Anything starting with the first blank is
10459 # extraneous. Otherwise, it should look like a normal range to
10460 # the caller. Append the property name so that it looks just like
10461 # a modern PropList entry.
10462
10463 $_ =~ s/\s.*//;
10464 $_ .= "; $current_property";
10465 }
10466 trace $_ if main::DEBUG && $to_trace;
10467 return;
10468 }
10469} # End closure for old style proplist
10470
10471sub filter_old_style_normalization_lines {
10472 # For early releases of Unicode, the lines were like:
10473 # 74..2A76 ; NFKD_NO
10474 # For later releases this became:
10475 # 74..2A76 ; NFKD_QC; N
10476 # Filter $_ to look like those in later releases.
10477 # Similarly for MAYBEs
10478
10479 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10480
10481 # Also, the property FC_NFKC was abbreviated to FNC
10482 s/FNC/FC_NFKC/;
10483 return;
10484}
10485
10486sub finish_Unicode() {
10487 # This routine should be called after all the Unicode files have been read
10488 # in. It:
10489 # 1) Adds the mappings for code points missing from the files which have
10490 # defaults specified for them.
10491 # 2) At this this point all mappings are known, so it computes the type of
10492 # each property whose type hasn't been determined yet.
10493 # 3) Calculates all the regular expression match tables based on the
10494 # mappings.
10495 # 3) Calculates and adds the tables which are defined by Unicode, but
10496 # which aren't derived by them
10497
10498 # For each property, fill in any missing mappings, and calculate the re
10499 # match tables. If a property has more than one missing mapping, the
10500 # default is a reference to a data structure, and requires data from other
10501 # properties to resolve. The sort is used to cause these to be processed
10502 # last, after all the other properties have been calculated.
10503 # (Fortunately, the missing properties so far don't depend on each other.)
10504 foreach my $property
10505 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10506 property_ref('*'))
10507 {
10508 # $perl has been defined, but isn't one of the Unicode properties that
10509 # need to be finished up.
10510 next if $property == $perl;
10511
10512 # Handle the properties that have more than one possible default
10513 if (ref $property->default_map) {
10514 my $default_map = $property->default_map;
10515
10516 # These properties have stored in the default_map:
10517 # One or more of:
10518 # 1) A default map which applies to all code points in a
10519 # certain class
10520 # 2) an expression which will evaluate to the list of code
10521 # points in that class
10522 # And
10523 # 3) the default map which applies to every other missing code
10524 # point.
10525 #
10526 # Go through each list.
10527 while (my ($default, $eval) = $default_map->get_next_defaults) {
10528
10529 # Get the class list, and intersect it with all the so-far
10530 # unspecified code points yielding all the code points
10531 # in the class that haven't been specified.
10532 my $list = eval $eval;
10533 if ($@) {
10534 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10535 last;
10536 }
10537
10538 # Narrow down the list to just those code points we don't have
10539 # maps for yet.
10540 $list = $list & $property->inverse_list;
10541
10542 # Add mappings to the property for each code point in the list
10543 foreach my $range ($list->ranges) {
10544 $property->add_map($range->start, $range->end, $default);
10545 }
10546 }
10547
10548 # All remaining code points have the other mapping. Set that up
10549 # so the normal single-default mapping code will work on them
10550 $property->set_default_map($default_map->other_default);
10551
10552 # And fall through to do that
10553 }
10554
10555 # We should have enough data now to compute the type of the property.
10556 $property->compute_type;
10557 my $property_type = $property->type;
10558
10559 next if ! $property->to_create_match_tables;
10560
10561 # Here want to create match tables for this property
10562
10563 # The Unicode db always (so far, and they claim into the future) have
10564 # the default for missing entries in binary properties be 'N' (unless
10565 # there is a '@missing' line that specifies otherwise)
10566 if ($property_type == $BINARY && ! defined $property->default_map) {
10567 $property->set_default_map('N');
10568 }
10569
10570 # Add any remaining code points to the mapping, using the default for
10571 # missing code points
10572 if (defined (my $default_map = $property->default_map)) {
10573 foreach my $range ($property->inverse_list->ranges) {
10574 $property->add_map($range->start, $range->end, $default_map);
10575 }
10576
10577 # Make sure there is a match table for the default
10578 if (! defined $property->table($default_map)) {
10579 $property->add_match_table($default_map);
10580 }
10581 }
10582
10583 # Have all we need to populate the match tables.
10584 my $property_name = $property->name;
10585 foreach my $range ($property->ranges) {
10586 my $map = $range->value;
10587 my $table = property_ref($property_name)->table($map);
10588 if (! defined $table) {
10589
10590 # Integral and rational property values are not necessarily
10591 # defined in PropValueAliases, but all other ones should be,
10592 # starting in 5.1
10593 if ($v_version ge v5.1.0
10594 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10595 {
10596 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
10597 }
10598 $table = property_ref($property_name)->add_match_table($map);
10599 }
10600
10601 $table->add_range($range->start, $range->end);
10602 }
10603
10604 # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10605 # all properties have this optional prefix. These do not get a
10606 # separate entry in the pod file, because are covered by a wild-card
10607 # entry
10608 foreach my $alias ($property->aliases) {
10609 my $Is_name = 'Is_' . $alias->name;
10610 if (! defined (my $pre_existing = property_ref($Is_name))) {
10611 $property->add_alias($Is_name,
10612 Pod_Entry => 0,
10613 Status => $alias->status,
10614 Externally_Ok => 0);
10615 }
10616 else {
10617
10618 # It seemed too much work to add in these warnings when it
10619 # appears that Unicode has made a decision never to begin a
10620 # property name with 'Is_', so this shouldn't happen, but just
10621 # in case, it is a warning.
10622 Carp::my_carp(<<END
10623There is already an alias named $Is_name (from " . $pre_existing . "), so not
10624creating this alias for $property. The generated table and pod files do not
10625warn users of this conflict.
10626END
10627 );
10628 $has_Is_conflicts++;
10629 }
10630 } # End of loop through aliases for this property
10631 } # End of loop through all Unicode properties.
10632
10633 # Fill in the mappings that Unicode doesn't completely furnish. First the
10634 # single letter major general categories. If Unicode were to start
10635 # delivering the values, this would be redundant, but better that than to
10636 # try to figure out if should skip and not get it right. Ths could happen
10637 # if a new major category were to be introduced, and the hard-coded test
10638 # wouldn't know about it.
10639 # This routine depends on the standard names for the general categories
10640 # being what it thinks they are, like 'Cn'. The major categories are the
10641 # union of all the general category tables which have the same first
10642 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10643 foreach my $minor_table ($gc->tables) {
10644 my $minor_name = $minor_table->name;
10645 next if length $minor_name == 1;
10646 if (length $minor_name != 2) {
10647 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
10648 next;
10649 }
10650
10651 my $major_name = uc(substr($minor_name, 0, 1));
10652 my $major_table = $gc->table($major_name);
10653 $major_table += $minor_table;
10654 }
10655
10656 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
10657 # defines it as LC)
10658 my $LC = $gc->table('LC');
10659 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
10660 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
10661
10662
10663 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10664 # deliver the correct values in it
10665 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10666
10667 # Lt not in release 1.
10668 $LC += $gc->table('Lt') if defined $gc->table('Lt');
10669 }
10670 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10671
10672 my $Cs = $gc->table('Cs');
10673 if (defined $Cs) {
10674 $Cs->add_note('Mostly not usable in Perl.');
10675 $Cs->add_comment(join_lines(<<END
10676Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10677Unicode text, and hence their use will generate (usually fatal) messages
10678END
10679 ));
10680 }
10681
10682
10683 # Folding information was introduced later into Unicode data. To get
10684 # Perl's case ignore (/i) to work at all in releases that don't have
10685 # folding, use the best available alternative, which is lower casing.
10686 my $fold = property_ref('Simple_Case_Folding');
10687 if ($fold->is_empty) {
10688 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10689 $fold->add_note(join_lines(<<END
10690WARNING: This table uses lower case as a substitute for missing fold
10691information
10692END
10693 ));
10694 }
10695
10696 # Multiple-character mapping was introduced later into Unicode data. If
10697 # missing, use the single-characters maps as best available alternative
10698 foreach my $map (qw { Uppercase_Mapping
10699 Lowercase_Mapping
10700 Titlecase_Mapping
10701 Case_Folding
10702 } ) {
10703 my $full = property_ref($map);
10704 if ($full->is_empty) {
10705 my $simple = property_ref('Simple_' . $map);
10706 $full->initialize($simple);
10707 $full->add_comment($simple->comment) if ($simple->comment);
10708 $full->add_note(join_lines(<<END
10709WARNING: This table uses simple mapping (single-character only) as a
10710substitute for missing multiple-character information
10711END
10712 ));
10713 }
10714 }
10715 return
10716}
10717
10718sub compile_perl() {
10719 # Create perl-defined tables. Almost all are part of the pseudo-property
10720 # named 'perl' internally to this program. Many of these are recommended
10721 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10722 # on those found there.
10723 # Almost all of these are equivalent to some Unicode property.
10724 # A number of these properties have equivalents restricted to the ASCII
10725 # range, with their names prefaced by 'Posix', to signify that these match
10726 # what the Posix standard says they should match. A couple are
10727 # effectively this, but the name doesn't have 'Posix' in it because there
10728 # just isn't any Posix equivalent.
10729
10730 # 'Any' is all code points. As an error check, instead of just setting it
10731 # to be that, construct it to be the union of all the major categories
10732 my $Any = $perl->add_match_table('Any',
10733 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10734 Matches_All => 1);
10735
10736 foreach my $major_table ($gc->tables) {
10737
10738 # Major categories are the ones with single letter names.
10739 next if length($major_table->name) != 1;
10740
10741 $Any += $major_table;
10742 }
10743
10744 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10745 Carp::my_carp_bug("Generated highest code point ("
10746 . sprintf("%X", $Any->max)
10747 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10748 }
10749 if ($Any->range_count != 1 || $Any->min != 0) {
10750 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10751 }
10752
10753 $Any->add_alias('All');
10754
10755 # Assigned is the opposite of gc=unassigned
10756 my $Assigned = $perl->add_match_table('Assigned',
10757 Description => "All assigned code points",
10758 Initialize => ~ $gc->table('Unassigned'),
10759 );
10760
10761 # Our internal-only property should be treated as more than just a
10762 # synonym.
10763 $perl->add_match_table('_CombAbove')
10764 ->set_equivalent_to(property_ref('ccc')->table('Above'),
10765 Related => 1);
10766
10767 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10768 if (defined $block) { # This is equivalent to the block if have it.
10769 my $Unicode_ASCII = $block->table('Basic_Latin');
10770 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10771 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10772 }
10773 }
10774
10775 # Very early releases didn't have blocks, so initialize ASCII ourselves if
10776 # necessary
10777 if ($ASCII->is_empty) {
10778 $ASCII->initialize([ 0..127 ]);
10779 }
10780
99870f4d
KW
10781 # Get the best available case definitions. Early Unicode versions didn't
10782 # have Uppercase and Lowercase defined, so use the general category
10783 # instead for them.
10784 my $Lower = $perl->add_match_table('Lower');
10785 my $Unicode_Lower = property_ref('Lowercase');
10786 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10787 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10788 }
10789 else {
10790 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10791 Related => 1);
10792 }
ad5e8af1
KW
10793 $perl->add_match_table("PosixLower",
10794 Description => "[a-z]",
10795 Initialize => $Lower & $ASCII,
10796 );
99870f4d
KW
10797
10798 my $Upper = $perl->add_match_table('Upper');
10799 my $Unicode_Upper = property_ref('Uppercase');
10800 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10801 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10802 }
10803 else {
10804 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10805 Related => 1);
10806 }
ad5e8af1
KW
10807 $perl->add_match_table("PosixUpper",
10808 Description => "[A-Z]",
10809 Initialize => $Upper & $ASCII,
10810 );
99870f4d
KW
10811
10812 # Earliest releases didn't have title case. Initialize it to empty if not
10813 # otherwise present
10814 my $Title = $perl->add_match_table('Title');
10815 my $lt = $gc->table('Lt');
10816 if (defined $lt) {
10817 $Title->set_equivalent_to($lt, Related => 1);
10818 }
10819
10820 # If this Unicode version doesn't have Cased, set up our own. From
10821 # Unicode 5.1: Definition D120: A character C is defined to be cased if
10822 # and only if C has the Lowercase or Uppercase property or has a
10823 # General_Category value of Titlecase_Letter.
10824 unless (defined property_ref('Cased')) {
10825 my $cased = $perl->add_match_table('Cased',
10826 Initialize => $Lower + $Upper + $Title,
10827 Description => 'Uppercase or Lowercase or Titlecase',
10828 );
10829 }
10830
10831 # Similarly, set up our own Case_Ignorable property if this Unicode
10832 # version doesn't have it. From Unicode 5.1: Definition D121: A character
10833 # C is defined to be case-ignorable if C has the value MidLetter or the
10834 # value MidNumLet for the Word_Break property or its General_Category is
10835 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10836 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10837
10838 # Perl has long had an internal-only alias for this property.
10839 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10840 my $case_ignorable = property_ref('Case_Ignorable');
10841 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10842 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10843 Related => 1);
10844 }
10845 else {
10846
10847 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10848
10849 # The following three properties are not in early releases
10850 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10851 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10852 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10853
10854 # For versions 4.1 - 5.0, there is no MidNumLet property, and
10855 # correspondingly the case-ignorable definition lacks that one. For
10856 # 4.0, it appears that it was meant to be the same definition, but was
10857 # inadvertently omitted from the standard's text, so add it if the
10858 # property actually is there
10859 my $wb = property_ref('Word_Break');
10860 if (defined $wb) {
10861 my $midlet = $wb->table('MidLetter');
10862 $perl_case_ignorable += $midlet if defined $midlet;
10863 my $midnumlet = $wb->table('MidNumLet');
10864 $perl_case_ignorable += $midnumlet if defined $midnumlet;
10865 }
10866 else {
10867
10868 # In earlier versions of the standard, instead of the above two
10869 # properties , just the following characters were used:
10870 $perl_case_ignorable += 0x0027 # APOSTROPHE
10871 + 0x00AD # SOFT HYPHEN (SHY)
10872 + 0x2019; # RIGHT SINGLE QUOTATION MARK
10873 }
10874 }
10875
10876 # The remaining perl defined tables are mostly based on Unicode TR 18,
10877 # "Annex C: Compatibility Properties". All of these have two versions,
10878 # one whose name generally begins with Posix that is posix-compliant, and
10879 # one that matches Unicode characters beyond the Posix, ASCII range
10880
ad5e8af1 10881 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
10882
10883 # Alphabetic was not present in early releases
10884 my $Alphabetic = property_ref('Alphabetic');
10885 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10886 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10887 }
10888 else {
10889
10890 # For early releases, we don't get it exactly right. The below
10891 # includes more than it should, which in 5.2 terms is: L + Nl +
10892 # Other_Alphabetic. Other_Alphabetic contains many characters from
10893 # Mn and Mc. It's better to match more than we should, than less than
10894 # we should.
10895 $Alpha->initialize($gc->table('Letter')
10896 + $gc->table('Mn')
10897 + $gc->table('Mc'));
10898 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 10899 $Alpha->add_description('Alphabetic');
99870f4d 10900 }
ad5e8af1
KW
10901 $perl->add_match_table("PosixAlpha",
10902 Description => "[A-Za-z]",
10903 Initialize => $Alpha & $ASCII,
10904 );
99870f4d
KW
10905
10906 my $Alnum = $perl->add_match_table('Alnum',
ad5e8af1 10907 Description => 'Alphabetic and (Decimal) Numeric',
99870f4d
KW
10908 Initialize => $Alpha + $gc->table('Decimal_Number'),
10909 );
ad5e8af1
KW
10910 $perl->add_match_table("PosixAlnum",
10911 Description => "[A-Za-z0-9]",
10912 Initialize => $Alnum & $ASCII,
10913 );
99870f4d
KW
10914
10915 my $Word = $perl->add_match_table('Word',
10916 Description => '\w, including beyond ASCII',
10917 Initialize => $Alnum + $gc->table('Mark'),
10918 );
10919 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10920 $Word += $Pc if defined $Pc;
10921
f38f76ae 10922 # This is a Perl extension, so the name doesn't begin with Posix.
99870f4d
KW
10923 $perl->add_match_table('PerlWord',
10924 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10925 Initialize => $Word & $ASCII,
10926 );
10927
10928 my $Blank = $perl->add_match_table('Blank',
10929 Description => '\h, Horizontal white space',
10930
10931 # 200B is Zero Width Space which is for line
10932 # break control, and was listed as
10933 # Space_Separator in early releases
10934 Initialize => $gc->table('Space_Separator')
10935 + 0x0009 # TAB
10936 - 0x200B, # ZWSP
10937 );
10938 $Blank->add_alias('HorizSpace'); # Another name for it.
ad5e8af1
KW
10939 $perl->add_match_table("PosixBlank",
10940 Description => "\\t and ' '",
10941 Initialize => $Blank & $ASCII,
10942 );
99870f4d
KW
10943
10944 my $VertSpace = $perl->add_match_table('VertSpace',
10945 Description => '\v',
10946 Initialize => $gc->table('Line_Separator')
10947 + $gc->table('Paragraph_Separator')
10948 + 0x000A # LINE FEED
10949 + 0x000B # VERTICAL TAB
10950 + 0x000C # FORM FEED
10951 + 0x000D # CARRIAGE RETURN
10952 + 0x0085, # NEL
10953 );
10954 # No Posix equivalent for vertical space
10955
10956 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
10957 Description => '\s including beyond ASCII plus vertical tab',
10958 Initialize => $Blank + $VertSpace,
99870f4d 10959 );
ad5e8af1 10960 $perl->add_match_table("PosixSpace",
f38f76ae 10961 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
10962 Initialize => $Space & $ASCII,
10963 );
99870f4d
KW
10964
10965 # Perl's traditional space doesn't include Vertical Tab
10966 my $SpacePerl = $perl->add_match_table('SpacePerl',
10967 Description => '\s, including beyond ASCII',
10968 Initialize => $Space - 0x000B,
10969 );
10970 $perl->add_match_table('PerlSpace',
10971 Description => '\s, restricted to ASCII',
10972 Initialize => $SpacePerl & $ASCII,
10973 );
10974
10975 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 10976 Description => 'Control characters');
99870f4d 10977 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
ad5e8af1 10978 $perl->add_match_table("PosixCntrl",
f38f76ae 10979 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
10980 Initialize => $Cntrl & $ASCII,
10981 );
99870f4d
KW
10982
10983 # $controls is a temporary used to construct Graph.
10984 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
10985 + $gc->table('Control'));
10986 # Cs not in release 1
10987 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
10988
10989 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
10990 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 10991 Description => 'Characters that are graphical',
99870f4d
KW
10992 Initialize => ~ ($Space + $controls),
10993 );
ad5e8af1 10994 $perl->add_match_table("PosixGraph",
f38f76ae
KW
10995 Description =>
10996 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
10997 Initialize => $Graph & $ASCII,
10998 );
99870f4d 10999
3e20195b 11000 $print = $perl->add_match_table('Print',
ad5e8af1 11001 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 11002 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 11003 );
ad5e8af1 11004 $perl->add_match_table("PosixPrint",
66fd7fd0 11005 Description =>
f38f76ae 11006 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 11007 Initialize => $print & $ASCII,
ad5e8af1 11008 );
99870f4d
KW
11009
11010 my $Punct = $perl->add_match_table('Punct');
11011 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11012
11013 # \p{punct} doesn't include the symbols, which posix does
11014 $perl->add_match_table('PosixPunct',
f38f76ae 11015 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
ad5e8af1
KW
11016 Initialize => $ASCII & ($gc->table('Punctuation')
11017 + $gc->table('Symbol')),
11018 );
99870f4d
KW
11019
11020 my $Digit = $perl->add_match_table('Digit',
11021 Description => '\d, extended beyond just [0-9]');
11022 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
ad5e8af1
KW
11023 my $PosixDigit = $perl->add_match_table("PosixDigit",
11024 Description => '[0-9]',
11025 Initialize => $Digit & $ASCII,
11026 );
99870f4d 11027
eadadd41
KW
11028 # Hex_Digit was not present in first release
11029 my $Xdigit = $perl->add_match_table('XDigit');
11030 my $Hex = property_ref('Hex_Digit');
11031 if (defined $Hex && ! $Hex->is_empty) {
11032 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
11033 }
11034 else {
eadadd41
KW
11035 # (Have to use hex instead of e.g. '0', because could be running on an
11036 # non-ASCII machine, and we want the Unicode (ASCII) values)
11037 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11038 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11039 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d
KW
11040 }
11041
99870f4d
KW
11042 my $dt = property_ref('Decomposition_Type');
11043 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11044 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11045 Perl_Extension => 1,
d57ccc9a 11046 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
11047 );
11048
11049 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11050 # than SD appeared, construct it ourselves, based on the first release SD
11051 # was in.
11052 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11053 my $soft_dotted = property_ref('Soft_Dotted');
11054 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11055 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11056 }
11057 else {
11058
11059 # This list came from 3.2 Soft_Dotted.
11060 $CanonDCIJ->initialize([ 0x0069,
11061 0x006A,
11062 0x012F,
11063 0x0268,
11064 0x0456,
11065 0x0458,
11066 0x1E2D,
11067 0x1ECB,
11068 ]);
11069 $CanonDCIJ = $CanonDCIJ & $Assigned;
11070 }
11071
f86864ac 11072 # These are used in Unicode's definition of \X
37e2e78e
KW
11073 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11074 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11075
99870f4d 11076 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 11077
678f13d5 11078 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
11079 # definition differs too much from the traditional Perl one to use.
11080 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11081
11082 # Note that assumes HST is defined; it came in an earlier release than
11083 # GCB. In the line below, two negatives means: yes hangul
11084 $begin += ~ property_ref('Hangul_Syllable_Type')
11085 ->table('Not_Applicable')
11086 + ~ ($gcb->table('Control')
11087 + $gcb->table('CR')
11088 + $gcb->table('LF'));
11089 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11090
11091 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11092 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
11093 }
11094 else { # Old definition, used on early releases.
f86864ac 11095 $extend += $gc->table('Mark')
37e2e78e
KW
11096 + 0x200C # ZWNJ
11097 + 0x200D; # ZWJ
11098 $begin += ~ $extend;
11099
11100 # Here we may have a release that has the regular grapheme cluster
11101 # defined, or a release that doesn't have anything defined.
11102 # We set things up so the Perl core degrades gracefully, possibly with
11103 # placeholders that match nothing.
11104
11105 if (! defined $gcb) {
11106 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11107 }
11108 my $hst = property_ref('HST');
11109 if (!defined $hst) {
11110 $hst = Property->new('HST', Status => $PLACEHOLDER);
11111 $hst->add_match_table('Not_Applicable',
11112 Initialize => $Any,
11113 Matches_All => 1);
11114 }
11115
11116 # On some releases, here we may not have the needed tables for the
11117 # perl core, in some releases we may.
11118 foreach my $name (qw{ L LV LVT T V prepend }) {
11119 my $table = $gcb->table($name);
11120 if (! defined $table) {
11121 $table = $gcb->add_match_table($name);
11122 push @tables_that_may_be_empty, $table->complete_name;
11123 }
11124
11125 # The HST property predates the GCB one, and has identical tables
11126 # for some of them, so use it if we can.
11127 if ($table->is_empty
11128 && defined $hst
11129 && defined $hst->table($name))
11130 {
11131 $table += $hst->table($name);
11132 }
11133 }
11134 }
11135
11136 # More GCB. If we found some hangul syllables, populate a combined
11137 # table.
11138 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11139 my $LV = $gcb->table('LV');
11140 if ($LV->is_empty) {
11141 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11142 } else {
11143 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11144 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
11145 }
11146
28093d0e 11147 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
11148 my @composition = ('Name', 'Unicode_1_Name');
11149
11150 if (@named_sequences) {
11151 push @composition, 'Named_Sequence';
11152 foreach my $sequence (@named_sequences) {
11153 $perl_charname->add_anomalous_entry($sequence);
11154 }
11155 }
11156
11157 my $alias_sentence = "";
11158 my $alias = property_ref('Name_Alias');
11159 if (defined $alias) {
11160 push @composition, 'Name_Alias';
11161 $alias->reset_each_range;
11162 while (my ($range) = $alias->each_range) {
11163 next if $range->value eq "";
11164 if ($range->start != $range->end) {
11165 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
11166 }
11167 $perl_charname->add_duplicate($range->start, $range->value);
11168 }
11169 $alias_sentence = <<END;
11170The Name_Alias property adds duplicate code point entries with a corrected
11171name. The original (less correct, but still valid) name will be physically
11172first.
11173END
11174 }
11175 my $comment;
11176 if (@composition <= 2) { # Always at least 2
11177 $comment = join " and ", @composition;
11178 }
11179 else {
11180 $comment = join ", ", @composition[0 .. scalar @composition - 2];
11181 $comment .= ", and $composition[-1]";
11182 }
11183
99870f4d
KW
11184 $perl_charname->add_comment(join_lines( <<END
11185This file is for charnames.pm. It is the union of the $comment properties.
11186Unicode_1_Name entries are used only for otherwise nameless code
11187points.
11188$alias_sentence
11189END
11190 ));
11191
11192 # The combining class property used by Perl's normalize.pm is not located
11193 # in the normal mapping directory; create a copy for it.
11194 my $ccc = property_ref('Canonical_Combining_Class');
11195 my $perl_ccc = Property->new('Perl_ccc',
11196 Default_Map => $ccc->default_map,
11197 Full_Name => 'Perl_Canonical_Combining_Class',
11198 Internal_Only_Warning => 1,
11199 Perl_Extension => 1,
11200 Pod_Entry =>0,
11201 Type => $ENUM,
11202 Initialize => $ccc,
11203 File => 'CombiningClass',
517956bf 11204 Directory => File::Spec->curdir(),
99870f4d
KW
11205 );
11206 $perl_ccc->set_to_output_map(1);
11207 $perl_ccc->add_comment(join_lines(<<END
11208This mapping is for normalize.pm. It is currently identical to the Unicode
11209Canonical_Combining_Class property.
11210END
11211 ));
11212
11213 # This one match table for it is needed for calculations on output
11214 my $default = $perl_ccc->add_match_table($ccc->default_map,
11215 Initialize => $ccc->table($ccc->default_map),
11216 Status => $SUPPRESSED);
11217
11218 # Construct the Present_In property from the Age property.
11219 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11220 my $default_map = $age->default_map;
11221 my $in = Property->new('In',
11222 Default_Map => $default_map,
11223 Full_Name => "Present_In",
11224 Internal_Only_Warning => 1,
11225 Perl_Extension => 1,
11226 Type => $ENUM,
11227 Initialize => $age,
11228 );
11229 $in->add_comment(join_lines(<<END
11230This file should not be used for any purpose. The values in this file are the
11231same as for $age, and not for what $in really means. This is because anything
11232defined in a given release should have multiple values: that release and all
11233higher ones. But only one value per code point can be represented in a table
11234like this.
11235END
11236 ));
11237
11238 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
11239 # lowest numbered (earliest) come first, with the non-numeric one
11240 # last.
11241 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
11242 ? 1
11243 : ($b->name !~ /^[\d.]*$/)
11244 ? -1
11245 : $a->name <=> $b->name
11246 } $age->tables;
11247
11248 # The Present_In property is the cumulative age properties. The first
11249 # one hence is identical to the first age one.
11250 my $previous_in = $in->add_match_table($first_age->name);
11251 $previous_in->set_equivalent_to($first_age, Related => 1);
11252
11253 my $description_start = "Code point's usage introduced in version ";
11254 $first_age->add_description($description_start . $first_age->name);
11255
11256 # To construct the accumlated values, for each of the age tables
11257 # starting with the 2nd earliest, merge the earliest with it, to get
11258 # all those code points existing in the 2nd earliest. Repeat merging
11259 # the new 2nd earliest with the 3rd earliest to get all those existing
11260 # in the 3rd earliest, and so on.
11261 foreach my $current_age (@rest_ages) {
11262 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
11263
11264 my $current_in = $in->add_match_table(
11265 $current_age->name,
11266 Initialize => $current_age + $previous_in,
11267 Description => $description_start
11268 . $current_age->name
11269 . ' or earlier',
11270 );
11271 $previous_in = $current_in;
11272
11273 # Add clarifying material for the corresponding age file. This is
11274 # in part because of the confusing and contradictory information
11275 # given in the Standard's documentation itself, as of 5.2.
11276 $current_age->add_description(
11277 "Code point's usage was introduced in version "
11278 . $current_age->name);
11279 $current_age->add_note("See also $in");
11280
11281 }
11282
11283 # And finally the code points whose usages have yet to be decided are
11284 # the same in both properties. Note that permanently unassigned code
11285 # points actually have their usage assigned (as being permanently
11286 # unassigned), so that these tables are not the same as gc=cn.
11287 my $unassigned = $in->add_match_table($default_map);
11288 my $age_default = $age->table($default_map);
11289 $age_default->add_description(<<END
11290Code point's usage has not been assigned in any Unicode release thus far.
11291END
11292 );
11293 $unassigned->set_equivalent_to($age_default, Related => 1);
11294 }
11295
11296
11297 # Finished creating all the perl properties. All non-internal non-string
11298 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
11299 # an underscore.) These do not get a separate entry in the pod file
11300 foreach my $table ($perl->tables) {
11301 foreach my $alias ($table->aliases) {
11302 next if $alias->name =~ /^_/;
11303 $table->add_alias('Is_' . $alias->name,
11304 Pod_Entry => 0,
11305 Status => $alias->status,
11306 Externally_Ok => 0);
11307 }
11308 }
11309
11310 return;
11311}
11312
11313sub add_perl_synonyms() {
11314 # A number of Unicode tables have Perl synonyms that are expressed in
11315 # the single-form, \p{name}. These are:
11316 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11317 # \p{Is_Name} as synonyms
11318 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11319 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11320 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11321 # conflict, \p{Value} and \p{Is_Value} as well
11322 #
11323 # This routine generates these synonyms, warning of any unexpected
11324 # conflicts.
11325
11326 # Construct the list of tables to get synonyms for. Start with all the
11327 # binary and the General_Category ones.
11328 my @tables = grep { $_->type == $BINARY } property_ref('*');
11329 push @tables, $gc->tables;
11330
11331 # If the version of Unicode includes the Script property, add its tables
11332 if (defined property_ref('Script')) {
11333 push @tables, property_ref('Script')->tables;
11334 }
11335
11336 # The Block tables are kept separate because they are treated differently.
11337 # And the earliest versions of Unicode didn't include them, so add only if
11338 # there are some.
11339 my @blocks;
11340 push @blocks, $block->tables if defined $block;
11341
11342 # Here, have the lists of tables constructed. Process blocks last so that
11343 # if there are name collisions with them, blocks have lowest priority.
11344 # Should there ever be other collisions, manual intervention would be
11345 # required. See the comments at the beginning of the program for a
11346 # possible way to handle those semi-automatically.
11347 foreach my $table (@tables, @blocks) {
11348
11349 # For non-binary properties, the synonym is just the name of the
11350 # table, like Greek, but for binary properties the synonym is the name
11351 # of the property, and means the code points in its 'Y' table.
11352 my $nominal = $table;
11353 my $nominal_property = $nominal->property;
11354 my $actual;
11355 if (! $nominal->isa('Property')) {
11356 $actual = $table;
11357 }
11358 else {
11359
11360 # Here is a binary property. Use the 'Y' table. Verify that is
11361 # there
11362 my $yes = $nominal->table('Y');
11363 unless (defined $yes) { # Must be defined, but is permissible to
11364 # be empty.
11365 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
11366 next;
11367 }
11368 $actual = $yes;
11369 }
11370
11371 foreach my $alias ($nominal->aliases) {
11372
11373 # Attempt to create a table in the perl directory for the
11374 # candidate table, using whatever aliases in it that don't
11375 # conflict. Also add non-conflicting aliases for all these
11376 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11377 PREFIX:
11378 foreach my $prefix ("", 'Is_', 'In_') {
11379
11380 # Only Block properties can have added 'In_' aliases.
11381 next if $prefix eq 'In_' and $nominal_property != $block;
11382
11383 my $proposed_name = $prefix . $alias->name;
11384
11385 # No Is_Is, In_In, nor combinations thereof
11386 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11387 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11388
11389 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11390
11391 # Get a reference to any existing table in the perl
11392 # directory with the desired name.
11393 my $pre_existing = $perl->table($proposed_name);
11394
11395 if (! defined $pre_existing) {
11396
11397 # No name collision, so ok to add the perl synonym.
11398
11399 my $make_pod_entry;
11400 my $externally_ok;
11401 my $status = $actual->status;
11402 if ($nominal_property == $block) {
11403
11404 # For block properties, the 'In' form is preferred for
11405 # external use; the pod file contains wild cards for
11406 # this and the 'Is' form so no entries for those; and
11407 # we don't want people using the name without the
11408 # 'In', so discourage that.
11409 if ($prefix eq "") {
11410 $make_pod_entry = 1;
11411 $status = $status || $DISCOURAGED;
11412 $externally_ok = 0;
11413 }
11414 elsif ($prefix eq 'In_') {
11415 $make_pod_entry = 0;
11416 $status = $status || $NORMAL;
11417 $externally_ok = 1;
11418 }
11419 else {
11420 $make_pod_entry = 0;
11421 $status = $status || $DISCOURAGED;
11422 $externally_ok = 0;
11423 }
11424 }
11425 elsif ($prefix ne "") {
11426
11427 # The 'Is' prefix is handled in the pod by a wild
11428 # card, and we won't use it for an external name
11429 $make_pod_entry = 0;
11430 $status = $status || $NORMAL;
11431 $externally_ok = 0;
11432 }
11433 else {
11434
11435 # Here, is an empty prefix, non block. This gets its
11436 # own pod entry and can be used for an external name.
11437 $make_pod_entry = 1;
11438 $status = $status || $NORMAL;
11439 $externally_ok = 1;
11440 }
11441
11442 # Here, there isn't a perl pre-existing table with the
11443 # name. Look through the list of equivalents of this
11444 # table to see if one is a perl table.
11445 foreach my $equivalent ($actual->leader->equivalents) {
11446 next if $equivalent->property != $perl;
11447
11448 # Here, have found a table for $perl. Add this alias
11449 # to it, and are done with this prefix.
11450 $equivalent->add_alias($proposed_name,
11451 Pod_Entry => $make_pod_entry,
11452 Status => $status,
11453 Externally_Ok => $externally_ok);
11454 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11455 next PREFIX;
11456 }
11457
11458 # Here, $perl doesn't already have a table that is a
11459 # synonym for this property, add one.
11460 my $added_table = $perl->add_match_table($proposed_name,
11461 Pod_Entry => $make_pod_entry,
11462 Status => $status,
11463 Externally_Ok => $externally_ok);
11464 # And it will be related to the actual table, since it is
11465 # based on it.
11466 $added_table->set_equivalent_to($actual, Related => 1);
11467 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11468 next;
11469 } # End of no pre-existing.
11470
11471 # Here, there is a pre-existing table that has the proposed
11472 # name. We could be in trouble, but not if this is just a
11473 # synonym for another table that we have already made a child
11474 # of the pre-existing one.
11475 if ($pre_existing->is_equivalent_to($actual)) {
11476 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11477 $pre_existing->add_alias($proposed_name);
11478 next;
11479 }
11480
11481 # Here, there is a name collision, but it still could be ok if
11482 # the tables match the identical set of code points, in which
11483 # case, we can combine the names. Compare each table's code
11484 # point list to see if they are identical.
11485 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11486 if ($pre_existing->matches_identically_to($actual)) {
11487
11488 # Here, they do match identically. Not a real conflict.
11489 # Make the perl version a child of the Unicode one, except
11490 # in the non-obvious case of where the perl name is
11491 # already a synonym of another Unicode property. (This is
11492 # excluded by the test for it being its own parent.) The
11493 # reason for this exclusion is that then the two Unicode
11494 # properties become related; and we don't really know if
11495 # they are or not. We generate documentation based on
11496 # relatedness, and this would be misleading. Code
11497 # later executed in the process will cause the tables to
11498 # be represented by a single file anyway, without making
11499 # it look in the pod like they are necessarily related.
11500 if ($pre_existing->parent == $pre_existing
11501 && ($pre_existing->property == $perl
11502 || $actual->property == $perl))
11503 {
11504 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11505 $pre_existing->set_equivalent_to($actual, Related => 1);
11506 }
11507 elsif (main::DEBUG && $to_trace) {
11508 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11509 trace $pre_existing->parent;
11510 }
11511 next PREFIX;
11512 }
11513
11514 # Here they didn't match identically, there is a real conflict
11515 # between our new name and a pre-existing property.
11516 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11517 $pre_existing->add_conflicting($nominal->full_name,
11518 'p',
11519 $actual);
11520
11521 # Don't output a warning for aliases for the block
11522 # properties (unless they start with 'In_') as it is
11523 # expected that there will be conflicts and the block
11524 # form loses.
11525 if ($verbosity >= $NORMAL_VERBOSITY
11526 && ($actual->property != $block || $prefix eq 'In_'))
11527 {
11528 print simple_fold(join_lines(<<END
11529There is already an alias named $proposed_name (from " . $pre_existing . "),
11530so not creating this alias for " . $actual
11531END
11532 ), "", 4);
11533 }
11534
11535 # Keep track for documentation purposes.
11536 $has_In_conflicts++ if $prefix eq 'In_';
11537 $has_Is_conflicts++ if $prefix eq 'Is_';
11538 }
11539 }
11540 }
11541
11542 # There are some properties which have No and Yes (and N and Y) as
11543 # property values, but aren't binary, and could possibly be confused with
11544 # binary ones. So create caveats for them. There are tables that are
11545 # named 'No', and tables that are named 'N', but confusion is not likely
11546 # unless they are the same table. For example, N meaning Number or
11547 # Neutral is not likely to cause confusion, so don't add caveats to things
11548 # like them.
11549 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11550 my $yes = $property->table('Yes');
11551 if (defined $yes) {
11552 my $y = $property->table('Y');
11553 if (defined $y && $yes == $y) {
11554 foreach my $alias ($property->aliases) {
11555 $yes->add_conflicting($alias->name);
11556 }
11557 }
11558 }
11559 my $no = $property->table('No');
11560 if (defined $no) {
11561 my $n = $property->table('N');
11562 if (defined $n && $no == $n) {
11563 foreach my $alias ($property->aliases) {
11564 $no->add_conflicting($alias->name, 'P');
11565 }
11566 }
11567 }
11568 }
11569
11570 return;
11571}
11572
11573sub register_file_for_name($$$) {
11574 # Given info about a table and a datafile that it should be associated
11575 # with, register that assocation
11576
11577 my $table = shift;
11578 my $directory_ref = shift; # Array of the directory path for the file
11579 my $file = shift; # The file name in the final directory, [-1].
11580 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11581
11582 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11583
11584 if ($table->isa('Property')) {
11585 $table->set_file_path(@$directory_ref, $file);
11586 push @map_properties, $table
11587 if $directory_ref->[0] eq $map_directory;
11588 return;
11589 }
11590
11591 # Do all of the work for all equivalent tables when called with the leader
11592 # table, so skip if isn't the leader.
11593 return if $table->leader != $table;
11594
11595 # Join all the file path components together, using slashes.
11596 my $full_filename = join('/', @$directory_ref, $file);
11597
11598 # All go in the same subdirectory of unicore
11599 if ($directory_ref->[0] ne $matches_directory) {
11600 Carp::my_carp("Unexpected directory in "
11601 . join('/', @{$directory_ref}, $file));
11602 }
11603
11604 # For this table and all its equivalents ...
11605 foreach my $table ($table, $table->equivalents) {
11606
11607 # Associate it with its file internally. Don't include the
11608 # $matches_directory first component
11609 $table->set_file_path(@$directory_ref, $file);
11610 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
11611
11612 my $property = $table->property;
11613 $property = ($property == $perl)
11614 ? "" # 'perl' is never explicitly stated
11615 : standardize($property->name) . '=';
11616
11617 my $deprecated = ($table->status eq $DEPRECATED)
11618 ? $table->status_info
11619 : "";
11620
11621 # And for each of the table's aliases... This inner loop eventually
11622 # goes through all aliases in the UCD that we generate regex match
11623 # files for
11624 foreach my $alias ($table->aliases) {
11625 my $name = $alias->name;
11626
11627 # Generate an entry in either the loose or strict hashes, which
11628 # will translate the property and alias names combination into the
11629 # file where the table for them is stored.
11630 my $standard;
11631 if ($alias->loose_match) {
11632 $standard = $property . standardize($alias->name);
11633 if (exists $loose_to_file_of{$standard}) {
11634 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11635 }
11636 else {
11637 $loose_to_file_of{$standard} = $sub_filename;
11638 }
11639 }
11640 else {
11641 $standard = lc ($property . $name);
11642 if (exists $stricter_to_file_of{$standard}) {
11643 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11644 }
11645 else {
11646 $stricter_to_file_of{$standard} = $sub_filename;
11647
11648 # Tightly coupled with how utf8_heavy.pl works, for a
11649 # floating point number that is a whole number, get rid of
11650 # the trailing decimal point and 0's, so that utf8_heavy
11651 # will work. Also note that this assumes that such a
11652 # number is matched strictly; so if that were to change,
11653 # this would be wrong.
11654 if ((my $integer_name = $name)
11655 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11656 {
11657 $stricter_to_file_of{$property . $integer_name}
11658 = $sub_filename;
11659 }
11660 }
11661 }
11662
11663 # Keep a list of the deprecated properties and their filenames
11664 if ($deprecated) {
11665 $utf8::why_deprecated{$sub_filename} = $deprecated;
11666 }
11667 }
11668 }
11669
11670 return;
11671}
11672
11673{ # Closure
11674 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
11675 # conflicts
11676 my %full_dir_name_of; # Full length names of directories used.
11677
11678 sub construct_filename($$$) {
11679 # Return a file name for a table, based on the table name, but perhaps
11680 # changed to get rid of non-portable characters in it, and to make
11681 # sure that it is unique on a file system that allows the names before
11682 # any period to be at most 8 characters (DOS). While we're at it
11683 # check and complain if there are any directory conflicts.
11684
11685 my $name = shift; # The name to start with
11686 my $mutable = shift; # Boolean: can it be changed? If no, but
11687 # yet it must be to work properly, a warning
11688 # is given
11689 my $directories_ref = shift; # A reference to an array containing the
11690 # path to the file, with each element one path
11691 # component. This is used because the same
11692 # name can be used in different directories.
11693 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11694
11695 my $warn = ! defined wantarray; # If true, then if the name is
11696 # changed, a warning is issued as well.
11697
11698 if (! defined $name) {
11699 Carp::my_carp("Undefined name in directory "
11700 . File::Spec->join(@$directories_ref)
11701 . ". '_' used");
11702 return '_';
11703 }
11704
11705 # Make sure that no directory names conflict with each other. Look at
11706 # each directory in the input file's path. If it is already in use,
11707 # assume it is correct, and is merely being re-used, but if we
11708 # truncate it to 8 characters, and find that there are two directories
11709 # that are the same for the first 8 characters, but differ after that,
11710 # then that is a problem.
11711 foreach my $directory (@$directories_ref) {
11712 my $short_dir = substr($directory, 0, 8);
11713 if (defined $full_dir_name_of{$short_dir}) {
11714 next if $full_dir_name_of{$short_dir} eq $directory;
11715 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
11716 }
11717 else {
11718 $full_dir_name_of{$short_dir} = $directory;
11719 }
11720 }
11721
11722 my $path = join '/', @$directories_ref;
11723 $path .= '/' if $path;
11724
11725 # Remove interior underscores.
11726 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11727
11728 # Change any non-word character into an underscore, and truncate to 8.
11729 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
11730 substr($filename, 8) = "" if length($filename) > 8;
11731
11732 # Make sure the basename doesn't conflict with something we
11733 # might have already written. If we have, say,
11734 # InGreekExtended1
11735 # InGreekExtended2
11736 # they become
11737 # InGreekE
11738 # InGreek2
11739 my $warned = 0;
11740 while (my $num = $base_names{$path}{lc $filename}++) {
11741 $num++; # so basenames with numbers start with '2', which
11742 # just looks more natural.
11743
11744 # Want to append $num, but if it'll make the basename longer
11745 # than 8 characters, pre-truncate $filename so that the result
11746 # is acceptable.
11747 my $delta = length($filename) + length($num) - 8;
11748 if ($delta > 0) {
11749 substr($filename, -$delta) = $num;
11750 }
11751 else {
11752 $filename .= $num;
11753 }
11754 if ($warn && ! $warned) {
11755 $warned = 1;
11756 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
11757 }
11758 }
11759
11760 return $filename if $mutable;
11761
11762 # If not changeable, must return the input name, but warn if needed to
11763 # change it beyond shortening it.
11764 if ($name ne $filename
11765 && substr($name, 0, length($filename)) ne $filename) {
11766 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
11767 }
11768 return $name;
11769 }
11770}
11771
11772# The pod file contains a very large table. Many of the lines in that table
11773# would exceed a typical output window's size, and so need to be wrapped with
11774# a hanging indent to make them look good. The pod language is really
11775# insufficient here. There is no general construct to do that in pod, so it
11776# is done here by beginning each such line with a space to cause the result to
11777# be output without formatting, and doing all the formatting here. This leads
11778# to the result that if the eventual display window is too narrow it won't
11779# look good, and if the window is too wide, no advantage is taken of that
11780# extra width. A further complication is that the output may be indented by
11781# the formatter so that there is less space than expected. What I (khw) have
11782# done is to assume that that indent is a particular number of spaces based on
11783# what it is in my Linux system; people can always resize their windows if
11784# necessary, but this is obviously less than desirable, but the best that can
11785# be expected.
11786my $automatic_pod_indent = 8;
11787
11788# Try to format so that uses fewest lines, but few long left column entries
11789# slide into the right column. An experiment on 5.1 data yielded the
11790# following percentages that didn't cut into the other side along with the
11791# associated first-column widths
11792# 69% = 24
11793# 80% not too bad except for a few blocks
11794# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11795# 95% = 37;
11796my $indent_info_column = 27; # 75% of lines didn't have overlap
11797
11798my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
11799 # The 3 is because of:
11800 # 1 for the leading space to tell the pod formatter to
11801 # output as-is
11802 # 1 for the flag
11803 # 1 for the space between the flag and the main data
11804
11805sub format_pod_line ($$$;$$) {
11806 # Take a pod line and return it, formatted properly
11807
11808 my $first_column_width = shift;
11809 my $entry = shift; # Contents of left column
11810 my $info = shift; # Contents of right column
11811
11812 my $status = shift || ""; # Any flag
11813
11814 my $loose_match = shift; # Boolean.
11815 $loose_match = 1 unless defined $loose_match;
11816
11817 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11818
11819 my $flags = "";
11820 $flags .= $STRICTER if ! $loose_match;
11821
11822 $flags .= $status if $status;
11823
11824 # There is a blank in the left column to cause the pod formatter to
11825 # output the line as-is.
11826 return sprintf " %-*s%-*s %s\n",
11827 # The first * in the format is replaced by this, the -1 is
11828 # to account for the leading blank. There isn't a
11829 # hard-coded blank after this to separate the flags from
11830 # the rest of the line, so that in the unlikely event that
11831 # multiple flags are shown on the same line, they both
11832 # will get displayed at the expense of that separation,
11833 # but since they are left justified, a blank will be
11834 # inserted in the normal case.
11835 $FILLER - 1,
11836 $flags,
11837
11838 # The other * in the format is replaced by this number to
11839 # cause the first main column to right fill with blanks.
11840 # The -1 is for the guaranteed blank following it.
11841 $first_column_width - $FILLER - 1,
11842 $entry,
11843 $info;
11844}
11845
11846my @zero_match_tables; # List of tables that have no matches in this release
11847
11848sub make_table_pod_entries($) {
11849 # This generates the entries for the pod file for a given table.
11850 # Also done at this time are any children tables. The output looks like:
11851 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
11852
11853 my $input_table = shift; # Table the entry is for
11854 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11855
11856 # Generate parent and all its children at the same time.
11857 return if $input_table->parent != $input_table;
11858
11859 my $property = $input_table->property;
11860 my $type = $property->type;
11861 my $full_name = $property->full_name;
11862
11863 my $count = $input_table->count;
11864 my $string_count = clarify_number($count);
11865 my $status = $input_table->status;
11866 my $status_info = $input_table->status_info;
11867
11868 my $entry_for_first_table; # The entry for the first table output.
11869 # Almost certainly, it is the parent.
11870
11871 # For each related table (including itself), we will generate a pod entry
11872 # for each name each table goes by
11873 foreach my $table ($input_table, $input_table->children) {
11874
11875 # utf8_heavy.pl cannot deal with null string property values, so don't
11876 # output any.
11877 next if $table->name eq "";
11878
11879 # First, gather all the info that applies to this table as a whole.
11880
11881 push @zero_match_tables, $table if $count == 0;
11882
11883 my $table_property = $table->property;
11884
11885 # The short name has all the underscores removed, while the full name
11886 # retains them. Later, we decide whether to output a short synonym
11887 # for the full one, we need to compare apples to apples, so we use the
11888 # short name's length including underscores.
11889 my $table_property_short_name_length;
11890 my $table_property_short_name
11891 = $table_property->short_name(\$table_property_short_name_length);
11892 my $table_property_full_name = $table_property->full_name;
11893
11894 # Get how much savings there is in the short name over the full one
11895 # (delta will always be <= 0)
11896 my $table_property_short_delta = $table_property_short_name_length
11897 - length($table_property_full_name);
11898 my @table_description = $table->description;
11899 my @table_note = $table->note;
11900
11901 # Generate an entry for each alias in this table.
11902 my $entry_for_first_alias; # saves the first one encountered.
11903 foreach my $alias ($table->aliases) {
11904
11905 # Skip if not to go in pod.
11906 next unless $alias->make_pod_entry;
11907
11908 # Start gathering all the components for the entry
11909 my $name = $alias->name;
11910
11911 my $entry; # Holds the left column, may include extras
11912 my $entry_ref; # To refer to the left column's contents from
11913 # another entry; has no extras
11914
11915 # First the left column of the pod entry. Tables for the $perl
11916 # property always use the single form.
11917 if ($table_property == $perl) {
11918 $entry = "\\p{$name}";
11919 $entry_ref = "\\p{$name}";
11920 }
11921 else { # Compound form.
11922
11923 # Only generate one entry for all the aliases that mean true
11924 # or false in binary properties. Append a '*' to indicate
11925 # some are missing. (The heading comment notes this.)
11926 my $wild_card_mark;
11927 if ($type == $BINARY) {
11928 next if $name ne 'N' && $name ne 'Y';
11929 $wild_card_mark = '*';
11930 }
11931 else {
11932 $wild_card_mark = "";
11933 }
11934
11935 # Colon-space is used to give a little more space to be easier
11936 # to read;
11937 $entry = "\\p{"
11938 . $table_property_full_name
11939 . ": $name$wild_card_mark}";
11940
11941 # But for the reference to this entry, which will go in the
11942 # right column, where space is at a premium, use equals
11943 # without a space
11944 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11945 }
11946
11947 # Then the right (info) column. This is stored as components of
11948 # an array for the moment, then joined into a string later. For
11949 # non-internal only properties, begin the info with the entry for
11950 # the first table we encountered (if any), as things are ordered
11951 # so that that one is the most descriptive. This leads to the
11952 # info column of an entry being a more descriptive version of the
11953 # name column
11954 my @info;
11955 if ($name =~ /^_/) {
11956 push @info,
11957 '(For internal use by Perl, not necessarily stable)';
11958 }
11959 elsif ($entry_for_first_alias) {
11960 push @info, $entry_for_first_alias;
11961 }
11962
11963 # If this entry is equivalent to another, add that to the info,
11964 # using the first such table we encountered
11965 if ($entry_for_first_table) {
11966 if (@info) {
11967 push @info, "(= $entry_for_first_table)";
11968 }
11969 else {
11970 push @info, $entry_for_first_table;
11971 }
11972 }
11973
11974 # If the name is a large integer, add an equivalent with an
11975 # exponent for better readability
11976 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
11977 push @info, sprintf "(= %.1e)", $name
11978 }
11979
11980 my $parenthesized = "";
11981 if (! $entry_for_first_alias) {
11982
11983 # This is the first alias for the current table. The alias
11984 # array is ordered so that this is the fullest, most
11985 # descriptive alias, so it gets the fullest info. The other
11986 # aliases are mostly merely pointers to this one, using the
11987 # information already added above.
11988
11989 # Display any status message, but only on the parent table
11990 if ($status && ! $entry_for_first_table) {
11991 push @info, $status_info;
11992 }
11993
11994 # Put out any descriptive info
11995 if (@table_description || @table_note) {
11996 push @info, join "; ", @table_description, @table_note;
11997 }
11998
11999 # Look to see if there is a shorter name we can point people
12000 # at
12001 my $standard_name = standardize($name);
12002 my $short_name;
12003 my $proposed_short = $table->short_name;
12004 if (defined $proposed_short) {
12005 my $standard_short = standardize($proposed_short);
12006
12007 # If the short name is shorter than the standard one, or
12008 # even it it's not, but the combination of it and its
12009 # short property name (as in \p{prop=short} ($perl doesn't
12010 # have this form)) saves at least two characters, then,
12011 # cause it to be listed as a shorter synonym.
12012 if (length $standard_short < length $standard_name
12013 || ($table_property != $perl
12014 && (length($standard_short)
12015 - length($standard_name)
12016 + $table_property_short_delta) # (<= 0)
12017 < -2))
12018 {
12019 $short_name = $proposed_short;
12020 if ($table_property != $perl) {
12021 $short_name = $table_property_short_name
12022 . "=$short_name";
12023 }
12024 $short_name = "\\p{$short_name}";
12025 }
12026 }
12027
12028 # And if this is a compound form name, see if there is a
12029 # single form equivalent
12030 my $single_form;
12031 if ($table_property != $perl) {
12032
12033 # Special case the binary N tables, so that will print
12034 # \P{single}, but use the Y table values to populate
12035 # 'single', as we haven't populated the N table.
12036 my $test_table;
12037 my $p;
12038 if ($type == $BINARY
12039 && $input_table == $property->table('No'))
12040 {
12041 $test_table = $property->table('Yes');
12042 $p = 'P';
12043 }
12044 else {
12045 $test_table = $input_table;
12046 $p = 'p';
12047 }
12048
12049 # Look for a single form amongst all the children.
12050 foreach my $table ($test_table->children) {
12051 next if $table->property != $perl;
12052 my $proposed_name = $table->short_name;
12053 next if ! defined $proposed_name;
12054
12055 # Don't mention internal-only properties as a possible
12056 # single form synonym
12057 next if substr($proposed_name, 0, 1) eq '_';
12058
12059 $proposed_name = "\\$p\{$proposed_name}";
12060 if (! defined $single_form
12061 || length($proposed_name) < length $single_form)
12062 {
12063 $single_form = $proposed_name;
12064
12065 # The goal here is to find a single form; not the
12066 # shortest possible one. We've already found a
12067 # short name. So, stop at the first single form
12068 # found, which is likely to be closer to the
12069 # original.
12070 last;
12071 }
12072 }
12073 }
12074
12075 # Ouput both short and single in the same parenthesized
12076 # expression, but with only one of 'Single', 'Short' if there
12077 # are both items.
12078 if ($short_name || $single_form || $table->conflicting) {
12079 $parenthesized .= '(';
12080 $parenthesized .= "Short: $short_name" if $short_name;
12081 if ($short_name && $single_form) {
12082 $parenthesized .= ', ';
12083 }
12084 elsif ($single_form) {
12085 $parenthesized .= 'Single: ';
12086 }
12087 $parenthesized .= $single_form if $single_form;
12088 }
12089 }
12090
12091
12092 # Warn if this property isn't the same as one that a
12093 # semi-casual user might expect. The other components of this
12094 # parenthesized structure are calculated only for the first entry
12095 # for this table, but the conflicting is deemed important enough
12096 # to go on every entry.
12097 my $conflicting = join " NOR ", $table->conflicting;
12098 if ($conflicting) {
12099 $parenthesized .= '(' if ! $parenthesized;
12100 $parenthesized .= '; ' if $parenthesized ne '(';
12101 $parenthesized .= "NOT $conflicting";
12102 }
12103 $parenthesized .= ')' if $parenthesized;
12104
12105 push @info, $parenthesized if $parenthesized;
d57ccc9a
KW
12106
12107 if ($table_property != $perl && $table->perl_extension) {
12108 push @info, '(Perl extension)';
12109 }
99870f4d
KW
12110 push @info, "($string_count)" if $output_range_counts;
12111
12112 # Now, we have both the entry and info so add them to the
12113 # list of all the properties.
12114 push @match_properties,
12115 format_pod_line($indent_info_column,
12116 $entry,
12117 join( " ", @info),
12118 $alias->status,
12119 $alias->loose_match);
12120
12121 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12122 } # End of looping through the aliases for this table.
12123
12124 if (! $entry_for_first_table) {
12125 $entry_for_first_table = $entry_for_first_alias;
12126 }
12127 } # End of looping through all the related tables
12128 return;
12129}
12130
12131sub pod_alphanumeric_sort {
12132 # Sort pod entries alphanumerically.
12133
99f78760
KW
12134 # The first few character columns are filler, plus the '\p{'; and get rid
12135 # of all the trailing stuff, starting with the trailing '}', so as to sort
12136 # on just 'Name=Value'
12137 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 12138 $a =~ s/}.*//;
99f78760 12139 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
12140 $b =~ s/}.*//;
12141
99f78760
KW
12142 # Determine if the two operands are both internal only or both not.
12143 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
12144 # should be the underscore that begins internal only
12145 my $a_is_internal = (substr($a, 0, 1) eq '_');
12146 my $b_is_internal = (substr($b, 0, 1) eq '_');
12147
12148 # Sort so the internals come last in the table instead of first (which the
12149 # leading underscore would otherwise indicate).
12150 if ($a_is_internal != $b_is_internal) {
12151 return 1 if $a_is_internal;
12152 return -1
12153 }
12154
99870f4d 12155 # Determine if the two operands are numeric property values or not.
99f78760 12156 # A numeric property will look like xyz: 3. But the number
99870f4d 12157 # can begin with an optional minus sign, and may have a
99f78760 12158 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
12159 # isn't numeric, use alphabetic sort.
12160 my ($a_initial, $a_number) =
99f78760 12161 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
12162 return $a cmp $b unless defined $a_number;
12163 my ($b_initial, $b_number) =
99f78760 12164 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
12165 return $a cmp $b unless defined $b_number;
12166
12167 # Here they are both numeric, but use alphabetic sort if the
12168 # initial parts don't match
12169 return $a cmp $b if $a_initial ne $b_initial;
12170
12171 # Convert rationals to floating for the comparison.
12172 $a_number = eval $a_number if $a_number =~ qr{/};
12173 $b_number = eval $b_number if $b_number =~ qr{/};
12174
12175 return $a_number <=> $b_number;
12176}
12177
12178sub make_pod () {
12179 # Create the .pod file. This generates the various subsections and then
12180 # combines them in one big HERE document.
12181
12182 return unless defined $pod_directory;
12183 print "Making pod file\n" if $verbosity >= $PROGRESS;
12184
12185 my $exception_message =
12186 '(Any exceptions are individually noted beginning with the word NOT.)';
12187 my @block_warning;
12188 if (-e 'Blocks.txt') {
12189
12190 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
12191 # if the global $has_In_conflicts indicates we have them.
12192 push @match_properties, format_pod_line($indent_info_column,
12193 '\p{In_*}',
12194 '\p{Block: *}'
12195 . (($has_In_conflicts)
12196 ? " $exception_message"
12197 : ""));
12198 @block_warning = << "END";
12199
12200Matches in the Block property have shortcuts that begin with 'In_'. For
12201example, \\p{Block=Latin1} can be written as \\p{In_Latin1}. For backward
12202compatibility, if there is no conflict with another shortcut, these may also
12203be written as \\p{Latin1} or \\p{Is_Latin1}. But, N.B., there are numerous
12204such conflicting shortcuts. Use of these forms for Block is discouraged, and
12205are flagged as such, not only because of the potential confusion as to what is
12206meant, but also because a later release of Unicode may preempt the shortcut,
12207and your program would no longer be correct. Use the 'In_' form instead to
12208avoid this, or even more clearly, use the compound form, e.g.,
12209\\p{blk:latin1}. See L<perlunicode/"Blocks"> for more information about this.
12210END
12211 }
12212 my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
12213 $text = "$exception_message $text" if $has_Is_conflicts;
12214
12215 # And the 'Is_ line';
12216 push @match_properties, format_pod_line($indent_info_column,
12217 '\p{Is_*}',
12218 "\\p{*} $text");
12219
12220 # Sort the properties array for output. It is sorted alphabetically
12221 # except numerically for numeric properties, and only output unique lines.
12222 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
12223
12224 my $formatted_properties = simple_fold(\@match_properties,
12225 "",
12226 # indent succeeding lines by two extra
12227 # which looks better
12228 $indent_info_column + 2,
12229
12230 # shorten the line length by how much
12231 # the formatter indents, so the folded
12232 # line will fit in the space
12233 # presumably available
12234 $automatic_pod_indent);
12235 # Add column headings, indented to be a little more centered, but not
12236 # exactly
12237 $formatted_properties = format_pod_line($indent_info_column,
12238 ' NAME',
12239 ' INFO')
12240 . "\n"
12241 . $formatted_properties;
12242
12243 # Generate pod documentation lines for the tables that match nothing
12244 my $zero_matches;
12245 if (@zero_match_tables) {
12246 @zero_match_tables = uniques(@zero_match_tables);
12247 $zero_matches = join "\n\n",
12248 map { $_ = '=item \p{' . $_->complete_name . "}" }
12249 sort { $a->complete_name cmp $b->complete_name }
12250 uniques(@zero_match_tables);
12251
12252 $zero_matches = <<END;
12253
12254=head2 Legal \\p{} and \\P{} constructs that match no characters
12255
12256Unicode has some property-value pairs that currently don't match anything.
12257This happens generally either because they are obsolete, or for symmetry with
12258other forms, but no language has yet been encoded that uses them. In this
12259version of Unicode, the following match zero code points:
12260
12261=over 4
12262
12263$zero_matches
12264
12265=back
12266
12267END
12268 }
12269
12270 # Generate list of properties that we don't accept, grouped by the reasons
12271 # why. This is so only put out the 'why' once, and then list all the
12272 # properties that have that reason under it.
12273
12274 my %why_list; # The keys are the reasons; the values are lists of
12275 # properties that have the key as their reason
12276
12277 # For each property, add it to the list that are suppressed for its reason
12278 # The sort will cause the alphabetically first properties to be added to
12279 # each list first, so each list will be sorted.
12280 foreach my $property (sort keys %why_suppressed) {
12281 push @{$why_list{$why_suppressed{$property}}}, $property;
12282 }
12283
12284 # For each reason (sorted by the first property that has that reason)...
12285 my @bad_re_properties;
12286 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
12287 keys %why_list)
12288 {
12289 # Add to the output, all the properties that have that reason. Start
12290 # with an empty line.
12291 push @bad_re_properties, "\n\n";
12292
12293 my $has_item = 0; # Flag if actually output anything.
12294 foreach my $name (@{$why_list{$why}}) {
12295
12296 # Split compound names into $property and $table components
12297 my $property = $name;
12298 my $table;
12299 if ($property =~ / (.*) = (.*) /x) {
12300 $property = $1;
12301 $table = $2;
12302 }
12303
12304 # This release of Unicode may not have a property that is
12305 # suppressed, so don't reference a non-existent one.
12306 $property = property_ref($property);
12307 next if ! defined $property;
12308
12309 # And since this list is only for match tables, don't list the
12310 # ones that don't have match tables.
12311 next if ! $property->to_create_match_tables;
12312
12313 # Find any abbreviation, and turn it into a compound name if this
12314 # is a property=value pair.
12315 my $short_name = $property->name;
12316 $short_name .= '=' . $property->table($table)->name if $table;
12317
12318 # And add the property as an item for the reason.
12319 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12320 $has_item = 1;
12321 }
12322
12323 # And add the reason under the list of properties, if such a list
12324 # actually got generated. Note that the header got added
12325 # unconditionally before. But pod ignores extra blank lines, so no
12326 # harm.
12327 push @bad_re_properties, "\n$why\n" if $has_item;
12328
12329 } # End of looping through each reason.
12330
12331 # Generate a list of the properties whose map table we output, from the
12332 # global @map_properties.
12333 my @map_tables_actually_output;
12334 my $info_indent = 20; # Left column is narrower than \p{} table.
12335 foreach my $property (@map_properties) {
12336
12337 # Get the path to the file; don't output any not in the standard
12338 # directory.
12339 my @path = $property->file_path;
12340 next if $path[0] ne $map_directory;
12341 shift @path; # Remove the standard name
12342
12343 my $file = join '/', @path; # In case is in sub directory
12344 my $info = $property->full_name;
12345 my $short_name = $property->name;
12346 if ($info ne $short_name) {
12347 $info .= " ($short_name)";
12348 }
12349 foreach my $more_info ($property->description,
12350 $property->note,
12351 $property->status_info)
12352 {
12353 next unless $more_info;
12354 $info =~ s/\.\Z//;
12355 $info .= ". $more_info";
12356 }
12357 push @map_tables_actually_output, format_pod_line($info_indent,
12358 $file,
12359 $info,
12360 $property->status);
12361 }
12362
12363 # Sort alphabetically, and fold for output
12364 @map_tables_actually_output = sort
12365 pod_alphanumeric_sort @map_tables_actually_output;
12366 @map_tables_actually_output
12367 = simple_fold(\@map_tables_actually_output,
12368 ' ',
12369 $info_indent,
12370 $automatic_pod_indent);
12371
12372 # Generate a list of the formats that can appear in the map tables.
12373 my @map_table_formats;
12374 foreach my $format (sort keys %map_table_formats) {
12375 push @map_table_formats, " $format $map_table_formats{$format}\n";
12376 }
12377
12378 # Everything is ready to assemble.
12379 my @OUT = << "END";
12380=begin comment
12381
12382$HEADER
12383
12384To change this file, edit $0 instead.
12385
12386=end comment
12387
12388=head1 NAME
12389
51f494cc 12390$pod_file - Index of Unicode Version $string_version properties in Perl
99870f4d
KW
12391
12392=head1 DESCRIPTION
12393
12394There are many properties in Unicode, and Perl provides access to almost all of
12395them, as well as some additional extensions and short-cut synonyms.
12396
12397And just about all of the few that aren't accessible through the Perl
12398core are accessible through the modules: Unicode::Normalize and
12399Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12400
12401This document merely lists all available properties and does not attempt to
12402explain what each property really means. There is a brief description of each
12403Perl extension. There is some detail about Blocks, Scripts, General_Category,
12404and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12405Unicode properties, refer to the Unicode standard. A good starting place is
12406L<$unicode_reference_url>. More information on the Perl extensions is in
12407L<perlrecharclass>.
12408
12409Note that you can define your own properties; see
12410L<perlunicode/"User-Defined Character Properties">.
12411
12412=head1 Properties accessible through \\p{} and \\P{}
12413
12414The Perl regular expression \\p{} and \\P{} constructs give access to most of
12415the Unicode character properties. The table below shows all these constructs,
12416both single and compound forms.
12417
12418B<Compound forms> consist of two components, separated by an equals sign or a
12419colon. The first component is the property name, and the second component is
12420the particular value of the property to match against, for example,
12421'\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12422whose Script property is Greek.
12423
12424B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12425their equivalent compound forms. The table shows these equivalences. (In our
12426example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12427There are also a few Perl-defined single forms that are not shortcuts for a
12428compound form. One such is \\p{Word}. These are also listed in the table.
12429
12430In parsing these constructs, Perl always ignores Upper/lower case differences
12431everywhere within the {braces}. Thus '\\p{Greek}' means the same thing as
12432'\\p{greek}'. But note that changing the case of the 'p' or 'P' before the
12433left brace completely changes the meaning of the construct, from "match" (for
12434'\\p{}') to "doesn't match" (for '\\P{}'). Casing in this document is for
12435improved legibility.
12436
12437Also, white space, hyphens, and underscores are also normally ignored
12438everywhere between the {braces}, and hence can be freely added or removed
12439even if the C</x> modifier hasn't been specified on the regular expression.
12440But $a_bold_stricter at the beginning of an entry in the table below
12441means that tighter (stricter) rules are used for that entry:
12442
12443=over 4
12444
12445=item Single form (\\p{name}) tighter rules:
12446
12447White space, hyphens, and underscores ARE significant
12448except for:
12449
12450=over 4
12451
12452=item * white space adjacent to a non-word character
12453
12454=item * underscores separating digits in numbers
12455
12456=back
12457
12458That means, for example, that you can freely add or remove white space
12459adjacent to (but within) the braces without affecting the meaning.
12460
12461=item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12462
12463The tighter rules given above for the single form apply to everything to the
12464right of the colon or equals; the looser rules still apply to everything to
12465the left.
12466
12467That means, for example, that you can freely add or remove white space
12468adjacent to (but within) the braces and the colon or equal sign.
12469
12470=back
12471
12472Some properties are considered obsolete, but still available. There are
12473several varieties of obsolesence:
12474
12475=over 4
12476
12477=item Obsolete
12478
12479Properties marked with $a_bold_obsolete in the table are considered
12480obsolete. At the time of this writing (Unicode version 5.2) there is no
12481information in the Unicode standard about the implications of a property being
12482obsolete.
12483
12484=item Stabilized
12485
12486Obsolete properties may be stabilized. This means that they are not actively
12487maintained by Unicode, and will not be extended as new characters are added to
12488the standard. Such properties are marked with $a_bold_stabilized in the
12489table. At the time of this writing (Unicode version 5.2) there is no further
12490information in the Unicode standard about the implications of a property being
12491stabilized.
12492
12493=item Deprecated
12494
12495Obsolete properties may be deprecated. This means that their use is strongly
12496discouraged, so much so that a warning will be issued if used, unless the
12497regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12498statement. $A_bold_deprecated flags each such entry in the table, and
12499the entry there for the longest, most descriptive version of the property will
12500give the reason it is deprecated, and perhaps advice. Perl may issue such a
12501warning, even for properties that aren't officially deprecated by Unicode,
12502when there used to be characters or code points that were matched by them, but
12503no longer. This is to warn you that your program may not work like it did on
12504earlier Unicode releases.
12505
12506A deprecated property may be made unavailable in a future Perl version, so it
12507is best to move away from them.
12508
12509=back
12510
12511Some Perl extensions are present for backwards compatibility and are
12512discouraged from being used, but not obsolete. $A_bold_discouraged
12513flags each such entry in the table.
12514
12515@block_warning
12516
12517The table below has two columns. The left column contains the \\p{}
12518constructs to look up, possibly preceeded by the flags mentioned above; and
12519the right column contains information about them, like a description, or
12520synonyms. It shows both the single and compound forms for each property that
12521has them. If the left column is a short name for a property, the right column
12522will give its longer, more descriptive name; and if the left column is the
12523longest name, the right column will show any equivalent shortest name, in both
12524single and compound forms if applicable.
12525
12526The right column will also caution you if a property means something different
12527than what might normally be expected.
12528
d57ccc9a
KW
12529All single forms are Perl extensions; a few compound forms are as well, and
12530are noted as such.
12531
99870f4d
KW
12532Numbers in (parentheses) indicate the total number of code points matched by
12533the property. For emphasis, those properties that match no code points at all
12534are listed as well in a separate section following the table.
12535
12536There is no description given for most non-Perl defined properties (See
12537$unicode_reference_url for that).
d73e5302 12538
99870f4d
KW
12539For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12540combinations. For example, entries like:
d73e5302 12541
99870f4d 12542 \\p{Gc: *} \\p{General_Category: *}
5beb625e 12543
99870f4d
KW
12544mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12545for the latter is also valid for the former. Similarly,
5beb625e 12546
99870f4d 12547 \\p{Is_*} \\p{*}
5beb625e 12548
99870f4d
KW
12549means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12550\\p{IsFoo} are also valid and all mean the same thing. And similarly,
12551\\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}. '*' here
12552is restricted to something not beginning with an underscore.
5beb625e 12553
99870f4d
KW
12554Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12555And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
12556'N*' to indicate this, and doesn't have separate entries for the other
12557possibilities. Note that not all properties which have values 'Yes' and 'No'
12558are binary, and they have all their values spelled out without using this wild
12559card, and a C<NOT> clause in their description that highlights their not being
12560binary. These also require the compound form to match them, whereas true
12561binary properties have both single and compound forms available.
5beb625e 12562
99870f4d
KW
12563Note that all non-essential underscores are removed in the display of the
12564short names below.
5beb625e 12565
99870f4d 12566B<Summary legend:>
5beb625e 12567
99870f4d 12568=over 4
cf25bb62 12569
99870f4d 12570=item B<*> is a wild-card
cf25bb62 12571
99870f4d
KW
12572=item B<(\\d+)> in the info column gives the number of code points matched by
12573this property.
cf25bb62 12574
99870f4d 12575=item B<$DEPRECATED> means this is deprecated.
cf25bb62 12576
99870f4d 12577=item B<$OBSOLETE> means this is obsolete.
cf25bb62 12578
99870f4d 12579=item B<$STABILIZED> means this is stabilized.
cf25bb62 12580
99870f4d 12581=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 12582
99870f4d 12583=item B<$DISCOURAGED> means use of this form is discouraged.
5beb625e 12584
99870f4d 12585=back
da7fcca4 12586
99870f4d 12587$formatted_properties
cf25bb62 12588
99870f4d 12589$zero_matches
cf25bb62 12590
99870f4d 12591=head1 Properties not accessible through \\p{} and \\P{}
cf25bb62 12592
99870f4d
KW
12593A few properties are accessible in Perl via various function calls only.
12594These are:
12595 Lowercase_Mapping lc() and lcfirst()
12596 Titlecase_Mapping ucfirst()
12597 Uppercase_Mapping uc()
12ac2576 12598
99870f4d 12599Case_Folding is accessible through the /i modifier in regular expressions.
cf25bb62 12600
99870f4d
KW
12601The Name property is accessible through the \\N{} interpolation in
12602double-quoted strings and regular expressions, but both usages require a C<use
fb121860
KW
12603charnames;> to be specified, which also contains related functions viacode(),
12604vianame(), and string_vianame().
cf25bb62 12605
99870f4d 12606=head1 Unicode regular expression properties that are NOT accepted by Perl
d2d499f5 12607
99870f4d
KW
12608Perl will generate an error for a few character properties in Unicode when
12609used in a regular expression. The non-Unihan ones are listed below, with the
12610reasons they are not accepted, perhaps with work-arounds. The short names for
12611the properties are listed enclosed in (parentheses).
ae6979a8 12612
99870f4d 12613=over 4
ae6979a8 12614
99870f4d 12615@bad_re_properties
a3a8c5f0 12616
99870f4d 12617=back
a3a8c5f0 12618
99870f4d
KW
12619An installation can choose to allow any of these to be matched by changing the
12620controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12621and then re-running F<$0>. (C<\%Config> is available from the Config module).
d73e5302 12622
99870f4d 12623=head1 Files in the I<To> directory (for serious hackers only)
12ac2576 12624
99870f4d
KW
12625All Unicode properties are really mappings (in the mathematical sense) from
12626code points to their respective values. As part of its build process,
12627Perl constructs tables containing these mappings for all properties that it
12628deals with. But only a few of these are written out into files.
12629Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12630(%Config is available from the Config module).
7ebf06b3 12631
99870f4d
KW
12632Those ones written are ones needed by Perl internally during execution, or for
12633which there is some demand, and those for which there is no access through the
12634Perl core. Generally, properties that can be used in regular expression
12635matching do not have their map tables written, like Script. Nor are the
12636simplistic properties that have a better, more complete version, such as
12637Simple_Uppercase_Mapping (Uppercase_Mapping is written instead).
12ac2576 12638
99870f4d
KW
12639None of the properties in the I<To> directory are currently directly
12640accessible through the Perl core, although some may be accessed indirectly.
12641For example, the uc() function implements the Uppercase_Mapping property and
12642uses the F<Upper.pl> file found in this directory.
12ac2576 12643
99870f4d
KW
12644The available files with their properties (short names in parentheses),
12645and any flags or comments about them, are:
12ac2576 12646
99870f4d 12647@map_tables_actually_output
12ac2576 12648
99870f4d
KW
12649An installation can choose to change which files are generated by changing the
12650controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12651and then re-running F<$0>.
cf25bb62 12652
99870f4d
KW
12653Each of these files defines two hash entries to help reading programs decipher
12654it. One of them looks like this:
12ac2576 12655
99870f4d 12656 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
d73e5302 12657
99870f4d
KW
12658where 'NAME' is a name to indicate the property. For backwards compatibility,
12659this is not necessarily the property's official Unicode name. (The 'To' is
12660also for backwards compatibility.) The hash entry gives the format of the
12661mapping fields of the table, currently one of the following:
d73e5302 12662
99870f4d 12663 @map_table_formats
d73e5302 12664
99870f4d
KW
12665This format applies only to the entries in the main body of the table.
12666Entries defined in hashes or ones that are missing from the list can have a
12667different format.
d73e5302 12668
99870f4d
KW
12669The value that the missing entries have is given by the other SwashInfo hash
12670entry line; it looks like this:
d73e5302 12671
99870f4d 12672 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
d73e5302 12673
99870f4d
KW
12674This example line says that any Unicode code points not explicitly listed in
12675the file have the value 'NaN' under the property indicated by NAME. If the
12676value is the special string C<< <code point> >>, it means that the value for
12677any missing code point is the code point itself. This happens, for example,
12678in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12679character 'A', are missing because the uppercase of 'A' is itself.
d73e5302 12680
99870f4d 12681=head1 SEE ALSO
d73e5302 12682
99870f4d 12683L<$unicode_reference_url>
12ac2576 12684
99870f4d 12685L<perlrecharclass>
12ac2576 12686
99870f4d 12687L<perlunicode>
d73e5302 12688
99870f4d 12689END
d73e5302 12690
99870f4d
KW
12691 # And write it.
12692 main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
12693 return;
12694}
d73e5302 12695
99870f4d
KW
12696sub make_Heavy () {
12697 # Create and write Heavy.pl, which passes info about the tables to
12698 # utf8_heavy.pl
12ac2576 12699
99870f4d
KW
12700 my @heavy = <<END;
12701$HEADER
12702$INTERNAL_ONLY
d73e5302 12703
99870f4d 12704# This file is for the use of utf8_heavy.pl
12ac2576 12705
99870f4d
KW
12706# Maps property names in loose standard form to its standard name
12707\%utf8::loose_property_name_of = (
12708END
cf25bb62 12709
99870f4d
KW
12710 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
12711 push @heavy, <<END;
12712);
12ac2576 12713
99870f4d
KW
12714# Maps property, table to file for those using stricter matching
12715\%utf8::stricter_to_file_of = (
12716END
12717 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
12718 push @heavy, <<END;
12719);
12ac2576 12720
99870f4d
KW
12721# Maps property, table to file for those using loose matching
12722\%utf8::loose_to_file_of = (
12723END
12724 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
12725 push @heavy, <<END;
12726);
12ac2576 12727
99870f4d
KW
12728# Maps floating point to fractional form
12729\%utf8::nv_floating_to_rational = (
12730END
12731 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
12732 push @heavy, <<END;
12733);
12ac2576 12734
99870f4d
KW
12735# If a floating point number doesn't have enough digits in it to get this
12736# close to a fraction, it isn't considered to be that fraction even if all the
12737# digits it does have match.
12738\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 12739
99870f4d
KW
12740# Deprecated tables to generate a warning for. The key is the file containing
12741# the table, so as to avoid duplication, as many property names can map to the
12742# file, but we only need one entry for all of them.
12743\%utf8::why_deprecated = (
12744END
12ac2576 12745
99870f4d
KW
12746 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
12747 push @heavy, <<END;
12748);
12ac2576 12749
99870f4d
KW
127501;
12751END
12ac2576 12752
99870f4d
KW
12753 main::write("Heavy.pl", @heavy);
12754 return;
12ac2576
JP
12755}
12756
99870f4d
KW
12757sub write_all_tables() {
12758 # Write out all the tables generated by this program to files, as well as
12759 # the supporting data structures, pod file, and .t file.
12760
12761 my @writables; # List of tables that actually get written
12762 my %match_tables_to_write; # Used to collapse identical match tables
12763 # into one file. Each key is a hash function
12764 # result to partition tables into buckets.
12765 # Each value is an array of the tables that
12766 # fit in the bucket.
12767
12768 # For each property ...
12769 # (sort so that if there is an immutable file name, it has precedence, so
12770 # some other property can't come in and take over its file name. If b's
12771 # file name is defined, will return 1, meaning to take it first; don't
12772 # care if both defined, as they had better be different anyway)
12773 PROPERTY:
12774 foreach my $property (sort { defined $b->file } property_ref('*')) {
12775 my $type = $property->type;
12776
12777 # And for each table for that property, starting with the mapping
12778 # table for it ...
12779 TABLE:
12780 foreach my $table($property,
12781
12782 # and all the match tables for it (if any), sorted so
12783 # the ones with the shortest associated file name come
12784 # first. The length sorting prevents problems of a
12785 # longer file taking a name that might have to be used
12786 # by a shorter one. The alphabetic sorting prevents
12787 # differences between releases
12788 sort { my $ext_a = $a->external_name;
12789 return 1 if ! defined $ext_a;
12790 my $ext_b = $b->external_name;
12791 return -1 if ! defined $ext_b;
12792 my $cmp = length $ext_a <=> length $ext_b;
12793
12794 # Return result if lengths not equal
12795 return $cmp if $cmp;
12796
12797 # Alphabetic if lengths equal
12798 return $ext_a cmp $ext_b
12799 } $property->tables
12800 )
12801 {
12ac2576 12802
99870f4d
KW
12803 # Here we have a table associated with a property. It could be
12804 # the map table (done first for each property), or one of the
12805 # other tables. Determine which type.
12806 my $is_property = $table->isa('Property');
12807
12808 my $name = $table->name;
12809 my $complete_name = $table->complete_name;
12810
12811 # See if should suppress the table if is empty, but warn if it
12812 # contains something.
12813 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
12814 keys %why_suppress_if_empty_warn_if_not;
12815
12816 # Calculate if this table should have any code points associated
12817 # with it or not.
12818 my $expected_empty =
12819
12820 # $perl should be empty, as well as properties that we just
12821 # don't do anything with
12822 ($is_property
12823 && ($table == $perl
12824 || grep { $complete_name eq $_ }
12825 @unimplemented_properties
12826 )
12827 )
12828
12829 # Match tables in properties we skipped populating should be
12830 # empty
12831 || (! $is_property && ! $property->to_create_match_tables)
12832
12833 # Tables and properties that are expected to have no code
12834 # points should be empty
12835 || $suppress_if_empty_warn_if_not
12836 ;
12837
12838 # Set a boolean if this table is the complement of an empty binary
12839 # table
12840 my $is_complement_of_empty_binary =
12841 $type == $BINARY &&
12842 (($table == $property->table('Y')
12843 && $property->table('N')->is_empty)
12844 || ($table == $property->table('N')
12845 && $property->table('Y')->is_empty));
12846
12847
12848 # Some tables should match everything
12849 my $expected_full =
12850 ($is_property)
12851 ? # All these types of map tables will be full because
12852 # they will have been populated with defaults
12853 ($type == $ENUM || $type == $BINARY)
12854
12855 : # A match table should match everything if its method
12856 # shows it should
12857 ($table->matches_all
12858
12859 # The complement of an empty binary table will match
12860 # everything
12861 || $is_complement_of_empty_binary
12862 )
12863 ;
12864
12865 if ($table->is_empty) {
12866
12867
12868 if ($suppress_if_empty_warn_if_not) {
12869 $table->set_status($SUPPRESSED,
12870 $why_suppress_if_empty_warn_if_not{$complete_name});
12871 }
12ac2576 12872
99870f4d
KW
12873 # Suppress expected empty tables.
12874 next TABLE if $expected_empty;
12875
12876 # And setup to later output a warning for those that aren't
12877 # known to be allowed to be empty. Don't do the warning if
12878 # this table is a child of another one to avoid duplicating
12879 # the warning that should come from the parent one.
12880 if (($table == $property || $table->parent == $table)
12881 && $table->status ne $SUPPRESSED
12882 && ! grep { $complete_name =~ /^$_$/ }
12883 @tables_that_may_be_empty)
12884 {
12885 push @unhandled_properties, "$table";
12886 }
12887 }
12888 elsif ($expected_empty) {
12889 my $because = "";
12890 if ($suppress_if_empty_warn_if_not) {
12891 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
12892 }
12ac2576 12893
99870f4d
KW
12894 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
12895 }
12ac2576 12896
99870f4d
KW
12897 my $count = $table->count;
12898 if ($expected_full) {
12899 if ($count != $MAX_UNICODE_CODEPOINTS) {
12900 Carp::my_carp("$table matches only "
12901 . clarify_number($count)
12902 . " Unicode code points but should match "
12903 . clarify_number($MAX_UNICODE_CODEPOINTS)
12904 . " (off by "
12905 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
12906 . "). Proceeding anyway.");
12907 }
12ac2576 12908
99870f4d
KW
12909 # Here is expected to be full. If it is because it is the
12910 # complement of an (empty) binary table that is to be
12911 # suppressed, then suppress this one as well.
12912 if ($is_complement_of_empty_binary) {
12913 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
12914 my $opposing = $property->table($opposing_name);
12915 my $opposing_status = $opposing->status;
12916 if ($opposing_status) {
12917 $table->set_status($opposing_status,
12918 $opposing->status_info);
12919 }
12920 }
12921 }
12922 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
12923 if ($table == $property || $table->leader == $table) {
12924 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
12925 }
12926 }
d73e5302 12927
99870f4d
KW
12928 if ($table->status eq $SUPPRESSED) {
12929 if (! $is_property) {
12930 my @children = $table->children;
12931 foreach my $child (@children) {
12932 if ($child->status ne $SUPPRESSED) {
12933 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
12934 }
12935 }
12936 }
12937 next TABLE;
d73e5302 12938
99870f4d
KW
12939 }
12940 if (! $is_property) {
12941
12942 # Several things need to be done just once for each related
12943 # group of match tables. Do them on the parent.
12944 if ($table->parent == $table) {
12945
12946 # Add an entry in the pod file for the table; it also does
12947 # the children.
23e33b60 12948 make_table_pod_entries($table) if defined $pod_directory;
99870f4d
KW
12949
12950 # See if the the table matches identical code points with
12951 # something that has already been output. In that case,
12952 # no need to have two files with the same code points in
12953 # them. We use the table's hash() method to store these
12954 # in buckets, so that it is quite likely that if two
12955 # tables are in the same bucket they will be identical, so
12956 # don't have to compare tables frequently. The tables
12957 # have to have the same status to share a file, so add
12958 # this to the bucket hash. (The reason for this latter is
12959 # that Heavy.pl associates a status with a file.)
12960 my $hash = $table->hash . ';' . $table->status;
12961
12962 # Look at each table that is in the same bucket as this
12963 # one would be.
12964 foreach my $comparison (@{$match_tables_to_write{$hash}})
12965 {
12966 if ($table->matches_identically_to($comparison)) {
12967 $table->set_equivalent_to($comparison,
12968 Related => 0);
12969 next TABLE;
12970 }
12971 }
d73e5302 12972
99870f4d
KW
12973 # Here, not equivalent, add this table to the bucket.
12974 push @{$match_tables_to_write{$hash}}, $table;
12975 }
12976 }
12977 else {
12978
12979 # Here is the property itself.
12980 # Don't write out or make references to the $perl property
12981 next if $table == $perl;
12982
12983 if ($type != $STRING) {
12984
12985 # There is a mapping stored of the various synonyms to the
12986 # standardized name of the property for utf8_heavy.pl.
12987 # Also, the pod file contains entries of the form:
12988 # \p{alias: *} \p{full: *}
12989 # rather than show every possible combination of things.
12990
12991 my @property_aliases = $property->aliases;
12992
12993 # The full name of this property is stored by convention
12994 # first in the alias array
12995 my $full_property_name =
12996 '\p{' . $property_aliases[0]->name . ': *}';
12997 my $standard_property_name = standardize($table->name);
12998
12999 # For each synonym ...
13000 for my $i (0 .. @property_aliases - 1) {
13001 my $alias = $property_aliases[$i];
13002 my $alias_name = $alias->name;
13003 my $alias_standard = standardize($alias_name);
13004
13005 # Set the mapping for utf8_heavy of the alias to the
13006 # property
13007 if (exists ($loose_property_name_of{$alias_standard}))
13008 {
13009 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
13010 }
13011 else {
13012 $loose_property_name_of{$alias_standard}
13013 = $standard_property_name;
13014 }
13015
23e33b60
KW
13016 # Now for the pod entry for this alias. Skip if not
13017 # outputting a pod; skip the first one, which is the
13018 # full name so won't have an entry like: '\p{full: *}
13019 # \p{full: *}', and skip if don't want an entry for
13020 # this one.
13021 next if $i == 0
13022 || ! defined $pod_directory
13023 || ! $alias->make_pod_entry;
99870f4d 13024
d57ccc9a
KW
13025 my $rhs = $full_property_name;
13026 if ($property != $perl && $table->perl_extension) {
13027 $rhs .= ' (Perl extension)';
13028 }
99870f4d
KW
13029 push @match_properties,
13030 format_pod_line($indent_info_column,
13031 '\p{' . $alias->name . ': *}',
d57ccc9a 13032 $rhs,
99870f4d
KW
13033 $alias->status);
13034 }
13035 } # End of non-string-like property code
d73e5302 13036
d73e5302 13037
99870f4d
KW
13038 # Don't output a mapping file if not desired.
13039 next if ! $property->to_output_map;
13040 }
d73e5302 13041
99870f4d
KW
13042 # Here, we know we want to write out the table, but don't do it
13043 # yet because there may be other tables that come along and will
13044 # want to share the file, and the file's comments will change to
13045 # mention them. So save for later.
13046 push @writables, $table;
13047
13048 } # End of looping through the property and all its tables.
13049 } # End of looping through all properties.
13050
13051 # Now have all the tables that will have files written for them. Do it.
13052 foreach my $table (@writables) {
13053 my @directory;
13054 my $filename;
13055 my $property = $table->property;
13056 my $is_property = ($table == $property);
13057 if (! $is_property) {
13058
13059 # Match tables for the property go in lib/$subdirectory, which is
13060 # the property's name. Don't use the standard file name for this,
13061 # as may get an unfamiliar alias
13062 @directory = ($matches_directory, $property->external_name);
13063 }
13064 else {
d73e5302 13065
99870f4d
KW
13066 @directory = $table->directory;
13067 $filename = $table->file;
13068 }
d73e5302 13069
99870f4d
KW
13070 # Use specified filename if avaliable, or default to property's
13071 # shortest name. We need an 8.3 safe filename (which means "an 8
13072 # safe" filename, since after the dot is only 'pl', which is < 3)
13073 # The 2nd parameter is if the filename shouldn't be changed, and
13074 # it shouldn't iff there is a hard-coded name for this table.
13075 $filename = construct_filename(
13076 $filename || $table->external_name,
13077 ! $filename, # mutable if no filename
13078 \@directory);
d73e5302 13079
99870f4d 13080 register_file_for_name($table, \@directory, $filename);
d73e5302 13081
99870f4d
KW
13082 # Only need to write one file when shared by more than one
13083 # property
13084 next if ! $is_property && $table->leader != $table;
d73e5302 13085
99870f4d
KW
13086 # Construct a nice comment to add to the file
13087 $table->set_final_comment;
13088
13089 $table->write;
cf25bb62 13090 }
d73e5302 13091
d73e5302 13092
99870f4d
KW
13093 # Write out the pod file
13094 make_pod;
13095
13096 # And Heavy.pl
13097 make_Heavy;
d73e5302 13098
99870f4d
KW
13099 make_property_test_script() if $make_test_script;
13100 return;
cf25bb62 13101}
d73e5302 13102
99870f4d
KW
13103my @white_space_separators = ( # This used only for making the test script.
13104 "",
13105 ' ',
13106 "\t",
13107 ' '
13108 );
d73e5302 13109
99870f4d
KW
13110sub generate_separator($) {
13111 # This used only for making the test script. It generates the colon or
13112 # equal separator between the property and property value, with random
13113 # white space surrounding the separator
d73e5302 13114
99870f4d 13115 my $lhs = shift;
d73e5302 13116
99870f4d 13117 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 13118
99870f4d
KW
13119 # Choose space before and after randomly
13120 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
13121 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 13122
99870f4d
KW
13123 # And return the whole complex, half the time using a colon, half the
13124 # equals
13125 return $spaces_before
13126 . (rand() < 0.5) ? '=' : ':'
13127 . $spaces_after;
13128}
76ccdbe2 13129
430ada4c 13130sub generate_tests($$$$$) {
99870f4d
KW
13131 # This used only for making the test script. It generates test cases that
13132 # are expected to compile successfully in perl. Note that the lhs and
13133 # rhs are assumed to already be as randomized as the caller wants.
13134
99870f4d
KW
13135 my $lhs = shift; # The property: what's to the left of the colon
13136 # or equals separator
13137 my $rhs = shift; # The property value; what's to the right
13138 my $valid_code = shift; # A code point that's known to be in the
13139 # table given by lhs=rhs; undef if table is
13140 # empty
13141 my $invalid_code = shift; # A code point known to not be in the table;
13142 # undef if the table is all code points
13143 my $warning = shift;
13144
13145 # Get the colon or equal
13146 my $separator = generate_separator($lhs);
13147
13148 # The whole 'property=value'
13149 my $name = "$lhs$separator$rhs";
13150
430ada4c 13151 my @output;
99870f4d
KW
13152 # Create a complete set of tests, with complements.
13153 if (defined $valid_code) {
430ada4c
NC
13154 push @output, <<"EOC"
13155Expect(1, $valid_code, '\\p{$name}', $warning);
13156Expect(0, $valid_code, '\\p{^$name}', $warning);
13157Expect(0, $valid_code, '\\P{$name}', $warning);
13158Expect(1, $valid_code, '\\P{^$name}', $warning);
13159EOC
99870f4d
KW
13160 }
13161 if (defined $invalid_code) {
430ada4c
NC
13162 push @output, <<"EOC"
13163Expect(0, $invalid_code, '\\p{$name}', $warning);
13164Expect(1, $invalid_code, '\\p{^$name}', $warning);
13165Expect(1, $invalid_code, '\\P{$name}', $warning);
13166Expect(0, $invalid_code, '\\P{^$name}', $warning);
13167EOC
13168 }
13169 return @output;
99870f4d 13170}
cf25bb62 13171
430ada4c 13172sub generate_error($$$) {
99870f4d
KW
13173 # This used only for making the test script. It generates test cases that
13174 # are expected to not only not match, but to be syntax or similar errors
13175
99870f4d
KW
13176 my $lhs = shift; # The property: what's to the left of the
13177 # colon or equals separator
13178 my $rhs = shift; # The property value; what's to the right
13179 my $already_in_error = shift; # Boolean; if true it's known that the
13180 # unmodified lhs and rhs will cause an error.
13181 # This routine should not force another one
13182 # Get the colon or equal
13183 my $separator = generate_separator($lhs);
13184
13185 # Since this is an error only, don't bother to randomly decide whether to
13186 # put the error on the left or right side; and assume that the rhs is
13187 # loosely matched, again for convenience rather than rigor.
13188 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
13189
13190 my $property = $lhs . $separator . $rhs;
13191
430ada4c
NC
13192 return <<"EOC";
13193Error('\\p{$property}');
13194Error('\\P{$property}');
13195EOC
d73e5302
JH
13196}
13197
99870f4d
KW
13198# These are used only for making the test script
13199# XXX Maybe should also have a bad strict seps, which includes underscore.
13200
13201my @good_loose_seps = (
13202 " ",
13203 "-",
13204 "\t",
13205 "",
13206 "_",
13207 );
13208my @bad_loose_seps = (
13209 "/a/",
13210 ':=',
13211 );
13212
13213sub randomize_stricter_name {
13214 # This used only for making the test script. Take the input name and
13215 # return a randomized, but valid version of it under the stricter matching
13216 # rules.
13217
13218 my $name = shift;
13219 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13220
13221 # If the name looks like a number (integer, floating, or rational), do
13222 # some extra work
13223 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
13224 my $sign = $1;
13225 my $number = $2;
13226 my $separator = $3;
13227
13228 # If there isn't a sign, part of the time add a plus
13229 # Note: Not testing having any denominator having a minus sign
13230 if (! $sign) {
13231 $sign = '+' if rand() <= .3;
13232 }
13233
13234 # And add 0 or more leading zeros.
13235 $name = $sign . ('0' x int rand(10)) . $number;
13236
13237 if (defined $separator) {
13238 my $extra_zeros = '0' x int rand(10);
cf25bb62 13239
99870f4d
KW
13240 if ($separator eq '.') {
13241
13242 # Similarly, add 0 or more trailing zeros after a decimal
13243 # point
13244 $name .= $extra_zeros;
13245 }
13246 else {
13247
13248 # Or, leading zeros before the denominator
13249 $name =~ s,/,/$extra_zeros,;
13250 }
13251 }
cf25bb62 13252 }
d73e5302 13253
99870f4d
KW
13254 # For legibility of the test, only change the case of whole sections at a
13255 # time. To do this, first split into sections. The split returns the
13256 # delimiters
13257 my @sections;
13258 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
13259 trace $section if main::DEBUG && $to_trace;
13260
13261 if (length $section > 1 && $section !~ /\D/) {
13262
13263 # If the section is a sequence of digits, about half the time
13264 # randomly add underscores between some of them.
13265 if (rand() > .5) {
13266
13267 # Figure out how many underscores to add. max is 1 less than
13268 # the number of digits. (But add 1 at the end to make sure
13269 # result isn't 0, and compensate earlier by subtracting 2
13270 # instead of 1)
13271 my $num_underscores = int rand(length($section) - 2) + 1;
13272
13273 # And add them evenly throughout, for convenience, not rigor
13274 use integer;
13275 my $spacing = (length($section) - 1)/ $num_underscores;
13276 my $temp = $section;
13277 $section = "";
13278 for my $i (1 .. $num_underscores) {
13279 $section .= substr($temp, 0, $spacing, "") . '_';
13280 }
13281 $section .= $temp;
13282 }
13283 push @sections, $section;
13284 }
13285 else {
d73e5302 13286
99870f4d
KW
13287 # Here not a sequence of digits. Change the case of the section
13288 # randomly
13289 my $switch = int rand(4);
13290 if ($switch == 0) {
13291 push @sections, uc $section;
13292 }
13293 elsif ($switch == 1) {
13294 push @sections, lc $section;
13295 }
13296 elsif ($switch == 2) {
13297 push @sections, ucfirst $section;
13298 }
13299 else {
13300 push @sections, $section;
13301 }
13302 }
cf25bb62 13303 }
99870f4d
KW
13304 trace "returning", join "", @sections if main::DEBUG && $to_trace;
13305 return join "", @sections;
13306}
71d929cb 13307
99870f4d
KW
13308sub randomize_loose_name($;$) {
13309 # This used only for making the test script
71d929cb 13310
99870f4d
KW
13311 my $name = shift;
13312 my $want_error = shift; # if true, make an error
13313 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13314
13315 $name = randomize_stricter_name($name);
5beb625e
JH
13316
13317 my @parts;
99870f4d
KW
13318 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13319 for my $part (split /[-\s_]+/, $name) {
5beb625e 13320 if (@parts) {
99870f4d
KW
13321 if ($want_error and rand() < 0.3) {
13322 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13323 $want_error = 0;
13324 }
13325 else {
13326 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
13327 }
13328 }
99870f4d 13329 push @parts, $part;
5beb625e 13330 }
99870f4d
KW
13331 my $new = join("", @parts);
13332 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 13333
99870f4d 13334 if ($want_error) {
5beb625e 13335 if (rand() >= 0.5) {
99870f4d
KW
13336 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13337 }
13338 else {
13339 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
13340 }
13341 }
13342 return $new;
13343}
13344
99870f4d
KW
13345# Used to make sure don't generate duplicate test cases.
13346my %test_generated;
5beb625e 13347
99870f4d
KW
13348sub make_property_test_script() {
13349 # This used only for making the test script
13350 # this written directly -- it's huge.
5beb625e 13351
99870f4d 13352 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 13353
99870f4d
KW
13354 # This uses randomness to test different possibilities without testing all
13355 # possibilities. To ensure repeatability, set the seed to 0. But if
13356 # tests are added, it will perturb all later ones in the .t file
13357 srand 0;
5beb625e 13358
3df51b85
KW
13359 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13360
99870f4d
KW
13361 # Keep going down an order of magnitude
13362 # until find that adding this quantity to
13363 # 1 remains 1; but put an upper limit on
13364 # this so in case this algorithm doesn't
13365 # work properly on some platform, that we
13366 # won't loop forever.
13367 my $digits = 0;
13368 my $min_floating_slop = 1;
13369 while (1+ $min_floating_slop != 1
13370 && $digits++ < 50)
5beb625e 13371 {
99870f4d
KW
13372 my $next = $min_floating_slop / 10;
13373 last if $next == 0; # If underflows,
13374 # use previous one
13375 $min_floating_slop = $next;
5beb625e 13376 }
430ada4c
NC
13377
13378 # It doesn't matter whether the elements of this array contain single lines
13379 # or multiple lines. main::write doesn't count the lines.
13380 my @output;
99870f4d
KW
13381
13382 foreach my $property (property_ref('*')) {
13383 foreach my $table ($property->tables) {
13384
13385 # Find code points that match, and don't match this table.
13386 my $valid = $table->get_valid_code_point;
13387 my $invalid = $table->get_invalid_code_point;
13388 my $warning = ($table->status eq $DEPRECATED)
13389 ? "'deprecated'"
13390 : '""';
13391
13392 # Test each possible combination of the property's aliases with
13393 # the table's. If this gets to be too many, could do what is done
13394 # in the set_final_comment() for Tables
13395 my @table_aliases = $table->aliases;
13396 my @property_aliases = $table->property->aliases;
13397 my $max = max(scalar @table_aliases, scalar @property_aliases);
13398 for my $j (0 .. $max - 1) {
13399
13400 # The current alias for property is the next one on the list,
13401 # or if beyond the end, start over. Similarly for table
13402 my $property_name
13403 = $property_aliases[$j % @property_aliases]->name;
13404
13405 $property_name = "" if $table->property == $perl;
13406 my $table_alias = $table_aliases[$j % @table_aliases];
13407 my $table_name = $table_alias->name;
13408 my $loose_match = $table_alias->loose_match;
13409
13410 # If the table doesn't have a file, any test for it is
13411 # already guaranteed to be in error
13412 my $already_error = ! $table->file_path;
13413
13414 # Generate error cases for this alias.
430ada4c
NC
13415 push @output, generate_error($property_name,
13416 $table_name,
13417 $already_error);
99870f4d
KW
13418
13419 # If the table is guaranteed to always generate an error,
13420 # quit now without generating success cases.
13421 next if $already_error;
13422
13423 # Now for the success cases.
13424 my $random;
13425 if ($loose_match) {
13426
13427 # For loose matching, create an extra test case for the
13428 # standard name.
13429 my $standard = standardize($table_name);
13430
13431 # $test_name should be a unique combination for each test
13432 # case; used just to avoid duplicate tests
13433 my $test_name = "$property_name=$standard";
13434
13435 # Don't output duplicate test cases.
13436 if (! exists $test_generated{$test_name}) {
13437 $test_generated{$test_name} = 1;
430ada4c
NC
13438 push @output, generate_tests($property_name,
13439 $standard,
13440 $valid,
13441 $invalid,
13442 $warning,
13443 );
5beb625e 13444 }
99870f4d
KW
13445 $random = randomize_loose_name($table_name)
13446 }
13447 else { # Stricter match
13448 $random = randomize_stricter_name($table_name);
99598c8c 13449 }
99598c8c 13450
99870f4d
KW
13451 # Now for the main test case for this alias.
13452 my $test_name = "$property_name=$random";
13453 if (! exists $test_generated{$test_name}) {
13454 $test_generated{$test_name} = 1;
430ada4c
NC
13455 push @output, generate_tests($property_name,
13456 $random,
13457 $valid,
13458 $invalid,
13459 $warning,
13460 );
99870f4d
KW
13461
13462 # If the name is a rational number, add tests for the
13463 # floating point equivalent.
13464 if ($table_name =~ qr{/}) {
13465
13466 # Calculate the float, and find just the fraction.
13467 my $float = eval $table_name;
13468 my ($whole, $fraction)
13469 = $float =~ / (.*) \. (.*) /x;
13470
13471 # Starting with one digit after the decimal point,
13472 # create a test for each possible precision (number of
13473 # digits past the decimal point) until well beyond the
13474 # native number found on this machine. (If we started
13475 # with 0 digits, it would be an integer, which could
13476 # well match an unrelated table)
13477 PLACE:
13478 for my $i (1 .. $min_floating_slop + 3) {
13479 my $table_name = sprintf("%.*f", $i, $float);
13480 if ($i < $MIN_FRACTION_LENGTH) {
13481
13482 # If the test case has fewer digits than the
13483 # minimum acceptable precision, it shouldn't
13484 # succeed, so we expect an error for it.
13485 # E.g., 2/3 = .7 at one decimal point, and we
13486 # shouldn't say it matches .7. We should make
13487 # it be .667 at least before agreeing that the
13488 # intent was to match 2/3. But at the
13489 # less-than- acceptable level of precision, it
13490 # might actually match an unrelated number.
13491 # So don't generate a test case if this
13492 # conflating is possible. In our example, we
13493 # don't want 2/3 matching 7/10, if there is
13494 # a 7/10 code point.
13495 for my $existing
13496 (keys %nv_floating_to_rational)
13497 {
13498 next PLACE
13499 if abs($table_name - $existing)
13500 < $MAX_FLOATING_SLOP;
13501 }
430ada4c
NC
13502 push @output, generate_error($property_name,
13503 $table_name,
13504 1 # 1 => already an error
13505 );
99870f4d
KW
13506 }
13507 else {
13508
13509 # Here the number of digits exceeds the
13510 # minimum we think is needed. So generate a
13511 # success test case for it.
430ada4c
NC
13512 push @output, generate_tests($property_name,
13513 $table_name,
13514 $valid,
13515 $invalid,
13516 $warning,
13517 );
99870f4d
KW
13518 }
13519 }
99598c8c
JH
13520 }
13521 }
99870f4d
KW
13522 }
13523 }
13524 }
37e2e78e 13525
430ada4c
NC
13526 &write($t_path, [<DATA>,
13527 @output,
13528 (map {"Test_X('$_');\n"} @backslash_X_tests),
13529 "Finished();\n"]);
99870f4d
KW
13530 return;
13531}
99598c8c 13532
99870f4d
KW
13533# This is a list of the input files and how to handle them. The files are
13534# processed in their order in this list. Some reordering is possible if
13535# desired, but the v0 files should be first, and the extracted before the
13536# others except DAge.txt (as data in an extracted file can be over-ridden by
13537# the non-extracted. Some other files depend on data derived from an earlier
13538# file, like UnicodeData requires data from Jamo, and the case changing and
13539# folding requires data from Unicode. Mostly, it safest to order by first
13540# version releases in (except the Jamo). DAge.txt is read before the
13541# extracted ones because of the rarely used feature $compare_versions. In the
13542# unlikely event that there were ever an extracted file that contained the Age
13543# property information, it would have to go in front of DAge.
13544#
13545# The version strings allow the program to know whether to expect a file or
13546# not, but if a file exists in the directory, it will be processed, even if it
13547# is in a version earlier than expected, so you can copy files from a later
13548# release into an earlier release's directory.
13549my @input_file_objects = (
13550 Input_file->new('PropertyAliases.txt', v0,
13551 Handler => \&process_PropertyAliases,
13552 ),
13553 Input_file->new(undef, v0, # No file associated with this
3df51b85 13554 Progress_Message => 'Finishing property setup',
99870f4d
KW
13555 Handler => \&finish_property_setup,
13556 ),
13557 Input_file->new('PropValueAliases.txt', v0,
13558 Handler => \&process_PropValueAliases,
13559 Has_Missings_Defaults => $NOT_IGNORED,
13560 ),
13561 Input_file->new('DAge.txt', v3.2.0,
13562 Has_Missings_Defaults => $NOT_IGNORED,
13563 Property => 'Age'
13564 ),
13565 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13566 Property => 'General_Category',
13567 ),
13568 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13569 Property => 'Canonical_Combining_Class',
13570 Has_Missings_Defaults => $NOT_IGNORED,
13571 ),
13572 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13573 Property => 'Numeric_Type',
13574 Has_Missings_Defaults => $NOT_IGNORED,
13575 ),
13576 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13577 Property => 'East_Asian_Width',
13578 Has_Missings_Defaults => $NOT_IGNORED,
13579 ),
13580 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13581 Property => 'Line_Break',
13582 Has_Missings_Defaults => $NOT_IGNORED,
13583 ),
13584 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13585 Property => 'Bidi_Class',
13586 Has_Missings_Defaults => $NOT_IGNORED,
13587 ),
13588 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13589 Property => 'Decomposition_Type',
13590 Has_Missings_Defaults => $NOT_IGNORED,
13591 ),
13592 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13593 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13594 Property => 'Numeric_Value',
13595 Each_Line_Handler => \&filter_numeric_value_line,
13596 Has_Missings_Defaults => $NOT_IGNORED,
13597 ),
13598 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13599 Property => 'Joining_Group',
13600 Has_Missings_Defaults => $NOT_IGNORED,
13601 ),
13602
13603 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13604 Property => 'Joining_Type',
13605 Has_Missings_Defaults => $NOT_IGNORED,
13606 ),
13607 Input_file->new('Jamo.txt', v2.0.0,
13608 Property => 'Jamo_Short_Name',
13609 Each_Line_Handler => \&filter_jamo_line,
13610 ),
13611 Input_file->new('UnicodeData.txt', v1.1.5,
13612 Pre_Handler => \&setup_UnicodeData,
13613
13614 # We clean up this file for some early versions.
13615 Each_Line_Handler => [ (($v_version lt v2.0.0 )
13616 ? \&filter_v1_ucd
13617 : ($v_version eq v2.1.5)
13618 ? \&filter_v2_1_5_ucd
13619 : undef),
13620
13621 # And the main filter
13622 \&filter_UnicodeData_line,
13623 ],
13624 EOF_Handler => \&EOF_UnicodeData,
13625 ),
13626 Input_file->new('ArabicShaping.txt', v2.0.0,
13627 Each_Line_Handler =>
13628 [ ($v_version lt 4.1.0)
13629 ? \&filter_old_style_arabic_shaping
13630 : undef,
13631 \&filter_arabic_shaping_line,
13632 ],
13633 Has_Missings_Defaults => $NOT_IGNORED,
13634 ),
13635 Input_file->new('Blocks.txt', v2.0.0,
13636 Property => 'Block',
13637 Has_Missings_Defaults => $NOT_IGNORED,
13638 Each_Line_Handler => \&filter_blocks_lines
13639 ),
13640 Input_file->new('PropList.txt', v2.0.0,
13641 Each_Line_Handler => (($v_version lt v3.1.0)
13642 ? \&filter_old_style_proplist
13643 : undef),
13644 ),
13645 Input_file->new('Unihan.txt', v2.0.0,
13646 Pre_Handler => \&setup_unihan,
13647 Optional => 1,
13648 Each_Line_Handler => \&filter_unihan_line,
13649 ),
13650 Input_file->new('SpecialCasing.txt', v2.1.8,
13651 Each_Line_Handler => \&filter_special_casing_line,
13652 Pre_Handler => \&setup_special_casing,
13653 ),
13654 Input_file->new(
13655 'LineBreak.txt', v3.0.0,
13656 Has_Missings_Defaults => $NOT_IGNORED,
13657 Property => 'Line_Break',
13658 # Early versions had problematic syntax
13659 Each_Line_Handler => (($v_version lt v3.1.0)
13660 ? \&filter_early_ea_lb
13661 : undef),
13662 ),
13663 Input_file->new('EastAsianWidth.txt', v3.0.0,
13664 Property => 'East_Asian_Width',
13665 Has_Missings_Defaults => $NOT_IGNORED,
13666 # Early versions had problematic syntax
13667 Each_Line_Handler => (($v_version lt v3.1.0)
13668 ? \&filter_early_ea_lb
13669 : undef),
13670 ),
13671 Input_file->new('CompositionExclusions.txt', v3.0.0,
13672 Property => 'Composition_Exclusion',
13673 ),
13674 Input_file->new('BidiMirroring.txt', v3.0.1,
13675 Property => 'Bidi_Mirroring_Glyph',
13676 ),
37e2e78e
KW
13677 Input_file->new("NormalizationTest.txt", v3.0.1,
13678 Skip => 1,
13679 ),
99870f4d
KW
13680 Input_file->new('CaseFolding.txt', v3.0.1,
13681 Pre_Handler => \&setup_case_folding,
13682 Each_Line_Handler =>
13683 [ ($v_version lt v3.1.0)
13684 ? \&filter_old_style_case_folding
13685 : undef,
13686 \&filter_case_folding_line
13687 ],
13688 Post_Handler => \&post_fold,
13689 ),
13690 Input_file->new('DCoreProperties.txt', v3.1.0,
13691 # 5.2 changed this file
13692 Has_Missings_Defaults => (($v_version ge v5.2.0)
13693 ? $NOT_IGNORED
13694 : $NO_DEFAULTS),
13695 ),
13696 Input_file->new('Scripts.txt', v3.1.0,
13697 Property => 'Script',
13698 Has_Missings_Defaults => $NOT_IGNORED,
13699 ),
13700 Input_file->new('DNormalizationProps.txt', v3.1.0,
13701 Has_Missings_Defaults => $NOT_IGNORED,
13702 Each_Line_Handler => (($v_version lt v4.0.1)
13703 ? \&filter_old_style_normalization_lines
13704 : undef),
13705 ),
13706 Input_file->new('HangulSyllableType.txt', v4.0.0,
13707 Has_Missings_Defaults => $NOT_IGNORED,
13708 Property => 'Hangul_Syllable_Type'),
13709 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13710 Property => 'Word_Break',
13711 Has_Missings_Defaults => $NOT_IGNORED,
13712 ),
13713 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13714 Property => 'Grapheme_Cluster_Break',
13715 Has_Missings_Defaults => $NOT_IGNORED,
13716 ),
37e2e78e
KW
13717 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
13718 Handler => \&process_GCB_test,
13719 ),
13720 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
13721 Skip => 1,
13722 ),
13723 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
13724 Skip => 1,
13725 ),
13726 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
13727 Skip => 1,
13728 ),
99870f4d
KW
13729 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13730 Property => 'Sentence_Break',
13731 Has_Missings_Defaults => $NOT_IGNORED,
13732 ),
13733 Input_file->new('NamedSequences.txt', v4.1.0,
13734 Handler => \&process_NamedSequences
13735 ),
13736 Input_file->new('NameAliases.txt', v5.0.0,
13737 Property => 'Name_Alias',
13738 ),
37e2e78e
KW
13739 Input_file->new("BidiTest.txt", v5.2.0,
13740 Skip => 1,
13741 ),
99870f4d
KW
13742 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13743 Optional => 1,
13744 Each_Line_Handler => \&filter_unihan_line,
13745 ),
13746 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13747 Optional => 1,
13748 Each_Line_Handler => \&filter_unihan_line,
13749 ),
13750 Input_file->new('UnihanIRGSources.txt', v5.2.0,
13751 Optional => 1,
13752 Pre_Handler => \&setup_unihan,
13753 Each_Line_Handler => \&filter_unihan_line,
13754 ),
13755 Input_file->new('UnihanNumericValues.txt', v5.2.0,
13756 Optional => 1,
13757 Each_Line_Handler => \&filter_unihan_line,
13758 ),
13759 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13760 Optional => 1,
13761 Each_Line_Handler => \&filter_unihan_line,
13762 ),
13763 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13764 Optional => 1,
13765 Each_Line_Handler => \&filter_unihan_line,
13766 ),
13767 Input_file->new('UnihanReadings.txt', v5.2.0,
13768 Optional => 1,
13769 Each_Line_Handler => \&filter_unihan_line,
13770 ),
13771 Input_file->new('UnihanVariants.txt', v5.2.0,
13772 Optional => 1,
13773 Each_Line_Handler => \&filter_unihan_line,
13774 ),
13775);
99598c8c 13776
99870f4d
KW
13777# End of all the preliminaries.
13778# Do it...
99598c8c 13779
99870f4d
KW
13780if ($compare_versions) {
13781 Carp::my_carp(<<END
13782Warning. \$compare_versions is set. Output is not suitable for production
13783END
13784 );
13785}
99598c8c 13786
99870f4d
KW
13787# Put into %potential_files a list of all the files in the directory structure
13788# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 13789# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
13790my @ignored_files_full_names = map { File::Spec->rel2abs(
13791 internal_file_to_platform($_))
13792 } keys %ignored_files;
13793File::Find::find({
13794 wanted=>sub {
37e2e78e 13795 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 13796 my $full = lc(File::Spec->rel2abs($_));
99870f4d 13797 $potential_files{$full} = 1
37e2e78e 13798 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
13799 return;
13800 }
13801}, File::Spec->curdir());
99598c8c 13802
99870f4d 13803my @mktables_list_output_files;
cf25bb62 13804
3644ba60
KW
13805if (! -e $file_list) {
13806 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
13807 $write_unchanged_files = 1;
13808} elsif ($write_unchanged_files) {
99870f4d
KW
13809 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13810}
13811else {
13812 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13813 my $file_handle;
23e33b60 13814 if (! open $file_handle, "<", $file_list) {
3644ba60 13815 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
13816 $glob_list = 1;
13817 }
13818 else {
13819 my @input;
13820
13821 # Read and parse mktables.lst, placing the results from the first part
13822 # into @input, and the second part into @mktables_list_output_files
13823 for my $list ( \@input, \@mktables_list_output_files ) {
13824 while (<$file_handle>) {
13825 s/^ \s+ | \s+ $//xg;
13826 next if /^ \s* (?: \# .* )? $/x;
13827 last if /^ =+ $/x;
13828 my ( $file ) = split /\t/;
13829 push @$list, $file;
cf25bb62 13830 }
99870f4d
KW
13831 @$list = uniques(@$list);
13832 next;
cf25bb62
JH
13833 }
13834
99870f4d
KW
13835 # Look through all the input files
13836 foreach my $input (@input) {
13837 next if $input eq 'version'; # Already have checked this.
cf25bb62 13838
99870f4d
KW
13839 # Ignore if doesn't exist. The checking about whether we care or
13840 # not is done via the Input_file object.
13841 next if ! file_exists($input);
5beb625e 13842
99870f4d
KW
13843 # The paths are stored with relative names, and with '/' as the
13844 # delimiter; convert to absolute on this machine
517956bf 13845 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
99870f4d 13846 $potential_files{$full} = 1
517956bf 13847 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 13848 }
5beb625e 13849 }
cf25bb62 13850
99870f4d
KW
13851 close $file_handle;
13852}
13853
13854if ($glob_list) {
13855
13856 # Here wants to process all .txt files in the directory structure.
13857 # Convert them to full path names. They are stored in the platform's
13858 # relative style
f86864ac
KW
13859 my @known_files;
13860 foreach my $object (@input_file_objects) {
13861 my $file = $object->file;
13862 next unless defined $file;
13863 push @known_files, File::Spec->rel2abs($file);
13864 }
99870f4d
KW
13865
13866 my @unknown_input_files;
13867 foreach my $file (keys %potential_files) {
517956bf 13868 next if grep { lc($file) eq lc($_) } @known_files;
99870f4d
KW
13869
13870 # Here, the file is unknown to us. Get relative path name
13871 $file = File::Spec->abs2rel($file);
13872 push @unknown_input_files, $file;
13873
13874 # What will happen is we create a data structure for it, and add it to
13875 # the list of input files to process. First get the subdirectories
13876 # into an array
13877 my (undef, $directories, undef) = File::Spec->splitpath($file);
13878 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13879 my @directories = File::Spec->splitdir($directories);
13880
13881 # If the file isn't extracted (meaning none of the directories is the
13882 # extracted one), just add it to the end of the list of inputs.
13883 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 13884 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
13885 }
13886 else {
13887
13888 # Here, the file is extracted. It needs to go ahead of most other
13889 # processing. Search for the first input file that isn't a
13890 # special required property (that is, find one whose first_release
13891 # is non-0), and isn't extracted. Also, the Age property file is
13892 # processed before the extracted ones, just in case
13893 # $compare_versions is set.
13894 for (my $i = 0; $i < @input_file_objects; $i++) {
13895 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
13896 && lc($input_file_objects[$i]->file) ne 'dage.txt'
13897 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 13898 {
99f78760 13899 splice @input_file_objects, $i, 0,
37e2e78e 13900 Input_file->new($file, v0);
99870f4d
KW
13901 last;
13902 }
cf25bb62 13903 }
99870f4d 13904
cf25bb62 13905 }
d2d499f5 13906 }
99870f4d 13907 if (@unknown_input_files) {
23e33b60 13908 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
13909
13910The following files are unknown as to how to handle. Assuming they are
13911typical property files. You'll know by later error messages if it worked or
13912not:
13913END
99f78760 13914 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
13915 }
13916} # End of looking through directory structure for more .txt files.
5beb625e 13917
99870f4d
KW
13918# Create the list of input files from the objects we have defined, plus
13919# version
13920my @input_files = 'version';
13921foreach my $object (@input_file_objects) {
13922 my $file = $object->file;
13923 next if ! defined $file; # Not all objects have files
13924 next if $object->optional && ! -e $file;
13925 push @input_files, $file;
13926}
5beb625e 13927
99870f4d
KW
13928if ( $verbosity >= $VERBOSE ) {
13929 print "Expecting ".scalar( @input_files )." input files. ",
13930 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13931}
cf25bb62 13932
99870f4d
KW
13933# We set $youngest to be the most recently changed input file, including this
13934# program itself (done much earlier in this file)
13935foreach my $in (@input_files) {
13936 my $age = -M $in;
13937 next unless defined $age; # Keep going even if missing a file
13938 $youngest = $age if $age < $youngest;
13939
13940 # See that the input files have distinct names, to warn someone if they
13941 # are adding a new one
13942 if ($make_list) {
13943 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13944 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13945 my @directories = File::Spec->splitdir($directories);
13946 my $base = $file =~ s/\.txt$//;
13947 construct_filename($file, 'mutable', \@directories);
cf25bb62 13948 }
99870f4d 13949}
cf25bb62 13950
99870f4d
KW
13951my $ok = ! $write_unchanged_files
13952 && scalar @mktables_list_output_files; # If none known, rebuild
cf25bb62 13953
99870f4d
KW
13954# Now we check to see if any output files are older than youngest, if
13955# they are, we need to continue on, otherwise we can presumably bail.
13956if ($ok) {
13957 foreach my $out (@mktables_list_output_files) {
13958 if ( ! file_exists($out)) {
13959 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
13960 $ok = 0;
13961 last;
13962 }
13963 #local $to_trace = 1 if main::DEBUG;
13964 trace $youngest, -M $out if main::DEBUG && $to_trace;
13965 if ( -M $out > $youngest ) {
13966 #trace "$out: age: ", -M $out, ", youngest: $youngest\n" if main::DEBUG && $to_trace;
13967 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
13968 $ok = 0;
13969 last;
cf25bb62 13970 }
cf25bb62 13971 }
99870f4d
KW
13972}
13973if ($ok) {
1265e11f 13974 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
13975 exit(0);
13976}
13977print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 13978
99870f4d
KW
13979# Ready to do the major processing. First create the perl pseudo-property.
13980$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 13981
99870f4d
KW
13982# Process each input file
13983foreach my $file (@input_file_objects) {
13984 $file->run;
d2d499f5
JH
13985}
13986
99870f4d 13987# Finish the table generation.
c4051cc5 13988
99870f4d
KW
13989print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
13990finish_Unicode();
c4051cc5 13991
99870f4d
KW
13992print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
13993compile_perl();
c4051cc5 13994
99870f4d
KW
13995print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
13996add_perl_synonyms();
c4051cc5 13997
99870f4d
KW
13998print "Writing tables\n" if $verbosity >= $PROGRESS;
13999write_all_tables();
c4051cc5 14000
99870f4d
KW
14001# Write mktables.lst
14002if ( $file_list and $make_list ) {
c4051cc5 14003
99870f4d
KW
14004 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
14005 foreach my $file (@input_files, @files_actually_output) {
14006 my (undef, $directories, $file) = File::Spec->splitpath($file);
14007 my @directories = File::Spec->splitdir($directories);
14008 $file = join '/', @directories, $file;
14009 }
14010
14011 my $ofh;
14012 if (! open $ofh,">",$file_list) {
14013 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
14014 return
14015 }
14016 else {
14017 print $ofh <<"END";
14018#
14019# $file_list -- File list for $0.
97050450
YO
14020#
14021# Autogenerated on @{[scalar localtime]}
14022#
14023# - First section is input files
99870f4d 14024# ($0 itself is not listed but is automatically considered an input)
97050450
YO
14025# - Section seperator is /^=+\$/
14026# - Second section is a list of output files.
14027# - Lines matching /^\\s*#/ are treated as comments
14028# which along with blank lines are ignored.
14029#
14030
14031# Input files:
14032
99870f4d
KW
14033END
14034 print $ofh "$_\n" for sort(@input_files);
14035 print $ofh "\n=================================\n# Output files:\n\n";
14036 print $ofh "$_\n" for sort @files_actually_output;
14037 print $ofh "\n# ",scalar(@input_files)," input files\n",
14038 "# ",scalar(@files_actually_output)+1," output files\n\n",
14039 "# End list\n";
14040 close $ofh
14041 or Carp::my_carp("Failed to close $ofh: $!");
14042
14043 print "Filelist has ",scalar(@input_files)," input files and ",
14044 scalar(@files_actually_output)+1," output files\n"
14045 if $verbosity >= $VERBOSE;
14046 }
14047}
14048
14049# Output these warnings unless -q explicitly specified.
14050if ($verbosity >= $NORMAL_VERBOSITY) {
14051 if (@unhandled_properties) {
14052 print "\nProperties and tables that unexpectedly have no code points\n";
14053 foreach my $property (sort @unhandled_properties) {
14054 print $property, "\n";
14055 }
14056 }
14057
14058 if (%potential_files) {
14059 print "\nInput files that are not considered:\n";
14060 foreach my $file (sort keys %potential_files) {
14061 print File::Spec->abs2rel($file), "\n";
14062 }
14063 }
14064 print "\nAll done\n" if $verbosity >= $VERBOSE;
14065}
5beb625e 14066exit(0);
cf25bb62 14067
99870f4d 14068# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 14069__DATA__
99870f4d 14070
5beb625e
JH
14071use strict;
14072use warnings;
14073
66fd7fd0
KW
14074# If run outside the normal test suite on an ASCII platform, you can
14075# just create a latin1_to_native() function that just returns its
14076# inputs, because that's the only function used from test.pl
14077require "test.pl";
14078
37e2e78e
KW
14079# Test qr/\X/ and the \p{} regular expression constructs. This file is
14080# constructed by mktables from the tables it generates, so if mktables is
14081# buggy, this won't necessarily catch those bugs. Tests are generated for all
14082# feasible properties; a few aren't currently feasible; see
14083# is_code_point_usable() in mktables for details.
99870f4d
KW
14084
14085# Standard test packages are not used because this manipulates SIG_WARN. It
14086# exits 0 if every non-skipped test succeeded; -1 if any failed.
14087
5beb625e
JH
14088my $Tests = 0;
14089my $Fails = 0;
99870f4d 14090
99870f4d
KW
14091sub Expect($$$$) {
14092 my $expected = shift;
14093 my $ord = shift;
14094 my $regex = shift;
14095 my $warning_type = shift; # Type of warning message, like 'deprecated'
14096 # or empty if none
14097 my $line = (caller)[2];
66fd7fd0 14098 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 14099
99870f4d 14100 # Convert the code point to hex form
23e33b60 14101 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 14102
99870f4d 14103 my @tests = "";
5beb625e 14104
37e2e78e
KW
14105 # The first time through, use all warnings. If the input should generate
14106 # a warning, add another time through with them turned off
99870f4d
KW
14107 push @tests, "no warnings '$warning_type';" if $warning_type;
14108
14109 foreach my $no_warnings (@tests) {
14110
14111 # Store any warning messages instead of outputting them
14112 local $SIG{__WARN__} = $SIG{__WARN__};
14113 my $warning_message;
14114 $SIG{__WARN__} = sub { $warning_message = $_[0] };
14115
14116 $Tests++;
14117
14118 # A string eval is needed because of the 'no warnings'.
14119 # Assumes no parens in the regular expression
14120 my $result = eval "$no_warnings
14121 my \$RegObj = qr($regex);
14122 $string =~ \$RegObj ? 1 : 0";
14123 if (not defined $result) {
14124 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
14125 $Fails++;
14126 }
14127 elsif ($result ^ $expected) {
14128 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
14129 $Fails++;
14130 }
14131 elsif ($warning_message) {
14132 if (! $warning_type || ($warning_type && $no_warnings)) {
14133 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
14134 $Fails++;
14135 }
14136 else {
14137 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
14138 }
14139 }
14140 elsif ($warning_type && ! $no_warnings) {
14141 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
14142 $Fails++;
14143 }
14144 else {
14145 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
14146 }
5beb625e 14147 }
99870f4d 14148 return;
5beb625e 14149}
d73e5302 14150
99870f4d
KW
14151sub Error($) {
14152 my $regex = shift;
5beb625e 14153 $Tests++;
99870f4d 14154 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 14155 $Fails++;
99870f4d
KW
14156 my $line = (caller)[2];
14157 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 14158 }
99870f4d
KW
14159 else {
14160 my $line = (caller)[2];
14161 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
14162 }
14163 return;
5beb625e
JH
14164}
14165
37e2e78e
KW
14166# GCBTest.txt character that separates grapheme clusters
14167my $breakable_utf8 = my $breakable = chr(0xF7);
14168utf8::upgrade($breakable_utf8);
14169
14170# GCBTest.txt character that indicates that the adjoining code points are part
14171# of the same grapheme cluster
14172my $nobreak_utf8 = my $nobreak = chr(0xD7);
14173utf8::upgrade($nobreak_utf8);
14174
14175sub Test_X($) {
14176 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
14177 # Each such line is a sequence of code points given by their hex numbers,
14178 # separated by the two characters defined just before this subroutine that
14179 # indicate that either there can or cannot be a break between the adjacent
14180 # code points. If there isn't a break, that means the sequence forms an
14181 # extended grapheme cluster, which means that \X should match the whole
14182 # thing. If there is a break, \X should stop there. This is all
14183 # converted by this routine into a match:
14184 # $string =~ /(\X)/,
14185 # Each \X should match the next cluster; and that is what is checked.
14186
14187 my $template = shift;
14188
14189 my $line = (caller)[2];
14190
14191 # The line contains characters above the ASCII range, but in Latin1. It
14192 # may or may not be in utf8, and if it is, it may or may not know it. So,
14193 # convert these characters to 8 bits. If knows is in utf8, simply
14194 # downgrade.
14195 if (utf8::is_utf8($template)) {
14196 utf8::downgrade($template);
14197 } else {
14198
14199 # Otherwise, if it is in utf8, but doesn't know it, the next lines
14200 # convert the two problematic characters to their 8-bit equivalents.
14201 # If it isn't in utf8, they don't harm anything.
14202 use bytes;
14203 $template =~ s/$nobreak_utf8/$nobreak/g;
14204 $template =~ s/$breakable_utf8/$breakable/g;
14205 }
14206
14207 # Get rid of the leading and trailing breakables
14208 $template =~ s/^ \s* $breakable \s* //x;
14209 $template =~ s/ \s* $breakable \s* $ //x;
14210
14211 # And no-breaks become just a space.
14212 $template =~ s/ \s* $nobreak \s* / /xg;
14213
14214 # Split the input into segments that are breakable between them.
14215 my @segments = split /\s*$breakable\s*/, $template;
14216
14217 my $string = "";
14218 my $display_string = "";
14219 my @should_match;
14220 my @should_display;
14221
14222 # Convert the code point sequence in each segment into a Perl string of
14223 # characters
14224 foreach my $segment (@segments) {
14225 my @code_points = split /\s+/, $segment;
14226 my $this_string = "";
14227 my $this_display = "";
14228 foreach my $code_point (@code_points) {
66fd7fd0 14229 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
14230 $this_display .= "\\x{$code_point}";
14231 }
14232
14233 # The next cluster should match the string in this segment.
14234 push @should_match, $this_string;
14235 push @should_display, $this_display;
14236 $string .= $this_string;
14237 $display_string .= $this_display;
14238 }
14239
14240 # If a string can be represented in both non-ut8 and utf8, test both cases
14241 UPGRADE:
14242 for my $to_upgrade (0 .. 1) {
678f13d5 14243
37e2e78e
KW
14244 if ($to_upgrade) {
14245
14246 # If already in utf8, would just be a repeat
14247 next UPGRADE if utf8::is_utf8($string);
14248
14249 utf8::upgrade($string);
14250 }
14251
14252 # Finally, do the \X match.
14253 my @matches = $string =~ /(\X)/g;
14254
14255 # Look through each matched cluster to verify that it matches what we
14256 # expect.
14257 my $min = (@matches < @should_match) ? @matches : @should_match;
14258 for my $i (0 .. $min - 1) {
14259 $Tests++;
14260 if ($matches[$i] eq $should_match[$i]) {
14261 print "ok $Tests - ";
14262 if ($i == 0) {
14263 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
14264 } else {
14265 print "And \\X #", $i + 1,
14266 }
14267 print " correctly matched $should_display[$i]; line $line\n";
14268 } else {
14269 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
14270 unpack("U*", $matches[$i]));
14271 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
14272 $i + 1,
14273 " should have matched $should_display[$i]",
14274 " but instead matched $matches[$i]",
14275 ". Abandoning rest of line $line\n";
14276 next UPGRADE;
14277 }
14278 }
14279
14280 # And the number of matches should equal the number of expected matches.
14281 $Tests++;
14282 if (@matches == @should_match) {
14283 print "ok $Tests - Nothing was left over; line $line\n";
14284 } else {
14285 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
14286 }
14287 }
14288
14289 return;
14290}
14291
99870f4d 14292sub Finished() {
f86864ac 14293 print "1..$Tests\n";
99870f4d 14294 exit($Fails ? -1 : 0);
5beb625e 14295}
99870f4d
KW
14296
14297Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 14298Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
14299Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
14300Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 14301Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726