This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
mktables: Add # XXX so can find experimental code
[perl5.git] / lib / unicore / mktables
CommitLineData
d73e5302 1#!/usr/bin/perl -w
99870f4d
KW
2
3# !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4# Any files created or read by this program should be listed in 'mktables.lst'
5# Use -makelist to regenerate it.
6
23e33b60
KW
7# Needs 'no overloading' to run faster on miniperl. Code commented out at the
8# subroutine objaddr can be used instead to work as far back (untested) as
f998e60c
KW
9# 5.8: needs pack "U". But almost all occurrences of objaddr have been
10# removed in favor of using 'no overloading'. You also would have to go
11# through and replace occurrences like:
ffe43484 12# my $addr = do { no overloading; pack 'J', $self; }
f998e60c
KW
13# with
14# my $addr = main::objaddr $self;
6c68572b 15# (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
051df77b
NC
16# that instituted the change to main::objaddr, and subsequent commits that
17# changed 0+$self to pack 'J', $self.)
6c68572b 18
cdcef19a
KW
19my $start_time;
20BEGIN { # Get the time the script started running; do it at compiliation to
21 # get it as close as possible
22 $start_time= time;
23}
24
25
23e33b60 26require 5.010_001;
d73e5302 27use strict;
99870f4d 28use warnings;
cf25bb62 29use Carp;
99870f4d
KW
30use File::Find;
31use File::Path;
d07a55ed 32use File::Spec;
99870f4d
KW
33use Text::Tabs;
34
35sub DEBUG () { 0 } # Set to 0 for production; 1 for development
36
37##########################################################################
38#
39# mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
40# from the Unicode database files (lib/unicore/.../*.txt), It also generates
41# a pod file and a .t file
42#
43# The structure of this file is:
44# First these introductory comments; then
45# code needed for everywhere, such as debugging stuff; then
46# code to handle input parameters; then
47# data structures likely to be of external interest (some of which depend on
48# the input parameters, so follows them; then
49# more data structures and subroutine and package (class) definitions; then
50# the small actual loop to process the input files and finish up; then
51# a __DATA__ section, for the .t tests
52#
53# This program works on all releases of Unicode through at least 5.2. The
54# outputs have been scrutinized most intently for release 5.1. The others
55# have been checked for somewhat more than just sanity. It can handle all
56# existing Unicode character properties in those releases.
57#
99870f4d
KW
58# This program is mostly about Unicode character (or code point) properties.
59# A property describes some attribute or quality of a code point, like if it
60# is lowercase or not, its name, what version of Unicode it was first defined
61# in, or what its uppercase equivalent is. Unicode deals with these disparate
62# possibilities by making all properties into mappings from each code point
63# into some corresponding value. In the case of it being lowercase or not,
64# the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
65# property maps each Unicode code point to a single value, called a "property
66# value". (Hence each Unicode property is a true mathematical function with
67# exactly one value per code point.)
68#
69# When using a property in a regular expression, what is desired isn't the
70# mapping of the code point to its property's value, but the reverse (or the
71# mathematical "inverse relation"): starting with the property value, "Does a
72# code point map to it?" These are written in a "compound" form:
73# \p{property=value}, e.g., \p{category=punctuation}. This program generates
74# files containing the lists of code points that map to each such regular
75# expression property value, one file per list
76#
77# There is also a single form shortcut that Perl adds for many of the commonly
78# used properties. This happens for all binary properties, plus script,
79# general_category, and block properties.
80#
81# Thus the outputs of this program are files. There are map files, mostly in
82# the 'To' directory; and there are list files for use in regular expression
83# matching, all in subdirectories of the 'lib' directory, with each
84# subdirectory being named for the property that the lists in it are for.
85# Bookkeeping, test, and documentation files are also generated.
86
87my $matches_directory = 'lib'; # Where match (\p{}) files go.
88my $map_directory = 'To'; # Where map files go.
89
90# DATA STRUCTURES
91#
92# The major data structures of this program are Property, of course, but also
93# Table. There are two kinds of tables, very similar to each other.
94# "Match_Table" is the data structure giving the list of code points that have
95# a particular property value, mentioned above. There is also a "Map_Table"
96# data structure which gives the property's mapping from code point to value.
97# There are two structures because the match tables need to be combined in
98# various ways, such as constructing unions, intersections, complements, etc.,
99# and the map ones don't. And there would be problems, perhaps subtle, if
100# a map table were inadvertently operated on in some of those ways.
101# The use of separate classes with operations defined on one but not the other
102# prevents accidentally confusing the two.
103#
104# At the heart of each table's data structure is a "Range_List", which is just
105# an ordered list of "Ranges", plus ancillary information, and methods to
106# operate on them. A Range is a compact way to store property information.
107# Each range has a starting code point, an ending code point, and a value that
108# is meant to apply to all the code points between the two end points,
109# inclusive. For a map table, this value is the property value for those
110# code points. Two such ranges could be written like this:
111# 0x41 .. 0x5A, 'Upper',
112# 0x61 .. 0x7A, 'Lower'
113#
114# Each range also has a type used as a convenience to classify the values.
115# Most ranges in this program will be Type 0, or normal, but there are some
116# ranges that have a non-zero type. These are used only in map tables, and
117# are for mappings that don't fit into the normal scheme of things. Mappings
118# that require a hash entry to communicate with utf8.c are one example;
119# another example is mappings for charnames.pm to use which indicate a name
120# that is algorithmically determinable from its code point (and vice-versa).
121# These are used to significantly compact these tables, instead of listing
122# each one of the tens of thousands individually.
123#
124# In a match table, the value of a range is irrelevant (and hence the type as
125# well, which will always be 0), and arbitrarily set to the null string.
126# Using the example above, there would be two match tables for those two
127# entries, one named Upper would contain the 0x41..0x5A range, and the other
128# named Lower would contain 0x61..0x7A.
129#
130# Actually, there are two types of range lists, "Range_Map" is the one
131# associated with map tables, and "Range_List" with match tables.
132# Again, this is so that methods can be defined on one and not the other so as
133# to prevent operating on them in incorrect ways.
134#
135# Eventually, most tables are written out to files to be read by utf8_heavy.pl
136# in the perl core. All tables could in theory be written, but some are
137# suppressed because there is no current practical use for them. It is easy
138# to change which get written by changing various lists that are near the top
139# of the actual code in this file. The table data structures contain enough
140# ancillary information to allow them to be treated as separate entities for
141# writing, such as the path to each one's file. There is a heading in each
142# map table that gives the format of its entries, and what the map is for all
143# the code points missing from it. (This allows tables to be more compact.)
678f13d5 144#
99870f4d
KW
145# The Property data structure contains one or more tables. All properties
146# contain a map table (except the $perl property which is a
147# pseudo-property containing only match tables), and any properties that
148# are usable in regular expression matches also contain various matching
149# tables, one for each value the property can have. A binary property can
150# have two values, True and False (or Y and N, which are preferred by Unicode
151# terminology). Thus each of these properties will have a map table that
152# takes every code point and maps it to Y or N (but having ranges cuts the
153# number of entries in that table way down), and two match tables, one
154# which has a list of all the code points that map to Y, and one for all the
155# code points that map to N. (For each of these, a third table is also
156# generated for the pseudo Perl property. It contains the identical code
157# points as the Y table, but can be written, not in the compound form, but in
158# a "single" form like \p{IsUppercase}.) Many properties are binary, but some
159# properties have several possible values, some have many, and properties like
160# Name have a different value for every named code point. Those will not,
161# unless the controlling lists are changed, have their match tables written
162# out. But all the ones which can be used in regular expression \p{} and \P{}
163# constructs will. Generally a property will have either its map table or its
164# match tables written but not both. Again, what gets written is controlled
165# by lists which can easily be changed.
678f13d5 166#
99870f4d
KW
167# For information about the Unicode properties, see Unicode's UAX44 document:
168
169my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
170
171# As stated earlier, this program will work on any release of Unicode so far.
172# Most obvious problems in earlier data have NOT been corrected except when
173# necessary to make Perl or this program work reasonably. For example, no
174# folding information was given in early releases, so this program uses the
175# substitute of lower case, just so that a regular expression with the /i
176# option will do something that actually gives the right results in many
177# cases. There are also a couple other corrections for version 1.1.5,
178# commented at the point they are made. As an example of corrections that
179# weren't made (but could be) is this statement from DerivedAge.txt: "The
180# supplementary private use code points and the non-character code points were
181# assigned in version 2.0, but not specifically listed in the UCD until
182# versions 3.0 and 3.1 respectively." (To be precise it was 3.0.1 not 3.0.0)
183# More information on Unicode version glitches is further down in these
184# introductory comments.
185#
186# This program works on all properties as of 5.2, though the files for some
678f13d5
KW
187# are suppressed from apparent lack of demand for them. You can change which
188# are output by changing lists in this program.
189#
99870f4d
KW
190# The old version of mktables emphasized the term "Fuzzy" to mean Unocde's
191# loose matchings rules (from Unicode TR18):
192#
193# The recommended names for UCD properties and property values are in
194# PropertyAliases.txt [Prop] and PropertyValueAliases.txt
195# [PropValue]. There are both abbreviated names and longer, more
196# descriptive names. It is strongly recommended that both names be
197# recognized, and that loose matching of property names be used,
198# whereby the case distinctions, whitespace, hyphens, and underbar
199# are ignored.
200# The program still allows Fuzzy to override its determination of if loose
201# matching should be used, but it isn't currently used, as it is no longer
202# needed; the calculations it makes are good enough.
678f13d5 203#
99870f4d
KW
204# SUMMARY OF HOW IT WORKS:
205#
206# Process arguments
207#
208# A list is constructed containing each input file that is to be processed
209#
210# Each file on the list is processed in a loop, using the associated handler
211# code for each:
212# The PropertyAliases.txt and PropValueAliases.txt files are processed
213# first. These files name the properties and property values.
214# Objects are created of all the property and property value names
215# that the rest of the input should expect, including all synonyms.
216# The other input files give mappings from properties to property
217# values. That is, they list code points and say what the mapping
218# is under the given property. Some files give the mappings for
219# just one property; and some for many. This program goes through
220# each file and populates the properties from them. Some properties
221# are listed in more than one file, and Unicode has set up a
222# precedence as to which has priority if there is a conflict. Thus
223# the order of processing matters, and this program handles the
224# conflict possibility by processing the overriding input files
225# last, so that if necessary they replace earlier values.
226# After this is all done, the program creates the property mappings not
227# furnished by Unicode, but derivable from what it does give.
228# The tables of code points that match each property value in each
229# property that is accessible by regular expressions are created.
230# The Perl-defined properties are created and populated. Many of these
231# require data determined from the earlier steps
232# Any Perl-defined synonyms are created, and name clashes between Perl
678f13d5 233# and Unicode are reconciled and warned about.
99870f4d
KW
234# All the properties are written to files
235# Any other files are written, and final warnings issued.
678f13d5 236#
99870f4d
KW
237# For clarity, a number of operators have been overloaded to work on tables:
238# ~ means invert (take all characters not in the set). The more
239# conventional '!' is not used because of the possibility of confusing
240# it with the actual boolean operation.
241# + means union
242# - means subtraction
243# & means intersection
244# The precedence of these is the order listed. Parentheses should be
245# copiously used. These are not a general scheme. The operations aren't
246# defined for a number of things, deliberately, to avoid getting into trouble.
247# Operations are done on references and affect the underlying structures, so
248# that the copy constructors for them have been overloaded to not return a new
249# clone, but the input object itself.
678f13d5 250#
99870f4d
KW
251# The bool operator is deliberately not overloaded to avoid confusion with
252# "should it mean if the object merely exists, or also is non-empty?".
99870f4d
KW
253#
254# WHY CERTAIN DESIGN DECISIONS WERE MADE
678f13d5
KW
255#
256# This program needs to be able to run under miniperl. Therefore, it uses a
257# minimum of other modules, and hence implements some things itself that could
258# be gotten from CPAN
259#
260# This program uses inputs published by the Unicode Consortium. These can
261# change incompatibly between releases without the Perl maintainers realizing
262# it. Therefore this program is now designed to try to flag these. It looks
263# at the directories where the inputs are, and flags any unrecognized files.
264# It keeps track of all the properties in the files it handles, and flags any
265# that it doesn't know how to handle. It also flags any input lines that
266# don't match the expected syntax, among other checks.
267#
268# It is also designed so if a new input file matches one of the known
269# templates, one hopefully just needs to add it to a list to have it
270# processed.
271#
272# As mentioned earlier, some properties are given in more than one file. In
273# particular, the files in the extracted directory are supposedly just
274# reformattings of the others. But they contain information not easily
275# derivable from the other files, including results for Unihan, which this
276# program doesn't ordinarily look at, and for unassigned code points. They
277# also have historically had errors or been incomplete. In an attempt to
278# create the best possible data, this program thus processes them first to
279# glean information missing from the other files; then processes those other
280# files to override any errors in the extracted ones. Much of the design was
281# driven by this need to store things and then possibly override them.
282#
283# It tries to keep fatal errors to a minimum, to generate something usable for
284# testing purposes. It always looks for files that could be inputs, and will
285# warn about any that it doesn't know how to handle (the -q option suppresses
286# the warning).
99870f4d
KW
287#
288# Why have files written out for binary 'N' matches?
289# For binary properties, if you know the mapping for either Y or N; the
678f13d5
KW
290# other is trivial to construct, so could be done at Perl run-time by just
291# complementing the result, instead of having a file for it. That is, if
292# someone types in \p{foo: N}, Perl could translate that to \P{foo: Y} and
293# not need a file. The problem is communicating to Perl that a given
294# property is binary. Perl can't figure it out from looking at the N (or
295# No), as some non-binary properties have these as property values. So
296# rather than inventing a way to communicate this info back to the core,
297# which would have required changes there as well, it was simpler just to
298# add the extra tables.
299#
300# Why is there more than one type of range?
301# This simplified things. There are some very specialized code points that
302# have to be handled specially for output, such as Hangul syllable names.
303# By creating a range type (done late in the development process), it
304# allowed this to be stored with the range, and overridden by other input.
305# Originally these were stored in another data structure, and it became a
306# mess trying to decide if a second file that was for the same property was
307# overriding the earlier one or not.
308#
309# Why are there two kinds of tables, match and map?
310# (And there is a base class shared by the two as well.) As stated above,
311# they actually are for different things. Development proceeded much more
312# smoothly when I (khw) realized the distinction. Map tables are used to
313# give the property value for every code point (actually every code point
314# that doesn't map to a default value). Match tables are used for regular
315# expression matches, and are essentially the inverse mapping. Separating
316# the two allows more specialized methods, and error checks so that one
317# can't just take the intersection of two map tables, for example, as that
318# is nonsensical.
99870f4d
KW
319#
320# There are no match tables generated for matches of the null string. These
c1739a4a 321# would look like qr/\p{JSN=}/ currently without modifying the regex code.
678f13d5
KW
322# Perhaps something like them could be added if necessary. The JSN does have
323# a real code point U+110B that maps to the null string, but it is a
324# contributory property, and therefore not output by default. And it's easily
325# handled so far by making the null string the default where it is a
326# possibility.
99870f4d 327#
23e33b60
KW
328# DEBUGGING
329#
678f13d5
KW
330# This program is written so it will run under miniperl. Occasionally changes
331# will cause an error where the backtrace doesn't work well under miniperl.
332# To diagnose the problem, you can instead run it under regular perl, if you
333# have one compiled.
334#
335# There is a good trace facility. To enable it, first sub DEBUG must be set
336# to return true. Then a line like
337#
338# local $to_trace = 1 if main::DEBUG;
339#
340# can be added to enable tracing in its lexical scope or until you insert
341# another line:
342#
343# local $to_trace = 0 if main::DEBUG;
344#
345# then use a line like "trace $a, @b, %c, ...;
346#
347# Some of the more complex subroutines already have trace statements in them.
348# Permanent trace statements should be like:
349#
350# trace ... if main::DEBUG && $to_trace;
351#
352# If there is just one or a few files that you're debugging, you can easily
353# cause most everything else to be skipped. Change the line
354#
355# my $debug_skip = 0;
356#
357# to 1, and every file whose object is in @input_file_objects and doesn't have
358# a, 'non_skip => 1,' in its constructor will be skipped.
359#
99870f4d
KW
360# FUTURE ISSUES
361#
362# The program would break if Unicode were to change its names so that
363# interior white space, underscores, or dashes differences were significant
364# within property and property value names.
365#
366# It might be easier to use the xml versions of the UCD if this program ever
367# would need heavy revision, and the ability to handle old versions was not
368# required.
369#
370# There is the potential for name collisions, in that Perl has chosen names
371# that Unicode could decide it also likes. There have been such collisions in
372# the past, with mostly Perl deciding to adopt the Unicode definition of the
373# name. However in the 5.2 Unicode beta testing, there were a number of such
374# collisions, which were withdrawn before the final release, because of Perl's
375# and other's protests. These all involved new properties which began with
376# 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
377# many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
378# Unicode document, so they are unlikely to be used by Unicode for another
379# purpose. However, they might try something beginning with 'In', or use any
380# of the other Perl-defined properties. This program will warn you of name
381# collisions, and refuse to generate tables with them, but manual intervention
382# will be required in this event. One scheme that could be implemented, if
383# necessary, would be to have this program generate another file, or add a
384# field to mktables.lst that gives the date of first definition of a property.
385# Each new release of Unicode would use that file as a basis for the next
386# iteration. And the Perl synonym addition code could sort based on the age
387# of the property, so older properties get priority, and newer ones that clash
388# would be refused; hence existing code would not be impacted, and some other
389# synonym would have to be used for the new property. This is ugly, and
390# manual intervention would certainly be easier to do in the short run; lets
391# hope it never comes to this.
678f13d5 392#
99870f4d
KW
393# A NOTE ON UNIHAN
394#
395# This program can generate tables from the Unihan database. But it doesn't
396# by default, letting the CPAN module Unicode::Unihan handle them. Prior to
397# version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
398# database was split into 8 different files, all beginning with the letters
399# 'Unihan'. This program will read those file(s) if present, but it needs to
400# know which of the many properties in the file(s) should have tables created
401# for them. It will create tables for any properties listed in
402# PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
403# @cjk_properties array and the @cjk_property_values array. Thus, if a
404# property you want is not in those files of the release you are building
405# against, you must add it to those two arrays. Starting in 4.0, the
406# Unicode_Radical_Stroke was listed in those files, so if the Unihan database
407# is present in the directory, a table will be generated for that property.
408# In 5.2, several more properties were added. For your convenience, the two
409# arrays are initialized with all the 5.2 listed properties that are also in
410# earlier releases. But these are commented out. You can just uncomment the
411# ones you want, or use them as a template for adding entries for other
412# properties.
413#
414# You may need to adjust the entries to suit your purposes. setup_unihan(),
415# and filter_unihan_line() are the functions where this is done. This program
416# already does some adjusting to make the lines look more like the rest of the
417# Unicode DB; You can see what that is in filter_unihan_line()
418#
419# There is a bug in the 3.2 data file in which some values for the
420# kPrimaryNumeric property have commas and an unexpected comment. A filter
421# could be added for these; or for a particular installation, the Unihan.txt
422# file could be edited to fix them.
99870f4d 423#
678f13d5
KW
424# HOW TO ADD A FILE TO BE PROCESSED
425#
426# A new file from Unicode needs to have an object constructed for it in
427# @input_file_objects, probably at the end or at the end of the extracted
428# ones. The program should warn you if its name will clash with others on
429# restrictive file systems, like DOS. If so, figure out a better name, and
430# add lines to the README.perl file giving that. If the file is a character
431# property, it should be in the format that Unicode has by default
432# standardized for such files for the more recently introduced ones.
433# If so, the Input_file constructor for @input_file_objects can just be the
434# file name and release it first appeared in. If not, then it should be
435# possible to construct an each_line_handler() to massage the line into the
436# standardized form.
437#
438# For non-character properties, more code will be needed. You can look at
439# the existing entries for clues.
440#
441# UNICODE VERSIONS NOTES
442#
443# The Unicode UCD has had a number of errors in it over the versions. And
444# these remain, by policy, in the standard for that version. Therefore it is
445# risky to correct them, because code may be expecting the error. So this
446# program doesn't generally make changes, unless the error breaks the Perl
447# core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
448# for U+1105, which causes real problems for the algorithms for Jamo
449# calculations, so it is changed here.
450#
451# But it isn't so clear cut as to what to do about concepts that are
452# introduced in a later release; should they extend back to earlier releases
453# where the concept just didn't exist? It was easier to do this than to not,
454# so that's what was done. For example, the default value for code points not
455# in the files for various properties was probably undefined until changed by
456# some version. No_Block for blocks is such an example. This program will
457# assign No_Block even in Unicode versions that didn't have it. This has the
458# benefit that code being written doesn't have to special case earlier
459# versions; and the detriment that it doesn't match the Standard precisely for
460# the affected versions.
461#
462# Here are some observations about some of the issues in early versions:
463#
464# The number of code points in \p{alpha} halve in 2.1.9. It turns out that
465# the reason is that the CJK block starting at 4E00 was removed from PropList,
466# and was not put back in until 3.1.0
467#
468# Unicode introduced the synonym Space for White_Space in 4.1. Perl has
469# always had a \p{Space}. In release 3.2 only, they are not synonymous. The
470# reason is that 3.2 introduced U+205F=medium math space, which was not
471# classed as white space, but Perl figured out that it should have been. 4.0
472# reclassified it correctly.
473#
474# Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
475# this was erroneously a synonym for 202. In 4.0, ATB became 202, and ATBL
476# was left with no code points, as all the ones that mapped to 202 stayed
477# mapped to 202. Thus if your program used the numeric name for the class,
478# it would not have been affected, but if it used the mnemonic, it would have
479# been.
480#
481# \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
482# points which eventually came to have this script property value, instead
483# mapped to "Unknown". But in the next release all these code points were
484# moved to \p{sc=common} instead.
99870f4d
KW
485#
486# The default for missing code points for BidiClass is complicated. Starting
487# in 3.1.1, the derived file DBidiClass.txt handles this, but this program
488# tries to do the best it can for earlier releases. It is done in
489# process_PropertyAliases()
490#
491##############################################################################
492
493my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
494 # and errors
495my $MAX_LINE_WIDTH = 78;
496
497# Debugging aid to skip most files so as to not be distracted by them when
498# concentrating on the ones being debugged. Add
499# non_skip => 1,
500# to the constructor for those files you want processed when you set this.
501# Files with a first version number of 0 are special: they are always
502# processed regardless of the state of this flag.
503my $debug_skip = 0;
504
505# Set to 1 to enable tracing.
506our $to_trace = 0;
507
508{ # Closure for trace: debugging aid
509 my $print_caller = 1; # ? Include calling subroutine name
510 my $main_with_colon = 'main::';
511 my $main_colon_length = length($main_with_colon);
512
513 sub trace {
514 return unless $to_trace; # Do nothing if global flag not set
515
516 my @input = @_;
517
518 local $DB::trace = 0;
519 $DB::trace = 0; # Quiet 'used only once' message
520
521 my $line_number;
522
523 # Loop looking up the stack to get the first non-trace caller
524 my $caller_line;
525 my $caller_name;
526 my $i = 0;
527 do {
528 $line_number = $caller_line;
529 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
530 $caller = $main_with_colon unless defined $caller;
531
532 $caller_name = $caller;
533
534 # get rid of pkg
535 $caller_name =~ s/.*:://;
536 if (substr($caller_name, 0, $main_colon_length)
537 eq $main_with_colon)
538 {
539 $caller_name = substr($caller_name, $main_colon_length);
540 }
541
542 } until ($caller_name ne 'trace');
543
544 # If the stack was empty, we were called from the top level
545 $caller_name = 'main' if ($caller_name eq ""
546 || $caller_name eq 'trace');
547
548 my $output = "";
549 foreach my $string (@input) {
550 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
551 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
552 $output .= simple_dumper($string);
553 }
554 else {
555 $string = "$string" if ref $string;
556 $string = $UNDEF unless defined $string;
557 chomp $string;
558 $string = '""' if $string eq "";
559 $output .= " " if $output ne ""
560 && $string ne ""
561 && substr($output, -1, 1) ne " "
562 && substr($string, 0, 1) ne " ";
563 $output .= $string;
564 }
565 }
566
99f78760
KW
567 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
568 print STDERR "$caller_name: " if $print_caller;
99870f4d
KW
569 print STDERR $output, "\n";
570 return;
571 }
572}
573
574# This is for a rarely used development feature that allows you to compare two
575# versions of the Unicode standard without having to deal with changes caused
576# by the code points introduced in the later verson. Change the 0 to a SINGLE
577# dotted Unicode release number (e.g. 2.1). Only code points introduced in
578# that release and earlier will be used; later ones are thrown away. You use
579# the version number of the earliest one you want to compare; then run this
580# program on directory structures containing each release, and compare the
581# outputs. These outputs will therefore include only the code points common
582# to both releases, and you can see the changes caused just by the underlying
583# release semantic changes. For versions earlier than 3.2, you must copy a
584# version of DAge.txt into the directory.
585my $string_compare_versions = DEBUG && 0; # e.g., v2.1;
586my $compare_versions = DEBUG
587 && $string_compare_versions
588 && pack "C*", split /\./, $string_compare_versions;
589
590sub uniques {
591 # Returns non-duplicated input values. From "Perl Best Practices:
592 # Encapsulated Cleverness". p. 455 in first edition.
593
594 my %seen;
0e407844
NC
595 # Arguably this breaks encapsulation, if the goal is to permit multiple
596 # distinct objects to stringify to the same value, and be interchangeable.
597 # However, for this program, no two objects stringify identically, and all
598 # lists passed to this function are either objects or strings. So this
599 # doesn't affect correctness, but it does give a couple of percent speedup.
600 no overloading;
99870f4d
KW
601 return grep { ! $seen{$_}++ } @_;
602}
603
604$0 = File::Spec->canonpath($0);
605
606my $make_test_script = 0; # ? Should we output a test script
607my $write_unchanged_files = 0; # ? Should we update the output files even if
608 # we don't think they have changed
609my $use_directory = ""; # ? Should we chdir somewhere.
610my $pod_directory; # input directory to store the pod file.
611my $pod_file = 'perluniprops';
612my $t_path; # Path to the .t test file
613my $file_list = 'mktables.lst'; # File to store input and output file names.
614 # This is used to speed up the build, by not
615 # executing the main body of the program if
616 # nothing on the list has changed since the
617 # previous build
618my $make_list = 1; # ? Should we write $file_list. Set to always
619 # make a list so that when the pumpking is
620 # preparing a release, s/he won't have to do
621 # special things
622my $glob_list = 0; # ? Should we try to include unknown .txt files
623 # in the input.
624my $output_range_counts = 1; # ? Should we include the number of code points
625 # in ranges in the output
9ef2b94f
KW
626my $output_names = 0; # ? Should character names be in the output
627my @viacode; # Contains the 1 million character names, if
628 # $output_names is true
629
99870f4d
KW
630# Verbosity levels; 0 is quiet
631my $NORMAL_VERBOSITY = 1;
632my $PROGRESS = 2;
633my $VERBOSE = 3;
634
635my $verbosity = $NORMAL_VERBOSITY;
636
637# Process arguments
638while (@ARGV) {
cf25bb62
JH
639 my $arg = shift @ARGV;
640 if ($arg eq '-v') {
99870f4d
KW
641 $verbosity = $VERBOSE;
642 }
643 elsif ($arg eq '-p') {
644 $verbosity = $PROGRESS;
645 $| = 1; # Flush buffers as we go.
646 }
647 elsif ($arg eq '-q') {
648 $verbosity = 0;
649 }
650 elsif ($arg eq '-w') {
651 $write_unchanged_files = 1; # update the files even if havent changed
652 }
653 elsif ($arg eq '-check') {
6ae7e459
YO
654 my $this = shift @ARGV;
655 my $ok = shift @ARGV;
656 if ($this ne $ok) {
657 print "Skipping as check params are not the same.\n";
658 exit(0);
659 }
00a8df5c 660 }
99870f4d
KW
661 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
662 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
663 }
3df51b85
KW
664 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
665 {
99870f4d 666 $make_test_script = 1;
99870f4d
KW
667 }
668 elsif ($arg eq '-makelist') {
669 $make_list = 1;
670 }
671 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
672 -d $use_directory or croak "Unknown directory '$use_directory'";
673 }
674 elsif ($arg eq '-L') {
675
676 # Existence not tested until have chdir'd
677 $file_list = shift;
678 }
679 elsif ($arg eq '-globlist') {
680 $glob_list = 1;
681 }
682 elsif ($arg eq '-c') {
683 $output_range_counts = ! $output_range_counts
684 }
9ef2b94f
KW
685 elsif ($arg eq '-output_names') {
686 $output_names = 1;
687 }
99870f4d
KW
688 else {
689 my $with_c = 'with';
690 $with_c .= 'out' if $output_range_counts; # Complements the state
691 croak <<END;
692usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
693 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
694 [-check A B ]
695 -c : Output comments $with_c number of code points in ranges
696 -q : Quiet Mode: Only output serious warnings.
697 -p : Set verbosity level to normal plus show progress.
698 -v : Set Verbosity level high: Show progress and non-serious
699 warnings
700 -w : Write files regardless
701 -C dir : Change to this directory before proceeding. All relative paths
702 except those specified by the -P and -T options will be done
703 with respect to this directory.
704 -P dir : Output $pod_file file to directory 'dir'.
3df51b85 705 -T path : Create a test script as 'path'; overrides -maketest
99870f4d
KW
706 -L filelist : Use alternate 'filelist' instead of standard one
707 -globlist : Take as input all non-Test *.txt files in current and sub
708 directories
3df51b85
KW
709 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
710 overrides -T
99870f4d 711 -makelist : Rewrite the file list $file_list based on current setup
9ef2b94f
KW
712 -output_names : Output each character's name in the table files; useful for
713 doing what-ifs, looking at diffs; is slow, memory intensive,
714 resulting tables are usable but very large.
99870f4d
KW
715 -check A B : Executes $0 only if A and B are the same
716END
717 }
718}
719
720# Stores the most-recently changed file. If none have changed, can skip the
721# build
aeab6150 722my $most_recent = (stat $0)[9]; # Do this before the chdir!
99870f4d
KW
723
724# Change directories now, because need to read 'version' early.
725if ($use_directory) {
3df51b85 726 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d
KW
727 $pod_directory = File::Spec->rel2abs($pod_directory);
728 }
3df51b85 729 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
99870f4d 730 $t_path = File::Spec->rel2abs($t_path);
00a8df5c 731 }
99870f4d 732 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
3df51b85 733 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
99870f4d 734 $pod_directory = File::Spec->abs2rel($pod_directory);
02b1aeec 735 }
3df51b85 736 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
99870f4d 737 $t_path = File::Spec->abs2rel($t_path);
02b1aeec 738 }
00a8df5c
YO
739}
740
99870f4d
KW
741# Get Unicode version into regular and v-string. This is done now because
742# various tables below get populated based on it. These tables are populated
743# here to be near the top of the file, and so easily seeable by those needing
744# to modify things.
745open my $VERSION, "<", "version"
746 or croak "$0: can't open required file 'version': $!\n";
747my $string_version = <$VERSION>;
748close $VERSION;
749chomp $string_version;
750my $v_version = pack "C*", split /\./, $string_version; # v string
751
752# The following are the complete names of properties with property values that
753# are known to not match any code points in some versions of Unicode, but that
754# may change in the future so they should be matchable, hence an empty file is
755# generated for them.
756my @tables_that_may_be_empty = (
757 'Joining_Type=Left_Joining',
758 );
759push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
760push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
761push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
762 if $v_version ge v4.1.0;
763
764# The lists below are hashes, so the key is the item in the list, and the
765# value is the reason why it is in the list. This makes generation of
766# documentation easier.
767
768my %why_suppressed; # No file generated for these.
769
770# Files aren't generated for empty extraneous properties. This is arguable.
771# Extraneous properties generally come about because a property is no longer
772# used in a newer version of Unicode. If we generated a file without code
773# points, programs that used to work on that property will still execute
774# without errors. It just won't ever match (or will always match, with \P{}).
775# This means that the logic is now likely wrong. I (khw) think its better to
776# find this out by getting an error message. Just move them to the table
777# above to change this behavior
778my %why_suppress_if_empty_warn_if_not = (
779
780 # It is the only property that has ever officially been removed from the
781 # Standard. The database never contained any code points for it.
782 'Special_Case_Condition' => 'Obsolete',
783
784 # Apparently never official, but there were code points in some versions of
785 # old-style PropList.txt
786 'Non_Break' => 'Obsolete',
787);
788
789# These would normally go in the warn table just above, but they were changed
790# a long time before this program was written, so warnings about them are
791# moot.
792if ($v_version gt v3.2.0) {
793 push @tables_that_may_be_empty,
794 'Canonical_Combining_Class=Attached_Below_Left'
795}
796
797# These are listed in the Property aliases file in 5.2, but Unihan is ignored
798# unless explicitly added.
799if ($v_version ge v5.2.0) {
800 my $unihan = 'Unihan; remove from list if using Unihan';
ea25a9b2 801 foreach my $table (qw (
99870f4d
KW
802 kAccountingNumeric
803 kOtherNumeric
804 kPrimaryNumeric
805 kCompatibilityVariant
806 kIICore
807 kIRG_GSource
808 kIRG_HSource
809 kIRG_JSource
810 kIRG_KPSource
811 kIRG_MSource
812 kIRG_KSource
813 kIRG_TSource
814 kIRG_USource
815 kIRG_VSource
816 kRSUnicode
ea25a9b2 817 ))
99870f4d
KW
818 {
819 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
820 }
ca12659b
NC
821}
822
99870f4d
KW
823# Properties that this program ignores.
824my @unimplemented_properties = (
825'Unicode_Radical_Stroke' # Remove if changing to handle this one.
826);
d73e5302 827
99870f4d
KW
828# There are several types of obsolete properties defined by Unicode. These
829# must be hand-edited for every new Unicode release.
830my %why_deprecated; # Generates a deprecated warning message if used.
831my %why_stabilized; # Documentation only
832my %why_obsolete; # Documentation only
833
834{ # Closure
835 my $simple = 'Perl uses the more complete version of this property';
836 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
837
838 my $other_properties = 'other properties';
839 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
840 my $why_no_expand = "Easily computed, and yet doesn't cover the common encoding forms (UTF-16/8)",
841
842 %why_deprecated = (
843 'Grapheme_Link' => 'Deprecated by Unicode. Use ccc=vr (Canonical_Combining_Class=Virama) instead',
844 'Jamo_Short_Name' => $contributory,
845 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
846 'Other_Alphabetic' => $contributory,
847 'Other_Default_Ignorable_Code_Point' => $contributory,
848 'Other_Grapheme_Extend' => $contributory,
849 'Other_ID_Continue' => $contributory,
850 'Other_ID_Start' => $contributory,
851 'Other_Lowercase' => $contributory,
852 'Other_Math' => $contributory,
853 'Other_Uppercase' => $contributory,
854 );
855
856 %why_suppressed = (
857 # There is a lib/unicore/Decomposition.pl (used by normalize.pm) which
858 # contains the same information, but without the algorithmically
859 # determinable Hangul syllables'. This file is not published, so it's
860 # existence is not noted in the comment.
861 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize',
862
863 'ISO_Comment' => 'Apparently no demand for it, but can access it through Unicode::UCD::charinfo. Obsoleted, and code points for it removed in Unicode 5.2',
864 'Unicode_1_Name' => "$simple, and no apparent demand for it, but can access it through Unicode::UCD::charinfo. If there is no later name for a code point, then this one is used instead in charnames",
865
866 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold",
867 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
868 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
869 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo",
870
871 'Name' => "Accessible via 'use charnames;'",
872 'Name_Alias' => "Accessible via 'use charnames;'",
873
874 # These are sort of jumping the gun; deprecation is proposed for
875 # Unicode version 6.0, but they have never been exposed by Perl, and
876 # likely are soon to be deprecated, so best not to expose them.
877 FC_NFKC_Closure => 'Use NFKC_Casefold instead',
878 Expands_On_NFC => $why_no_expand,
879 Expands_On_NFD => $why_no_expand,
880 Expands_On_NFKC => $why_no_expand,
881 Expands_On_NFKD => $why_no_expand,
882 );
883
884 # The following are suppressed because they were made contributory or
885 # deprecated by Unicode before Perl ever thought about supporting them.
886 foreach my $property ('Jamo_Short_Name', 'Grapheme_Link') {
887 $why_suppressed{$property} = $why_deprecated{$property};
888 }
cf25bb62 889
99870f4d
KW
890 # Customize the message for all the 'Other_' properties
891 foreach my $property (keys %why_deprecated) {
892 next if (my $main_property = $property) !~ s/^Other_//;
893 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
894 }
895}
896
897if ($v_version ge 4.0.0) {
898 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
899}
900if ($v_version ge 5.2.0) {
901 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
902}
903
904# Probably obsolete forever
905if ($v_version ge v4.1.0) {
906 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common"';
907}
908
909# This program can create files for enumerated-like properties, such as
910# 'Numeric_Type'. This file would be the same format as for a string
911# property, with a mapping from code point to its value, so you could look up,
912# for example, the script a code point is in. But no one so far wants this
913# mapping, or they have found another way to get it since this is a new
914# feature. So no file is generated except if it is in this list.
915my @output_mapped_properties = split "\n", <<END;
916END
917
918# If you are using the Unihan database, you need to add the properties that
919# you want to extract from it to this table. For your convenience, the
920# properties in the 5.2 PropertyAliases.txt file are listed, commented out
921my @cjk_properties = split "\n", <<'END';
922#cjkAccountingNumeric; kAccountingNumeric
923#cjkOtherNumeric; kOtherNumeric
924#cjkPrimaryNumeric; kPrimaryNumeric
925#cjkCompatibilityVariant; kCompatibilityVariant
926#cjkIICore ; kIICore
927#cjkIRG_GSource; kIRG_GSource
928#cjkIRG_HSource; kIRG_HSource
929#cjkIRG_JSource; kIRG_JSource
930#cjkIRG_KPSource; kIRG_KPSource
931#cjkIRG_KSource; kIRG_KSource
932#cjkIRG_TSource; kIRG_TSource
933#cjkIRG_USource; kIRG_USource
934#cjkIRG_VSource; kIRG_VSource
935#cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
936END
937
938# Similarly for the property values. For your convenience, the lines in the
939# 5.2 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
940# '#' marks
941my @cjk_property_values = split "\n", <<'END';
942## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
943## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
944## @missing: 0000..10FFFF; cjkIICore; <none>
945## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
946## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
947## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
948## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
949## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
950## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
951## @missing: 0000..10FFFF; cjkIRG_USource; <none>
952## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
953## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
954## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
955## @missing: 0000..10FFFF; cjkRSUnicode; <none>
956END
957
958# The input files don't list every code point. Those not listed are to be
959# defaulted to some value. Below are hard-coded what those values are for
960# non-binary properties as of 5.1. Starting in 5.0, there are
961# machine-parsable comment lines in the files the give the defaults; so this
962# list shouldn't have to be extended. The claim is that all missing entries
963# for binary properties will default to 'N'. Unicode tried to change that in
964# 5.2, but the beta period produced enough protest that they backed off.
965#
966# The defaults for the fields that appear in UnicodeData.txt in this hash must
967# be in the form that it expects. The others may be synonyms.
968my $CODE_POINT = '<code point>';
969my %default_mapping = (
970 Age => "Unassigned",
971 # Bidi_Class => Complicated; set in code
972 Bidi_Mirroring_Glyph => "",
973 Block => 'No_Block',
974 Canonical_Combining_Class => 0,
975 Case_Folding => $CODE_POINT,
976 Decomposition_Mapping => $CODE_POINT,
977 Decomposition_Type => 'None',
978 East_Asian_Width => "Neutral",
979 FC_NFKC_Closure => $CODE_POINT,
980 General_Category => 'Cn',
981 Grapheme_Cluster_Break => 'Other',
982 Hangul_Syllable_Type => 'NA',
983 ISO_Comment => "",
984 Jamo_Short_Name => "",
985 Joining_Group => "No_Joining_Group",
986 # Joining_Type => Complicated; set in code
987 kIICore => 'N', # Is converted to binary
988 #Line_Break => Complicated; set in code
989 Lowercase_Mapping => $CODE_POINT,
990 Name => "",
991 Name_Alias => "",
992 NFC_QC => 'Yes',
993 NFD_QC => 'Yes',
994 NFKC_QC => 'Yes',
995 NFKD_QC => 'Yes',
996 Numeric_Type => 'None',
997 Numeric_Value => 'NaN',
998 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
999 Sentence_Break => 'Other',
1000 Simple_Case_Folding => $CODE_POINT,
1001 Simple_Lowercase_Mapping => $CODE_POINT,
1002 Simple_Titlecase_Mapping => $CODE_POINT,
1003 Simple_Uppercase_Mapping => $CODE_POINT,
1004 Titlecase_Mapping => $CODE_POINT,
1005 Unicode_1_Name => "",
1006 Unicode_Radical_Stroke => "",
1007 Uppercase_Mapping => $CODE_POINT,
1008 Word_Break => 'Other',
1009);
1010
1011# Below are files that Unicode furnishes, but this program ignores, and why
1012my %ignored_files = (
1013 'CJKRadicals.txt' => 'Unihan data',
1014 'Index.txt' => 'An index, not actual data',
1015 'NamedSqProv.txt' => 'Not officially part of the Unicode standard; Append it to NamedSequences.txt if you want to process the contents.',
1016 'NamesList.txt' => 'Just adds commentary',
1017 'NormalizationCorrections.txt' => 'Data is already in other files.',
1018 'Props.txt' => 'Adds nothing to PropList.txt; only in very early releases',
1019 'ReadMe.txt' => 'Just comments',
1020 'README.TXT' => 'Just comments',
1021 'StandardizedVariants.txt' => 'Only for glyph changes, not a Unicode character property. Does not fit into current scheme where one code point is mapped',
1022);
1023
678f13d5 1024### End of externally interesting definitions, except for @input_file_objects
99870f4d
KW
1025
1026my $HEADER=<<"EOF";
1027# !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
3df51b85
KW
1028# This file is machine-generated by $0 from the Unicode
1029# database, Version $string_version. Any changes made here will be lost!
cf25bb62
JH
1030EOF
1031
b6922eda 1032my $INTERNAL_ONLY=<<"EOF";
99870f4d
KW
1033
1034# !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
b6922eda 1035# This file is for internal use by the Perl program only. The format and even
99870f4d
KW
1036# the name or existence of this file are subject to change without notice.
1037# Don't use it directly.
1038EOF
1039
1040my $DEVELOPMENT_ONLY=<<"EOF";
1041# !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1042# This file contains information artificially constrained to code points
1043# present in Unicode release $string_compare_versions.
1044# IT CANNOT BE RELIED ON. It is for use during development only and should
23e33b60 1045# not be used for production.
b6922eda
KW
1046
1047EOF
1048
99870f4d
KW
1049my $LAST_UNICODE_CODEPOINT_STRING = "10FFFF";
1050my $LAST_UNICODE_CODEPOINT = hex $LAST_UNICODE_CODEPOINT_STRING;
1051my $MAX_UNICODE_CODEPOINTS = $LAST_UNICODE_CODEPOINT + 1;
1052
1053# Matches legal code point. 4-6 hex numbers, If there are 6, the first
1054# two must be 10; if there are 5, the first must not be a 0. Written this way
1055# to decrease backtracking
1056my $code_point_re =
1057 qr/ \b (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1058
1059# This matches the beginning of the line in the Unicode db files that give the
1060# defaults for code points not listed (i.e., missing) in the file. The code
1061# depends on this ending with a semi-colon, so it can assume it is a valid
1062# field when the line is split() by semi-colons
1063my $missing_defaults_prefix =
1064 qr/^#\s+\@missing:\s+0000\.\.$LAST_UNICODE_CODEPOINT_STRING\s*;/;
1065
1066# Property types. Unicode has more types, but these are sufficient for our
1067# purposes.
1068my $UNKNOWN = -1; # initialized to illegal value
1069my $NON_STRING = 1; # Either binary or enum
1070my $BINARY = 2;
1071my $ENUM = 3; # Include catalog
1072my $STRING = 4; # Anything else: string or misc
1073
1074# Some input files have lines that give default values for code points not
1075# contained in the file. Sometimes these should be ignored.
1076my $NO_DEFAULTS = 0; # Must evaluate to false
1077my $NOT_IGNORED = 1;
1078my $IGNORED = 2;
1079
1080# Range types. Each range has a type. Most ranges are type 0, for normal,
1081# and will appear in the main body of the tables in the output files, but
1082# there are other types of ranges as well, listed below, that are specially
1083# handled. There are pseudo-types as well that will never be stored as a
1084# type, but will affect the calculation of the type.
1085
1086# 0 is for normal, non-specials
1087my $MULTI_CP = 1; # Sequence of more than code point
1088my $HANGUL_SYLLABLE = 2;
1089my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1090my $NULL = 4; # The map is to the null string; utf8.c can't
1091 # handle these, nor is there an accepted syntax
1092 # for them in \p{} constructs
f86864ac 1093my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
99870f4d
KW
1094 # otherwise be $MULTI_CP type are instead type 0
1095
1096# process_generic_property_file() can accept certain overrides in its input.
1097# Each of these must begin AND end with $CMD_DELIM.
1098my $CMD_DELIM = "\a";
1099my $REPLACE_CMD = 'replace'; # Override the Replace
1100my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1101
1102my $NO = 0;
1103my $YES = 1;
1104
1105# Values for the Replace argument to add_range.
1106# $NO # Don't replace; add only the code points not
1107 # already present.
1108my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1109 # the comments at the subroutine definition.
1110my $UNCONDITIONALLY = 2; # Replace without conditions.
1111my $MULTIPLE = 4; # Don't replace, but add a duplicate record if
1112 # already there
1113
1114# Flags to give property statuses. The phrases are to remind maintainers that
1115# if the flag is changed, the indefinite article referring to it in the
1116# documentation may need to be as well.
1117my $NORMAL = "";
1118my $SUPPRESSED = 'z'; # The character should never actually be seen, since
1119 # it is suppressed
37e2e78e 1120my $PLACEHOLDER = 'P'; # Implies no pod entry generated
99870f4d
KW
1121my $DEPRECATED = 'D';
1122my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1123my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1124my $DISCOURAGED = 'X';
1125my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1126my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1127my $STRICTER = 'T';
1128my $a_bold_stricter = "a 'B<$STRICTER>'";
1129my $A_bold_stricter = "A 'B<$STRICTER>'";
1130my $STABILIZED = 'S';
1131my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1132my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1133my $OBSOLETE = 'O';
1134my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1135my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1136
1137my %status_past_participles = (
1138 $DISCOURAGED => 'discouraged',
1139 $SUPPRESSED => 'should never be generated',
1140 $STABILIZED => 'stabilized',
1141 $OBSOLETE => 'obsolete',
37e2e78e 1142 $DEPRECATED => 'deprecated',
99870f4d
KW
1143);
1144
f5817e0a
KW
1145# The format of the values of the tables:
1146my $EMPTY_FORMAT = "";
99870f4d
KW
1147my $BINARY_FORMAT = 'b';
1148my $DECIMAL_FORMAT = 'd';
1149my $FLOAT_FORMAT = 'f';
1150my $INTEGER_FORMAT = 'i';
1151my $HEX_FORMAT = 'x';
1152my $RATIONAL_FORMAT = 'r';
1153my $STRING_FORMAT = 's';
a14f3cb1 1154my $DECOMP_STRING_FORMAT = 'c';
99870f4d
KW
1155
1156my %map_table_formats = (
1157 $BINARY_FORMAT => 'binary',
1158 $DECIMAL_FORMAT => 'single decimal digit',
1159 $FLOAT_FORMAT => 'floating point number',
1160 $INTEGER_FORMAT => 'integer',
1161 $HEX_FORMAT => 'positive hex whole number; a code point',
1162 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1a9d544b 1163 $STRING_FORMAT => 'string',
a14f3cb1 1164 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decompostion mapping',
99870f4d
KW
1165);
1166
1167# Unicode didn't put such derived files in a separate directory at first.
1168my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1169my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1170my $AUXILIARY = 'auxiliary';
1171
1172# Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1173my %loose_to_file_of; # loosely maps table names to their respective
1174 # files
1175my %stricter_to_file_of; # same; but for stricter mapping.
1176my %nv_floating_to_rational; # maps numeric values floating point numbers to
1177 # their rational equivalent
1178my %loose_property_name_of; # Loosely maps property names to standard form
1179
1180# These constants names and values were taken from the Unicode standard,
1181# version 5.1, section 3.12. They are used in conjunction with Hangul
6e5a209b
KW
1182# syllables. The '_string' versions are so generated tables can retain the
1183# hex format, which is the more familiar value
1184my $SBase_string = "0xAC00";
1185my $SBase = CORE::hex $SBase_string;
1186my $LBase_string = "0x1100";
1187my $LBase = CORE::hex $LBase_string;
1188my $VBase_string = "0x1161";
1189my $VBase = CORE::hex $VBase_string;
1190my $TBase_string = "0x11A7";
1191my $TBase = CORE::hex $TBase_string;
99870f4d
KW
1192my $SCount = 11172;
1193my $LCount = 19;
1194my $VCount = 21;
1195my $TCount = 28;
1196my $NCount = $VCount * $TCount;
1197
1198# For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1199# with the above published constants.
1200my %Jamo;
1201my %Jamo_L; # Leading consonants
1202my %Jamo_V; # Vowels
1203my %Jamo_T; # Trailing consonants
1204
37e2e78e 1205my @backslash_X_tests; # List of tests read in for testing \X
99870f4d
KW
1206my @unhandled_properties; # Will contain a list of properties found in
1207 # the input that we didn't process.
f86864ac 1208my @match_properties; # Properties that have match tables, to be
99870f4d
KW
1209 # listed in the pod
1210my @map_properties; # Properties that get map files written
1211my @named_sequences; # NamedSequences.txt contents.
1212my %potential_files; # Generated list of all .txt files in the directory
1213 # structure so we can warn if something is being
1214 # ignored.
1215my @files_actually_output; # List of files we generated.
1216my @more_Names; # Some code point names are compound; this is used
1217 # to store the extra components of them.
1218my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1219 # the minimum before we consider it equivalent to a
1220 # candidate rational
1221my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1222
1223# These store references to certain commonly used property objects
1224my $gc;
1225my $perl;
1226my $block;
3e20195b
KW
1227my $perl_charname;
1228my $print;
99870f4d
KW
1229
1230# Are there conflicting names because of beginning with 'In_', or 'Is_'
1231my $has_In_conflicts = 0;
1232my $has_Is_conflicts = 0;
1233
1234sub internal_file_to_platform ($) {
1235 # Convert our file paths which have '/' separators to those of the
1236 # platform.
1237
1238 my $file = shift;
1239 return undef unless defined $file;
1240
1241 return File::Spec->join(split '/', $file);
d07a55ed 1242}
5beb625e 1243
99870f4d
KW
1244sub file_exists ($) { # platform independent '-e'. This program internally
1245 # uses slash as a path separator.
1246 my $file = shift;
1247 return 0 if ! defined $file;
1248 return -e internal_file_to_platform($file);
1249}
5beb625e 1250
99870f4d 1251sub objaddr($) {
23e33b60
KW
1252 # Returns the address of the blessed input object.
1253 # It doesn't check for blessedness because that would do a string eval
1254 # every call, and the program is structured so that this is never called
1255 # for a non-blessed object.
99870f4d 1256
23e33b60 1257 no overloading; # If overloaded, numifying below won't work.
99870f4d
KW
1258
1259 # Numifying a ref gives its address.
051df77b 1260 return pack 'J', $_[0];
99870f4d
KW
1261}
1262
23e33b60
KW
1263# Commented code below should work on Perl 5.8.
1264## This 'require' doesn't necessarily work in miniperl, and even if it does,
1265## the native perl version of it (which is what would operate under miniperl)
1266## is extremely slow, as it does a string eval every call.
1267#my $has_fast_scalar_util = $\18 !~ /miniperl/
1268# && defined eval "require Scalar::Util";
1269#
1270#sub objaddr($) {
1271# # Returns the address of the blessed input object. Uses the XS version if
1272# # available. It doesn't check for blessedness because that would do a
1273# # string eval every call, and the program is structured so that this is
1274# # never called for a non-blessed object.
1275#
1276# return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1277#
1278# # Check at least that is a ref.
1279# my $pkg = ref($_[0]) or return undef;
1280#
1281# # Change to a fake package to defeat any overloaded stringify
1282# bless $_[0], 'main::Fake';
1283#
1284# # Numifying a ref gives its address.
051df77b 1285# my $addr = pack 'J', $_[0];
23e33b60
KW
1286#
1287# # Return to original class
1288# bless $_[0], $pkg;
1289# return $addr;
1290#}
1291
99870f4d
KW
1292sub max ($$) {
1293 my $a = shift;
1294 my $b = shift;
1295 return $a if $a >= $b;
1296 return $b;
1297}
1298
1299sub min ($$) {
1300 my $a = shift;
1301 my $b = shift;
1302 return $a if $a <= $b;
1303 return $b;
1304}
1305
1306sub clarify_number ($) {
1307 # This returns the input number with underscores inserted every 3 digits
1308 # in large (5 digits or more) numbers. Input must be entirely digits, not
1309 # checked.
1310
1311 my $number = shift;
1312 my $pos = length($number) - 3;
1313 return $number if $pos <= 1;
1314 while ($pos > 0) {
1315 substr($number, $pos, 0) = '_';
1316 $pos -= 3;
5beb625e 1317 }
99870f4d 1318 return $number;
99598c8c
JH
1319}
1320
12ac2576 1321
99870f4d 1322package Carp;
7ebf06b3 1323
99870f4d
KW
1324# These routines give a uniform treatment of messages in this program. They
1325# are placed in the Carp package to cause the stack trace to not include them,
1326# although an alternative would be to use another package and set @CARP_NOT
1327# for it.
12ac2576 1328
99870f4d 1329our $Verbose = 1 if main::DEBUG; # Useful info when debugging
12ac2576 1330
99f78760
KW
1331# This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1332# and overload trying to load Scalar:Util under miniperl. See
1333# http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1334undef $overload::VERSION;
1335
99870f4d
KW
1336sub my_carp {
1337 my $message = shift || "";
1338 my $nofold = shift || 0;
7ebf06b3 1339
99870f4d
KW
1340 if ($message) {
1341 $message = main::join_lines($message);
1342 $message =~ s/^$0: *//; # Remove initial program name
1343 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1344 $message = "\n$0: $message;";
12ac2576 1345
99870f4d
KW
1346 # Fold the message with program name, semi-colon end punctuation
1347 # (which looks good with the message that carp appends to it), and a
1348 # hanging indent for continuation lines.
1349 $message = main::simple_fold($message, "", 4) unless $nofold;
1350 $message =~ s/\n$//; # Remove the trailing nl so what carp
1351 # appends is to the same line
1352 }
12ac2576 1353
99870f4d 1354 return $message if defined wantarray; # If a caller just wants the msg
12ac2576 1355
99870f4d
KW
1356 carp $message;
1357 return;
1358}
7ebf06b3 1359
99870f4d
KW
1360sub my_carp_bug {
1361 # This is called when it is clear that the problem is caused by a bug in
1362 # this program.
7ebf06b3 1363
99870f4d
KW
1364 my $message = shift;
1365 $message =~ s/^$0: *//;
1366 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1367 carp $message;
1368 return;
1369}
7ebf06b3 1370
99870f4d
KW
1371sub carp_too_few_args {
1372 if (@_ != 2) {
1373 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1374 return;
12ac2576 1375 }
7ebf06b3 1376
99870f4d
KW
1377 my $args_ref = shift;
1378 my $count = shift;
7ebf06b3 1379
99870f4d
KW
1380 my_carp_bug("Need at least $count arguments to "
1381 . (caller 1)[3]
1382 . ". Instead got: '"
1383 . join ', ', @$args_ref
1384 . "'. No action taken.");
1385 return;
12ac2576
JP
1386}
1387
99870f4d
KW
1388sub carp_extra_args {
1389 my $args_ref = shift;
1390 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
12ac2576 1391
99870f4d
KW
1392 unless (ref $args_ref) {
1393 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1394 return;
1395 }
1396 my ($package, $file, $line) = caller;
1397 my $subroutine = (caller 1)[3];
cf25bb62 1398
99870f4d
KW
1399 my $list;
1400 if (ref $args_ref eq 'HASH') {
1401 foreach my $key (keys %$args_ref) {
1402 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
cf25bb62 1403 }
99870f4d 1404 $list = join ', ', each %{$args_ref};
cf25bb62 1405 }
99870f4d
KW
1406 elsif (ref $args_ref eq 'ARRAY') {
1407 foreach my $arg (@$args_ref) {
1408 $arg = $UNDEF unless defined $arg;
1409 }
1410 $list = join ', ', @$args_ref;
1411 }
1412 else {
1413 my_carp_bug("Can't cope with ref "
1414 . ref($args_ref)
1415 . " . argument to 'carp_extra_args'. Not checking arguments.");
1416 return;
1417 }
1418
1419 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1420 return;
d73e5302
JH
1421}
1422
99870f4d
KW
1423package main;
1424
1425{ # Closure
1426
1427 # This program uses the inside-out method for objects, as recommended in
1428 # "Perl Best Practices". This closure aids in generating those. There
1429 # are two routines. setup_package() is called once per package to set
1430 # things up, and then set_access() is called for each hash representing a
1431 # field in the object. These routines arrange for the object to be
1432 # properly destroyed when no longer used, and for standard accessor
1433 # functions to be generated. If you need more complex accessors, just
1434 # write your own and leave those accesses out of the call to set_access().
1435 # More details below.
1436
1437 my %constructor_fields; # fields that are to be used in constructors; see
1438 # below
1439
1440 # The values of this hash will be the package names as keys to other
1441 # hashes containing the name of each field in the package as keys, and
1442 # references to their respective hashes as values.
1443 my %package_fields;
1444
1445 sub setup_package {
1446 # Sets up the package, creating standard DESTROY and dump methods
1447 # (unless already defined). The dump method is used in debugging by
1448 # simple_dumper().
1449 # The optional parameters are:
1450 # a) a reference to a hash, that gets populated by later
1451 # set_access() calls with one of the accesses being
1452 # 'constructor'. The caller can then refer to this, but it is
1453 # not otherwise used by these two routines.
1454 # b) a reference to a callback routine to call during destruction
1455 # of the object, before any fields are actually destroyed
1456
1457 my %args = @_;
1458 my $constructor_ref = delete $args{'Constructor_Fields'};
1459 my $destroy_callback = delete $args{'Destroy_Callback'};
1460 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1461
1462 my %fields;
1463 my $package = (caller)[0];
1464
1465 $package_fields{$package} = \%fields;
1466 $constructor_fields{$package} = $constructor_ref;
1467
1468 unless ($package->can('DESTROY')) {
1469 my $destroy_name = "${package}::DESTROY";
1470 no strict "refs";
1471
1472 # Use typeglob to give the anonymous subroutine the name we want
1473 *$destroy_name = sub {
1474 my $self = shift;
ffe43484 1475 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1476
1477 $self->$destroy_callback if $destroy_callback;
1478 foreach my $field (keys %{$package_fields{$package}}) {
1479 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1480 delete $package_fields{$package}{$field}{$addr};
1481 }
1482 return;
1483 }
1484 }
1485
1486 unless ($package->can('dump')) {
1487 my $dump_name = "${package}::dump";
1488 no strict "refs";
1489 *$dump_name = sub {
1490 my $self = shift;
1491 return dump_inside_out($self, $package_fields{$package}, @_);
1492 }
1493 }
1494 return;
1495 }
1496
1497 sub set_access {
1498 # Arrange for the input field to be garbage collected when no longer
1499 # needed. Also, creates standard accessor functions for the field
1500 # based on the optional parameters-- none if none of these parameters:
1501 # 'addable' creates an 'add_NAME()' accessor function.
1502 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1503 # function.
1504 # 'settable' creates a 'set_NAME()' accessor function.
1505 # 'constructor' doesn't create an accessor function, but adds the
1506 # field to the hash that was previously passed to
1507 # setup_package();
1508 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1509 # 'add' etc. all mean 'addable'.
1510 # The read accessor function will work on both array and scalar
1511 # values. If another accessor in the parameter list is 'a', the read
1512 # access assumes an array. You can also force it to be array access
1513 # by specifying 'readable_array' instead of 'readable'
1514 #
1515 # A sort-of 'protected' access can be set-up by preceding the addable,
1516 # readable or settable with some initial portion of 'protected_' (but,
1517 # the underscore is required), like 'p_a', 'pro_set', etc. The
1518 # "protection" is only by convention. All that happens is that the
1519 # accessor functions' names begin with an underscore. So instead of
1520 # calling set_foo, the call is _set_foo. (Real protection could be
c1739a4a 1521 # accomplished by having a new subroutine, end_package, called at the
99870f4d
KW
1522 # end of each package, and then storing the __LINE__ ranges and
1523 # checking them on every accessor. But that is way overkill.)
1524
1525 # We create anonymous subroutines as the accessors and then use
1526 # typeglobs to assign them to the proper package and name
1527
1528 my $name = shift; # Name of the field
1529 my $field = shift; # Reference to the inside-out hash containing the
1530 # field
1531
1532 my $package = (caller)[0];
1533
1534 if (! exists $package_fields{$package}) {
1535 croak "$0: Must call 'setup_package' before 'set_access'";
1536 }
d73e5302 1537
99870f4d
KW
1538 # Stash the field so DESTROY can get it.
1539 $package_fields{$package}{$name} = $field;
cf25bb62 1540
99870f4d
KW
1541 # Remaining arguments are the accessors. For each...
1542 foreach my $access (@_) {
1543 my $access = lc $access;
cf25bb62 1544
99870f4d 1545 my $protected = "";
cf25bb62 1546
99870f4d
KW
1547 # Match the input as far as it goes.
1548 if ($access =~ /^(p[^_]*)_/) {
1549 $protected = $1;
1550 if (substr('protected_', 0, length $protected)
1551 eq $protected)
1552 {
1553
1554 # Add 1 for the underscore not included in $protected
1555 $access = substr($access, length($protected) + 1);
1556 $protected = '_';
1557 }
1558 else {
1559 $protected = "";
1560 }
1561 }
1562
1563 if (substr('addable', 0, length $access) eq $access) {
1564 my $subname = "${package}::${protected}add_$name";
1565 no strict "refs";
1566
1567 # add_ accessor. Don't add if already there, which we
1568 # determine using 'eq' for scalars and '==' otherwise.
1569 *$subname = sub {
1570 use strict "refs";
1571 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1572 my $self = shift;
1573 my $value = shift;
ffe43484 1574 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1575 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1576 if (ref $value) {
f998e60c 1577 return if grep { $value == $_ } @{$field->{$addr}};
99870f4d
KW
1578 }
1579 else {
f998e60c 1580 return if grep { $value eq $_ } @{$field->{$addr}};
99870f4d 1581 }
f998e60c 1582 push @{$field->{$addr}}, $value;
99870f4d
KW
1583 return;
1584 }
1585 }
1586 elsif (substr('constructor', 0, length $access) eq $access) {
1587 if ($protected) {
1588 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1589 }
1590 else {
1591 $constructor_fields{$package}{$name} = $field;
1592 }
1593 }
1594 elsif (substr('readable_array', 0, length $access) eq $access) {
1595
1596 # Here has read access. If one of the other parameters for
1597 # access is array, or this one specifies array (by being more
1598 # than just 'readable_'), then create a subroutine that
1599 # assumes the data is an array. Otherwise just a scalar
1600 my $subname = "${package}::${protected}$name";
1601 if (grep { /^a/i } @_
1602 or length($access) > length('readable_'))
1603 {
1604 no strict "refs";
1605 *$subname = sub {
1606 use strict "refs";
23e33b60 1607 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
ffe43484 1608 my $addr = do { no overloading; pack 'J', $_[0]; };
99870f4d
KW
1609 if (ref $field->{$addr} ne 'ARRAY') {
1610 my $type = ref $field->{$addr};
1611 $type = 'scalar' unless $type;
1612 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1613 return;
1614 }
1615 return scalar @{$field->{$addr}} unless wantarray;
1616
1617 # Make a copy; had problems with caller modifying the
1618 # original otherwise
1619 my @return = @{$field->{$addr}};
1620 return @return;
1621 }
1622 }
1623 else {
1624
1625 # Here not an array value, a simpler function.
1626 no strict "refs";
1627 *$subname = sub {
1628 use strict "refs";
23e33b60 1629 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
f998e60c 1630 no overloading;
051df77b 1631 return $field->{pack 'J', $_[0]};
99870f4d
KW
1632 }
1633 }
1634 }
1635 elsif (substr('settable', 0, length $access) eq $access) {
1636 my $subname = "${package}::${protected}set_$name";
1637 no strict "refs";
1638 *$subname = sub {
1639 use strict "refs";
23e33b60
KW
1640 if (main::DEBUG) {
1641 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
1642 Carp::carp_extra_args(\@_) if @_ > 2;
1643 }
1644 # $self is $_[0]; $value is $_[1]
f998e60c 1645 no overloading;
051df77b 1646 $field->{pack 'J', $_[0]} = $_[1];
99870f4d
KW
1647 return;
1648 }
1649 }
1650 else {
1651 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
1652 }
cf25bb62 1653 }
99870f4d 1654 return;
cf25bb62 1655 }
99870f4d
KW
1656}
1657
1658package Input_file;
1659
1660# All input files use this object, which stores various attributes about them,
1661# and provides for convenient, uniform handling. The run method wraps the
1662# processing. It handles all the bookkeeping of opening, reading, and closing
1663# the file, returning only significant input lines.
1664#
1665# Each object gets a handler which processes the body of the file, and is
1666# called by run(). Most should use the generic, default handler, which has
1667# code scrubbed to handle things you might not expect. A handler should
1668# basically be a while(next_line()) {...} loop.
1669#
1670# You can also set up handlers to
1671# 1) call before the first line is read for pre processing
1672# 2) call to adjust each line of the input before the main handler gets them
1673# 3) call upon EOF before the main handler exits its loop
1674# 4) call at the end for post processing
1675#
1676# $_ is used to store the input line, and is to be filtered by the
1677# each_line_handler()s. So, if the format of the line is not in the desired
1678# format for the main handler, these are used to do that adjusting. They can
1679# be stacked (by enclosing them in an [ anonymous array ] in the constructor,
1680# so the $_ output of one is used as the input to the next. None of the other
1681# handlers are stackable, but could easily be changed to be so.
1682#
1683# Most of the handlers can call insert_lines() or insert_adjusted_lines()
1684# which insert the parameters as lines to be processed before the next input
1685# file line is read. This allows the EOF handler to flush buffers, for
1686# example. The difference between the two routines is that the lines inserted
1687# by insert_lines() are subjected to the each_line_handler()s. (So if you
1688# called it from such a handler, you would get infinite recursion.) Lines
1689# inserted by insert_adjusted_lines() go directly to the main handler without
1690# any adjustments. If the post-processing handler calls any of these, there
1691# will be no effect. Some error checking for these conditions could be added,
1692# but it hasn't been done.
1693#
1694# carp_bad_line() should be called to warn of bad input lines, which clears $_
1695# to prevent further processing of the line. This routine will output the
1696# message as a warning once, and then keep a count of the lines that have the
1697# same message, and output that count at the end of the file's processing.
1698# This keeps the number of messages down to a manageable amount.
1699#
1700# get_missings() should be called to retrieve any @missing input lines.
1701# Messages will be raised if this isn't done if the options aren't to ignore
1702# missings.
1703
1704sub trace { return main::trace(@_); }
1705
99870f4d
KW
1706{ # Closure
1707 # Keep track of fields that are to be put into the constructor.
1708 my %constructor_fields;
1709
1710 main::setup_package(Constructor_Fields => \%constructor_fields);
1711
1712 my %file; # Input file name, required
1713 main::set_access('file', \%file, qw{ c r });
1714
1715 my %first_released; # Unicode version file was first released in, required
1716 main::set_access('first_released', \%first_released, qw{ c r });
1717
1718 my %handler; # Subroutine to process the input file, defaults to
1719 # 'process_generic_property_file'
1720 main::set_access('handler', \%handler, qw{ c });
1721
1722 my %property;
1723 # name of property this file is for. defaults to none, meaning not
1724 # applicable, or is otherwise determinable, for example, from each line.
1725 main::set_access('property', \%property, qw{ c });
1726
1727 my %optional;
1728 # If this is true, the file is optional. If not present, no warning is
1729 # output. If it is present, the string given by this parameter is
1730 # evaluated, and if false the file is not processed.
1731 main::set_access('optional', \%optional, 'c', 'r');
1732
1733 my %non_skip;
1734 # This is used for debugging, to skip processing of all but a few input
1735 # files. Add 'non_skip => 1' to the constructor for those files you want
1736 # processed when you set the $debug_skip global.
1737 main::set_access('non_skip', \%non_skip, 'c');
1738
37e2e78e
KW
1739 my %skip;
1740 # This is used to skip processing of this input file semi-permanently.
1741 # It is used for files that we aren't planning to process anytime soon,
1742 # but want to allow to be in the directory and not raise a message that we
1743 # are not handling. Mostly for test files. This is in contrast to the
1744 # non_skip element, which is supposed to be used very temporarily for
1745 # debugging. Sets 'optional' to 1
1746 main::set_access('skip', \%skip, 'c');
1747
99870f4d
KW
1748 my %each_line_handler;
1749 # list of subroutines to look at and filter each non-comment line in the
1750 # file. defaults to none. The subroutines are called in order, each is
1751 # to adjust $_ for the next one, and the final one adjusts it for
1752 # 'handler'
1753 main::set_access('each_line_handler', \%each_line_handler, 'c');
1754
1755 my %has_missings_defaults;
1756 # ? Are there lines in the file giving default values for code points
1757 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
1758 # the norm, but IGNORED means it has such lines, but the handler doesn't
1759 # use them. Having these three states allows us to catch changes to the
1760 # UCD that this program should track
1761 main::set_access('has_missings_defaults',
1762 \%has_missings_defaults, qw{ c r });
1763
1764 my %pre_handler;
1765 # Subroutine to call before doing anything else in the file. If undef, no
1766 # such handler is called.
1767 main::set_access('pre_handler', \%pre_handler, qw{ c });
1768
1769 my %eof_handler;
1770 # Subroutine to call upon getting an EOF on the input file, but before
1771 # that is returned to the main handler. This is to allow buffers to be
1772 # flushed. The handler is expected to call insert_lines() or
1773 # insert_adjusted() with the buffered material
1774 main::set_access('eof_handler', \%eof_handler, qw{ c r });
1775
1776 my %post_handler;
1777 # Subroutine to call after all the lines of the file are read in and
1778 # processed. If undef, no such handler is called.
1779 main::set_access('post_handler', \%post_handler, qw{ c });
1780
1781 my %progress_message;
1782 # Message to print to display progress in lieu of the standard one
1783 main::set_access('progress_message', \%progress_message, qw{ c });
1784
1785 my %handle;
1786 # cache open file handle, internal. Is undef if file hasn't been
1787 # processed at all, empty if has;
1788 main::set_access('handle', \%handle);
1789
1790 my %added_lines;
1791 # cache of lines added virtually to the file, internal
1792 main::set_access('added_lines', \%added_lines);
1793
1794 my %errors;
1795 # cache of errors found, internal
1796 main::set_access('errors', \%errors);
1797
1798 my %missings;
1799 # storage of '@missing' defaults lines
1800 main::set_access('missings', \%missings);
1801
1802 sub new {
1803 my $class = shift;
1804
1805 my $self = bless \do{ my $anonymous_scalar }, $class;
ffe43484 1806 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
1807
1808 # Set defaults
1809 $handler{$addr} = \&main::process_generic_property_file;
1810 $non_skip{$addr} = 0;
37e2e78e 1811 $skip{$addr} = 0;
99870f4d
KW
1812 $has_missings_defaults{$addr} = $NO_DEFAULTS;
1813 $handle{$addr} = undef;
1814 $added_lines{$addr} = [ ];
1815 $each_line_handler{$addr} = [ ];
1816 $errors{$addr} = { };
1817 $missings{$addr} = [ ];
1818
1819 # Two positional parameters.
99f78760 1820 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
99870f4d
KW
1821 $file{$addr} = main::internal_file_to_platform(shift);
1822 $first_released{$addr} = shift;
1823
1824 # The rest of the arguments are key => value pairs
1825 # %constructor_fields has been set up earlier to list all possible
1826 # ones. Either set or push, depending on how the default has been set
1827 # up just above.
1828 my %args = @_;
1829 foreach my $key (keys %args) {
1830 my $argument = $args{$key};
1831
1832 # Note that the fields are the lower case of the constructor keys
1833 my $hash = $constructor_fields{lc $key};
1834 if (! defined $hash) {
1835 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
1836 next;
1837 }
1838 if (ref $hash->{$addr} eq 'ARRAY') {
1839 if (ref $argument eq 'ARRAY') {
1840 foreach my $argument (@{$argument}) {
1841 next if ! defined $argument;
1842 push @{$hash->{$addr}}, $argument;
1843 }
1844 }
1845 else {
1846 push @{$hash->{$addr}}, $argument if defined $argument;
1847 }
1848 }
1849 else {
1850 $hash->{$addr} = $argument;
1851 }
1852 delete $args{$key};
1853 };
1854
1855 # If the file has a property for it, it means that the property is not
1856 # listed in the file's entries. So add a handler to the list of line
1857 # handlers to insert the property name into the lines, to provide a
1858 # uniform interface to the final processing subroutine.
1859 # the final code doesn't have to worry about that.
1860 if ($property{$addr}) {
1861 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
1862 }
1863
1864 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
1865 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
a3a8c5f0 1866 }
99870f4d 1867
37e2e78e
KW
1868 $optional{$addr} = 1 if $skip{$addr};
1869
99870f4d 1870 return $self;
d73e5302
JH
1871 }
1872
cf25bb62 1873
99870f4d
KW
1874 use overload
1875 fallback => 0,
1876 qw("") => "_operator_stringify",
1877 "." => \&main::_operator_dot,
1878 ;
cf25bb62 1879
99870f4d
KW
1880 sub _operator_stringify {
1881 my $self = shift;
cf25bb62 1882
99870f4d 1883 return __PACKAGE__ . " object for " . $self->file;
d73e5302 1884 }
d73e5302 1885
99870f4d
KW
1886 # flag to make sure extracted files are processed early
1887 my $seen_non_extracted_non_age = 0;
d73e5302 1888
99870f4d
KW
1889 sub run {
1890 # Process the input object $self. This opens and closes the file and
1891 # calls all the handlers for it. Currently, this can only be called
1892 # once per file, as it destroy's the EOF handler
d73e5302 1893
99870f4d
KW
1894 my $self = shift;
1895 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
b6922eda 1896
ffe43484 1897 my $addr = do { no overloading; pack 'J', $self; };
b6922eda 1898
99870f4d 1899 my $file = $file{$addr};
d73e5302 1900
99870f4d
KW
1901 # Don't process if not expecting this file (because released later
1902 # than this Unicode version), and isn't there. This means if someone
1903 # copies it into an earlier version's directory, we will go ahead and
1904 # process it.
1905 return if $first_released{$addr} gt $v_version && ! -e $file;
1906
1907 # If in debugging mode and this file doesn't have the non-skip
1908 # flag set, and isn't one of the critical files, skip it.
1909 if ($debug_skip
1910 && $first_released{$addr} ne v0
1911 && ! $non_skip{$addr})
1912 {
1913 print "Skipping $file in debugging\n" if $verbosity;
1914 return;
1915 }
1916
1917 # File could be optional
37e2e78e 1918 if ($optional{$addr}) {
99870f4d
KW
1919 return unless -e $file;
1920 my $result = eval $optional{$addr};
1921 if (! defined $result) {
1922 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
1923 return;
1924 }
1925 if (! $result) {
1926 if ($verbosity) {
1927 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
1928 }
1929 return;
1930 }
1931 }
1932
1933 if (! defined $file || ! -e $file) {
1934
1935 # If the file doesn't exist, see if have internal data for it
1936 # (based on first_released being 0).
1937 if ($first_released{$addr} eq v0) {
1938 $handle{$addr} = 'pretend_is_open';
1939 }
1940 else {
1941 if (! $optional{$addr} # File could be optional
1942 && $v_version ge $first_released{$addr})
1943 {
1944 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
1945 }
1946 return;
1947 }
1948 }
1949 else {
1950
37e2e78e
KW
1951 # Here, the file exists. Some platforms may change the case of
1952 # its name
99870f4d 1953 if ($seen_non_extracted_non_age) {
517956bf 1954 if ($file =~ /$EXTRACTED/i) {
99870f4d 1955 Carp::my_carp_bug(join_lines(<<END
99f78760 1956$file should be processed just after the 'Prop...Alias' files, and before
99870f4d
KW
1957anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
1958have subtle problems
1959END
1960 ));
1961 }
1962 }
1963 elsif ($EXTRACTED_DIR
1964 && $first_released{$addr} ne v0
517956bf
CB
1965 && $file !~ /$EXTRACTED/i
1966 && lc($file) ne 'dage.txt')
99870f4d
KW
1967 {
1968 # We don't set this (by the 'if' above) if we have no
1969 # extracted directory, so if running on an early version,
1970 # this test won't work. Not worth worrying about.
1971 $seen_non_extracted_non_age = 1;
1972 }
1973
1974 # And mark the file as having being processed, and warn if it
1975 # isn't a file we are expecting. As we process the files,
1976 # they are deleted from the hash, so any that remain at the
1977 # end of the program are files that we didn't process.
517956bf
CB
1978 my $fkey = File::Spec->rel2abs($file);
1979 my $expecting = delete $potential_files{$fkey};
1980 $expecting = delete $potential_files{lc($fkey)} unless defined $expecting;
678f13d5
KW
1981 Carp::my_carp("Was not expecting '$file'.") if
1982 ! $expecting
99870f4d
KW
1983 && ! defined $handle{$addr};
1984
37e2e78e
KW
1985 # Having deleted from expected files, we can quit if not to do
1986 # anything. Don't print progress unless really want verbosity
1987 if ($skip{$addr}) {
1988 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
1989 return;
1990 }
1991
99870f4d
KW
1992 # Open the file, converting the slashes used in this program
1993 # into the proper form for the OS
1994 my $file_handle;
1995 if (not open $file_handle, "<", $file) {
1996 Carp::my_carp("Can't open $file. Skipping: $!");
1997 return 0;
1998 }
1999 $handle{$addr} = $file_handle; # Cache the open file handle
2000 }
2001
2002 if ($verbosity >= $PROGRESS) {
2003 if ($progress_message{$addr}) {
2004 print "$progress_message{$addr}\n";
2005 }
2006 else {
2007 # If using a virtual file, say so.
2008 print "Processing ", (-e $file)
2009 ? $file
2010 : "substitute $file",
2011 "\n";
2012 }
2013 }
2014
2015
2016 # Call any special handler for before the file.
2017 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2018
2019 # Then the main handler
2020 &{$handler{$addr}}($self);
2021
2022 # Then any special post-file handler.
2023 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2024
2025 # If any errors have been accumulated, output the counts (as the first
2026 # error message in each class was output when it was encountered).
2027 if ($errors{$addr}) {
2028 my $total = 0;
2029 my $types = 0;
2030 foreach my $error (keys %{$errors{$addr}}) {
2031 $total += $errors{$addr}->{$error};
2032 delete $errors{$addr}->{$error};
2033 $types++;
2034 }
2035 if ($total > 1) {
2036 my $message
2037 = "A total of $total lines had errors in $file. ";
2038
2039 $message .= ($types == 1)
2040 ? '(Only the first one was displayed.)'
2041 : '(Only the first of each type was displayed.)';
2042 Carp::my_carp($message);
2043 }
2044 }
2045
2046 if (@{$missings{$addr}}) {
2047 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2048 }
2049
2050 # If a real file handle, close it.
2051 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2052 ref $handle{$addr};
2053 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2054 # the file, as opposed to undef
2055 return;
2056 }
2057
2058 sub next_line {
2059 # Sets $_ to be the next logical input line, if any. Returns non-zero
2060 # if such a line exists. 'logical' means that any lines that have
2061 # been added via insert_lines() will be returned in $_ before the file
2062 # is read again.
2063
2064 my $self = shift;
2065 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2066
ffe43484 2067 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2068
2069 # Here the file is open (or if the handle is not a ref, is an open
2070 # 'virtual' file). Get the next line; any inserted lines get priority
2071 # over the file itself.
2072 my $adjusted;
2073
2074 LINE:
2075 while (1) { # Loop until find non-comment, non-empty line
2076 #local $to_trace = 1 if main::DEBUG;
2077 my $inserted_ref = shift @{$added_lines{$addr}};
2078 if (defined $inserted_ref) {
2079 ($adjusted, $_) = @{$inserted_ref};
2080 trace $adjusted, $_ if main::DEBUG && $to_trace;
2081 return 1 if $adjusted;
2082 }
2083 else {
2084 last if ! ref $handle{$addr}; # Don't read unless is real file
2085 last if ! defined ($_ = readline $handle{$addr});
2086 }
2087 chomp;
2088 trace $_ if main::DEBUG && $to_trace;
2089
2090 # See if this line is the comment line that defines what property
2091 # value that code points that are not listed in the file should
2092 # have. The format or existence of these lines is not guaranteed
2093 # by Unicode since they are comments, but the documentation says
2094 # that this was added for machine-readability, so probably won't
2095 # change. This works starting in Unicode Version 5.0. They look
2096 # like:
2097 #
2098 # @missing: 0000..10FFFF; Not_Reordered
2099 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2100 # @missing: 0000..10FFFF; ; NaN
2101 #
2102 # Save the line for a later get_missings() call.
2103 if (/$missing_defaults_prefix/) {
2104 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2105 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2106 }
2107 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2108 my @defaults = split /\s* ; \s*/x, $_;
2109
2110 # The first field is the @missing, which ends in a
2111 # semi-colon, so can safely shift.
2112 shift @defaults;
2113
2114 # Some of these lines may have empty field placeholders
2115 # which get in the way. An example is:
2116 # @missing: 0000..10FFFF; ; NaN
2117 # Remove them. Process starting from the top so the
2118 # splice doesn't affect things still to be looked at.
2119 for (my $i = @defaults - 1; $i >= 0; $i--) {
2120 next if $defaults[$i] ne "";
2121 splice @defaults, $i, 1;
2122 }
2123
2124 # What's left should be just the property (maybe) and the
2125 # default. Having only one element means it doesn't have
2126 # the property.
2127 my $default;
2128 my $property;
2129 if (@defaults >= 1) {
2130 if (@defaults == 1) {
2131 $default = $defaults[0];
2132 }
2133 else {
2134 $property = $defaults[0];
2135 $default = $defaults[1];
2136 }
2137 }
2138
2139 if (@defaults < 1
2140 || @defaults > 2
2141 || ($default =~ /^</
2142 && $default !~ /^<code *point>$/i
2143 && $default !~ /^<none>$/i))
2144 {
2145 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2146 }
2147 else {
2148
2149 # If the property is missing from the line, it should
2150 # be the one for the whole file
2151 $property = $property{$addr} if ! defined $property;
2152
2153 # Change <none> to the null string, which is what it
2154 # really means. If the default is the code point
2155 # itself, set it to <code point>, which is what
2156 # Unicode uses (but sometimes they've forgotten the
2157 # space)
2158 if ($default =~ /^<none>$/i) {
2159 $default = "";
2160 }
2161 elsif ($default =~ /^<code *point>$/i) {
2162 $default = $CODE_POINT;
2163 }
2164
2165 # Store them as a sub-arrays with both components.
2166 push @{$missings{$addr}}, [ $default, $property ];
2167 }
2168 }
2169
2170 # There is nothing for the caller to process on this comment
2171 # line.
2172 next;
2173 }
2174
2175 # Remove comments and trailing space, and skip this line if the
2176 # result is empty
2177 s/#.*//;
2178 s/\s+$//;
2179 next if /^$/;
2180
2181 # Call any handlers for this line, and skip further processing of
2182 # the line if the handler sets the line to null.
2183 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2184 &{$sub_ref}($self);
2185 next LINE if /^$/;
2186 }
2187
2188 # Here the line is ok. return success.
2189 return 1;
2190 } # End of looping through lines.
2191
2192 # If there is an EOF handler, call it (only once) and if it generates
2193 # more lines to process go back in the loop to handle them.
2194 if ($eof_handler{$addr}) {
2195 &{$eof_handler{$addr}}($self);
2196 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2197 goto LINE if $added_lines{$addr};
2198 }
2199
2200 # Return failure -- no more lines.
2201 return 0;
2202
2203 }
2204
2205# Not currently used, not fully tested.
2206# sub peek {
2207# # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2208# # record. Not callable from an each_line_handler(), nor does it call
2209# # an each_line_handler() on the line.
2210#
2211# my $self = shift;
ffe43484 2212# my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2213#
2214# foreach my $inserted_ref (@{$added_lines{$addr}}) {
2215# my ($adjusted, $line) = @{$inserted_ref};
2216# next if $adjusted;
2217#
2218# # Remove comments and trailing space, and return a non-empty
2219# # resulting line
2220# $line =~ s/#.*//;
2221# $line =~ s/\s+$//;
2222# return $line if $line ne "";
2223# }
2224#
2225# return if ! ref $handle{$addr}; # Don't read unless is real file
2226# while (1) { # Loop until find non-comment, non-empty line
2227# local $to_trace = 1 if main::DEBUG;
2228# trace $_ if main::DEBUG && $to_trace;
2229# return if ! defined (my $line = readline $handle{$addr});
2230# chomp $line;
2231# push @{$added_lines{$addr}}, [ 0, $line ];
2232#
2233# $line =~ s/#.*//;
2234# $line =~ s/\s+$//;
2235# return $line if $line ne "";
2236# }
2237#
2238# return;
2239# }
2240
2241
2242 sub insert_lines {
2243 # Lines can be inserted so that it looks like they were in the input
2244 # file at the place it was when this routine is called. See also
2245 # insert_adjusted_lines(). Lines inserted via this routine go through
2246 # any each_line_handler()
2247
2248 my $self = shift;
2249
2250 # Each inserted line is an array, with the first element being 0 to
2251 # indicate that this line hasn't been adjusted, and needs to be
2252 # processed.
f998e60c 2253 no overloading;
051df77b 2254 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
99870f4d
KW
2255 return;
2256 }
2257
2258 sub insert_adjusted_lines {
2259 # Lines can be inserted so that it looks like they were in the input
2260 # file at the place it was when this routine is called. See also
2261 # insert_lines(). Lines inserted via this routine are already fully
2262 # adjusted, ready to be processed; each_line_handler()s handlers will
2263 # not be called. This means this is not a completely general
2264 # facility, as only the last each_line_handler on the stack should
2265 # call this. It could be made more general, by passing to each of the
2266 # line_handlers their position on the stack, which they would pass on
2267 # to this routine, and that would replace the boolean first element in
2268 # the anonymous array pushed here, so that the next_line routine could
2269 # use that to call only those handlers whose index is after it on the
2270 # stack. But this is overkill for what is needed now.
2271
2272 my $self = shift;
2273 trace $_[0] if main::DEBUG && $to_trace;
2274
2275 # Each inserted line is an array, with the first element being 1 to
2276 # indicate that this line has been adjusted
f998e60c 2277 no overloading;
051df77b 2278 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
99870f4d
KW
2279 return;
2280 }
2281
2282 sub get_missings {
2283 # Returns the stored up @missings lines' values, and clears the list.
2284 # The values are in an array, consisting of the default in the first
2285 # element, and the property in the 2nd. However, since these lines
2286 # can be stacked up, the return is an array of all these arrays.
2287
2288 my $self = shift;
2289 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2290
ffe43484 2291 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2292
2293 # If not accepting a list return, just return the first one.
2294 return shift @{$missings{$addr}} unless wantarray;
2295
2296 my @return = @{$missings{$addr}};
2297 undef @{$missings{$addr}};
2298 return @return;
2299 }
2300
2301 sub _insert_property_into_line {
2302 # Add a property field to $_, if this file requires it.
2303
f998e60c 2304 my $self = shift;
ffe43484 2305 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 2306 my $property = $property{$addr};
99870f4d
KW
2307 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2308
2309 $_ =~ s/(;|$)/; $property$1/;
2310 return;
2311 }
2312
2313 sub carp_bad_line {
2314 # Output consistent error messages, using either a generic one, or the
2315 # one given by the optional parameter. To avoid gazillions of the
2316 # same message in case the syntax of a file is way off, this routine
2317 # only outputs the first instance of each message, incrementing a
2318 # count so the totals can be output at the end of the file.
2319
2320 my $self = shift;
2321 my $message = shift;
2322 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2323
ffe43484 2324 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2325
2326 $message = 'Unexpected line' unless $message;
2327
2328 # No trailing punctuation so as to fit with our addenda.
2329 $message =~ s/[.:;,]$//;
2330
2331 # If haven't seen this exact message before, output it now. Otherwise
2332 # increment the count of how many times it has occurred
2333 unless ($errors{$addr}->{$message}) {
2334 Carp::my_carp("$message in '$_' in "
f998e60c 2335 . $file{$addr}
99870f4d
KW
2336 . " at line $.. Skipping this line;");
2337 $errors{$addr}->{$message} = 1;
2338 }
2339 else {
2340 $errors{$addr}->{$message}++;
2341 }
2342
2343 # Clear the line to prevent any further (meaningful) processing of it.
2344 $_ = "";
2345
2346 return;
2347 }
2348} # End closure
2349
2350package Multi_Default;
2351
2352# Certain properties in early versions of Unicode had more than one possible
2353# default for code points missing from the files. In these cases, one
2354# default applies to everything left over after all the others are applied,
2355# and for each of the others, there is a description of which class of code
2356# points applies to it. This object helps implement this by storing the
2357# defaults, and for all but that final default, an eval string that generates
2358# the class that it applies to.
2359
2360
2361{ # Closure
2362
2363 main::setup_package();
2364
2365 my %class_defaults;
2366 # The defaults structure for the classes
2367 main::set_access('class_defaults', \%class_defaults);
2368
2369 my %other_default;
2370 # The default that applies to everything left over.
2371 main::set_access('other_default', \%other_default, 'r');
2372
2373
2374 sub new {
2375 # The constructor is called with default => eval pairs, terminated by
2376 # the left-over default. e.g.
2377 # Multi_Default->new(
2378 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2379 # - 0x200D',
2380 # 'R' => 'some other expression that evaluates to code points',
2381 # .
2382 # .
2383 # .
2384 # 'U'));
2385
2386 my $class = shift;
2387
2388 my $self = bless \do{my $anonymous_scalar}, $class;
ffe43484 2389 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2390
2391 while (@_ > 1) {
2392 my $default = shift;
2393 my $eval = shift;
2394 $class_defaults{$addr}->{$default} = $eval;
2395 }
2396
2397 $other_default{$addr} = shift;
2398
2399 return $self;
2400 }
2401
2402 sub get_next_defaults {
2403 # Iterates and returns the next class of defaults.
2404 my $self = shift;
2405 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2406
ffe43484 2407 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2408
2409 return each %{$class_defaults{$addr}};
2410 }
2411}
2412
2413package Alias;
2414
2415# An alias is one of the names that a table goes by. This class defines them
2416# including some attributes. Everything is currently setup in the
2417# constructor.
2418
2419
2420{ # Closure
2421
2422 main::setup_package();
2423
2424 my %name;
2425 main::set_access('name', \%name, 'r');
2426
2427 my %loose_match;
2428 # Determined by the constructor code if this name should match loosely or
2429 # not. The constructor parameters can override this, but it isn't fully
2430 # implemented, as should have ability to override Unicode one's via
2431 # something like a set_loose_match()
2432 main::set_access('loose_match', \%loose_match, 'r');
2433
2434 my %make_pod_entry;
2435 # Some aliases should not get their own entries because they are covered
2436 # by a wild-card, and some we want to discourage use of. Binary
2437 main::set_access('make_pod_entry', \%make_pod_entry, 'r');
2438
2439 my %status;
2440 # Aliases have a status, like deprecated, or even suppressed (which means
2441 # they don't appear in documentation). Enum
2442 main::set_access('status', \%status, 'r');
2443
2444 my %externally_ok;
2445 # Similarly, some aliases should not be considered as usable ones for
2446 # external use, such as file names, or we don't want documentation to
2447 # recommend them. Boolean
2448 main::set_access('externally_ok', \%externally_ok, 'r');
2449
2450 sub new {
2451 my $class = shift;
2452
2453 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2454 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2455
2456 $name{$addr} = shift;
2457 $loose_match{$addr} = shift;
2458 $make_pod_entry{$addr} = shift;
2459 $externally_ok{$addr} = shift;
2460 $status{$addr} = shift;
2461
2462 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2463
2464 # Null names are never ok externally
2465 $externally_ok{$addr} = 0 if $name{$addr} eq "";
2466
2467 return $self;
2468 }
2469}
2470
2471package Range;
2472
2473# A range is the basic unit for storing code points, and is described in the
2474# comments at the beginning of the program. Each range has a starting code
2475# point; an ending code point (not less than the starting one); a value
2476# that applies to every code point in between the two end-points, inclusive;
2477# and an enum type that applies to the value. The type is for the user's
2478# convenience, and has no meaning here, except that a non-zero type is
2479# considered to not obey the normal Unicode rules for having standard forms.
2480#
2481# The same structure is used for both map and match tables, even though in the
2482# latter, the value (and hence type) is irrelevant and could be used as a
2483# comment. In map tables, the value is what all the code points in the range
2484# map to. Type 0 values have the standardized version of the value stored as
2485# well, so as to not have to recalculate it a lot.
2486
2487sub trace { return main::trace(@_); }
2488
2489{ # Closure
2490
2491 main::setup_package();
2492
2493 my %start;
2494 main::set_access('start', \%start, 'r', 's');
2495
2496 my %end;
2497 main::set_access('end', \%end, 'r', 's');
2498
2499 my %value;
2500 main::set_access('value', \%value, 'r');
2501
2502 my %type;
2503 main::set_access('type', \%type, 'r');
2504
2505 my %standard_form;
2506 # The value in internal standard form. Defined only if the type is 0.
2507 main::set_access('standard_form', \%standard_form);
2508
2509 # Note that if these fields change, the dump() method should as well
2510
2511 sub new {
2512 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2513 my $class = shift;
2514
2515 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2516 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2517
2518 $start{$addr} = shift;
2519 $end{$addr} = shift;
2520
2521 my %args = @_;
2522
2523 my $value = delete $args{'Value'}; # Can be 0
2524 $value = "" unless defined $value;
2525 $value{$addr} = $value;
2526
2527 $type{$addr} = delete $args{'Type'} || 0;
2528
2529 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2530
2531 if (! $type{$addr}) {
2532 $standard_form{$addr} = main::standardize($value);
2533 }
2534
2535 return $self;
2536 }
2537
2538 use overload
2539 fallback => 0,
2540 qw("") => "_operator_stringify",
2541 "." => \&main::_operator_dot,
2542 ;
2543
2544 sub _operator_stringify {
2545 my $self = shift;
ffe43484 2546 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2547
2548 # Output it like '0041..0065 (value)'
2549 my $return = sprintf("%04X", $start{$addr})
2550 . '..'
2551 . sprintf("%04X", $end{$addr});
2552 my $value = $value{$addr};
2553 my $type = $type{$addr};
2554 $return .= ' (';
2555 $return .= "$value";
2556 $return .= ", Type=$type" if $type != 0;
2557 $return .= ')';
2558
2559 return $return;
2560 }
2561
2562 sub standard_form {
2563 # The standard form is the value itself if the standard form is
2564 # undefined (that is if the value is special)
2565
2566 my $self = shift;
2567 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2568
ffe43484 2569 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2570
2571 return $standard_form{$addr} if defined $standard_form{$addr};
2572 return $value{$addr};
2573 }
2574
2575 sub dump {
2576 # Human, not machine readable. For machine readable, comment out this
2577 # entire routine and let the standard one take effect.
2578 my $self = shift;
2579 my $indent = shift;
2580 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2581
ffe43484 2582 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2583
2584 my $return = $indent
2585 . sprintf("%04X", $start{$addr})
2586 . '..'
2587 . sprintf("%04X", $end{$addr})
2588 . " '$value{$addr}';";
2589 if (! defined $standard_form{$addr}) {
2590 $return .= "(type=$type{$addr})";
2591 }
2592 elsif ($standard_form{$addr} ne $value{$addr}) {
2593 $return .= "(standard '$standard_form{$addr}')";
2594 }
2595 return $return;
2596 }
2597} # End closure
2598
2599package _Range_List_Base;
2600
2601# Base class for range lists. A range list is simply an ordered list of
2602# ranges, so that the ranges with the lowest starting numbers are first in it.
2603#
2604# When a new range is added that is adjacent to an existing range that has the
2605# same value and type, it merges with it to form a larger range.
2606#
2607# Ranges generally do not overlap, except that there can be multiple entries
2608# of single code point ranges. This is because of NameAliases.txt.
2609#
2610# In this program, there is a standard value such that if two different
2611# values, have the same standard value, they are considered equivalent. This
2612# value was chosen so that it gives correct results on Unicode data
2613
2614# There are a number of methods to manipulate range lists, and some operators
2615# are overloaded to handle them.
2616
99870f4d
KW
2617sub trace { return main::trace(@_); }
2618
2619{ # Closure
2620
2621 our $addr;
2622
2623 main::setup_package();
2624
2625 my %ranges;
2626 # The list of ranges
2627 main::set_access('ranges', \%ranges, 'readable_array');
2628
2629 my %max;
2630 # The highest code point in the list. This was originally a method, but
2631 # actual measurements said it was used a lot.
2632 main::set_access('max', \%max, 'r');
2633
2634 my %each_range_iterator;
2635 # Iterator position for each_range()
2636 main::set_access('each_range_iterator', \%each_range_iterator);
2637
2638 my %owner_name_of;
2639 # Name of parent this is attached to, if any. Solely for better error
2640 # messages.
2641 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
2642
2643 my %_search_ranges_cache;
2644 # A cache of the previous result from _search_ranges(), for better
2645 # performance
2646 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
2647
2648 sub new {
2649 my $class = shift;
2650 my %args = @_;
2651
2652 # Optional initialization data for the range list.
2653 my $initialize = delete $args{'Initialize'};
2654
2655 my $self;
2656
2657 # Use _union() to initialize. _union() returns an object of this
2658 # class, which means that it will call this constructor recursively.
2659 # But it won't have this $initialize parameter so that it won't
2660 # infinitely loop on this.
2661 return _union($class, $initialize, %args) if defined $initialize;
2662
2663 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 2664 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2665
2666 # Optional parent object, only for debug info.
2667 $owner_name_of{$addr} = delete $args{'Owner'};
2668 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
2669
2670 # Stringify, in case it is an object.
2671 $owner_name_of{$addr} = "$owner_name_of{$addr}";
2672
2673 # This is used only for error messages, and so a colon is added
2674 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
2675
2676 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2677
2678 # Max is initialized to a negative value that isn't adjacent to 0,
2679 # for simpler tests
2680 $max{$addr} = -2;
2681
2682 $_search_ranges_cache{$addr} = 0;
2683 $ranges{$addr} = [];
2684
2685 return $self;
2686 }
2687
2688 use overload
2689 fallback => 0,
2690 qw("") => "_operator_stringify",
2691 "." => \&main::_operator_dot,
2692 ;
2693
2694 sub _operator_stringify {
2695 my $self = shift;
ffe43484 2696 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2697
2698 return "Range_List attached to '$owner_name_of{$addr}'"
2699 if $owner_name_of{$addr};
2700 return "anonymous Range_List " . \$self;
2701 }
2702
2703 sub _union {
2704 # Returns the union of the input code points. It can be called as
2705 # either a constructor or a method. If called as a method, the result
2706 # will be a new() instance of the calling object, containing the union
2707 # of that object with the other parameter's code points; if called as
2708 # a constructor, the first parameter gives the class the new object
2709 # should be, and the second parameter gives the code points to go into
2710 # it.
2711 # In either case, there are two parameters looked at by this routine;
2712 # any additional parameters are passed to the new() constructor.
2713 #
2714 # The code points can come in the form of some object that contains
2715 # ranges, and has a conventionally named method to access them; or
2716 # they can be an array of individual code points (as integers); or
2717 # just a single code point.
2718 #
2719 # If they are ranges, this routine doesn't make any effort to preserve
2720 # the range values of one input over the other. Therefore this base
2721 # class should not allow _union to be called from other than
2722 # initialization code, so as to prevent two tables from being added
2723 # together where the range values matter. The general form of this
2724 # routine therefore belongs in a derived class, but it was moved here
2725 # to avoid duplication of code. The failure to overload this in this
2726 # class keeps it safe.
2727 #
2728
2729 my $self;
2730 my @args; # Arguments to pass to the constructor
2731
2732 my $class = shift;
2733
2734 # If a method call, will start the union with the object itself, and
2735 # the class of the new object will be the same as self.
2736 if (ref $class) {
2737 $self = $class;
2738 $class = ref $self;
2739 push @args, $self;
2740 }
2741
2742 # Add the other required parameter.
2743 push @args, shift;
2744 # Rest of parameters are passed on to the constructor
2745
2746 # Accumulate all records from both lists.
2747 my @records;
2748 for my $arg (@args) {
2749 #local $to_trace = 0 if main::DEBUG;
2750 trace "argument = $arg" if main::DEBUG && $to_trace;
2751 if (! defined $arg) {
2752 my $message = "";
2753 if (defined $self) {
f998e60c 2754 no overloading;
051df77b 2755 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2756 }
2757 Carp::my_carp_bug($message .= "Undefined argument to _union. No union done.");
2758 return;
2759 }
2760 $arg = [ $arg ] if ! ref $arg;
2761 my $type = ref $arg;
2762 if ($type eq 'ARRAY') {
2763 foreach my $element (@$arg) {
2764 push @records, Range->new($element, $element);
2765 }
2766 }
2767 elsif ($arg->isa('Range')) {
2768 push @records, $arg;
2769 }
2770 elsif ($arg->can('ranges')) {
2771 push @records, $arg->ranges;
2772 }
2773 else {
2774 my $message = "";
2775 if (defined $self) {
f998e60c 2776 no overloading;
051df77b 2777 $message .= $owner_name_of{pack 'J', $self};
99870f4d
KW
2778 }
2779 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
2780 return;
2781 }
2782 }
2783
2784 # Sort with the range containing the lowest ordinal first, but if
2785 # two ranges start at the same code point, sort with the bigger range
2786 # of the two first, because it takes fewer cycles.
2787 @records = sort { ($a->start <=> $b->start)
2788 or
2789 # if b is shorter than a, b->end will be
2790 # less than a->end, and we want to select
2791 # a, so want to return -1
2792 ($b->end <=> $a->end)
2793 } @records;
2794
2795 my $new = $class->new(@_);
2796
2797 # Fold in records so long as they add new information.
2798 for my $set (@records) {
2799 my $start = $set->start;
2800 my $end = $set->end;
2801 my $value = $set->value;
2802 if ($start > $new->max) {
2803 $new->_add_delete('+', $start, $end, $value);
2804 }
2805 elsif ($end > $new->max) {
2806 $new->_add_delete('+', $new->max +1, $end, $value);
2807 }
2808 }
2809
2810 return $new;
2811 }
2812
2813 sub range_count { # Return the number of ranges in the range list
2814 my $self = shift;
2815 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2816
f998e60c 2817 no overloading;
051df77b 2818 return scalar @{$ranges{pack 'J', $self}};
99870f4d
KW
2819 }
2820
2821 sub min {
2822 # Returns the minimum code point currently in the range list, or if
2823 # the range list is empty, 2 beyond the max possible. This is a
2824 # method because used so rarely, that not worth saving between calls,
2825 # and having to worry about changing it as ranges are added and
2826 # deleted.
2827
2828 my $self = shift;
2829 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2830
ffe43484 2831 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2832
2833 # If the range list is empty, return a large value that isn't adjacent
2834 # to any that could be in the range list, for simpler tests
2835 return $LAST_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
2836 return $ranges{$addr}->[0]->start;
2837 }
2838
2839 sub contains {
2840 # Boolean: Is argument in the range list? If so returns $i such that:
2841 # range[$i]->end < $codepoint <= range[$i+1]->end
2842 # which is one beyond what you want; this is so that the 0th range
2843 # doesn't return false
2844 my $self = shift;
2845 my $codepoint = shift;
2846 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2847
99870f4d
KW
2848 my $i = $self->_search_ranges($codepoint);
2849 return 0 unless defined $i;
2850
2851 # The search returns $i, such that
2852 # range[$i-1]->end < $codepoint <= range[$i]->end
2853 # So is in the table if and only iff it is at least the start position
2854 # of range $i.
f998e60c 2855 no overloading;
051df77b 2856 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
99870f4d
KW
2857 return $i + 1;
2858 }
2859
2f7a8815
KW
2860 sub containing_range {
2861 # Returns the range object that contains the code point, undef if none
2862
2863 my $self = shift;
2864 my $codepoint = shift;
2865 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2866
2867 my $i = $self->contains($codepoint);
2868 return unless $i;
2869
2870 # contains() returns 1 beyond where we should look
2871 no overloading;
2872 return $ranges{pack 'J', $self}->[$i-1];
2873 }
2874
99870f4d
KW
2875 sub value_of {
2876 # Returns the value associated with the code point, undef if none
2877
2878 my $self = shift;
2879 my $codepoint = shift;
2880 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2881
d69c231b
KW
2882 my $range = $self->containing_range($codepoint);
2883 return unless defined $range;
99870f4d 2884
d69c231b 2885 return $range->value;
99870f4d
KW
2886 }
2887
0a9dbafc
KW
2888 sub type_of {
2889 # Returns the type of the range containing the code point, undef if
2890 # the code point is not in the table
2891
2892 my $self = shift;
2893 my $codepoint = shift;
2894 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2895
2896 my $range = $self->containing_range($codepoint);
2897 return unless defined $range;
2898
2899 return $range->type;
2900 }
2901
99870f4d
KW
2902 sub _search_ranges {
2903 # Find the range in the list which contains a code point, or where it
2904 # should go if were to add it. That is, it returns $i, such that:
2905 # range[$i-1]->end < $codepoint <= range[$i]->end
2906 # Returns undef if no such $i is possible (e.g. at end of table), or
2907 # if there is an error.
2908
2909 my $self = shift;
2910 my $code_point = shift;
2911 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2912
ffe43484 2913 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
2914
2915 return if $code_point > $max{$addr};
2916 my $r = $ranges{$addr}; # The current list of ranges
2917 my $range_list_size = scalar @$r;
2918 my $i;
2919
2920 use integer; # want integer division
2921
2922 # Use the cached result as the starting guess for this one, because,
2923 # an experiment on 5.1 showed that 90% of the time the cache was the
2924 # same as the result on the next call (and 7% it was one less).
2925 $i = $_search_ranges_cache{$addr};
2926 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
2927 # from an intervening deletion
2928 #local $to_trace = 1 if main::DEBUG;
2929 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
2930 return $i if $code_point <= $r->[$i]->end
2931 && ($i == 0 || $r->[$i-1]->end < $code_point);
2932
2933 # Here the cache doesn't yield the correct $i. Try adding 1.
2934 if ($i < $range_list_size - 1
2935 && $r->[$i]->end < $code_point &&
2936 $code_point <= $r->[$i+1]->end)
2937 {
2938 $i++;
2939 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
2940 $_search_ranges_cache{$addr} = $i;
2941 return $i;
2942 }
2943
2944 # Here, adding 1 also didn't work. We do a binary search to
2945 # find the correct position, starting with current $i
2946 my $lower = 0;
2947 my $upper = $range_list_size - 1;
2948 while (1) {
2949 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
2950
2951 if ($code_point <= $r->[$i]->end) {
2952
2953 # Here we have met the upper constraint. We can quit if we
2954 # also meet the lower one.
2955 last if $i == 0 || $r->[$i-1]->end < $code_point;
2956
2957 $upper = $i; # Still too high.
2958
2959 }
2960 else {
2961
2962 # Here, $r[$i]->end < $code_point, so look higher up.
2963 $lower = $i;
2964 }
2965
2966 # Split search domain in half to try again.
2967 my $temp = ($upper + $lower) / 2;
2968
2969 # No point in continuing unless $i changes for next time
2970 # in the loop.
2971 if ($temp == $i) {
2972
2973 # We can't reach the highest element because of the averaging.
2974 # So if one below the upper edge, force it there and try one
2975 # more time.
2976 if ($i == $range_list_size - 2) {
2977
2978 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
2979 $i = $range_list_size - 1;
2980
2981 # Change $lower as well so if fails next time through,
2982 # taking the average will yield the same $i, and we will
2983 # quit with the error message just below.
2984 $lower = $i;
2985 next;
2986 }
2987 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
2988 return;
2989 }
2990 $i = $temp;
2991 } # End of while loop
2992
2993 if (main::DEBUG && $to_trace) {
2994 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
2995 trace "i= [ $i ]", $r->[$i];
2996 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
2997 }
2998
2999 # Here we have found the offset. Cache it as a starting point for the
3000 # next call.
3001 $_search_ranges_cache{$addr} = $i;
3002 return $i;
3003 }
3004
3005 sub _add_delete {
3006 # Add, replace or delete ranges to or from a list. The $type
3007 # parameter gives which:
3008 # '+' => insert or replace a range, returning a list of any changed
3009 # ranges.
3010 # '-' => delete a range, returning a list of any deleted ranges.
3011 #
3012 # The next three parameters give respectively the start, end, and
3013 # value associated with the range. 'value' should be null unless the
3014 # operation is '+';
3015 #
3016 # The range list is kept sorted so that the range with the lowest
3017 # starting position is first in the list, and generally, adjacent
c1739a4a 3018 # ranges with the same values are merged into a single larger one (see
99870f4d
KW
3019 # exceptions below).
3020 #
c1739a4a 3021 # There are more parameters; all are key => value pairs:
99870f4d
KW
3022 # Type gives the type of the value. It is only valid for '+'.
3023 # All ranges have types; if this parameter is omitted, 0 is
3024 # assumed. Ranges with type 0 are assumed to obey the
3025 # Unicode rules for casing, etc; ranges with other types are
3026 # not. Otherwise, the type is arbitrary, for the caller's
3027 # convenience, and looked at only by this routine to keep
3028 # adjacent ranges of different types from being merged into
3029 # a single larger range, and when Replace =>
3030 # $IF_NOT_EQUIVALENT is specified (see just below).
3031 # Replace determines what to do if the range list already contains
3032 # ranges which coincide with all or portions of the input
3033 # range. It is only valid for '+':
3034 # => $NO means that the new value is not to replace
3035 # any existing ones, but any empty gaps of the
3036 # range list coinciding with the input range
3037 # will be filled in with the new value.
3038 # => $UNCONDITIONALLY means to replace the existing values with
3039 # this one unconditionally. However, if the
3040 # new and old values are identical, the
3041 # replacement is skipped to save cycles
3042 # => $IF_NOT_EQUIVALENT means to replace the existing values
3043 # with this one if they are not equivalent.
3044 # Ranges are equivalent if their types are the
c1739a4a 3045 # same, and they are the same string; or if
99870f4d
KW
3046 # both are type 0 ranges, if their Unicode
3047 # standard forms are identical. In this last
3048 # case, the routine chooses the more "modern"
3049 # one to use. This is because some of the
3050 # older files are formatted with values that
3051 # are, for example, ALL CAPs, whereas the
3052 # derived files have a more modern style,
3053 # which looks better. By looking for this
3054 # style when the pre-existing and replacement
3055 # standard forms are the same, we can move to
3056 # the modern style
3057 # => $MULTIPLE means that if this range duplicates an
3058 # existing one, but has a different value,
3059 # don't replace the existing one, but insert
3060 # this, one so that the same range can occur
3061 # multiple times.
3062 # => anything else is the same as => $IF_NOT_EQUIVALENT
3063 #
c1739a4a
KW
3064 # "same value" means identical for non-type-0 ranges, and it means
3065 # having the same standard forms for type-0 ranges.
99870f4d
KW
3066
3067 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3068
3069 my $self = shift;
3070 my $operation = shift; # '+' for add/replace; '-' for delete;
3071 my $start = shift;
3072 my $end = shift;
3073 my $value = shift;
3074
3075 my %args = @_;
3076
3077 $value = "" if not defined $value; # warning: $value can be "0"
3078
3079 my $replace = delete $args{'Replace'};
3080 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3081
3082 my $type = delete $args{'Type'};
3083 $type = 0 unless defined $type;
3084
3085 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3086
ffe43484 3087 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3088
3089 if ($operation ne '+' && $operation ne '-') {
3090 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3091 return;
3092 }
3093 unless (defined $start && defined $end) {
3094 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3095 return;
3096 }
3097 unless ($end >= $start) {
3098 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3099 return;
3100 }
3101 #local $to_trace = 1 if main::DEBUG;
3102
3103 if ($operation eq '-') {
3104 if ($replace != $IF_NOT_EQUIVALENT) {
3105 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3106 $replace = $IF_NOT_EQUIVALENT;
3107 }
3108 if ($type) {
3109 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3110 $type = 0;
3111 }
3112 if ($value ne "") {
3113 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3114 $value = "";
3115 }
3116 }
3117
3118 my $r = $ranges{$addr}; # The current list of ranges
3119 my $range_list_size = scalar @$r; # And its size
3120 my $max = $max{$addr}; # The current high code point in
3121 # the list of ranges
3122
3123 # Do a special case requiring fewer machine cycles when the new range
3124 # starts after the current highest point. The Unicode input data is
3125 # structured so this is common.
3126 if ($start > $max) {
3127
3128 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) type=$type" if main::DEBUG && $to_trace;
3129 return if $operation eq '-'; # Deleting a non-existing range is a
3130 # no-op
3131
3132 # If the new range doesn't logically extend the current final one
3133 # in the range list, create a new range at the end of the range
3134 # list. (max cleverly is initialized to a negative number not
3135 # adjacent to 0 if the range list is empty, so even adding a range
3136 # to an empty range list starting at 0 will have this 'if'
3137 # succeed.)
3138 if ($start > $max + 1 # non-adjacent means can't extend.
3139 || @{$r}[-1]->value ne $value # values differ, can't extend.
3140 || @{$r}[-1]->type != $type # types differ, can't extend.
3141 ) {
3142 push @$r, Range->new($start, $end,
3143 Value => $value,
3144 Type => $type);
3145 }
3146 else {
3147
3148 # Here, the new range starts just after the current highest in
3149 # the range list, and they have the same type and value.
3150 # Extend the current range to incorporate the new one.
3151 @{$r}[-1]->set_end($end);
3152 }
3153
3154 # This becomes the new maximum.
3155 $max{$addr} = $end;
3156
3157 return;
3158 }
3159 #local $to_trace = 0 if main::DEBUG;
3160
3161 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3162
3163 # Here, the input range isn't after the whole rest of the range list.
3164 # Most likely 'splice' will be needed. The rest of the routine finds
3165 # the needed splice parameters, and if necessary, does the splice.
3166 # First, find the offset parameter needed by the splice function for
3167 # the input range. Note that the input range may span multiple
3168 # existing ones, but we'll worry about that later. For now, just find
3169 # the beginning. If the input range is to be inserted starting in a
3170 # position not currently in the range list, it must (obviously) come
3171 # just after the range below it, and just before the range above it.
3172 # Slightly less obviously, it will occupy the position currently
3173 # occupied by the range that is to come after it. More formally, we
3174 # are looking for the position, $i, in the array of ranges, such that:
3175 #
3176 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3177 #
3178 # (The ordered relationships within existing ranges are also shown in
3179 # the equation above). However, if the start of the input range is
3180 # within an existing range, the splice offset should point to that
3181 # existing range's position in the list; that is $i satisfies a
3182 # somewhat different equation, namely:
3183 #
3184 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3185 #
3186 # More briefly, $start can come before or after r[$i]->start, and at
3187 # this point, we don't know which it will be. However, these
3188 # two equations share these constraints:
3189 #
3190 # r[$i-1]->end < $start <= r[$i]->end
3191 #
3192 # And that is good enough to find $i.
3193
3194 my $i = $self->_search_ranges($start);
3195 if (! defined $i) {
3196 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3197 return;
3198 }
3199
3200 # The search function returns $i such that:
3201 #
3202 # r[$i-1]->end < $start <= r[$i]->end
3203 #
3204 # That means that $i points to the first range in the range list
3205 # that could possibly be affected by this operation. We still don't
3206 # know if the start of the input range is within r[$i], or if it
3207 # points to empty space between r[$i-1] and r[$i].
3208 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3209
3210 # Special case the insertion of data that is not to replace any
3211 # existing data.
3212 if ($replace == $NO) { # If $NO, has to be operation '+'
3213 #local $to_trace = 1 if main::DEBUG;
3214 trace "Doesn't replace" if main::DEBUG && $to_trace;
3215
3216 # Here, the new range is to take effect only on those code points
3217 # that aren't already in an existing range. This can be done by
3218 # looking through the existing range list and finding the gaps in
3219 # the ranges that this new range affects, and then calling this
3220 # function recursively on each of those gaps, leaving untouched
3221 # anything already in the list. Gather up a list of the changed
3222 # gaps first so that changes to the internal state as new ranges
3223 # are added won't be a problem.
3224 my @gap_list;
3225
3226 # First, if the starting point of the input range is outside an
3227 # existing one, there is a gap from there to the beginning of the
3228 # existing range -- add a span to fill the part that this new
3229 # range occupies
3230 if ($start < $r->[$i]->start) {
3231 push @gap_list, Range->new($start,
3232 main::min($end,
3233 $r->[$i]->start - 1),
3234 Type => $type);
3235 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3236 }
3237
3238 # Then look through the range list for other gaps until we reach
3239 # the highest range affected by the input one.
3240 my $j;
3241 for ($j = $i+1; $j < $range_list_size; $j++) {
3242 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3243 last if $end < $r->[$j]->start;
3244
3245 # If there is a gap between when this range starts and the
3246 # previous one ends, add a span to fill it. Note that just
3247 # because there are two ranges doesn't mean there is a
3248 # non-zero gap between them. It could be that they have
3249 # different values or types
3250 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3251 push @gap_list,
3252 Range->new($r->[$j-1]->end + 1,
3253 $r->[$j]->start - 1,
3254 Type => $type);
3255 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3256 }
3257 }
3258
3259 # Here, we have either found an existing range in the range list,
3260 # beyond the area affected by the input one, or we fell off the
3261 # end of the loop because the input range affects the whole rest
3262 # of the range list. In either case, $j is 1 higher than the
3263 # highest affected range. If $j == $i, it means that there are no
3264 # affected ranges, that the entire insertion is in the gap between
3265 # r[$i-1], and r[$i], which we already have taken care of before
3266 # the loop.
3267 # On the other hand, if there are affected ranges, it might be
3268 # that there is a gap that needs filling after the final such
3269 # range to the end of the input range
3270 if ($r->[$j-1]->end < $end) {
3271 push @gap_list, Range->new(main::max($start,
3272 $r->[$j-1]->end + 1),
3273 $end,
3274 Type => $type);
3275 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3276 }
3277
3278 # Call recursively to fill in all the gaps.
3279 foreach my $gap (@gap_list) {
3280 $self->_add_delete($operation,
3281 $gap->start,
3282 $gap->end,
3283 $value,
3284 Type => $type);
3285 }
3286
3287 return;
3288 }
3289
3290 # Here, we have taken care of the case where $replace is $NO, which
3291 # means that whatever action we now take is done unconditionally. It
3292 # still could be that this call will result in a no-op, if duplicates
3293 # aren't allowed, and we are inserting a range that merely duplicates
3294 # data already in the range list; or also if deleting a non-existent
3295 # range.
3296 # $i still points to the first potential affected range. Now find the
3297 # highest range affected, which will determine the length parameter to
3298 # splice. (The input range can span multiple existing ones.) While
3299 # we are looking through the range list, see also if this is an
3300 # insertion that will change the values of at least one of the
3301 # affected ranges. We don't need to do this check unless this is an
3302 # insertion of non-multiples, and also since this is a boolean, we
3303 # don't need to do it if have already determined that it will make a
3304 # change; just unconditionally change them. $cdm is created to be 1
3305 # if either of these is true. (The 'c' in the name comes from below)
3306 my $cdm = ($operation eq '-' || $replace == $MULTIPLE);
3307 my $j; # This will point to the highest affected range
3308
3309 # For non-zero types, the standard form is the value itself;
3310 my $standard_form = ($type) ? $value : main::standardize($value);
3311
3312 for ($j = $i; $j < $range_list_size; $j++) {
3313 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3314
3315 # If find a range that it doesn't overlap into, we can stop
3316 # searching
3317 last if $end < $r->[$j]->start;
3318
3319 # Here, overlaps the range at $j. If the value's don't match,
3320 # and this is supposedly an insertion, it becomes a change
3321 # instead. This is what the 'c' stands for in $cdm.
3322 if (! $cdm) {
3323 if ($r->[$j]->standard_form ne $standard_form) {
3324 $cdm = 1;
3325 }
3326 else {
3327
3328 # Here, the two values are essentially the same. If the
3329 # two are actually identical, replacing wouldn't change
3330 # anything so skip it.
3331 my $pre_existing = $r->[$j]->value;
3332 if ($pre_existing ne $value) {
3333
3334 # Here the new and old standardized values are the
3335 # same, but the non-standardized values aren't. If
3336 # replacing unconditionally, then replace
3337 if( $replace == $UNCONDITIONALLY) {
3338 $cdm = 1;
3339 }
3340 else {
3341
3342 # Here, are replacing conditionally. Decide to
3343 # replace or not based on which appears to look
3344 # the "nicest". If one is mixed case and the
3345 # other isn't, choose the mixed case one.
3346 my $new_mixed = $value =~ /[A-Z]/
3347 && $value =~ /[a-z]/;
3348 my $old_mixed = $pre_existing =~ /[A-Z]/
3349 && $pre_existing =~ /[a-z]/;
3350
3351 if ($old_mixed != $new_mixed) {
3352 $cdm = 1 if $new_mixed;
3353 if (main::DEBUG && $to_trace) {
3354 if ($cdm) {
3355 trace "Replacing $pre_existing with $value";
3356 }
3357 else {
3358 trace "Retaining $pre_existing over $value";
3359 }
3360 }
3361 }
3362 else {
3363
3364 # Here casing wasn't different between the two.
3365 # If one has hyphens or underscores and the
3366 # other doesn't, choose the one with the
3367 # punctuation.
3368 my $new_punct = $value =~ /[-_]/;
3369 my $old_punct = $pre_existing =~ /[-_]/;
3370
3371 if ($old_punct != $new_punct) {
3372 $cdm = 1 if $new_punct;
3373 if (main::DEBUG && $to_trace) {
3374 if ($cdm) {
3375 trace "Replacing $pre_existing with $value";
3376 }
3377 else {
3378 trace "Retaining $pre_existing over $value";
3379 }
3380 }
3381 } # else existing one is just as "good";
3382 # retain it to save cycles.
3383 }
3384 }
3385 }
3386 }
3387 }
3388 } # End of loop looking for highest affected range.
3389
3390 # Here, $j points to one beyond the highest range that this insertion
3391 # affects (hence to beyond the range list if that range is the final
3392 # one in the range list).
3393
3394 # The splice length is all the affected ranges. Get it before
3395 # subtracting, for efficiency, so we don't have to later add 1.
3396 my $length = $j - $i;
3397
3398 $j--; # $j now points to the highest affected range.
3399 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3400
3401 # If inserting a multiple record, this is where it goes, after all the
3402 # existing ones for this range. This implies an insertion, and no
3403 # change to any existing ranges. Note that $j can be -1 if this new
3404 # range doesn't actually duplicate any existing, and comes at the
3405 # beginning of the list, in which case we can handle it like any other
3406 # insertion, and is easier to do so.
3407 if ($replace == $MULTIPLE && $j >= 0) {
3408
3409 # This restriction could be remedied with a little extra work, but
3410 # it won't hopefully ever be necessary
3411 if ($r->[$j]->start != $r->[$j]->end) {
3412 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple when the other range ($r->[$j]) contains more than one code point. No action taken.");
3413 return;
3414 }
3415
3416 # Don't add an exact duplicate, as it isn't really a multiple
3417 return if $value eq $r->[$j]->value && $type eq $r->[$j]->type;
3418
3419 trace "Adding multiple record at $j+1 with $start..$end, $value" if main::DEBUG && $to_trace;
3420 my @return = splice @$r,
3421 $j+1,
3422 0,
3423 Range->new($start,
3424 $end,
3425 Value => $value,
3426 Type => $type);
3427 if (main::DEBUG && $to_trace) {
3428 trace "After splice:";
3429 trace 'j-2=[', $j-2, ']', $r->[$j-2] if $j >= 2;
3430 trace 'j-1=[', $j-1, ']', $r->[$j-1] if $j >= 1;
3431 trace "j =[", $j, "]", $r->[$j] if $j >= 0;
3432 trace 'j+1=[', $j+1, ']', $r->[$j+1] if $j < @$r - 1;
3433 trace 'j+2=[', $j+2, ']', $r->[$j+2] if $j < @$r - 2;
3434 trace 'j+3=[', $j+3, ']', $r->[$j+3] if $j < @$r - 3;
3435 }
3436 return @return;
3437 }
3438
3439 # Here, have taken care of $NO and $MULTIPLE replaces.
3440 # $j points to the highest affected range. But it can be < $i or even
3441 # -1. These happen only if the insertion is entirely in the gap
3442 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3443 # above exited first time through with $end < $r->[$i]->start. (And
3444 # then we subtracted one from j) This implies also that $start <
3445 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3446 # $start, so the entire input range is in the gap.
3447 if ($j < $i) {
3448
3449 # Here the entire input range is in the gap before $i.
3450
3451 if (main::DEBUG && $to_trace) {
3452 if ($i) {
3453 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3454 }
3455 else {
3456 trace "Entire range is before $r->[$i]";
3457 }
3458 }
3459 return if $operation ne '+'; # Deletion of a non-existent range is
3460 # a no-op
3461 }
3462 else {
3463
3464 # Here the entire input range is not in the gap before $i. There
3465 # is an affected one, and $j points to the highest such one.
3466
3467 # At this point, here is the situation:
3468 # This is not an insertion of a multiple, nor of tentative ($NO)
3469 # data.
3470 # $i points to the first element in the current range list that
3471 # may be affected by this operation. In fact, we know
3472 # that the range at $i is affected because we are in
3473 # the else branch of this 'if'
3474 # $j points to the highest affected range.
3475 # In other words,
3476 # r[$i-1]->end < $start <= r[$i]->end
3477 # And:
3478 # r[$i-1]->end < $start <= $end <= r[$j]->end
3479 #
3480 # Also:
3481 # $cdm is a boolean which is set true if and only if this is a
3482 # change or deletion (multiple was handled above). In
3483 # other words, it could be renamed to be just $cd.
3484
3485 # We now have enough information to decide if this call is a no-op
3486 # or not. It is a no-op if it is a deletion of a non-existent
3487 # range, or an insertion of already existing data.
3488
3489 if (main::DEBUG && $to_trace && ! $cdm
3490 && $i == $j
3491 && $start >= $r->[$i]->start)
3492 {
3493 trace "no-op";
3494 }
3495 return if ! $cdm # change or delete => not no-op
3496 && $i == $j # more than one affected range => not no-op
3497
3498 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
3499 # Further, $start and/or $end is >= r[$i]->start
3500 # The test below hence guarantees that
3501 # r[$i]->start < $start <= $end <= r[$i]->end
3502 # This means the input range is contained entirely in
3503 # the one at $i, so is a no-op
3504 && $start >= $r->[$i]->start;
3505 }
3506
3507 # Here, we know that some action will have to be taken. We have
3508 # calculated the offset and length (though adjustments may be needed)
3509 # for the splice. Now start constructing the replacement list.
3510 my @replacement;
3511 my $splice_start = $i;
3512
3513 my $extends_below;
3514 my $extends_above;
3515
3516 # See if should extend any adjacent ranges.
3517 if ($operation eq '-') { # Don't extend deletions
3518 $extends_below = $extends_above = 0;
3519 }
3520 else { # Here, should extend any adjacent ranges. See if there are
3521 # any.
3522 $extends_below = ($i > 0
3523 # can't extend unless adjacent
3524 && $r->[$i-1]->end == $start -1
3525 # can't extend unless are same standard value
3526 && $r->[$i-1]->standard_form eq $standard_form
3527 # can't extend unless share type
3528 && $r->[$i-1]->type == $type);
3529 $extends_above = ($j+1 < $range_list_size
3530 && $r->[$j+1]->start == $end +1
3531 && $r->[$j+1]->standard_form eq $standard_form
3532 && $r->[$j-1]->type == $type);
3533 }
3534 if ($extends_below && $extends_above) { # Adds to both
3535 $splice_start--; # start replace at element below
3536 $length += 2; # will replace on both sides
3537 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
3538
3539 # The result will fill in any gap, replacing both sides, and
3540 # create one large range.
3541 @replacement = Range->new($r->[$i-1]->start,
3542 $r->[$j+1]->end,
3543 Value => $value,
3544 Type => $type);
3545 }
3546 else {
3547
3548 # Here we know that the result won't just be the conglomeration of
3549 # a new range with both its adjacent neighbors. But it could
3550 # extend one of them.
3551
3552 if ($extends_below) {
3553
3554 # Here the new element adds to the one below, but not to the
3555 # one above. If inserting, and only to that one range, can
3556 # just change its ending to include the new one.
3557 if ($length == 0 && ! $cdm) {
3558 $r->[$i-1]->set_end($end);
3559 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
3560 return;
3561 }
3562 else {
3563 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
3564 $splice_start--; # start replace at element below
3565 $length++; # will replace the element below
3566 $start = $r->[$i-1]->start;
3567 }
3568 }
3569 elsif ($extends_above) {
3570
3571 # Here the new element adds to the one above, but not below.
3572 # Mirror the code above
3573 if ($length == 0 && ! $cdm) {
3574 $r->[$j+1]->set_start($start);
3575 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
3576 return;
3577 }
3578 else {
3579 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
3580 $length++; # will replace the element above
3581 $end = $r->[$j+1]->end;
3582 }
3583 }
3584
3585 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
3586
3587 # Finally, here we know there will have to be a splice.
3588 # If the change or delete affects only the highest portion of the
3589 # first affected range, the range will have to be split. The
3590 # splice will remove the whole range, but will replace it by a new
3591 # range containing just the unaffected part. So, in this case,
3592 # add to the replacement list just this unaffected portion.
3593 if (! $extends_below
3594 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
3595 {
3596 push @replacement,
3597 Range->new($r->[$i]->start,
3598 $start - 1,
3599 Value => $r->[$i]->value,
3600 Type => $r->[$i]->type);
3601 }
3602
3603 # In the case of an insert or change, but not a delete, we have to
3604 # put in the new stuff; this comes next.
3605 if ($operation eq '+') {
3606 push @replacement, Range->new($start,
3607 $end,
3608 Value => $value,
3609 Type => $type);
3610 }
3611
3612 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
3613 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
3614
3615 # And finally, if we're changing or deleting only a portion of the
3616 # highest affected range, it must be split, as the lowest one was.
3617 if (! $extends_above
3618 && $j >= 0 # Remember that j can be -1 if before first
3619 # current element
3620 && $end >= $r->[$j]->start
3621 && $end < $r->[$j]->end)
3622 {
3623 push @replacement,
3624 Range->new($end + 1,
3625 $r->[$j]->end,
3626 Value => $r->[$j]->value,
3627 Type => $r->[$j]->type);
3628 }
3629 }
3630
3631 # And do the splice, as calculated above
3632 if (main::DEBUG && $to_trace) {
3633 trace "replacing $length element(s) at $i with ";
3634 foreach my $replacement (@replacement) {
3635 trace " $replacement";
3636 }
3637 trace "Before splice:";
3638 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3639 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3640 trace "i =[", $i, "]", $r->[$i];
3641 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3642 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3643 }
3644
3645 my @return = splice @$r, $splice_start, $length, @replacement;
3646
3647 if (main::DEBUG && $to_trace) {
3648 trace "After splice:";
3649 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3650 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3651 trace "i =[", $i, "]", $r->[$i];
3652 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3653 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3654 trace "removed @return";
3655 }
3656
3657 # An actual deletion could have changed the maximum in the list.
3658 # There was no deletion if the splice didn't return something, but
3659 # otherwise recalculate it. This is done too rarely to worry about
3660 # performance.
3661 if ($operation eq '-' && @return) {
3662 $max{$addr} = $r->[-1]->end;
3663 }
3664 return @return;
3665 }
3666
3667 sub reset_each_range { # reset the iterator for each_range();
3668 my $self = shift;
3669 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3670
f998e60c 3671 no overloading;
051df77b 3672 undef $each_range_iterator{pack 'J', $self};
99870f4d
KW
3673 return;
3674 }
3675
3676 sub each_range {
3677 # Iterate over each range in a range list. Results are undefined if
3678 # the range list is changed during the iteration.
3679
3680 my $self = shift;
3681 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3682
ffe43484 3683 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3684
3685 return if $self->is_empty;
3686
3687 $each_range_iterator{$addr} = -1
3688 if ! defined $each_range_iterator{$addr};
3689 $each_range_iterator{$addr}++;
3690 return $ranges{$addr}->[$each_range_iterator{$addr}]
3691 if $each_range_iterator{$addr} < @{$ranges{$addr}};
3692 undef $each_range_iterator{$addr};
3693 return;
3694 }
3695
3696 sub count { # Returns count of code points in range list
3697 my $self = shift;
3698 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3699
ffe43484 3700 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3701
3702 my $count = 0;
3703 foreach my $range (@{$ranges{$addr}}) {
3704 $count += $range->end - $range->start + 1;
3705 }
3706 return $count;
3707 }
3708
3709 sub delete_range { # Delete a range
3710 my $self = shift;
3711 my $start = shift;
3712 my $end = shift;
3713
3714 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3715
3716 return $self->_add_delete('-', $start, $end, "");
3717 }
3718
3719 sub is_empty { # Returns boolean as to if a range list is empty
3720 my $self = shift;
3721 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3722
f998e60c 3723 no overloading;
051df77b 3724 return scalar @{$ranges{pack 'J', $self}} == 0;
99870f4d
KW
3725 }
3726
3727 sub hash {
3728 # Quickly returns a scalar suitable for separating tables into
3729 # buckets, i.e. it is a hash function of the contents of a table, so
3730 # there are relatively few conflicts.
3731
3732 my $self = shift;
3733 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3734
ffe43484 3735 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
3736
3737 # These are quickly computable. Return looks like 'min..max;count'
3738 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
3739 }
3740} # End closure for _Range_List_Base
3741
3742package Range_List;
3743use base '_Range_List_Base';
3744
3745# A Range_List is a range list for match tables; i.e. the range values are
3746# not significant. Thus a number of operations can be safely added to it,
3747# such as inversion, intersection. Note that union is also an unsafe
3748# operation when range values are cared about, and that method is in the base
3749# class, not here. But things are set up so that that method is callable only
3750# during initialization. Only in this derived class, is there an operation
3751# that combines two tables. A Range_Map can thus be used to initialize a
3752# Range_List, and its mappings will be in the list, but are not significant to
3753# this class.
3754
3755sub trace { return main::trace(@_); }
3756
3757{ # Closure
3758
3759 use overload
3760 fallback => 0,
3761 '+' => sub { my $self = shift;
3762 my $other = shift;
3763
3764 return $self->_union($other)
3765 },
3766 '&' => sub { my $self = shift;
3767 my $other = shift;
3768
3769 return $self->_intersect($other, 0);
3770 },
3771 '~' => "_invert",
3772 '-' => "_subtract",
3773 ;
3774
3775 sub _invert {
3776 # Returns a new Range_List that gives all code points not in $self.
3777
3778 my $self = shift;
3779
3780 my $new = Range_List->new;
3781
3782 # Go through each range in the table, finding the gaps between them
3783 my $max = -1; # Set so no gap before range beginning at 0
3784 for my $range ($self->ranges) {
3785 my $start = $range->start;
3786 my $end = $range->end;
3787
3788 # If there is a gap before this range, the inverse will contain
3789 # that gap.
3790 if ($start > $max + 1) {
3791 $new->add_range($max + 1, $start - 1);
3792 }
3793 $max = $end;
3794 }
3795
3796 # And finally, add the gap from the end of the table to the max
3797 # possible code point
3798 if ($max < $LAST_UNICODE_CODEPOINT) {
3799 $new->add_range($max + 1, $LAST_UNICODE_CODEPOINT);
3800 }
3801 return $new;
3802 }
3803
3804 sub _subtract {
3805 # Returns a new Range_List with the argument deleted from it. The
3806 # argument can be a single code point, a range, or something that has
3807 # a range, with the _range_list() method on it returning them
3808
3809 my $self = shift;
3810 my $other = shift;
3811 my $reversed = shift;
3812 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3813
3814 if ($reversed) {
3815 Carp::my_carp_bug("Can't cope with a "
3816 . __PACKAGE__
3817 . " being the second parameter in a '-'. Subtraction ignored.");
3818 return $self;
3819 }
3820
3821 my $new = Range_List->new(Initialize => $self);
3822
3823 if (! ref $other) { # Single code point
3824 $new->delete_range($other, $other);
3825 }
3826 elsif ($other->isa('Range')) {
3827 $new->delete_range($other->start, $other->end);
3828 }
3829 elsif ($other->can('_range_list')) {
3830 foreach my $range ($other->_range_list->ranges) {
3831 $new->delete_range($range->start, $range->end);
3832 }
3833 }
3834 else {
3835 Carp::my_carp_bug("Can't cope with a "
3836 . ref($other)
3837 . " argument to '-'. Subtraction ignored."
3838 );
3839 return $self;
3840 }
3841
3842 return $new;
3843 }
3844
3845 sub _intersect {
3846 # Returns either a boolean giving whether the two inputs' range lists
3847 # intersect (overlap), or a new Range_List containing the intersection
3848 # of the two lists. The optional final parameter being true indicates
3849 # to do the check instead of the intersection.
3850
3851 my $a_object = shift;
3852 my $b_object = shift;
3853 my $check_if_overlapping = shift;
3854 $check_if_overlapping = 0 unless defined $check_if_overlapping;
3855 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3856
3857 if (! defined $b_object) {
3858 my $message = "";
3859 $message .= $a_object->_owner_name_of if defined $a_object;
3860 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
3861 return;
3862 }
3863
3864 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
3865 # Thus the intersection could be much more simply be written:
3866 # return ~(~$a_object + ~$b_object);
3867 # But, this is slower, and when taking the inverse of a large
3868 # range_size_1 table, back when such tables were always stored that
3869 # way, it became prohibitively slow, hence the code was changed to the
3870 # below
3871
3872 if ($b_object->isa('Range')) {
3873 $b_object = Range_List->new(Initialize => $b_object,
3874 Owner => $a_object->_owner_name_of);
3875 }
3876 $b_object = $b_object->_range_list if $b_object->can('_range_list');
3877
3878 my @a_ranges = $a_object->ranges;
3879 my @b_ranges = $b_object->ranges;
3880
3881 #local $to_trace = 1 if main::DEBUG;
3882 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
3883
3884 # Start with the first range in each list
3885 my $a_i = 0;
3886 my $range_a = $a_ranges[$a_i];
3887 my $b_i = 0;
3888 my $range_b = $b_ranges[$b_i];
3889
3890 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
3891 if ! $check_if_overlapping;
3892
3893 # If either list is empty, there is no intersection and no overlap
3894 if (! defined $range_a || ! defined $range_b) {
3895 return $check_if_overlapping ? 0 : $new;
3896 }
3897 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3898
3899 # Otherwise, must calculate the intersection/overlap. Start with the
3900 # very first code point in each list
3901 my $a = $range_a->start;
3902 my $b = $range_b->start;
3903
3904 # Loop through all the ranges of each list; in each iteration, $a and
3905 # $b are the current code points in their respective lists
3906 while (1) {
3907
3908 # If $a and $b are the same code point, ...
3909 if ($a == $b) {
3910
3911 # it means the lists overlap. If just checking for overlap
3912 # know the answer now,
3913 return 1 if $check_if_overlapping;
3914
3915 # The intersection includes this code point plus anything else
3916 # common to both current ranges.
3917 my $start = $a;
3918 my $end = main::min($range_a->end, $range_b->end);
3919 if (! $check_if_overlapping) {
3920 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
3921 $new->add_range($start, $end);
3922 }
3923
3924 # Skip ahead to the end of the current intersect
3925 $a = $b = $end;
3926
3927 # If the current intersect ends at the end of either range (as
3928 # it must for at least one of them), the next possible one
3929 # will be the beginning code point in it's list's next range.
3930 if ($a == $range_a->end) {
3931 $range_a = $a_ranges[++$a_i];
3932 last unless defined $range_a;
3933 $a = $range_a->start;
3934 }
3935 if ($b == $range_b->end) {
3936 $range_b = $b_ranges[++$b_i];
3937 last unless defined $range_b;
3938 $b = $range_b->start;
3939 }
3940
3941 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
3942 }
3943 elsif ($a < $b) {
3944
3945 # Not equal, but if the range containing $a encompasses $b,
3946 # change $a to be the middle of the range where it does equal
3947 # $b, so the next iteration will get the intersection
3948 if ($range_a->end >= $b) {
3949 $a = $b;
3950 }
3951 else {
3952
3953 # Here, the current range containing $a is entirely below
3954 # $b. Go try to find a range that could contain $b.
3955 $a_i = $a_object->_search_ranges($b);
3956
3957 # If no range found, quit.
3958 last unless defined $a_i;
3959
3960 # The search returns $a_i, such that
3961 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
3962 # Set $a to the beginning of this new range, and repeat.
3963 $range_a = $a_ranges[$a_i];
3964 $a = $range_a->start;
3965 }
3966 }
3967 else { # Here, $b < $a.
3968
3969 # Mirror image code to the leg just above
3970 if ($range_b->end >= $a) {
3971 $b = $a;
3972 }
3973 else {
3974 $b_i = $b_object->_search_ranges($a);
3975 last unless defined $b_i;
3976 $range_b = $b_ranges[$b_i];
3977 $b = $range_b->start;
3978 }
3979 }
3980 } # End of looping through ranges.
3981
3982 # Intersection fully computed, or now know that there is no overlap
3983 return $check_if_overlapping ? 0 : $new;
3984 }
3985
3986 sub overlaps {
3987 # Returns boolean giving whether the two arguments overlap somewhere
3988
3989 my $self = shift;
3990 my $other = shift;
3991 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3992
3993 return $self->_intersect($other, 1);
3994 }
3995
3996 sub add_range {
3997 # Add a range to the list.
3998
3999 my $self = shift;
4000 my $start = shift;
4001 my $end = shift;
4002 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4003
4004 return $self->_add_delete('+', $start, $end, "");
4005 }
4006
99870f4d
KW
4007 sub is_code_point_usable {
4008 # This used only for making the test script. See if the input
4009 # proposed trial code point is one that Perl will handle. If second
4010 # parameter is 0, it won't select some code points for various
4011 # reasons, noted below.
4012
4013 my $code = shift;
4014 my $try_hard = shift;
4015 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4016
4017 return 0 if $code < 0; # Never use a negative
4018
99870f4d
KW
4019 # shun null. I'm (khw) not sure why this was done, but NULL would be
4020 # the character very frequently used.
4021 return $try_hard if $code == 0x0000;
4022
4023 return 0 if $try_hard; # XXX Temporary until fix utf8.c
4024
4025 # shun non-character code points.
4026 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4027 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4028
4029 return $try_hard if $code > $LAST_UNICODE_CODEPOINT; # keep in range
4030 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4031
4032 return 1;
4033 }
4034
4035 sub get_valid_code_point {
4036 # Return a code point that's part of the range list. Returns nothing
4037 # if the table is empty or we can't find a suitable code point. This
4038 # used only for making the test script.
4039
4040 my $self = shift;
4041 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4042
ffe43484 4043 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4044
4045 # On first pass, don't choose less desirable code points; if no good
4046 # one is found, repeat, allowing a less desirable one to be selected.
4047 for my $try_hard (0, 1) {
4048
4049 # Look through all the ranges for a usable code point.
4050 for my $set ($self->ranges) {
4051
4052 # Try the edge cases first, starting with the end point of the
4053 # range.
4054 my $end = $set->end;
4055 return $end if is_code_point_usable($end, $try_hard);
4056
4057 # End point didn't, work. Start at the beginning and try
4058 # every one until find one that does work.
4059 for my $trial ($set->start .. $end - 1) {
4060 return $trial if is_code_point_usable($trial, $try_hard);
4061 }
4062 }
4063 }
4064 return (); # If none found, give up.
4065 }
4066
4067 sub get_invalid_code_point {
4068 # Return a code point that's not part of the table. Returns nothing
4069 # if the table covers all code points or a suitable code point can't
4070 # be found. This used only for making the test script.
4071
4072 my $self = shift;
4073 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4074
4075 # Just find a valid code point of the inverse, if any.
4076 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4077 }
4078} # end closure for Range_List
4079
4080package Range_Map;
4081use base '_Range_List_Base';
4082
4083# A Range_Map is a range list in which the range values (called maps) are
4084# significant, and hence shouldn't be manipulated by our other code, which
4085# could be ambiguous or lose things. For example, in taking the union of two
4086# lists, which share code points, but which have differing values, which one
4087# has precedence in the union?
4088# It turns out that these operations aren't really necessary for map tables,
4089# and so this class was created to make sure they aren't accidentally
4090# applied to them.
4091
4092{ # Closure
4093
4094 sub add_map {
4095 # Add a range containing a mapping value to the list
4096
4097 my $self = shift;
4098 # Rest of parameters passed on
4099
4100 return $self->_add_delete('+', @_);
4101 }
4102
4103 sub add_duplicate {
4104 # Adds entry to a range list which can duplicate an existing entry
4105
4106 my $self = shift;
4107 my $code_point = shift;
4108 my $value = shift;
4109 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4110
4111 return $self->add_map($code_point, $code_point,
4112 $value, Replace => $MULTIPLE);
4113 }
4114} # End of closure for package Range_Map
4115
4116package _Base_Table;
4117
4118# A table is the basic data structure that gets written out into a file for
4119# use by the Perl core. This is the abstract base class implementing the
4120# common elements from the derived ones. A list of the methods to be
4121# furnished by an implementing class is just after the constructor.
4122
4123sub standardize { return main::standardize($_[0]); }
4124sub trace { return main::trace(@_); }
4125
4126{ # Closure
4127
4128 main::setup_package();
4129
4130 my %range_list;
4131 # Object containing the ranges of the table.
4132 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4133
4134 my %full_name;
4135 # The full table name.
4136 main::set_access('full_name', \%full_name, 'r');
4137
4138 my %name;
4139 # The table name, almost always shorter
4140 main::set_access('name', \%name, 'r');
4141
4142 my %short_name;
4143 # The shortest of all the aliases for this table, with underscores removed
4144 main::set_access('short_name', \%short_name);
4145
4146 my %nominal_short_name_length;
4147 # The length of short_name before removing underscores
4148 main::set_access('nominal_short_name_length',
4149 \%nominal_short_name_length);
4150
23e33b60
KW
4151 my %complete_name;
4152 # The complete name, including property.
4153 main::set_access('complete_name', \%complete_name, 'r');
4154
99870f4d
KW
4155 my %property;
4156 # Parent property this table is attached to.
4157 main::set_access('property', \%property, 'r');
4158
4159 my %aliases;
4160 # Ordered list of aliases of the table's name. The first ones in the list
4161 # are output first in comments
4162 main::set_access('aliases', \%aliases, 'readable_array');
4163
4164 my %comment;
4165 # A comment associated with the table for human readers of the files
4166 main::set_access('comment', \%comment, 's');
4167
4168 my %description;
4169 # A comment giving a short description of the table's meaning for human
4170 # readers of the files.
4171 main::set_access('description', \%description, 'readable_array');
4172
4173 my %note;
4174 # A comment giving a short note about the table for human readers of the
4175 # files.
4176 main::set_access('note', \%note, 'readable_array');
4177
4178 my %internal_only;
4179 # Boolean; if set means any file that contains this table is marked as for
4180 # internal-only use.
4181 main::set_access('internal_only', \%internal_only);
4182
4183 my %find_table_from_alias;
4184 # The parent property passes this pointer to a hash which this class adds
4185 # all its aliases to, so that the parent can quickly take an alias and
4186 # find this table.
4187 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4188
4189 my %locked;
4190 # After this table is made equivalent to another one; we shouldn't go
4191 # changing the contents because that could mean it's no longer equivalent
4192 main::set_access('locked', \%locked, 'r');
4193
4194 my %file_path;
4195 # This gives the final path to the file containing the table. Each
4196 # directory in the path is an element in the array
4197 main::set_access('file_path', \%file_path, 'readable_array');
4198
4199 my %status;
4200 # What is the table's status, normal, $OBSOLETE, etc. Enum
4201 main::set_access('status', \%status, 'r');
4202
4203 my %status_info;
4204 # A comment about its being obsolete, or whatever non normal status it has
4205 main::set_access('status_info', \%status_info, 'r');
4206
4207 my %range_size_1;
4208 # Is the table to be output with each range only a single code point?
4209 # This is done to avoid breaking existing code that may have come to rely
4210 # on this behavior in previous versions of this program.)
4211 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4212
4213 my %perl_extension;
4214 # A boolean set iff this table is a Perl extension to the Unicode
4215 # standard.
4216 main::set_access('perl_extension', \%perl_extension, 'r');
4217
0c07e538
KW
4218 my %output_range_counts;
4219 # A boolean set iff this table is to have comments written in the
4220 # output file that contain the number of code points in the range.
4221 # The constructor can override the global flag of the same name.
4222 main::set_access('output_range_counts', \%output_range_counts, 'r');
4223
f5817e0a
KW
4224 my %format;
4225 # The format of the entries of the table. This is calculated from the
4226 # data in the table (or passed in the constructor). This is an enum e.g.,
4227 # $STRING_FORMAT
4228 main::set_access('format', \%format, 'r', 'p_s');
4229
99870f4d
KW
4230 sub new {
4231 # All arguments are key => value pairs, which you can see below, most
4232 # of which match fields documented above. Otherwise: Pod_Entry,
4233 # Externally_Ok, and Fuzzy apply to the names of the table, and are
4234 # documented in the Alias package
4235
4236 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4237
4238 my $class = shift;
4239
4240 my $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 4241 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4242
4243 my %args = @_;
4244
4245 $name{$addr} = delete $args{'Name'};
4246 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4247 $full_name{$addr} = delete $args{'Full_Name'};
23e33b60
KW
4248 my $complete_name = $complete_name{$addr}
4249 = delete $args{'Complete_Name'};
f5817e0a 4250 $format{$addr} = delete $args{'Format'};
99870f4d 4251 $internal_only{$addr} = delete $args{'Internal_Only_Warning'} || 0;
0c07e538 4252 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
99870f4d
KW
4253 $property{$addr} = delete $args{'_Property'};
4254 $range_list{$addr} = delete $args{'_Range_List'};
4255 $status{$addr} = delete $args{'Status'} || $NORMAL;
4256 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4257 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
9ef2b94f 4258 $range_size_1{$addr} = 1 if $output_names; # Make sure 1 name per line
99870f4d
KW
4259
4260 my $description = delete $args{'Description'};
4261 my $externally_ok = delete $args{'Externally_Ok'};
4262 my $loose_match = delete $args{'Fuzzy'};
4263 my $note = delete $args{'Note'};
4264 my $make_pod_entry = delete $args{'Pod_Entry'};
37e2e78e 4265 my $perl_extension = delete $args{'Perl_Extension'};
99870f4d
KW
4266
4267 # Shouldn't have any left over
4268 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4269
4270 # Can't use || above because conceivably the name could be 0, and
4271 # can't use // operator in case this program gets used in Perl 5.8
4272 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
0c07e538
KW
4273 $output_range_counts{$addr} = $output_range_counts if
4274 ! defined $output_range_counts{$addr};
99870f4d
KW
4275
4276 $aliases{$addr} = [ ];
4277 $comment{$addr} = [ ];
4278 $description{$addr} = [ ];
4279 $note{$addr} = [ ];
4280 $file_path{$addr} = [ ];
4281 $locked{$addr} = "";
4282
4283 push @{$description{$addr}}, $description if $description;
4284 push @{$note{$addr}}, $note if $note;
4285
37e2e78e
KW
4286 if ($status{$addr} eq $PLACEHOLDER) {
4287
4288 # A placeholder table doesn't get documented, is a perl extension,
4289 # and quite likely will be empty
4290 $make_pod_entry = 0 if ! defined $make_pod_entry;
4291 $perl_extension = 1 if ! defined $perl_extension;
4292 push @tables_that_may_be_empty, $complete_name{$addr};
4293 }
4294 elsif (! $status{$addr}) {
4295
4296 # If hasn't set its status already, see if it is on one of the
4297 # lists of properties or tables that have particular statuses; if
4298 # not, is normal. The lists are prioritized so the most serious
4299 # ones are checked first
ec11e5f4
KW
4300 if (exists $why_suppressed{$complete_name}
4301 # Don't suppress if overriden
4302 && ! grep { $_ eq $complete_name{$addr} }
4303 @output_mapped_properties)
4304 {
99870f4d
KW
4305 $status{$addr} = $SUPPRESSED;
4306 }
4307 elsif (exists $why_deprecated{$complete_name}) {
4308 $status{$addr} = $DEPRECATED;
4309 }
4310 elsif (exists $why_stabilized{$complete_name}) {
4311 $status{$addr} = $STABILIZED;
4312 }
4313 elsif (exists $why_obsolete{$complete_name}) {
4314 $status{$addr} = $OBSOLETE;
4315 }
4316
4317 # Existence above doesn't necessarily mean there is a message
4318 # associated with it. Use the most serious message.
4319 if ($status{$addr}) {
4320 if ($why_suppressed{$complete_name}) {
4321 $status_info{$addr}
4322 = $why_suppressed{$complete_name};
4323 }
4324 elsif ($why_deprecated{$complete_name}) {
4325 $status_info{$addr}
4326 = $why_deprecated{$complete_name};
4327 }
4328 elsif ($why_stabilized{$complete_name}) {
4329 $status_info{$addr}
4330 = $why_stabilized{$complete_name};
4331 }
4332 elsif ($why_obsolete{$complete_name}) {
4333 $status_info{$addr}
4334 = $why_obsolete{$complete_name};
4335 }
4336 }
4337 }
4338
37e2e78e
KW
4339 $perl_extension{$addr} = $perl_extension || 0;
4340
99870f4d
KW
4341 # By convention what typically gets printed only or first is what's
4342 # first in the list, so put the full name there for good output
4343 # clarity. Other routines rely on the full name being first on the
4344 # list
4345 $self->add_alias($full_name{$addr},
4346 Externally_Ok => $externally_ok,
4347 Fuzzy => $loose_match,
4348 Pod_Entry => $make_pod_entry,
4349 Status => $status{$addr},
4350 );
4351
4352 # Then comes the other name, if meaningfully different.
4353 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4354 $self->add_alias($name{$addr},
4355 Externally_Ok => $externally_ok,
4356 Fuzzy => $loose_match,
4357 Pod_Entry => $make_pod_entry,
4358 Status => $status{$addr},
4359 );
4360 }
4361
4362 return $self;
4363 }
4364
4365 # Here are the methods that are required to be defined by any derived
4366 # class
ea25a9b2 4367 for my $sub (qw(
99870f4d 4368 append_to_body
99870f4d 4369 pre_body
ea25a9b2 4370 ))
99870f4d
KW
4371 # append_to_body and pre_body are called in the write() method
4372 # to add stuff after the main body of the table, but before
4373 # its close; and to prepend stuff before the beginning of the
4374 # table.
99870f4d
KW
4375 {
4376 no strict "refs";
4377 *$sub = sub {
4378 Carp::my_carp_bug( __LINE__
4379 . ": Must create method '$sub()' for "
4380 . ref shift);
4381 return;
4382 }
4383 }
4384
4385 use overload
4386 fallback => 0,
4387 "." => \&main::_operator_dot,
4388 '!=' => \&main::_operator_not_equal,
4389 '==' => \&main::_operator_equal,
4390 ;
4391
4392 sub ranges {
4393 # Returns the array of ranges associated with this table.
4394
f998e60c 4395 no overloading;
051df77b 4396 return $range_list{pack 'J', shift}->ranges;
99870f4d
KW
4397 }
4398
4399 sub add_alias {
4400 # Add a synonym for this table.
4401
4402 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
4403
4404 my $self = shift;
4405 my $name = shift; # The name to add.
4406 my $pointer = shift; # What the alias hash should point to. For
4407 # map tables, this is the parent property;
4408 # for match tables, it is the table itself.
4409
4410 my %args = @_;
4411 my $loose_match = delete $args{'Fuzzy'};
4412
4413 my $make_pod_entry = delete $args{'Pod_Entry'};
4414 $make_pod_entry = $YES unless defined $make_pod_entry;
4415
4416 my $externally_ok = delete $args{'Externally_Ok'};
4417 $externally_ok = 1 unless defined $externally_ok;
4418
4419 my $status = delete $args{'Status'};
4420 $status = $NORMAL unless defined $status;
4421
4422 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4423
4424 # Capitalize the first letter of the alias unless it is one of the CJK
4425 # ones which specifically begins with a lower 'k'. Do this because
4426 # Unicode has varied whether they capitalize first letters or not, and
4427 # have later changed their minds and capitalized them, but not the
4428 # other way around. So do it always and avoid changes from release to
4429 # release
4430 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
4431
ffe43484 4432 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4433
4434 # Figure out if should be loosely matched if not already specified.
4435 if (! defined $loose_match) {
4436
4437 # Is a loose_match if isn't null, and doesn't begin with an
4438 # underscore and isn't just a number
4439 if ($name ne ""
4440 && substr($name, 0, 1) ne '_'
4441 && $name !~ qr{^[0-9_.+-/]+$})
4442 {
4443 $loose_match = 1;
4444 }
4445 else {
4446 $loose_match = 0;
4447 }
4448 }
4449
4450 # If this alias has already been defined, do nothing.
4451 return if defined $find_table_from_alias{$addr}->{$name};
4452
4453 # That includes if it is standardly equivalent to an existing alias,
4454 # in which case, add this name to the list, so won't have to search
4455 # for it again.
4456 my $standard_name = main::standardize($name);
4457 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
4458 $find_table_from_alias{$addr}->{$name}
4459 = $find_table_from_alias{$addr}->{$standard_name};
4460 return;
4461 }
4462
4463 # Set the index hash for this alias for future quick reference.
4464 $find_table_from_alias{$addr}->{$name} = $pointer;
4465 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
4466 local $to_trace = 0 if main::DEBUG;
4467 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
4468 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
4469
4470
4471 # Put the new alias at the end of the list of aliases unless the final
4472 # element begins with an underscore (meaning it is for internal perl
4473 # use) or is all numeric, in which case, put the new one before that
4474 # one. This floats any all-numeric or underscore-beginning aliases to
4475 # the end. This is done so that they are listed last in output lists,
4476 # to encourage the user to use a better name (either more descriptive
4477 # or not an internal-only one) instead. This ordering is relied on
4478 # implicitly elsewhere in this program, like in short_name()
4479 my $list = $aliases{$addr};
4480 my $insert_position = (@$list == 0
4481 || (substr($list->[-1]->name, 0, 1) ne '_'
4482 && $list->[-1]->name =~ /\D/))
4483 ? @$list
4484 : @$list - 1;
4485 splice @$list,
4486 $insert_position,
4487 0,
4488 Alias->new($name, $loose_match, $make_pod_entry,
4489 $externally_ok, $status);
4490
4491 # This name may be shorter than any existing ones, so clear the cache
4492 # of the shortest, so will have to be recalculated.
f998e60c 4493 no overloading;
051df77b 4494 undef $short_name{pack 'J', $self};
99870f4d
KW
4495 return;
4496 }
4497
4498 sub short_name {
4499 # Returns a name suitable for use as the base part of a file name.
4500 # That is, shorter wins. It can return undef if there is no suitable
4501 # name. The name has all non-essential underscores removed.
4502
4503 # The optional second parameter is a reference to a scalar in which
4504 # this routine will store the length the returned name had before the
4505 # underscores were removed, or undef if the return is undef.
4506
4507 # The shortest name can change if new aliases are added. So using
4508 # this should be deferred until after all these are added. The code
4509 # that does that should clear this one's cache.
4510 # Any name with alphabetics is preferred over an all numeric one, even
4511 # if longer.
4512
4513 my $self = shift;
4514 my $nominal_length_ptr = shift;
4515 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4516
ffe43484 4517 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4518
4519 # For efficiency, don't recalculate, but this means that adding new
4520 # aliases could change what the shortest is, so the code that does
4521 # that needs to undef this.
4522 if (defined $short_name{$addr}) {
4523 if ($nominal_length_ptr) {
4524 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4525 }
4526 return $short_name{$addr};
4527 }
4528
4529 # Look at each alias
4530 foreach my $alias ($self->aliases()) {
4531
4532 # Don't use an alias that isn't ok to use for an external name.
4533 next if ! $alias->externally_ok;
4534
4535 my $name = main::Standardize($alias->name);
4536 trace $self, $name if main::DEBUG && $to_trace;
4537
4538 # Take the first one, or a shorter one that isn't numeric. This
4539 # relies on numeric aliases always being last in the array
4540 # returned by aliases(). Any alpha one will have precedence.
4541 if (! defined $short_name{$addr}
4542 || ($name =~ /\D/
4543 && length($name) < length($short_name{$addr})))
4544 {
4545 # Remove interior underscores.
4546 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
4547
4548 $nominal_short_name_length{$addr} = length $name;
4549 }
4550 }
4551
4552 # If no suitable external name return undef
4553 if (! defined $short_name{$addr}) {
4554 $$nominal_length_ptr = undef if $nominal_length_ptr;
4555 return;
4556 }
4557
4558 # Don't allow a null external name.
4559 if ($short_name{$addr} eq "") {
4560 $short_name{$addr} = '_';
4561 $nominal_short_name_length{$addr} = 1;
4562 }
4563
4564 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
4565
4566 if ($nominal_length_ptr) {
4567 $$nominal_length_ptr = $nominal_short_name_length{$addr};
4568 }
4569 return $short_name{$addr};
4570 }
4571
4572 sub external_name {
4573 # Returns the external name that this table should be known by. This
4574 # is usually the short_name, but not if the short_name is undefined.
4575
4576 my $self = shift;
4577 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4578
4579 my $short = $self->short_name;
4580 return $short if defined $short;
4581
4582 return '_';
4583 }
4584
4585 sub add_description { # Adds the parameter as a short description.
4586
4587 my $self = shift;
4588 my $description = shift;
4589 chomp $description;
4590 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4591
f998e60c 4592 no overloading;
051df77b 4593 push @{$description{pack 'J', $self}}, $description;
99870f4d
KW
4594
4595 return;
4596 }
4597
4598 sub add_note { # Adds the parameter as a short note.
4599
4600 my $self = shift;
4601 my $note = shift;
4602 chomp $note;
4603 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4604
f998e60c 4605 no overloading;
051df77b 4606 push @{$note{pack 'J', $self}}, $note;
99870f4d
KW
4607
4608 return;
4609 }
4610
4611 sub add_comment { # Adds the parameter as a comment.
4612
4613 my $self = shift;
4614 my $comment = shift;
4615 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4616
4617 chomp $comment;
f998e60c
KW
4618
4619 no overloading;
051df77b 4620 push @{$comment{pack 'J', $self}}, $comment;
99870f4d
KW
4621
4622 return;
4623 }
4624
4625 sub comment {
4626 # Return the current comment for this table. If called in list
4627 # context, returns the array of comments. In scalar, returns a string
4628 # of each element joined together with a period ending each.
4629
4630 my $self = shift;
4631 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4632
ffe43484 4633 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 4634 my @list = @{$comment{$addr}};
99870f4d
KW
4635 return @list if wantarray;
4636 my $return = "";
4637 foreach my $sentence (@list) {
4638 $return .= '. ' if $return;
4639 $return .= $sentence;
4640 $return =~ s/\.$//;
4641 }
4642 $return .= '.' if $return;
4643 return $return;
4644 }
4645
4646 sub initialize {
4647 # Initialize the table with the argument which is any valid
4648 # initialization for range lists.
4649
4650 my $self = shift;
ffe43484 4651 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4652 my $initialization = shift;
4653 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4654
4655 # Replace the current range list with a new one of the same exact
4656 # type.
f998e60c
KW
4657 my $class = ref $range_list{$addr};
4658 $range_list{$addr} = $class->new(Owner => $self,
99870f4d
KW
4659 Initialize => $initialization);
4660 return;
4661
4662 }
4663
4664 sub header {
4665 # The header that is output for the table in the file it is written
4666 # in.
4667
4668 my $self = shift;
4669 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4670
4671 my $return = "";
4672 $return .= $DEVELOPMENT_ONLY if $compare_versions;
4673 $return .= $HEADER;
f998e60c 4674 no overloading;
051df77b 4675 $return .= $INTERNAL_ONLY if $internal_only{pack 'J', $self};
99870f4d
KW
4676 return $return;
4677 }
4678
4679 sub write {
4680 # Write a representation of the table to its file.
4681
4682 my $self = shift;
4683 my $tab_stops = shift; # The number of tab stops over to put any
4684 # comment.
4685 my $suppress_value = shift; # Optional, if the value associated with
4686 # a range equals this one, don't write
4687 # the range
4688 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4689
ffe43484 4690 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4691
4692 # Start with the header
4693 my @OUT = $self->header;
4694
4695 # Then the comments
4696 push @OUT, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
4697 if $comment{$addr};
4698
4699 # Then any pre-body stuff.
4700 my $pre_body = $self->pre_body;
4701 push @OUT, $pre_body, "\n" if $pre_body;
4702
4703 # The main body looks like a 'here' document
4704 push @OUT, "return <<'END';\n";
4705
4706 if ($range_list{$addr}->is_empty) {
4707
4708 # This is a kludge for empty tables to silence a warning in
4709 # utf8.c, which can't really deal with empty tables, but it can
4710 # deal with a table that matches nothing, as the inverse of 'Any'
4711 # does.
4712 push @OUT, "!utf8::IsAny\n";
4713 }
4714 else {
4715 my $range_size_1 = $range_size_1{$addr};
4716
4717 # Output each range as part of the here document.
4718 for my $set ($range_list{$addr}->ranges) {
4719 my $start = $set->start;
4720 my $end = $set->end;
4721 my $value = $set->value;
4722
4723 # Don't output ranges whose value is the one to suppress
4724 next if defined $suppress_value && $value eq $suppress_value;
4725
4726 # If has or wants a single point range output
4727 if ($start == $end || $range_size_1) {
b1c167a3
KW
4728 if (ref $range_size_1 eq 'CODE') {
4729 for my $i ($start .. $end) {
4730 push @OUT, &$range_size_1($i, $value);
4731 }
4732 }
4733 else {
4734 for my $i ($start .. $end) {
4735 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
4736 if ($output_names) {
4737 if (! defined $viacode[$i]) {
4738 $viacode[$i] =
4739 Property::property_ref('Perl_Charnames')
4740 ->value_of($i)
4741 || "";
4742 }
4743 $OUT[-1] =~ s/\n/\t# $viacode[$i]\n/;
9ef2b94f 4744 }
9ef2b94f 4745 }
99870f4d
KW
4746 }
4747 }
4748 else {
4749 push @OUT, sprintf "%04X\t%04X\t%s", $start, $end, $value;
4750
4751 # Add a comment with the size of the range, if requested.
4752 # Expand Tabs to make sure they all start in the same
4753 # column, and then unexpand to use mostly tabs.
0c07e538 4754 if (! $output_range_counts{$addr}) {
99870f4d
KW
4755 $OUT[-1] .= "\n";
4756 }
4757 else {
4758 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
4759 my $count = main::clarify_number($end - $start + 1);
4760 use integer;
4761
4762 my $width = $tab_stops * 8 - 1;
4763 $OUT[-1] = sprintf("%-*s # [%s]\n",
4764 $width,
4765 $OUT[-1],
4766 $count);
4767 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
4768 }
4769 }
4770 } # End of loop through all the table's ranges
4771 }
4772
4773 # Add anything that goes after the main body, but within the here
4774 # document,
4775 my $append_to_body = $self->append_to_body;
4776 push @OUT, $append_to_body if $append_to_body;
4777
4778 # And finish the here document.
4779 push @OUT, "END\n";
4780
4781 # All these files have a .pl suffix
4782 $file_path{$addr}->[-1] .= '.pl';
4783
4784 main::write($file_path{$addr}, \@OUT);
4785 return;
4786 }
4787
4788 sub set_status { # Set the table's status
4789 my $self = shift;
4790 my $status = shift; # The status enum value
4791 my $info = shift; # Any message associated with it.
4792 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4793
ffe43484 4794 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4795
4796 $status{$addr} = $status;
4797 $status_info{$addr} = $info;
4798 return;
4799 }
4800
4801 sub lock {
4802 # Don't allow changes to the table from now on. This stores a stack
4803 # trace of where it was called, so that later attempts to modify it
4804 # can immediately show where it got locked.
4805
4806 my $self = shift;
4807 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4808
ffe43484 4809 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4810
4811 $locked{$addr} = "";
4812
4813 my $line = (caller(0))[2];
4814 my $i = 1;
4815
4816 # Accumulate the stack trace
4817 while (1) {
4818 my ($pkg, $file, $caller_line, $caller) = caller $i++;
4819
4820 last unless defined $caller;
4821
4822 $locked{$addr} .= " called from $caller() at line $line\n";
4823 $line = $caller_line;
4824 }
4825 $locked{$addr} .= " called from main at line $line\n";
4826
4827 return;
4828 }
4829
4830 sub carp_if_locked {
4831 # Return whether a table is locked or not, and, by the way, complain
4832 # if is locked
4833
4834 my $self = shift;
4835 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4836
ffe43484 4837 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4838
4839 return 0 if ! $locked{$addr};
4840 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
4841 return 1;
4842 }
4843
4844 sub set_file_path { # Set the final directory path for this table
4845 my $self = shift;
4846 # Rest of parameters passed on
4847
f998e60c 4848 no overloading;
051df77b 4849 @{$file_path{pack 'J', $self}} = @_;
99870f4d
KW
4850 return
4851 }
4852
4853 # Accessors for the range list stored in this table. First for
4854 # unconditional
ea25a9b2 4855 for my $sub (qw(
2f7a8815 4856 containing_range
99870f4d
KW
4857 contains
4858 count
4859 each_range
4860 hash
4861 is_empty
4862 max
4863 min
4864 range_count
4865 reset_each_range
0a9dbafc 4866 type_of
99870f4d 4867 value_of
ea25a9b2 4868 ))
99870f4d
KW
4869 {
4870 no strict "refs";
4871 *$sub = sub {
4872 use strict "refs";
4873 my $self = shift;
f998e60c 4874 no overloading;
051df77b 4875 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
4876 }
4877 }
4878
4879 # Then for ones that should fail if locked
ea25a9b2 4880 for my $sub (qw(
99870f4d 4881 delete_range
ea25a9b2 4882 ))
99870f4d
KW
4883 {
4884 no strict "refs";
4885 *$sub = sub {
4886 use strict "refs";
4887 my $self = shift;
4888
4889 return if $self->carp_if_locked;
f998e60c 4890 no overloading;
051df77b 4891 return $range_list{pack 'J', $self}->$sub(@_);
99870f4d
KW
4892 }
4893 }
4894
4895} # End closure
4896
4897package Map_Table;
4898use base '_Base_Table';
4899
4900# A Map Table is a table that contains the mappings from code points to
4901# values. There are two weird cases:
4902# 1) Anomalous entries are ones that aren't maps of ranges of code points, but
4903# are written in the table's file at the end of the table nonetheless. It
4904# requires specially constructed code to handle these; utf8.c can not read
4905# these in, so they should not go in $map_directory. As of this writing,
4906# the only case that these happen is for named sequences used in
4907# charnames.pm. But this code doesn't enforce any syntax on these, so
4908# something else could come along that uses it.
4909# 2) Specials are anything that doesn't fit syntactically into the body of the
4910# table. The ranges for these have a map type of non-zero. The code below
4911# knows about and handles each possible type. In most cases, these are
4912# written as part of the header.
4913#
4914# A map table deliberately can't be manipulated at will unlike match tables.
4915# This is because of the ambiguities having to do with what to do with
4916# overlapping code points. And there just isn't a need for those things;
4917# what one wants to do is just query, add, replace, or delete mappings, plus
4918# write the final result.
4919# However, there is a method to get the list of possible ranges that aren't in
4920# this table to use for defaulting missing code point mappings. And,
4921# map_add_or_replace_non_nulls() does allow one to add another table to this
4922# one, but it is clearly very specialized, and defined that the other's
4923# non-null values replace this one's if there is any overlap.
4924
4925sub trace { return main::trace(@_); }
4926
4927{ # Closure
4928
4929 main::setup_package();
4930
4931 my %default_map;
4932 # Many input files omit some entries; this gives what the mapping for the
4933 # missing entries should be
4934 main::set_access('default_map', \%default_map, 'r');
4935
4936 my %anomalous_entries;
4937 # Things that go in the body of the table which don't fit the normal
4938 # scheme of things, like having a range. Not much can be done with these
4939 # once there except to output them. This was created to handle named
4940 # sequences.
4941 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
4942 main::set_access('anomalous_entries', # Append singular, read plural
4943 \%anomalous_entries,
4944 'readable_array');
4945
99870f4d
KW
4946 my %core_access;
4947 # This is a string, solely for documentation, indicating how one can get
4948 # access to this property via the Perl core.
4949 main::set_access('core_access', \%core_access, 'r', 's');
4950
4951 my %has_specials;
4952 # Boolean set when non-zero map-type ranges are added to this table,
4953 # which happens in only a few tables. This is purely for performance, to
4954 # avoid having to search through every table upon output, so if all the
4955 # non-zero maps got deleted before output, this would remain set, and the
4956 # only penalty would be performance. Currently, most map tables that get
4957 # output have specials in them, so this doesn't help that much anyway.
4958 main::set_access('has_specials', \%has_specials);
4959
4960 my %to_output_map;
4961 # Boolean as to whether or not to write out this map table
4962 main::set_access('to_output_map', \%to_output_map, 's');
4963
4964
4965 sub new {
4966 my $class = shift;
4967 my $name = shift;
4968
4969 my %args = @_;
4970
4971 # Optional initialization data for the table.
4972 my $initialize = delete $args{'Initialize'};
4973
4974 my $core_access = delete $args{'Core_Access'};
4975 my $default_map = delete $args{'Default_Map'};
99870f4d 4976 my $property = delete $args{'_Property'};
23e33b60 4977 my $full_name = delete $args{'Full_Name'};
99870f4d
KW
4978 # Rest of parameters passed on
4979
4980 my $range_list = Range_Map->new(Owner => $property);
4981
4982 my $self = $class->SUPER::new(
4983 Name => $name,
23e33b60
KW
4984 Complete_Name => $full_name,
4985 Full_Name => $full_name,
99870f4d
KW
4986 _Property => $property,
4987 _Range_List => $range_list,
4988 %args);
4989
ffe43484 4990 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
4991
4992 $anomalous_entries{$addr} = [];
4993 $core_access{$addr} = $core_access;
4994 $default_map{$addr} = $default_map;
99870f4d
KW
4995
4996 $self->initialize($initialize) if defined $initialize;
4997
4998 return $self;
4999 }
5000
5001 use overload
5002 fallback => 0,
5003 qw("") => "_operator_stringify",
5004 ;
5005
5006 sub _operator_stringify {
5007 my $self = shift;
5008
5009 my $name = $self->property->full_name;
5010 $name = '""' if $name eq "";
5011 return "Map table for Property '$name'";
5012 }
5013
99870f4d
KW
5014 sub add_alias {
5015 # Add a synonym for this table (which means the property itself)
5016 my $self = shift;
5017 my $name = shift;
5018 # Rest of parameters passed on.
5019
5020 $self->SUPER::add_alias($name, $self->property, @_);
5021 return;
5022 }
5023
5024 sub add_map {
5025 # Add a range of code points to the list of specially-handled code
5026 # points. $MULTI_CP is assumed if the type of special is not passed
5027 # in.
5028
5029 my $self = shift;
5030 my $lower = shift;
5031 my $upper = shift;
5032 my $string = shift;
5033 my %args = @_;
5034
5035 my $type = delete $args{'Type'} || 0;
5036 # Rest of parameters passed on
5037
5038 # Can't change the table if locked.
5039 return if $self->carp_if_locked;
5040
ffe43484 5041 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5042
5043 $has_specials{$addr} = 1 if $type;
5044
5045 $self->_range_list->add_map($lower, $upper,
5046 $string,
5047 @_,
5048 Type => $type);
5049 return;
5050 }
5051
5052 sub append_to_body {
5053 # Adds to the written HERE document of the table's body any anomalous
5054 # entries in the table..
5055
5056 my $self = shift;
5057 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5058
ffe43484 5059 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5060
5061 return "" unless @{$anomalous_entries{$addr}};
5062 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
5063 }
5064
5065 sub map_add_or_replace_non_nulls {
5066 # This adds the mappings in the table $other to $self. Non-null
5067 # mappings from $other override those in $self. It essentially merges
5068 # the two tables, with the second having priority except for null
5069 # mappings.
5070
5071 my $self = shift;
5072 my $other = shift;
5073 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5074
5075 return if $self->carp_if_locked;
5076
5077 if (! $other->isa(__PACKAGE__)) {
5078 Carp::my_carp_bug("$other should be a "
5079 . __PACKAGE__
5080 . ". Not a '"
5081 . ref($other)
5082 . "'. Not added;");
5083 return;
5084 }
5085
ffe43484
NC
5086 my $addr = do { no overloading; pack 'J', $self; };
5087 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
5088
5089 local $to_trace = 0 if main::DEBUG;
5090
5091 my $self_range_list = $self->_range_list;
5092 my $other_range_list = $other->_range_list;
5093 foreach my $range ($other_range_list->ranges) {
5094 my $value = $range->value;
5095 next if $value eq "";
5096 $self_range_list->_add_delete('+',
5097 $range->start,
5098 $range->end,
5099 $value,
5100 Type => $range->type,
5101 Replace => $UNCONDITIONALLY);
5102 }
5103
5104 # Copy the specials information from the other table to $self
5105 if ($has_specials{$other_addr}) {
5106 $has_specials{$addr} = 1;
5107 }
5108
5109 return;
5110 }
5111
5112 sub set_default_map {
5113 # Define what code points that are missing from the input files should
5114 # map to
5115
5116 my $self = shift;
5117 my $map = shift;
5118 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5119
ffe43484 5120 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5121
5122 # Convert the input to the standard equivalent, if any (won't have any
5123 # for $STRING properties)
5124 my $standard = $self->_find_table_from_alias->{$map};
5125 $map = $standard->name if defined $standard;
5126
5127 # Warn if there already is a non-equivalent default map for this
5128 # property. Note that a default map can be a ref, which means that
5129 # what it actually means is delayed until later in the program, and it
5130 # IS permissible to override it here without a message.
5131 my $default_map = $default_map{$addr};
5132 if (defined $default_map
5133 && ! ref($default_map)
5134 && $default_map ne $map
5135 && main::Standardize($map) ne $default_map)
5136 {
5137 my $property = $self->property;
5138 my $map_table = $property->table($map);
5139 my $default_table = $property->table($default_map);
5140 if (defined $map_table
5141 && defined $default_table
5142 && $map_table != $default_table)
5143 {
5144 Carp::my_carp("Changing the default mapping for "
5145 . $property
5146 . " from $default_map to $map'");
5147 }
5148 }
5149
5150 $default_map{$addr} = $map;
5151
5152 # Don't also create any missing table for this map at this point,
5153 # because if we did, it could get done before the main table add is
5154 # done for PropValueAliases.txt; instead the caller will have to make
5155 # sure it exists, if desired.
5156 return;
5157 }
5158
5159 sub to_output_map {
5160 # Returns boolean: should we write this map table?
5161
5162 my $self = shift;
5163 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5164
ffe43484 5165 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5166
5167 # If overridden, use that
5168 return $to_output_map{$addr} if defined $to_output_map{$addr};
5169
5170 my $full_name = $self->full_name;
5171
5172 # If table says to output, do so; if says to suppress it, do do.
5173 return 1 if grep { $_ eq $full_name } @output_mapped_properties;
5174 return 0 if $self->status eq $SUPPRESSED;
5175
5176 my $type = $self->property->type;
5177
5178 # Don't want to output binary map tables even for debugging.
5179 return 0 if $type == $BINARY;
5180
5181 # But do want to output string ones.
5182 return 1 if $type == $STRING;
5183
5184 # Otherwise is an $ENUM, don't output it
5185 return 0;
5186 }
5187
5188 sub inverse_list {
5189 # Returns a Range_List that is gaps of the current table. That is,
5190 # the inversion
5191
5192 my $self = shift;
5193 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5194
5195 my $current = Range_List->new(Initialize => $self->_range_list,
5196 Owner => $self->property);
5197 return ~ $current;
5198 }
5199
5200 sub set_final_comment {
5201 # Just before output, create the comment that heads the file
5202 # containing this table.
5203
5204 my $self = shift;
5205 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5206
5207 # No sense generating a comment if aren't going to write it out.
5208 return if ! $self->to_output_map;
5209
ffe43484 5210 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5211
5212 my $property = $self->property;
5213
5214 # Get all the possible names for this property. Don't use any that
5215 # aren't ok for use in a file name, etc. This is perhaps causing that
5216 # flag to do double duty, and may have to be changed in the future to
5217 # have our own flag for just this purpose; but it works now to exclude
5218 # Perl generated synonyms from the lists for properties, where the
5219 # name is always the proper Unicode one.
5220 my @property_aliases = grep { $_->externally_ok } $self->aliases;
5221
5222 my $count = $self->count;
5223 my $default_map = $default_map{$addr};
5224
5225 # The ranges that map to the default aren't output, so subtract that
5226 # to get those actually output. A property with matching tables
5227 # already has the information calculated.
5228 if ($property->type != $STRING) {
5229 $count -= $property->table($default_map)->count;
5230 }
5231 elsif (defined $default_map) {
5232
5233 # But for $STRING properties, must calculate now. Subtract the
5234 # count from each range that maps to the default.
5235 foreach my $range ($self->_range_list->ranges) {
99870f4d
KW
5236 if ($range->value eq $default_map) {
5237 $count -= $range->end +1 - $range->start;
5238 }
5239 }
5240
5241 }
5242
5243 # Get a string version of $count with underscores in large numbers,
5244 # for clarity.
5245 my $string_count = main::clarify_number($count);
5246
5247 my $code_points = ($count == 1)
5248 ? 'single code point'
5249 : "$string_count code points";
5250
5251 my $mapping;
5252 my $these_mappings;
5253 my $are;
5254 if (@property_aliases <= 1) {
5255 $mapping = 'mapping';
5256 $these_mappings = 'this mapping';
5257 $are = 'is'
5258 }
5259 else {
5260 $mapping = 'synonymous mappings';
5261 $these_mappings = 'these mappings';
5262 $are = 'are'
5263 }
5264 my $cp;
5265 if ($count >= $MAX_UNICODE_CODEPOINTS) {
5266 $cp = "any code point in Unicode Version $string_version";
5267 }
5268 else {
5269 my $map_to;
5270 if ($default_map eq "") {
5271 $map_to = 'the null string';
5272 }
5273 elsif ($default_map eq $CODE_POINT) {
5274 $map_to = "itself";
5275 }
5276 else {
5277 $map_to = "'$default_map'";
5278 }
5279 if ($count == 1) {
5280 $cp = "the single code point";
5281 }
5282 else {
5283 $cp = "one of the $code_points";
5284 }
5285 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
5286 }
5287
5288 my $comment = "";
5289
5290 my $status = $self->status;
5291 if ($status) {
5292 my $warn = uc $status_past_participles{$status};
5293 $comment .= <<END;
5294
5295!!!!!!! $warn !!!!!!!!!!!!!!!!!!!
5296 All property or property=value combinations contained in this file are $warn.
5297 See $unicode_reference_url for what this means.
5298
5299END
5300 }
5301 $comment .= "This file returns the $mapping:\n";
5302
5303 for my $i (0 .. @property_aliases - 1) {
5304 $comment .= sprintf("%-8s%s\n",
5305 " ",
5306 $property_aliases[$i]->name . '(cp)'
5307 );
5308 }
5309 $comment .=
5310 "\nwhere 'cp' is $cp. Note that $these_mappings $are ";
5311
5312 my $access = $core_access{$addr};
5313 if ($access) {
5314 $comment .= "accessible through the Perl core via $access.";
5315 }
5316 else {
5317 $comment .= "not accessible through the Perl core directly.";
5318 }
5319
5320 # And append any commentary already set from the actual property.
5321 $comment .= "\n\n" . $self->comment if $self->comment;
5322 if ($self->description) {
5323 $comment .= "\n\n" . join " ", $self->description;
5324 }
5325 if ($self->note) {
5326 $comment .= "\n\n" . join " ", $self->note;
5327 }
5328 $comment .= "\n";
5329
5330 if (! $self->perl_extension) {
5331 $comment .= <<END;
5332
5333For information about what this property really means, see:
5334$unicode_reference_url
5335END
5336 }
5337
5338 if ($count) { # Format differs for empty table
5339 $comment.= "\nThe format of the ";
5340 if ($self->range_size_1) {
5341 $comment.= <<END;
5342main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
5343is in hex; MAPPING is what CODE_POINT maps to.
5344END
5345 }
5346 else {
5347
5348 # There are tables which end up only having one element per
5349 # range, but it is not worth keeping track of for making just
5350 # this comment a little better.
5351 $comment.= <<END;
5352non-comment portions of the main body of lines of this file is:
5353START\\tSTOP\\tMAPPING where START is the starting code point of the
5354range, in hex; STOP is the ending point, or if omitted, the range has just one
5355code point; MAPPING is what each code point between START and STOP maps to.
5356END
0c07e538 5357 if ($self->output_range_counts) {
99870f4d
KW
5358 $comment .= <<END;
5359Numbers in comments in [brackets] indicate how many code points are in the
5360range (omitted when the range is a single code point or if the mapping is to
5361the null string).
5362END
5363 }
5364 }
5365 }
5366 $self->set_comment(main::join_lines($comment));
5367 return;
5368 }
5369
5370 my %swash_keys; # Makes sure don't duplicate swash names.
5371
5372 sub pre_body {
5373 # Returns the string that should be output in the file before the main
5374 # body of this table. This includes some hash entries identifying the
5375 # format of the body, and what the single value should be for all
5376 # ranges missing from it. It also includes any code points which have
5377 # map_types that don't go in the main table.
5378
5379 my $self = shift;
5380 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5381
ffe43484 5382 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5383
5384 my $name = $self->property->swash_name;
5385
5386 if (defined $swash_keys{$name}) {
5387 Carp::my_carp(join_lines(<<END
5388Already created a swash name '$name' for $swash_keys{$name}. This means that
5389the same name desired for $self shouldn't be used. Bad News. This must be
5390fixed before production use, but proceeding anyway
5391END
5392 ));
5393 }
5394 $swash_keys{$name} = "$self";
5395
5396 my $default_map = $default_map{$addr};
5397
5398 my $pre_body = "";
5399 if ($has_specials{$addr}) {
5400
5401 # Here, some maps with non-zero type have been added to the table.
5402 # Go through the table and handle each of them. None will appear
5403 # in the body of the table, so delete each one as we go. The
5404 # code point count has already been calculated, so ok to delete
5405 # now.
5406
5407 my @multi_code_point_maps;
5408 my $has_hangul_syllables = 0;
5409
5410 # The key is the base name of the code point, and the value is an
5411 # array giving all the ranges that use this base name. Each range
5412 # is actually a hash giving the 'low' and 'high' values of it.
5413 my %names_ending_in_code_point;
5414
5415 # Inverse mapping. The list of ranges that have these kinds of
5416 # names. Each element contains the low, high, and base names in a
5417 # hash.
5418 my @code_points_ending_in_code_point;
5419
5420 my $range_map = $self->_range_list;
5421 foreach my $range ($range_map->ranges) {
5422 next unless $range->type != 0;
5423 my $low = $range->start;
5424 my $high = $range->end;
5425 my $map = $range->value;
5426 my $type = $range->type;
5427
5428 # No need to output the range if it maps to the default. And
5429 # the write method won't output it either, so no need to
5430 # delete it to keep it from being output, and is faster to
5431 # skip than to delete anyway.
5432 next if $map eq $default_map;
5433
5434 # Delete the range to keep write() from trying to output it
5435 $range_map->delete_range($low, $high);
5436
5437 # Switch based on the map type...
5438 if ($type == $HANGUL_SYLLABLE) {
5439
5440 # These are entirely algorithmically determinable based on
5441 # some constants furnished by Unicode; for now, just set a
5442 # flag to indicate that have them. Below we will output
5443 # the code that does the algorithm.
5444 $has_hangul_syllables = 1;
5445 }
5446 elsif ($type == $CP_IN_NAME) {
5447
5448 # If the name ends in the code point it represents, are
5449 # also algorithmically determinable, but need information
5450 # about the map to do so. Both the map and its inverse
5451 # are stored in data structures output in the file.
5452 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
5453 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
5454
5455 push @code_points_ending_in_code_point, { low => $low,
5456 high => $high,
5457 name => $map
5458 };
5459 }
5460 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
5461
5462 # Multi-code point maps and null string maps have an entry
5463 # for each code point in the range. They use the same
5464 # output format.
5465 for my $code_point ($low .. $high) {
5466
5467 # The pack() below can't cope with surrogates.
5468 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
5469 Carp::my_carp("Surrogage code point '$code_point' in mapping to '$map' in $self. No map created");
5470 next;
5471 }
5472
5473 # Generate the hash entries for these in the form that
5474 # utf8.c understands.
5475 my $tostr = "";
5476 foreach my $to (split " ", $map) {
5477 if ($to !~ /^$code_point_re$/) {
5478 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
5479 next;
5480 }
5481 $tostr .= sprintf "\\x{%s}", $to;
5482 }
5483
5484 # I (khw) have never waded through this line to
5485 # understand it well enough to comment it.
5486 my $utf8 = sprintf(qq["%s" => "$tostr",],
5487 join("", map { sprintf "\\x%02X", $_ }
5488 unpack("U0C*", pack("U", $code_point))));
5489
5490 # Add a comment so that a human reader can more easily
5491 # see what's going on.
5492 push @multi_code_point_maps,
5493 sprintf("%-45s # U+%04X => %s", $utf8,
5494 $code_point,
5495 $map);
5496 }
5497 }
5498 else {
5499 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Using type 0 instead");
5500 $range_map->add_map($low, $high, $map, Replace => $UNCONDITIONALLY, Type => 0);
5501 }
5502 } # End of loop through all ranges
5503
5504 # Here have gone through the whole file. If actually generated
5505 # anything for each map type, add its respective header and
5506 # trailer
5507 if (@multi_code_point_maps) {
5508 $pre_body .= <<END;
5509
5510# Some code points require special handling because their mappings are each to
5511# multiple code points. These do not appear in the main body, but are defined
5512# in the hash below.
5513
76591e2b
KW
5514# Each key is the string of N bytes that together make up the UTF-8 encoding
5515# for the code point. (i.e. the same as looking at the code point's UTF-8
5516# under "use bytes"). Each value is the UTF-8 of the translation, for speed.
99870f4d
KW
5517%utf8::ToSpec$name = (
5518END
5519 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
5520 }
5521
5522 if ($has_hangul_syllables || @code_points_ending_in_code_point) {
5523
5524 # Convert these structures to output format.
5525 my $code_points_ending_in_code_point =
5526 main::simple_dumper(\@code_points_ending_in_code_point,
5527 ' ' x 8);
5528 my $names = main::simple_dumper(\%names_ending_in_code_point,
5529 ' ' x 8);
5530
5531 # Do the same with the Hangul names,
5532 my $jamo;
5533 my $jamo_l;
5534 my $jamo_v;
5535 my $jamo_t;
5536 my $jamo_re;
5537 if ($has_hangul_syllables) {
5538
5539 # Construct a regular expression of all the possible
5540 # combinations of the Hangul syllables.
5541 my @L_re; # Leading consonants
5542 for my $i ($LBase .. $LBase + $LCount - 1) {
5543 push @L_re, $Jamo{$i}
5544 }
5545 my @V_re; # Middle vowels
5546 for my $i ($VBase .. $VBase + $VCount - 1) {
5547 push @V_re, $Jamo{$i}
5548 }
5549 my @T_re; # Trailing consonants
5550 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
5551 push @T_re, $Jamo{$i}
5552 }
5553
5554 # The whole re is made up of the L V T combination.
5555 $jamo_re = '('
5556 . join ('|', sort @L_re)
5557 . ')('
5558 . join ('|', sort @V_re)
5559 . ')('
5560 . join ('|', sort @T_re)
5561 . ')?';
5562
5563 # These hashes needed by the algorithm were generated
5564 # during reading of the Jamo.txt file
5565 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
5566 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
5567 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
5568 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
5569 }
5570
5571 $pre_body .= <<END;
5572
5573# To achieve significant memory savings when this file is read in,
5574# algorithmically derivable code points are omitted from the main body below.
5575# Instead, the following routines can be used to translate between name and
5576# code point and vice versa
5577
5578{ # Closure
5579
5580 # Matches legal code point. 4-6 hex numbers, If there are 6, the
5581 # first two must be '10'; if there are 5, the first must not be a '0'.
5582 my \$code_point_re = qr/$code_point_re/;
5583
5584 # In the following hash, the keys are the bases of names which includes
5585 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The values
5586 # of each key is another hash which is used to get the low and high ends
5587 # for each range of code points that apply to the name
5588 my %names_ending_in_code_point = (
5589$names
5590 );
5591
5592 # And the following array gives the inverse mapping from code points to
5593 # names. Lowest code points are first
5594 my \@code_points_ending_in_code_point = (
5595$code_points_ending_in_code_point
5596 );
5597END
5598 # Earlier releases didn't have Jamos. No sense outputting
5599 # them unless will be used.
5600 if ($has_hangul_syllables) {
5601 $pre_body .= <<END;
5602
5603 # Convert from code point to Jamo short name for use in composing Hangul
5604 # syllable names
5605 my %Jamo = (
5606$jamo
5607 );
5608
5609 # Leading consonant (can be null)
5610 my %Jamo_L = (
5611$jamo_l
5612 );
5613
5614 # Vowel
5615 my %Jamo_V = (
5616$jamo_v
5617 );
5618
5619 # Optional trailing consonant
5620 my %Jamo_T = (
5621$jamo_t
5622 );
5623
5624 # Computed re that splits up a Hangul name into LVT or LV syllables
5625 my \$syllable_re = qr/$jamo_re/;
5626
5627 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
5628 my \$HANGUL_SYLLABLE_LENGTH = length \$HANGUL_SYLLABLE;
5629
5630 # These constants names and values were taken from the Unicode standard,
5631 # version 5.1, section 3.12. They are used in conjunction with Hangul
5632 # syllables
6e5a209b
KW
5633 my \$SBase = $SBase_string;
5634 my \$LBase = $LBase_string;
5635 my \$VBase = $VBase_string;
5636 my \$TBase = $TBase_string;
5637 my \$SCount = $SCount;
5638 my \$LCount = $LCount;
5639 my \$VCount = $VCount;
5640 my \$TCount = $TCount;
99870f4d
KW
5641 my \$NCount = \$VCount * \$TCount;
5642END
5643 } # End of has Jamos
5644
5645 $pre_body .= << 'END';
5646
5647 sub name_to_code_point_special {
5648 my $name = shift;
5649
5650 # Returns undef if not one of the specially handled names; otherwise
5651 # returns the code point equivalent to the input name
5652END
5653 if ($has_hangul_syllables) {
5654 $pre_body .= << 'END';
5655
5656 if (substr($name, 0, $HANGUL_SYLLABLE_LENGTH) eq $HANGUL_SYLLABLE) {
5657 $name = substr($name, $HANGUL_SYLLABLE_LENGTH);
5658 return if $name !~ qr/^$syllable_re$/;
5659 my $L = $Jamo_L{$1};
5660 my $V = $Jamo_V{$2};
5661 my $T = (defined $3) ? $Jamo_T{$3} : 0;
5662 return ($L * $VCount + $V) * $TCount + $T + $SBase;
5663 }
5664END
5665 }
5666 $pre_body .= << 'END';
5667
5668 # Name must end in '-code_point' for this to handle.
5669 if ($name !~ /^ (.*) - ($code_point_re) $/x) {
5670 return;
5671 }
5672
5673 my $base = $1;
5674 my $code_point = CORE::hex $2;
5675
5676 # Name must be one of the ones which has the code point in it.
5677 return if ! $names_ending_in_code_point{$base};
5678
5679 # Look through the list of ranges that apply to this name to see if
5680 # the code point is in one of them.
5681 for (my $i = 0; $i < scalar @{$names_ending_in_code_point{$base}{'low'}}; $i++) {
5682 return if $names_ending_in_code_point{$base}{'low'}->[$i] > $code_point;
5683 next if $names_ending_in_code_point{$base}{'high'}->[$i] < $code_point;
5684
5685 # Here, the code point is in the range.
5686 return $code_point;
5687 }
5688
5689 # Here, looked like the name had a code point number in it, but
5690 # did not match one of the valid ones.
5691 return;
5692 }
5693
5694 sub code_point_to_name_special {
5695 my $code_point = shift;
5696
5697 # Returns the name of a code point if algorithmically determinable;
5698 # undef if not
5699END
5700 if ($has_hangul_syllables) {
5701 $pre_body .= << 'END';
5702
5703 # If in the Hangul range, calculate the name based on Unicode's
5704 # algorithm
5705 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
5706 use integer;
5707 my $SIndex = $code_point - $SBase;
5708 my $L = $LBase + $SIndex / $NCount;
5709 my $V = $VBase + ($SIndex % $NCount) / $TCount;
5710 my $T = $TBase + $SIndex % $TCount;
03e1aa51 5711 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
99870f4d
KW
5712 $name .= $Jamo{$T} if $T != $TBase;
5713 return $name;
5714 }
5715END
5716 }
5717 $pre_body .= << 'END';
5718
5719 # Look through list of these code points for one in range.
5720 foreach my $hash (@code_points_ending_in_code_point) {
5721 return if $code_point < $hash->{'low'};
5722 if ($code_point <= $hash->{'high'}) {
5723 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
5724 }
5725 }
5726 return; # None found
5727 }
5728} # End closure
5729
5730END
5731 } # End of has hangul or code point in name maps.
5732 } # End of has specials
5733
5734 # Calculate the format of the table if not already done.
f5817e0a 5735 my $format = $self->format;
99870f4d
KW
5736 my $property = $self->property;
5737 my $type = $property->type;
5738 if (! defined $format) {
5739 if ($type == $BINARY) {
5740
5741 # Don't bother checking the values, because we elsewhere
5742 # verify that a binary table has only 2 values.
5743 $format = $BINARY_FORMAT;
5744 }
5745 else {
5746 my @ranges = $self->_range_list->ranges;
5747
5748 # default an empty table based on its type and default map
5749 if (! @ranges) {
5750
5751 # But it turns out that the only one we can say is a
5752 # non-string (besides binary, handled above) is when the
5753 # table is a string and the default map is to a code point
5754 if ($type == $STRING && $default_map eq $CODE_POINT) {
5755 $format = $HEX_FORMAT;
5756 }
5757 else {
5758 $format = $STRING_FORMAT;
5759 }
5760 }
5761 else {
5762
5763 # Start with the most restrictive format, and as we find
5764 # something that doesn't fit with that, change to the next
5765 # most restrictive, and so on.
5766 $format = $DECIMAL_FORMAT;
5767 foreach my $range (@ranges) {
5768 my $map = $range->value;
5769 if ($map ne $default_map) {
5770 last if $format eq $STRING_FORMAT; # already at
5771 # least
5772 # restrictive
5773 $format = $INTEGER_FORMAT
5774 if $format eq $DECIMAL_FORMAT
5775 && $map !~ / ^ [0-9] $ /x;
5776 $format = $FLOAT_FORMAT
5777 if $format eq $INTEGER_FORMAT
5778 && $map !~ / ^ -? [0-9]+ $ /x;
5779 $format = $RATIONAL_FORMAT
5780 if $format eq $FLOAT_FORMAT
5781 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
5782 $format = $HEX_FORMAT
5783 if $format eq $RATIONAL_FORMAT
5784 && $map !~ / ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x;
5785 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
5786 && $map =~ /[^0-9A-F]/;
5787 }
5788 }
5789 }
5790 }
5791 } # end of calculating format
5792
5793 my $return = <<END;
5794# The name this swash is to be known by, with the format of the mappings in
5795# the main body of the table, and what all code points missing from this file
5796# map to.
5797\$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
5798END
5799 my $missing = $default_map;
5800 if ($missing eq $CODE_POINT
5801 && $format ne $HEX_FORMAT
f5817e0a 5802 && ! defined $self->format) # Is expected if was manually set
99870f4d
KW
5803 {
5804 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
5805 }
f5817e0a 5806 $self->_set_format($format);
99870f4d
KW
5807 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '$missing';";
5808 if ($missing eq $CODE_POINT) {
5809 $return .= ' # code point maps to itself';
5810 }
5811 elsif ($missing eq "") {
5812 $return .= ' # code point maps to the null string';
5813 }
5814 $return .= "\n";
5815
5816 $return .= $pre_body;
5817
5818 return $return;
5819 }
5820
5821 sub write {
5822 # Write the table to the file.
5823
5824 my $self = shift;
5825 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5826
ffe43484 5827 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5828
5829 return $self->SUPER::write(
5830 ($self->property == $block)
5831 ? 7 # block file needs more tab stops
5832 : 3,
5833 $default_map{$addr}); # don't write defaulteds
5834 }
5835
5836 # Accessors for the underlying list that should fail if locked.
ea25a9b2 5837 for my $sub (qw(
99870f4d 5838 add_duplicate
ea25a9b2 5839 ))
99870f4d
KW
5840 {
5841 no strict "refs";
5842 *$sub = sub {
5843 use strict "refs";
5844 my $self = shift;
5845
5846 return if $self->carp_if_locked;
5847 return $self->_range_list->$sub(@_);
5848 }
5849 }
5850} # End closure for Map_Table
5851
5852package Match_Table;
5853use base '_Base_Table';
5854
5855# A Match table is one which is a list of all the code points that have
5856# the same property and property value, for use in \p{property=value}
5857# constructs in regular expressions. It adds very little data to the base
5858# structure, but many methods, as these lists can be combined in many ways to
5859# form new ones.
5860# There are only a few concepts added:
5861# 1) Equivalents and Relatedness.
5862# Two tables can match the identical code points, but have different names.
5863# This always happens when there is a perl single form extension
5864# \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
5865# tables are set to be related, with the Perl extension being a child, and
5866# the Unicode property being the parent.
5867#
5868# It may be that two tables match the identical code points and we don't
5869# know if they are related or not. This happens most frequently when the
5870# Block and Script properties have the exact range. But note that a
5871# revision to Unicode could add new code points to the script, which would
5872# now have to be in a different block (as the block was filled, or there
5873# would have been 'Unknown' script code points in it and they wouldn't have
5874# been identical). So we can't rely on any two properties from Unicode
5875# always matching the same code points from release to release, and thus
5876# these tables are considered coincidentally equivalent--not related. When
5877# two tables are unrelated but equivalent, one is arbitrarily chosen as the
5878# 'leader', and the others are 'equivalents'. This concept is useful
5879# to minimize the number of tables written out. Only one file is used for
5880# any identical set of code points, with entries in Heavy.pl mapping all
5881# the involved tables to it.
5882#
5883# Related tables will always be identical; we set them up to be so. Thus
5884# if the Unicode one is deprecated, the Perl one will be too. Not so for
5885# unrelated tables. Relatedness makes generating the documentation easier.
5886#
5887# 2) Conflicting. It may be that there will eventually be name clashes, with
5888# the same name meaning different things. For a while, there actually were
5889# conflicts, but they have so far been resolved by changing Perl's or
5890# Unicode's definitions to match the other, but when this code was written,
5891# it wasn't clear that that was what was going to happen. (Unicode changed
5892# because of protests during their beta period.) Name clashes are warned
5893# about during compilation, and the documentation. The generated tables
5894# are sane, free of name clashes, because the code suppresses the Perl
5895# version. But manual intervention to decide what the actual behavior
5896# should be may be required should this happen. The introductory comments
5897# have more to say about this.
5898
5899sub standardize { return main::standardize($_[0]); }
5900sub trace { return main::trace(@_); }
5901
5902
5903{ # Closure
5904
5905 main::setup_package();
5906
5907 my %leader;
5908 # The leader table of this one; initially $self.
5909 main::set_access('leader', \%leader, 'r');
5910
5911 my %equivalents;
5912 # An array of any tables that have this one as their leader
5913 main::set_access('equivalents', \%equivalents, 'readable_array');
5914
5915 my %parent;
5916 # The parent table to this one, initially $self. This allows us to
5917 # distinguish between equivalent tables that are related, and those which
5918 # may not be, but share the same output file because they match the exact
5919 # same set of code points in the current Unicode release.
5920 main::set_access('parent', \%parent, 'r');
5921
5922 my %children;
5923 # An array of any tables that have this one as their parent
5924 main::set_access('children', \%children, 'readable_array');
5925
5926 my %conflicting;
5927 # Array of any tables that would have the same name as this one with
5928 # a different meaning. This is used for the generated documentation.
5929 main::set_access('conflicting', \%conflicting, 'readable_array');
5930
5931 my %matches_all;
5932 # Set in the constructor for tables that are expected to match all code
5933 # points.
5934 main::set_access('matches_all', \%matches_all, 'r');
5935
5936 sub new {
5937 my $class = shift;
5938
5939 my %args = @_;
5940
5941 # The property for which this table is a listing of property values.
5942 my $property = delete $args{'_Property'};
5943
23e33b60
KW
5944 my $name = delete $args{'Name'};
5945 my $full_name = delete $args{'Full_Name'};
5946 $full_name = $name if ! defined $full_name;
5947
99870f4d
KW
5948 # Optional
5949 my $initialize = delete $args{'Initialize'};
5950 my $matches_all = delete $args{'Matches_All'} || 0;
f5817e0a 5951 my $format = delete $args{'Format'};
99870f4d
KW
5952 # Rest of parameters passed on.
5953
5954 my $range_list = Range_List->new(Initialize => $initialize,
5955 Owner => $property);
5956
23e33b60
KW
5957 my $complete = $full_name;
5958 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
5959 # but this helps debug if it
5960 # does
5961 # The complete name for a match table includes it's property in a
5962 # compound form 'property=table', except if the property is the
5963 # pseudo-property, perl, in which case it is just the single form,
5964 # 'table' (If you change the '=' must also change the ':' in lots of
5965 # places in this program that assume an equal sign)
5966 $complete = $property->full_name . "=$complete" if $property != $perl;
678f13d5 5967
99870f4d 5968 my $self = $class->SUPER::new(%args,
23e33b60
KW
5969 Name => $name,
5970 Complete_Name => $complete,
5971 Full_Name => $full_name,
99870f4d
KW
5972 _Property => $property,
5973 _Range_List => $range_list,
f5817e0a 5974 Format => $EMPTY_FORMAT,
99870f4d 5975 );
ffe43484 5976 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
5977
5978 $conflicting{$addr} = [ ];
5979 $equivalents{$addr} = [ ];
5980 $children{$addr} = [ ];
5981 $matches_all{$addr} = $matches_all;
5982 $leader{$addr} = $self;
5983 $parent{$addr} = $self;
5984
f5817e0a
KW
5985 if (defined $format && $format ne $EMPTY_FORMAT) {
5986 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
5987 }
5988
99870f4d
KW
5989 return $self;
5990 }
5991
5992 # See this program's beginning comment block about overloading these.
5993 use overload
5994 fallback => 0,
5995 qw("") => "_operator_stringify",
5996 '=' => sub {
5997 my $self = shift;
5998
5999 return if $self->carp_if_locked;
6000 return $self;
6001 },
6002
6003 '+' => sub {
6004 my $self = shift;
6005 my $other = shift;
6006
6007 return $self->_range_list + $other;
6008 },
6009 '&' => sub {
6010 my $self = shift;
6011 my $other = shift;
6012
6013 return $self->_range_list & $other;
6014 },
6015 '+=' => sub {
6016 my $self = shift;
6017 my $other = shift;
6018
6019 return if $self->carp_if_locked;
6020
ffe43484 6021 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6022
6023 if (ref $other) {
6024
6025 # Change the range list of this table to be the
6026 # union of the two.
6027 $self->_set_range_list($self->_range_list
6028 + $other);
6029 }
6030 else { # $other is just a simple value
6031 $self->add_range($other, $other);
6032 }
6033 return $self;
6034 },
6035 '-' => sub { my $self = shift;
6036 my $other = shift;
6037 my $reversed = shift;
6038
6039 if ($reversed) {
6040 Carp::my_carp_bug("Can't cope with a "
6041 . __PACKAGE__
6042 . " being the first parameter in a '-'. Subtraction ignored.");
6043 return;
6044 }
6045
6046 return $self->_range_list - $other;
6047 },
6048 '~' => sub { my $self = shift;
6049 return ~ $self->_range_list;
6050 },
6051 ;
6052
6053 sub _operator_stringify {
6054 my $self = shift;
6055
23e33b60 6056 my $name = $self->complete_name;
99870f4d
KW
6057 return "Table '$name'";
6058 }
6059
6060 sub add_alias {
6061 # Add a synonym for this table. See the comments in the base class
6062
6063 my $self = shift;
6064 my $name = shift;
6065 # Rest of parameters passed on.
6066
6067 $self->SUPER::add_alias($name, $self, @_);
6068 return;
6069 }
6070
6071 sub add_conflicting {
6072 # Add the name of some other object to the list of ones that name
6073 # clash with this match table.
6074
6075 my $self = shift;
6076 my $conflicting_name = shift; # The name of the conflicting object
6077 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
6078 my $conflicting_object = shift; # Optional, the conflicting object
6079 # itself. This is used to
6080 # disambiguate the text if the input
6081 # name is identical to any of the
6082 # aliases $self is known by.
6083 # Sometimes the conflicting object is
6084 # merely hypothetical, so this has to
6085 # be an optional parameter.
6086 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6087
ffe43484 6088 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6089
6090 # Check if the conflicting name is exactly the same as any existing
6091 # alias in this table (as long as there is a real object there to
6092 # disambiguate with).
6093 if (defined $conflicting_object) {
6094 foreach my $alias ($self->aliases) {
6095 if ($alias->name eq $conflicting_name) {
6096
6097 # Here, there is an exact match. This results in
6098 # ambiguous comments, so disambiguate by changing the
6099 # conflicting name to its object's complete equivalent.
6100 $conflicting_name = $conflicting_object->complete_name;
6101 last;
6102 }
6103 }
6104 }
6105
6106 # Convert to the \p{...} final name
6107 $conflicting_name = "\\$p" . "{$conflicting_name}";
6108
6109 # Only add once
6110 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
6111
6112 push @{$conflicting{$addr}}, $conflicting_name;
6113
6114 return;
6115 }
6116
6117 sub is_equivalent_to {
6118 # Return boolean of whether or not the other object is a table of this
6119 # type and has been marked equivalent to this one.
6120
6121 my $self = shift;
6122 my $other = shift;
6123 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6124
6125 return 0 if ! defined $other; # Can happen for incomplete early
6126 # releases
6127 unless ($other->isa(__PACKAGE__)) {
6128 my $ref_other = ref $other;
6129 my $ref_self = ref $self;
6130 Carp::my_carp_bug("Argument to 'is_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6131 return 0;
6132 }
6133
6134 # Two tables are equivalent if they have the same leader.
f998e60c 6135 no overloading;
051df77b 6136 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
99870f4d
KW
6137 return;
6138 }
6139
6140 sub matches_identically_to {
6141 # Return a boolean as to whether or not two tables match identical
6142 # sets of code points.
6143
6144 my $self = shift;
6145 my $other = shift;
6146 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6147
6148 unless ($other->isa(__PACKAGE__)) {
6149 my $ref_other = ref $other;
6150 my $ref_self = ref $self;
6151 Carp::my_carp_bug("Argument to 'matches_identically_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
6152 return 0;
6153 }
6154
6155 # These are ordered in increasing real time to figure out (at least
6156 # until a patch changes that and doesn't change this)
6157 return 0 if $self->max != $other->max;
6158 return 0 if $self->min != $other->min;
6159 return 0 if $self->range_count != $other->range_count;
6160 return 0 if $self->count != $other->count;
6161
6162 # Here they could be identical because all the tests above passed.
6163 # The loop below is somewhat simpler since we know they have the same
6164 # number of elements. Compare range by range, until reach the end or
6165 # find something that differs.
6166 my @a_ranges = $self->_range_list->ranges;
6167 my @b_ranges = $other->_range_list->ranges;
6168 for my $i (0 .. @a_ranges - 1) {
6169 my $a = $a_ranges[$i];
6170 my $b = $b_ranges[$i];
6171 trace "self $a; other $b" if main::DEBUG && $to_trace;
6172 return 0 if $a->start != $b->start || $a->end != $b->end;
6173 }
6174 return 1;
6175 }
6176
6177 sub set_equivalent_to {
6178 # Set $self equivalent to the parameter table.
6179 # The required Related => 'x' parameter is a boolean indicating
6180 # whether these tables are related or not. If related, $other becomes
6181 # the 'parent' of $self; if unrelated it becomes the 'leader'
6182 #
6183 # Related tables share all characteristics except names; equivalents
6184 # not quite so many.
6185 # If they are related, one must be a perl extension. This is because
6186 # we can't guarantee that Unicode won't change one or the other in a
6187 # later release even if they are idential now.
6188
6189 my $self = shift;
6190 my $other = shift;
6191
6192 my %args = @_;
6193 my $related = delete $args{'Related'};
6194
6195 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
6196
6197 return if ! defined $other; # Keep on going; happens in some early
6198 # Unicode releases.
6199
6200 if (! defined $related) {
6201 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
6202 $related = 0;
6203 }
6204
6205 # If already are equivalent, no need to re-do it; if subroutine
6206 # returns null, it found an error, also do nothing
6207 my $are_equivalent = $self->is_equivalent_to($other);
6208 return if ! defined $are_equivalent || $are_equivalent;
6209
ffe43484 6210 my $addr = do { no overloading; pack 'J', $self; };
f998e60c 6211 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
99870f4d
KW
6212
6213 if ($related &&
6214 ! $other->perl_extension
6215 && ! $current_leader->perl_extension)
6216 {
6217 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
6218 $related = 0;
6219 }
6220
ffe43484
NC
6221 my $leader = do { no overloading; pack 'J', $current_leader; };
6222 my $other_addr = do { no overloading; pack 'J', $other; };
99870f4d
KW
6223
6224 # Any tables that are equivalent to or children of this table must now
6225 # instead be equivalent to or (children) to the new leader (parent),
6226 # still equivalent. The equivalency includes their matches_all info,
6227 # and for related tables, their status
6228 # All related tables are of necessity equivalent, but the converse
6229 # isn't necessarily true
6230 my $status = $other->status;
6231 my $status_info = $other->status_info;
6232 my $matches_all = $matches_all{other_addr};
6233 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
6234 next if $table == $other;
6235 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
6236
ffe43484 6237 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6238 $leader{$table_addr} = $other;
6239 $matches_all{$table_addr} = $matches_all;
6240 $self->_set_range_list($other->_range_list);
6241 push @{$equivalents{$other_addr}}, $table;
6242 if ($related) {
6243 $parent{$table_addr} = $other;
6244 push @{$children{$other_addr}}, $table;
6245 $table->set_status($status, $status_info);
6246 }
6247 }
6248
6249 # Now that we've declared these to be equivalent, any changes to one
6250 # of the tables would invalidate that equivalency.
6251 $self->lock;
6252 $other->lock;
6253 return;
6254 }
6255
6256 sub add_range { # Add a range to the list for this table.
6257 my $self = shift;
6258 # Rest of parameters passed on
6259
6260 return if $self->carp_if_locked;
6261 return $self->_range_list->add_range(@_);
6262 }
6263
99870f4d
KW
6264 sub pre_body { # Does nothing for match tables.
6265 return
6266 }
6267
6268 sub append_to_body { # Does nothing for match tables.
6269 return
6270 }
6271
6272 sub write {
6273 my $self = shift;
6274 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6275
6276 return $self->SUPER::write(2); # 2 tab stops
6277 }
6278
6279 sub set_final_comment {
6280 # This creates a comment for the file that is to hold the match table
6281 # $self. It is somewhat convoluted to make the English read nicely,
6282 # but, heh, it's just a comment.
6283 # This should be called only with the leader match table of all the
6284 # ones that share the same file. It lists all such tables, ordered so
6285 # that related ones are together.
6286
6287 my $leader = shift; # Should only be called on the leader table of
6288 # an equivalent group
6289 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6290
ffe43484 6291 my $addr = do { no overloading; pack 'J', $leader; };
99870f4d
KW
6292
6293 if ($leader{$addr} != $leader) {
6294 Carp::my_carp_bug(<<END
6295set_final_comment() must be called on a leader table, which $leader is not.
6296It is equivalent to $leader{$addr}. No comment created
6297END
6298 );
6299 return;
6300 }
6301
6302 # Get the number of code points matched by each of the tables in this
6303 # file, and add underscores for clarity.
6304 my $count = $leader->count;
6305 my $string_count = main::clarify_number($count);
6306
6307 my $loose_count = 0; # how many aliases loosely matched
6308 my $compound_name = ""; # ? Are any names compound?, and if so, an
6309 # example
6310 my $properties_with_compound_names = 0; # count of these
6311
6312
6313 my %flags; # The status flags used in the file
6314 my $total_entries = 0; # number of entries written in the comment
6315 my $matches_comment = ""; # The portion of the comment about the
6316 # \p{}'s
6317 my @global_comments; # List of all the tables' comments that are
6318 # there before this routine was called.
6319
6320 # Get list of all the parent tables that are equivalent to this one
6321 # (including itself).
6322 my @parents = grep { $parent{main::objaddr $_} == $_ }
6323 main::uniques($leader, @{$equivalents{$addr}});
6324 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
6325 # tables
6326
6327 for my $parent (@parents) {
6328
6329 my $property = $parent->property;
6330
6331 # Special case 'N' tables in properties with two match tables when
6332 # the other is a 'Y' one. These are likely to be binary tables,
6333 # but not necessarily. In either case, \P{} will match the
6334 # complement of \p{}, and so if something is a synonym of \p, the
6335 # complement of that something will be the synonym of \P. This
6336 # would be true of any property with just two match tables, not
6337 # just those whose values are Y and N; but that would require a
6338 # little extra work, and there are none such so far in Unicode.
6339 my $perl_p = 'p'; # which is it? \p{} or \P{}
6340 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
6341
6342 if (scalar $property->tables == 2
6343 && $parent == $property->table('N')
6344 && defined (my $yes = $property->table('Y')))
6345 {
ffe43484 6346 my $yes_addr = do { no overloading; pack 'J', $yes; };
99870f4d
KW
6347 @yes_perl_synonyms
6348 = grep { $_->property == $perl }
6349 main::uniques($yes,
6350 $parent{$yes_addr},
6351 $parent{$yes_addr}->children);
6352
6353 # But these synonyms are \P{} ,not \p{}
6354 $perl_p = 'P';
6355 }
6356
6357 my @description; # Will hold the table description
6358 my @note; # Will hold the table notes.
6359 my @conflicting; # Will hold the table conflicts.
6360
6361 # Look at the parent, any yes synonyms, and all the children
ffe43484 6362 my $parent_addr = do { no overloading; pack 'J', $parent; };
99870f4d
KW
6363 for my $table ($parent,
6364 @yes_perl_synonyms,
f998e60c 6365 @{$children{$parent_addr}})
99870f4d 6366 {
ffe43484 6367 my $table_addr = do { no overloading; pack 'J', $table; };
99870f4d
KW
6368 my $table_property = $table->property;
6369
6370 # Tables are separated by a blank line to create a grouping.
6371 $matches_comment .= "\n" if $matches_comment;
6372
6373 # The table is named based on the property and value
6374 # combination it is for, like script=greek. But there may be
6375 # a number of synonyms for each side, like 'sc' for 'script',
6376 # and 'grek' for 'greek'. Any combination of these is a valid
6377 # name for this table. In this case, there are three more,
6378 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
6379 # listing all possible combinations in the comment, we make
6380 # sure that each synonym occurs at least once, and add
6381 # commentary that the other combinations are possible.
6382 my @property_aliases = $table_property->aliases;
6383 my @table_aliases = $table->aliases;
6384
6385 Carp::my_carp_bug("$table doesn't have any names. Proceeding anyway.") unless @table_aliases;
6386
6387 # The alias lists above are already ordered in the order we
6388 # want to output them. To ensure that each synonym is listed,
6389 # we must use the max of the two numbers.
6390 my $listed_combos = main::max(scalar @table_aliases,
6391 scalar @property_aliases);
6392 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
6393
6394 my $property_had_compound_name = 0;
6395
6396 for my $i (0 .. $listed_combos - 1) {
6397 $total_entries++;
6398
6399 # The current alias for the property is the next one on
6400 # the list, or if beyond the end, start over. Similarly
6401 # for the table (\p{prop=table})
6402 my $property_alias = $property_aliases
6403 [$i % @property_aliases]->name;
6404 my $table_alias_object = $table_aliases
6405 [$i % @table_aliases];
6406 my $table_alias = $table_alias_object->name;
6407 my $loose_match = $table_alias_object->loose_match;
6408
6409 if ($table_alias !~ /\D/) { # Clarify large numbers.
6410 $table_alias = main::clarify_number($table_alias)
6411 }
6412
6413 # Add a comment for this alias combination
6414 my $current_match_comment;
6415 if ($table_property == $perl) {
6416 $current_match_comment = "\\$perl_p"
6417 . "{$table_alias}";
6418 }
6419 else {
6420 $current_match_comment
6421 = "\\p{$property_alias=$table_alias}";
6422 $property_had_compound_name = 1;
6423 }
6424
6425 # Flag any abnormal status for this table.
6426 my $flag = $property->status
6427 || $table->status
6428 || $table_alias_object->status;
37e2e78e
KW
6429 if ($flag) {
6430 if ($flag ne $PLACEHOLDER) {
6431 $flags{$flag} = $status_past_participles{$flag};
6432 } else {
6433 $flags{$flag} = <<END;
6434a placeholder because it is not in Version $string_version of Unicode, but is
6435needed by the Perl core to work gracefully. Because it is not in this version
6436of Unicode, it will not be listed in $pod_file.pod
6437END
6438 }
6439 }
99870f4d
KW
6440
6441 $loose_count++;
6442
6443 # Pretty up the comment. Note the \b; it says don't make
6444 # this line a continuation.
6445 $matches_comment .= sprintf("\b%-1s%-s%s\n",
6446 $flag,
6447 " " x 7,
6448 $current_match_comment);
6449 } # End of generating the entries for this table.
6450
6451 # Save these for output after this group of related tables.
6452 push @description, $table->description;
6453 push @note, $table->note;
6454 push @conflicting, $table->conflicting;
6455
37e2e78e
KW
6456 # And this for output after all the tables.
6457 push @global_comments, $table->comment;
6458
99870f4d
KW
6459 # Compute an alternate compound name using the final property
6460 # synonym and the first table synonym with a colon instead of
6461 # the equal sign used elsewhere.
6462 if ($property_had_compound_name) {
6463 $properties_with_compound_names ++;
6464 if (! $compound_name || @property_aliases > 1) {
6465 $compound_name = $property_aliases[-1]->name
6466 . ': '
6467 . $table_aliases[0]->name;
6468 }
6469 }
6470 } # End of looping through all children of this table
6471
6472 # Here have assembled in $matches_comment all the related tables
6473 # to the current parent (preceded by the same info for all the
6474 # previous parents). Put out information that applies to all of
6475 # the current family.
6476 if (@conflicting) {
6477
6478 # But output the conflicting information now, as it applies to
6479 # just this table.
6480 my $conflicting = join ", ", @conflicting;
6481 if ($conflicting) {
6482 $matches_comment .= <<END;
6483
6484 Note that contrary to what you might expect, the above is NOT the same as
6485END
6486 $matches_comment .= "any of: " if @conflicting > 1;
6487 $matches_comment .= "$conflicting\n";
6488 }
6489 }
6490 if (@description) {
6491 $matches_comment .= "\n Meaning: "
6492 . join('; ', @description)
6493 . "\n";
6494 }
6495 if (@note) {
6496 $matches_comment .= "\n Note: "
6497 . join("\n ", @note)
6498 . "\n";
6499 }
6500 } # End of looping through all tables
6501
6502
6503 my $code_points;
6504 my $match;
6505 my $any_of_these;
6506 if ($count == 1) {
6507 $match = 'matches';
6508 $code_points = 'single code point';
6509 }
6510 else {
6511 $match = 'match';
6512 $code_points = "$string_count code points";
6513 }
6514
6515 my $synonyms;
6516 my $entries;
6517 if ($total_entries <= 1) {
6518 $synonyms = "";
6519 $entries = 'entry';
6520 $any_of_these = 'this'
6521 }
6522 else {
6523 $synonyms = " any of the following regular expression constructs";
6524 $entries = 'entries';
6525 $any_of_these = 'any of these'
6526 }
6527
6528 my $comment = "";
6529 if ($has_unrelated) {
6530 $comment .= <<END;
6531This file is for tables that are not necessarily related: To conserve
6532resources, every table that matches the identical set of code points in this
6533version of Unicode uses this file. Each one is listed in a separate group
6534below. It could be that the tables will match the same set of code points in
6535other Unicode releases, or it could be purely coincidence that they happen to
6536be the same in Unicode $string_version, and hence may not in other versions.
6537
6538END
6539 }
6540
6541 if (%flags) {
6542 foreach my $flag (sort keys %flags) {
6543 $comment .= <<END;
37e2e78e 6544'$flag' below means that this form is $flags{$flag}.
99870f4d 6545END
37e2e78e
KW
6546 next if $flag eq $PLACEHOLDER;
6547 $comment .= "Consult $pod_file.pod\n";
99870f4d
KW
6548 }
6549 $comment .= "\n";
6550 }
6551
6552 $comment .= <<END;
6553This file returns the $code_points in Unicode Version $string_version that
6554$match$synonyms:
6555
6556$matches_comment
37e2e78e 6557$pod_file.pod should be consulted for the syntax rules for $any_of_these,
99870f4d
KW
6558including if adding or subtracting white space, underscore, and hyphen
6559characters matters or doesn't matter, and other permissible syntactic
6560variants. Upper/lower case distinctions never matter.
6561END
6562
6563 if ($compound_name) {
6564 $comment .= <<END;
6565
6566A colon can be substituted for the equals sign, and
6567END
6568 if ($properties_with_compound_names > 1) {
6569 $comment .= <<END;
6570within each group above,
6571END
6572 }
6573 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
6574
6575 # Note the \b below, it says don't make that line a continuation.
6576 $comment .= <<END;
6577anything to the left of the equals (or colon) can be combined with anything to
6578the right. Thus, for example,
6579$compound_name
6580\bis also valid.
6581END
6582 }
6583
6584 # And append any comment(s) from the actual tables. They are all
6585 # gathered here, so may not read all that well.
37e2e78e
KW
6586 if (@global_comments) {
6587 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
6588 }
99870f4d
KW
6589
6590 if ($count) { # The format differs if no code points, and needs no
6591 # explanation in that case
6592 $comment.= <<END;
6593
6594The format of the lines of this file is:
6595END
6596 $comment.= <<END;
6597START\\tSTOP\\twhere START is the starting code point of the range, in hex;
6598STOP is the ending point, or if omitted, the range has just one code point.
6599END
0c07e538 6600 if ($leader->output_range_counts) {
99870f4d
KW
6601 $comment .= <<END;
6602Numbers in comments in [brackets] indicate how many code points are in the
6603range.
6604END
6605 }
6606 }
6607
6608 $leader->set_comment(main::join_lines($comment));
6609 return;
6610 }
6611
6612 # Accessors for the underlying list
ea25a9b2 6613 for my $sub (qw(
99870f4d
KW
6614 get_valid_code_point
6615 get_invalid_code_point
ea25a9b2 6616 ))
99870f4d
KW
6617 {
6618 no strict "refs";
6619 *$sub = sub {
6620 use strict "refs";
6621 my $self = shift;
6622
6623 return $self->_range_list->$sub(@_);
6624 }
6625 }
6626} # End closure for Match_Table
6627
6628package Property;
6629
6630# The Property class represents a Unicode property, or the $perl
6631# pseudo-property. It contains a map table initialized empty at construction
6632# time, and for properties accessible through regular expressions, various
6633# match tables, created through the add_match_table() method, and referenced
6634# by the table('NAME') or tables() methods, the latter returning a list of all
6635# of the match tables. Otherwise table operations implicitly are for the map
6636# table.
6637#
6638# Most of the data in the property is actually about its map table, so it
6639# mostly just uses that table's accessors for most methods. The two could
6640# have been combined into one object, but for clarity because of their
6641# differing semantics, they have been kept separate. It could be argued that
6642# the 'file' and 'directory' fields should be kept with the map table.
6643#
6644# Each property has a type. This can be set in the constructor, or in the
6645# set_type accessor, but mostly it is figured out by the data. Every property
6646# starts with unknown type, overridden by a parameter to the constructor, or
6647# as match tables are added, or ranges added to the map table, the data is
6648# inspected, and the type changed. After the table is mostly or entirely
6649# filled, compute_type() should be called to finalize they analysis.
6650#
6651# There are very few operations defined. One can safely remove a range from
6652# the map table, and property_add_or_replace_non_nulls() adds the maps from another
6653# table to this one, replacing any in the intersection of the two.
6654
6655sub standardize { return main::standardize($_[0]); }
6656sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
6657
6658{ # Closure
6659
6660 # This hash will contain as keys, all the aliases of all properties, and
6661 # as values, pointers to their respective property objects. This allows
6662 # quick look-up of a property from any of its names.
6663 my %alias_to_property_of;
6664
6665 sub dump_alias_to_property_of {
6666 # For debugging
6667
6668 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
6669 return;
6670 }
6671
6672 sub property_ref {
6673 # This is a package subroutine, not called as a method.
6674 # If the single parameter is a literal '*' it returns a list of all
6675 # defined properties.
6676 # Otherwise, the single parameter is a name, and it returns a pointer
6677 # to the corresponding property object, or undef if none.
6678 #
6679 # Properties can have several different names. The 'standard' form of
6680 # each of them is stored in %alias_to_property_of as they are defined.
6681 # But it's possible that this subroutine will be called with some
6682 # variant, so if the initial lookup fails, it is repeated with the
6683 # standarized form of the input name. If found, besides returning the
6684 # result, the input name is added to the list so future calls won't
6685 # have to do the conversion again.
6686
6687 my $name = shift;
6688
6689 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6690
6691 if (! defined $name) {
6692 Carp::my_carp_bug("Undefined input property. No action taken.");
6693 return;
6694 }
6695
6696 return main::uniques(values %alias_to_property_of) if $name eq '*';
6697
6698 # Return cached result if have it.
6699 my $result = $alias_to_property_of{$name};
6700 return $result if defined $result;
6701
6702 # Convert the input to standard form.
6703 my $standard_name = standardize($name);
6704
6705 $result = $alias_to_property_of{$standard_name};
6706 return unless defined $result; # Don't cache undefs
6707
6708 # Cache the result before returning it.
6709 $alias_to_property_of{$name} = $result;
6710 return $result;
6711 }
6712
6713
6714 main::setup_package();
6715
6716 my %map;
6717 # A pointer to the map table object for this property
6718 main::set_access('map', \%map);
6719
6720 my %full_name;
6721 # The property's full name. This is a duplicate of the copy kept in the
6722 # map table, but is needed because stringify needs it during
6723 # construction of the map table, and then would have a chicken before egg
6724 # problem.
6725 main::set_access('full_name', \%full_name, 'r');
6726
6727 my %table_ref;
6728 # This hash will contain as keys, all the aliases of any match tables
6729 # attached to this property, and as values, the pointers to their
6730 # respective tables. This allows quick look-up of a table from any of its
6731 # names.
6732 main::set_access('table_ref', \%table_ref);
6733
6734 my %type;
6735 # The type of the property, $ENUM, $BINARY, etc
6736 main::set_access('type', \%type, 'r');
6737
6738 my %file;
6739 # The filename where the map table will go (if actually written).
6740 # Normally defaulted, but can be overridden.
6741 main::set_access('file', \%file, 'r', 's');
6742
6743 my %directory;
6744 # The directory where the map table will go (if actually written).
6745 # Normally defaulted, but can be overridden.
6746 main::set_access('directory', \%directory, 's');
6747
6748 my %pseudo_map_type;
6749 # This is used to affect the calculation of the map types for all the
6750 # ranges in the table. It should be set to one of the values that signify
6751 # to alter the calculation.
6752 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
6753
6754 my %has_only_code_point_maps;
6755 # A boolean used to help in computing the type of data in the map table.
6756 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
6757
6758 my %unique_maps;
6759 # A list of the first few distinct mappings this property has. This is
6760 # used to disambiguate between binary and enum property types, so don't
6761 # have to keep more than three.
6762 main::set_access('unique_maps', \%unique_maps);
6763
6764 sub new {
6765 # The only required parameter is the positionally first, name. All
6766 # other parameters are key => value pairs. See the documentation just
6767 # above for the meanings of the ones not passed directly on to the map
6768 # table constructor.
6769
6770 my $class = shift;
6771 my $name = shift || "";
6772
6773 my $self = property_ref($name);
6774 if (defined $self) {
6775 my $options_string = join ", ", @_;
6776 $options_string = ". Ignoring options $options_string" if $options_string;
6777 Carp::my_carp("$self is already in use. Using existing one$options_string;");
6778 return $self;
6779 }
6780
6781 my %args = @_;
6782
6783 $self = bless \do { my $anonymous_scalar }, $class;
ffe43484 6784 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6785
6786 $directory{$addr} = delete $args{'Directory'};
6787 $file{$addr} = delete $args{'File'};
6788 $full_name{$addr} = delete $args{'Full_Name'} || $name;
6789 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
6790 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
6791 # Rest of parameters passed on.
6792
6793 $has_only_code_point_maps{$addr} = 1;
6794 $table_ref{$addr} = { };
6795 $unique_maps{$addr} = { };
6796
6797 $map{$addr} = Map_Table->new($name,
6798 Full_Name => $full_name{$addr},
6799 _Alias_Hash => \%alias_to_property_of,
6800 _Property => $self,
6801 %args);
6802 return $self;
6803 }
6804
6805 # See this program's beginning comment block about overloading the copy
6806 # constructor. Few operations are defined on properties, but a couple are
6807 # useful. It is safe to take the inverse of a property, and to remove a
6808 # single code point from it.
6809 use overload
6810 fallback => 0,
6811 qw("") => "_operator_stringify",
6812 "." => \&main::_operator_dot,
6813 '==' => \&main::_operator_equal,
6814 '!=' => \&main::_operator_not_equal,
6815 '=' => sub { return shift },
6816 '-=' => "_minus_and_equal",
6817 ;
6818
6819 sub _operator_stringify {
6820 return "Property '" . shift->full_name . "'";
6821 }
6822
6823 sub _minus_and_equal {
6824 # Remove a single code point from the map table of a property.
6825
6826 my $self = shift;
6827 my $other = shift;
6828 my $reversed = shift;
6829 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6830
6831 if (ref $other) {
6832 Carp::my_carp_bug("Can't cope with a "
6833 . ref($other)
6834 . " argument to '-='. Subtraction ignored.");
6835 return $self;
6836 }
6837 elsif ($reversed) { # Shouldnt happen in a -=, but just in case
6838 Carp::my_carp_bug("Can't cope with a "
6839 . __PACKAGE__
6840 . " being the first parameter in a '-='. Subtraction ignored.");
6841 return $self;
6842 }
6843 else {
f998e60c 6844 no overloading;
051df77b 6845 $map{pack 'J', $self}->delete_range($other, $other);
99870f4d
KW
6846 }
6847 return $self;
6848 }
6849
6850 sub add_match_table {
6851 # Add a new match table for this property, with name given by the
6852 # parameter. It returns a pointer to the table.
6853
6854 my $self = shift;
6855 my $name = shift;
6856 my %args = @_;
6857
ffe43484 6858 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6859
6860 my $table = $table_ref{$addr}{$name};
6861 my $standard_name = main::standardize($name);
6862 if (defined $table
6863 || (defined ($table = $table_ref{$addr}{$standard_name})))
6864 {
6865 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
6866 $table_ref{$addr}{$name} = $table;
6867 return $table;
6868 }
6869 else {
6870
6871 # See if this is a perl extension, if not passed in.
6872 my $perl_extension = delete $args{'Perl_Extension'};
6873 $perl_extension
6874 = $self->perl_extension if ! defined $perl_extension;
6875
6876 $table = Match_Table->new(
6877 Name => $name,
6878 Perl_Extension => $perl_extension,
6879 _Alias_Hash => $table_ref{$addr},
6880 _Property => $self,
6881
6882 # gets property's status by default
6883 Status => $self->status,
6884 _Status_Info => $self->status_info,
6885 %args,
6886 Internal_Only_Warning => 1); # Override any
6887 # input param
6888 return unless defined $table;
6889 }
6890
6891 # Save the names for quick look up
6892 $table_ref{$addr}{$standard_name} = $table;
6893 $table_ref{$addr}{$name} = $table;
6894
6895 # Perhaps we can figure out the type of this property based on the
6896 # fact of adding this match table. First, string properties don't
6897 # have match tables; second, a binary property can't have 3 match
6898 # tables
6899 if ($type{$addr} == $UNKNOWN) {
6900 $type{$addr} = $NON_STRING;
6901 }
6902 elsif ($type{$addr} == $STRING) {
6903 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
6904 $type{$addr} = $NON_STRING;
6905 }
6906 elsif ($type{$addr} != $ENUM) {
6907 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
6908 && $type{$addr} == $BINARY)
6909 {
6910 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
6911 $type{$addr} = $ENUM;
6912 }
6913 }
6914
6915 return $table;
6916 }
6917
6918 sub table {
6919 # Return a pointer to the match table (with name given by the
6920 # parameter) associated with this property; undef if none.
6921
6922 my $self = shift;
6923 my $name = shift;
6924 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6925
ffe43484 6926 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6927
6928 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
6929
6930 # If quick look-up failed, try again using the standard form of the
6931 # input name. If that succeeds, cache the result before returning so
6932 # won't have to standardize this input name again.
6933 my $standard_name = main::standardize($name);
6934 return unless defined $table_ref{$addr}{$standard_name};
6935
6936 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
6937 return $table_ref{$addr}{$name};
6938 }
6939
6940 sub tables {
6941 # Return a list of pointers to all the match tables attached to this
6942 # property
6943
f998e60c 6944 no overloading;
051df77b 6945 return main::uniques(values %{$table_ref{pack 'J', shift}});
99870f4d
KW
6946 }
6947
6948 sub directory {
6949 # Returns the directory the map table for this property should be
6950 # output in. If a specific directory has been specified, that has
6951 # priority; 'undef' is returned if the type isn't defined;
6952 # or $map_directory for everything else.
6953
ffe43484 6954 my $addr = do { no overloading; pack 'J', shift; };
99870f4d
KW
6955
6956 return $directory{$addr} if defined $directory{$addr};
6957 return undef if $type{$addr} == $UNKNOWN;
6958 return $map_directory;
6959 }
6960
6961 sub swash_name {
6962 # Return the name that is used to both:
6963 # 1) Name the file that the map table is written to.
6964 # 2) The name of swash related stuff inside that file.
6965 # The reason for this is that the Perl core historically has used
6966 # certain names that aren't the same as the Unicode property names.
6967 # To continue using these, $file is hard-coded in this file for those,
6968 # but otherwise the standard name is used. This is different from the
6969 # external_name, so that the rest of the files, like in lib can use
6970 # the standard name always, without regard to historical precedent.
6971
6972 my $self = shift;
6973 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6974
ffe43484 6975 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6976
6977 return $file{$addr} if defined $file{$addr};
6978 return $map{$addr}->external_name;
6979 }
6980
6981 sub to_create_match_tables {
6982 # Returns a boolean as to whether or not match tables should be
6983 # created for this property.
6984
6985 my $self = shift;
6986 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6987
6988 # The whole point of this pseudo property is match tables.
6989 return 1 if $self == $perl;
6990
ffe43484 6991 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
6992
6993 # Don't generate tables of code points that match the property values
6994 # of a string property. Such a list would most likely have many
6995 # property values, each with just one or very few code points mapping
6996 # to it.
6997 return 0 if $type{$addr} == $STRING;
6998
6999 # Don't generate anything for unimplemented properties.
7000 return 0 if grep { $self->complete_name eq $_ }
7001 @unimplemented_properties;
7002 # Otherwise, do.
7003 return 1;
7004 }
7005
7006 sub property_add_or_replace_non_nulls {
7007 # This adds the mappings in the property $other to $self. Non-null
7008 # mappings from $other override those in $self. It essentially merges
7009 # the two properties, with the second having priority except for null
7010 # mappings.
7011
7012 my $self = shift;
7013 my $other = shift;
7014 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7015
7016 if (! $other->isa(__PACKAGE__)) {
7017 Carp::my_carp_bug("$other should be a "
7018 . __PACKAGE__
7019 . ". Not a '"
7020 . ref($other)
7021 . "'. Not added;");
7022 return;
7023 }
7024
f998e60c 7025 no overloading;
051df77b 7026 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
99870f4d
KW
7027 }
7028
7029 sub set_type {
7030 # Set the type of the property. Mostly this is figured out by the
7031 # data in the table. But this is used to set it explicitly. The
7032 # reason it is not a standard accessor is that when setting a binary
7033 # property, we need to make sure that all the true/false aliases are
7034 # present, as they were omitted in early Unicode releases.
7035
7036 my $self = shift;
7037 my $type = shift;
7038 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7039
7040 if ($type != $ENUM && $type != $BINARY && $type != $STRING) {
7041 Carp::my_carp("Unrecognized type '$type'. Type not set");
7042 return;
7043 }
7044
051df77b 7045 { no overloading; $type{pack 'J', $self} = $type; }
99870f4d
KW
7046 return if $type != $BINARY;
7047
7048 my $yes = $self->table('Y');
7049 $yes = $self->table('Yes') if ! defined $yes;
7050 $yes = $self->add_match_table('Y') if ! defined $yes;
7051 $yes->add_alias('Yes');
7052 $yes->add_alias('T');
7053 $yes->add_alias('True');
7054
7055 my $no = $self->table('N');
7056 $no = $self->table('No') if ! defined $no;
7057 $no = $self->add_match_table('N') if ! defined $no;
7058 $no->add_alias('No');
7059 $no->add_alias('F');
7060 $no->add_alias('False');
7061 return;
7062 }
7063
7064 sub add_map {
7065 # Add a map to the property's map table. This also keeps
7066 # track of the maps so that the property type can be determined from
7067 # its data.
7068
7069 my $self = shift;
7070 my $start = shift; # First code point in range
7071 my $end = shift; # Final code point in range
7072 my $map = shift; # What the range maps to.
7073 # Rest of parameters passed on.
7074
ffe43484 7075 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7076
7077 # If haven't the type of the property, gather information to figure it
7078 # out.
7079 if ($type{$addr} == $UNKNOWN) {
7080
7081 # If the map contains an interior blank or dash, or most other
7082 # nonword characters, it will be a string property. This
7083 # heuristic may actually miss some string properties. If so, they
7084 # may need to have explicit set_types called for them. This
7085 # happens in the Unihan properties.
7086 if ($map =~ / (?<= . ) [ -] (?= . ) /x
7087 || $map =~ / [^\w.\/\ -] /x)
7088 {
7089 $self->set_type($STRING);
7090
7091 # $unique_maps is used for disambiguating between ENUM and
7092 # BINARY later; since we know the property is not going to be
7093 # one of those, no point in keeping the data around
7094 undef $unique_maps{$addr};
7095 }
7096 else {
7097
7098 # Not necessarily a string. The final decision has to be
7099 # deferred until all the data are in. We keep track of if all
7100 # the values are code points for that eventual decision.
7101 $has_only_code_point_maps{$addr} &=
7102 $map =~ / ^ $code_point_re $/x;
7103
7104 # For the purposes of disambiguating between binary and other
7105 # enumerations at the end, we keep track of the first three
7106 # distinct property values. Once we get to three, we know
7107 # it's not going to be binary, so no need to track more.
7108 if (scalar keys %{$unique_maps{$addr}} < 3) {
7109 $unique_maps{$addr}{main::standardize($map)} = 1;
7110 }
7111 }
7112 }
7113
7114 # Add the mapping by calling our map table's method
7115 return $map{$addr}->add_map($start, $end, $map, @_);
7116 }
7117
7118 sub compute_type {
7119 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
7120 # should be called after the property is mostly filled with its maps.
7121 # We have been keeping track of what the property values have been,
7122 # and now have the necessary information to figure out the type.
7123
7124 my $self = shift;
7125 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7126
ffe43484 7127 my $addr = do { no overloading; pack 'J', $self; };
99870f4d
KW
7128
7129 my $type = $type{$addr};
7130
7131 # If already have figured these out, no need to do so again, but we do
7132 # a double check on ENUMS to make sure that a string property hasn't
7133 # improperly been classified as an ENUM, so continue on with those.
7134 return if $type == $STRING || $type == $BINARY;
7135
7136 # If every map is to a code point, is a string property.
7137 if ($type == $UNKNOWN
7138 && ($has_only_code_point_maps{$addr}
7139 || (defined $map{$addr}->default_map
7140 && $map{$addr}->default_map eq "")))
7141 {
7142 $self->set_type($STRING);
7143 }
7144 else {
7145
7146 # Otherwise, it is to some sort of enumeration. (The case where
7147 # it is a Unicode miscellaneous property, and treated like a
7148 # string in this program is handled in add_map()). Distinguish
7149 # between binary and some other enumeration type. Of course, if
7150 # there are more than two values, it's not binary. But more
7151 # subtle is the test that the default mapping is defined means it
7152 # isn't binary. This in fact may change in the future if Unicode
7153 # changes the way its data is structured. But so far, no binary
7154 # properties ever have @missing lines for them, so the default map
7155 # isn't defined for them. The few properties that are two-valued
7156 # and aren't considered binary have the default map defined
7157 # starting in Unicode 5.0, when the @missing lines appeared; and
7158 # this program has special code to put in a default map for them
7159 # for earlier than 5.0 releases.
7160 if ($type == $ENUM
7161 || scalar keys %{$unique_maps{$addr}} > 2
7162 || defined $self->default_map)
7163 {
7164 my $tables = $self->tables;
7165 my $count = $self->count;
7166 if ($verbosity && $count > 500 && $tables/$count > .1) {
7167 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
7168 }
7169 $self->set_type($ENUM);
7170 }
7171 else {
7172 $self->set_type($BINARY);
7173 }
7174 }
7175 undef $unique_maps{$addr}; # Garbage collect
7176 return;
7177 }
7178
7179 # Most of the accessors for a property actually apply to its map table.
7180 # Setup up accessor functions for those, referring to %map
ea25a9b2 7181 for my $sub (qw(
99870f4d
KW
7182 add_alias
7183 add_anomalous_entry
7184 add_comment
7185 add_conflicting
7186 add_description
7187 add_duplicate
7188 add_note
7189 aliases
7190 comment
7191 complete_name
2f7a8815 7192 containing_range
99870f4d
KW
7193 core_access
7194 count
7195 default_map
7196 delete_range
7197 description
7198 each_range
7199 external_name
7200 file_path
7201 format
7202 initialize
7203 inverse_list
7204 is_empty
7205 name
7206 note
7207 perl_extension
7208 property
7209 range_count
7210 ranges
7211 range_size_1
7212 reset_each_range
7213 set_comment
7214 set_core_access
7215 set_default_map
7216 set_file_path
7217 set_final_comment
7218 set_range_size_1
7219 set_status
7220 set_to_output_map
7221 short_name
7222 status
7223 status_info
7224 to_output_map
0a9dbafc 7225 type_of
99870f4d
KW
7226 value_of
7227 write
ea25a9b2 7228 ))
99870f4d
KW
7229 # 'property' above is for symmetry, so that one can take
7230 # the property of a property and get itself, and so don't
7231 # have to distinguish between properties and tables in
7232 # calling code
7233 {
7234 no strict "refs";
7235 *$sub = sub {
7236 use strict "refs";
7237 my $self = shift;
f998e60c 7238 no overloading;
051df77b 7239 return $map{pack 'J', $self}->$sub(@_);
99870f4d
KW
7240 }
7241 }
7242
7243
7244} # End closure
7245
7246package main;
7247
7248sub join_lines($) {
7249 # Returns lines of the input joined together, so that they can be folded
7250 # properly.
7251 # This causes continuation lines to be joined together into one long line
7252 # for folding. A continuation line is any line that doesn't begin with a
7253 # space or "\b" (the latter is stripped from the output). This is so
7254 # lines can be be in a HERE document so as to fit nicely in the terminal
7255 # width, but be joined together in one long line, and then folded with
7256 # indents, '#' prefixes, etc, properly handled.
7257 # A blank separates the joined lines except if there is a break; an extra
7258 # blank is inserted after a period ending a line.
7259
7260 # Intialize the return with the first line.
7261 my ($return, @lines) = split "\n", shift;
7262
7263 # If the first line is null, it was an empty line, add the \n back in
7264 $return = "\n" if $return eq "";
7265
7266 # Now join the remainder of the physical lines.
7267 for my $line (@lines) {
7268
7269 # An empty line means wanted a blank line, so add two \n's to get that
7270 # effect, and go to the next line.
7271 if (length $line == 0) {
7272 $return .= "\n\n";
7273 next;
7274 }
7275
7276 # Look at the last character of what we have so far.
7277 my $previous_char = substr($return, -1, 1);
7278
7279 # And at the next char to be output.
7280 my $next_char = substr($line, 0, 1);
7281
7282 if ($previous_char ne "\n") {
7283
7284 # Here didn't end wth a nl. If the next char a blank or \b, it
7285 # means that here there is a break anyway. So add a nl to the
7286 # output.
7287 if ($next_char eq " " || $next_char eq "\b") {
7288 $previous_char = "\n";
7289 $return .= $previous_char;
7290 }
7291
7292 # Add an extra space after periods.
7293 $return .= " " if $previous_char eq '.';
7294 }
7295
7296 # Here $previous_char is still the latest character to be output. If
7297 # it isn't a nl, it means that the next line is to be a continuation
7298 # line, with a blank inserted between them.
7299 $return .= " " if $previous_char ne "\n";
7300
7301 # Get rid of any \b
7302 substr($line, 0, 1) = "" if $next_char eq "\b";
7303
7304 # And append this next line.
7305 $return .= $line;
7306 }
7307
7308 return $return;
7309}
7310
7311sub simple_fold($;$$$) {
7312 # Returns a string of the input (string or an array of strings) folded
7313 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
7314 # a \n
7315 # This is tailored for the kind of text written by this program,
7316 # especially the pod file, which can have very long names with
7317 # underscores in the middle, or words like AbcDefgHij.... We allow
7318 # breaking in the middle of such constructs if the line won't fit
7319 # otherwise. The break in such cases will come either just after an
7320 # underscore, or just before one of the Capital letters.
7321
7322 local $to_trace = 0 if main::DEBUG;
7323
7324 my $line = shift;
7325 my $prefix = shift; # Optional string to prepend to each output
7326 # line
7327 $prefix = "" unless defined $prefix;
7328
7329 my $hanging_indent = shift; # Optional number of spaces to indent
7330 # continuation lines
7331 $hanging_indent = 0 unless $hanging_indent;
7332
7333 my $right_margin = shift; # Optional number of spaces to narrow the
7334 # total width by.
7335 $right_margin = 0 unless defined $right_margin;
7336
7337 # Call carp with the 'nofold' option to avoid it from trying to call us
7338 # recursively
7339 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
7340
7341 # The space available doesn't include what's automatically prepended
7342 # to each line, or what's reserved on the right.
7343 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
7344 # XXX Instead of using the 'nofold' perhaps better to look up the stack
7345
7346 if (DEBUG && $hanging_indent >= $max) {
7347 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
7348 $hanging_indent = 0;
7349 }
7350
7351 # First, split into the current physical lines.
7352 my @line;
7353 if (ref $line) { # Better be an array, because not bothering to
7354 # test
7355 foreach my $line (@{$line}) {
7356 push @line, split /\n/, $line;
7357 }
7358 }
7359 else {
7360 @line = split /\n/, $line;
7361 }
7362
7363 #local $to_trace = 1 if main::DEBUG;
7364 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
7365
7366 # Look at each current physical line.
7367 for (my $i = 0; $i < @line; $i++) {
7368 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
7369 #local $to_trace = 1 if main::DEBUG;
7370 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
7371
7372 # Remove prefix, because will be added back anyway, don't want
7373 # doubled prefix
7374 $line[$i] =~ s/^$prefix//;
7375
7376 # Remove trailing space
7377 $line[$i] =~ s/\s+\Z//;
7378
7379 # If the line is too long, fold it.
7380 if (length $line[$i] > $max) {
7381 my $remainder;
7382
7383 # Here needs to fold. Save the leading space in the line for
7384 # later.
7385 $line[$i] =~ /^ ( \s* )/x;
7386 my $leading_space = $1;
7387 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
7388
7389 # If character at final permissible position is white space,
7390 # fold there, which will delete that white space
7391 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
7392 $remainder = substr($line[$i], $max);
7393 $line[$i] = substr($line[$i], 0, $max - 1);
7394 }
7395 else {
7396
7397 # Otherwise fold at an acceptable break char closest to
7398 # the max length. Look at just the maximal initial
7399 # segment of the line
7400 my $segment = substr($line[$i], 0, $max - 1);
7401 if ($segment =~
7402 /^ ( .{$hanging_indent} # Don't look before the
7403 # indent.
7404 \ * # Don't look in leading
7405 # blanks past the indent
7406 [^ ] .* # Find the right-most
7407 (?: # acceptable break:
7408 [ \s = ] # space or equal
7409 | - (?! [.0-9] ) # or non-unary minus.
7410 ) # $1 includes the character
7411 )/x)
7412 {
7413 # Split into the initial part that fits, and remaining
7414 # part of the input
7415 $remainder = substr($line[$i], length $1);
7416 $line[$i] = $1;
7417 trace $line[$i] if DEBUG && $to_trace;
7418 trace $remainder if DEBUG && $to_trace;
7419 }
7420
7421 # If didn't find a good breaking spot, see if there is a
7422 # not-so-good breaking spot. These are just after
7423 # underscores or where the case changes from lower to
7424 # upper. Use \a as a soft hyphen, but give up
7425 # and don't break the line if there is actually a \a
7426 # already in the input. We use an ascii character for the
7427 # soft-hyphen to avoid any attempt by miniperl to try to
7428 # access the files that this program is creating.
7429 elsif ($segment !~ /\a/
7430 && ($segment =~ s/_/_\a/g
7431 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
7432 {
7433 # Here were able to find at least one place to insert
7434 # our substitute soft hyphen. Find the right-most one
7435 # and replace it by a real hyphen.
7436 trace $segment if DEBUG && $to_trace;
7437 substr($segment,
7438 rindex($segment, "\a"),
7439 1) = '-';
7440
7441 # Then remove the soft hyphen substitutes.
7442 $segment =~ s/\a//g;
7443 trace $segment if DEBUG && $to_trace;
7444
7445 # And split into the initial part that fits, and
7446 # remainder of the line
7447 my $pos = rindex($segment, '-');
7448 $remainder = substr($line[$i], $pos);
7449 trace $remainder if DEBUG && $to_trace;
7450 $line[$i] = substr($segment, 0, $pos + 1);
7451 }
7452 }
7453
7454 # Here we know if we can fold or not. If we can, $remainder
7455 # is what remains to be processed in the next iteration.
7456 if (defined $remainder) {
7457 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
7458
7459 # Insert the folded remainder of the line as a new element
7460 # of the array. (It may still be too long, but we will
7461 # deal with that next time through the loop.) Omit any
7462 # leading space in the remainder.
7463 $remainder =~ s/^\s+//;
7464 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
7465
7466 # But then indent by whichever is larger of:
7467 # 1) the leading space on the input line;
7468 # 2) the hanging indent.
7469 # This preserves indentation in the original line.
7470 my $lead = ($leading_space)
7471 ? length $leading_space
7472 : $hanging_indent;
7473 $lead = max($lead, $hanging_indent);
7474 splice @line, $i+1, 0, (" " x $lead) . $remainder;
7475 }
7476 }
7477
7478 # Ready to output the line. Get rid of any trailing space
7479 # And prefix by the required $prefix passed in.
7480 $line[$i] =~ s/\s+$//;
7481 $line[$i] = "$prefix$line[$i]\n";
7482 } # End of looping through all the lines.
7483
7484 return join "", @line;
7485}
7486
7487sub property_ref { # Returns a reference to a property object.
7488 return Property::property_ref(@_);
7489}
7490
7491sub force_unlink ($) {
7492 my $filename = shift;
7493 return unless file_exists($filename);
7494 return if CORE::unlink($filename);
7495
7496 # We might need write permission
7497 chmod 0777, $filename;
7498 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
7499 return;
7500}
7501
7502sub write ($\@) {
7503 # Given a filename and a reference to an array of lines, write the lines
7504 # to the file
7505 # Filename can be given as an arrayref of directory names
7506
7507 my $file = shift;
7508 my $lines_ref = shift;
7509 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7510
7511 if (! defined $lines_ref) {
7512 Carp::my_carp("Missing lines to write parameter for $file. Writing skipped;");
7513 return;
7514 }
7515
7516 # Get into a single string if an array, and get rid of, in Unix terms, any
7517 # leading '.'
7518 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
7519 $file = File::Spec->canonpath($file);
7520
7521 # If has directories, make sure that they all exist
7522 (undef, my $directories, undef) = File::Spec->splitpath($file);
7523 File::Path::mkpath($directories) if $directories && ! -d $directories;
7524
7525 push @files_actually_output, $file;
7526
430ada4c 7527 unless (@$lines_ref) {
99870f4d
KW
7528 Carp::my_carp("Output file '$file' is empty; writing it anyway;");
7529 }
7530
7531 force_unlink ($file);
7532
7533 my $OUT;
7534 if (not open $OUT, ">", $file) {
7535 Carp::my_carp("can't open $file for output. Skipping this file: $!");
7536 return;
7537 }
430ada4c
NC
7538
7539 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
7540 close $OUT or die Carp::my_carp("close '$file' failed: $!");
7541
99870f4d
KW
7542 print "$file written.\n" if $verbosity >= $VERBOSE;
7543
99870f4d
KW
7544 return;
7545}
7546
7547
7548sub Standardize($) {
7549 # This converts the input name string into a standardized equivalent to
7550 # use internally.
7551
7552 my $name = shift;
7553 unless (defined $name) {
7554 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
7555 return;
7556 }
7557
7558 # Remove any leading or trailing white space
7559 $name =~ s/^\s+//g;
7560 $name =~ s/\s+$//g;
7561
7562 # Convert interior white space and hypens into underscores.
7563 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
7564
7565 # Capitalize the letter following an underscore, and convert a sequence of
7566 # multiple underscores to a single one
7567 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
7568
7569 # And capitalize the first letter, but not for the special cjk ones.
7570 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
7571 return $name;
7572}
7573
7574sub standardize ($) {
7575 # Returns a lower-cased standardized name, without underscores. This form
7576 # is chosen so that it can distinguish between any real versus superficial
7577 # Unicode name differences. It relies on the fact that Unicode doesn't
7578 # have interior underscores, white space, nor dashes in any
7579 # stricter-matched name. It should not be used on Unicode code point
7580 # names (the Name property), as they mostly, but not always follow these
7581 # rules.
7582
7583 my $name = Standardize(shift);
7584 return if !defined $name;
7585
7586 $name =~ s/ (?<= .) _ (?= . ) //xg;
7587 return lc $name;
7588}
7589
7590{ # Closure
7591
7592 my $indent_increment = " " x 2;
7593 my %already_output;
7594
7595 $main::simple_dumper_nesting = 0;
7596
7597 sub simple_dumper {
7598 # Like Simple Data::Dumper. Good enough for our needs. We can't use
7599 # the real thing as we have to run under miniperl.
7600
7601 # It is designed so that on input it is at the beginning of a line,
7602 # and the final thing output in any call is a trailing ",\n".
7603
7604 my $item = shift;
7605 my $indent = shift;
7606 $indent = "" if ! defined $indent;
7607
7608 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7609
7610 # nesting level is localized, so that as the call stack pops, it goes
7611 # back to the prior value.
7612 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
7613 undef %already_output if $main::simple_dumper_nesting == 0;
7614 $main::simple_dumper_nesting++;
7615 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
7616
7617 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7618
7619 # Determine the indent for recursive calls.
7620 my $next_indent = $indent . $indent_increment;
7621
7622 my $output;
7623 if (! ref $item) {
7624
7625 # Dump of scalar: just output it in quotes if not a number. To do
7626 # so we must escape certain characters, and therefore need to
7627 # operate on a copy to avoid changing the original
7628 my $copy = $item;
7629 $copy = $UNDEF unless defined $copy;
7630
7631 # Quote non-numbers (numbers also have optional leading '-' and
7632 # fractions)
7633 if ($copy eq "" || $copy !~ /^ -? \d+ ( \. \d+ )? $/x) {
7634
7635 # Escape apostrophe and backslash
7636 $copy =~ s/ ( ['\\] ) /\\$1/xg;
7637 $copy = "'$copy'";
7638 }
7639 $output = "$indent$copy,\n";
7640 }
7641 else {
7642
7643 # Keep track of cycles in the input, and refuse to infinitely loop
ffe43484 7644 my $addr = do { no overloading; pack 'J', $item; };
f998e60c 7645 if (defined $already_output{$addr}) {
99870f4d
KW
7646 return "${indent}ALREADY OUTPUT: $item\n";
7647 }
f998e60c 7648 $already_output{$addr} = $item;
99870f4d
KW
7649
7650 if (ref $item eq 'ARRAY') {
7651 my $using_brackets;
7652 $output = $indent;
7653 if ($main::simple_dumper_nesting > 1) {
7654 $output .= '[';
7655 $using_brackets = 1;
7656 }
7657 else {
7658 $using_brackets = 0;
7659 }
7660
7661 # If the array is empty, put the closing bracket on the same
7662 # line. Otherwise, recursively add each array element
7663 if (@$item == 0) {
7664 $output .= " ";
7665 }
7666 else {
7667 $output .= "\n";
7668 for (my $i = 0; $i < @$item; $i++) {
7669
7670 # Indent array elements one level
7671 $output .= &simple_dumper($item->[$i], $next_indent);
7672 $output =~ s/\n$//; # Remove trailing nl so as to
7673 $output .= " # [$i]\n"; # add a comment giving the
7674 # array index
7675 }
7676 $output .= $indent; # Indent closing ']' to orig level
7677 }
7678 $output .= ']' if $using_brackets;
7679 $output .= ",\n";
7680 }
7681 elsif (ref $item eq 'HASH') {
7682 my $is_first_line;
7683 my $using_braces;
7684 my $body_indent;
7685
7686 # No surrounding braces at top level
7687 $output .= $indent;
7688 if ($main::simple_dumper_nesting > 1) {
7689 $output .= "{\n";
7690 $is_first_line = 0;
7691 $body_indent = $next_indent;
7692 $next_indent .= $indent_increment;
7693 $using_braces = 1;
7694 }
7695 else {
7696 $is_first_line = 1;
7697 $body_indent = $indent;
7698 $using_braces = 0;
7699 }
7700
7701 # Output hashes sorted alphabetically instead of apparently
7702 # random. Use caseless alphabetic sort
7703 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
7704 {
7705 if ($is_first_line) {
7706 $is_first_line = 0;
7707 }
7708 else {
7709 $output .= "$body_indent";
7710 }
7711
7712 # The key must be a scalar, but this recursive call quotes
7713 # it
7714 $output .= &simple_dumper($key);
7715
7716 # And change the trailing comma and nl to the hash fat
7717 # comma for clarity, and so the value can be on the same
7718 # line
7719 $output =~ s/,\n$/ => /;
7720
7721 # Recursively call to get the value's dump.
7722 my $next = &simple_dumper($item->{$key}, $next_indent);
7723
7724 # If the value is all on one line, remove its indent, so
7725 # will follow the => immediately. If it takes more than
7726 # one line, start it on a new line.
7727 if ($next !~ /\n.*\n/) {
7728 $next =~ s/^ *//;
7729 }
7730 else {
7731 $output .= "\n";
7732 }
7733 $output .= $next;
7734 }
7735
7736 $output .= "$indent},\n" if $using_braces;
7737 }
7738 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
7739 $output = $indent . ref($item) . "\n";
7740 # XXX see if blessed
7741 }
7742 elsif ($item->can('dump')) {
7743
7744 # By convention in this program, objects furnish a 'dump'
7745 # method. Since not doing any output at this level, just pass
7746 # on the input indent
7747 $output = $item->dump($indent);
7748 }
7749 else {
7750 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
7751 }
7752 }
7753 return $output;
7754 }
7755}
7756
7757sub dump_inside_out {
7758 # Dump inside-out hashes in an object's state by converting them to a
7759 # regular hash and then calling simple_dumper on that.
7760
7761 my $object = shift;
7762 my $fields_ref = shift;
7763 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7764
ffe43484 7765 my $addr = do { no overloading; pack 'J', $object; };
99870f4d
KW
7766
7767 my %hash;
7768 foreach my $key (keys %$fields_ref) {
7769 $hash{$key} = $fields_ref->{$key}{$addr};
7770 }
7771
7772 return simple_dumper(\%hash, @_);
7773}
7774
7775sub _operator_dot {
7776 # Overloaded '.' method that is common to all packages. It uses the
7777 # package's stringify method.
7778
7779 my $self = shift;
7780 my $other = shift;
7781 my $reversed = shift;
7782 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7783
7784 $other = "" unless defined $other;
7785
7786 foreach my $which (\$self, \$other) {
7787 next unless ref $$which;
7788 if ($$which->can('_operator_stringify')) {
7789 $$which = $$which->_operator_stringify;
7790 }
7791 else {
7792 my $ref = ref $$which;
ffe43484 7793 my $addr = do { no overloading; pack 'J', $$which; };
99870f4d
KW
7794 $$which = "$ref ($addr)";
7795 }
7796 }
7797 return ($reversed)
7798 ? "$other$self"
7799 : "$self$other";
7800}
7801
7802sub _operator_equal {
7803 # Generic overloaded '==' routine. To be equal, they must be the exact
7804 # same object
7805
7806 my $self = shift;
7807 my $other = shift;
7808
7809 return 0 unless defined $other;
7810 return 0 unless ref $other;
f998e60c 7811 no overloading;
2100aa98 7812 return $self == $other;
99870f4d
KW
7813}
7814
7815sub _operator_not_equal {
7816 my $self = shift;
7817 my $other = shift;
7818
7819 return ! _operator_equal($self, $other);
7820}
7821
7822sub process_PropertyAliases($) {
7823 # This reads in the PropertyAliases.txt file, which contains almost all
7824 # the character properties in Unicode and their equivalent aliases:
7825 # scf ; Simple_Case_Folding ; sfc
7826 #
7827 # Field 0 is the preferred short name for the property.
7828 # Field 1 is the full name.
7829 # Any succeeding ones are other accepted names.
7830
7831 my $file= shift;
7832 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7833
7834 # This whole file was non-existent in early releases, so use our own
7835 # internal one.
7836 $file->insert_lines(get_old_property_aliases())
7837 if ! -e 'PropertyAliases.txt';
7838
7839 # Add any cjk properties that may have been defined.
7840 $file->insert_lines(@cjk_properties);
7841
7842 while ($file->next_line) {
7843
7844 my @data = split /\s*;\s*/;
7845
7846 my $full = $data[1];
7847
7848 my $this = Property->new($data[0], Full_Name => $full);
7849
7850 # Start looking for more aliases after these two.
7851 for my $i (2 .. @data - 1) {
7852 $this->add_alias($data[$i]);
7853 }
7854
7855 }
7856 return;
7857}
7858
7859sub finish_property_setup {
7860 # Finishes setting up after PropertyAliases.
7861
7862 my $file = shift;
7863 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7864
7865 # This entry was missing from this file in earlier Unicode versions
7866 if (-e 'Jamo.txt') {
7867 my $jsn = property_ref('JSN');
7868 if (! defined $jsn) {
7869 $jsn = Property->new('JSN', Full_Name => 'Jamo_Short_Name');
7870 }
7871 }
7872
7873 # This entry is still missing as of 5.2, perhaps because no short name for
7874 # it.
7875 if (-e 'NameAliases.txt') {
7876 my $aliases = property_ref('Name_Alias');
7877 if (! defined $aliases) {
7878 $aliases = Property->new('Name_Alias');
7879 }
7880 }
7881
7882 # These are used so much, that we set globals for them.
7883 $gc = property_ref('General_Category');
7884 $block = property_ref('Block');
7885
7886 # Perl adds this alias.
7887 $gc->add_alias('Category');
7888
7889 # For backwards compatibility, these property files have particular names.
7890 my $upper = property_ref('Uppercase_Mapping');
7891 $upper->set_core_access('uc()');
7892 $upper->set_file('Upper'); # This is what utf8.c calls it
7893
7894 my $lower = property_ref('Lowercase_Mapping');
7895 $lower->set_core_access('lc()');
7896 $lower->set_file('Lower');
7897
7898 my $title = property_ref('Titlecase_Mapping');
7899 $title->set_core_access('ucfirst()');
7900 $title->set_file('Title');
7901
7902 my $fold = property_ref('Case_Folding');
7903 $fold->set_file('Fold') if defined $fold;
7904
7905 # utf8.c can't currently cope with non range-size-1 for these, and even if
7906 # it were changed to do so, someone else may be using them, expecting the
7907 # old style
7908 foreach my $property (qw {
7909 Case_Folding
7910 Lowercase_Mapping
7911 Titlecase_Mapping
7912 Uppercase_Mapping
7913 })
7914 {
7915 property_ref($property)->set_range_size_1(1);
7916 }
7917
7918 # These two properties aren't actually used in the core, but unfortunately
7919 # the names just above that are in the core interfere with these, so
7920 # choose different names. These aren't a problem unless the map tables
7921 # for these files get written out.
7922 my $lowercase = property_ref('Lowercase');
7923 $lowercase->set_file('IsLower') if defined $lowercase;
7924 my $uppercase = property_ref('Uppercase');
7925 $uppercase->set_file('IsUpper') if defined $uppercase;
7926
7927 # Set up the hard-coded default mappings, but only on properties defined
7928 # for this release
7929 foreach my $property (keys %default_mapping) {
7930 my $property_object = property_ref($property);
7931 next if ! defined $property_object;
7932 my $default_map = $default_mapping{$property};
7933 $property_object->set_default_map($default_map);
7934
7935 # A map of <code point> implies the property is string.
7936 if ($property_object->type == $UNKNOWN
7937 && $default_map eq $CODE_POINT)
7938 {
7939 $property_object->set_type($STRING);
7940 }
7941 }
7942
7943 # The following use the Multi_Default class to create objects for
7944 # defaults.
7945
7946 # Bidi class has a complicated default, but the derived file takes care of
7947 # the complications, leaving just 'L'.
7948 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
7949 property_ref('Bidi_Class')->set_default_map('L');
7950 }
7951 else {
7952 my $default;
7953
7954 # The derived file was introduced in 3.1.1. The values below are
7955 # taken from table 3-8, TUS 3.0
7956 my $default_R =
7957 'my $default = Range_List->new;
7958 $default->add_range(0x0590, 0x05FF);
7959 $default->add_range(0xFB1D, 0xFB4F);'
7960 ;
7961
7962 # The defaults apply only to unassigned characters
a67f160a 7963 $default_R .= '$gc->table("Unassigned") & $default;';
99870f4d
KW
7964
7965 if ($v_version lt v3.0.0) {
7966 $default = Multi_Default->new(R => $default_R, 'L');
7967 }
7968 else {
7969
7970 # AL apparently not introduced until 3.0: TUS 2.x references are
7971 # not on-line to check it out
7972 my $default_AL =
7973 'my $default = Range_List->new;
7974 $default->add_range(0x0600, 0x07BF);
7975 $default->add_range(0xFB50, 0xFDFF);
7976 $default->add_range(0xFE70, 0xFEFF);'
7977 ;
7978
7979 # Non-character code points introduced in this release; aren't AL
7980 if ($v_version ge 3.1.0) {
7981 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
7982 }
a67f160a 7983 $default_AL .= '$gc->table("Unassigned") & $default';
99870f4d
KW
7984 $default = Multi_Default->new(AL => $default_AL,
7985 R => $default_R,
7986 'L');
7987 }
7988 property_ref('Bidi_Class')->set_default_map($default);
7989 }
7990
7991 # Joining type has a complicated default, but the derived file takes care
7992 # of the complications, leaving just 'U' (or Non_Joining), except the file
7993 # is bad in 3.1.0
7994 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
7995 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
7996 property_ref('Joining_Type')->set_default_map('Non_Joining');
7997 }
7998 else {
7999
8000 # Otherwise, there are not one, but two possibilities for the
8001 # missing defaults: T and U.
8002 # The missing defaults that evaluate to T are given by:
8003 # T = Mn + Cf - ZWNJ - ZWJ
8004 # where Mn and Cf are the general category values. In other words,
8005 # any non-spacing mark or any format control character, except
8006 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
8007 # WIDTH JOINER (joining type C).
8008 my $default = Multi_Default->new(
8009 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
8010 'Non_Joining');
8011 property_ref('Joining_Type')->set_default_map($default);
8012 }
8013 }
8014
8015 # Line break has a complicated default in early releases. It is 'Unknown'
8016 # for non-assigned code points; 'AL' for assigned.
8017 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
8018 my $lb = property_ref('Line_Break');
8019 if ($v_version gt 3.2.0) {
8020 $lb->set_default_map('Unknown');
8021 }
8022 else {
8023 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
8024 'AL');
8025 $lb->set_default_map($default);
8026 }
8027
8028 # If has the URS property, make sure that the standard aliases are in
8029 # it, since not in the input tables in some versions.
8030 my $urs = property_ref('Unicode_Radical_Stroke');
8031 if (defined $urs) {
8032 $urs->add_alias('cjkRSUnicode');
8033 $urs->add_alias('kRSUnicode');
8034 }
8035 }
8036 return;
8037}
8038
8039sub get_old_property_aliases() {
8040 # Returns what would be in PropertyAliases.txt if it existed in very old
8041 # versions of Unicode. It was derived from the one in 3.2, and pared
8042 # down based on the data that was actually in the older releases.
8043 # An attempt was made to use the existence of files to mean inclusion or
8044 # not of various aliases, but if this was not sufficient, using version
8045 # numbers was resorted to.
8046
8047 my @return;
8048
8049 # These are to be used in all versions (though some are constructed by
8050 # this program if missing)
8051 push @return, split /\n/, <<'END';
8052bc ; Bidi_Class
8053Bidi_M ; Bidi_Mirrored
8054cf ; Case_Folding
8055ccc ; Canonical_Combining_Class
8056dm ; Decomposition_Mapping
8057dt ; Decomposition_Type
8058gc ; General_Category
8059isc ; ISO_Comment
8060lc ; Lowercase_Mapping
8061na ; Name
8062na1 ; Unicode_1_Name
8063nt ; Numeric_Type
8064nv ; Numeric_Value
8065sfc ; Simple_Case_Folding
8066slc ; Simple_Lowercase_Mapping
8067stc ; Simple_Titlecase_Mapping
8068suc ; Simple_Uppercase_Mapping
8069tc ; Titlecase_Mapping
8070uc ; Uppercase_Mapping
8071END
8072
8073 if (-e 'Blocks.txt') {
8074 push @return, "blk ; Block\n";
8075 }
8076 if (-e 'ArabicShaping.txt') {
8077 push @return, split /\n/, <<'END';
8078jg ; Joining_Group
8079jt ; Joining_Type
8080END
8081 }
8082 if (-e 'PropList.txt') {
8083
8084 # This first set is in the original old-style proplist.
8085 push @return, split /\n/, <<'END';
8086Alpha ; Alphabetic
8087Bidi_C ; Bidi_Control
8088Dash ; Dash
8089Dia ; Diacritic
8090Ext ; Extender
8091Hex ; Hex_Digit
8092Hyphen ; Hyphen
8093IDC ; ID_Continue
8094Ideo ; Ideographic
8095Join_C ; Join_Control
8096Math ; Math
8097QMark ; Quotation_Mark
8098Term ; Terminal_Punctuation
8099WSpace ; White_Space
8100END
8101 # The next sets were added later
8102 if ($v_version ge v3.0.0) {
8103 push @return, split /\n/, <<'END';
8104Upper ; Uppercase
8105Lower ; Lowercase
8106END
8107 }
8108 if ($v_version ge v3.0.1) {
8109 push @return, split /\n/, <<'END';
8110NChar ; Noncharacter_Code_Point
8111END
8112 }
8113 # The next sets were added in the new-style
8114 if ($v_version ge v3.1.0) {
8115 push @return, split /\n/, <<'END';
8116OAlpha ; Other_Alphabetic
8117OLower ; Other_Lowercase
8118OMath ; Other_Math
8119OUpper ; Other_Uppercase
8120END
8121 }
8122 if ($v_version ge v3.1.1) {
8123 push @return, "AHex ; ASCII_Hex_Digit\n";
8124 }
8125 }
8126 if (-e 'EastAsianWidth.txt') {
8127 push @return, "ea ; East_Asian_Width\n";
8128 }
8129 if (-e 'CompositionExclusions.txt') {
8130 push @return, "CE ; Composition_Exclusion\n";
8131 }
8132 if (-e 'LineBreak.txt') {
8133 push @return, "lb ; Line_Break\n";
8134 }
8135 if (-e 'BidiMirroring.txt') {
8136 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
8137 }
8138 if (-e 'Scripts.txt') {
8139 push @return, "sc ; Script\n";
8140 }
8141 if (-e 'DNormalizationProps.txt') {
8142 push @return, split /\n/, <<'END';
8143Comp_Ex ; Full_Composition_Exclusion
8144FC_NFKC ; FC_NFKC_Closure
8145NFC_QC ; NFC_Quick_Check
8146NFD_QC ; NFD_Quick_Check
8147NFKC_QC ; NFKC_Quick_Check
8148NFKD_QC ; NFKD_Quick_Check
8149XO_NFC ; Expands_On_NFC
8150XO_NFD ; Expands_On_NFD
8151XO_NFKC ; Expands_On_NFKC
8152XO_NFKD ; Expands_On_NFKD
8153END
8154 }
8155 if (-e 'DCoreProperties.txt') {
8156 push @return, split /\n/, <<'END';
8157IDS ; ID_Start
8158XIDC ; XID_Continue
8159XIDS ; XID_Start
8160END
8161 # These can also appear in some versions of PropList.txt
8162 push @return, "Lower ; Lowercase\n"
8163 unless grep { $_ =~ /^Lower\b/} @return;
8164 push @return, "Upper ; Uppercase\n"
8165 unless grep { $_ =~ /^Upper\b/} @return;
8166 }
8167
8168 # This flag requires the DAge.txt file to be copied into the directory.
8169 if (DEBUG && $compare_versions) {
8170 push @return, 'age ; Age';
8171 }
8172
8173 return @return;
8174}
8175
8176sub process_PropValueAliases {
8177 # This file contains values that properties look like:
8178 # bc ; AL ; Arabic_Letter
8179 # blk; n/a ; Greek_And_Coptic ; Greek
8180 #
8181 # Field 0 is the property.
8182 # Field 1 is the short name of a property value or 'n/a' if no
8183 # short name exists;
8184 # Field 2 is the full property value name;
8185 # Any other fields are more synonyms for the property value.
8186 # Purely numeric property values are omitted from the file; as are some
8187 # others, fewer and fewer in later releases
8188
8189 # Entries for the ccc property have an extra field before the
8190 # abbreviation:
8191 # ccc; 0; NR ; Not_Reordered
8192 # It is the numeric value that the names are synonyms for.
8193
8194 # There are comment entries for values missing from this file:
8195 # # @missing: 0000..10FFFF; ISO_Comment; <none>
8196 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
8197
8198 my $file= shift;
8199 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8200
8201 # This whole file was non-existent in early releases, so use our own
8202 # internal one if necessary.
8203 if (! -e 'PropValueAliases.txt') {
8204 $file->insert_lines(get_old_property_value_aliases());
8205 }
8206
8207 # Add any explicit cjk values
8208 $file->insert_lines(@cjk_property_values);
8209
8210 # This line is used only for testing the code that checks for name
8211 # conflicts. There is a script Inherited, and when this line is executed
8212 # it causes there to be a name conflict with the 'Inherited' that this
8213 # program generates for this block property value
8214 #$file->insert_lines('blk; n/a; Herited');
8215
8216
8217 # Process each line of the file ...
8218 while ($file->next_line) {
8219
8220 my ($property, @data) = split /\s*;\s*/;
8221
8222 # The full name for the ccc property value is in field 2 of the
8223 # remaining ones; field 1 for all other properties. Swap ccc fields 1
8224 # and 2. (Rightmost splice removes field 2, returning it; left splice
8225 # inserts that into field 1, thus shifting former field 1 to field 2.)
8226 splice (@data, 1, 0, splice(@data, 2, 1)) if $property eq 'ccc';
8227
8228 # If there is no short name, use the full one in element 1
8229 $data[0] = $data[1] if $data[0] eq "n/a";
8230
8231 # Earlier releases had the pseudo property 'qc' that should expand to
8232 # the ones that replace it below.
8233 if ($property eq 'qc') {
8234 if (lc $data[0] eq 'y') {
8235 $file->insert_lines('NFC_QC; Y ; Yes',
8236 'NFD_QC; Y ; Yes',
8237 'NFKC_QC; Y ; Yes',
8238 'NFKD_QC; Y ; Yes',
8239 );
8240 }
8241 elsif (lc $data[0] eq 'n') {
8242 $file->insert_lines('NFC_QC; N ; No',
8243 'NFD_QC; N ; No',
8244 'NFKC_QC; N ; No',
8245 'NFKD_QC; N ; No',
8246 );
8247 }
8248 elsif (lc $data[0] eq 'm') {
8249 $file->insert_lines('NFC_QC; M ; Maybe',
8250 'NFKC_QC; M ; Maybe',
8251 );
8252 }
8253 else {
8254 $file->carp_bad_line("qc followed by unexpected '$data[0]");
8255 }
8256 next;
8257 }
8258
8259 # The first field is the short name, 2nd is the full one.
8260 my $property_object = property_ref($property);
8261 my $table = $property_object->add_match_table($data[0],
8262 Full_Name => $data[1]);
8263
8264 # Start looking for more aliases after these two.
8265 for my $i (2 .. @data - 1) {
8266 $table->add_alias($data[$i]);
8267 }
8268 } # End of looping through the file
8269
8270 # As noted in the comments early in the program, it generates tables for
8271 # the default values for all releases, even those for which the concept
8272 # didn't exist at the time. Here we add those if missing.
8273 my $age = property_ref('age');
8274 if (defined $age && ! defined $age->table('Unassigned')) {
8275 $age->add_match_table('Unassigned');
8276 }
8277 $block->add_match_table('No_Block') if -e 'Blocks.txt'
8278 && ! defined $block->table('No_Block');
8279
8280
8281 # Now set the default mappings of the properties from the file. This is
8282 # done after the loop because a number of properties have only @missings
8283 # entries in the file, and may not show up until the end.
8284 my @defaults = $file->get_missings;
8285 foreach my $default_ref (@defaults) {
8286 my $default = $default_ref->[0];
8287 my $property = property_ref($default_ref->[1]);
8288 $property->set_default_map($default);
8289 }
8290 return;
8291}
8292
8293sub get_old_property_value_aliases () {
8294 # Returns what would be in PropValueAliases.txt if it existed in very old
8295 # versions of Unicode. It was derived from the one in 3.2, and pared
8296 # down. An attempt was made to use the existence of files to mean
8297 # inclusion or not of various aliases, but if this was not sufficient,
8298 # using version numbers was resorted to.
8299
8300 my @return = split /\n/, <<'END';
8301bc ; AN ; Arabic_Number
8302bc ; B ; Paragraph_Separator
8303bc ; CS ; Common_Separator
8304bc ; EN ; European_Number
8305bc ; ES ; European_Separator
8306bc ; ET ; European_Terminator
8307bc ; L ; Left_To_Right
8308bc ; ON ; Other_Neutral
8309bc ; R ; Right_To_Left
8310bc ; WS ; White_Space
8311
8312# The standard combining classes are very much different in v1, so only use
8313# ones that look right (not checked thoroughly)
8314ccc; 0; NR ; Not_Reordered
8315ccc; 1; OV ; Overlay
8316ccc; 7; NK ; Nukta
8317ccc; 8; KV ; Kana_Voicing
8318ccc; 9; VR ; Virama
8319ccc; 202; ATBL ; Attached_Below_Left
8320ccc; 216; ATAR ; Attached_Above_Right
8321ccc; 218; BL ; Below_Left
8322ccc; 220; B ; Below
8323ccc; 222; BR ; Below_Right
8324ccc; 224; L ; Left
8325ccc; 228; AL ; Above_Left
8326ccc; 230; A ; Above
8327ccc; 232; AR ; Above_Right
8328ccc; 234; DA ; Double_Above
8329
8330dt ; can ; canonical
8331dt ; enc ; circle
8332dt ; fin ; final
8333dt ; font ; font
8334dt ; fra ; fraction
8335dt ; init ; initial
8336dt ; iso ; isolated
8337dt ; med ; medial
8338dt ; n/a ; none
8339dt ; nb ; noBreak
8340dt ; sqr ; square
8341dt ; sub ; sub
8342dt ; sup ; super
8343
8344gc ; C ; Other # Cc | Cf | Cn | Co | Cs
8345gc ; Cc ; Control
8346gc ; Cn ; Unassigned
8347gc ; Co ; Private_Use
8348gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
8349gc ; LC ; Cased_Letter # Ll | Lt | Lu
8350gc ; Ll ; Lowercase_Letter
8351gc ; Lm ; Modifier_Letter
8352gc ; Lo ; Other_Letter
8353gc ; Lu ; Uppercase_Letter
8354gc ; M ; Mark # Mc | Me | Mn
8355gc ; Mc ; Spacing_Mark
8356gc ; Mn ; Nonspacing_Mark
8357gc ; N ; Number # Nd | Nl | No
8358gc ; Nd ; Decimal_Number
8359gc ; No ; Other_Number
8360gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
8361gc ; Pd ; Dash_Punctuation
8362gc ; Pe ; Close_Punctuation
8363gc ; Po ; Other_Punctuation
8364gc ; Ps ; Open_Punctuation
8365gc ; S ; Symbol # Sc | Sk | Sm | So
8366gc ; Sc ; Currency_Symbol
8367gc ; Sm ; Math_Symbol
8368gc ; So ; Other_Symbol
8369gc ; Z ; Separator # Zl | Zp | Zs
8370gc ; Zl ; Line_Separator
8371gc ; Zp ; Paragraph_Separator
8372gc ; Zs ; Space_Separator
8373
8374nt ; de ; Decimal
8375nt ; di ; Digit
8376nt ; n/a ; None
8377nt ; nu ; Numeric
8378END
8379
8380 if (-e 'ArabicShaping.txt') {
8381 push @return, split /\n/, <<'END';
8382jg ; n/a ; AIN
8383jg ; n/a ; ALEF
8384jg ; n/a ; DAL
8385jg ; n/a ; GAF
8386jg ; n/a ; LAM
8387jg ; n/a ; MEEM
8388jg ; n/a ; NO_JOINING_GROUP
8389jg ; n/a ; NOON
8390jg ; n/a ; QAF
8391jg ; n/a ; SAD
8392jg ; n/a ; SEEN
8393jg ; n/a ; TAH
8394jg ; n/a ; WAW
8395
8396jt ; C ; Join_Causing
8397jt ; D ; Dual_Joining
8398jt ; L ; Left_Joining
8399jt ; R ; Right_Joining
8400jt ; U ; Non_Joining
8401jt ; T ; Transparent
8402END
8403 if ($v_version ge v3.0.0) {
8404 push @return, split /\n/, <<'END';
8405jg ; n/a ; ALAPH
8406jg ; n/a ; BEH
8407jg ; n/a ; BETH
8408jg ; n/a ; DALATH_RISH
8409jg ; n/a ; E
8410jg ; n/a ; FEH
8411jg ; n/a ; FINAL_SEMKATH
8412jg ; n/a ; GAMAL
8413jg ; n/a ; HAH
8414jg ; n/a ; HAMZA_ON_HEH_GOAL
8415jg ; n/a ; HE
8416jg ; n/a ; HEH
8417jg ; n/a ; HEH_GOAL
8418jg ; n/a ; HETH
8419jg ; n/a ; KAF
8420jg ; n/a ; KAPH
8421jg ; n/a ; KNOTTED_HEH
8422jg ; n/a ; LAMADH
8423jg ; n/a ; MIM
8424jg ; n/a ; NUN
8425jg ; n/a ; PE
8426jg ; n/a ; QAPH
8427jg ; n/a ; REH
8428jg ; n/a ; REVERSED_PE
8429jg ; n/a ; SADHE
8430jg ; n/a ; SEMKATH
8431jg ; n/a ; SHIN
8432jg ; n/a ; SWASH_KAF
8433jg ; n/a ; TAW
8434jg ; n/a ; TEH_MARBUTA
8435jg ; n/a ; TETH
8436jg ; n/a ; YEH
8437jg ; n/a ; YEH_BARREE
8438jg ; n/a ; YEH_WITH_TAIL
8439jg ; n/a ; YUDH
8440jg ; n/a ; YUDH_HE
8441jg ; n/a ; ZAIN
8442END
8443 }
8444 }
8445
8446
8447 if (-e 'EastAsianWidth.txt') {
8448 push @return, split /\n/, <<'END';
8449ea ; A ; Ambiguous
8450ea ; F ; Fullwidth
8451ea ; H ; Halfwidth
8452ea ; N ; Neutral
8453ea ; Na ; Narrow
8454ea ; W ; Wide
8455END
8456 }
8457
8458 if (-e 'LineBreak.txt') {
8459 push @return, split /\n/, <<'END';
8460lb ; AI ; Ambiguous
8461lb ; AL ; Alphabetic
8462lb ; B2 ; Break_Both
8463lb ; BA ; Break_After
8464lb ; BB ; Break_Before
8465lb ; BK ; Mandatory_Break
8466lb ; CB ; Contingent_Break
8467lb ; CL ; Close_Punctuation
8468lb ; CM ; Combining_Mark
8469lb ; CR ; Carriage_Return
8470lb ; EX ; Exclamation
8471lb ; GL ; Glue
8472lb ; HY ; Hyphen
8473lb ; ID ; Ideographic
8474lb ; IN ; Inseperable
8475lb ; IS ; Infix_Numeric
8476lb ; LF ; Line_Feed
8477lb ; NS ; Nonstarter
8478lb ; NU ; Numeric
8479lb ; OP ; Open_Punctuation
8480lb ; PO ; Postfix_Numeric
8481lb ; PR ; Prefix_Numeric
8482lb ; QU ; Quotation
8483lb ; SA ; Complex_Context
8484lb ; SG ; Surrogate
8485lb ; SP ; Space
8486lb ; SY ; Break_Symbols
8487lb ; XX ; Unknown
8488lb ; ZW ; ZWSpace
8489END
8490 }
8491
8492 if (-e 'DNormalizationProps.txt') {
8493 push @return, split /\n/, <<'END';
8494qc ; M ; Maybe
8495qc ; N ; No
8496qc ; Y ; Yes
8497END
8498 }
8499
8500 if (-e 'Scripts.txt') {
8501 push @return, split /\n/, <<'END';
8502sc ; Arab ; Arabic
8503sc ; Armn ; Armenian
8504sc ; Beng ; Bengali
8505sc ; Bopo ; Bopomofo
8506sc ; Cans ; Canadian_Aboriginal
8507sc ; Cher ; Cherokee
8508sc ; Cyrl ; Cyrillic
8509sc ; Deva ; Devanagari
8510sc ; Dsrt ; Deseret
8511sc ; Ethi ; Ethiopic
8512sc ; Geor ; Georgian
8513sc ; Goth ; Gothic
8514sc ; Grek ; Greek
8515sc ; Gujr ; Gujarati
8516sc ; Guru ; Gurmukhi
8517sc ; Hang ; Hangul
8518sc ; Hani ; Han
8519sc ; Hebr ; Hebrew
8520sc ; Hira ; Hiragana
8521sc ; Ital ; Old_Italic
8522sc ; Kana ; Katakana
8523sc ; Khmr ; Khmer
8524sc ; Knda ; Kannada
8525sc ; Laoo ; Lao
8526sc ; Latn ; Latin
8527sc ; Mlym ; Malayalam
8528sc ; Mong ; Mongolian
8529sc ; Mymr ; Myanmar
8530sc ; Ogam ; Ogham
8531sc ; Orya ; Oriya
8532sc ; Qaai ; Inherited
8533sc ; Runr ; Runic
8534sc ; Sinh ; Sinhala
8535sc ; Syrc ; Syriac
8536sc ; Taml ; Tamil
8537sc ; Telu ; Telugu
8538sc ; Thaa ; Thaana
8539sc ; Thai ; Thai
8540sc ; Tibt ; Tibetan
8541sc ; Yiii ; Yi
8542sc ; Zyyy ; Common
8543END
8544 }
8545
8546 if ($v_version ge v2.0.0) {
8547 push @return, split /\n/, <<'END';
8548dt ; com ; compat
8549dt ; nar ; narrow
8550dt ; sml ; small
8551dt ; vert ; vertical
8552dt ; wide ; wide
8553
8554gc ; Cf ; Format
8555gc ; Cs ; Surrogate
8556gc ; Lt ; Titlecase_Letter
8557gc ; Me ; Enclosing_Mark
8558gc ; Nl ; Letter_Number
8559gc ; Pc ; Connector_Punctuation
8560gc ; Sk ; Modifier_Symbol
8561END
8562 }
8563 if ($v_version ge v2.1.2) {
8564 push @return, "bc ; S ; Segment_Separator\n";
8565 }
8566 if ($v_version ge v2.1.5) {
8567 push @return, split /\n/, <<'END';
8568gc ; Pf ; Final_Punctuation
8569gc ; Pi ; Initial_Punctuation
8570END
8571 }
8572 if ($v_version ge v2.1.8) {
8573 push @return, "ccc; 240; IS ; Iota_Subscript\n";
8574 }
8575
8576 if ($v_version ge v3.0.0) {
8577 push @return, split /\n/, <<'END';
8578bc ; AL ; Arabic_Letter
8579bc ; BN ; Boundary_Neutral
8580bc ; LRE ; Left_To_Right_Embedding
8581bc ; LRO ; Left_To_Right_Override
8582bc ; NSM ; Nonspacing_Mark
8583bc ; PDF ; Pop_Directional_Format
8584bc ; RLE ; Right_To_Left_Embedding
8585bc ; RLO ; Right_To_Left_Override
8586
8587ccc; 233; DB ; Double_Below
8588END
8589 }
8590
8591 if ($v_version ge v3.1.0) {
8592 push @return, "ccc; 226; R ; Right\n";
8593 }
8594
8595 return @return;
8596}
8597
b1c167a3
KW
8598sub output_perl_charnames_line ($$) {
8599
8600 # Output the entries in Perl_charnames specially, using 5 digits instead
8601 # of four. This makes the entries a constant length, and simplifies
8602 # charnames.pm which this table is for. Unicode can have 6 digit
8603 # ordinals, but they are all private use or noncharacters which do not
8604 # have names, so won't be in this table.
8605
73d9566f 8606 return sprintf "%05X\t%s\n", $_[0], $_[1];
b1c167a3
KW
8607}
8608
99870f4d
KW
8609{ # Closure
8610 # This is used to store the range list of all the code points usable when
8611 # the little used $compare_versions feature is enabled.
8612 my $compare_versions_range_list;
8613
8614 sub process_generic_property_file {
8615 # This processes a file containing property mappings and puts them
8616 # into internal map tables. It should be used to handle any property
8617 # files that have mappings from a code point or range thereof to
8618 # something else. This means almost all the UCD .txt files.
8619 # each_line_handlers() should be set to adjust the lines of these
8620 # files, if necessary, to what this routine understands:
8621 #
8622 # 0374 ; NFD_QC; N
8623 # 003C..003E ; Math
8624 #
8625 # the fields are: "codepoint range ; property; map"
8626 #
8627 # meaning the codepoints in the range all have the value 'map' under
8628 # 'property'.
8629 # Beginning and trailing white space in each field are not signficant.
8630 # Note there is not a trailing semi-colon in the above. A trailing
8631 # semi-colon means the map is a null-string. An omitted map, as
8632 # opposed to a null-string, is assumed to be 'Y', based on Unicode
8633 # table syntax. (This could have been hidden from this routine by
8634 # doing it in the $file object, but that would require parsing of the
8635 # line there, so would have to parse it twice, or change the interface
8636 # to pass this an array. So not done.)
8637 #
8638 # The map field may begin with a sequence of commands that apply to
8639 # this range. Each such command begins and ends with $CMD_DELIM.
8640 # These are used to indicate, for example, that the mapping for a
8641 # range has a non-default type.
8642 #
8643 # This loops through the file, calling it's next_line() method, and
8644 # then taking the map and adding it to the property's table.
8645 # Complications arise because any number of properties can be in the
8646 # file, in any order, interspersed in any way. The first time a
8647 # property is seen, it gets information about that property and
f86864ac 8648 # caches it for quick retrieval later. It also normalizes the maps
99870f4d
KW
8649 # so that only one of many synonym is stored. The Unicode input files
8650 # do use some multiple synonyms.
8651
8652 my $file = shift;
8653 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8654
8655 my %property_info; # To keep track of what properties
8656 # have already had entries in the
8657 # current file, and info about each,
8658 # so don't have to recompute.
8659 my $property_name; # property currently being worked on
8660 my $property_type; # and its type
8661 my $previous_property_name = ""; # name from last time through loop
8662 my $property_object; # pointer to the current property's
8663 # object
8664 my $property_addr; # the address of that object
8665 my $default_map; # the string that code points missing
8666 # from the file map to
8667 my $default_table; # For non-string properties, a
8668 # reference to the match table that
8669 # will contain the list of code
8670 # points that map to $default_map.
8671
8672 # Get the next real non-comment line
8673 LINE:
8674 while ($file->next_line) {
8675
8676 # Default replacement type; means that if parts of the range have
8677 # already been stored in our tables, the new map overrides them if
8678 # they differ more than cosmetically
8679 my $replace = $IF_NOT_EQUIVALENT;
8680 my $map_type; # Default type for the map of this range
8681
8682 #local $to_trace = 1 if main::DEBUG;
8683 trace $_ if main::DEBUG && $to_trace;
8684
8685 # Split the line into components
8686 my ($range, $property_name, $map, @remainder)
8687 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
8688
8689 # If more or less on the line than we are expecting, warn and skip
8690 # the line
8691 if (@remainder) {
8692 $file->carp_bad_line('Extra fields');
8693 next LINE;
8694 }
8695 elsif ( ! defined $property_name) {
8696 $file->carp_bad_line('Missing property');
8697 next LINE;
8698 }
8699
8700 # Examine the range.
8701 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
8702 {
8703 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
8704 next LINE;
8705 }
8706 my $low = hex $1;
8707 my $high = (defined $2) ? hex $2 : $low;
8708
8709 # For the very specialized case of comparing two Unicode
8710 # versions...
8711 if (DEBUG && $compare_versions) {
8712 if ($property_name eq 'Age') {
8713
8714 # Only allow code points at least as old as the version
8715 # specified.
8716 my $age = pack "C*", split(/\./, $map); # v string
8717 next LINE if $age gt $compare_versions;
8718 }
8719 else {
8720
8721 # Again, we throw out code points younger than those of
8722 # the specified version. By now, the Age property is
8723 # populated. We use the intersection of each input range
8724 # with this property to find what code points in it are
8725 # valid. To do the intersection, we have to convert the
8726 # Age property map to a Range_list. We only have to do
8727 # this once.
8728 if (! defined $compare_versions_range_list) {
8729 my $age = property_ref('Age');
8730 if (! -e 'DAge.txt') {
8731 croak "Need to have 'DAge.txt' file to do version comparison";
8732 }
8733 elsif ($age->count == 0) {
8734 croak "The 'Age' table is empty, but its file exists";
8735 }
8736 $compare_versions_range_list
8737 = Range_List->new(Initialize => $age);
8738 }
8739
8740 # An undefined map is always 'Y'
8741 $map = 'Y' if ! defined $map;
8742
8743 # Calculate the intersection of the input range with the
8744 # code points that are known in the specified version
8745 my @ranges = ($compare_versions_range_list
8746 & Range->new($low, $high))->ranges;
8747
8748 # If the intersection is empty, throw away this range
8749 next LINE unless @ranges;
8750
8751 # Only examine the first range this time through the loop.
8752 my $this_range = shift @ranges;
8753
8754 # Put any remaining ranges in the queue to be processed
8755 # later. Note that there is unnecessary work here, as we
8756 # will do the intersection again for each of these ranges
8757 # during some future iteration of the LINE loop, but this
8758 # code is not used in production. The later intersections
8759 # are guaranteed to not splinter, so this will not become
8760 # an infinite loop.
8761 my $line = join ';', $property_name, $map;
8762 foreach my $range (@ranges) {
8763 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
8764 $range->start,
8765 $range->end,
8766 $line));
8767 }
8768
8769 # And process the first range, like any other.
8770 $low = $this_range->start;
8771 $high = $this_range->end;
8772 }
8773 } # End of $compare_versions
8774
8775 # If changing to a new property, get the things constant per
8776 # property
8777 if ($previous_property_name ne $property_name) {
8778
8779 $property_object = property_ref($property_name);
8780 if (! defined $property_object) {
8781 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
8782 next LINE;
8783 }
051df77b 8784 { no overloading; $property_addr = pack 'J', $property_object; }
99870f4d
KW
8785
8786 # Defer changing names until have a line that is acceptable
8787 # (the 'next' statement above means is unacceptable)
8788 $previous_property_name = $property_name;
8789
8790 # If not the first time for this property, retrieve info about
8791 # it from the cache
8792 if (defined ($property_info{$property_addr}{'type'})) {
8793 $property_type = $property_info{$property_addr}{'type'};
8794 $default_map = $property_info{$property_addr}{'default'};
8795 $map_type
8796 = $property_info{$property_addr}{'pseudo_map_type'};
8797 $default_table
8798 = $property_info{$property_addr}{'default_table'};
8799 }
8800 else {
8801
8802 # Here, is the first time for this property. Set up the
8803 # cache.
8804 $property_type = $property_info{$property_addr}{'type'}
8805 = $property_object->type;
8806 $map_type
8807 = $property_info{$property_addr}{'pseudo_map_type'}
8808 = $property_object->pseudo_map_type;
8809
8810 # The Unicode files are set up so that if the map is not
8811 # defined, it is a binary property
8812 if (! defined $map && $property_type != $BINARY) {
8813 if ($property_type != $UNKNOWN
8814 && $property_type != $NON_STRING)
8815 {
8816 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
8817 }
8818 else {
8819 $property_object->set_type($BINARY);
8820 $property_type
8821 = $property_info{$property_addr}{'type'}
8822 = $BINARY;
8823 }
8824 }
8825
8826 # Get any @missings default for this property. This
8827 # should precede the first entry for the property in the
8828 # input file, and is located in a comment that has been
8829 # stored by the Input_file class until we access it here.
8830 # It's possible that there is more than one such line
8831 # waiting for us; collect them all, and parse
8832 my @missings_list = $file->get_missings
8833 if $file->has_missings_defaults;
8834 foreach my $default_ref (@missings_list) {
8835 my $default = $default_ref->[0];
ffe43484 8836 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
99870f4d
KW
8837
8838 # For string properties, the default is just what the
8839 # file says, but non-string properties should already
8840 # have set up a table for the default property value;
8841 # use the table for these, so can resolve synonyms
8842 # later to a single standard one.
8843 if ($property_type == $STRING
8844 || $property_type == $UNKNOWN)
8845 {
8846 $property_info{$addr}{'missings'} = $default;
8847 }
8848 else {
8849 $property_info{$addr}{'missings'}
8850 = $property_object->table($default);
8851 }
8852 }
8853
8854 # Finished storing all the @missings defaults in the input
8855 # file so far. Get the one for the current property.
8856 my $missings = $property_info{$property_addr}{'missings'};
8857
8858 # But we likely have separately stored what the default
8859 # should be. (This is to accommodate versions of the
8860 # standard where the @missings lines are absent or
8861 # incomplete.) Hopefully the two will match. But check
8862 # it out.
8863 $default_map = $property_object->default_map;
8864
8865 # If the map is a ref, it means that the default won't be
8866 # processed until later, so undef it, so next few lines
8867 # will redefine it to something that nothing will match
8868 undef $default_map if ref $default_map;
8869
8870 # Create a $default_map if don't have one; maybe a dummy
8871 # that won't match anything.
8872 if (! defined $default_map) {
8873
8874 # Use any @missings line in the file.
8875 if (defined $missings) {
8876 if (ref $missings) {
8877 $default_map = $missings->full_name;
8878 $default_table = $missings;
8879 }
8880 else {
8881 $default_map = $missings;
8882 }
678f13d5 8883
99870f4d
KW
8884 # And store it with the property for outside use.
8885 $property_object->set_default_map($default_map);
8886 }
8887 else {
8888
8889 # Neither an @missings nor a default map. Create
8890 # a dummy one, so won't have to test definedness
8891 # in the main loop.
8892 $default_map = '_Perl This will never be in a file
8893 from Unicode';
8894 }
8895 }
8896
8897 # Here, we have $default_map defined, possibly in terms of
8898 # $missings, but maybe not, and possibly is a dummy one.
8899 if (defined $missings) {
8900
8901 # Make sure there is no conflict between the two.
8902 # $missings has priority.
8903 if (ref $missings) {
23e33b60
KW
8904 $default_table
8905 = $property_object->table($default_map);
99870f4d
KW
8906 if (! defined $default_table
8907 || $default_table != $missings)
8908 {
8909 if (! defined $default_table) {
8910 $default_table = $UNDEF;
8911 }
8912 $file->carp_bad_line(<<END
8913The \@missings line for $property_name in $file says that missings default to
8914$missings, but we expect it to be $default_table. $missings used.
8915END
8916 );
8917 $default_table = $missings;
8918 $default_map = $missings->full_name;
8919 }
8920 $property_info{$property_addr}{'default_table'}
8921 = $default_table;
8922 }
8923 elsif ($default_map ne $missings) {
8924 $file->carp_bad_line(<<END
8925The \@missings line for $property_name in $file says that missings default to
8926$missings, but we expect it to be $default_map. $missings used.
8927END
8928 );
8929 $default_map = $missings;
8930 }
8931 }
8932
8933 $property_info{$property_addr}{'default'}
8934 = $default_map;
8935
8936 # If haven't done so already, find the table corresponding
8937 # to this map for non-string properties.
8938 if (! defined $default_table
8939 && $property_type != $STRING
8940 && $property_type != $UNKNOWN)
8941 {
8942 $default_table = $property_info{$property_addr}
8943 {'default_table'}
8944 = $property_object->table($default_map);
8945 }
8946 } # End of is first time for this property
8947 } # End of switching properties.
8948
8949 # Ready to process the line.
8950 # The Unicode files are set up so that if the map is not defined,
8951 # it is a binary property with value 'Y'
8952 if (! defined $map) {
8953 $map = 'Y';
8954 }
8955 else {
8956
8957 # If the map begins with a special command to us (enclosed in
8958 # delimiters), extract the command(s).
8959 if (substr($map, 0, 1) eq $CMD_DELIM) {
8960 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
8961 my $command = $1;
8962 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
8963 $replace = $1;
8964 }
8965 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
8966 $map_type = $1;
8967 }
8968 else {
8969 $file->carp_bad_line("Unknown command line: '$1'");
8970 next LINE;
8971 }
8972 }
8973 }
8974 }
8975
8976 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
8977 {
8978
8979 # Here, we have a map to a particular code point, and the
8980 # default map is to a code point itself. If the range
8981 # includes the particular code point, change that portion of
8982 # the range to the default. This makes sure that in the final
8983 # table only the non-defaults are listed.
8984 my $decimal_map = hex $map;
8985 if ($low <= $decimal_map && $decimal_map <= $high) {
8986
8987 # If the range includes stuff before or after the map
8988 # we're changing, split it and process the split-off parts
8989 # later.
8990 if ($low < $decimal_map) {
8991 $file->insert_adjusted_lines(
8992 sprintf("%04X..%04X; %s; %s",
8993 $low,
8994 $decimal_map - 1,
8995 $property_name,
8996 $map));
8997 }
8998 if ($high > $decimal_map) {
8999 $file->insert_adjusted_lines(
9000 sprintf("%04X..%04X; %s; %s",
9001 $decimal_map + 1,
9002 $high,
9003 $property_name,
9004 $map));
9005 }
9006 $low = $high = $decimal_map;
9007 $map = $CODE_POINT;
9008 }
9009 }
9010
9011 # If we can tell that this is a synonym for the default map, use
9012 # the default one instead.
9013 if ($property_type != $STRING
9014 && $property_type != $UNKNOWN)
9015 {
9016 my $table = $property_object->table($map);
9017 if (defined $table && $table == $default_table) {
9018 $map = $default_map;
9019 }
9020 }
9021
9022 # And figure out the map type if not known.
9023 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
9024 if ($map eq "") { # Nulls are always $NULL map type
9025 $map_type = $NULL;
9026 } # Otherwise, non-strings, and those that don't allow
9027 # $MULTI_CP, and those that aren't multiple code points are
9028 # 0
9029 elsif
9030 (($property_type != $STRING && $property_type != $UNKNOWN)
9031 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
9032 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
9033 {
9034 $map_type = 0;
9035 }
9036 else {
9037 $map_type = $MULTI_CP;
9038 }
9039 }
9040
9041 $property_object->add_map($low, $high,
9042 $map,
9043 Type => $map_type,
9044 Replace => $replace);
9045 } # End of loop through file's lines
9046
9047 return;
9048 }
9049}
9050
99870f4d
KW
9051{ # Closure for UnicodeData.txt handling
9052
9053 # This file was the first one in the UCD; its design leads to some
9054 # awkwardness in processing. Here is a sample line:
9055 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
9056 # The fields in order are:
9057 my $i = 0; # The code point is in field 0, and is shifted off.
28093d0e 9058 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
99870f4d
KW
9059 my $CATEGORY = $i++; # category (e.g. "Lu")
9060 my $CCC = $i++; # Canonical combining class (e.g. "230")
9061 my $BIDI = $i++; # directional class (e.g. "L")
9062 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
9063 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
9064 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
9065 # Dual-use in this program; see below
9066 my $NUMERIC = $i++; # numeric value
9067 my $MIRRORED = $i++; # ? mirrored
9068 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
9069 my $COMMENT = $i++; # iso comment
9070 my $UPPER = $i++; # simple uppercase mapping
9071 my $LOWER = $i++; # simple lowercase mapping
9072 my $TITLE = $i++; # simple titlecase mapping
9073 my $input_field_count = $i;
9074
9075 # This routine in addition outputs these extra fields:
9076 my $DECOMP_TYPE = $i++; # Decomposition type
28093d0e
KW
9077
9078 # These fields are modifications of ones above, and are usually
9079 # suppressed; they must come last, as for speed, the loop upper bound is
9080 # normally set to ignore them
9081 my $NAME = $i++; # This is the strict name field, not the one that
9082 # charnames uses.
9083 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
9084 # by Unicode::Normalize
99870f4d
KW
9085 my $last_field = $i - 1;
9086
9087 # All these are read into an array for each line, with the indices defined
9088 # above. The empty fields in the example line above indicate that the
9089 # value is defaulted. The handler called for each line of the input
9090 # changes these to their defaults.
9091
9092 # Here are the official names of the properties, in a parallel array:
9093 my @field_names;
9094 $field_names[$BIDI] = 'Bidi_Class';
9095 $field_names[$CATEGORY] = 'General_Category';
9096 $field_names[$CCC] = 'Canonical_Combining_Class';
28093d0e 9097 $field_names[$CHARNAME] = 'Perl_Charnames';
99870f4d
KW
9098 $field_names[$COMMENT] = 'ISO_Comment';
9099 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
9100 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
959ce5bf 9101 $field_names[$LOWER] = 'Lowercase_Mapping';
99870f4d
KW
9102 $field_names[$MIRRORED] = 'Bidi_Mirrored';
9103 $field_names[$NAME] = 'Name';
9104 $field_names[$NUMERIC] = 'Numeric_Value';
9105 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
9106 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
9107 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
959ce5bf 9108 $field_names[$TITLE] = 'Titlecase_Mapping';
99870f4d 9109 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
959ce5bf 9110 $field_names[$UPPER] = 'Uppercase_Mapping';
99870f4d 9111
28093d0e
KW
9112 # Some of these need a little more explanation:
9113 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
9114 # property, but is used in calculating the Numeric_Type. Perl however,
9115 # creates a file from this field, so a Perl property is created from it.
9116 # Similarly, the Other_Digit field is used only for calculating the
9117 # Numeric_Type, and so it can be safely re-used as the place to store
9118 # the value for Numeric_Type; hence it is referred to as
9119 # $NUMERIC_TYPE_OTHER_DIGIT.
9120 # The input field named $PERL_DECOMPOSITION is a combination of both the
9121 # decomposition mapping and its type. Perl creates a file containing
9122 # exactly this field, so it is used for that. The two properties are
9123 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
9124 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
9125 # output it), as Perl doesn't use it directly.
9126 # The input field named here $CHARNAME is used to construct the
9127 # Perl_Charnames property, which is a combination of the Name property
9128 # (which the input field contains), and the Unicode_1_Name property, and
9129 # others from other files. Since, the strict Name property is not used
9130 # by Perl, this field is used for the table that Perl does use. The
9131 # strict Name property table is usually suppressed (unless the lists are
9132 # changed to output it), so it is accumulated in a separate field,
9133 # $NAME, which to save time is discarded unless the table is actually to
9134 # be output
99870f4d
KW
9135
9136 # This file is processed like most in this program. Control is passed to
9137 # process_generic_property_file() which calls filter_UnicodeData_line()
9138 # for each input line. This filter converts the input into line(s) that
9139 # process_generic_property_file() understands. There is also a setup
9140 # routine called before any of the file is processed, and a handler for
9141 # EOF processing, all in this closure.
9142
9143 # A huge speed-up occurred at the cost of some added complexity when these
9144 # routines were altered to buffer the outputs into ranges. Almost all the
9145 # lines of the input file apply to just one code point, and for most
9146 # properties, the map for the next code point up is the same as the
9147 # current one. So instead of creating a line for each property for each
9148 # input line, filter_UnicodeData_line() remembers what the previous map
9149 # of a property was, and doesn't generate a line to pass on until it has
9150 # to, as when the map changes; and that passed-on line encompasses the
9151 # whole contiguous range of code points that have the same map for that
9152 # property. This means a slight amount of extra setup, and having to
9153 # flush these buffers on EOF, testing if the maps have changed, plus
9154 # remembering state information in the closure. But it means a lot less
9155 # real time in not having to change the data base for each property on
9156 # each line.
9157
9158 # Another complication is that there are already a few ranges designated
9159 # in the input. There are two lines for each, with the same maps except
9160 # the code point and name on each line. This was actually the hardest
9161 # thing to design around. The code points in those ranges may actually
9162 # have real maps not given by these two lines. These maps will either
9163 # be algorthimically determinable, or in the extracted files furnished
9164 # with the UCD. In the event of conflicts between these extracted files,
9165 # and this one, Unicode says that this one prevails. But it shouldn't
9166 # prevail for conflicts that occur in these ranges. The data from the
9167 # extracted files prevails in those cases. So, this program is structured
9168 # so that those files are processed first, storing maps. Then the other
9169 # files are processed, generally overwriting what the extracted files
9170 # stored. But just the range lines in this input file are processed
9171 # without overwriting. This is accomplished by adding a special string to
9172 # the lines output to tell process_generic_property_file() to turn off the
9173 # overwriting for just this one line.
9174 # A similar mechanism is used to tell it that the map is of a non-default
9175 # type.
9176
9177 sub setup_UnicodeData { # Called before any lines of the input are read
9178 my $file = shift;
9179 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9180
28093d0e
KW
9181 # Create a new property specially located that is a combination of the
9182 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
9183 # Name_Alias properties. (The final duplicates elements of the
9184 # first.) A comment for it will later be constructed based on the
9185 # actual properties present and used
3e20195b 9186 $perl_charname = Property->new('Perl_Charnames',
28093d0e
KW
9187 Core_Access => '\N{...} and "use charnames"',
9188 Default_Map => "",
9189 Directory => File::Spec->curdir(),
9190 File => 'Name',
9191 Internal_Only_Warning => 1,
9192 Perl_Extension => 1,
b1c167a3 9193 Range_Size_1 => \&output_perl_charnames_line,
28093d0e
KW
9194 Type => $STRING,
9195 );
9196
99870f4d 9197 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
517956bf 9198 Directory => File::Spec->curdir(),
99870f4d 9199 File => 'Decomposition',
a14f3cb1 9200 Format => $DECOMP_STRING_FORMAT,
99870f4d
KW
9201 Internal_Only_Warning => 1,
9202 Perl_Extension => 1,
9203 Default_Map => $CODE_POINT,
9204
0c07e538
KW
9205 # normalize.pm can't cope with these
9206 Output_Range_Counts => 0,
9207
99870f4d
KW
9208 # This is a specially formatted table
9209 # explicitly for normalize.pm, which
9210 # is expecting a particular format,
9211 # which means that mappings containing
9212 # multiple code points are in the main
9213 # body of the table
9214 Map_Type => $COMPUTE_NO_MULTI_CP,
9215 Type => $STRING,
9216 );
9217 $Perl_decomp->add_comment(join_lines(<<END
9218This mapping is a combination of the Unicode 'Decomposition_Type' and
9219'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
9220identical to the official Unicode 'Decomposition_Mapping' property except for
9221two things:
9222 1) It omits the algorithmically determinable Hangul syllable decompositions,
9223which normalize.pm handles algorithmically.
9224 2) It contains the decomposition type as well. Non-canonical decompositions
9225begin with a word in angle brackets, like <super>, which denotes the
9226compatible decomposition type. If the map does not begin with the <angle
9227brackets>, the decomposition is canonical.
9228END
9229 ));
9230
9231 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
9232 Default_Map => "",
9233 Perl_Extension => 1,
9234 File => 'Digit', # Trad. location
9235 Directory => $map_directory,
9236 Type => $STRING,
9237 Range_Size_1 => 1,
9238 );
9239 $Decimal_Digit->add_comment(join_lines(<<END
9240This file gives the mapping of all code points which represent a single
9241decimal digit [0-9] to their respective digits. For example, the code point
9242U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
9243that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
9244numerals.
9245END
9246 ));
9247
28093d0e
KW
9248 # These properties are not used for generating anything else, and are
9249 # usually not output. By making them last in the list, we can just
99870f4d 9250 # change the high end of the loop downwards to avoid the work of
28093d0e
KW
9251 # generating a table(s) that is/are just going to get thrown away.
9252 if (! property_ref('Decomposition_Mapping')->to_output_map
9253 && ! property_ref('Name')->to_output_map)
9254 {
9255 $last_field = min($NAME, $DECOMP_MAP) - 1;
9256 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
9257 $last_field = $DECOMP_MAP;
9258 } elsif (property_ref('Name')->to_output_map) {
9259 $last_field = $NAME;
99870f4d
KW
9260 }
9261 return;
9262 }
9263
9264 my $first_time = 1; # ? Is this the first line of the file
9265 my $in_range = 0; # ? Are we in one of the file's ranges
9266 my $previous_cp; # hex code point of previous line
9267 my $decimal_previous_cp = -1; # And its decimal equivalent
9268 my @start; # For each field, the current starting
9269 # code point in hex for the range
9270 # being accumulated.
9271 my @fields; # The input fields;
9272 my @previous_fields; # And those from the previous call
9273
9274 sub filter_UnicodeData_line {
9275 # Handle a single input line from UnicodeData.txt; see comments above
9276 # Conceptually this takes a single line from the file containing N
9277 # properties, and converts it into N lines with one property per line,
9278 # which is what the final handler expects. But there are
9279 # complications due to the quirkiness of the input file, and to save
9280 # time, it accumulates ranges where the property values don't change
9281 # and only emits lines when necessary. This is about an order of
9282 # magnitude fewer lines emitted.
9283
9284 my $file = shift;
9285 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9286
9287 # $_ contains the input line.
9288 # -1 in split means retain trailing null fields
9289 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
9290
9291 #local $to_trace = 1 if main::DEBUG;
9292 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
9293 if (@fields > $input_field_count) {
9294 $file->carp_bad_line('Extra fields');
9295 $_ = "";
9296 return;
9297 }
9298
9299 my $decimal_cp = hex $cp;
9300
9301 # We have to output all the buffered ranges when the next code point
9302 # is not exactly one after the previous one, which means there is a
9303 # gap in the ranges.
9304 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
9305
9306 # The decomposition mapping field requires special handling. It looks
9307 # like either:
9308 #
9309 # <compat> 0032 0020
9310 # 0041 0300
9311 #
9312 # The decomposition type is enclosed in <brackets>; if missing, it
9313 # means the type is canonical. There are two decomposition mapping
9314 # tables: the one for use by Perl's normalize.pm has a special format
9315 # which is this field intact; the other, for general use is of
9316 # standard format. In either case we have to find the decomposition
9317 # type. Empty fields have None as their type, and map to the code
9318 # point itself
9319 if ($fields[$PERL_DECOMPOSITION] eq "") {
9320 $fields[$DECOMP_TYPE] = 'None';
9321 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
9322 }
9323 else {
9324 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
9325 =~ / < ( .+? ) > \s* ( .+ ) /x;
9326 if (! defined $fields[$DECOMP_TYPE]) {
9327 $fields[$DECOMP_TYPE] = 'Canonical';
9328 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
9329 }
9330 else {
9331 $fields[$DECOMP_MAP] = $map;
9332 }
9333 }
9334
9335 # The 3 numeric fields also require special handling. The 2 digit
9336 # fields must be either empty or match the number field. This means
9337 # that if it is empty, they must be as well, and the numeric type is
9338 # None, and the numeric value is 'Nan'.
9339 # The decimal digit field must be empty or match the other digit
9340 # field. If the decimal digit field is non-empty, the code point is
9341 # a decimal digit, and the other two fields will have the same value.
9342 # If it is empty, but the other digit field is non-empty, the code
9343 # point is an 'other digit', and the number field will have the same
9344 # value as the other digit field. If the other digit field is empty,
9345 # but the number field is non-empty, the code point is a generic
9346 # numeric type.
9347 if ($fields[$NUMERIC] eq "") {
9348 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
9349 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
9350 ) {
9351 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
9352 }
9353 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
9354 $fields[$NUMERIC] = 'NaN';
9355 }
9356 else {
9357 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
9358 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
9359 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
9360 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
9361 }
9362 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
9363 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
9364 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
9365 }
9366 else {
9367 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
9368
9369 # Rationals require extra effort.
9370 register_fraction($fields[$NUMERIC])
9371 if $fields[$NUMERIC] =~ qr{/};
9372 }
9373 }
9374
9375 # For the properties that have empty fields in the file, and which
9376 # mean something different from empty, change them to that default.
9377 # Certain fields just haven't been empty so far in any Unicode
9378 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
9379 # $CATEGORY. This leaves just the two fields, and so we hard-code in
c1739a4a 9380 # the defaults; which are very unlikely to ever change.
99870f4d
KW
9381 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
9382 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
9383
9384 # UAX44 says that if title is empty, it is the same as whatever upper
9385 # is,
9386 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
9387
9388 # There are a few pairs of lines like:
9389 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
9390 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
9391 # that define ranges. These should be processed after the fields are
9392 # adjusted above, as they may override some of them; but mostly what
28093d0e 9393 # is left is to possibly adjust the $CHARNAME field. The names of all the
99870f4d
KW
9394 # paired lines start with a '<', but this is also true of '<control>,
9395 # which isn't one of these special ones.
28093d0e 9396 if ($fields[$CHARNAME] eq '<control>') {
99870f4d
KW
9397
9398 # Some code points in this file have the pseudo-name
9399 # '<control>', but the official name for such ones is the null
28093d0e 9400 # string. For charnames.pm, we use the Unicode version 1 name
99870f4d 9401 $fields[$NAME] = "";
28093d0e 9402 $fields[$CHARNAME] = $fields[$UNICODE_1_NAME];
99870f4d
KW
9403
9404 # We had better not be in between range lines.
9405 if ($in_range) {
28093d0e 9406 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9407 $in_range = 0;
9408 }
9409 }
28093d0e 9410 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
99870f4d
KW
9411
9412 # Here is a non-range line. We had better not be in between range
9413 # lines.
9414 if ($in_range) {
28093d0e 9415 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9416 $in_range = 0;
9417 }
edb80b88
KW
9418 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
9419
9420 # These are code points whose names end in their code points,
9421 # which means the names are algorithmically derivable from the
9422 # code points. To shorten the output Name file, the algorithm
9423 # for deriving these is placed in the file instead of each
9424 # code point, so they have map type $CP_IN_NAME
9425 $fields[$CHARNAME] = $CMD_DELIM
9426 . $MAP_TYPE_CMD
9427 . '='
9428 . $CP_IN_NAME
9429 . $CMD_DELIM
9430 . $fields[$CHARNAME];
9431 }
28093d0e 9432 $fields[$NAME] = $fields[$CHARNAME];
99870f4d 9433 }
28093d0e
KW
9434 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
9435 $fields[$CHARNAME] = $fields[$NAME] = $1;
99870f4d
KW
9436
9437 # Here we are at the beginning of a range pair.
9438 if ($in_range) {
28093d0e 9439 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
99870f4d
KW
9440 }
9441 $in_range = 1;
9442
9443 # Because the properties in the range do not overwrite any already
9444 # in the db, we must flush the buffers of what's already there, so
9445 # they get handled in the normal scheme.
9446 $force_output = 1;
9447
9448 }
28093d0e
KW
9449 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
9450 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
99870f4d
KW
9451 $_ = "";
9452 return;
9453 }
9454 else { # Here, we are at the last line of a range pair.
9455
9456 if (! $in_range) {
28093d0e 9457 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
99870f4d
KW
9458 $_ = "";
9459 return;
9460 }
9461 $in_range = 0;
9462
28093d0e
KW
9463 $fields[$NAME] = $fields[$CHARNAME];
9464
99870f4d
KW
9465 # Check that the input is valid: that the closing of the range is
9466 # the same as the beginning.
9467 foreach my $i (0 .. $last_field) {
9468 next if $fields[$i] eq $previous_fields[$i];
9469 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
9470 }
9471
9472 # The processing differs depending on the type of range,
28093d0e
KW
9473 # determined by its $CHARNAME
9474 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
99870f4d
KW
9475
9476 # Check that the data looks right.
9477 if ($decimal_previous_cp != $SBase) {
9478 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
9479 }
9480 if ($decimal_cp != $SBase + $SCount - 1) {
9481 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
9482 }
9483
9484 # The Hangul syllable range has a somewhat complicated name
9485 # generation algorithm. Each code point in it has a canonical
9486 # decomposition also computable by an algorithm. The
9487 # perl decomposition map table built from these is used only
9488 # by normalize.pm, which has the algorithm built in it, so the
9489 # decomposition maps are not needed, and are large, so are
9490 # omitted from it. If the full decomposition map table is to
9491 # be output, the decompositions are generated for it, in the
9492 # EOF handling code for this input file.
9493
9494 $previous_fields[$DECOMP_TYPE] = 'Canonical';
9495
9496 # This range is stored in our internal structure with its
9497 # own map type, different from all others.
28093d0e
KW
9498 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9499 = $CMD_DELIM
99870f4d
KW
9500 . $MAP_TYPE_CMD
9501 . '='
9502 . $HANGUL_SYLLABLE
9503 . $CMD_DELIM
28093d0e 9504 . $fields[$CHARNAME];
99870f4d 9505 }
28093d0e 9506 elsif ($fields[$CHARNAME] =~ /^CJK/) {
99870f4d
KW
9507
9508 # The name for these contains the code point itself, and all
9509 # are defined to have the same base name, regardless of what
9510 # is in the file. They are stored in our internal structure
9511 # with a map type of $CP_IN_NAME
28093d0e
KW
9512 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
9513 = $CMD_DELIM
99870f4d
KW
9514 . $MAP_TYPE_CMD
9515 . '='
9516 . $CP_IN_NAME
9517 . $CMD_DELIM
9518 . 'CJK UNIFIED IDEOGRAPH';
9519
9520 }
9521 elsif ($fields[$CATEGORY] eq 'Co'
9522 || $fields[$CATEGORY] eq 'Cs')
9523 {
9524 # The names of all the code points in these ranges are set to
9525 # null, as there are no names for the private use and
9526 # surrogate code points.
9527
28093d0e 9528 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
99870f4d
KW
9529 }
9530 else {
28093d0e 9531 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
99870f4d
KW
9532 }
9533
9534 # The first line of the range caused everything else to be output,
9535 # and then its values were stored as the beginning values for the
9536 # next set of ranges, which this one ends. Now, for each value,
9537 # add a command to tell the handler that these values should not
9538 # replace any existing ones in our database.
9539 foreach my $i (0 .. $last_field) {
9540 $previous_fields[$i] = $CMD_DELIM
9541 . $REPLACE_CMD
9542 . '='
9543 . $NO
9544 . $CMD_DELIM
9545 . $previous_fields[$i];
9546 }
9547
9548 # And change things so it looks like the entire range has been
9549 # gone through with this being the final part of it. Adding the
9550 # command above to each field will cause this range to be flushed
9551 # during the next iteration, as it guaranteed that the stored
9552 # field won't match whatever value the next one has.
9553 $previous_cp = $cp;
9554 $decimal_previous_cp = $decimal_cp;
9555
9556 # We are now set up for the next iteration; so skip the remaining
9557 # code in this subroutine that does the same thing, but doesn't
9558 # know about these ranges.
9559 $_ = "";
c1739a4a 9560
99870f4d
KW
9561 return;
9562 }
9563
9564 # On the very first line, we fake it so the code below thinks there is
9565 # nothing to output, and initialize so that when it does get output it
9566 # uses the first line's values for the lowest part of the range.
9567 # (One could avoid this by using peek(), but then one would need to
9568 # know the adjustments done above and do the same ones in the setup
9569 # routine; not worth it)
9570 if ($first_time) {
9571 $first_time = 0;
9572 @previous_fields = @fields;
9573 @start = ($cp) x scalar @fields;
9574 $decimal_previous_cp = $decimal_cp - 1;
9575 }
9576
9577 # For each field, output the stored up ranges that this code point
9578 # doesn't fit in. Earlier we figured out if all ranges should be
9579 # terminated because of changing the replace or map type styles, or if
9580 # there is a gap between this new code point and the previous one, and
9581 # that is stored in $force_output. But even if those aren't true, we
9582 # need to output the range if this new code point's value for the
9583 # given property doesn't match the stored range's.
9584 #local $to_trace = 1 if main::DEBUG;
9585 foreach my $i (0 .. $last_field) {
9586 my $field = $fields[$i];
9587 if ($force_output || $field ne $previous_fields[$i]) {
9588
9589 # Flush the buffer of stored values.
9590 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9591
9592 # Start a new range with this code point and its value
9593 $start[$i] = $cp;
9594 $previous_fields[$i] = $field;
9595 }
9596 }
9597
9598 # Set the values for the next time.
9599 $previous_cp = $cp;
9600 $decimal_previous_cp = $decimal_cp;
9601
9602 # The input line has generated whatever adjusted lines are needed, and
9603 # should not be looked at further.
9604 $_ = "";
9605 return;
9606 }
9607
9608 sub EOF_UnicodeData {
9609 # Called upon EOF to flush the buffers, and create the Hangul
9610 # decomposition mappings if needed.
9611
9612 my $file = shift;
9613 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9614
9615 # Flush the buffers.
9616 foreach my $i (1 .. $last_field) {
9617 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
9618 }
9619
9620 if (-e 'Jamo.txt') {
9621
9622 # The algorithm is published by Unicode, based on values in
9623 # Jamo.txt, (which should have been processed before this
9624 # subroutine), and the results left in %Jamo
9625 unless (%Jamo) {
9626 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
9627 return;
9628 }
9629
9630 # If the full decomposition map table is being output, insert
9631 # into it the Hangul syllable mappings. This is to avoid having
9632 # to publish a subroutine in it to compute them. (which would
9633 # essentially be this code.) This uses the algorithm published by
9634 # Unicode.
9635 if (property_ref('Decomposition_Mapping')->to_output_map) {
9636 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
9637 use integer;
9638 my $SIndex = $S - $SBase;
9639 my $L = $LBase + $SIndex / $NCount;
9640 my $V = $VBase + ($SIndex % $NCount) / $TCount;
9641 my $T = $TBase + $SIndex % $TCount;
9642
9643 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
9644 my $decomposition = sprintf("%04X %04X", $L, $V);
9645 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
9646 $file->insert_adjusted_lines(
9647 sprintf("%04X; Decomposition_Mapping; %s",
9648 $S,
9649 $decomposition));
9650 }
9651 }
9652 }
9653
9654 return;
9655 }
9656
9657 sub filter_v1_ucd {
9658 # Fix UCD lines in version 1. This is probably overkill, but this
9659 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
9660 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
9661 # removed. This program retains them
9662 # 2) didn't include ranges, which it should have, and which are now
9663 # added in @corrected_lines below. It was hand populated by
9664 # taking the data from Version 2, verified by analyzing
9665 # DAge.txt.
9666 # 3) There is a syntax error in the entry for U+09F8 which could
9667 # cause problems for utf8_heavy, and so is changed. It's
9668 # numeric value was simply a minus sign, without any number.
9669 # (Eventually Unicode changed the code point to non-numeric.)
9670 # 4) The decomposition types often don't match later versions
9671 # exactly, and the whole syntax of that field is different; so
9672 # the syntax is changed as well as the types to their later
9673 # terminology. Otherwise normalize.pm would be very unhappy
9674 # 5) Many ccc classes are different. These are left intact.
9675 # 6) U+FF10 - U+FF19 are missing their numeric values in all three
9676 # fields. These are unchanged because it doesn't really cause
9677 # problems for Perl.
9678 # 7) A number of code points, such as controls, don't have their
9679 # Unicode Version 1 Names in this file. These are unchanged.
9680
9681 my @corrected_lines = split /\n/, <<'END';
96824E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
96839FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9684E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
9685F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
9686F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
9687FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
9688END
9689
9690 my $file = shift;
9691 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9692
9693 #local $to_trace = 1 if main::DEBUG;
9694 trace $_ if main::DEBUG && $to_trace;
9695
9696 # -1 => retain trailing null fields
9697 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9698
9699 # At the first place that is wrong in the input, insert all the
9700 # corrections, replacing the wrong line.
9701 if ($code_point eq '4E00') {
9702 my @copy = @corrected_lines;
9703 $_ = shift @copy;
9704 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9705
9706 $file->insert_lines(@copy);
9707 }
9708
9709
9710 if ($fields[$NUMERIC] eq '-') {
9711 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
9712 }
9713
9714 if ($fields[$PERL_DECOMPOSITION] ne "") {
9715
9716 # Several entries have this change to superscript 2 or 3 in the
9717 # middle. Convert these to the modern version, which is to use
9718 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
9719 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
9720 # 'HHHH HHHH 00B3 HHHH'.
9721 # It turns out that all of these that don't have another
9722 # decomposition defined at the beginning of the line have the
9723 # <square> decomposition in later releases.
9724 if ($code_point ne '00B2' && $code_point ne '00B3') {
9725 if ($fields[$PERL_DECOMPOSITION]
9726 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
9727 {
9728 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
9729 $fields[$PERL_DECOMPOSITION] = '<square> '
9730 . $fields[$PERL_DECOMPOSITION];
9731 }
9732 }
9733 }
9734
9735 # If is like '<+circled> 0052 <-circled>', convert to
9736 # '<circled> 0052'
9737 $fields[$PERL_DECOMPOSITION] =~
9738 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/x;
9739
9740 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
9741 $fields[$PERL_DECOMPOSITION] =~
9742 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
9743 or $fields[$PERL_DECOMPOSITION] =~
9744 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
9745 or $fields[$PERL_DECOMPOSITION] =~
9746 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
9747 or $fields[$PERL_DECOMPOSITION] =~
9748 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
9749
9750 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
9751 $fields[$PERL_DECOMPOSITION] =~
9752 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
9753
9754 # Change names to modern form.
9755 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
9756 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
9757 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
9758 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
9759
9760 # One entry has weird braces
9761 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
9762 }
9763
9764 $_ = join ';', $code_point, @fields;
9765 trace $_ if main::DEBUG && $to_trace;
9766 return;
9767 }
9768
9769 sub filter_v2_1_5_ucd {
9770 # A dozen entries in this 2.1.5 file had the mirrored and numeric
9771 # columns swapped; These all had mirrored be 'N'. So if the numeric
9772 # column appears to be N, swap it back.
9773
9774 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
9775 if ($fields[$NUMERIC] eq 'N') {
9776 $fields[$NUMERIC] = $fields[$MIRRORED];
9777 $fields[$MIRRORED] = 'N';
9778 $_ = join ';', $code_point, @fields;
9779 }
9780 return;
9781 }
9782} # End closure for UnicodeData
9783
37e2e78e
KW
9784sub process_GCB_test {
9785
9786 my $file = shift;
9787 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9788
9789 while ($file->next_line) {
9790 push @backslash_X_tests, $_;
9791 }
678f13d5 9792
37e2e78e
KW
9793 return;
9794}
9795
99870f4d
KW
9796sub process_NamedSequences {
9797 # NamedSequences.txt entries are just added to an array. Because these
9798 # don't look like the other tables, they have their own handler.
9799 # An example:
9800 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
9801 #
9802 # This just adds the sequence to an array for later handling
9803
99870f4d
KW
9804 my $file = shift;
9805 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9806
9807 while ($file->next_line) {
9808 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
9809 if (@remainder) {
9810 $file->carp_bad_line(
9811 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
9812 next;
9813 }
fb121860
KW
9814
9815 # Note single \t in keeping with special output format of
9816 # Perl_charnames. But it turns out that the code points don't have to
9817 # be 5 digits long, like the rest, based on the internal workings of
9818 # charnames.pm. This could be easily changed for consistency.
9819 push @named_sequences, "$sequence\t$name";
99870f4d
KW
9820 }
9821 return;
9822}
9823
9824{ # Closure
9825
9826 my $first_range;
9827
9828 sub filter_early_ea_lb {
9829 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
9830 # third field be the name of the code point, which can be ignored in
9831 # most cases. But it can be meaningful if it marks a range:
9832 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
9833 # 3400;W;<CJK Ideograph Extension A, First>
9834 #
9835 # We need to see the First in the example above to know it's a range.
9836 # They did not use the later range syntaxes. This routine changes it
9837 # to use the modern syntax.
9838 # $1 is the Input_file object.
9839
9840 my @fields = split /\s*;\s*/;
9841 if ($fields[2] =~ /^<.*, First>/) {
9842 $first_range = $fields[0];
9843 $_ = "";
9844 }
9845 elsif ($fields[2] =~ /^<.*, Last>/) {
9846 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
9847 }
9848 else {
9849 undef $first_range;
9850 $_ = "$fields[0]; $fields[1]";
9851 }
9852
9853 return;
9854 }
9855}
9856
9857sub filter_old_style_arabic_shaping {
9858 # Early versions used a different term for the later one.
9859
9860 my @fields = split /\s*;\s*/;
9861 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
9862 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
9863 $_ = join ';', @fields;
9864 return;
9865}
9866
9867sub filter_arabic_shaping_line {
9868 # ArabicShaping.txt has entries that look like:
9869 # 062A; TEH; D; BEH
9870 # The field containing 'TEH' is not used. The next field is Joining_Type
9871 # and the last is Joining_Group
9872 # This generates two lines to pass on, one for each property on the input
9873 # line.
9874
9875 my $file = shift;
9876 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9877
9878 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9879
9880 if (@fields > 4) {
9881 $file->carp_bad_line('Extra fields');
9882 $_ = "";
9883 return;
9884 }
9885
9886 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
9887 $_ = "$fields[0]; Joining_Type; $fields[2]";
9888
9889 return;
9890}
9891
9892sub setup_special_casing {
9893 # SpecialCasing.txt contains the non-simple case change mappings. The
959ce5bf
KW
9894 # simple ones are in UnicodeData.txt, which should already have been read
9895 # in to the full property data structures, so as to initialize these with
9896 # the simple ones. Then the SpecialCasing.txt entries overwrite the ones
9897 # which have different full mappings.
9898
9899 # This routine sees if the simple mappings are to be output, and if so,
9900 # copies what has already been put into the full mapping tables, while
9901 # they still contain only the simple mappings.
9902
9903 # The reason it is done this way is that the simple mappings are probably
9904 # not going to be output, so it saves work to initialize the full tables
9905 # with the simple mappings, and then overwrite those relatively few
9906 # entries in them that have different full mappings, and thus skip the
9907 # simple mapping tables altogether.
99870f4d
KW
9908
9909 my $file= shift;
9910 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9911
9912 # For each of the case change mappings...
9913 foreach my $case ('lc', 'tc', 'uc') {
959ce5bf
KW
9914 my $full = property_ref($case);
9915 unless (defined $full && ! $full->is_empty) {
9916 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
9917 }
99870f4d
KW
9918
9919 # The simple version's name in each mapping merely has an 's' in front
9920 # of the full one's
9921 my $simple = property_ref('s' . $case);
d7078fb7 9922 $simple->initialize($full) if $simple->to_output_map();
99870f4d
KW
9923 }
9924
9925 return;
9926}
9927
9928sub filter_special_casing_line {
9929 # Change the format of $_ from SpecialCasing.txt into something that the
9930 # generic handler understands. Each input line contains three case
9931 # mappings. This will generate three lines to pass to the generic handler
9932 # for each of those.
9933
9934 # The input syntax (after stripping comments and trailing white space is
9935 # like one of the following (with the final two being entries that we
9936 # ignore):
9937 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
9938 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
9939 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
9940 # Note the trailing semi-colon, unlike many of the input files. That
9941 # means that there will be an extra null field generated by the split
9942
9943 my $file = shift;
9944 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9945
9946 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9947
9948 # field #4 is when this mapping is conditional. If any of these get
9949 # implemented, it would be by hard-coding in the casing functions in the
9950 # Perl core, not through tables. But if there is a new condition we don't
9951 # know about, output a warning. We know about all the conditions through
9952 # 5.2
9953 if ($fields[4] ne "") {
9954 my @conditions = split ' ', $fields[4];
9955 if ($conditions[0] ne 'tr' # We know that these languages have
9956 # conditions, and some are multiple
9957 && $conditions[0] ne 'az'
9958 && $conditions[0] ne 'lt'
9959
9960 # And, we know about a single condition Final_Sigma, but
9961 # nothing else.
9962 && ($v_version gt v5.2.0
9963 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
9964 {
9965 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
9966 }
9967 elsif ($conditions[0] ne 'Final_Sigma') {
9968
9969 # Don't print out a message for Final_Sigma, because we have
9970 # hard-coded handling for it. (But the standard could change
9971 # what the rule should be, but it wouldn't show up here
9972 # anyway.
9973
9974 print "# SKIPPING Special Casing: $_\n"
9975 if $verbosity >= $VERBOSE;
9976 }
9977 $_ = "";
9978 return;
9979 }
9980 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
9981 $file->carp_bad_line('Extra fields');
9982 $_ = "";
9983 return;
9984 }
9985
9986 $_ = "$fields[0]; lc; $fields[1]";
9987 $file->insert_adjusted_lines("$fields[0]; tc; $fields[2]");
9988 $file->insert_adjusted_lines("$fields[0]; uc; $fields[3]");
9989
9990 return;
9991}
9992
9993sub filter_old_style_case_folding {
9994 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
f86864ac 9995 # and later style. Different letters were used in the earlier.
99870f4d
KW
9996
9997 my $file = shift;
9998 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9999
10000 my @fields = split /\s*;\s*/;
10001 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
10002 $fields[1] = 'I';
10003 }
10004 elsif ($fields[1] eq 'L') {
10005 $fields[1] = 'C'; # L => C always
10006 }
10007 elsif ($fields[1] eq 'E') {
10008 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
10009 $fields[1] = 'F'
10010 }
10011 else {
10012 $fields[1] = 'C'
10013 }
10014 }
10015 else {
10016 $file->carp_bad_line("Expecting L or E in second field");
10017 $_ = "";
10018 return;
10019 }
10020 $_ = join("; ", @fields) . ';';
10021 return;
10022}
10023
10024{ # Closure for case folding
10025
10026 # Create the map for simple only if are going to output it, for otherwise
10027 # it takes no part in anything we do.
10028 my $to_output_simple;
10029
ebda5909 10030 # XXX
99870f4d
KW
10031 # These are experimental, perhaps will need these to pass to regcomp.c to
10032 # handle the cases where for example the Kelvin sign character folds to k,
10033 # and in regcomp, we need to know which of the characters can have a
10034 # non-latin1 char fold to it, so it doesn't do the optimizations it might
10035 # otherwise.
10036 my @latin1_singly_folded;
10037 my @latin1_folded;
10038
10039 sub setup_case_folding($) {
10040 # Read in the case foldings in CaseFolding.txt. This handles both
10041 # simple and full case folding.
10042
10043 $to_output_simple
10044 = property_ref('Simple_Case_Folding')->to_output_map;
10045
10046 return;
10047 }
10048
10049 sub filter_case_folding_line {
10050 # Called for each line in CaseFolding.txt
10051 # Input lines look like:
10052 # 0041; C; 0061; # LATIN CAPITAL LETTER A
10053 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
10054 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
10055 #
10056 # 'C' means that folding is the same for both simple and full
10057 # 'F' that it is only for full folding
10058 # 'S' that it is only for simple folding
10059 # 'T' is locale-dependent, and ignored
10060 # 'I' is a type of 'F' used in some early releases.
10061 # Note the trailing semi-colon, unlike many of the input files. That
10062 # means that there will be an extra null field generated by the split
10063 # below, which we ignore and hence is not an error.
10064
10065 my $file = shift;
10066 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10067
10068 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
10069 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
10070 $file->carp_bad_line('Extra fields');
10071 $_ = "";
10072 return;
10073 }
10074
10075 if ($type eq 'T') { # Skip Turkic case folding, is locale dependent
10076 $_ = "";
10077 return;
10078 }
10079
10080 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
10081 # I are all full foldings
10082 if ($type eq 'C' || $type eq 'F' || $type eq 'I') {
10083 $_ = "$range; Case_Folding; $map";
10084 }
10085 else {
10086 $_ = "";
10087 if ($type ne 'S') {
10088 $file->carp_bad_line('Expecting C F I S or T in second field');
10089 return;
10090 }
10091 }
10092
10093 # C and S are simple foldings, but simple case folding is not needed
10094 # unless we explicitly want its map table output.
10095 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
10096 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
10097 }
10098
ebda5909 10099 # XXX Experimental, see comment above
99870f4d
KW
10100 if ($type ne 'S' && hex($range) >= 256) { # assumes range is 1 point
10101 my @folded = split ' ', $map;
10102 if (hex $folded[0] < 256 && @folded == 1) {
10103 push @latin1_singly_folded, hex $folded[0];
10104 }
10105 foreach my $folded (@folded) {
10106 push @latin1_folded, hex $folded if hex $folded < 256;
10107 }
10108 }
10109
10110 return;
10111 }
10112
10113 sub post_fold {
ebda5909 10114 # XXX Experimental, see comment above
99870f4d
KW
10115 return;
10116
10117 #local $to_trace = 1 if main::DEBUG;
10118 @latin1_singly_folded = uniques(@latin1_singly_folded);
10119 @latin1_folded = uniques(@latin1_folded);
10120 trace "latin1 single folded:", map { chr $_ } sort { $a <=> $b } @latin1_singly_folded if main::DEBUG && $to_trace;
10121 trace "latin1 folded:", map { chr $_ } sort { $a <=> $b } @latin1_folded if main::DEBUG && $to_trace;
10122 return;
10123 }
10124} # End case fold closure
10125
10126sub filter_jamo_line {
10127 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
10128 # from this file that is used in generating the Name property for Jamo
10129 # code points. But, it also is used to convert early versions' syntax
10130 # into the modern form. Here are two examples:
10131 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
10132 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
10133 #
10134 # The input is $_, the output is $_ filtered.
10135
10136 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10137
10138 # Let the caller handle unexpected input. In earlier versions, there was
10139 # a third field which is supposed to be a comment, but did not have a '#'
10140 # before it.
10141 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
10142
10143 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
10144 # beginning.
10145
10146 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
10147 $fields[1] = 'R' if $fields[0] eq '1105';
10148
10149 # Add to structure so can generate Names from it.
10150 my $cp = hex $fields[0];
10151 my $short_name = $fields[1];
10152 $Jamo{$cp} = $short_name;
10153 if ($cp <= $LBase + $LCount) {
10154 $Jamo_L{$short_name} = $cp - $LBase;
10155 }
10156 elsif ($cp <= $VBase + $VCount) {
10157 $Jamo_V{$short_name} = $cp - $VBase;
10158 }
10159 elsif ($cp <= $TBase + $TCount) {
10160 $Jamo_T{$short_name} = $cp - $TBase;
10161 }
10162 else {
10163 Carp::my_carp_bug("Unexpected Jamo code point in $_");
10164 }
10165
10166
10167 # Reassemble using just the first two fields to look like a typical
10168 # property file line
10169 $_ = "$fields[0]; $fields[1]";
10170
10171 return;
10172}
10173
99870f4d
KW
10174sub register_fraction($) {
10175 # This registers the input rational number so that it can be passed on to
10176 # utf8_heavy.pl, both in rational and floating forms.
10177
10178 my $rational = shift;
10179
10180 my $float = eval $rational;
10181 $nv_floating_to_rational{$float} = $rational;
10182 return;
10183}
10184
10185sub filter_numeric_value_line {
10186 # DNumValues contains lines of a different syntax than the typical
10187 # property file:
10188 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
10189 #
10190 # This routine transforms $_ containing the anomalous syntax to the
10191 # typical, by filtering out the extra columns, and convert early version
10192 # decimal numbers to strings that look like rational numbers.
10193
10194 my $file = shift;
10195 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10196
10197 # Starting in 5.1, there is a rational field. Just use that, omitting the
10198 # extra columns. Otherwise convert the decimal number in the second field
10199 # to a rational, and omit extraneous columns.
10200 my @fields = split /\s*;\s*/, $_, -1;
10201 my $rational;
10202
10203 if ($v_version ge v5.1.0) {
10204 if (@fields != 4) {
10205 $file->carp_bad_line('Not 4 semi-colon separated fields');
10206 $_ = "";
10207 return;
10208 }
10209 $rational = $fields[3];
10210 $_ = join '; ', @fields[ 0, 3 ];
10211 }
10212 else {
10213
10214 # Here, is an older Unicode file, which has decimal numbers instead of
10215 # rationals in it. Use the fraction to calculate the denominator and
10216 # convert to rational.
10217
10218 if (@fields != 2 && @fields != 3) {
10219 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
10220 $_ = "";
10221 return;
10222 }
10223
10224 my $codepoints = $fields[0];
10225 my $decimal = $fields[1];
10226 if ($decimal =~ s/\.0+$//) {
10227
10228 # Anything ending with a decimal followed by nothing but 0's is an
10229 # integer
10230 $_ = "$codepoints; $decimal";
10231 $rational = $decimal;
10232 }
10233 else {
10234
10235 my $denominator;
10236 if ($decimal =~ /\.50*$/) {
10237 $denominator = 2;
10238 }
10239
10240 # Here have the hardcoded repeating decimals in the fraction, and
10241 # the denominator they imply. There were only a few denominators
10242 # in the older Unicode versions of this file which this code
10243 # handles, so it is easy to convert them.
10244
10245 # The 4 is because of a round-off error in the Unicode 3.2 files
10246 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
10247 $denominator = 3;
10248 }
10249 elsif ($decimal =~ /\.[27]50*$/) {
10250 $denominator = 4;
10251 }
10252 elsif ($decimal =~ /\.[2468]0*$/) {
10253 $denominator = 5;
10254 }
10255 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
10256 $denominator = 6;
10257 }
10258 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
10259 $denominator = 8;
10260 }
10261 if ($denominator) {
10262 my $sign = ($decimal < 0) ? "-" : "";
10263 my $numerator = int((abs($decimal) * $denominator) + .5);
10264 $rational = "$sign$numerator/$denominator";
10265 $_ = "$codepoints; $rational";
10266 }
10267 else {
10268 $file->carp_bad_line("Can't cope with number '$decimal'.");
10269 $_ = "";
10270 return;
10271 }
10272 }
10273 }
10274
10275 register_fraction($rational) if $rational =~ qr{/};
10276 return;
10277}
10278
10279{ # Closure
10280 my %unihan_properties;
10281 my $iicore;
10282
10283
10284 sub setup_unihan {
10285 # Do any special setup for Unihan properties.
10286
10287 # This property gives the wrong computed type, so override.
10288 my $usource = property_ref('kIRG_USource');
10289 $usource->set_type($STRING) if defined $usource;
10290
10291 # This property is to be considered binary, so change all the values
10292 # to Y.
10293 $iicore = property_ref('kIICore');
10294 if (defined $iicore) {
10295 $iicore->add_match_table('Y') if ! defined $iicore->table('Y');
10296
10297 # We have to change the default map, because the @missing line is
10298 # misleading, given that we are treating it as binary.
10299 $iicore->set_default_map('N');
10300 $iicore->set_type($BINARY);
10301 }
10302
10303 return;
10304 }
10305
10306 sub filter_unihan_line {
10307 # Change unihan db lines to look like the others in the db. Here is
10308 # an input sample:
10309 # U+341C kCangjie IEKN
10310
10311 # Tabs are used instead of semi-colons to separate fields; therefore
10312 # they may have semi-colons embedded in them. Change these to periods
10313 # so won't screw up the rest of the code.
10314 s/;/./g;
10315
10316 # Remove lines that don't look like ones we accept.
10317 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
10318 $_ = "";
10319 return;
10320 }
10321
10322 # Extract the property, and save a reference to its object.
10323 my $property = $1;
10324 if (! exists $unihan_properties{$property}) {
10325 $unihan_properties{$property} = property_ref($property);
10326 }
10327
10328 # Don't do anything unless the property is one we're handling, which
10329 # we determine by seeing if there is an object defined for it or not
10330 if (! defined $unihan_properties{$property}) {
10331 $_ = "";
10332 return;
10333 }
10334
10335 # The iicore property is supposed to be a boolean, so convert to our
10336 # standard boolean form.
10337 if (defined $iicore && $unihan_properties{$property} == $iicore) {
10338 $_ =~ s/$property.*/$property\tY/
10339 }
10340
10341 # Convert the tab separators to our standard semi-colons, and convert
10342 # the U+HHHH notation to the rest of the standard's HHHH
10343 s/\t/;/g;
10344 s/\b U \+ (?= $code_point_re )//xg;
10345
10346 #local $to_trace = 1 if main::DEBUG;
10347 trace $_ if main::DEBUG && $to_trace;
10348
10349 return;
10350 }
10351}
10352
10353sub filter_blocks_lines {
10354 # In the Blocks.txt file, the names of the blocks don't quite match the
10355 # names given in PropertyValueAliases.txt, so this changes them so they
10356 # do match: Blanks and hyphens are changed into underscores. Also makes
10357 # early release versions look like later ones
10358 #
10359 # $_ is transformed to the correct value.
10360
10361 my $file = shift;
10362 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10363
10364 if ($v_version lt v3.2.0) {
10365 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
10366 $_ = "";
10367 return;
10368 }
10369
10370 # Old versions used a different syntax to mark the range.
10371 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
10372 }
10373
10374 my @fields = split /\s*;\s*/, $_, -1;
10375 if (@fields != 2) {
10376 $file->carp_bad_line("Expecting exactly two fields");
10377 $_ = "";
10378 return;
10379 }
10380
10381 # Change hyphens and blanks in the block name field only
10382 $fields[1] =~ s/[ -]/_/g;
10383 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
10384
10385 $_ = join("; ", @fields);
10386 return;
10387}
10388
10389{ # Closure
10390 my $current_property;
10391
10392 sub filter_old_style_proplist {
10393 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
10394 # was in a completely different syntax. Ken Whistler of Unicode says
10395 # that it was something he used as an aid for his own purposes, but
10396 # was never an official part of the standard. However, comments in
10397 # DAge.txt indicate that non-character code points were available in
10398 # the UCD as of 3.1. It is unclear to me (khw) how they could be
10399 # there except through this file (but on the other hand, they first
10400 # appeared there in 3.0.1), so maybe it was part of the UCD, and maybe
10401 # not. But the claim is that it was published as an aid to others who
10402 # might want some more information than was given in the official UCD
10403 # of the time. Many of the properties in it were incorporated into
10404 # the later PropList.txt, but some were not. This program uses this
10405 # early file to generate property tables that are otherwise not
10406 # accessible in the early UCD's, and most were probably not really
10407 # official at that time, so one could argue that it should be ignored,
10408 # and you can easily modify things to skip this. And there are bugs
10409 # in this file in various versions. (For example, the 2.1.9 version
10410 # removes from Alphabetic the CJK range starting at 4E00, and they
10411 # weren't added back in until 3.1.0.) Many of this file's properties
10412 # were later sanctioned, so this code generates tables for those
10413 # properties that aren't otherwise in the UCD of the time but
10414 # eventually did become official, and throws away the rest. Here is a
10415 # list of all the ones that are thrown away:
10416 # Bidi=* duplicates UnicodeData.txt
10417 # Combining never made into official property;
10418 # is \P{ccc=0}
10419 # Composite never made into official property.
10420 # Currency Symbol duplicates UnicodeData.txt: gc=sc
10421 # Decimal Digit duplicates UnicodeData.txt: gc=nd
10422 # Delimiter never made into official property;
10423 # removed in 3.0.1
10424 # Format Control never made into official property;
10425 # similar to gc=cf
10426 # High Surrogate duplicates Blocks.txt
10427 # Ignorable Control never made into official property;
10428 # similar to di=y
10429 # ISO Control duplicates UnicodeData.txt: gc=cc
10430 # Left of Pair never made into official property;
10431 # Line Separator duplicates UnicodeData.txt: gc=zl
10432 # Low Surrogate duplicates Blocks.txt
10433 # Non-break was actually listed as a property
10434 # in 3.2, but without any code
10435 # points. Unicode denies that this
10436 # was ever an official property
10437 # Non-spacing duplicate UnicodeData.txt: gc=mn
10438 # Numeric duplicates UnicodeData.txt: gc=cc
10439 # Paired Punctuation never made into official property;
10440 # appears to be gc=ps + gc=pe
10441 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
10442 # Private Use duplicates UnicodeData.txt: gc=co
10443 # Private Use High Surrogate duplicates Blocks.txt
10444 # Punctuation duplicates UnicodeData.txt: gc=p
10445 # Space different definition than eventual
10446 # one.
10447 # Titlecase duplicates UnicodeData.txt: gc=lt
10448 # Unassigned Code Value duplicates UnicodeData.txt: gc=cc
10449 # Zero-width never made into offical property;
10450 # subset of gc=cf
10451 # Most of the properties have the same names in this file as in later
10452 # versions, but a couple do not.
10453 #
10454 # This subroutine filters $_, converting it from the old style into
10455 # the new style. Here's a sample of the old-style
10456 #
10457 # *******************************************
10458 #
10459 # Property dump for: 0x100000A0 (Join Control)
10460 #
10461 # 200C..200D (2 chars)
10462 #
10463 # In the example, the property is "Join Control". It is kept in this
10464 # closure between calls to the subroutine. The numbers beginning with
10465 # 0x were internal to Ken's program that generated this file.
10466
10467 # If this line contains the property name, extract it.
10468 if (/^Property dump for: [^(]*\((.*)\)/) {
10469 $_ = $1;
10470
10471 # Convert white space to underscores.
10472 s/ /_/g;
10473
10474 # Convert the few properties that don't have the same name as
10475 # their modern counterparts
10476 s/Identifier_Part/ID_Continue/
10477 or s/Not_a_Character/NChar/;
10478
10479 # If the name matches an existing property, use it.
10480 if (defined property_ref($_)) {
10481 trace "new property=", $_ if main::DEBUG && $to_trace;
10482 $current_property = $_;
10483 }
10484 else { # Otherwise discard it
10485 trace "rejected property=", $_ if main::DEBUG && $to_trace;
10486 undef $current_property;
10487 }
10488 $_ = ""; # The property is saved for the next lines of the
10489 # file, but this defining line is of no further use,
10490 # so clear it so that the caller won't process it
10491 # further.
10492 }
10493 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
10494
10495 # Here, the input line isn't a header defining a property for the
10496 # following section, and either we aren't in such a section, or
10497 # the line doesn't look like one that defines the code points in
10498 # such a section. Ignore this line.
10499 $_ = "";
10500 }
10501 else {
10502
10503 # Here, we have a line defining the code points for the current
10504 # stashed property. Anything starting with the first blank is
10505 # extraneous. Otherwise, it should look like a normal range to
10506 # the caller. Append the property name so that it looks just like
10507 # a modern PropList entry.
10508
10509 $_ =~ s/\s.*//;
10510 $_ .= "; $current_property";
10511 }
10512 trace $_ if main::DEBUG && $to_trace;
10513 return;
10514 }
10515} # End closure for old style proplist
10516
10517sub filter_old_style_normalization_lines {
10518 # For early releases of Unicode, the lines were like:
10519 # 74..2A76 ; NFKD_NO
10520 # For later releases this became:
10521 # 74..2A76 ; NFKD_QC; N
10522 # Filter $_ to look like those in later releases.
10523 # Similarly for MAYBEs
10524
10525 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
10526
10527 # Also, the property FC_NFKC was abbreviated to FNC
10528 s/FNC/FC_NFKC/;
10529 return;
10530}
10531
10532sub finish_Unicode() {
10533 # This routine should be called after all the Unicode files have been read
10534 # in. It:
10535 # 1) Adds the mappings for code points missing from the files which have
10536 # defaults specified for them.
10537 # 2) At this this point all mappings are known, so it computes the type of
10538 # each property whose type hasn't been determined yet.
10539 # 3) Calculates all the regular expression match tables based on the
10540 # mappings.
10541 # 3) Calculates and adds the tables which are defined by Unicode, but
10542 # which aren't derived by them
10543
10544 # For each property, fill in any missing mappings, and calculate the re
10545 # match tables. If a property has more than one missing mapping, the
10546 # default is a reference to a data structure, and requires data from other
10547 # properties to resolve. The sort is used to cause these to be processed
10548 # last, after all the other properties have been calculated.
10549 # (Fortunately, the missing properties so far don't depend on each other.)
10550 foreach my $property
10551 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
10552 property_ref('*'))
10553 {
10554 # $perl has been defined, but isn't one of the Unicode properties that
10555 # need to be finished up.
10556 next if $property == $perl;
10557
10558 # Handle the properties that have more than one possible default
10559 if (ref $property->default_map) {
10560 my $default_map = $property->default_map;
10561
10562 # These properties have stored in the default_map:
10563 # One or more of:
10564 # 1) A default map which applies to all code points in a
10565 # certain class
10566 # 2) an expression which will evaluate to the list of code
10567 # points in that class
10568 # And
10569 # 3) the default map which applies to every other missing code
10570 # point.
10571 #
10572 # Go through each list.
10573 while (my ($default, $eval) = $default_map->get_next_defaults) {
10574
10575 # Get the class list, and intersect it with all the so-far
10576 # unspecified code points yielding all the code points
10577 # in the class that haven't been specified.
10578 my $list = eval $eval;
10579 if ($@) {
10580 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
10581 last;
10582 }
10583
10584 # Narrow down the list to just those code points we don't have
10585 # maps for yet.
10586 $list = $list & $property->inverse_list;
10587
10588 # Add mappings to the property for each code point in the list
10589 foreach my $range ($list->ranges) {
10590 $property->add_map($range->start, $range->end, $default);
10591 }
10592 }
10593
10594 # All remaining code points have the other mapping. Set that up
10595 # so the normal single-default mapping code will work on them
10596 $property->set_default_map($default_map->other_default);
10597
10598 # And fall through to do that
10599 }
10600
10601 # We should have enough data now to compute the type of the property.
10602 $property->compute_type;
10603 my $property_type = $property->type;
10604
10605 next if ! $property->to_create_match_tables;
10606
10607 # Here want to create match tables for this property
10608
10609 # The Unicode db always (so far, and they claim into the future) have
10610 # the default for missing entries in binary properties be 'N' (unless
10611 # there is a '@missing' line that specifies otherwise)
10612 if ($property_type == $BINARY && ! defined $property->default_map) {
10613 $property->set_default_map('N');
10614 }
10615
10616 # Add any remaining code points to the mapping, using the default for
10617 # missing code points
10618 if (defined (my $default_map = $property->default_map)) {
10619 foreach my $range ($property->inverse_list->ranges) {
10620 $property->add_map($range->start, $range->end, $default_map);
10621 }
10622
10623 # Make sure there is a match table for the default
10624 if (! defined $property->table($default_map)) {
10625 $property->add_match_table($default_map);
10626 }
10627 }
10628
10629 # Have all we need to populate the match tables.
10630 my $property_name = $property->name;
10631 foreach my $range ($property->ranges) {
10632 my $map = $range->value;
10633 my $table = property_ref($property_name)->table($map);
10634 if (! defined $table) {
10635
10636 # Integral and rational property values are not necessarily
10637 # defined in PropValueAliases, but all other ones should be,
10638 # starting in 5.1
10639 if ($v_version ge v5.1.0
10640 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
10641 {
10642 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
10643 }
10644 $table = property_ref($property_name)->add_match_table($map);
10645 }
10646
10647 $table->add_range($range->start, $range->end);
10648 }
10649
10650 # And add the Is_ prefix synonyms for Perl 5.6 compatibility, in which
10651 # all properties have this optional prefix. These do not get a
10652 # separate entry in the pod file, because are covered by a wild-card
10653 # entry
10654 foreach my $alias ($property->aliases) {
10655 my $Is_name = 'Is_' . $alias->name;
10656 if (! defined (my $pre_existing = property_ref($Is_name))) {
10657 $property->add_alias($Is_name,
10658 Pod_Entry => 0,
10659 Status => $alias->status,
10660 Externally_Ok => 0);
10661 }
10662 else {
10663
10664 # It seemed too much work to add in these warnings when it
10665 # appears that Unicode has made a decision never to begin a
10666 # property name with 'Is_', so this shouldn't happen, but just
10667 # in case, it is a warning.
10668 Carp::my_carp(<<END
10669There is already an alias named $Is_name (from " . $pre_existing . "), so not
10670creating this alias for $property. The generated table and pod files do not
10671warn users of this conflict.
10672END
10673 );
10674 $has_Is_conflicts++;
10675 }
10676 } # End of loop through aliases for this property
10677 } # End of loop through all Unicode properties.
10678
10679 # Fill in the mappings that Unicode doesn't completely furnish. First the
10680 # single letter major general categories. If Unicode were to start
10681 # delivering the values, this would be redundant, but better that than to
10682 # try to figure out if should skip and not get it right. Ths could happen
10683 # if a new major category were to be introduced, and the hard-coded test
10684 # wouldn't know about it.
10685 # This routine depends on the standard names for the general categories
10686 # being what it thinks they are, like 'Cn'. The major categories are the
10687 # union of all the general category tables which have the same first
10688 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
10689 foreach my $minor_table ($gc->tables) {
10690 my $minor_name = $minor_table->name;
10691 next if length $minor_name == 1;
10692 if (length $minor_name != 2) {
10693 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
10694 next;
10695 }
10696
10697 my $major_name = uc(substr($minor_name, 0, 1));
10698 my $major_table = $gc->table($major_name);
10699 $major_table += $minor_table;
10700 }
10701
10702 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
10703 # defines it as LC)
10704 my $LC = $gc->table('LC');
10705 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
10706 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
10707
10708
10709 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
10710 # deliver the correct values in it
10711 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
10712
10713 # Lt not in release 1.
10714 $LC += $gc->table('Lt') if defined $gc->table('Lt');
10715 }
10716 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
10717
10718 my $Cs = $gc->table('Cs');
10719 if (defined $Cs) {
10720 $Cs->add_note('Mostly not usable in Perl.');
10721 $Cs->add_comment(join_lines(<<END
10722Surrogates are used exclusively for I/O in UTF-16, and should not appear in
10723Unicode text, and hence their use will generate (usually fatal) messages
10724END
10725 ));
10726 }
10727
10728
10729 # Folding information was introduced later into Unicode data. To get
10730 # Perl's case ignore (/i) to work at all in releases that don't have
10731 # folding, use the best available alternative, which is lower casing.
10732 my $fold = property_ref('Simple_Case_Folding');
10733 if ($fold->is_empty) {
10734 $fold->initialize(property_ref('Simple_Lowercase_Mapping'));
10735 $fold->add_note(join_lines(<<END
10736WARNING: This table uses lower case as a substitute for missing fold
10737information
10738END
10739 ));
10740 }
10741
10742 # Multiple-character mapping was introduced later into Unicode data. If
10743 # missing, use the single-characters maps as best available alternative
10744 foreach my $map (qw { Uppercase_Mapping
10745 Lowercase_Mapping
10746 Titlecase_Mapping
10747 Case_Folding
10748 } ) {
10749 my $full = property_ref($map);
10750 if ($full->is_empty) {
10751 my $simple = property_ref('Simple_' . $map);
10752 $full->initialize($simple);
10753 $full->add_comment($simple->comment) if ($simple->comment);
10754 $full->add_note(join_lines(<<END
10755WARNING: This table uses simple mapping (single-character only) as a
10756substitute for missing multiple-character information
10757END
10758 ));
10759 }
10760 }
10761 return
10762}
10763
10764sub compile_perl() {
10765 # Create perl-defined tables. Almost all are part of the pseudo-property
10766 # named 'perl' internally to this program. Many of these are recommended
10767 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
10768 # on those found there.
10769 # Almost all of these are equivalent to some Unicode property.
10770 # A number of these properties have equivalents restricted to the ASCII
10771 # range, with their names prefaced by 'Posix', to signify that these match
10772 # what the Posix standard says they should match. A couple are
10773 # effectively this, but the name doesn't have 'Posix' in it because there
10774 # just isn't any Posix equivalent.
10775
10776 # 'Any' is all code points. As an error check, instead of just setting it
10777 # to be that, construct it to be the union of all the major categories
10778 my $Any = $perl->add_match_table('Any',
10779 Description => "[\\x{0000}-\\x{$LAST_UNICODE_CODEPOINT_STRING}]",
10780 Matches_All => 1);
10781
10782 foreach my $major_table ($gc->tables) {
10783
10784 # Major categories are the ones with single letter names.
10785 next if length($major_table->name) != 1;
10786
10787 $Any += $major_table;
10788 }
10789
10790 if ($Any->max != $LAST_UNICODE_CODEPOINT) {
10791 Carp::my_carp_bug("Generated highest code point ("
10792 . sprintf("%X", $Any->max)
10793 . ") doesn't match expected value $LAST_UNICODE_CODEPOINT_STRING.")
10794 }
10795 if ($Any->range_count != 1 || $Any->min != 0) {
10796 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
10797 }
10798
10799 $Any->add_alias('All');
10800
10801 # Assigned is the opposite of gc=unassigned
10802 my $Assigned = $perl->add_match_table('Assigned',
10803 Description => "All assigned code points",
10804 Initialize => ~ $gc->table('Unassigned'),
10805 );
10806
10807 # Our internal-only property should be treated as more than just a
10808 # synonym.
10809 $perl->add_match_table('_CombAbove')
10810 ->set_equivalent_to(property_ref('ccc')->table('Above'),
10811 Related => 1);
10812
10813 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
10814 if (defined $block) { # This is equivalent to the block if have it.
10815 my $Unicode_ASCII = $block->table('Basic_Latin');
10816 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
10817 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
10818 }
10819 }
10820
10821 # Very early releases didn't have blocks, so initialize ASCII ourselves if
10822 # necessary
10823 if ($ASCII->is_empty) {
10824 $ASCII->initialize([ 0..127 ]);
10825 }
10826
99870f4d
KW
10827 # Get the best available case definitions. Early Unicode versions didn't
10828 # have Uppercase and Lowercase defined, so use the general category
10829 # instead for them.
10830 my $Lower = $perl->add_match_table('Lower');
10831 my $Unicode_Lower = property_ref('Lowercase');
10832 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
10833 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
10834 }
10835 else {
10836 $Lower->set_equivalent_to($gc->table('Lowercase_Letter'),
10837 Related => 1);
10838 }
ad5e8af1
KW
10839 $perl->add_match_table("PosixLower",
10840 Description => "[a-z]",
10841 Initialize => $Lower & $ASCII,
10842 );
99870f4d
KW
10843
10844 my $Upper = $perl->add_match_table('Upper');
10845 my $Unicode_Upper = property_ref('Uppercase');
10846 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
10847 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
10848 }
10849 else {
10850 $Upper->set_equivalent_to($gc->table('Uppercase_Letter'),
10851 Related => 1);
10852 }
ad5e8af1
KW
10853 $perl->add_match_table("PosixUpper",
10854 Description => "[A-Z]",
10855 Initialize => $Upper & $ASCII,
10856 );
99870f4d
KW
10857
10858 # Earliest releases didn't have title case. Initialize it to empty if not
10859 # otherwise present
10860 my $Title = $perl->add_match_table('Title');
10861 my $lt = $gc->table('Lt');
10862 if (defined $lt) {
10863 $Title->set_equivalent_to($lt, Related => 1);
10864 }
10865
10866 # If this Unicode version doesn't have Cased, set up our own. From
10867 # Unicode 5.1: Definition D120: A character C is defined to be cased if
10868 # and only if C has the Lowercase or Uppercase property or has a
10869 # General_Category value of Titlecase_Letter.
10870 unless (defined property_ref('Cased')) {
10871 my $cased = $perl->add_match_table('Cased',
10872 Initialize => $Lower + $Upper + $Title,
10873 Description => 'Uppercase or Lowercase or Titlecase',
10874 );
10875 }
10876
10877 # Similarly, set up our own Case_Ignorable property if this Unicode
10878 # version doesn't have it. From Unicode 5.1: Definition D121: A character
10879 # C is defined to be case-ignorable if C has the value MidLetter or the
10880 # value MidNumLet for the Word_Break property or its General_Category is
10881 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
10882 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
10883
10884 # Perl has long had an internal-only alias for this property.
10885 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable');
10886 my $case_ignorable = property_ref('Case_Ignorable');
10887 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
10888 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
10889 Related => 1);
10890 }
10891 else {
10892
10893 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
10894
10895 # The following three properties are not in early releases
10896 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
10897 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
10898 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
10899
10900 # For versions 4.1 - 5.0, there is no MidNumLet property, and
10901 # correspondingly the case-ignorable definition lacks that one. For
10902 # 4.0, it appears that it was meant to be the same definition, but was
10903 # inadvertently omitted from the standard's text, so add it if the
10904 # property actually is there
10905 my $wb = property_ref('Word_Break');
10906 if (defined $wb) {
10907 my $midlet = $wb->table('MidLetter');
10908 $perl_case_ignorable += $midlet if defined $midlet;
10909 my $midnumlet = $wb->table('MidNumLet');
10910 $perl_case_ignorable += $midnumlet if defined $midnumlet;
10911 }
10912 else {
10913
10914 # In earlier versions of the standard, instead of the above two
10915 # properties , just the following characters were used:
10916 $perl_case_ignorable += 0x0027 # APOSTROPHE
10917 + 0x00AD # SOFT HYPHEN (SHY)
10918 + 0x2019; # RIGHT SINGLE QUOTATION MARK
10919 }
10920 }
10921
10922 # The remaining perl defined tables are mostly based on Unicode TR 18,
10923 # "Annex C: Compatibility Properties". All of these have two versions,
10924 # one whose name generally begins with Posix that is posix-compliant, and
10925 # one that matches Unicode characters beyond the Posix, ASCII range
10926
ad5e8af1 10927 my $Alpha = $perl->add_match_table('Alpha');
99870f4d
KW
10928
10929 # Alphabetic was not present in early releases
10930 my $Alphabetic = property_ref('Alphabetic');
10931 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
10932 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
10933 }
10934 else {
10935
10936 # For early releases, we don't get it exactly right. The below
10937 # includes more than it should, which in 5.2 terms is: L + Nl +
10938 # Other_Alphabetic. Other_Alphabetic contains many characters from
10939 # Mn and Mc. It's better to match more than we should, than less than
10940 # we should.
10941 $Alpha->initialize($gc->table('Letter')
10942 + $gc->table('Mn')
10943 + $gc->table('Mc'));
10944 $Alpha += $gc->table('Nl') if defined $gc->table('Nl');
ad5e8af1 10945 $Alpha->add_description('Alphabetic');
99870f4d 10946 }
ad5e8af1
KW
10947 $perl->add_match_table("PosixAlpha",
10948 Description => "[A-Za-z]",
10949 Initialize => $Alpha & $ASCII,
10950 );
99870f4d
KW
10951
10952 my $Alnum = $perl->add_match_table('Alnum',
ad5e8af1 10953 Description => 'Alphabetic and (Decimal) Numeric',
99870f4d
KW
10954 Initialize => $Alpha + $gc->table('Decimal_Number'),
10955 );
ad5e8af1
KW
10956 $perl->add_match_table("PosixAlnum",
10957 Description => "[A-Za-z0-9]",
10958 Initialize => $Alnum & $ASCII,
10959 );
99870f4d
KW
10960
10961 my $Word = $perl->add_match_table('Word',
10962 Description => '\w, including beyond ASCII',
10963 Initialize => $Alnum + $gc->table('Mark'),
10964 );
10965 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
10966 $Word += $Pc if defined $Pc;
10967
f38f76ae 10968 # This is a Perl extension, so the name doesn't begin with Posix.
99870f4d
KW
10969 $perl->add_match_table('PerlWord',
10970 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
10971 Initialize => $Word & $ASCII,
10972 );
10973
10974 my $Blank = $perl->add_match_table('Blank',
10975 Description => '\h, Horizontal white space',
10976
10977 # 200B is Zero Width Space which is for line
10978 # break control, and was listed as
10979 # Space_Separator in early releases
10980 Initialize => $gc->table('Space_Separator')
10981 + 0x0009 # TAB
10982 - 0x200B, # ZWSP
10983 );
10984 $Blank->add_alias('HorizSpace'); # Another name for it.
ad5e8af1
KW
10985 $perl->add_match_table("PosixBlank",
10986 Description => "\\t and ' '",
10987 Initialize => $Blank & $ASCII,
10988 );
99870f4d
KW
10989
10990 my $VertSpace = $perl->add_match_table('VertSpace',
10991 Description => '\v',
10992 Initialize => $gc->table('Line_Separator')
10993 + $gc->table('Paragraph_Separator')
10994 + 0x000A # LINE FEED
10995 + 0x000B # VERTICAL TAB
10996 + 0x000C # FORM FEED
10997 + 0x000D # CARRIAGE RETURN
10998 + 0x0085, # NEL
10999 );
11000 # No Posix equivalent for vertical space
11001
11002 my $Space = $perl->add_match_table('Space',
ad5e8af1
KW
11003 Description => '\s including beyond ASCII plus vertical tab',
11004 Initialize => $Blank + $VertSpace,
99870f4d 11005 );
ad5e8af1 11006 $perl->add_match_table("PosixSpace",
f38f76ae 11007 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
ad5e8af1
KW
11008 Initialize => $Space & $ASCII,
11009 );
99870f4d
KW
11010
11011 # Perl's traditional space doesn't include Vertical Tab
11012 my $SpacePerl = $perl->add_match_table('SpacePerl',
11013 Description => '\s, including beyond ASCII',
11014 Initialize => $Space - 0x000B,
11015 );
11016 $perl->add_match_table('PerlSpace',
11017 Description => '\s, restricted to ASCII',
11018 Initialize => $SpacePerl & $ASCII,
11019 );
11020
11021 my $Cntrl = $perl->add_match_table('Cntrl',
ad5e8af1 11022 Description => 'Control characters');
99870f4d 11023 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
ad5e8af1 11024 $perl->add_match_table("PosixCntrl",
f38f76ae 11025 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
ad5e8af1
KW
11026 Initialize => $Cntrl & $ASCII,
11027 );
99870f4d
KW
11028
11029 # $controls is a temporary used to construct Graph.
11030 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
11031 + $gc->table('Control'));
11032 # Cs not in release 1
11033 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
11034
11035 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
11036 my $Graph = $perl->add_match_table('Graph',
ad5e8af1 11037 Description => 'Characters that are graphical',
99870f4d
KW
11038 Initialize => ~ ($Space + $controls),
11039 );
ad5e8af1 11040 $perl->add_match_table("PosixGraph",
f38f76ae
KW
11041 Description =>
11042 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
ad5e8af1
KW
11043 Initialize => $Graph & $ASCII,
11044 );
99870f4d 11045
3e20195b 11046 $print = $perl->add_match_table('Print',
ad5e8af1 11047 Description => 'Characters that are graphical plus space characters (but no controls)',
ae5b72c8 11048 Initialize => $Blank + $Graph - $gc->table('Control'),
99870f4d 11049 );
ad5e8af1 11050 $perl->add_match_table("PosixPrint",
66fd7fd0 11051 Description =>
f38f76ae 11052 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
3e20195b 11053 Initialize => $print & $ASCII,
ad5e8af1 11054 );
99870f4d
KW
11055
11056 my $Punct = $perl->add_match_table('Punct');
11057 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
11058
11059 # \p{punct} doesn't include the symbols, which posix does
11060 $perl->add_match_table('PosixPunct',
f38f76ae 11061 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
ad5e8af1
KW
11062 Initialize => $ASCII & ($gc->table('Punctuation')
11063 + $gc->table('Symbol')),
11064 );
99870f4d
KW
11065
11066 my $Digit = $perl->add_match_table('Digit',
11067 Description => '\d, extended beyond just [0-9]');
11068 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
ad5e8af1
KW
11069 my $PosixDigit = $perl->add_match_table("PosixDigit",
11070 Description => '[0-9]',
11071 Initialize => $Digit & $ASCII,
11072 );
99870f4d 11073
eadadd41
KW
11074 # Hex_Digit was not present in first release
11075 my $Xdigit = $perl->add_match_table('XDigit');
11076 my $Hex = property_ref('Hex_Digit');
11077 if (defined $Hex && ! $Hex->is_empty) {
11078 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
99870f4d
KW
11079 }
11080 else {
eadadd41
KW
11081 # (Have to use hex instead of e.g. '0', because could be running on an
11082 # non-ASCII machine, and we want the Unicode (ASCII) values)
11083 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
11084 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
11085 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
99870f4d
KW
11086 }
11087
99870f4d
KW
11088 my $dt = property_ref('Decomposition_Type');
11089 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
11090 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
11091 Perl_Extension => 1,
d57ccc9a 11092 Note => 'Union of all non-canonical decompositions',
99870f4d
KW
11093 );
11094
11095 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
11096 # than SD appeared, construct it ourselves, based on the first release SD
11097 # was in.
11098 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ');
11099 my $soft_dotted = property_ref('Soft_Dotted');
11100 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
11101 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
11102 }
11103 else {
11104
11105 # This list came from 3.2 Soft_Dotted.
11106 $CanonDCIJ->initialize([ 0x0069,
11107 0x006A,
11108 0x012F,
11109 0x0268,
11110 0x0456,
11111 0x0458,
11112 0x1E2D,
11113 0x1ECB,
11114 ]);
11115 $CanonDCIJ = $CanonDCIJ & $Assigned;
11116 }
11117
f86864ac 11118 # These are used in Unicode's definition of \X
37e2e78e
KW
11119 my $begin = $perl->add_match_table('_X_Begin', Perl_Extension => 1);
11120 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1);
11121
99870f4d 11122 my $gcb = property_ref('Grapheme_Cluster_Break');
37e2e78e 11123
678f13d5 11124 # The 'extended' grapheme cluster came in 5.1. The non-extended
37e2e78e
KW
11125 # definition differs too much from the traditional Perl one to use.
11126 if (defined $gcb && defined $gcb->table('SpacingMark')) {
11127
11128 # Note that assumes HST is defined; it came in an earlier release than
11129 # GCB. In the line below, two negatives means: yes hangul
11130 $begin += ~ property_ref('Hangul_Syllable_Type')
11131 ->table('Not_Applicable')
11132 + ~ ($gcb->table('Control')
11133 + $gcb->table('CR')
11134 + $gcb->table('LF'));
11135 $begin->add_comment('For use in \X; matches: Hangul_Syllable | ! Control');
11136
11137 $extend += $gcb->table('Extend') + $gcb->table('SpacingMark');
11138 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
99870f4d
KW
11139 }
11140 else { # Old definition, used on early releases.
f86864ac 11141 $extend += $gc->table('Mark')
37e2e78e
KW
11142 + 0x200C # ZWNJ
11143 + 0x200D; # ZWJ
11144 $begin += ~ $extend;
11145
11146 # Here we may have a release that has the regular grapheme cluster
11147 # defined, or a release that doesn't have anything defined.
11148 # We set things up so the Perl core degrades gracefully, possibly with
11149 # placeholders that match nothing.
11150
11151 if (! defined $gcb) {
11152 $gcb = Property->new('GCB', Status => $PLACEHOLDER);
11153 }
11154 my $hst = property_ref('HST');
11155 if (!defined $hst) {
11156 $hst = Property->new('HST', Status => $PLACEHOLDER);
11157 $hst->add_match_table('Not_Applicable',
11158 Initialize => $Any,
11159 Matches_All => 1);
11160 }
11161
11162 # On some releases, here we may not have the needed tables for the
11163 # perl core, in some releases we may.
11164 foreach my $name (qw{ L LV LVT T V prepend }) {
11165 my $table = $gcb->table($name);
11166 if (! defined $table) {
11167 $table = $gcb->add_match_table($name);
11168 push @tables_that_may_be_empty, $table->complete_name;
11169 }
11170
11171 # The HST property predates the GCB one, and has identical tables
11172 # for some of them, so use it if we can.
11173 if ($table->is_empty
11174 && defined $hst
11175 && defined $hst->table($name))
11176 {
11177 $table += $hst->table($name);
11178 }
11179 }
11180 }
11181
11182 # More GCB. If we found some hangul syllables, populate a combined
11183 # table.
11184 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V');
11185 my $LV = $gcb->table('LV');
11186 if ($LV->is_empty) {
11187 push @tables_that_may_be_empty, $lv_lvt_v->complete_name;
11188 } else {
11189 $lv_lvt_v += $LV + $gcb->table('LVT') + $gcb->table('V');
11190 $lv_lvt_v->add_comment('For use in \X; matches: HST=LV | HST=LVT | HST=V');
99870f4d
KW
11191 }
11192
28093d0e 11193 # Was previously constructed to contain both Name and Unicode_1_Name
99870f4d
KW
11194 my @composition = ('Name', 'Unicode_1_Name');
11195
11196 if (@named_sequences) {
11197 push @composition, 'Named_Sequence';
11198 foreach my $sequence (@named_sequences) {
11199 $perl_charname->add_anomalous_entry($sequence);
11200 }
11201 }
11202
11203 my $alias_sentence = "";
11204 my $alias = property_ref('Name_Alias');
11205 if (defined $alias) {
11206 push @composition, 'Name_Alias';
11207 $alias->reset_each_range;
11208 while (my ($range) = $alias->each_range) {
11209 next if $range->value eq "";
11210 if ($range->start != $range->end) {
11211 Carp::my_carp("Expecting only one code point in the range $range. Just to keep going, using just the first code point;");
11212 }
11213 $perl_charname->add_duplicate($range->start, $range->value);
11214 }
11215 $alias_sentence = <<END;
11216The Name_Alias property adds duplicate code point entries with a corrected
11217name. The original (less correct, but still valid) name will be physically
11218first.
11219END
11220 }
11221 my $comment;
11222 if (@composition <= 2) { # Always at least 2
11223 $comment = join " and ", @composition;
11224 }
11225 else {
11226 $comment = join ", ", @composition[0 .. scalar @composition - 2];
11227 $comment .= ", and $composition[-1]";
11228 }
11229
99870f4d
KW
11230 $perl_charname->add_comment(join_lines( <<END
11231This file is for charnames.pm. It is the union of the $comment properties.
11232Unicode_1_Name entries are used only for otherwise nameless code
11233points.
11234$alias_sentence
11235END
11236 ));
11237
11238 # The combining class property used by Perl's normalize.pm is not located
11239 # in the normal mapping directory; create a copy for it.
11240 my $ccc = property_ref('Canonical_Combining_Class');
11241 my $perl_ccc = Property->new('Perl_ccc',
11242 Default_Map => $ccc->default_map,
11243 Full_Name => 'Perl_Canonical_Combining_Class',
11244 Internal_Only_Warning => 1,
11245 Perl_Extension => 1,
11246 Pod_Entry =>0,
11247 Type => $ENUM,
11248 Initialize => $ccc,
11249 File => 'CombiningClass',
517956bf 11250 Directory => File::Spec->curdir(),
99870f4d
KW
11251 );
11252 $perl_ccc->set_to_output_map(1);
11253 $perl_ccc->add_comment(join_lines(<<END
11254This mapping is for normalize.pm. It is currently identical to the Unicode
11255Canonical_Combining_Class property.
11256END
11257 ));
11258
11259 # This one match table for it is needed for calculations on output
11260 my $default = $perl_ccc->add_match_table($ccc->default_map,
11261 Initialize => $ccc->table($ccc->default_map),
11262 Status => $SUPPRESSED);
11263
11264 # Construct the Present_In property from the Age property.
11265 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
11266 my $default_map = $age->default_map;
11267 my $in = Property->new('In',
11268 Default_Map => $default_map,
11269 Full_Name => "Present_In",
11270 Internal_Only_Warning => 1,
11271 Perl_Extension => 1,
11272 Type => $ENUM,
11273 Initialize => $age,
11274 );
11275 $in->add_comment(join_lines(<<END
11276This file should not be used for any purpose. The values in this file are the
11277same as for $age, and not for what $in really means. This is because anything
11278defined in a given release should have multiple values: that release and all
11279higher ones. But only one value per code point can be represented in a table
11280like this.
11281END
11282 ));
11283
11284 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
11285 # lowest numbered (earliest) come first, with the non-numeric one
11286 # last.
11287 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
11288 ? 1
11289 : ($b->name !~ /^[\d.]*$/)
11290 ? -1
11291 : $a->name <=> $b->name
11292 } $age->tables;
11293
11294 # The Present_In property is the cumulative age properties. The first
11295 # one hence is identical to the first age one.
11296 my $previous_in = $in->add_match_table($first_age->name);
11297 $previous_in->set_equivalent_to($first_age, Related => 1);
11298
11299 my $description_start = "Code point's usage introduced in version ";
11300 $first_age->add_description($description_start . $first_age->name);
11301
11302 # To construct the accumlated values, for each of the age tables
11303 # starting with the 2nd earliest, merge the earliest with it, to get
11304 # all those code points existing in the 2nd earliest. Repeat merging
11305 # the new 2nd earliest with the 3rd earliest to get all those existing
11306 # in the 3rd earliest, and so on.
11307 foreach my $current_age (@rest_ages) {
11308 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
11309
11310 my $current_in = $in->add_match_table(
11311 $current_age->name,
11312 Initialize => $current_age + $previous_in,
11313 Description => $description_start
11314 . $current_age->name
11315 . ' or earlier',
11316 );
11317 $previous_in = $current_in;
11318
11319 # Add clarifying material for the corresponding age file. This is
11320 # in part because of the confusing and contradictory information
11321 # given in the Standard's documentation itself, as of 5.2.
11322 $current_age->add_description(
11323 "Code point's usage was introduced in version "
11324 . $current_age->name);
11325 $current_age->add_note("See also $in");
11326
11327 }
11328
11329 # And finally the code points whose usages have yet to be decided are
11330 # the same in both properties. Note that permanently unassigned code
11331 # points actually have their usage assigned (as being permanently
11332 # unassigned), so that these tables are not the same as gc=cn.
11333 my $unassigned = $in->add_match_table($default_map);
11334 my $age_default = $age->table($default_map);
11335 $age_default->add_description(<<END
11336Code point's usage has not been assigned in any Unicode release thus far.
11337END
11338 );
11339 $unassigned->set_equivalent_to($age_default, Related => 1);
11340 }
11341
11342
11343 # Finished creating all the perl properties. All non-internal non-string
11344 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
11345 # an underscore.) These do not get a separate entry in the pod file
11346 foreach my $table ($perl->tables) {
11347 foreach my $alias ($table->aliases) {
11348 next if $alias->name =~ /^_/;
11349 $table->add_alias('Is_' . $alias->name,
11350 Pod_Entry => 0,
11351 Status => $alias->status,
11352 Externally_Ok => 0);
11353 }
11354 }
11355
11356 return;
11357}
11358
11359sub add_perl_synonyms() {
11360 # A number of Unicode tables have Perl synonyms that are expressed in
11361 # the single-form, \p{name}. These are:
11362 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
11363 # \p{Is_Name} as synonyms
11364 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
11365 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
11366 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
11367 # conflict, \p{Value} and \p{Is_Value} as well
11368 #
11369 # This routine generates these synonyms, warning of any unexpected
11370 # conflicts.
11371
11372 # Construct the list of tables to get synonyms for. Start with all the
11373 # binary and the General_Category ones.
11374 my @tables = grep { $_->type == $BINARY } property_ref('*');
11375 push @tables, $gc->tables;
11376
11377 # If the version of Unicode includes the Script property, add its tables
11378 if (defined property_ref('Script')) {
11379 push @tables, property_ref('Script')->tables;
11380 }
11381
11382 # The Block tables are kept separate because they are treated differently.
11383 # And the earliest versions of Unicode didn't include them, so add only if
11384 # there are some.
11385 my @blocks;
11386 push @blocks, $block->tables if defined $block;
11387
11388 # Here, have the lists of tables constructed. Process blocks last so that
11389 # if there are name collisions with them, blocks have lowest priority.
11390 # Should there ever be other collisions, manual intervention would be
11391 # required. See the comments at the beginning of the program for a
11392 # possible way to handle those semi-automatically.
11393 foreach my $table (@tables, @blocks) {
11394
11395 # For non-binary properties, the synonym is just the name of the
11396 # table, like Greek, but for binary properties the synonym is the name
11397 # of the property, and means the code points in its 'Y' table.
11398 my $nominal = $table;
11399 my $nominal_property = $nominal->property;
11400 my $actual;
11401 if (! $nominal->isa('Property')) {
11402 $actual = $table;
11403 }
11404 else {
11405
11406 # Here is a binary property. Use the 'Y' table. Verify that is
11407 # there
11408 my $yes = $nominal->table('Y');
11409 unless (defined $yes) { # Must be defined, but is permissible to
11410 # be empty.
11411 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
11412 next;
11413 }
11414 $actual = $yes;
11415 }
11416
11417 foreach my $alias ($nominal->aliases) {
11418
11419 # Attempt to create a table in the perl directory for the
11420 # candidate table, using whatever aliases in it that don't
11421 # conflict. Also add non-conflicting aliases for all these
11422 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
11423 PREFIX:
11424 foreach my $prefix ("", 'Is_', 'In_') {
11425
11426 # Only Block properties can have added 'In_' aliases.
11427 next if $prefix eq 'In_' and $nominal_property != $block;
11428
11429 my $proposed_name = $prefix . $alias->name;
11430
11431 # No Is_Is, In_In, nor combinations thereof
11432 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
11433 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
11434
11435 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
11436
11437 # Get a reference to any existing table in the perl
11438 # directory with the desired name.
11439 my $pre_existing = $perl->table($proposed_name);
11440
11441 if (! defined $pre_existing) {
11442
11443 # No name collision, so ok to add the perl synonym.
11444
11445 my $make_pod_entry;
11446 my $externally_ok;
11447 my $status = $actual->status;
11448 if ($nominal_property == $block) {
11449
11450 # For block properties, the 'In' form is preferred for
11451 # external use; the pod file contains wild cards for
11452 # this and the 'Is' form so no entries for those; and
11453 # we don't want people using the name without the
11454 # 'In', so discourage that.
11455 if ($prefix eq "") {
11456 $make_pod_entry = 1;
11457 $status = $status || $DISCOURAGED;
11458 $externally_ok = 0;
11459 }
11460 elsif ($prefix eq 'In_') {
11461 $make_pod_entry = 0;
11462 $status = $status || $NORMAL;
11463 $externally_ok = 1;
11464 }
11465 else {
11466 $make_pod_entry = 0;
11467 $status = $status || $DISCOURAGED;
11468 $externally_ok = 0;
11469 }
11470 }
11471 elsif ($prefix ne "") {
11472
11473 # The 'Is' prefix is handled in the pod by a wild
11474 # card, and we won't use it for an external name
11475 $make_pod_entry = 0;
11476 $status = $status || $NORMAL;
11477 $externally_ok = 0;
11478 }
11479 else {
11480
11481 # Here, is an empty prefix, non block. This gets its
11482 # own pod entry and can be used for an external name.
11483 $make_pod_entry = 1;
11484 $status = $status || $NORMAL;
11485 $externally_ok = 1;
11486 }
11487
11488 # Here, there isn't a perl pre-existing table with the
11489 # name. Look through the list of equivalents of this
11490 # table to see if one is a perl table.
11491 foreach my $equivalent ($actual->leader->equivalents) {
11492 next if $equivalent->property != $perl;
11493
11494 # Here, have found a table for $perl. Add this alias
11495 # to it, and are done with this prefix.
11496 $equivalent->add_alias($proposed_name,
11497 Pod_Entry => $make_pod_entry,
11498 Status => $status,
11499 Externally_Ok => $externally_ok);
11500 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
11501 next PREFIX;
11502 }
11503
11504 # Here, $perl doesn't already have a table that is a
11505 # synonym for this property, add one.
11506 my $added_table = $perl->add_match_table($proposed_name,
11507 Pod_Entry => $make_pod_entry,
11508 Status => $status,
11509 Externally_Ok => $externally_ok);
11510 # And it will be related to the actual table, since it is
11511 # based on it.
11512 $added_table->set_equivalent_to($actual, Related => 1);
11513 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
11514 next;
11515 } # End of no pre-existing.
11516
11517 # Here, there is a pre-existing table that has the proposed
11518 # name. We could be in trouble, but not if this is just a
11519 # synonym for another table that we have already made a child
11520 # of the pre-existing one.
11521 if ($pre_existing->is_equivalent_to($actual)) {
11522 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
11523 $pre_existing->add_alias($proposed_name);
11524 next;
11525 }
11526
11527 # Here, there is a name collision, but it still could be ok if
11528 # the tables match the identical set of code points, in which
11529 # case, we can combine the names. Compare each table's code
11530 # point list to see if they are identical.
11531 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
11532 if ($pre_existing->matches_identically_to($actual)) {
11533
11534 # Here, they do match identically. Not a real conflict.
11535 # Make the perl version a child of the Unicode one, except
11536 # in the non-obvious case of where the perl name is
11537 # already a synonym of another Unicode property. (This is
11538 # excluded by the test for it being its own parent.) The
11539 # reason for this exclusion is that then the two Unicode
11540 # properties become related; and we don't really know if
11541 # they are or not. We generate documentation based on
11542 # relatedness, and this would be misleading. Code
11543 # later executed in the process will cause the tables to
11544 # be represented by a single file anyway, without making
11545 # it look in the pod like they are necessarily related.
11546 if ($pre_existing->parent == $pre_existing
11547 && ($pre_existing->property == $perl
11548 || $actual->property == $perl))
11549 {
11550 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
11551 $pre_existing->set_equivalent_to($actual, Related => 1);
11552 }
11553 elsif (main::DEBUG && $to_trace) {
11554 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
11555 trace $pre_existing->parent;
11556 }
11557 next PREFIX;
11558 }
11559
11560 # Here they didn't match identically, there is a real conflict
11561 # between our new name and a pre-existing property.
11562 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
11563 $pre_existing->add_conflicting($nominal->full_name,
11564 'p',
11565 $actual);
11566
11567 # Don't output a warning for aliases for the block
11568 # properties (unless they start with 'In_') as it is
11569 # expected that there will be conflicts and the block
11570 # form loses.
11571 if ($verbosity >= $NORMAL_VERBOSITY
11572 && ($actual->property != $block || $prefix eq 'In_'))
11573 {
11574 print simple_fold(join_lines(<<END
11575There is already an alias named $proposed_name (from " . $pre_existing . "),
11576so not creating this alias for " . $actual
11577END
11578 ), "", 4);
11579 }
11580
11581 # Keep track for documentation purposes.
11582 $has_In_conflicts++ if $prefix eq 'In_';
11583 $has_Is_conflicts++ if $prefix eq 'Is_';
11584 }
11585 }
11586 }
11587
11588 # There are some properties which have No and Yes (and N and Y) as
11589 # property values, but aren't binary, and could possibly be confused with
11590 # binary ones. So create caveats for them. There are tables that are
11591 # named 'No', and tables that are named 'N', but confusion is not likely
11592 # unless they are the same table. For example, N meaning Number or
11593 # Neutral is not likely to cause confusion, so don't add caveats to things
11594 # like them.
11595 foreach my $property (grep { $_->type != $BINARY } property_ref('*')) {
11596 my $yes = $property->table('Yes');
11597 if (defined $yes) {
11598 my $y = $property->table('Y');
11599 if (defined $y && $yes == $y) {
11600 foreach my $alias ($property->aliases) {
11601 $yes->add_conflicting($alias->name);
11602 }
11603 }
11604 }
11605 my $no = $property->table('No');
11606 if (defined $no) {
11607 my $n = $property->table('N');
11608 if (defined $n && $no == $n) {
11609 foreach my $alias ($property->aliases) {
11610 $no->add_conflicting($alias->name, 'P');
11611 }
11612 }
11613 }
11614 }
11615
11616 return;
11617}
11618
11619sub register_file_for_name($$$) {
11620 # Given info about a table and a datafile that it should be associated
11621 # with, register that assocation
11622
11623 my $table = shift;
11624 my $directory_ref = shift; # Array of the directory path for the file
11625 my $file = shift; # The file name in the final directory, [-1].
11626 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11627
11628 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
11629
11630 if ($table->isa('Property')) {
11631 $table->set_file_path(@$directory_ref, $file);
11632 push @map_properties, $table
11633 if $directory_ref->[0] eq $map_directory;
11634 return;
11635 }
11636
11637 # Do all of the work for all equivalent tables when called with the leader
11638 # table, so skip if isn't the leader.
11639 return if $table->leader != $table;
11640
11641 # Join all the file path components together, using slashes.
11642 my $full_filename = join('/', @$directory_ref, $file);
11643
11644 # All go in the same subdirectory of unicore
11645 if ($directory_ref->[0] ne $matches_directory) {
11646 Carp::my_carp("Unexpected directory in "
11647 . join('/', @{$directory_ref}, $file));
11648 }
11649
11650 # For this table and all its equivalents ...
11651 foreach my $table ($table, $table->equivalents) {
11652
11653 # Associate it with its file internally. Don't include the
11654 # $matches_directory first component
11655 $table->set_file_path(@$directory_ref, $file);
11656 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
11657
11658 my $property = $table->property;
11659 $property = ($property == $perl)
11660 ? "" # 'perl' is never explicitly stated
11661 : standardize($property->name) . '=';
11662
11663 my $deprecated = ($table->status eq $DEPRECATED)
11664 ? $table->status_info
11665 : "";
11666
11667 # And for each of the table's aliases... This inner loop eventually
11668 # goes through all aliases in the UCD that we generate regex match
11669 # files for
11670 foreach my $alias ($table->aliases) {
11671 my $name = $alias->name;
11672
11673 # Generate an entry in either the loose or strict hashes, which
11674 # will translate the property and alias names combination into the
11675 # file where the table for them is stored.
11676 my $standard;
11677 if ($alias->loose_match) {
11678 $standard = $property . standardize($alias->name);
11679 if (exists $loose_to_file_of{$standard}) {
11680 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
11681 }
11682 else {
11683 $loose_to_file_of{$standard} = $sub_filename;
11684 }
11685 }
11686 else {
11687 $standard = lc ($property . $name);
11688 if (exists $stricter_to_file_of{$standard}) {
11689 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
11690 }
11691 else {
11692 $stricter_to_file_of{$standard} = $sub_filename;
11693
11694 # Tightly coupled with how utf8_heavy.pl works, for a
11695 # floating point number that is a whole number, get rid of
11696 # the trailing decimal point and 0's, so that utf8_heavy
11697 # will work. Also note that this assumes that such a
11698 # number is matched strictly; so if that were to change,
11699 # this would be wrong.
11700 if ((my $integer_name = $name)
11701 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
11702 {
11703 $stricter_to_file_of{$property . $integer_name}
11704 = $sub_filename;
11705 }
11706 }
11707 }
11708
11709 # Keep a list of the deprecated properties and their filenames
11710 if ($deprecated) {
11711 $utf8::why_deprecated{$sub_filename} = $deprecated;
11712 }
11713 }
11714 }
11715
11716 return;
11717}
11718
11719{ # Closure
11720 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
11721 # conflicts
11722 my %full_dir_name_of; # Full length names of directories used.
11723
11724 sub construct_filename($$$) {
11725 # Return a file name for a table, based on the table name, but perhaps
11726 # changed to get rid of non-portable characters in it, and to make
11727 # sure that it is unique on a file system that allows the names before
11728 # any period to be at most 8 characters (DOS). While we're at it
11729 # check and complain if there are any directory conflicts.
11730
11731 my $name = shift; # The name to start with
11732 my $mutable = shift; # Boolean: can it be changed? If no, but
11733 # yet it must be to work properly, a warning
11734 # is given
11735 my $directories_ref = shift; # A reference to an array containing the
11736 # path to the file, with each element one path
11737 # component. This is used because the same
11738 # name can be used in different directories.
11739 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11740
11741 my $warn = ! defined wantarray; # If true, then if the name is
11742 # changed, a warning is issued as well.
11743
11744 if (! defined $name) {
11745 Carp::my_carp("Undefined name in directory "
11746 . File::Spec->join(@$directories_ref)
11747 . ". '_' used");
11748 return '_';
11749 }
11750
11751 # Make sure that no directory names conflict with each other. Look at
11752 # each directory in the input file's path. If it is already in use,
11753 # assume it is correct, and is merely being re-used, but if we
11754 # truncate it to 8 characters, and find that there are two directories
11755 # that are the same for the first 8 characters, but differ after that,
11756 # then that is a problem.
11757 foreach my $directory (@$directories_ref) {
11758 my $short_dir = substr($directory, 0, 8);
11759 if (defined $full_dir_name_of{$short_dir}) {
11760 next if $full_dir_name_of{$short_dir} eq $directory;
11761 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
11762 }
11763 else {
11764 $full_dir_name_of{$short_dir} = $directory;
11765 }
11766 }
11767
11768 my $path = join '/', @$directories_ref;
11769 $path .= '/' if $path;
11770
11771 # Remove interior underscores.
11772 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
11773
11774 # Change any non-word character into an underscore, and truncate to 8.
11775 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
11776 substr($filename, 8) = "" if length($filename) > 8;
11777
11778 # Make sure the basename doesn't conflict with something we
11779 # might have already written. If we have, say,
11780 # InGreekExtended1
11781 # InGreekExtended2
11782 # they become
11783 # InGreekE
11784 # InGreek2
11785 my $warned = 0;
11786 while (my $num = $base_names{$path}{lc $filename}++) {
11787 $num++; # so basenames with numbers start with '2', which
11788 # just looks more natural.
11789
11790 # Want to append $num, but if it'll make the basename longer
11791 # than 8 characters, pre-truncate $filename so that the result
11792 # is acceptable.
11793 my $delta = length($filename) + length($num) - 8;
11794 if ($delta > 0) {
11795 substr($filename, -$delta) = $num;
11796 }
11797 else {
11798 $filename .= $num;
11799 }
11800 if ($warn && ! $warned) {
11801 $warned = 1;
11802 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
11803 }
11804 }
11805
11806 return $filename if $mutable;
11807
11808 # If not changeable, must return the input name, but warn if needed to
11809 # change it beyond shortening it.
11810 if ($name ne $filename
11811 && substr($name, 0, length($filename)) ne $filename) {
11812 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
11813 }
11814 return $name;
11815 }
11816}
11817
11818# The pod file contains a very large table. Many of the lines in that table
11819# would exceed a typical output window's size, and so need to be wrapped with
11820# a hanging indent to make them look good. The pod language is really
11821# insufficient here. There is no general construct to do that in pod, so it
11822# is done here by beginning each such line with a space to cause the result to
11823# be output without formatting, and doing all the formatting here. This leads
11824# to the result that if the eventual display window is too narrow it won't
11825# look good, and if the window is too wide, no advantage is taken of that
11826# extra width. A further complication is that the output may be indented by
11827# the formatter so that there is less space than expected. What I (khw) have
11828# done is to assume that that indent is a particular number of spaces based on
11829# what it is in my Linux system; people can always resize their windows if
11830# necessary, but this is obviously less than desirable, but the best that can
11831# be expected.
11832my $automatic_pod_indent = 8;
11833
11834# Try to format so that uses fewest lines, but few long left column entries
11835# slide into the right column. An experiment on 5.1 data yielded the
11836# following percentages that didn't cut into the other side along with the
11837# associated first-column widths
11838# 69% = 24
11839# 80% not too bad except for a few blocks
11840# 90% = 33; # , cuts 353/3053 lines from 37 = 12%
11841# 95% = 37;
11842my $indent_info_column = 27; # 75% of lines didn't have overlap
11843
11844my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
11845 # The 3 is because of:
11846 # 1 for the leading space to tell the pod formatter to
11847 # output as-is
11848 # 1 for the flag
11849 # 1 for the space between the flag and the main data
11850
11851sub format_pod_line ($$$;$$) {
11852 # Take a pod line and return it, formatted properly
11853
11854 my $first_column_width = shift;
11855 my $entry = shift; # Contents of left column
11856 my $info = shift; # Contents of right column
11857
11858 my $status = shift || ""; # Any flag
11859
11860 my $loose_match = shift; # Boolean.
11861 $loose_match = 1 unless defined $loose_match;
11862
11863 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11864
11865 my $flags = "";
11866 $flags .= $STRICTER if ! $loose_match;
11867
11868 $flags .= $status if $status;
11869
11870 # There is a blank in the left column to cause the pod formatter to
11871 # output the line as-is.
11872 return sprintf " %-*s%-*s %s\n",
11873 # The first * in the format is replaced by this, the -1 is
11874 # to account for the leading blank. There isn't a
11875 # hard-coded blank after this to separate the flags from
11876 # the rest of the line, so that in the unlikely event that
11877 # multiple flags are shown on the same line, they both
11878 # will get displayed at the expense of that separation,
11879 # but since they are left justified, a blank will be
11880 # inserted in the normal case.
11881 $FILLER - 1,
11882 $flags,
11883
11884 # The other * in the format is replaced by this number to
11885 # cause the first main column to right fill with blanks.
11886 # The -1 is for the guaranteed blank following it.
11887 $first_column_width - $FILLER - 1,
11888 $entry,
11889 $info;
11890}
11891
11892my @zero_match_tables; # List of tables that have no matches in this release
11893
11894sub make_table_pod_entries($) {
11895 # This generates the entries for the pod file for a given table.
11896 # Also done at this time are any children tables. The output looks like:
11897 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
11898
11899 my $input_table = shift; # Table the entry is for
11900 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11901
11902 # Generate parent and all its children at the same time.
11903 return if $input_table->parent != $input_table;
11904
11905 my $property = $input_table->property;
11906 my $type = $property->type;
11907 my $full_name = $property->full_name;
11908
11909 my $count = $input_table->count;
11910 my $string_count = clarify_number($count);
11911 my $status = $input_table->status;
11912 my $status_info = $input_table->status_info;
11913
11914 my $entry_for_first_table; # The entry for the first table output.
11915 # Almost certainly, it is the parent.
11916
11917 # For each related table (including itself), we will generate a pod entry
11918 # for each name each table goes by
11919 foreach my $table ($input_table, $input_table->children) {
11920
11921 # utf8_heavy.pl cannot deal with null string property values, so don't
11922 # output any.
11923 next if $table->name eq "";
11924
11925 # First, gather all the info that applies to this table as a whole.
11926
11927 push @zero_match_tables, $table if $count == 0;
11928
11929 my $table_property = $table->property;
11930
11931 # The short name has all the underscores removed, while the full name
11932 # retains them. Later, we decide whether to output a short synonym
11933 # for the full one, we need to compare apples to apples, so we use the
11934 # short name's length including underscores.
11935 my $table_property_short_name_length;
11936 my $table_property_short_name
11937 = $table_property->short_name(\$table_property_short_name_length);
11938 my $table_property_full_name = $table_property->full_name;
11939
11940 # Get how much savings there is in the short name over the full one
11941 # (delta will always be <= 0)
11942 my $table_property_short_delta = $table_property_short_name_length
11943 - length($table_property_full_name);
11944 my @table_description = $table->description;
11945 my @table_note = $table->note;
11946
11947 # Generate an entry for each alias in this table.
11948 my $entry_for_first_alias; # saves the first one encountered.
11949 foreach my $alias ($table->aliases) {
11950
11951 # Skip if not to go in pod.
11952 next unless $alias->make_pod_entry;
11953
11954 # Start gathering all the components for the entry
11955 my $name = $alias->name;
11956
11957 my $entry; # Holds the left column, may include extras
11958 my $entry_ref; # To refer to the left column's contents from
11959 # another entry; has no extras
11960
11961 # First the left column of the pod entry. Tables for the $perl
11962 # property always use the single form.
11963 if ($table_property == $perl) {
11964 $entry = "\\p{$name}";
11965 $entry_ref = "\\p{$name}";
11966 }
11967 else { # Compound form.
11968
11969 # Only generate one entry for all the aliases that mean true
11970 # or false in binary properties. Append a '*' to indicate
11971 # some are missing. (The heading comment notes this.)
11972 my $wild_card_mark;
11973 if ($type == $BINARY) {
11974 next if $name ne 'N' && $name ne 'Y';
11975 $wild_card_mark = '*';
11976 }
11977 else {
11978 $wild_card_mark = "";
11979 }
11980
11981 # Colon-space is used to give a little more space to be easier
11982 # to read;
11983 $entry = "\\p{"
11984 . $table_property_full_name
11985 . ": $name$wild_card_mark}";
11986
11987 # But for the reference to this entry, which will go in the
11988 # right column, where space is at a premium, use equals
11989 # without a space
11990 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
11991 }
11992
11993 # Then the right (info) column. This is stored as components of
11994 # an array for the moment, then joined into a string later. For
11995 # non-internal only properties, begin the info with the entry for
11996 # the first table we encountered (if any), as things are ordered
11997 # so that that one is the most descriptive. This leads to the
11998 # info column of an entry being a more descriptive version of the
11999 # name column
12000 my @info;
12001 if ($name =~ /^_/) {
12002 push @info,
12003 '(For internal use by Perl, not necessarily stable)';
12004 }
12005 elsif ($entry_for_first_alias) {
12006 push @info, $entry_for_first_alias;
12007 }
12008
12009 # If this entry is equivalent to another, add that to the info,
12010 # using the first such table we encountered
12011 if ($entry_for_first_table) {
12012 if (@info) {
12013 push @info, "(= $entry_for_first_table)";
12014 }
12015 else {
12016 push @info, $entry_for_first_table;
12017 }
12018 }
12019
12020 # If the name is a large integer, add an equivalent with an
12021 # exponent for better readability
12022 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
12023 push @info, sprintf "(= %.1e)", $name
12024 }
12025
12026 my $parenthesized = "";
12027 if (! $entry_for_first_alias) {
12028
12029 # This is the first alias for the current table. The alias
12030 # array is ordered so that this is the fullest, most
12031 # descriptive alias, so it gets the fullest info. The other
12032 # aliases are mostly merely pointers to this one, using the
12033 # information already added above.
12034
12035 # Display any status message, but only on the parent table
12036 if ($status && ! $entry_for_first_table) {
12037 push @info, $status_info;
12038 }
12039
12040 # Put out any descriptive info
12041 if (@table_description || @table_note) {
12042 push @info, join "; ", @table_description, @table_note;
12043 }
12044
12045 # Look to see if there is a shorter name we can point people
12046 # at
12047 my $standard_name = standardize($name);
12048 my $short_name;
12049 my $proposed_short = $table->short_name;
12050 if (defined $proposed_short) {
12051 my $standard_short = standardize($proposed_short);
12052
12053 # If the short name is shorter than the standard one, or
12054 # even it it's not, but the combination of it and its
12055 # short property name (as in \p{prop=short} ($perl doesn't
12056 # have this form)) saves at least two characters, then,
12057 # cause it to be listed as a shorter synonym.
12058 if (length $standard_short < length $standard_name
12059 || ($table_property != $perl
12060 && (length($standard_short)
12061 - length($standard_name)
12062 + $table_property_short_delta) # (<= 0)
12063 < -2))
12064 {
12065 $short_name = $proposed_short;
12066 if ($table_property != $perl) {
12067 $short_name = $table_property_short_name
12068 . "=$short_name";
12069 }
12070 $short_name = "\\p{$short_name}";
12071 }
12072 }
12073
12074 # And if this is a compound form name, see if there is a
12075 # single form equivalent
12076 my $single_form;
12077 if ($table_property != $perl) {
12078
12079 # Special case the binary N tables, so that will print
12080 # \P{single}, but use the Y table values to populate
12081 # 'single', as we haven't populated the N table.
12082 my $test_table;
12083 my $p;
12084 if ($type == $BINARY
12085 && $input_table == $property->table('No'))
12086 {
12087 $test_table = $property->table('Yes');
12088 $p = 'P';
12089 }
12090 else {
12091 $test_table = $input_table;
12092 $p = 'p';
12093 }
12094
12095 # Look for a single form amongst all the children.
12096 foreach my $table ($test_table->children) {
12097 next if $table->property != $perl;
12098 my $proposed_name = $table->short_name;
12099 next if ! defined $proposed_name;
12100
12101 # Don't mention internal-only properties as a possible
12102 # single form synonym
12103 next if substr($proposed_name, 0, 1) eq '_';
12104
12105 $proposed_name = "\\$p\{$proposed_name}";
12106 if (! defined $single_form
12107 || length($proposed_name) < length $single_form)
12108 {
12109 $single_form = $proposed_name;
12110
12111 # The goal here is to find a single form; not the
12112 # shortest possible one. We've already found a
12113 # short name. So, stop at the first single form
12114 # found, which is likely to be closer to the
12115 # original.
12116 last;
12117 }
12118 }
12119 }
12120
12121 # Ouput both short and single in the same parenthesized
12122 # expression, but with only one of 'Single', 'Short' if there
12123 # are both items.
12124 if ($short_name || $single_form || $table->conflicting) {
12125 $parenthesized .= '(';
12126 $parenthesized .= "Short: $short_name" if $short_name;
12127 if ($short_name && $single_form) {
12128 $parenthesized .= ', ';
12129 }
12130 elsif ($single_form) {
12131 $parenthesized .= 'Single: ';
12132 }
12133 $parenthesized .= $single_form if $single_form;
12134 }
12135 }
12136
12137
12138 # Warn if this property isn't the same as one that a
12139 # semi-casual user might expect. The other components of this
12140 # parenthesized structure are calculated only for the first entry
12141 # for this table, but the conflicting is deemed important enough
12142 # to go on every entry.
12143 my $conflicting = join " NOR ", $table->conflicting;
12144 if ($conflicting) {
12145 $parenthesized .= '(' if ! $parenthesized;
12146 $parenthesized .= '; ' if $parenthesized ne '(';
12147 $parenthesized .= "NOT $conflicting";
12148 }
12149 $parenthesized .= ')' if $parenthesized;
12150
12151 push @info, $parenthesized if $parenthesized;
d57ccc9a
KW
12152
12153 if ($table_property != $perl && $table->perl_extension) {
12154 push @info, '(Perl extension)';
12155 }
99870f4d
KW
12156 push @info, "($string_count)" if $output_range_counts;
12157
12158 # Now, we have both the entry and info so add them to the
12159 # list of all the properties.
12160 push @match_properties,
12161 format_pod_line($indent_info_column,
12162 $entry,
12163 join( " ", @info),
12164 $alias->status,
12165 $alias->loose_match);
12166
12167 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
12168 } # End of looping through the aliases for this table.
12169
12170 if (! $entry_for_first_table) {
12171 $entry_for_first_table = $entry_for_first_alias;
12172 }
12173 } # End of looping through all the related tables
12174 return;
12175}
12176
12177sub pod_alphanumeric_sort {
12178 # Sort pod entries alphanumerically.
12179
99f78760
KW
12180 # The first few character columns are filler, plus the '\p{'; and get rid
12181 # of all the trailing stuff, starting with the trailing '}', so as to sort
12182 # on just 'Name=Value'
12183 (my $a = lc $a) =~ s/^ .*? { //x;
99870f4d 12184 $a =~ s/}.*//;
99f78760 12185 (my $b = lc $b) =~ s/^ .*? { //x;
99870f4d
KW
12186 $b =~ s/}.*//;
12187
99f78760
KW
12188 # Determine if the two operands are both internal only or both not.
12189 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
12190 # should be the underscore that begins internal only
12191 my $a_is_internal = (substr($a, 0, 1) eq '_');
12192 my $b_is_internal = (substr($b, 0, 1) eq '_');
12193
12194 # Sort so the internals come last in the table instead of first (which the
12195 # leading underscore would otherwise indicate).
12196 if ($a_is_internal != $b_is_internal) {
12197 return 1 if $a_is_internal;
12198 return -1
12199 }
12200
99870f4d 12201 # Determine if the two operands are numeric property values or not.
99f78760 12202 # A numeric property will look like xyz: 3. But the number
99870f4d 12203 # can begin with an optional minus sign, and may have a
99f78760 12204 # fraction or rational component, like xyz: 3/2. If either
99870f4d
KW
12205 # isn't numeric, use alphabetic sort.
12206 my ($a_initial, $a_number) =
99f78760 12207 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
12208 return $a cmp $b unless defined $a_number;
12209 my ($b_initial, $b_number) =
99f78760 12210 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
99870f4d
KW
12211 return $a cmp $b unless defined $b_number;
12212
12213 # Here they are both numeric, but use alphabetic sort if the
12214 # initial parts don't match
12215 return $a cmp $b if $a_initial ne $b_initial;
12216
12217 # Convert rationals to floating for the comparison.
12218 $a_number = eval $a_number if $a_number =~ qr{/};
12219 $b_number = eval $b_number if $b_number =~ qr{/};
12220
12221 return $a_number <=> $b_number;
12222}
12223
12224sub make_pod () {
12225 # Create the .pod file. This generates the various subsections and then
12226 # combines them in one big HERE document.
12227
12228 return unless defined $pod_directory;
12229 print "Making pod file\n" if $verbosity >= $PROGRESS;
12230
12231 my $exception_message =
12232 '(Any exceptions are individually noted beginning with the word NOT.)';
12233 my @block_warning;
12234 if (-e 'Blocks.txt') {
12235
12236 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
12237 # if the global $has_In_conflicts indicates we have them.
12238 push @match_properties, format_pod_line($indent_info_column,
12239 '\p{In_*}',
12240 '\p{Block: *}'
12241 . (($has_In_conflicts)
12242 ? " $exception_message"
12243 : ""));
12244 @block_warning = << "END";
12245
12246Matches in the Block property have shortcuts that begin with 'In_'. For
12247example, \\p{Block=Latin1} can be written as \\p{In_Latin1}. For backward
12248compatibility, if there is no conflict with another shortcut, these may also
12249be written as \\p{Latin1} or \\p{Is_Latin1}. But, N.B., there are numerous
12250such conflicting shortcuts. Use of these forms for Block is discouraged, and
12251are flagged as such, not only because of the potential confusion as to what is
12252meant, but also because a later release of Unicode may preempt the shortcut,
12253and your program would no longer be correct. Use the 'In_' form instead to
12254avoid this, or even more clearly, use the compound form, e.g.,
12255\\p{blk:latin1}. See L<perlunicode/"Blocks"> for more information about this.
12256END
12257 }
12258 my $text = "If an entry has flag(s) at its beginning, like '$DEPRECATED', the 'Is_' form has the same flag(s)";
12259 $text = "$exception_message $text" if $has_Is_conflicts;
12260
12261 # And the 'Is_ line';
12262 push @match_properties, format_pod_line($indent_info_column,
12263 '\p{Is_*}',
12264 "\\p{*} $text");
12265
12266 # Sort the properties array for output. It is sorted alphabetically
12267 # except numerically for numeric properties, and only output unique lines.
12268 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
12269
12270 my $formatted_properties = simple_fold(\@match_properties,
12271 "",
12272 # indent succeeding lines by two extra
12273 # which looks better
12274 $indent_info_column + 2,
12275
12276 # shorten the line length by how much
12277 # the formatter indents, so the folded
12278 # line will fit in the space
12279 # presumably available
12280 $automatic_pod_indent);
12281 # Add column headings, indented to be a little more centered, but not
12282 # exactly
12283 $formatted_properties = format_pod_line($indent_info_column,
12284 ' NAME',
12285 ' INFO')
12286 . "\n"
12287 . $formatted_properties;
12288
12289 # Generate pod documentation lines for the tables that match nothing
12290 my $zero_matches;
12291 if (@zero_match_tables) {
12292 @zero_match_tables = uniques(@zero_match_tables);
12293 $zero_matches = join "\n\n",
12294 map { $_ = '=item \p{' . $_->complete_name . "}" }
12295 sort { $a->complete_name cmp $b->complete_name }
12296 uniques(@zero_match_tables);
12297
12298 $zero_matches = <<END;
12299
12300=head2 Legal \\p{} and \\P{} constructs that match no characters
12301
12302Unicode has some property-value pairs that currently don't match anything.
12303This happens generally either because they are obsolete, or for symmetry with
12304other forms, but no language has yet been encoded that uses them. In this
12305version of Unicode, the following match zero code points:
12306
12307=over 4
12308
12309$zero_matches
12310
12311=back
12312
12313END
12314 }
12315
12316 # Generate list of properties that we don't accept, grouped by the reasons
12317 # why. This is so only put out the 'why' once, and then list all the
12318 # properties that have that reason under it.
12319
12320 my %why_list; # The keys are the reasons; the values are lists of
12321 # properties that have the key as their reason
12322
12323 # For each property, add it to the list that are suppressed for its reason
12324 # The sort will cause the alphabetically first properties to be added to
12325 # each list first, so each list will be sorted.
12326 foreach my $property (sort keys %why_suppressed) {
12327 push @{$why_list{$why_suppressed{$property}}}, $property;
12328 }
12329
12330 # For each reason (sorted by the first property that has that reason)...
12331 my @bad_re_properties;
12332 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
12333 keys %why_list)
12334 {
12335 # Add to the output, all the properties that have that reason. Start
12336 # with an empty line.
12337 push @bad_re_properties, "\n\n";
12338
12339 my $has_item = 0; # Flag if actually output anything.
12340 foreach my $name (@{$why_list{$why}}) {
12341
12342 # Split compound names into $property and $table components
12343 my $property = $name;
12344 my $table;
12345 if ($property =~ / (.*) = (.*) /x) {
12346 $property = $1;
12347 $table = $2;
12348 }
12349
12350 # This release of Unicode may not have a property that is
12351 # suppressed, so don't reference a non-existent one.
12352 $property = property_ref($property);
12353 next if ! defined $property;
12354
12355 # And since this list is only for match tables, don't list the
12356 # ones that don't have match tables.
12357 next if ! $property->to_create_match_tables;
12358
12359 # Find any abbreviation, and turn it into a compound name if this
12360 # is a property=value pair.
12361 my $short_name = $property->name;
12362 $short_name .= '=' . $property->table($table)->name if $table;
12363
12364 # And add the property as an item for the reason.
12365 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
12366 $has_item = 1;
12367 }
12368
12369 # And add the reason under the list of properties, if such a list
12370 # actually got generated. Note that the header got added
12371 # unconditionally before. But pod ignores extra blank lines, so no
12372 # harm.
12373 push @bad_re_properties, "\n$why\n" if $has_item;
12374
12375 } # End of looping through each reason.
12376
12377 # Generate a list of the properties whose map table we output, from the
12378 # global @map_properties.
12379 my @map_tables_actually_output;
12380 my $info_indent = 20; # Left column is narrower than \p{} table.
12381 foreach my $property (@map_properties) {
12382
12383 # Get the path to the file; don't output any not in the standard
12384 # directory.
12385 my @path = $property->file_path;
12386 next if $path[0] ne $map_directory;
12387 shift @path; # Remove the standard name
12388
12389 my $file = join '/', @path; # In case is in sub directory
12390 my $info = $property->full_name;
12391 my $short_name = $property->name;
12392 if ($info ne $short_name) {
12393 $info .= " ($short_name)";
12394 }
12395 foreach my $more_info ($property->description,
12396 $property->note,
12397 $property->status_info)
12398 {
12399 next unless $more_info;
12400 $info =~ s/\.\Z//;
12401 $info .= ". $more_info";
12402 }
12403 push @map_tables_actually_output, format_pod_line($info_indent,
12404 $file,
12405 $info,
12406 $property->status);
12407 }
12408
12409 # Sort alphabetically, and fold for output
12410 @map_tables_actually_output = sort
12411 pod_alphanumeric_sort @map_tables_actually_output;
12412 @map_tables_actually_output
12413 = simple_fold(\@map_tables_actually_output,
12414 ' ',
12415 $info_indent,
12416 $automatic_pod_indent);
12417
12418 # Generate a list of the formats that can appear in the map tables.
12419 my @map_table_formats;
12420 foreach my $format (sort keys %map_table_formats) {
12421 push @map_table_formats, " $format $map_table_formats{$format}\n";
12422 }
12423
12424 # Everything is ready to assemble.
12425 my @OUT = << "END";
12426=begin comment
12427
12428$HEADER
12429
12430To change this file, edit $0 instead.
12431
12432=end comment
12433
12434=head1 NAME
12435
51f494cc 12436$pod_file - Index of Unicode Version $string_version properties in Perl
99870f4d
KW
12437
12438=head1 DESCRIPTION
12439
12440There are many properties in Unicode, and Perl provides access to almost all of
12441them, as well as some additional extensions and short-cut synonyms.
12442
12443And just about all of the few that aren't accessible through the Perl
12444core are accessible through the modules: Unicode::Normalize and
12445Unicode::UCD, and for Unihan properties, via the CPAN module Unicode::Unihan.
12446
12447This document merely lists all available properties and does not attempt to
12448explain what each property really means. There is a brief description of each
12449Perl extension. There is some detail about Blocks, Scripts, General_Category,
12450and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
12451Unicode properties, refer to the Unicode standard. A good starting place is
12452L<$unicode_reference_url>. More information on the Perl extensions is in
12453L<perlrecharclass>.
12454
12455Note that you can define your own properties; see
12456L<perlunicode/"User-Defined Character Properties">.
12457
12458=head1 Properties accessible through \\p{} and \\P{}
12459
12460The Perl regular expression \\p{} and \\P{} constructs give access to most of
12461the Unicode character properties. The table below shows all these constructs,
12462both single and compound forms.
12463
12464B<Compound forms> consist of two components, separated by an equals sign or a
12465colon. The first component is the property name, and the second component is
12466the particular value of the property to match against, for example,
12467'\\p{Script: Greek}' or '\\p{Script=Greek}' both mean to match characters
12468whose Script property is Greek.
12469
12470B<Single forms>, like '\\p{Greek}', are mostly Perl-defined shortcuts for
12471their equivalent compound forms. The table shows these equivalences. (In our
12472example, '\\p{Greek}' is a just a shortcut for '\\p{Script=Greek}'.)
12473There are also a few Perl-defined single forms that are not shortcuts for a
12474compound form. One such is \\p{Word}. These are also listed in the table.
12475
12476In parsing these constructs, Perl always ignores Upper/lower case differences
12477everywhere within the {braces}. Thus '\\p{Greek}' means the same thing as
12478'\\p{greek}'. But note that changing the case of the 'p' or 'P' before the
12479left brace completely changes the meaning of the construct, from "match" (for
12480'\\p{}') to "doesn't match" (for '\\P{}'). Casing in this document is for
12481improved legibility.
12482
12483Also, white space, hyphens, and underscores are also normally ignored
12484everywhere between the {braces}, and hence can be freely added or removed
12485even if the C</x> modifier hasn't been specified on the regular expression.
12486But $a_bold_stricter at the beginning of an entry in the table below
12487means that tighter (stricter) rules are used for that entry:
12488
12489=over 4
12490
12491=item Single form (\\p{name}) tighter rules:
12492
12493White space, hyphens, and underscores ARE significant
12494except for:
12495
12496=over 4
12497
12498=item * white space adjacent to a non-word character
12499
12500=item * underscores separating digits in numbers
12501
12502=back
12503
12504That means, for example, that you can freely add or remove white space
12505adjacent to (but within) the braces without affecting the meaning.
12506
12507=item Compound form (\\p{name=value} or \\p{name:value}) tighter rules:
12508
12509The tighter rules given above for the single form apply to everything to the
12510right of the colon or equals; the looser rules still apply to everything to
12511the left.
12512
12513That means, for example, that you can freely add or remove white space
12514adjacent to (but within) the braces and the colon or equal sign.
12515
12516=back
12517
12518Some properties are considered obsolete, but still available. There are
12519several varieties of obsolesence:
12520
12521=over 4
12522
12523=item Obsolete
12524
12525Properties marked with $a_bold_obsolete in the table are considered
12526obsolete. At the time of this writing (Unicode version 5.2) there is no
12527information in the Unicode standard about the implications of a property being
12528obsolete.
12529
12530=item Stabilized
12531
12532Obsolete properties may be stabilized. This means that they are not actively
12533maintained by Unicode, and will not be extended as new characters are added to
12534the standard. Such properties are marked with $a_bold_stabilized in the
12535table. At the time of this writing (Unicode version 5.2) there is no further
12536information in the Unicode standard about the implications of a property being
12537stabilized.
12538
12539=item Deprecated
12540
12541Obsolete properties may be deprecated. This means that their use is strongly
12542discouraged, so much so that a warning will be issued if used, unless the
12543regular expression is in the scope of a C<S<no warnings 'deprecated'>>
12544statement. $A_bold_deprecated flags each such entry in the table, and
12545the entry there for the longest, most descriptive version of the property will
12546give the reason it is deprecated, and perhaps advice. Perl may issue such a
12547warning, even for properties that aren't officially deprecated by Unicode,
12548when there used to be characters or code points that were matched by them, but
12549no longer. This is to warn you that your program may not work like it did on
12550earlier Unicode releases.
12551
12552A deprecated property may be made unavailable in a future Perl version, so it
12553is best to move away from them.
12554
12555=back
12556
12557Some Perl extensions are present for backwards compatibility and are
12558discouraged from being used, but not obsolete. $A_bold_discouraged
12559flags each such entry in the table.
12560
12561@block_warning
12562
12563The table below has two columns. The left column contains the \\p{}
12564constructs to look up, possibly preceeded by the flags mentioned above; and
12565the right column contains information about them, like a description, or
12566synonyms. It shows both the single and compound forms for each property that
12567has them. If the left column is a short name for a property, the right column
12568will give its longer, more descriptive name; and if the left column is the
12569longest name, the right column will show any equivalent shortest name, in both
12570single and compound forms if applicable.
12571
12572The right column will also caution you if a property means something different
12573than what might normally be expected.
12574
d57ccc9a
KW
12575All single forms are Perl extensions; a few compound forms are as well, and
12576are noted as such.
12577
99870f4d
KW
12578Numbers in (parentheses) indicate the total number of code points matched by
12579the property. For emphasis, those properties that match no code points at all
12580are listed as well in a separate section following the table.
12581
12582There is no description given for most non-Perl defined properties (See
12583$unicode_reference_url for that).
d73e5302 12584
99870f4d
KW
12585For compactness, 'B<*>' is used as a wildcard instead of showing all possible
12586combinations. For example, entries like:
d73e5302 12587
99870f4d 12588 \\p{Gc: *} \\p{General_Category: *}
5beb625e 12589
99870f4d
KW
12590mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
12591for the latter is also valid for the former. Similarly,
5beb625e 12592
99870f4d 12593 \\p{Is_*} \\p{*}
5beb625e 12594
99870f4d
KW
12595means that if and only if, for example, \\p{Foo} exists, then \\p{Is_Foo} and
12596\\p{IsFoo} are also valid and all mean the same thing. And similarly,
12597\\p{Foo=Bar} means the same as \\p{Is_Foo=Bar} and \\p{IsFoo=Bar}. '*' here
12598is restricted to something not beginning with an underscore.
5beb625e 12599
99870f4d
KW
12600Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
12601And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
12602'N*' to indicate this, and doesn't have separate entries for the other
12603possibilities. Note that not all properties which have values 'Yes' and 'No'
12604are binary, and they have all their values spelled out without using this wild
12605card, and a C<NOT> clause in their description that highlights their not being
12606binary. These also require the compound form to match them, whereas true
12607binary properties have both single and compound forms available.
5beb625e 12608
99870f4d
KW
12609Note that all non-essential underscores are removed in the display of the
12610short names below.
5beb625e 12611
99870f4d 12612B<Summary legend:>
5beb625e 12613
99870f4d 12614=over 4
cf25bb62 12615
99870f4d 12616=item B<*> is a wild-card
cf25bb62 12617
99870f4d
KW
12618=item B<(\\d+)> in the info column gives the number of code points matched by
12619this property.
cf25bb62 12620
99870f4d 12621=item B<$DEPRECATED> means this is deprecated.
cf25bb62 12622
99870f4d 12623=item B<$OBSOLETE> means this is obsolete.
cf25bb62 12624
99870f4d 12625=item B<$STABILIZED> means this is stabilized.
cf25bb62 12626
99870f4d 12627=item B<$STRICTER> means tighter (stricter) name matching applies.
d73e5302 12628
99870f4d 12629=item B<$DISCOURAGED> means use of this form is discouraged.
5beb625e 12630
99870f4d 12631=back
da7fcca4 12632
99870f4d 12633$formatted_properties
cf25bb62 12634
99870f4d 12635$zero_matches
cf25bb62 12636
99870f4d 12637=head1 Properties not accessible through \\p{} and \\P{}
cf25bb62 12638
99870f4d
KW
12639A few properties are accessible in Perl via various function calls only.
12640These are:
12641 Lowercase_Mapping lc() and lcfirst()
12642 Titlecase_Mapping ucfirst()
12643 Uppercase_Mapping uc()
12ac2576 12644
99870f4d 12645Case_Folding is accessible through the /i modifier in regular expressions.
cf25bb62 12646
99870f4d
KW
12647The Name property is accessible through the \\N{} interpolation in
12648double-quoted strings and regular expressions, but both usages require a C<use
fb121860
KW
12649charnames;> to be specified, which also contains related functions viacode(),
12650vianame(), and string_vianame().
cf25bb62 12651
99870f4d 12652=head1 Unicode regular expression properties that are NOT accepted by Perl
d2d499f5 12653
99870f4d
KW
12654Perl will generate an error for a few character properties in Unicode when
12655used in a regular expression. The non-Unihan ones are listed below, with the
12656reasons they are not accepted, perhaps with work-arounds. The short names for
12657the properties are listed enclosed in (parentheses).
ae6979a8 12658
99870f4d 12659=over 4
ae6979a8 12660
99870f4d 12661@bad_re_properties
a3a8c5f0 12662
99870f4d 12663=back
a3a8c5f0 12664
99870f4d
KW
12665An installation can choose to allow any of these to be matched by changing the
12666controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12667and then re-running F<$0>. (C<\%Config> is available from the Config module).
d73e5302 12668
99870f4d 12669=head1 Files in the I<To> directory (for serious hackers only)
12ac2576 12670
99870f4d
KW
12671All Unicode properties are really mappings (in the mathematical sense) from
12672code points to their respective values. As part of its build process,
12673Perl constructs tables containing these mappings for all properties that it
12674deals with. But only a few of these are written out into files.
12675Those written out are in the directory C<\$Config{privlib}>/F<unicore/To/>
12676(%Config is available from the Config module).
7ebf06b3 12677
99870f4d
KW
12678Those ones written are ones needed by Perl internally during execution, or for
12679which there is some demand, and those for which there is no access through the
12680Perl core. Generally, properties that can be used in regular expression
12681matching do not have their map tables written, like Script. Nor are the
12682simplistic properties that have a better, more complete version, such as
12683Simple_Uppercase_Mapping (Uppercase_Mapping is written instead).
12ac2576 12684
99870f4d
KW
12685None of the properties in the I<To> directory are currently directly
12686accessible through the Perl core, although some may be accessed indirectly.
12687For example, the uc() function implements the Uppercase_Mapping property and
12688uses the F<Upper.pl> file found in this directory.
12ac2576 12689
8f2f18b9
KW
12690The available files in the current installation, with their properties (short
12691names in parentheses), and any flags or comments about them, are:
12ac2576 12692
99870f4d 12693@map_tables_actually_output
12ac2576 12694
99870f4d
KW
12695An installation can choose to change which files are generated by changing the
12696controlling lists contained in the program C<\$Config{privlib}>/F<unicore/$0>
12697and then re-running F<$0>.
cf25bb62 12698
99870f4d
KW
12699Each of these files defines two hash entries to help reading programs decipher
12700it. One of them looks like this:
12ac2576 12701
99870f4d 12702 \$utf8::SwashInfo{'ToNAME'}{'format'} = 's';
d73e5302 12703
99870f4d
KW
12704where 'NAME' is a name to indicate the property. For backwards compatibility,
12705this is not necessarily the property's official Unicode name. (The 'To' is
12706also for backwards compatibility.) The hash entry gives the format of the
12707mapping fields of the table, currently one of the following:
d73e5302 12708
99870f4d 12709 @map_table_formats
d73e5302 12710
99870f4d
KW
12711This format applies only to the entries in the main body of the table.
12712Entries defined in hashes or ones that are missing from the list can have a
12713different format.
d73e5302 12714
99870f4d
KW
12715The value that the missing entries have is given by the other SwashInfo hash
12716entry line; it looks like this:
d73e5302 12717
99870f4d 12718 \$utf8::SwashInfo{'ToNAME'}{'missing'} = 'NaN';
d73e5302 12719
99870f4d
KW
12720This example line says that any Unicode code points not explicitly listed in
12721the file have the value 'NaN' under the property indicated by NAME. If the
12722value is the special string C<< <code point> >>, it means that the value for
12723any missing code point is the code point itself. This happens, for example,
12724in the file for Uppercase_Mapping (To/Upper.pl), in which code points like the
12725character 'A', are missing because the uppercase of 'A' is itself.
d73e5302 12726
99870f4d 12727=head1 SEE ALSO
d73e5302 12728
99870f4d 12729L<$unicode_reference_url>
12ac2576 12730
99870f4d 12731L<perlrecharclass>
12ac2576 12732
99870f4d 12733L<perlunicode>
d73e5302 12734
99870f4d 12735END
d73e5302 12736
99870f4d
KW
12737 # And write it.
12738 main::write([ $pod_directory, "$pod_file.pod" ], @OUT);
12739 return;
12740}
d73e5302 12741
99870f4d
KW
12742sub make_Heavy () {
12743 # Create and write Heavy.pl, which passes info about the tables to
12744 # utf8_heavy.pl
12ac2576 12745
99870f4d
KW
12746 my @heavy = <<END;
12747$HEADER
12748$INTERNAL_ONLY
d73e5302 12749
99870f4d 12750# This file is for the use of utf8_heavy.pl
12ac2576 12751
99870f4d
KW
12752# Maps property names in loose standard form to its standard name
12753\%utf8::loose_property_name_of = (
12754END
cf25bb62 12755
99870f4d
KW
12756 push @heavy, simple_dumper (\%loose_property_name_of, ' ' x 4);
12757 push @heavy, <<END;
12758);
12ac2576 12759
99870f4d
KW
12760# Maps property, table to file for those using stricter matching
12761\%utf8::stricter_to_file_of = (
12762END
12763 push @heavy, simple_dumper (\%stricter_to_file_of, ' ' x 4);
12764 push @heavy, <<END;
12765);
12ac2576 12766
99870f4d
KW
12767# Maps property, table to file for those using loose matching
12768\%utf8::loose_to_file_of = (
12769END
12770 push @heavy, simple_dumper (\%loose_to_file_of, ' ' x 4);
12771 push @heavy, <<END;
12772);
12ac2576 12773
99870f4d
KW
12774# Maps floating point to fractional form
12775\%utf8::nv_floating_to_rational = (
12776END
12777 push @heavy, simple_dumper (\%nv_floating_to_rational, ' ' x 4);
12778 push @heavy, <<END;
12779);
12ac2576 12780
99870f4d
KW
12781# If a floating point number doesn't have enough digits in it to get this
12782# close to a fraction, it isn't considered to be that fraction even if all the
12783# digits it does have match.
12784\$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
12ac2576 12785
99870f4d
KW
12786# Deprecated tables to generate a warning for. The key is the file containing
12787# the table, so as to avoid duplication, as many property names can map to the
12788# file, but we only need one entry for all of them.
12789\%utf8::why_deprecated = (
12790END
12ac2576 12791
99870f4d
KW
12792 push @heavy, simple_dumper (\%utf8::why_deprecated, ' ' x 4);
12793 push @heavy, <<END;
12794);
12ac2576 12795
99870f4d
KW
127961;
12797END
12ac2576 12798
99870f4d
KW
12799 main::write("Heavy.pl", @heavy);
12800 return;
12ac2576
JP
12801}
12802
99870f4d
KW
12803sub write_all_tables() {
12804 # Write out all the tables generated by this program to files, as well as
12805 # the supporting data structures, pod file, and .t file.
12806
12807 my @writables; # List of tables that actually get written
12808 my %match_tables_to_write; # Used to collapse identical match tables
12809 # into one file. Each key is a hash function
12810 # result to partition tables into buckets.
12811 # Each value is an array of the tables that
12812 # fit in the bucket.
12813
12814 # For each property ...
12815 # (sort so that if there is an immutable file name, it has precedence, so
12816 # some other property can't come in and take over its file name. If b's
12817 # file name is defined, will return 1, meaning to take it first; don't
12818 # care if both defined, as they had better be different anyway)
12819 PROPERTY:
12820 foreach my $property (sort { defined $b->file } property_ref('*')) {
12821 my $type = $property->type;
12822
12823 # And for each table for that property, starting with the mapping
12824 # table for it ...
12825 TABLE:
12826 foreach my $table($property,
12827
12828 # and all the match tables for it (if any), sorted so
12829 # the ones with the shortest associated file name come
12830 # first. The length sorting prevents problems of a
12831 # longer file taking a name that might have to be used
12832 # by a shorter one. The alphabetic sorting prevents
12833 # differences between releases
12834 sort { my $ext_a = $a->external_name;
12835 return 1 if ! defined $ext_a;
12836 my $ext_b = $b->external_name;
12837 return -1 if ! defined $ext_b;
12838 my $cmp = length $ext_a <=> length $ext_b;
12839
12840 # Return result if lengths not equal
12841 return $cmp if $cmp;
12842
12843 # Alphabetic if lengths equal
12844 return $ext_a cmp $ext_b
12845 } $property->tables
12846 )
12847 {
12ac2576 12848
99870f4d
KW
12849 # Here we have a table associated with a property. It could be
12850 # the map table (done first for each property), or one of the
12851 # other tables. Determine which type.
12852 my $is_property = $table->isa('Property');
12853
12854 my $name = $table->name;
12855 my $complete_name = $table->complete_name;
12856
12857 # See if should suppress the table if is empty, but warn if it
12858 # contains something.
12859 my $suppress_if_empty_warn_if_not = grep { $complete_name eq $_ }
12860 keys %why_suppress_if_empty_warn_if_not;
12861
12862 # Calculate if this table should have any code points associated
12863 # with it or not.
12864 my $expected_empty =
12865
12866 # $perl should be empty, as well as properties that we just
12867 # don't do anything with
12868 ($is_property
12869 && ($table == $perl
12870 || grep { $complete_name eq $_ }
12871 @unimplemented_properties
12872 )
12873 )
12874
12875 # Match tables in properties we skipped populating should be
12876 # empty
12877 || (! $is_property && ! $property->to_create_match_tables)
12878
12879 # Tables and properties that are expected to have no code
12880 # points should be empty
12881 || $suppress_if_empty_warn_if_not
12882 ;
12883
12884 # Set a boolean if this table is the complement of an empty binary
12885 # table
12886 my $is_complement_of_empty_binary =
12887 $type == $BINARY &&
12888 (($table == $property->table('Y')
12889 && $property->table('N')->is_empty)
12890 || ($table == $property->table('N')
12891 && $property->table('Y')->is_empty));
12892
12893
12894 # Some tables should match everything
12895 my $expected_full =
12896 ($is_property)
12897 ? # All these types of map tables will be full because
12898 # they will have been populated with defaults
12899 ($type == $ENUM || $type == $BINARY)
12900
12901 : # A match table should match everything if its method
12902 # shows it should
12903 ($table->matches_all
12904
12905 # The complement of an empty binary table will match
12906 # everything
12907 || $is_complement_of_empty_binary
12908 )
12909 ;
12910
12911 if ($table->is_empty) {
12912
12913
12914 if ($suppress_if_empty_warn_if_not) {
12915 $table->set_status($SUPPRESSED,
12916 $why_suppress_if_empty_warn_if_not{$complete_name});
12917 }
12ac2576 12918
99870f4d
KW
12919 # Suppress expected empty tables.
12920 next TABLE if $expected_empty;
12921
12922 # And setup to later output a warning for those that aren't
12923 # known to be allowed to be empty. Don't do the warning if
12924 # this table is a child of another one to avoid duplicating
12925 # the warning that should come from the parent one.
12926 if (($table == $property || $table->parent == $table)
12927 && $table->status ne $SUPPRESSED
12928 && ! grep { $complete_name =~ /^$_$/ }
12929 @tables_that_may_be_empty)
12930 {
12931 push @unhandled_properties, "$table";
12932 }
12933 }
12934 elsif ($expected_empty) {
12935 my $because = "";
12936 if ($suppress_if_empty_warn_if_not) {
12937 $because = " because $why_suppress_if_empty_warn_if_not{$complete_name}";
12938 }
12ac2576 12939
99870f4d
KW
12940 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
12941 }
12ac2576 12942
99870f4d
KW
12943 my $count = $table->count;
12944 if ($expected_full) {
12945 if ($count != $MAX_UNICODE_CODEPOINTS) {
12946 Carp::my_carp("$table matches only "
12947 . clarify_number($count)
12948 . " Unicode code points but should match "
12949 . clarify_number($MAX_UNICODE_CODEPOINTS)
12950 . " (off by "
12951 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
12952 . "). Proceeding anyway.");
12953 }
12ac2576 12954
99870f4d
KW
12955 # Here is expected to be full. If it is because it is the
12956 # complement of an (empty) binary table that is to be
12957 # suppressed, then suppress this one as well.
12958 if ($is_complement_of_empty_binary) {
12959 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
12960 my $opposing = $property->table($opposing_name);
12961 my $opposing_status = $opposing->status;
12962 if ($opposing_status) {
12963 $table->set_status($opposing_status,
12964 $opposing->status_info);
12965 }
12966 }
12967 }
12968 elsif ($count == $MAX_UNICODE_CODEPOINTS) {
12969 if ($table == $property || $table->leader == $table) {
12970 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
12971 }
12972 }
d73e5302 12973
99870f4d
KW
12974 if ($table->status eq $SUPPRESSED) {
12975 if (! $is_property) {
12976 my @children = $table->children;
12977 foreach my $child (@children) {
12978 if ($child->status ne $SUPPRESSED) {
12979 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
12980 }
12981 }
12982 }
12983 next TABLE;
d73e5302 12984
99870f4d
KW
12985 }
12986 if (! $is_property) {
12987
12988 # Several things need to be done just once for each related
12989 # group of match tables. Do them on the parent.
12990 if ($table->parent == $table) {
12991
12992 # Add an entry in the pod file for the table; it also does
12993 # the children.
23e33b60 12994 make_table_pod_entries($table) if defined $pod_directory;
99870f4d
KW
12995
12996 # See if the the table matches identical code points with
12997 # something that has already been output. In that case,
12998 # no need to have two files with the same code points in
12999 # them. We use the table's hash() method to store these
13000 # in buckets, so that it is quite likely that if two
13001 # tables are in the same bucket they will be identical, so
13002 # don't have to compare tables frequently. The tables
13003 # have to have the same status to share a file, so add
13004 # this to the bucket hash. (The reason for this latter is
13005 # that Heavy.pl associates a status with a file.)
13006 my $hash = $table->hash . ';' . $table->status;
13007
13008 # Look at each table that is in the same bucket as this
13009 # one would be.
13010 foreach my $comparison (@{$match_tables_to_write{$hash}})
13011 {
13012 if ($table->matches_identically_to($comparison)) {
13013 $table->set_equivalent_to($comparison,
13014 Related => 0);
13015 next TABLE;
13016 }
13017 }
d73e5302 13018
99870f4d
KW
13019 # Here, not equivalent, add this table to the bucket.
13020 push @{$match_tables_to_write{$hash}}, $table;
13021 }
13022 }
13023 else {
13024
13025 # Here is the property itself.
13026 # Don't write out or make references to the $perl property
13027 next if $table == $perl;
13028
13029 if ($type != $STRING) {
13030
13031 # There is a mapping stored of the various synonyms to the
13032 # standardized name of the property for utf8_heavy.pl.
13033 # Also, the pod file contains entries of the form:
13034 # \p{alias: *} \p{full: *}
13035 # rather than show every possible combination of things.
13036
13037 my @property_aliases = $property->aliases;
13038
13039 # The full name of this property is stored by convention
13040 # first in the alias array
13041 my $full_property_name =
13042 '\p{' . $property_aliases[0]->name . ': *}';
13043 my $standard_property_name = standardize($table->name);
13044
13045 # For each synonym ...
13046 for my $i (0 .. @property_aliases - 1) {
13047 my $alias = $property_aliases[$i];
13048 my $alias_name = $alias->name;
13049 my $alias_standard = standardize($alias_name);
13050
13051 # Set the mapping for utf8_heavy of the alias to the
13052 # property
13053 if (exists ($loose_property_name_of{$alias_standard}))
13054 {
13055 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
13056 }
13057 else {
13058 $loose_property_name_of{$alias_standard}
13059 = $standard_property_name;
13060 }
13061
23e33b60
KW
13062 # Now for the pod entry for this alias. Skip if not
13063 # outputting a pod; skip the first one, which is the
13064 # full name so won't have an entry like: '\p{full: *}
13065 # \p{full: *}', and skip if don't want an entry for
13066 # this one.
13067 next if $i == 0
13068 || ! defined $pod_directory
13069 || ! $alias->make_pod_entry;
99870f4d 13070
d57ccc9a
KW
13071 my $rhs = $full_property_name;
13072 if ($property != $perl && $table->perl_extension) {
13073 $rhs .= ' (Perl extension)';
13074 }
99870f4d
KW
13075 push @match_properties,
13076 format_pod_line($indent_info_column,
13077 '\p{' . $alias->name . ': *}',
d57ccc9a 13078 $rhs,
99870f4d
KW
13079 $alias->status);
13080 }
13081 } # End of non-string-like property code
d73e5302 13082
d73e5302 13083
99870f4d
KW
13084 # Don't output a mapping file if not desired.
13085 next if ! $property->to_output_map;
13086 }
d73e5302 13087
99870f4d
KW
13088 # Here, we know we want to write out the table, but don't do it
13089 # yet because there may be other tables that come along and will
13090 # want to share the file, and the file's comments will change to
13091 # mention them. So save for later.
13092 push @writables, $table;
13093
13094 } # End of looping through the property and all its tables.
13095 } # End of looping through all properties.
13096
13097 # Now have all the tables that will have files written for them. Do it.
13098 foreach my $table (@writables) {
13099 my @directory;
13100 my $filename;
13101 my $property = $table->property;
13102 my $is_property = ($table == $property);
13103 if (! $is_property) {
13104
13105 # Match tables for the property go in lib/$subdirectory, which is
13106 # the property's name. Don't use the standard file name for this,
13107 # as may get an unfamiliar alias
13108 @directory = ($matches_directory, $property->external_name);
13109 }
13110 else {
d73e5302 13111
99870f4d
KW
13112 @directory = $table->directory;
13113 $filename = $table->file;
13114 }
d73e5302 13115
99870f4d
KW
13116 # Use specified filename if avaliable, or default to property's
13117 # shortest name. We need an 8.3 safe filename (which means "an 8
13118 # safe" filename, since after the dot is only 'pl', which is < 3)
13119 # The 2nd parameter is if the filename shouldn't be changed, and
13120 # it shouldn't iff there is a hard-coded name for this table.
13121 $filename = construct_filename(
13122 $filename || $table->external_name,
13123 ! $filename, # mutable if no filename
13124 \@directory);
d73e5302 13125
99870f4d 13126 register_file_for_name($table, \@directory, $filename);
d73e5302 13127
99870f4d
KW
13128 # Only need to write one file when shared by more than one
13129 # property
13130 next if ! $is_property && $table->leader != $table;
d73e5302 13131
99870f4d
KW
13132 # Construct a nice comment to add to the file
13133 $table->set_final_comment;
13134
13135 $table->write;
cf25bb62 13136 }
d73e5302 13137
d73e5302 13138
99870f4d
KW
13139 # Write out the pod file
13140 make_pod;
13141
13142 # And Heavy.pl
13143 make_Heavy;
d73e5302 13144
99870f4d
KW
13145 make_property_test_script() if $make_test_script;
13146 return;
cf25bb62 13147}
d73e5302 13148
99870f4d
KW
13149my @white_space_separators = ( # This used only for making the test script.
13150 "",
13151 ' ',
13152 "\t",
13153 ' '
13154 );
d73e5302 13155
99870f4d
KW
13156sub generate_separator($) {
13157 # This used only for making the test script. It generates the colon or
13158 # equal separator between the property and property value, with random
13159 # white space surrounding the separator
d73e5302 13160
99870f4d 13161 my $lhs = shift;
d73e5302 13162
99870f4d 13163 return "" if $lhs eq ""; # No separator if there's only one (the r) side
d73e5302 13164
99870f4d
KW
13165 # Choose space before and after randomly
13166 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
13167 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
76ccdbe2 13168
99870f4d
KW
13169 # And return the whole complex, half the time using a colon, half the
13170 # equals
13171 return $spaces_before
13172 . (rand() < 0.5) ? '=' : ':'
13173 . $spaces_after;
13174}
76ccdbe2 13175
430ada4c 13176sub generate_tests($$$$$) {
99870f4d
KW
13177 # This used only for making the test script. It generates test cases that
13178 # are expected to compile successfully in perl. Note that the lhs and
13179 # rhs are assumed to already be as randomized as the caller wants.
13180
99870f4d
KW
13181 my $lhs = shift; # The property: what's to the left of the colon
13182 # or equals separator
13183 my $rhs = shift; # The property value; what's to the right
13184 my $valid_code = shift; # A code point that's known to be in the
13185 # table given by lhs=rhs; undef if table is
13186 # empty
13187 my $invalid_code = shift; # A code point known to not be in the table;
13188 # undef if the table is all code points
13189 my $warning = shift;
13190
13191 # Get the colon or equal
13192 my $separator = generate_separator($lhs);
13193
13194 # The whole 'property=value'
13195 my $name = "$lhs$separator$rhs";
13196
430ada4c 13197 my @output;
99870f4d
KW
13198 # Create a complete set of tests, with complements.
13199 if (defined $valid_code) {
430ada4c
NC
13200 push @output, <<"EOC"
13201Expect(1, $valid_code, '\\p{$name}', $warning);
13202Expect(0, $valid_code, '\\p{^$name}', $warning);
13203Expect(0, $valid_code, '\\P{$name}', $warning);
13204Expect(1, $valid_code, '\\P{^$name}', $warning);
13205EOC
99870f4d
KW
13206 }
13207 if (defined $invalid_code) {
430ada4c
NC
13208 push @output, <<"EOC"
13209Expect(0, $invalid_code, '\\p{$name}', $warning);
13210Expect(1, $invalid_code, '\\p{^$name}', $warning);
13211Expect(1, $invalid_code, '\\P{$name}', $warning);
13212Expect(0, $invalid_code, '\\P{^$name}', $warning);
13213EOC
13214 }
13215 return @output;
99870f4d 13216}
cf25bb62 13217
430ada4c 13218sub generate_error($$$) {
99870f4d
KW
13219 # This used only for making the test script. It generates test cases that
13220 # are expected to not only not match, but to be syntax or similar errors
13221
99870f4d
KW
13222 my $lhs = shift; # The property: what's to the left of the
13223 # colon or equals separator
13224 my $rhs = shift; # The property value; what's to the right
13225 my $already_in_error = shift; # Boolean; if true it's known that the
13226 # unmodified lhs and rhs will cause an error.
13227 # This routine should not force another one
13228 # Get the colon or equal
13229 my $separator = generate_separator($lhs);
13230
13231 # Since this is an error only, don't bother to randomly decide whether to
13232 # put the error on the left or right side; and assume that the rhs is
13233 # loosely matched, again for convenience rather than rigor.
13234 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
13235
13236 my $property = $lhs . $separator . $rhs;
13237
430ada4c
NC
13238 return <<"EOC";
13239Error('\\p{$property}');
13240Error('\\P{$property}');
13241EOC
d73e5302
JH
13242}
13243
99870f4d
KW
13244# These are used only for making the test script
13245# XXX Maybe should also have a bad strict seps, which includes underscore.
13246
13247my @good_loose_seps = (
13248 " ",
13249 "-",
13250 "\t",
13251 "",
13252 "_",
13253 );
13254my @bad_loose_seps = (
13255 "/a/",
13256 ':=',
13257 );
13258
13259sub randomize_stricter_name {
13260 # This used only for making the test script. Take the input name and
13261 # return a randomized, but valid version of it under the stricter matching
13262 # rules.
13263
13264 my $name = shift;
13265 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13266
13267 # If the name looks like a number (integer, floating, or rational), do
13268 # some extra work
13269 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
13270 my $sign = $1;
13271 my $number = $2;
13272 my $separator = $3;
13273
13274 # If there isn't a sign, part of the time add a plus
13275 # Note: Not testing having any denominator having a minus sign
13276 if (! $sign) {
13277 $sign = '+' if rand() <= .3;
13278 }
13279
13280 # And add 0 or more leading zeros.
13281 $name = $sign . ('0' x int rand(10)) . $number;
13282
13283 if (defined $separator) {
13284 my $extra_zeros = '0' x int rand(10);
cf25bb62 13285
99870f4d
KW
13286 if ($separator eq '.') {
13287
13288 # Similarly, add 0 or more trailing zeros after a decimal
13289 # point
13290 $name .= $extra_zeros;
13291 }
13292 else {
13293
13294 # Or, leading zeros before the denominator
13295 $name =~ s,/,/$extra_zeros,;
13296 }
13297 }
cf25bb62 13298 }
d73e5302 13299
99870f4d
KW
13300 # For legibility of the test, only change the case of whole sections at a
13301 # time. To do this, first split into sections. The split returns the
13302 # delimiters
13303 my @sections;
13304 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
13305 trace $section if main::DEBUG && $to_trace;
13306
13307 if (length $section > 1 && $section !~ /\D/) {
13308
13309 # If the section is a sequence of digits, about half the time
13310 # randomly add underscores between some of them.
13311 if (rand() > .5) {
13312
13313 # Figure out how many underscores to add. max is 1 less than
13314 # the number of digits. (But add 1 at the end to make sure
13315 # result isn't 0, and compensate earlier by subtracting 2
13316 # instead of 1)
13317 my $num_underscores = int rand(length($section) - 2) + 1;
13318
13319 # And add them evenly throughout, for convenience, not rigor
13320 use integer;
13321 my $spacing = (length($section) - 1)/ $num_underscores;
13322 my $temp = $section;
13323 $section = "";
13324 for my $i (1 .. $num_underscores) {
13325 $section .= substr($temp, 0, $spacing, "") . '_';
13326 }
13327 $section .= $temp;
13328 }
13329 push @sections, $section;
13330 }
13331 else {
d73e5302 13332
99870f4d
KW
13333 # Here not a sequence of digits. Change the case of the section
13334 # randomly
13335 my $switch = int rand(4);
13336 if ($switch == 0) {
13337 push @sections, uc $section;
13338 }
13339 elsif ($switch == 1) {
13340 push @sections, lc $section;
13341 }
13342 elsif ($switch == 2) {
13343 push @sections, ucfirst $section;
13344 }
13345 else {
13346 push @sections, $section;
13347 }
13348 }
cf25bb62 13349 }
99870f4d
KW
13350 trace "returning", join "", @sections if main::DEBUG && $to_trace;
13351 return join "", @sections;
13352}
71d929cb 13353
99870f4d
KW
13354sub randomize_loose_name($;$) {
13355 # This used only for making the test script
71d929cb 13356
99870f4d
KW
13357 my $name = shift;
13358 my $want_error = shift; # if true, make an error
13359 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13360
13361 $name = randomize_stricter_name($name);
5beb625e
JH
13362
13363 my @parts;
99870f4d
KW
13364 push @parts, $good_loose_seps[rand(@good_loose_seps)];
13365 for my $part (split /[-\s_]+/, $name) {
5beb625e 13366 if (@parts) {
99870f4d
KW
13367 if ($want_error and rand() < 0.3) {
13368 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
13369 $want_error = 0;
13370 }
13371 else {
13372 push @parts, $good_loose_seps[rand(@good_loose_seps)];
5beb625e
JH
13373 }
13374 }
99870f4d 13375 push @parts, $part;
5beb625e 13376 }
99870f4d
KW
13377 my $new = join("", @parts);
13378 trace "$name => $new" if main::DEBUG && $to_trace;
5beb625e 13379
99870f4d 13380 if ($want_error) {
5beb625e 13381 if (rand() >= 0.5) {
99870f4d
KW
13382 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
13383 }
13384 else {
13385 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
5beb625e
JH
13386 }
13387 }
13388 return $new;
13389}
13390
99870f4d
KW
13391# Used to make sure don't generate duplicate test cases.
13392my %test_generated;
5beb625e 13393
99870f4d
KW
13394sub make_property_test_script() {
13395 # This used only for making the test script
13396 # this written directly -- it's huge.
5beb625e 13397
99870f4d 13398 print "Making test script\n" if $verbosity >= $PROGRESS;
5beb625e 13399
99870f4d
KW
13400 # This uses randomness to test different possibilities without testing all
13401 # possibilities. To ensure repeatability, set the seed to 0. But if
13402 # tests are added, it will perturb all later ones in the .t file
13403 srand 0;
5beb625e 13404
3df51b85
KW
13405 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
13406
99870f4d
KW
13407 # Keep going down an order of magnitude
13408 # until find that adding this quantity to
13409 # 1 remains 1; but put an upper limit on
13410 # this so in case this algorithm doesn't
13411 # work properly on some platform, that we
13412 # won't loop forever.
13413 my $digits = 0;
13414 my $min_floating_slop = 1;
13415 while (1+ $min_floating_slop != 1
13416 && $digits++ < 50)
5beb625e 13417 {
99870f4d
KW
13418 my $next = $min_floating_slop / 10;
13419 last if $next == 0; # If underflows,
13420 # use previous one
13421 $min_floating_slop = $next;
5beb625e 13422 }
430ada4c
NC
13423
13424 # It doesn't matter whether the elements of this array contain single lines
13425 # or multiple lines. main::write doesn't count the lines.
13426 my @output;
99870f4d
KW
13427
13428 foreach my $property (property_ref('*')) {
13429 foreach my $table ($property->tables) {
13430
13431 # Find code points that match, and don't match this table.
13432 my $valid = $table->get_valid_code_point;
13433 my $invalid = $table->get_invalid_code_point;
13434 my $warning = ($table->status eq $DEPRECATED)
13435 ? "'deprecated'"
13436 : '""';
13437
13438 # Test each possible combination of the property's aliases with
13439 # the table's. If this gets to be too many, could do what is done
13440 # in the set_final_comment() for Tables
13441 my @table_aliases = $table->aliases;
13442 my @property_aliases = $table->property->aliases;
13443 my $max = max(scalar @table_aliases, scalar @property_aliases);
13444 for my $j (0 .. $max - 1) {
13445
13446 # The current alias for property is the next one on the list,
13447 # or if beyond the end, start over. Similarly for table
13448 my $property_name
13449 = $property_aliases[$j % @property_aliases]->name;
13450
13451 $property_name = "" if $table->property == $perl;
13452 my $table_alias = $table_aliases[$j % @table_aliases];
13453 my $table_name = $table_alias->name;
13454 my $loose_match = $table_alias->loose_match;
13455
13456 # If the table doesn't have a file, any test for it is
13457 # already guaranteed to be in error
13458 my $already_error = ! $table->file_path;
13459
13460 # Generate error cases for this alias.
430ada4c
NC
13461 push @output, generate_error($property_name,
13462 $table_name,
13463 $already_error);
99870f4d
KW
13464
13465 # If the table is guaranteed to always generate an error,
13466 # quit now without generating success cases.
13467 next if $already_error;
13468
13469 # Now for the success cases.
13470 my $random;
13471 if ($loose_match) {
13472
13473 # For loose matching, create an extra test case for the
13474 # standard name.
13475 my $standard = standardize($table_name);
13476
13477 # $test_name should be a unique combination for each test
13478 # case; used just to avoid duplicate tests
13479 my $test_name = "$property_name=$standard";
13480
13481 # Don't output duplicate test cases.
13482 if (! exists $test_generated{$test_name}) {
13483 $test_generated{$test_name} = 1;
430ada4c
NC
13484 push @output, generate_tests($property_name,
13485 $standard,
13486 $valid,
13487 $invalid,
13488 $warning,
13489 );
5beb625e 13490 }
99870f4d
KW
13491 $random = randomize_loose_name($table_name)
13492 }
13493 else { # Stricter match
13494 $random = randomize_stricter_name($table_name);
99598c8c 13495 }
99598c8c 13496
99870f4d
KW
13497 # Now for the main test case for this alias.
13498 my $test_name = "$property_name=$random";
13499 if (! exists $test_generated{$test_name}) {
13500 $test_generated{$test_name} = 1;
430ada4c
NC
13501 push @output, generate_tests($property_name,
13502 $random,
13503 $valid,
13504 $invalid,
13505 $warning,
13506 );
99870f4d
KW
13507
13508 # If the name is a rational number, add tests for the
13509 # floating point equivalent.
13510 if ($table_name =~ qr{/}) {
13511
13512 # Calculate the float, and find just the fraction.
13513 my $float = eval $table_name;
13514 my ($whole, $fraction)
13515 = $float =~ / (.*) \. (.*) /x;
13516
13517 # Starting with one digit after the decimal point,
13518 # create a test for each possible precision (number of
13519 # digits past the decimal point) until well beyond the
13520 # native number found on this machine. (If we started
13521 # with 0 digits, it would be an integer, which could
13522 # well match an unrelated table)
13523 PLACE:
13524 for my $i (1 .. $min_floating_slop + 3) {
13525 my $table_name = sprintf("%.*f", $i, $float);
13526 if ($i < $MIN_FRACTION_LENGTH) {
13527
13528 # If the test case has fewer digits than the
13529 # minimum acceptable precision, it shouldn't
13530 # succeed, so we expect an error for it.
13531 # E.g., 2/3 = .7 at one decimal point, and we
13532 # shouldn't say it matches .7. We should make
13533 # it be .667 at least before agreeing that the
13534 # intent was to match 2/3. But at the
13535 # less-than- acceptable level of precision, it
13536 # might actually match an unrelated number.
13537 # So don't generate a test case if this
13538 # conflating is possible. In our example, we
13539 # don't want 2/3 matching 7/10, if there is
13540 # a 7/10 code point.
13541 for my $existing
13542 (keys %nv_floating_to_rational)
13543 {
13544 next PLACE
13545 if abs($table_name - $existing)
13546 < $MAX_FLOATING_SLOP;
13547 }
430ada4c
NC
13548 push @output, generate_error($property_name,
13549 $table_name,
13550 1 # 1 => already an error
13551 );
99870f4d
KW
13552 }
13553 else {
13554
13555 # Here the number of digits exceeds the
13556 # minimum we think is needed. So generate a
13557 # success test case for it.
430ada4c
NC
13558 push @output, generate_tests($property_name,
13559 $table_name,
13560 $valid,
13561 $invalid,
13562 $warning,
13563 );
99870f4d
KW
13564 }
13565 }
99598c8c
JH
13566 }
13567 }
99870f4d
KW
13568 }
13569 }
13570 }
37e2e78e 13571
430ada4c
NC
13572 &write($t_path, [<DATA>,
13573 @output,
13574 (map {"Test_X('$_');\n"} @backslash_X_tests),
13575 "Finished();\n"]);
99870f4d
KW
13576 return;
13577}
99598c8c 13578
99870f4d
KW
13579# This is a list of the input files and how to handle them. The files are
13580# processed in their order in this list. Some reordering is possible if
13581# desired, but the v0 files should be first, and the extracted before the
13582# others except DAge.txt (as data in an extracted file can be over-ridden by
13583# the non-extracted. Some other files depend on data derived from an earlier
13584# file, like UnicodeData requires data from Jamo, and the case changing and
13585# folding requires data from Unicode. Mostly, it safest to order by first
13586# version releases in (except the Jamo). DAge.txt is read before the
13587# extracted ones because of the rarely used feature $compare_versions. In the
13588# unlikely event that there were ever an extracted file that contained the Age
13589# property information, it would have to go in front of DAge.
13590#
13591# The version strings allow the program to know whether to expect a file or
13592# not, but if a file exists in the directory, it will be processed, even if it
13593# is in a version earlier than expected, so you can copy files from a later
13594# release into an earlier release's directory.
13595my @input_file_objects = (
13596 Input_file->new('PropertyAliases.txt', v0,
13597 Handler => \&process_PropertyAliases,
13598 ),
13599 Input_file->new(undef, v0, # No file associated with this
3df51b85 13600 Progress_Message => 'Finishing property setup',
99870f4d
KW
13601 Handler => \&finish_property_setup,
13602 ),
13603 Input_file->new('PropValueAliases.txt', v0,
13604 Handler => \&process_PropValueAliases,
13605 Has_Missings_Defaults => $NOT_IGNORED,
13606 ),
13607 Input_file->new('DAge.txt', v3.2.0,
13608 Has_Missings_Defaults => $NOT_IGNORED,
13609 Property => 'Age'
13610 ),
13611 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
13612 Property => 'General_Category',
13613 ),
13614 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
13615 Property => 'Canonical_Combining_Class',
13616 Has_Missings_Defaults => $NOT_IGNORED,
13617 ),
13618 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
13619 Property => 'Numeric_Type',
13620 Has_Missings_Defaults => $NOT_IGNORED,
13621 ),
13622 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
13623 Property => 'East_Asian_Width',
13624 Has_Missings_Defaults => $NOT_IGNORED,
13625 ),
13626 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
13627 Property => 'Line_Break',
13628 Has_Missings_Defaults => $NOT_IGNORED,
13629 ),
13630 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
13631 Property => 'Bidi_Class',
13632 Has_Missings_Defaults => $NOT_IGNORED,
13633 ),
13634 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
13635 Property => 'Decomposition_Type',
13636 Has_Missings_Defaults => $NOT_IGNORED,
13637 ),
13638 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
13639 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
13640 Property => 'Numeric_Value',
13641 Each_Line_Handler => \&filter_numeric_value_line,
13642 Has_Missings_Defaults => $NOT_IGNORED,
13643 ),
13644 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
13645 Property => 'Joining_Group',
13646 Has_Missings_Defaults => $NOT_IGNORED,
13647 ),
13648
13649 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
13650 Property => 'Joining_Type',
13651 Has_Missings_Defaults => $NOT_IGNORED,
13652 ),
13653 Input_file->new('Jamo.txt', v2.0.0,
13654 Property => 'Jamo_Short_Name',
13655 Each_Line_Handler => \&filter_jamo_line,
13656 ),
13657 Input_file->new('UnicodeData.txt', v1.1.5,
13658 Pre_Handler => \&setup_UnicodeData,
13659
13660 # We clean up this file for some early versions.
13661 Each_Line_Handler => [ (($v_version lt v2.0.0 )
13662 ? \&filter_v1_ucd
13663 : ($v_version eq v2.1.5)
13664 ? \&filter_v2_1_5_ucd
13665 : undef),
13666
13667 # And the main filter
13668 \&filter_UnicodeData_line,
13669 ],
13670 EOF_Handler => \&EOF_UnicodeData,
13671 ),
13672 Input_file->new('ArabicShaping.txt', v2.0.0,
13673 Each_Line_Handler =>
13674 [ ($v_version lt 4.1.0)
13675 ? \&filter_old_style_arabic_shaping
13676 : undef,
13677 \&filter_arabic_shaping_line,
13678 ],
13679 Has_Missings_Defaults => $NOT_IGNORED,
13680 ),
13681 Input_file->new('Blocks.txt', v2.0.0,
13682 Property => 'Block',
13683 Has_Missings_Defaults => $NOT_IGNORED,
13684 Each_Line_Handler => \&filter_blocks_lines
13685 ),
13686 Input_file->new('PropList.txt', v2.0.0,
13687 Each_Line_Handler => (($v_version lt v3.1.0)
13688 ? \&filter_old_style_proplist
13689 : undef),
13690 ),
13691 Input_file->new('Unihan.txt', v2.0.0,
13692 Pre_Handler => \&setup_unihan,
13693 Optional => 1,
13694 Each_Line_Handler => \&filter_unihan_line,
13695 ),
13696 Input_file->new('SpecialCasing.txt', v2.1.8,
13697 Each_Line_Handler => \&filter_special_casing_line,
13698 Pre_Handler => \&setup_special_casing,
13699 ),
13700 Input_file->new(
13701 'LineBreak.txt', v3.0.0,
13702 Has_Missings_Defaults => $NOT_IGNORED,
13703 Property => 'Line_Break',
13704 # Early versions had problematic syntax
13705 Each_Line_Handler => (($v_version lt v3.1.0)
13706 ? \&filter_early_ea_lb
13707 : undef),
13708 ),
13709 Input_file->new('EastAsianWidth.txt', v3.0.0,
13710 Property => 'East_Asian_Width',
13711 Has_Missings_Defaults => $NOT_IGNORED,
13712 # Early versions had problematic syntax
13713 Each_Line_Handler => (($v_version lt v3.1.0)
13714 ? \&filter_early_ea_lb
13715 : undef),
13716 ),
13717 Input_file->new('CompositionExclusions.txt', v3.0.0,
13718 Property => 'Composition_Exclusion',
13719 ),
13720 Input_file->new('BidiMirroring.txt', v3.0.1,
13721 Property => 'Bidi_Mirroring_Glyph',
13722 ),
37e2e78e
KW
13723 Input_file->new("NormalizationTest.txt", v3.0.1,
13724 Skip => 1,
13725 ),
99870f4d
KW
13726 Input_file->new('CaseFolding.txt', v3.0.1,
13727 Pre_Handler => \&setup_case_folding,
13728 Each_Line_Handler =>
13729 [ ($v_version lt v3.1.0)
13730 ? \&filter_old_style_case_folding
13731 : undef,
13732 \&filter_case_folding_line
13733 ],
13734 Post_Handler => \&post_fold,
13735 ),
13736 Input_file->new('DCoreProperties.txt', v3.1.0,
13737 # 5.2 changed this file
13738 Has_Missings_Defaults => (($v_version ge v5.2.0)
13739 ? $NOT_IGNORED
13740 : $NO_DEFAULTS),
13741 ),
13742 Input_file->new('Scripts.txt', v3.1.0,
13743 Property => 'Script',
13744 Has_Missings_Defaults => $NOT_IGNORED,
13745 ),
13746 Input_file->new('DNormalizationProps.txt', v3.1.0,
13747 Has_Missings_Defaults => $NOT_IGNORED,
13748 Each_Line_Handler => (($v_version lt v4.0.1)
13749 ? \&filter_old_style_normalization_lines
13750 : undef),
13751 ),
13752 Input_file->new('HangulSyllableType.txt', v4.0.0,
13753 Has_Missings_Defaults => $NOT_IGNORED,
13754 Property => 'Hangul_Syllable_Type'),
13755 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
13756 Property => 'Word_Break',
13757 Has_Missings_Defaults => $NOT_IGNORED,
13758 ),
13759 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1.0,
13760 Property => 'Grapheme_Cluster_Break',
13761 Has_Missings_Defaults => $NOT_IGNORED,
13762 ),
37e2e78e
KW
13763 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
13764 Handler => \&process_GCB_test,
13765 ),
13766 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
13767 Skip => 1,
13768 ),
13769 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
13770 Skip => 1,
13771 ),
13772 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
13773 Skip => 1,
13774 ),
99870f4d
KW
13775 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
13776 Property => 'Sentence_Break',
13777 Has_Missings_Defaults => $NOT_IGNORED,
13778 ),
13779 Input_file->new('NamedSequences.txt', v4.1.0,
13780 Handler => \&process_NamedSequences
13781 ),
13782 Input_file->new('NameAliases.txt', v5.0.0,
13783 Property => 'Name_Alias',
13784 ),
37e2e78e
KW
13785 Input_file->new("BidiTest.txt", v5.2.0,
13786 Skip => 1,
13787 ),
99870f4d
KW
13788 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
13789 Optional => 1,
13790 Each_Line_Handler => \&filter_unihan_line,
13791 ),
13792 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
13793 Optional => 1,
13794 Each_Line_Handler => \&filter_unihan_line,
13795 ),
13796 Input_file->new('UnihanIRGSources.txt', v5.2.0,
13797 Optional => 1,
13798 Pre_Handler => \&setup_unihan,
13799 Each_Line_Handler => \&filter_unihan_line,
13800 ),
13801 Input_file->new('UnihanNumericValues.txt', v5.2.0,
13802 Optional => 1,
13803 Each_Line_Handler => \&filter_unihan_line,
13804 ),
13805 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
13806 Optional => 1,
13807 Each_Line_Handler => \&filter_unihan_line,
13808 ),
13809 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
13810 Optional => 1,
13811 Each_Line_Handler => \&filter_unihan_line,
13812 ),
13813 Input_file->new('UnihanReadings.txt', v5.2.0,
13814 Optional => 1,
13815 Each_Line_Handler => \&filter_unihan_line,
13816 ),
13817 Input_file->new('UnihanVariants.txt', v5.2.0,
13818 Optional => 1,
13819 Each_Line_Handler => \&filter_unihan_line,
13820 ),
13821);
99598c8c 13822
99870f4d
KW
13823# End of all the preliminaries.
13824# Do it...
99598c8c 13825
99870f4d
KW
13826if ($compare_versions) {
13827 Carp::my_carp(<<END
13828Warning. \$compare_versions is set. Output is not suitable for production
13829END
13830 );
13831}
99598c8c 13832
99870f4d
KW
13833# Put into %potential_files a list of all the files in the directory structure
13834# that could be inputs to this program, excluding those that we should ignore.
37e2e78e 13835# Use absolute file names because it makes it easier across machine types.
99870f4d
KW
13836my @ignored_files_full_names = map { File::Spec->rel2abs(
13837 internal_file_to_platform($_))
13838 } keys %ignored_files;
13839File::Find::find({
13840 wanted=>sub {
37e2e78e 13841 return unless /\.txt$/i; # Some platforms change the name's case
517956bf 13842 my $full = lc(File::Spec->rel2abs($_));
99870f4d 13843 $potential_files{$full} = 1
37e2e78e 13844 if ! grep { $full eq lc($_) } @ignored_files_full_names;
99870f4d
KW
13845 return;
13846 }
13847}, File::Spec->curdir());
99598c8c 13848
99870f4d 13849my @mktables_list_output_files;
cdcef19a 13850my $old_start_time = 0;
cf25bb62 13851
3644ba60
KW
13852if (! -e $file_list) {
13853 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
13854 $write_unchanged_files = 1;
13855} elsif ($write_unchanged_files) {
99870f4d
KW
13856 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
13857}
13858else {
13859 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
13860 my $file_handle;
23e33b60 13861 if (! open $file_handle, "<", $file_list) {
3644ba60 13862 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
99870f4d
KW
13863 $glob_list = 1;
13864 }
13865 else {
13866 my @input;
13867
13868 # Read and parse mktables.lst, placing the results from the first part
13869 # into @input, and the second part into @mktables_list_output_files
13870 for my $list ( \@input, \@mktables_list_output_files ) {
13871 while (<$file_handle>) {
13872 s/^ \s+ | \s+ $//xg;
cdcef19a
KW
13873 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
13874 $old_start_time = $1;
13875 }
99870f4d
KW
13876 next if /^ \s* (?: \# .* )? $/x;
13877 last if /^ =+ $/x;
13878 my ( $file ) = split /\t/;
13879 push @$list, $file;
cf25bb62 13880 }
99870f4d
KW
13881 @$list = uniques(@$list);
13882 next;
cf25bb62
JH
13883 }
13884
99870f4d
KW
13885 # Look through all the input files
13886 foreach my $input (@input) {
13887 next if $input eq 'version'; # Already have checked this.
cf25bb62 13888
99870f4d
KW
13889 # Ignore if doesn't exist. The checking about whether we care or
13890 # not is done via the Input_file object.
13891 next if ! file_exists($input);
5beb625e 13892
99870f4d
KW
13893 # The paths are stored with relative names, and with '/' as the
13894 # delimiter; convert to absolute on this machine
517956bf 13895 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
99870f4d 13896 $potential_files{$full} = 1
517956bf 13897 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
99870f4d 13898 }
5beb625e 13899 }
cf25bb62 13900
99870f4d
KW
13901 close $file_handle;
13902}
13903
13904if ($glob_list) {
13905
13906 # Here wants to process all .txt files in the directory structure.
13907 # Convert them to full path names. They are stored in the platform's
13908 # relative style
f86864ac
KW
13909 my @known_files;
13910 foreach my $object (@input_file_objects) {
13911 my $file = $object->file;
13912 next unless defined $file;
13913 push @known_files, File::Spec->rel2abs($file);
13914 }
99870f4d
KW
13915
13916 my @unknown_input_files;
13917 foreach my $file (keys %potential_files) {
517956bf 13918 next if grep { lc($file) eq lc($_) } @known_files;
99870f4d
KW
13919
13920 # Here, the file is unknown to us. Get relative path name
13921 $file = File::Spec->abs2rel($file);
13922 push @unknown_input_files, $file;
13923
13924 # What will happen is we create a data structure for it, and add it to
13925 # the list of input files to process. First get the subdirectories
13926 # into an array
13927 my (undef, $directories, undef) = File::Spec->splitpath($file);
13928 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13929 my @directories = File::Spec->splitdir($directories);
13930
13931 # If the file isn't extracted (meaning none of the directories is the
13932 # extracted one), just add it to the end of the list of inputs.
13933 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
99f78760 13934 push @input_file_objects, Input_file->new($file, v0);
99870f4d
KW
13935 }
13936 else {
13937
13938 # Here, the file is extracted. It needs to go ahead of most other
13939 # processing. Search for the first input file that isn't a
13940 # special required property (that is, find one whose first_release
13941 # is non-0), and isn't extracted. Also, the Age property file is
13942 # processed before the extracted ones, just in case
13943 # $compare_versions is set.
13944 for (my $i = 0; $i < @input_file_objects; $i++) {
13945 if ($input_file_objects[$i]->first_released ne v0
517956bf
CB
13946 && lc($input_file_objects[$i]->file) ne 'dage.txt'
13947 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
99870f4d 13948 {
99f78760 13949 splice @input_file_objects, $i, 0,
37e2e78e 13950 Input_file->new($file, v0);
99870f4d
KW
13951 last;
13952 }
cf25bb62 13953 }
99870f4d 13954
cf25bb62 13955 }
d2d499f5 13956 }
99870f4d 13957 if (@unknown_input_files) {
23e33b60 13958 print STDERR simple_fold(join_lines(<<END
99870f4d
KW
13959
13960The following files are unknown as to how to handle. Assuming they are
13961typical property files. You'll know by later error messages if it worked or
13962not:
13963END
99f78760 13964 ) . " " . join(", ", @unknown_input_files) . "\n\n");
99870f4d
KW
13965 }
13966} # End of looking through directory structure for more .txt files.
5beb625e 13967
99870f4d
KW
13968# Create the list of input files from the objects we have defined, plus
13969# version
13970my @input_files = 'version';
13971foreach my $object (@input_file_objects) {
13972 my $file = $object->file;
13973 next if ! defined $file; # Not all objects have files
13974 next if $object->optional && ! -e $file;
13975 push @input_files, $file;
13976}
5beb625e 13977
99870f4d
KW
13978if ( $verbosity >= $VERBOSE ) {
13979 print "Expecting ".scalar( @input_files )." input files. ",
13980 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
13981}
cf25bb62 13982
aeab6150
KW
13983# We set $most_recent to be the most recently changed input file, including
13984# this program itself (done much earlier in this file)
99870f4d 13985foreach my $in (@input_files) {
cdcef19a
KW
13986 next unless -e $in; # Keep going even if missing a file
13987 my $mod_time = (stat $in)[9];
aeab6150 13988 $most_recent = $mod_time if $mod_time > $most_recent;
99870f4d
KW
13989
13990 # See that the input files have distinct names, to warn someone if they
13991 # are adding a new one
13992 if ($make_list) {
13993 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
13994 $directories =~ s;/$;;; # Can have extraneous trailing '/'
13995 my @directories = File::Spec->splitdir($directories);
13996 my $base = $file =~ s/\.txt$//;
13997 construct_filename($file, 'mutable', \@directories);
cf25bb62 13998 }
99870f4d 13999}
cf25bb62 14000
dff6c046 14001my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
cdcef19a 14002 || ! scalar @mktables_list_output_files # or if no outputs known
aeab6150 14003 || $old_start_time < $most_recent; # or out-of-date
cf25bb62 14004
99870f4d
KW
14005# Now we check to see if any output files are older than youngest, if
14006# they are, we need to continue on, otherwise we can presumably bail.
d1d1cd7a 14007if (! $rebuild) {
99870f4d
KW
14008 foreach my $out (@mktables_list_output_files) {
14009 if ( ! file_exists($out)) {
14010 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 14011 $rebuild = 1;
99870f4d
KW
14012 last;
14013 }
14014 #local $to_trace = 1 if main::DEBUG;
aeab6150
KW
14015 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
14016 if ( (stat $out)[9] <= $most_recent ) {
14017 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
99870f4d 14018 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
d1d1cd7a 14019 $rebuild = 1;
99870f4d 14020 last;
cf25bb62 14021 }
cf25bb62 14022 }
99870f4d 14023}
d1d1cd7a 14024if (! $rebuild) {
1265e11f 14025 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
99870f4d
KW
14026 exit(0);
14027}
14028print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
cf25bb62 14029
99870f4d
KW
14030# Ready to do the major processing. First create the perl pseudo-property.
14031$perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
cf25bb62 14032
99870f4d
KW
14033# Process each input file
14034foreach my $file (@input_file_objects) {
14035 $file->run;
d2d499f5
JH
14036}
14037
99870f4d 14038# Finish the table generation.
c4051cc5 14039
99870f4d
KW
14040print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
14041finish_Unicode();
c4051cc5 14042
99870f4d
KW
14043print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
14044compile_perl();
c4051cc5 14045
99870f4d
KW
14046print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
14047add_perl_synonyms();
c4051cc5 14048
99870f4d
KW
14049print "Writing tables\n" if $verbosity >= $PROGRESS;
14050write_all_tables();
c4051cc5 14051
99870f4d
KW
14052# Write mktables.lst
14053if ( $file_list and $make_list ) {
c4051cc5 14054
99870f4d
KW
14055 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
14056 foreach my $file (@input_files, @files_actually_output) {
14057 my (undef, $directories, $file) = File::Spec->splitpath($file);
14058 my @directories = File::Spec->splitdir($directories);
14059 $file = join '/', @directories, $file;
14060 }
14061
14062 my $ofh;
14063 if (! open $ofh,">",$file_list) {
14064 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
14065 return
14066 }
14067 else {
cdcef19a 14068 my $localtime = localtime $start_time;
99870f4d
KW
14069 print $ofh <<"END";
14070#
14071# $file_list -- File list for $0.
97050450 14072#
cdcef19a 14073# Autogenerated starting on $start_time ($localtime)
97050450
YO
14074#
14075# - First section is input files
99870f4d 14076# ($0 itself is not listed but is automatically considered an input)
97050450
YO
14077# - Section seperator is /^=+\$/
14078# - Second section is a list of output files.
14079# - Lines matching /^\\s*#/ are treated as comments
14080# which along with blank lines are ignored.
14081#
14082
14083# Input files:
14084
99870f4d
KW
14085END
14086 print $ofh "$_\n" for sort(@input_files);
14087 print $ofh "\n=================================\n# Output files:\n\n";
14088 print $ofh "$_\n" for sort @files_actually_output;
14089 print $ofh "\n# ",scalar(@input_files)," input files\n",
14090 "# ",scalar(@files_actually_output)+1," output files\n\n",
14091 "# End list\n";
14092 close $ofh
14093 or Carp::my_carp("Failed to close $ofh: $!");
14094
14095 print "Filelist has ",scalar(@input_files)," input files and ",
14096 scalar(@files_actually_output)+1," output files\n"
14097 if $verbosity >= $VERBOSE;
14098 }
14099}
14100
14101# Output these warnings unless -q explicitly specified.
14102if ($verbosity >= $NORMAL_VERBOSITY) {
14103 if (@unhandled_properties) {
14104 print "\nProperties and tables that unexpectedly have no code points\n";
14105 foreach my $property (sort @unhandled_properties) {
14106 print $property, "\n";
14107 }
14108 }
14109
14110 if (%potential_files) {
14111 print "\nInput files that are not considered:\n";
14112 foreach my $file (sort keys %potential_files) {
14113 print File::Spec->abs2rel($file), "\n";
14114 }
14115 }
14116 print "\nAll done\n" if $verbosity >= $VERBOSE;
14117}
5beb625e 14118exit(0);
cf25bb62 14119
99870f4d 14120# TRAILING CODE IS USED BY make_property_test_script()
5beb625e 14121__DATA__
99870f4d 14122
5beb625e
JH
14123use strict;
14124use warnings;
14125
66fd7fd0
KW
14126# If run outside the normal test suite on an ASCII platform, you can
14127# just create a latin1_to_native() function that just returns its
14128# inputs, because that's the only function used from test.pl
14129require "test.pl";
14130
37e2e78e
KW
14131# Test qr/\X/ and the \p{} regular expression constructs. This file is
14132# constructed by mktables from the tables it generates, so if mktables is
14133# buggy, this won't necessarily catch those bugs. Tests are generated for all
14134# feasible properties; a few aren't currently feasible; see
14135# is_code_point_usable() in mktables for details.
99870f4d
KW
14136
14137# Standard test packages are not used because this manipulates SIG_WARN. It
14138# exits 0 if every non-skipped test succeeded; -1 if any failed.
14139
5beb625e
JH
14140my $Tests = 0;
14141my $Fails = 0;
99870f4d 14142
99870f4d
KW
14143sub Expect($$$$) {
14144 my $expected = shift;
14145 my $ord = shift;
14146 my $regex = shift;
14147 my $warning_type = shift; # Type of warning message, like 'deprecated'
14148 # or empty if none
14149 my $line = (caller)[2];
66fd7fd0 14150 $ord = ord(latin1_to_native(chr($ord)));
37e2e78e 14151
99870f4d 14152 # Convert the code point to hex form
23e33b60 14153 my $string = sprintf "\"\\x{%04X}\"", $ord;
99870f4d 14154
99870f4d 14155 my @tests = "";
5beb625e 14156
37e2e78e
KW
14157 # The first time through, use all warnings. If the input should generate
14158 # a warning, add another time through with them turned off
99870f4d
KW
14159 push @tests, "no warnings '$warning_type';" if $warning_type;
14160
14161 foreach my $no_warnings (@tests) {
14162
14163 # Store any warning messages instead of outputting them
14164 local $SIG{__WARN__} = $SIG{__WARN__};
14165 my $warning_message;
14166 $SIG{__WARN__} = sub { $warning_message = $_[0] };
14167
14168 $Tests++;
14169
14170 # A string eval is needed because of the 'no warnings'.
14171 # Assumes no parens in the regular expression
14172 my $result = eval "$no_warnings
14173 my \$RegObj = qr($regex);
14174 $string =~ \$RegObj ? 1 : 0";
14175 if (not defined $result) {
14176 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
14177 $Fails++;
14178 }
14179 elsif ($result ^ $expected) {
14180 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
14181 $Fails++;
14182 }
14183 elsif ($warning_message) {
14184 if (! $warning_type || ($warning_type && $no_warnings)) {
14185 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
14186 $Fails++;
14187 }
14188 else {
14189 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
14190 }
14191 }
14192 elsif ($warning_type && ! $no_warnings) {
14193 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
14194 $Fails++;
14195 }
14196 else {
14197 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
14198 }
5beb625e 14199 }
99870f4d 14200 return;
5beb625e 14201}
d73e5302 14202
99870f4d
KW
14203sub Error($) {
14204 my $regex = shift;
5beb625e 14205 $Tests++;
99870f4d 14206 if (eval { 'x' =~ qr/$regex/; 1 }) {
5beb625e 14207 $Fails++;
99870f4d
KW
14208 my $line = (caller)[2];
14209 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
5beb625e 14210 }
99870f4d
KW
14211 else {
14212 my $line = (caller)[2];
14213 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
14214 }
14215 return;
5beb625e
JH
14216}
14217
37e2e78e
KW
14218# GCBTest.txt character that separates grapheme clusters
14219my $breakable_utf8 = my $breakable = chr(0xF7);
14220utf8::upgrade($breakable_utf8);
14221
14222# GCBTest.txt character that indicates that the adjoining code points are part
14223# of the same grapheme cluster
14224my $nobreak_utf8 = my $nobreak = chr(0xD7);
14225utf8::upgrade($nobreak_utf8);
14226
14227sub Test_X($) {
14228 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
14229 # Each such line is a sequence of code points given by their hex numbers,
14230 # separated by the two characters defined just before this subroutine that
14231 # indicate that either there can or cannot be a break between the adjacent
14232 # code points. If there isn't a break, that means the sequence forms an
14233 # extended grapheme cluster, which means that \X should match the whole
14234 # thing. If there is a break, \X should stop there. This is all
14235 # converted by this routine into a match:
14236 # $string =~ /(\X)/,
14237 # Each \X should match the next cluster; and that is what is checked.
14238
14239 my $template = shift;
14240
14241 my $line = (caller)[2];
14242
14243 # The line contains characters above the ASCII range, but in Latin1. It
14244 # may or may not be in utf8, and if it is, it may or may not know it. So,
14245 # convert these characters to 8 bits. If knows is in utf8, simply
14246 # downgrade.
14247 if (utf8::is_utf8($template)) {
14248 utf8::downgrade($template);
14249 } else {
14250
14251 # Otherwise, if it is in utf8, but doesn't know it, the next lines
14252 # convert the two problematic characters to their 8-bit equivalents.
14253 # If it isn't in utf8, they don't harm anything.
14254 use bytes;
14255 $template =~ s/$nobreak_utf8/$nobreak/g;
14256 $template =~ s/$breakable_utf8/$breakable/g;
14257 }
14258
14259 # Get rid of the leading and trailing breakables
14260 $template =~ s/^ \s* $breakable \s* //x;
14261 $template =~ s/ \s* $breakable \s* $ //x;
14262
14263 # And no-breaks become just a space.
14264 $template =~ s/ \s* $nobreak \s* / /xg;
14265
14266 # Split the input into segments that are breakable between them.
14267 my @segments = split /\s*$breakable\s*/, $template;
14268
14269 my $string = "";
14270 my $display_string = "";
14271 my @should_match;
14272 my @should_display;
14273
14274 # Convert the code point sequence in each segment into a Perl string of
14275 # characters
14276 foreach my $segment (@segments) {
14277 my @code_points = split /\s+/, $segment;
14278 my $this_string = "";
14279 my $this_display = "";
14280 foreach my $code_point (@code_points) {
66fd7fd0 14281 $this_string .= latin1_to_native(chr(hex $code_point));
37e2e78e
KW
14282 $this_display .= "\\x{$code_point}";
14283 }
14284
14285 # The next cluster should match the string in this segment.
14286 push @should_match, $this_string;
14287 push @should_display, $this_display;
14288 $string .= $this_string;
14289 $display_string .= $this_display;
14290 }
14291
14292 # If a string can be represented in both non-ut8 and utf8, test both cases
14293 UPGRADE:
14294 for my $to_upgrade (0 .. 1) {
678f13d5 14295
37e2e78e
KW
14296 if ($to_upgrade) {
14297
14298 # If already in utf8, would just be a repeat
14299 next UPGRADE if utf8::is_utf8($string);
14300
14301 utf8::upgrade($string);
14302 }
14303
14304 # Finally, do the \X match.
14305 my @matches = $string =~ /(\X)/g;
14306
14307 # Look through each matched cluster to verify that it matches what we
14308 # expect.
14309 my $min = (@matches < @should_match) ? @matches : @should_match;
14310 for my $i (0 .. $min - 1) {
14311 $Tests++;
14312 if ($matches[$i] eq $should_match[$i]) {
14313 print "ok $Tests - ";
14314 if ($i == 0) {
14315 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
14316 } else {
14317 print "And \\X #", $i + 1,
14318 }
14319 print " correctly matched $should_display[$i]; line $line\n";
14320 } else {
14321 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
14322 unpack("U*", $matches[$i]));
14323 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
14324 $i + 1,
14325 " should have matched $should_display[$i]",
14326 " but instead matched $matches[$i]",
14327 ". Abandoning rest of line $line\n";
14328 next UPGRADE;
14329 }
14330 }
14331
14332 # And the number of matches should equal the number of expected matches.
14333 $Tests++;
14334 if (@matches == @should_match) {
14335 print "ok $Tests - Nothing was left over; line $line\n";
14336 } else {
14337 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
14338 }
14339 }
14340
14341 return;
14342}
14343
99870f4d 14344sub Finished() {
f86864ac 14345 print "1..$Tests\n";
99870f4d 14346 exit($Fails ? -1 : 0);
5beb625e 14347}
99870f4d
KW
14348
14349Error('\p{Script=InGreek}'); # Bug #69018
37e2e78e 14350Test_X("1100 $nobreak 1161"); # Bug #70940
ae5b72c8
KW
14351Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
14352Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
eadadd41 14353Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726