3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # Needs 'no overloading' to run faster on miniperl. Code commented out at the
8 # subroutine objaddr can be used instead to work as far back (untested) as
9 # 5.8: needs pack "U". But almost all occurrences of objaddr have been
10 # removed in favor of using 'no overloading'. You also would have to go
11 # through and replace occurrences like:
12 # my $addr = do { no overloading; pack 'J', $self; }
14 # my $addr = main::objaddr $self;
15 # (or reverse commit 9b01bafde4b022706c3d6f947a0963f821b2e50b
16 # that instituted the change to main::objaddr, and subsequent commits that
17 # changed 0+$self to pack 'J', $self.)
20 BEGIN { # Get the time the script started running; do it at compilation to
21 # get it as close as possible
36 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
37 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
39 ##########################################################################
41 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
42 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
43 # a pod file and .t files, depending on option parameters.
45 # The structure of this file is:
46 # First these introductory comments; then
47 # code needed for everywhere, such as debugging stuff; then
48 # code to handle input parameters; then
49 # data structures likely to be of external interest (some of which depend on
50 # the input parameters, so follows them; then
51 # more data structures and subroutine and package (class) definitions; then
52 # the small actual loop to process the input files and finish up; then
53 # a __DATA__ section, for the .t tests
55 # This program works on all releases of Unicode so far. The outputs have been
56 # scrutinized most intently for release 5.1. The others have been checked for
57 # somewhat more than just sanity. It can handle all non-provisional Unicode
58 # character properties in those releases.
60 # This program is mostly about Unicode character (or code point) properties.
61 # A property describes some attribute or quality of a code point, like if it
62 # is lowercase or not, its name, what version of Unicode it was first defined
63 # in, or what its uppercase equivalent is. Unicode deals with these disparate
64 # possibilities by making all properties into mappings from each code point
65 # into some corresponding value. In the case of it being lowercase or not,
66 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
67 # property maps each Unicode code point to a single value, called a "property
68 # value". (Some more recently defined properties, map a code point to a set
71 # When using a property in a regular expression, what is desired isn't the
72 # mapping of the code point to its property's value, but the reverse (or the
73 # mathematical "inverse relation"): starting with the property value, "Does a
74 # code point map to it?" These are written in a "compound" form:
75 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
76 # files containing the lists of code points that map to each such regular
77 # expression property value, one file per list
79 # There is also a single form shortcut that Perl adds for many of the commonly
80 # used properties. This happens for all binary properties, plus script,
81 # general_category, and block properties.
83 # Thus the outputs of this program are files. There are map files, mostly in
84 # the 'To' directory; and there are list files for use in regular expression
85 # matching, all in subdirectories of the 'lib' directory, with each
86 # subdirectory being named for the property that the lists in it are for.
87 # Bookkeeping, test, and documentation files are also generated.
89 my $matches_directory = 'lib'; # Where match (\p{}) files go.
90 my $map_directory = 'To'; # Where map files go.
94 # The major data structures of this program are Property, of course, but also
95 # Table. There are two kinds of tables, very similar to each other.
96 # "Match_Table" is the data structure giving the list of code points that have
97 # a particular property value, mentioned above. There is also a "Map_Table"
98 # data structure which gives the property's mapping from code point to value.
99 # There are two structures because the match tables need to be combined in
100 # various ways, such as constructing unions, intersections, complements, etc.,
101 # and the map ones don't. And there would be problems, perhaps subtle, if
102 # a map table were inadvertently operated on in some of those ways.
103 # The use of separate classes with operations defined on one but not the other
104 # prevents accidentally confusing the two.
106 # At the heart of each table's data structure is a "Range_List", which is just
107 # an ordered list of "Ranges", plus ancillary information, and methods to
108 # operate on them. A Range is a compact way to store property information.
109 # Each range has a starting code point, an ending code point, and a value that
110 # is meant to apply to all the code points between the two end points,
111 # inclusive. For a map table, this value is the property value for those
112 # code points. Two such ranges could be written like this:
113 # 0x41 .. 0x5A, 'Upper',
114 # 0x61 .. 0x7A, 'Lower'
116 # Each range also has a type used as a convenience to classify the values.
117 # Most ranges in this program will be Type 0, or normal, but there are some
118 # ranges that have a non-zero type. These are used only in map tables, and
119 # are for mappings that don't fit into the normal scheme of things. Mappings
120 # that require a hash entry to communicate with utf8.c are one example;
121 # another example is mappings for charnames.pm to use which indicate a name
122 # that is algorithmically determinable from its code point (and the reverse).
123 # These are used to significantly compact these tables, instead of listing
124 # each one of the tens of thousands individually.
126 # In a match table, the value of a range is irrelevant (and hence the type as
127 # well, which will always be 0), and arbitrarily set to the null string.
128 # Using the example above, there would be two match tables for those two
129 # entries, one named Upper would contain the 0x41..0x5A range, and the other
130 # named Lower would contain 0x61..0x7A.
132 # Actually, there are two types of range lists, "Range_Map" is the one
133 # associated with map tables, and "Range_List" with match tables.
134 # Again, this is so that methods can be defined on one and not the others so
135 # as to prevent operating on them in incorrect ways.
137 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
138 # in the perl core. All tables could in theory be written, but some are
139 # suppressed because there is no current practical use for them. It is easy
140 # to change which get written by changing various lists that are near the top
141 # of the actual code in this file. The table data structures contain enough
142 # ancillary information to allow them to be treated as separate entities for
143 # writing, such as the path to each one's file. There is a heading in each
144 # map table that gives the format of its entries, and what the map is for all
145 # the code points missing from it. (This allows tables to be more compact.)
147 # The Property data structure contains one or more tables. All properties
148 # contain a map table (except the $perl property which is a
149 # pseudo-property containing only match tables), and any properties that
150 # are usable in regular expression matches also contain various matching
151 # tables, one for each value the property can have. A binary property can
152 # have two values, True and False (or Y and N, which are preferred by Unicode
153 # terminology). Thus each of these properties will have a map table that
154 # takes every code point and maps it to Y or N (but having ranges cuts the
155 # number of entries in that table way down), and two match tables, one
156 # which has a list of all the code points that map to Y, and one for all the
157 # code points that map to N. (For each binary property, a third table is also
158 # generated for the pseudo Perl property. It contains the identical code
159 # points as the Y table, but can be written in regular expressions, not in the
160 # compound form, but in a "single" form like \p{IsUppercase}.) Many
161 # properties are binary, but some properties have several possible values,
162 # some have many, and properties like Name have a different value for every
163 # named code point. Those will not, unless the controlling lists are changed,
164 # have their match tables written out. But all the ones which can be used in
165 # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
166 # a property would have either its map table or its match tables written but
167 # not both. Again, what gets written is controlled by lists which can easily
168 # be changed. Starting in 5.14, advantage was taken of this, and all the map
169 # tables needed to reconstruct the Unicode db are now written out, while
170 # suppressing the Unicode .txt files that contain the data. Our tables are
171 # much more compact than the .txt files, so a significant space savings was
172 # achieved. Also, tables are not written out that are trivially derivable
173 # from tables that do get written. So, there typically is no file containing
174 # the code points not matched by a binary property (the table for \P{} versus
175 # lowercase \p{}), since you just need to invert the True table to get the
178 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
179 # how many match tables there are and the content of the maps. This 'Type' is
180 # different than a range 'Type', so don't get confused by the two concepts
181 # having the same name.
183 # For information about the Unicode properties, see Unicode's UAX44 document:
185 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
187 # As stated earlier, this program will work on any release of Unicode so far.
188 # Most obvious problems in earlier data have NOT been corrected except when
189 # necessary to make Perl or this program work reasonably, and to keep out
190 # potential security issues. For example, no folding information was given in
191 # early releases, so this program substitutes lower case instead, just so that
192 # a regular expression with the /i option will do something that actually
193 # gives the right results in many cases. There are also a couple other
194 # corrections for version 1.1.5, commented at the point they are made. As an
195 # example of corrections that weren't made (but could be) is this statement
196 # from DerivedAge.txt: "The supplementary private use code points and the
197 # non-character code points were assigned in version 2.0, but not specifically
198 # listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
199 # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
200 # further down in these introductory comments.
202 # This program works on all non-provisional properties as of the current
203 # Unicode release, though the files for some are suppressed for various
204 # reasons. You can change which are output by changing lists in this program.
206 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
207 # loose matchings rules (from Unicode TR18):
209 # The recommended names for UCD properties and property values are in
210 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
211 # [PropValue]. There are both abbreviated names and longer, more
212 # descriptive names. It is strongly recommended that both names be
213 # recognized, and that loose matching of property names be used,
214 # whereby the case distinctions, whitespace, hyphens, and underbar
217 # The program still allows Fuzzy to override its determination of if loose
218 # matching should be used, but it isn't currently used, as it is no longer
219 # needed; the calculations it makes are good enough.
221 # SUMMARY OF HOW IT WORKS:
225 # A list is constructed containing each input file that is to be processed
227 # Each file on the list is processed in a loop, using the associated handler
229 # The PropertyAliases.txt and PropValueAliases.txt files are processed
230 # first. These files name the properties and property values.
231 # Objects are created of all the property and property value names
232 # that the rest of the input should expect, including all synonyms.
233 # The other input files give mappings from properties to property
234 # values. That is, they list code points and say what the mapping
235 # is under the given property. Some files give the mappings for
236 # just one property; and some for many. This program goes through
237 # each file and populates the properties and their map tables from
238 # them. Some properties are listed in more than one file, and
239 # Unicode has set up a precedence as to which has priority if there
240 # is a conflict. Thus the order of processing matters, and this
241 # program handles the conflict possibility by processing the
242 # overriding input files last, so that if necessary they replace
244 # After this is all done, the program creates the property mappings not
245 # furnished by Unicode, but derivable from what it does give.
246 # The tables of code points that match each property value in each
247 # property that is accessible by regular expressions are created.
248 # The Perl-defined properties are created and populated. Many of these
249 # require data determined from the earlier steps
250 # Any Perl-defined synonyms are created, and name clashes between Perl
251 # and Unicode are reconciled and warned about.
252 # All the properties are written to files
253 # Any other files are written, and final warnings issued.
255 # For clarity, a number of operators have been overloaded to work on tables:
256 # ~ means invert (take all characters not in the set). The more
257 # conventional '!' is not used because of the possibility of confusing
258 # it with the actual boolean operation.
260 # - means subtraction
261 # & means intersection
262 # The precedence of these is the order listed. Parentheses should be
263 # copiously used. These are not a general scheme. The operations aren't
264 # defined for a number of things, deliberately, to avoid getting into trouble.
265 # Operations are done on references and affect the underlying structures, so
266 # that the copy constructors for them have been overloaded to not return a new
267 # clone, but the input object itself.
269 # The bool operator is deliberately not overloaded to avoid confusion with
270 # "should it mean if the object merely exists, or also is non-empty?".
272 # WHY CERTAIN DESIGN DECISIONS WERE MADE
274 # This program needs to be able to run under miniperl. Therefore, it uses a
275 # minimum of other modules, and hence implements some things itself that could
276 # be gotten from CPAN
278 # This program uses inputs published by the Unicode Consortium. These can
279 # change incompatibly between releases without the Perl maintainers realizing
280 # it. Therefore this program is now designed to try to flag these. It looks
281 # at the directories where the inputs are, and flags any unrecognized files.
282 # It keeps track of all the properties in the files it handles, and flags any
283 # that it doesn't know how to handle. It also flags any input lines that
284 # don't match the expected syntax, among other checks.
286 # It is also designed so if a new input file matches one of the known
287 # templates, one hopefully just needs to add it to a list to have it
290 # As mentioned earlier, some properties are given in more than one file. In
291 # particular, the files in the extracted directory are supposedly just
292 # reformattings of the others. But they contain information not easily
293 # derivable from the other files, including results for Unihan, which this
294 # program doesn't ordinarily look at, and for unassigned code points. They
295 # also have historically had errors or been incomplete. In an attempt to
296 # create the best possible data, this program thus processes them first to
297 # glean information missing from the other files; then processes those other
298 # files to override any errors in the extracted ones. Much of the design was
299 # driven by this need to store things and then possibly override them.
301 # It tries to keep fatal errors to a minimum, to generate something usable for
302 # testing purposes. It always looks for files that could be inputs, and will
303 # warn about any that it doesn't know how to handle (the -q option suppresses
306 # Why is there more than one type of range?
307 # This simplified things. There are some very specialized code points that
308 # have to be handled specially for output, such as Hangul syllable names.
309 # By creating a range type (done late in the development process), it
310 # allowed this to be stored with the range, and overridden by other input.
311 # Originally these were stored in another data structure, and it became a
312 # mess trying to decide if a second file that was for the same property was
313 # overriding the earlier one or not.
315 # Why are there two kinds of tables, match and map?
316 # (And there is a base class shared by the two as well.) As stated above,
317 # they actually are for different things. Development proceeded much more
318 # smoothly when I (khw) realized the distinction. Map tables are used to
319 # give the property value for every code point (actually every code point
320 # that doesn't map to a default value). Match tables are used for regular
321 # expression matches, and are essentially the inverse mapping. Separating
322 # the two allows more specialized methods, and error checks so that one
323 # can't just take the intersection of two map tables, for example, as that
326 # What about 'fate' and 'status'. The concept of a table's fate was created
327 # late when it became clear that something more was needed. The difference
328 # between this and 'status' is unclean, and could be improved if someone
329 # wanted to spend the effort.
333 # This program is written so it will run under miniperl. Occasionally changes
334 # will cause an error where the backtrace doesn't work well under miniperl.
335 # To diagnose the problem, you can instead run it under regular perl, if you
338 # There is a good trace facility. To enable it, first sub DEBUG must be set
339 # to return true. Then a line like
341 # local $to_trace = 1 if main::DEBUG;
343 # can be added to enable tracing in its lexical scope (plus dynamic) or until
344 # you insert another line:
346 # local $to_trace = 0 if main::DEBUG;
348 # To actually trace, use a line like "trace $a, @b, %c, ...;
350 # Some of the more complex subroutines already have trace statements in them.
351 # Permanent trace statements should be like:
353 # trace ... if main::DEBUG && $to_trace;
355 # If there is just one or a few files that you're debugging, you can easily
356 # cause most everything else to be skipped. Change the line
358 # my $debug_skip = 0;
360 # to 1, and every file whose object is in @input_file_objects and doesn't have
361 # a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
362 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
364 # To compare the output tables, it may be useful to specify the -annotate
365 # flag. This causes the tables to expand so there is one entry for each
366 # non-algorithmically named code point giving, currently its name, and its
367 # graphic representation if printable (and you have a font that knows about
368 # it). This makes it easier to see what the particular code points are in
369 # each output table. The tables are usable, but because they don't have
370 # ranges (for the most part), a Perl using them will run slower. Non-named
371 # code points are annotated with a description of their status, and contiguous
372 # ones with the same description will be output as a range rather than
373 # individually. Algorithmically named characters are also output as ranges,
374 # except when there are just a few contiguous ones.
378 # The program would break if Unicode were to change its names so that
379 # interior white space, underscores, or dashes differences were significant
380 # within property and property value names.
382 # It might be easier to use the xml versions of the UCD if this program ever
383 # would need heavy revision, and the ability to handle old versions was not
386 # There is the potential for name collisions, in that Perl has chosen names
387 # that Unicode could decide it also likes. There have been such collisions in
388 # the past, with mostly Perl deciding to adopt the Unicode definition of the
389 # name. However in the 5.2 Unicode beta testing, there were a number of such
390 # collisions, which were withdrawn before the final release, because of Perl's
391 # and other's protests. These all involved new properties which began with
392 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
393 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
394 # Unicode document, so they are unlikely to be used by Unicode for another
395 # purpose. However, they might try something beginning with 'In', or use any
396 # of the other Perl-defined properties. This program will warn you of name
397 # collisions, and refuse to generate tables with them, but manual intervention
398 # will be required in this event. One scheme that could be implemented, if
399 # necessary, would be to have this program generate another file, or add a
400 # field to mktables.lst that gives the date of first definition of a property.
401 # Each new release of Unicode would use that file as a basis for the next
402 # iteration. And the Perl synonym addition code could sort based on the age
403 # of the property, so older properties get priority, and newer ones that clash
404 # would be refused; hence existing code would not be impacted, and some other
405 # synonym would have to be used for the new property. This is ugly, and
406 # manual intervention would certainly be easier to do in the short run; lets
407 # hope it never comes to this.
411 # This program can generate tables from the Unihan database. But it doesn't
412 # by default, letting the CPAN module Unicode::Unihan handle them. Prior to
413 # version 5.2, this database was in a single file, Unihan.txt. In 5.2 the
414 # database was split into 8 different files, all beginning with the letters
415 # 'Unihan'. This program will read those file(s) if present, but it needs to
416 # know which of the many properties in the file(s) should have tables created
417 # for them. It will create tables for any properties listed in
418 # PropertyAliases.txt and PropValueAliases.txt, plus any listed in the
419 # @cjk_properties array and the @cjk_property_values array. Thus, if a
420 # property you want is not in those files of the release you are building
421 # against, you must add it to those two arrays. Starting in 4.0, the
422 # Unicode_Radical_Stroke was listed in those files, so if the Unihan database
423 # is present in the directory, a table will be generated for that property.
424 # In 5.2, several more properties were added. For your convenience, the two
425 # arrays are initialized with all the 6.0 listed properties that are also in
426 # earlier releases. But these are commented out. You can just uncomment the
427 # ones you want, or use them as a template for adding entries for other
430 # You may need to adjust the entries to suit your purposes. setup_unihan(),
431 # and filter_unihan_line() are the functions where this is done. This program
432 # already does some adjusting to make the lines look more like the rest of the
433 # Unicode DB; You can see what that is in filter_unihan_line()
435 # There is a bug in the 3.2 data file in which some values for the
436 # kPrimaryNumeric property have commas and an unexpected comment. A filter
437 # could be added for these; or for a particular installation, the Unihan.txt
438 # file could be edited to fix them.
440 # HOW TO ADD A FILE TO BE PROCESSED
442 # A new file from Unicode needs to have an object constructed for it in
443 # @input_file_objects, probably at the end or at the end of the extracted
444 # ones. The program should warn you if its name will clash with others on
445 # restrictive file systems, like DOS. If so, figure out a better name, and
446 # add lines to the README.perl file giving that. If the file is a character
447 # property, it should be in the format that Unicode has implicitly
448 # standardized for such files for the more recently introduced ones.
449 # If so, the Input_file constructor for @input_file_objects can just be the
450 # file name and release it first appeared in. If not, then it should be
451 # possible to construct an each_line_handler() to massage the line into the
454 # For non-character properties, more code will be needed. You can look at
455 # the existing entries for clues.
457 # UNICODE VERSIONS NOTES
459 # The Unicode UCD has had a number of errors in it over the versions. And
460 # these remain, by policy, in the standard for that version. Therefore it is
461 # risky to correct them, because code may be expecting the error. So this
462 # program doesn't generally make changes, unless the error breaks the Perl
463 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
464 # for U+1105, which causes real problems for the algorithms for Jamo
465 # calculations, so it is changed here.
467 # But it isn't so clear cut as to what to do about concepts that are
468 # introduced in a later release; should they extend back to earlier releases
469 # where the concept just didn't exist? It was easier to do this than to not,
470 # so that's what was done. For example, the default value for code points not
471 # in the files for various properties was probably undefined until changed by
472 # some version. No_Block for blocks is such an example. This program will
473 # assign No_Block even in Unicode versions that didn't have it. This has the
474 # benefit that code being written doesn't have to special case earlier
475 # versions; and the detriment that it doesn't match the Standard precisely for
476 # the affected versions.
478 # Here are some observations about some of the issues in early versions:
480 # Prior to version 3.0, there were 3 character decompositions. These are not
481 # handled by Unicode::Normalize, nor will it compile when presented a version
482 # that has them. However, you can trivially get it to compile by simply
483 # ignoring those decompositions, by changing the croak to a carp. At the time
484 # of this writing, the line (in cpan/Unicode-Normalize/mkheader) reads
486 # croak("Weird Canonical Decomposition of U+$h");
488 # Simply change to a carp. It will compile, but will not know about any three
489 # character decomposition.
491 # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
492 # that the reason is that the CJK block starting at 4E00 was removed from
493 # PropList, and was not put back in until 3.1.0. The Perl extension (the
494 # single property name \p{alpha}) has the correct values. But the compound
495 # form is simply not generated until 3.1, as it can be argued that prior to
496 # this release, this was not an official property. The comments for
497 # filter_old_style_proplist() give more details.
499 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
500 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
501 # reason is that 3.2 introduced U+205F=medium math space, which was not
502 # classed as white space, but Perl figured out that it should have been. 4.0
503 # reclassified it correctly.
505 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
506 # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
507 # became 202, and ATBL was left with no code points, as all the ones that
508 # mapped to 202 stayed mapped to 202. Thus if your program used the numeric
509 # name for the class, it would not have been affected, but if it used the
510 # mnemonic, it would have been.
512 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that code
513 # points which eventually came to have this script property value, instead
514 # mapped to "Unknown". But in the next release all these code points were
515 # moved to \p{sc=common} instead.
517 # The default for missing code points for BidiClass is complicated. Starting
518 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
519 # tries to do the best it can for earlier releases. It is done in
520 # process_PropertyAliases()
522 # In version 2.1.2, the entry in UnicodeData.txt:
523 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
525 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
526 # Without this change, there are casing problems for this character.
528 ##############################################################################
530 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
532 my $MAX_LINE_WIDTH = 78;
534 # Debugging aid to skip most files so as to not be distracted by them when
535 # concentrating on the ones being debugged. Add
537 # to the constructor for those files you want processed when you set this.
538 # Files with a first version number of 0 are special: they are always
539 # processed regardless of the state of this flag. Generally, Jamo.txt and
540 # UnicodeData.txt must not be skipped if you want this program to not die
541 # before normal completion.
545 # Normally these are suppressed.
546 my $write_Unicode_deprecated_tables = 0;
548 # Set to 1 to enable tracing.
551 { # Closure for trace: debugging aid
552 my $print_caller = 1; # ? Include calling subroutine name
553 my $main_with_colon = 'main::';
554 my $main_colon_length = length($main_with_colon);
557 return unless $to_trace; # Do nothing if global flag not set
561 local $DB::trace = 0;
562 $DB::trace = 0; # Quiet 'used only once' message
566 # Loop looking up the stack to get the first non-trace caller
571 $line_number = $caller_line;
572 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
573 $caller = $main_with_colon unless defined $caller;
575 $caller_name = $caller;
578 $caller_name =~ s/.*:://;
579 if (substr($caller_name, 0, $main_colon_length)
582 $caller_name = substr($caller_name, $main_colon_length);
585 } until ($caller_name ne 'trace');
587 # If the stack was empty, we were called from the top level
588 $caller_name = 'main' if ($caller_name eq ""
589 || $caller_name eq 'trace');
592 foreach my $string (@input) {
593 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
594 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
595 $output .= simple_dumper($string);
598 $string = "$string" if ref $string;
599 $string = $UNDEF unless defined $string;
601 $string = '""' if $string eq "";
602 $output .= " " if $output ne ""
604 && substr($output, -1, 1) ne " "
605 && substr($string, 0, 1) ne " ";
610 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
611 print STDERR "$caller_name: " if $print_caller;
612 print STDERR $output, "\n";
617 # This is for a rarely used development feature that allows you to compare two
618 # versions of the Unicode standard without having to deal with changes caused
619 # by the code points introduced in the later version. Change the 0 to a
620 # string containing a SINGLE dotted Unicode release number (e.g. "2.1"). Only
621 # code points introduced in that release and earlier will be used; later ones
622 # are thrown away. You use the version number of the earliest one you want to
623 # compare; then run this program on directory structures containing each
624 # release, and compare the outputs. These outputs will therefore include only
625 # the code points common to both releases, and you can see the changes caused
626 # just by the underlying release semantic changes. For versions earlier than
627 # 3.2, you must copy a version of DAge.txt into the directory.
628 my $string_compare_versions = DEBUG && 0; # e.g., "2.1";
629 my $compare_versions = DEBUG
630 && $string_compare_versions
631 && pack "C*", split /\./, $string_compare_versions;
634 # Returns non-duplicated input values. From "Perl Best Practices:
635 # Encapsulated Cleverness". p. 455 in first edition.
638 # Arguably this breaks encapsulation, if the goal is to permit multiple
639 # distinct objects to stringify to the same value, and be interchangeable.
640 # However, for this program, no two objects stringify identically, and all
641 # lists passed to this function are either objects or strings. So this
642 # doesn't affect correctness, but it does give a couple of percent speedup.
644 return grep { ! $seen{$_}++ } @_;
647 $0 = File::Spec->canonpath($0);
649 my $make_test_script = 0; # ? Should we output a test script
650 my $make_norm_test_script = 0; # ? Should we output a normalization test script
651 my $write_unchanged_files = 0; # ? Should we update the output files even if
652 # we don't think they have changed
653 my $use_directory = ""; # ? Should we chdir somewhere.
654 my $pod_directory; # input directory to store the pod file.
655 my $pod_file = 'perluniprops';
656 my $t_path; # Path to the .t test file
657 my $file_list = 'mktables.lst'; # File to store input and output file names.
658 # This is used to speed up the build, by not
659 # executing the main body of the program if
660 # nothing on the list has changed since the
662 my $make_list = 1; # ? Should we write $file_list. Set to always
663 # make a list so that when the pumpking is
664 # preparing a release, s/he won't have to do
666 my $glob_list = 0; # ? Should we try to include unknown .txt files
668 my $output_range_counts = $debugging_build; # ? Should we include the number
669 # of code points in ranges in
671 my $annotate = 0; # ? Should character names be in the output
673 # Verbosity levels; 0 is quiet
674 my $NORMAL_VERBOSITY = 1;
678 my $verbosity = $NORMAL_VERBOSITY;
682 my $arg = shift @ARGV;
684 $verbosity = $VERBOSE;
686 elsif ($arg eq '-p') {
687 $verbosity = $PROGRESS;
688 $| = 1; # Flush buffers as we go.
690 elsif ($arg eq '-q') {
693 elsif ($arg eq '-w') {
694 $write_unchanged_files = 1; # update the files even if havent changed
696 elsif ($arg eq '-check') {
697 my $this = shift @ARGV;
698 my $ok = shift @ARGV;
700 print "Skipping as check params are not the same.\n";
704 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
705 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
707 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
709 $make_test_script = 1;
711 elsif ($arg eq '-makenormtest')
713 $make_norm_test_script = 1;
715 elsif ($arg eq '-makelist') {
718 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
719 -d $use_directory or croak "Unknown directory '$use_directory'";
721 elsif ($arg eq '-L') {
723 # Existence not tested until have chdir'd
726 elsif ($arg eq '-globlist') {
729 elsif ($arg eq '-c') {
730 $output_range_counts = ! $output_range_counts
732 elsif ($arg eq '-annotate') {
734 $debugging_build = 1;
735 $output_range_counts = 1;
739 $with_c .= 'out' if $output_range_counts; # Complements the state
741 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
742 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
744 -c : Output comments $with_c number of code points in ranges
745 -q : Quiet Mode: Only output serious warnings.
746 -p : Set verbosity level to normal plus show progress.
747 -v : Set Verbosity level high: Show progress and non-serious
749 -w : Write files regardless
750 -C dir : Change to this directory before proceeding. All relative paths
751 except those specified by the -P and -T options will be done
752 with respect to this directory.
753 -P dir : Output $pod_file file to directory 'dir'.
754 -T path : Create a test script as 'path'; overrides -maketest
755 -L filelist : Use alternate 'filelist' instead of standard one
756 -globlist : Take as input all non-Test *.txt files in current and sub
758 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
760 -makelist : Rewrite the file list $file_list based on current setup
761 -annotate : Output an annotation for each character in the table files;
762 useful for debugging mktables, looking at diffs; but is slow,
763 memory intensive; resulting tables are usable but are slow and
764 very large (and currently fail the Unicode::UCD.t tests).
765 -check A B : Executes $0 only if A and B are the same
770 # Stores the most-recently changed file. If none have changed, can skip the
772 my $most_recent = (stat $0)[9]; # Do this before the chdir!
774 # Change directories now, because need to read 'version' early.
775 if ($use_directory) {
776 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
777 $pod_directory = File::Spec->rel2abs($pod_directory);
779 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
780 $t_path = File::Spec->rel2abs($t_path);
782 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
783 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
784 $pod_directory = File::Spec->abs2rel($pod_directory);
786 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
787 $t_path = File::Spec->abs2rel($t_path);
791 # Get Unicode version into regular and v-string. This is done now because
792 # various tables below get populated based on it. These tables are populated
793 # here to be near the top of the file, and so easily seeable by those needing
795 open my $VERSION, "<", "version"
796 or croak "$0: can't open required file 'version': $!\n";
797 my $string_version = <$VERSION>;
799 chomp $string_version;
800 my $v_version = pack "C*", split /\./, $string_version; # v string
802 # The following are the complete names of properties with property values that
803 # are known to not match any code points in some versions of Unicode, but that
804 # may change in the future so they should be matchable, hence an empty file is
805 # generated for them.
806 my @tables_that_may_be_empty = (
807 'Joining_Type=Left_Joining',
809 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
810 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
811 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
812 if $v_version ge v4.1.0;
813 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
814 if $v_version ge v6.0.0;
815 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
816 if $v_version ge v6.1.0;
817 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
818 if $v_version ge v6.2.0;
820 # The lists below are hashes, so the key is the item in the list, and the
821 # value is the reason why it is in the list. This makes generation of
822 # documentation easier.
824 my %why_suppressed; # No file generated for these.
826 # Files aren't generated for empty extraneous properties. This is arguable.
827 # Extraneous properties generally come about because a property is no longer
828 # used in a newer version of Unicode. If we generated a file without code
829 # points, programs that used to work on that property will still execute
830 # without errors. It just won't ever match (or will always match, with \P{}).
831 # This means that the logic is now likely wrong. I (khw) think its better to
832 # find this out by getting an error message. Just move them to the table
833 # above to change this behavior
834 my %why_suppress_if_empty_warn_if_not = (
836 # It is the only property that has ever officially been removed from the
837 # Standard. The database never contained any code points for it.
838 'Special_Case_Condition' => 'Obsolete',
840 # Apparently never official, but there were code points in some versions of
841 # old-style PropList.txt
842 'Non_Break' => 'Obsolete',
845 # These would normally go in the warn table just above, but they were changed
846 # a long time before this program was written, so warnings about them are
848 if ($v_version gt v3.2.0) {
849 push @tables_that_may_be_empty,
850 'Canonical_Combining_Class=Attached_Below_Left'
853 # These are listed in the Property aliases file in 6.0, but Unihan is ignored
854 # unless explicitly added.
855 if ($v_version ge v5.2.0) {
856 my $unihan = 'Unihan; remove from list if using Unihan';
857 foreach my $table (qw (
861 kCompatibilityVariant
875 $why_suppress_if_empty_warn_if_not{$table} = $unihan;
879 # Enum values for to_output_map() method in the Map_Table package.
880 my $EXTERNAL_MAP = 1;
881 my $INTERNAL_MAP = 2;
882 my $OUTPUT_ADJUSTED = 3;
884 # To override computed values for writing the map tables for these properties.
885 # The default for enum map tables is to write them out, so that the Unicode
886 # .txt files can be removed, but all the data to compute any property value
887 # for any code point is available in a more compact form.
888 my %global_to_output_map = (
889 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
890 # get stuck supporting it if things change. Since it is a STRING
891 # property, it normally would be listed in the pod, but INTERNAL_MAP
893 Unicode_1_Name => $INTERNAL_MAP,
895 Present_In => 0, # Suppress, as easily computed from Age
896 Block => 0, # Suppress, as Blocks.txt is retained.
898 # Suppress, as mapping can be found instead from the
899 # Perl_Decomposition_Mapping file
900 Decomposition_Type => 0,
903 # Properties that this program ignores.
904 my @unimplemented_properties;
906 # With this release, it is automatically handled if the Unihan db is
908 push @unimplemented_properties, 'Unicode_Radical_Stroke' if $v_version le v5.2.0;
910 # There are several types of obsolete properties defined by Unicode. These
911 # must be hand-edited for every new Unicode release.
912 my %why_deprecated; # Generates a deprecated warning message if used.
913 my %why_stabilized; # Documentation only
914 my %why_obsolete; # Documentation only
917 my $simple = 'Perl uses the more complete version of this property';
918 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
920 my $other_properties = 'other properties';
921 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
922 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
925 'Grapheme_Link' => 'Deprecated by Unicode: Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
926 'Jamo_Short_Name' => $contributory,
927 'Line_Break=Surrogate' => 'Deprecated by Unicode because surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
928 'Other_Alphabetic' => $contributory,
929 'Other_Default_Ignorable_Code_Point' => $contributory,
930 'Other_Grapheme_Extend' => $contributory,
931 'Other_ID_Continue' => $contributory,
932 'Other_ID_Start' => $contributory,
933 'Other_Lowercase' => $contributory,
934 'Other_Math' => $contributory,
935 'Other_Uppercase' => $contributory,
936 'Expands_On_NFC' => $why_no_expand,
937 'Expands_On_NFD' => $why_no_expand,
938 'Expands_On_NFKC' => $why_no_expand,
939 'Expands_On_NFKD' => $why_no_expand,
943 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
944 # contains the same information, but without the algorithmically
945 # determinable Hangul syllables'. This file is not published, so it's
946 # existence is not noted in the comment.
947 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or Unicode::UCD::prop_invmap()',
949 'Indic_Matra_Category' => "Provisional",
950 'Indic_Syllabic_Category' => "Provisional",
952 # Don't suppress ISO_Comment, as otherwise special handling is needed
953 # to differentiate between it and gc=c, which can be written as 'isc',
954 # which is the same characters as ISO_Comment's short name.
956 'Name' => "Accessible via \\N{...} or 'use charnames;' or Unicode::UCD::prop_invmap()",
958 'Simple_Case_Folding' => "$simple. Can access this through Unicode::UCD::casefold or Unicode::UCD::prop_invmap()",
959 'Simple_Lowercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
960 'Simple_Titlecase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
961 'Simple_Uppercase_Mapping' => "$simple. Can access this through Unicode::UCD::charinfo or Unicode::UCD::prop_invmap()",
963 FC_NFKC_Closure => 'Supplanted in usage by NFKC_Casefold; otherwise not useful',
966 foreach my $property (
968 # The following are suppressed because they were made contributory
969 # or deprecated by Unicode before Perl ever thought about
978 # The following are suppressed because they have been marked
979 # as deprecated for a sufficient amount of time
981 'Other_Default_Ignorable_Code_Point',
982 'Other_Grapheme_Extend',
989 $why_suppressed{$property} = $why_deprecated{$property};
992 # Customize the message for all the 'Other_' properties
993 foreach my $property (keys %why_deprecated) {
994 next if (my $main_property = $property) !~ s/^Other_//;
995 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
999 if ($write_Unicode_deprecated_tables) {
1000 foreach my $property (keys %why_suppressed) {
1001 delete $why_suppressed{$property} if $property =~
1002 / ^ Other | Grapheme /x;
1006 if ($v_version ge 4.0.0) {
1007 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1008 if ($v_version ge 6.0.0) {
1009 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1012 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1013 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1014 if ($v_version ge 6.0.0) {
1015 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1019 # Probably obsolete forever
1020 if ($v_version ge v4.1.0) {
1021 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1023 if ($v_version ge v6.0.0) {
1024 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1025 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1028 # This program can create files for enumerated-like properties, such as
1029 # 'Numeric_Type'. This file would be the same format as for a string
1030 # property, with a mapping from code point to its value, so you could look up,
1031 # for example, the script a code point is in. But no one so far wants this
1032 # mapping, or they have found another way to get it since this is a new
1033 # feature. So no file is generated except if it is in this list.
1034 my @output_mapped_properties = split "\n", <<END;
1037 # If you are using the Unihan database in a Unicode version before 5.2, you
1038 # need to add the properties that you want to extract from it to this table.
1039 # For your convenience, the properties in the 6.0 PropertyAliases.txt file are
1040 # listed, commented out
1041 my @cjk_properties = split "\n", <<'END';
1042 #cjkAccountingNumeric; kAccountingNumeric
1043 #cjkOtherNumeric; kOtherNumeric
1044 #cjkPrimaryNumeric; kPrimaryNumeric
1045 #cjkCompatibilityVariant; kCompatibilityVariant
1046 #cjkIICore ; kIICore
1047 #cjkIRG_GSource; kIRG_GSource
1048 #cjkIRG_HSource; kIRG_HSource
1049 #cjkIRG_JSource; kIRG_JSource
1050 #cjkIRG_KPSource; kIRG_KPSource
1051 #cjkIRG_KSource; kIRG_KSource
1052 #cjkIRG_TSource; kIRG_TSource
1053 #cjkIRG_USource; kIRG_USource
1054 #cjkIRG_VSource; kIRG_VSource
1055 #cjkRSUnicode; kRSUnicode ; Unicode_Radical_Stroke; URS
1058 # Similarly for the property values. For your convenience, the lines in the
1059 # 6.0 PropertyAliases.txt file are listed. Just remove the first BUT NOT both
1060 # '#' marks (for Unicode versions before 5.2)
1061 my @cjk_property_values = split "\n", <<'END';
1062 ## @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1063 ## @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
1064 ## @missing: 0000..10FFFF; cjkIICore; <none>
1065 ## @missing: 0000..10FFFF; cjkIRG_GSource; <none>
1066 ## @missing: 0000..10FFFF; cjkIRG_HSource; <none>
1067 ## @missing: 0000..10FFFF; cjkIRG_JSource; <none>
1068 ## @missing: 0000..10FFFF; cjkIRG_KPSource; <none>
1069 ## @missing: 0000..10FFFF; cjkIRG_KSource; <none>
1070 ## @missing: 0000..10FFFF; cjkIRG_TSource; <none>
1071 ## @missing: 0000..10FFFF; cjkIRG_USource; <none>
1072 ## @missing: 0000..10FFFF; cjkIRG_VSource; <none>
1073 ## @missing: 0000..10FFFF; cjkOtherNumeric; NaN
1074 ## @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
1075 ## @missing: 0000..10FFFF; cjkRSUnicode; <none>
1078 # The input files don't list every code point. Those not listed are to be
1079 # defaulted to some value. Below are hard-coded what those values are for
1080 # non-binary properties as of 5.1. Starting in 5.0, there are
1081 # machine-parsable comment lines in the files that give the defaults; so this
1082 # list shouldn't have to be extended. The claim is that all missing entries
1083 # for binary properties will default to 'N'. Unicode tried to change that in
1084 # 5.2, but the beta period produced enough protest that they backed off.
1086 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1087 # be in the form that it expects. The others may be synonyms.
1088 my $CODE_POINT = '<code point>';
1089 my %default_mapping = (
1090 Age => "Unassigned",
1091 # Bidi_Class => Complicated; set in code
1092 Bidi_Mirroring_Glyph => "",
1093 Block => 'No_Block',
1094 Canonical_Combining_Class => 0,
1095 Case_Folding => $CODE_POINT,
1096 Decomposition_Mapping => $CODE_POINT,
1097 Decomposition_Type => 'None',
1098 East_Asian_Width => "Neutral",
1099 FC_NFKC_Closure => $CODE_POINT,
1100 General_Category => 'Cn',
1101 Grapheme_Cluster_Break => 'Other',
1102 Hangul_Syllable_Type => 'NA',
1104 Jamo_Short_Name => "",
1105 Joining_Group => "No_Joining_Group",
1106 # Joining_Type => Complicated; set in code
1107 kIICore => 'N', # Is converted to binary
1108 #Line_Break => Complicated; set in code
1109 Lowercase_Mapping => $CODE_POINT,
1116 Numeric_Type => 'None',
1117 Numeric_Value => 'NaN',
1118 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1119 Sentence_Break => 'Other',
1120 Simple_Case_Folding => $CODE_POINT,
1121 Simple_Lowercase_Mapping => $CODE_POINT,
1122 Simple_Titlecase_Mapping => $CODE_POINT,
1123 Simple_Uppercase_Mapping => $CODE_POINT,
1124 Titlecase_Mapping => $CODE_POINT,
1125 Unicode_1_Name => "",
1126 Unicode_Radical_Stroke => "",
1127 Uppercase_Mapping => $CODE_POINT,
1128 Word_Break => 'Other',
1131 # Below are files that Unicode furnishes, but this program ignores, and why.
1132 # NormalizationCorrections.txt requires some more explanation. It documents
1133 # the cumulative fixes to erroneous normalizations in earlier Unicode
1134 # versions. Its main purpose is so that someone running on an earlier version
1135 # can use this file to override what got published in that earlier release.
1136 # It would be easy for mktables to read and handle this file. But all the
1137 # corrections in it should already be in the other files for the release it
1138 # is. To get it to actually mean something useful, someone would have to be
1139 # using an earlier Unicode release, and copy it to the files for that release
1140 # and recomplile. So far there has been no demand to do that, so this hasn't
1142 my %ignored_files = (
1143 'CJKRadicals.txt' => 'Maps the kRSUnicode property values to corresponding code points',
1144 'Index.txt' => 'Alphabetical index of Unicode characters',
1145 'NamedSqProv.txt' => 'Named sequences proposed for inclusion in a later version of the Unicode Standard; if you need them now, you can append this file to F<NamedSequences.txt> and recompile perl',
1146 'NamesList.txt' => 'Annotated list of characters',
1147 'NormalizationCorrections.txt' => 'Documentation of corrections already incorporated into the Unicode data base',
1148 'Props.txt' => 'Only in very early releases; is a subset of F<PropList.txt> (which is used instead)',
1149 'ReadMe.txt' => 'Documentation',
1150 'StandardizedVariants.txt' => 'Certain glyph variations for character display are standardized. This lists the non-Unihan ones; the Unihan ones are also not used by Perl, and are in a separate Unicode data base L<http://www.unicode.org/ivd>',
1151 'EmojiSources.txt' => 'Maps certain Unicode code points to their legacy Japanese cell-phone values',
1152 'USourceData.txt' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1153 'USourceData.pdf' => 'Documentation of status and cross reference of proposals for encoding by Unicode of Unihan characters',
1154 'auxiliary/WordBreakTest.html' => 'Documentation of validation tests',
1155 'auxiliary/SentenceBreakTest.html' => 'Documentation of validation tests',
1156 'auxiliary/GraphemeBreakTest.html' => 'Documentation of validation tests',
1157 'auxiliary/LineBreakTest.html' => 'Documentation of validation tests',
1160 my %skipped_files; # List of files that we skip
1162 ### End of externally interesting definitions, except for @input_file_objects
1165 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1166 # This file is machine-generated by $0 from the Unicode
1167 # database, Version $string_version. Any changes made here will be lost!
1170 my $INTERNAL_ONLY_HEADER = <<"EOF";
1172 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1173 # This file is for internal use by core Perl only. The format and even the
1174 # name or existence of this file are subject to change without notice. Don't
1178 my $DEVELOPMENT_ONLY=<<"EOF";
1179 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1180 # This file contains information artificially constrained to code points
1181 # present in Unicode release $string_compare_versions.
1182 # IT CANNOT BE RELIED ON. It is for use during development only and should
1183 # not be used for production.
1187 my $MAX_UNICODE_CODEPOINT_STRING = "10FFFF";
1188 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1189 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1191 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1192 # two must be 10; if there are 5, the first must not be a 0. Written this way
1193 # to decrease backtracking. The first regex allows the code point to be at
1194 # the end of a word, but to work properly, the word shouldn't end with a valid
1195 # hex character. The second one won't match a code point at the end of a
1196 # word, and doesn't have the run-on issue
1197 my $run_on_code_point_re =
1198 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1199 my $code_point_re = qr/\b$run_on_code_point_re/;
1201 # This matches the beginning of the line in the Unicode db files that give the
1202 # defaults for code points not listed (i.e., missing) in the file. The code
1203 # depends on this ending with a semi-colon, so it can assume it is a valid
1204 # field when the line is split() by semi-colons
1205 my $missing_defaults_prefix =
1206 qr/^#\s+\@missing:\s+0000\.\.$MAX_UNICODE_CODEPOINT_STRING\s*;/;
1208 # Property types. Unicode has more types, but these are sufficient for our
1210 my $UNKNOWN = -1; # initialized to illegal value
1211 my $NON_STRING = 1; # Either binary or enum
1213 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1214 # tables, additional true and false tables are
1215 # generated so that false is anything matching the
1216 # default value, and true is everything else.
1217 my $ENUM = 4; # Include catalog
1218 my $STRING = 5; # Anything else: string or misc
1220 # Some input files have lines that give default values for code points not
1221 # contained in the file. Sometimes these should be ignored.
1222 my $NO_DEFAULTS = 0; # Must evaluate to false
1223 my $NOT_IGNORED = 1;
1226 # Range types. Each range has a type. Most ranges are type 0, for normal,
1227 # and will appear in the main body of the tables in the output files, but
1228 # there are other types of ranges as well, listed below, that are specially
1229 # handled. There are pseudo-types as well that will never be stored as a
1230 # type, but will affect the calculation of the type.
1232 # 0 is for normal, non-specials
1233 my $MULTI_CP = 1; # Sequence of more than code point
1234 my $HANGUL_SYLLABLE = 2;
1235 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1236 my $NULL = 4; # The map is to the null string; utf8.c can't
1237 # handle these, nor is there an accepted syntax
1238 # for them in \p{} constructs
1239 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1240 # otherwise be $MULTI_CP type are instead type 0
1242 # process_generic_property_file() can accept certain overrides in its input.
1243 # Each of these must begin AND end with $CMD_DELIM.
1244 my $CMD_DELIM = "\a";
1245 my $REPLACE_CMD = 'replace'; # Override the Replace
1246 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1251 # Values for the Replace argument to add_range.
1252 # $NO # Don't replace; add only the code points not
1254 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1255 # the comments at the subroutine definition.
1256 my $UNCONDITIONALLY = 2; # Replace without conditions.
1257 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1259 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1261 my $CROAK = 6; # Die with an error if is already there
1263 # Flags to give property statuses. The phrases are to remind maintainers that
1264 # if the flag is changed, the indefinite article referring to it in the
1265 # documentation may need to be as well.
1267 my $DEPRECATED = 'D';
1268 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1269 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1270 my $DISCOURAGED = 'X';
1271 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1272 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1274 my $a_bold_stricter = "a 'B<$STRICTER>'";
1275 my $A_bold_stricter = "A 'B<$STRICTER>'";
1276 my $STABILIZED = 'S';
1277 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1278 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1280 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1281 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1283 my %status_past_participles = (
1284 $DISCOURAGED => 'discouraged',
1285 $STABILIZED => 'stabilized',
1286 $OBSOLETE => 'obsolete',
1287 $DEPRECATED => 'deprecated',
1290 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1291 # externally documented.
1292 my $ORDINARY = 0; # The normal fate.
1293 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1294 # but there is a file written that can be used to
1295 # reconstruct this table
1296 my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
1297 # for Perl's internal use only
1298 my $SUPPRESSED = 3; # The file for this table is not written out, and as a
1299 # result, we don't bother to do many computations on
1301 my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the
1302 # computations anyway, as the values are needed for
1303 # things to work. This happens when we have Perl
1304 # extensions that depend on Unicode tables that
1305 # wouldn't normally be in a given Unicode version.
1307 # The format of the values of the tables:
1308 my $EMPTY_FORMAT = "";
1309 my $BINARY_FORMAT = 'b';
1310 my $DECIMAL_FORMAT = 'd';
1311 my $FLOAT_FORMAT = 'f';
1312 my $INTEGER_FORMAT = 'i';
1313 my $HEX_FORMAT = 'x';
1314 my $RATIONAL_FORMAT = 'r';
1315 my $STRING_FORMAT = 's';
1316 my $ADJUST_FORMAT = 'a';
1317 my $DECOMP_STRING_FORMAT = 'c';
1318 my $STRING_WHITE_SPACE_LIST = 'sw';
1320 my %map_table_formats = (
1321 $BINARY_FORMAT => 'binary',
1322 $DECIMAL_FORMAT => 'single decimal digit',
1323 $FLOAT_FORMAT => 'floating point number',
1324 $INTEGER_FORMAT => 'integer',
1325 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1326 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1327 $STRING_FORMAT => 'string',
1328 $ADJUST_FORMAT => 'some entries need adjustment',
1329 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1330 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1333 # Unicode didn't put such derived files in a separate directory at first.
1334 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1335 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1336 my $AUXILIARY = 'auxiliary';
1338 # Hashes that will eventually go into Heavy.pl for the use of utf8_heavy.pl
1339 # and into UCD.pl for the use of UCD.pm
1340 my %loose_to_file_of; # loosely maps table names to their respective
1342 my %stricter_to_file_of; # same; but for stricter mapping.
1343 my %loose_property_to_file_of; # Maps a loose property name to its map file
1344 my %file_to_swash_name; # Maps the file name to its corresponding key name
1345 # in the hash %utf8::SwashInfo
1346 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1347 # their rational equivalent
1348 my %loose_property_name_of; # Loosely maps (non_string) property names to
1350 my %string_property_loose_to_name; # Same, for string properties.
1351 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1352 # the property name in standard loose form, and
1353 # 'value' is the default value for that property,
1354 # also in standard loose form.
1355 my %loose_to_standard_value; # loosely maps table names to the canonical
1357 my %ambiguous_names; # keys are alias names (in standard form) that
1358 # have more than one possible meaning.
1359 my %prop_aliases; # Keys are standard property name; values are each
1361 my %prop_value_aliases; # Keys of top level are standard property name;
1362 # values are keys to another hash, Each one is
1363 # one of the property's values, in standard form.
1364 # The values are that prop-val's aliases.
1365 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1367 # Most properties are immune to caseless matching, otherwise you would get
1368 # nonsensical results, as properties are a function of a code point, not
1369 # everything that is caselessly equivalent to that code point. For example,
1370 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1371 # be true because 's' and 'S' are equivalent caselessly. However,
1372 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1373 # extend that concept to those very few properties that are like this. Each
1374 # such property will match the full range caselessly. They are hard-coded in
1375 # the program; it's not worth trying to make it general as it's extremely
1376 # unlikely that they will ever change.
1377 my %caseless_equivalent_to;
1379 # These constants names and values were taken from the Unicode standard,
1380 # version 5.1, section 3.12. They are used in conjunction with Hangul
1381 # syllables. The '_string' versions are so generated tables can retain the
1382 # hex format, which is the more familiar value
1383 my $SBase_string = "0xAC00";
1384 my $SBase = CORE::hex $SBase_string;
1385 my $LBase_string = "0x1100";
1386 my $LBase = CORE::hex $LBase_string;
1387 my $VBase_string = "0x1161";
1388 my $VBase = CORE::hex $VBase_string;
1389 my $TBase_string = "0x11A7";
1390 my $TBase = CORE::hex $TBase_string;
1395 my $NCount = $VCount * $TCount;
1397 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1398 # with the above published constants.
1400 my %Jamo_L; # Leading consonants
1401 my %Jamo_V; # Vowels
1402 my %Jamo_T; # Trailing consonants
1404 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1405 # The key is the base name of the code point, and the value is an
1406 # array giving all the ranges that use this base name. Each range
1407 # is actually a hash giving the 'low' and 'high' values of it.
1408 my %names_ending_in_code_point;
1409 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1410 # removed from the names
1411 # Inverse mapping. The list of ranges that have these kinds of
1412 # names. Each element contains the low, high, and base names in an
1414 my @code_points_ending_in_code_point;
1416 # To hold Unicode's normalization test suite
1417 my @normalization_tests;
1419 # Boolean: does this Unicode version have the hangul syllables, and are we
1420 # writing out a table for them?
1421 my $has_hangul_syllables = 0;
1423 # Does this Unicode version have code points whose names end in their
1424 # respective code points, and are we writing out a table for them? 0 for no;
1425 # otherwise points to first property that a table is needed for them, so that
1426 # if multiple tables are needed, we don't create duplicates
1427 my $needing_code_points_ending_in_code_point = 0;
1429 my @backslash_X_tests; # List of tests read in for testing \X
1430 my @unhandled_properties; # Will contain a list of properties found in
1431 # the input that we didn't process.
1432 my @match_properties; # Properties that have match tables, to be
1434 my @map_properties; # Properties that get map files written
1435 my @named_sequences; # NamedSequences.txt contents.
1436 my %potential_files; # Generated list of all .txt files in the directory
1437 # structure so we can warn if something is being
1439 my @files_actually_output; # List of files we generated.
1440 my @more_Names; # Some code point names are compound; this is used
1441 # to store the extra components of them.
1442 my $MIN_FRACTION_LENGTH = 3; # How many digits of a floating point number at
1443 # the minimum before we consider it equivalent to a
1444 # candidate rational
1445 my $MAX_FLOATING_SLOP = 10 ** - $MIN_FRACTION_LENGTH; # And in floating terms
1447 # These store references to certain commonly used property objects
1456 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1457 my $has_In_conflicts = 0;
1458 my $has_Is_conflicts = 0;
1460 sub internal_file_to_platform ($) {
1461 # Convert our file paths which have '/' separators to those of the
1465 return undef unless defined $file;
1467 return File::Spec->join(split '/', $file);
1470 sub file_exists ($) { # platform independent '-e'. This program internally
1471 # uses slash as a path separator.
1473 return 0 if ! defined $file;
1474 return -e internal_file_to_platform($file);
1478 # Returns the address of the blessed input object.
1479 # It doesn't check for blessedness because that would do a string eval
1480 # every call, and the program is structured so that this is never called
1481 # for a non-blessed object.
1483 no overloading; # If overloaded, numifying below won't work.
1485 # Numifying a ref gives its address.
1486 return pack 'J', $_[0];
1489 # These are used only if $annotate is true.
1490 # The entire range of Unicode characters is examined to populate these
1491 # after all the input has been processed. But most can be skipped, as they
1492 # have the same descriptive phrases, such as being unassigned
1493 my @viacode; # Contains the 1 million character names
1494 my @printable; # boolean: And are those characters printable?
1495 my @annotate_char_type; # Contains a type of those characters, specifically
1496 # for the purposes of annotation.
1497 my $annotate_ranges; # A map of ranges of code points that have the same
1498 # name for the purposes of annotation. They map to the
1499 # upper edge of the range, so that the end point can
1500 # be immediately found. This is used to skip ahead to
1501 # the end of a range, and avoid processing each
1502 # individual code point in it.
1503 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1504 # characters, but excluding those which are
1505 # also noncharacter code points
1507 # The annotation types are an extension of the regular range types, though
1508 # some of the latter are folded into one. Make the new types negative to
1509 # avoid conflicting with the regular types
1510 my $SURROGATE_TYPE = -1;
1511 my $UNASSIGNED_TYPE = -2;
1512 my $PRIVATE_USE_TYPE = -3;
1513 my $NONCHARACTER_TYPE = -4;
1514 my $CONTROL_TYPE = -5;
1515 my $UNKNOWN_TYPE = -6; # Used only if there is a bug in this program
1517 sub populate_char_info ($) {
1518 # Used only with the $annotate option. Populates the arrays with the
1519 # input code point's info that are needed for outputting more detailed
1520 # comments. If calling context wants a return, it is the end point of
1521 # any contiguous range of characters that share essentially the same info
1524 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1526 $viacode[$i] = $perl_charname->value_of($i) || "";
1528 # A character is generally printable if Unicode says it is,
1529 # but below we make sure that most Unicode general category 'C' types
1531 $printable[$i] = $print->contains($i);
1533 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1535 # Only these two regular types are treated specially for annotations
1537 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1538 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1540 # Give a generic name to all code points that don't have a real name.
1541 # We output ranges, if applicable, for these. Also calculate the end
1542 # point of the range.
1544 if (! $viacode[$i]) {
1546 if ($gc-> table('Private_use')->contains($i)) {
1547 $viacode[$i] = 'Private Use';
1548 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1550 $end = $gc->table('Private_Use')->containing_range($i)->end;
1552 elsif ((defined ($nonchar =
1553 Property::property_ref('Noncharacter_Code_Point'))
1554 && $nonchar->table('Y')->contains($i)))
1556 $viacode[$i] = 'Noncharacter';
1557 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1559 $end = property_ref('Noncharacter_Code_Point')->table('Y')->
1560 containing_range($i)->end;
1562 elsif ($gc-> table('Control')->contains($i)) {
1563 $viacode[$i] = property_ref('Name_Alias')->value_of($i) || 'Control';
1564 $annotate_char_type[$i] = $CONTROL_TYPE;
1567 elsif ($gc-> table('Unassigned')->contains($i)) {
1568 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1570 if ($v_version lt v2.0.0) { # No blocks in earliest releases
1571 $viacode[$i] = 'Unassigned';
1572 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1575 $viacode[$i] = 'Unassigned, block=' . $block-> value_of($i);
1577 # Because we name the unassigned by the blocks they are in, it
1578 # can't go past the end of that block, and it also can't go
1579 # past the unassigned range it is in. The special table makes
1580 # sure that the non-characters, which are unassigned, are
1582 $end = min($block->containing_range($i)->end,
1583 $unassigned_sans_noncharacters->
1584 containing_range($i)->end);
1587 elsif ($v_version lt v2.0.0) { # No surrogates in earliest releases
1588 $viacode[$i] = $gc->value_of($i);
1589 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1592 elsif ($gc-> table('Surrogate')->contains($i)) {
1593 $viacode[$i] = 'Surrogate';
1594 $annotate_char_type[$i] = $SURROGATE_TYPE;
1596 $end = $gc->table('Surrogate')->containing_range($i)->end;
1599 Carp::my_carp_bug("Can't figure out how to annotate "
1600 . sprintf("U+%04X", $i)
1601 . ". Proceeding anyway.");
1602 $viacode[$i] = 'UNKNOWN';
1603 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1608 # Here, has a name, but if it's one in which the code point number is
1609 # appended to the name, do that.
1610 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1611 $viacode[$i] .= sprintf("-%04X", $i);
1612 $end = $perl_charname->containing_range($i)->end;
1615 # And here, has a name, but if it's a hangul syllable one, replace it with
1616 # the correct name from the Unicode algorithm
1617 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1619 my $SIndex = $i - $SBase;
1620 my $L = $LBase + $SIndex / $NCount;
1621 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1622 my $T = $TBase + $SIndex % $TCount;
1623 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1624 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1625 $end = $perl_charname->containing_range($i)->end;
1628 return if ! defined wantarray;
1629 return $i if ! defined $end; # If not a range, return the input
1631 # Save this whole range so can find the end point quickly
1632 $annotate_ranges->add_map($i, $end, $end);
1637 # Commented code below should work on Perl 5.8.
1638 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1639 ## the native perl version of it (which is what would operate under miniperl)
1640 ## is extremely slow, as it does a string eval every call.
1641 #my $has_fast_scalar_util = $^X !~ /miniperl/
1642 # && defined eval "require Scalar::Util";
1645 # # Returns the address of the blessed input object. Uses the XS version if
1646 # # available. It doesn't check for blessedness because that would do a
1647 # # string eval every call, and the program is structured so that this is
1648 # # never called for a non-blessed object.
1650 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1652 # # Check at least that is a ref.
1653 # my $pkg = ref($_[0]) or return undef;
1655 # # Change to a fake package to defeat any overloaded stringify
1656 # bless $_[0], 'main::Fake';
1658 # # Numifying a ref gives its address.
1659 # my $addr = pack 'J', $_[0];
1661 # # Return to original class
1662 # bless $_[0], $pkg;
1669 return $a if $a >= $b;
1676 return $a if $a <= $b;
1680 sub clarify_number ($) {
1681 # This returns the input number with underscores inserted every 3 digits
1682 # in large (5 digits or more) numbers. Input must be entirely digits, not
1686 my $pos = length($number) - 3;
1687 return $number if $pos <= 1;
1689 substr($number, $pos, 0) = '_';
1698 # These routines give a uniform treatment of messages in this program. They
1699 # are placed in the Carp package to cause the stack trace to not include them,
1700 # although an alternative would be to use another package and set @CARP_NOT
1703 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1705 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1706 # and overload trying to load Scalar:Util under miniperl. See
1707 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1708 undef $overload::VERSION;
1711 my $message = shift || "";
1712 my $nofold = shift || 0;
1715 $message = main::join_lines($message);
1716 $message =~ s/^$0: *//; # Remove initial program name
1717 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1718 $message = "\n$0: $message;";
1720 # Fold the message with program name, semi-colon end punctuation
1721 # (which looks good with the message that carp appends to it), and a
1722 # hanging indent for continuation lines.
1723 $message = main::simple_fold($message, "", 4) unless $nofold;
1724 $message =~ s/\n$//; # Remove the trailing nl so what carp
1725 # appends is to the same line
1728 return $message if defined wantarray; # If a caller just wants the msg
1735 # This is called when it is clear that the problem is caused by a bug in
1738 my $message = shift;
1739 $message =~ s/^$0: *//;
1740 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1745 sub carp_too_few_args {
1747 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1751 my $args_ref = shift;
1754 my_carp_bug("Need at least $count arguments to "
1756 . ". Instead got: '"
1757 . join ', ', @$args_ref
1758 . "'. No action taken.");
1762 sub carp_extra_args {
1763 my $args_ref = shift;
1764 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1766 unless (ref $args_ref) {
1767 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1770 my ($package, $file, $line) = caller;
1771 my $subroutine = (caller 1)[3];
1774 if (ref $args_ref eq 'HASH') {
1775 foreach my $key (keys %$args_ref) {
1776 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1778 $list = join ', ', each %{$args_ref};
1780 elsif (ref $args_ref eq 'ARRAY') {
1781 foreach my $arg (@$args_ref) {
1782 $arg = $UNDEF unless defined $arg;
1784 $list = join ', ', @$args_ref;
1787 my_carp_bug("Can't cope with ref "
1789 . " . argument to 'carp_extra_args'. Not checking arguments.");
1793 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1801 # This program uses the inside-out method for objects, as recommended in
1802 # "Perl Best Practices". This closure aids in generating those. There
1803 # are two routines. setup_package() is called once per package to set
1804 # things up, and then set_access() is called for each hash representing a
1805 # field in the object. These routines arrange for the object to be
1806 # properly destroyed when no longer used, and for standard accessor
1807 # functions to be generated. If you need more complex accessors, just
1808 # write your own and leave those accesses out of the call to set_access().
1809 # More details below.
1811 my %constructor_fields; # fields that are to be used in constructors; see
1814 # The values of this hash will be the package names as keys to other
1815 # hashes containing the name of each field in the package as keys, and
1816 # references to their respective hashes as values.
1820 # Sets up the package, creating standard DESTROY and dump methods
1821 # (unless already defined). The dump method is used in debugging by
1823 # The optional parameters are:
1824 # a) a reference to a hash, that gets populated by later
1825 # set_access() calls with one of the accesses being
1826 # 'constructor'. The caller can then refer to this, but it is
1827 # not otherwise used by these two routines.
1828 # b) a reference to a callback routine to call during destruction
1829 # of the object, before any fields are actually destroyed
1832 my $constructor_ref = delete $args{'Constructor_Fields'};
1833 my $destroy_callback = delete $args{'Destroy_Callback'};
1834 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1837 my $package = (caller)[0];
1839 $package_fields{$package} = \%fields;
1840 $constructor_fields{$package} = $constructor_ref;
1842 unless ($package->can('DESTROY')) {
1843 my $destroy_name = "${package}::DESTROY";
1846 # Use typeglob to give the anonymous subroutine the name we want
1847 *$destroy_name = sub {
1849 my $addr = do { no overloading; pack 'J', $self; };
1851 $self->$destroy_callback if $destroy_callback;
1852 foreach my $field (keys %{$package_fields{$package}}) {
1853 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1854 delete $package_fields{$package}{$field}{$addr};
1860 unless ($package->can('dump')) {
1861 my $dump_name = "${package}::dump";
1865 return dump_inside_out($self, $package_fields{$package}, @_);
1872 # Arrange for the input field to be garbage collected when no longer
1873 # needed. Also, creates standard accessor functions for the field
1874 # based on the optional parameters-- none if none of these parameters:
1875 # 'addable' creates an 'add_NAME()' accessor function.
1876 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1878 # 'settable' creates a 'set_NAME()' accessor function.
1879 # 'constructor' doesn't create an accessor function, but adds the
1880 # field to the hash that was previously passed to
1882 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1883 # 'add' etc. all mean 'addable'.
1884 # The read accessor function will work on both array and scalar
1885 # values. If another accessor in the parameter list is 'a', the read
1886 # access assumes an array. You can also force it to be array access
1887 # by specifying 'readable_array' instead of 'readable'
1889 # A sort-of 'protected' access can be set-up by preceding the addable,
1890 # readable or settable with some initial portion of 'protected_' (but,
1891 # the underscore is required), like 'p_a', 'pro_set', etc. The
1892 # "protection" is only by convention. All that happens is that the
1893 # accessor functions' names begin with an underscore. So instead of
1894 # calling set_foo, the call is _set_foo. (Real protection could be
1895 # accomplished by having a new subroutine, end_package, called at the
1896 # end of each package, and then storing the __LINE__ ranges and
1897 # checking them on every accessor. But that is way overkill.)
1899 # We create anonymous subroutines as the accessors and then use
1900 # typeglobs to assign them to the proper package and name
1902 my $name = shift; # Name of the field
1903 my $field = shift; # Reference to the inside-out hash containing the
1906 my $package = (caller)[0];
1908 if (! exists $package_fields{$package}) {
1909 croak "$0: Must call 'setup_package' before 'set_access'";
1912 # Stash the field so DESTROY can get it.
1913 $package_fields{$package}{$name} = $field;
1915 # Remaining arguments are the accessors. For each...
1916 foreach my $access (@_) {
1917 my $access = lc $access;
1921 # Match the input as far as it goes.
1922 if ($access =~ /^(p[^_]*)_/) {
1924 if (substr('protected_', 0, length $protected)
1928 # Add 1 for the underscore not included in $protected
1929 $access = substr($access, length($protected) + 1);
1937 if (substr('addable', 0, length $access) eq $access) {
1938 my $subname = "${package}::${protected}add_$name";
1941 # add_ accessor. Don't add if already there, which we
1942 # determine using 'eq' for scalars and '==' otherwise.
1945 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
1948 my $addr = do { no overloading; pack 'J', $self; };
1949 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1951 return if grep { $value == $_ } @{$field->{$addr}};
1954 return if grep { $value eq $_ } @{$field->{$addr}};
1956 push @{$field->{$addr}}, $value;
1960 elsif (substr('constructor', 0, length $access) eq $access) {
1962 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1965 $constructor_fields{$package}{$name} = $field;
1968 elsif (substr('readable_array', 0, length $access) eq $access) {
1970 # Here has read access. If one of the other parameters for
1971 # access is array, or this one specifies array (by being more
1972 # than just 'readable_'), then create a subroutine that
1973 # assumes the data is an array. Otherwise just a scalar
1974 my $subname = "${package}::${protected}$name";
1975 if (grep { /^a/i } @_
1976 or length($access) > length('readable_'))
1981 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
1982 my $addr = do { no overloading; pack 'J', $_[0]; };
1983 if (ref $field->{$addr} ne 'ARRAY') {
1984 my $type = ref $field->{$addr};
1985 $type = 'scalar' unless $type;
1986 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1989 return scalar @{$field->{$addr}} unless wantarray;
1991 # Make a copy; had problems with caller modifying the
1992 # original otherwise
1993 my @return = @{$field->{$addr}};
1999 # Here not an array value, a simpler function.
2003 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2005 return $field->{pack 'J', $_[0]};
2009 elsif (substr('settable', 0, length $access) eq $access) {
2010 my $subname = "${package}::${protected}set_$name";
2015 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2016 Carp::carp_extra_args(\@_) if @_ > 2;
2018 # $self is $_[0]; $value is $_[1]
2020 $field->{pack 'J', $_[0]} = $_[1];
2025 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2034 # All input files use this object, which stores various attributes about them,
2035 # and provides for convenient, uniform handling. The run method wraps the
2036 # processing. It handles all the bookkeeping of opening, reading, and closing
2037 # the file, returning only significant input lines.
2039 # Each object gets a handler which processes the body of the file, and is
2040 # called by run(). Most should use the generic, default handler, which has
2041 # code scrubbed to handle things you might not expect. A handler should
2042 # basically be a while(next_line()) {...} loop.
2044 # You can also set up handlers to
2045 # 1) call before the first line is read, for pre processing
2046 # 2) call to adjust each line of the input before the main handler gets them
2047 # 3) call upon EOF before the main handler exits its loop
2048 # 4) call at the end, for post processing
2050 # $_ is used to store the input line, and is to be filtered by the
2051 # each_line_handler()s. So, if the format of the line is not in the desired
2052 # format for the main handler, these are used to do that adjusting. They can
2053 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2054 # so the $_ output of one is used as the input to the next. None of the other
2055 # handlers are stackable, but could easily be changed to be so.
2057 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2058 # which insert the parameters as lines to be processed before the next input
2059 # file line is read. This allows the EOF handler to flush buffers, for
2060 # example. The difference between the two routines is that the lines inserted
2061 # by insert_lines() are subjected to the each_line_handler()s. (So if you
2062 # called it from such a handler, you would get infinite recursion.) Lines
2063 # inserted by insert_adjusted_lines() go directly to the main handler without
2064 # any adjustments. If the post-processing handler calls any of these, there
2065 # will be no effect. Some error checking for these conditions could be added,
2066 # but it hasn't been done.
2068 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2069 # to prevent further processing of the line. This routine will output the
2070 # message as a warning once, and then keep a count of the lines that have the
2071 # same message, and output that count at the end of the file's processing.
2072 # This keeps the number of messages down to a manageable amount.
2074 # get_missings() should be called to retrieve any @missing input lines.
2075 # Messages will be raised if this isn't done if the options aren't to ignore
2078 sub trace { return main::trace(@_); }
2081 # Keep track of fields that are to be put into the constructor.
2082 my %constructor_fields;
2084 main::setup_package(Constructor_Fields => \%constructor_fields);
2086 my %file; # Input file name, required
2087 main::set_access('file', \%file, qw{ c r });
2089 my %first_released; # Unicode version file was first released in, required
2090 main::set_access('first_released', \%first_released, qw{ c r });
2092 my %handler; # Subroutine to process the input file, defaults to
2093 # 'process_generic_property_file'
2094 main::set_access('handler', \%handler, qw{ c });
2097 # name of property this file is for. defaults to none, meaning not
2098 # applicable, or is otherwise determinable, for example, from each line.
2099 main::set_access('property', \%property, qw{ c r });
2102 # If this is true, the file is optional. If not present, no warning is
2103 # output. If it is present, the string given by this parameter is
2104 # evaluated, and if false the file is not processed.
2105 main::set_access('optional', \%optional, 'c', 'r');
2108 # This is used for debugging, to skip processing of all but a few input
2109 # files. Add 'non_skip => 1' to the constructor for those files you want
2110 # processed when you set the $debug_skip global.
2111 main::set_access('non_skip', \%non_skip, 'c');
2114 # This is used to skip processing of this input file semi-permanently,
2115 # when it evaluates to true. The value should be the reason the file is
2116 # being skipped. It is used for files that we aren't planning to process
2117 # anytime soon, but want to allow to be in the directory and not raise a
2118 # message that we are not handling. Mostly for test files. This is in
2119 # contrast to the non_skip element, which is supposed to be used very
2120 # temporarily for debugging. Sets 'optional' to 1. Also, files that we
2121 # pretty much will never look at can be placed in the global
2122 # %ignored_files instead. Ones used here will be added to %skipped files
2123 main::set_access('skip', \%skip, 'c');
2125 my %each_line_handler;
2126 # list of subroutines to look at and filter each non-comment line in the
2127 # file. defaults to none. The subroutines are called in order, each is
2128 # to adjust $_ for the next one, and the final one adjusts it for
2130 main::set_access('each_line_handler', \%each_line_handler, 'c');
2132 my %has_missings_defaults;
2133 # ? Are there lines in the file giving default values for code points
2134 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2135 # the norm, but IGNORED means it has such lines, but the handler doesn't
2136 # use them. Having these three states allows us to catch changes to the
2137 # UCD that this program should track
2138 main::set_access('has_missings_defaults',
2139 \%has_missings_defaults, qw{ c r });
2142 # Subroutine to call before doing anything else in the file. If undef, no
2143 # such handler is called.
2144 main::set_access('pre_handler', \%pre_handler, qw{ c });
2147 # Subroutine to call upon getting an EOF on the input file, but before
2148 # that is returned to the main handler. This is to allow buffers to be
2149 # flushed. The handler is expected to call insert_lines() or
2150 # insert_adjusted() with the buffered material
2151 main::set_access('eof_handler', \%eof_handler, qw{ c r });
2154 # Subroutine to call after all the lines of the file are read in and
2155 # processed. If undef, no such handler is called.
2156 main::set_access('post_handler', \%post_handler, qw{ c });
2158 my %progress_message;
2159 # Message to print to display progress in lieu of the standard one
2160 main::set_access('progress_message', \%progress_message, qw{ c });
2163 # cache open file handle, internal. Is undef if file hasn't been
2164 # processed at all, empty if has;
2165 main::set_access('handle', \%handle);
2168 # cache of lines added virtually to the file, internal
2169 main::set_access('added_lines', \%added_lines);
2172 # cache of errors found, internal
2173 main::set_access('errors', \%errors);
2176 # storage of '@missing' defaults lines
2177 main::set_access('missings', \%missings);
2182 my $self = bless \do{ my $anonymous_scalar }, $class;
2183 my $addr = do { no overloading; pack 'J', $self; };
2186 $handler{$addr} = \&main::process_generic_property_file;
2187 $non_skip{$addr} = 0;
2189 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2190 $handle{$addr} = undef;
2191 $added_lines{$addr} = [ ];
2192 $each_line_handler{$addr} = [ ];
2193 $errors{$addr} = { };
2194 $missings{$addr} = [ ];
2196 # Two positional parameters.
2197 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2198 $file{$addr} = main::internal_file_to_platform(shift);
2199 $first_released{$addr} = shift;
2201 # The rest of the arguments are key => value pairs
2202 # %constructor_fields has been set up earlier to list all possible
2203 # ones. Either set or push, depending on how the default has been set
2206 foreach my $key (keys %args) {
2207 my $argument = $args{$key};
2209 # Note that the fields are the lower case of the constructor keys
2210 my $hash = $constructor_fields{lc $key};
2211 if (! defined $hash) {
2212 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2215 if (ref $hash->{$addr} eq 'ARRAY') {
2216 if (ref $argument eq 'ARRAY') {
2217 foreach my $argument (@{$argument}) {
2218 next if ! defined $argument;
2219 push @{$hash->{$addr}}, $argument;
2223 push @{$hash->{$addr}}, $argument if defined $argument;
2227 $hash->{$addr} = $argument;
2232 # If the file has a property for it, it means that the property is not
2233 # listed in the file's entries. So add a handler to the list of line
2234 # handlers to insert the property name into the lines, to provide a
2235 # uniform interface to the final processing subroutine.
2236 # the final code doesn't have to worry about that.
2237 if ($property{$addr}) {
2238 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2241 if ($non_skip{$addr} && ! $debug_skip && $verbosity) {
2242 print "Warning: " . __PACKAGE__ . " constructor for $file{$addr} has useless 'non_skip' in it\n";
2245 # If skipping, set to optional, and add to list of ignored files,
2246 # including its reason
2248 $optional{$addr} = 1;
2249 $skipped_files{$file{$addr}} = $skip{$addr}
2258 qw("") => "_operator_stringify",
2259 "." => \&main::_operator_dot,
2260 ".=" => \&main::_operator_dot_equal,
2263 sub _operator_stringify {
2266 return __PACKAGE__ . " object for " . $self->file;
2269 # flag to make sure extracted files are processed early
2270 my $seen_non_extracted_non_age = 0;
2273 # Process the input object $self. This opens and closes the file and
2274 # calls all the handlers for it. Currently, this can only be called
2275 # once per file, as it destroy's the EOF handler
2278 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2280 my $addr = do { no overloading; pack 'J', $self; };
2282 my $file = $file{$addr};
2284 # Don't process if not expecting this file (because released later
2285 # than this Unicode version), and isn't there. This means if someone
2286 # copies it into an earlier version's directory, we will go ahead and
2288 return if $first_released{$addr} gt $v_version && ! -e $file;
2290 # If in debugging mode and this file doesn't have the non-skip
2291 # flag set, and isn't one of the critical files, skip it.
2293 && $first_released{$addr} ne v0
2294 && ! $non_skip{$addr})
2296 print "Skipping $file in debugging\n" if $verbosity;
2300 # File could be optional
2301 if ($optional{$addr}) {
2302 return unless -e $file;
2303 my $result = eval $optional{$addr};
2304 if (! defined $result) {
2305 Carp::my_carp_bug("Got '$@' when tried to eval $optional{$addr}. $file Skipped.");
2310 print STDERR "Skipping processing input file '$file' because '$optional{$addr}' is not true\n";
2316 if (! defined $file || ! -e $file) {
2318 # If the file doesn't exist, see if have internal data for it
2319 # (based on first_released being 0).
2320 if ($first_released{$addr} eq v0) {
2321 $handle{$addr} = 'pretend_is_open';
2324 if (! $optional{$addr} # File could be optional
2325 && $v_version ge $first_released{$addr})
2327 print STDERR "Skipping processing input file '$file' because not found\n" if $v_version ge $first_released{$addr};
2334 # Here, the file exists. Some platforms may change the case of
2336 if ($seen_non_extracted_non_age) {
2337 if ($file =~ /$EXTRACTED/i) {
2338 Carp::my_carp_bug(main::join_lines(<<END
2339 $file should be processed just after the 'Prop...Alias' files, and before
2340 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2341 have subtle problems
2346 elsif ($EXTRACTED_DIR
2347 && $first_released{$addr} ne v0
2348 && $file !~ /$EXTRACTED/i
2349 && lc($file) ne 'dage.txt')
2351 # We don't set this (by the 'if' above) if we have no
2352 # extracted directory, so if running on an early version,
2353 # this test won't work. Not worth worrying about.
2354 $seen_non_extracted_non_age = 1;
2357 # And mark the file as having being processed, and warn if it
2358 # isn't a file we are expecting. As we process the files,
2359 # they are deleted from the hash, so any that remain at the
2360 # end of the program are files that we didn't process.
2361 my $fkey = File::Spec->rel2abs($file);
2362 my $expecting = delete $potential_files{lc($fkey)};
2364 Carp::my_carp("Was not expecting '$file'.") if
2366 && ! defined $handle{$addr};
2368 # Having deleted from expected files, we can quit if not to do
2369 # anything. Don't print progress unless really want verbosity
2371 print "Skipping $file.\n" if $verbosity >= $VERBOSE;
2375 # Open the file, converting the slashes used in this program
2376 # into the proper form for the OS
2378 if (not open $file_handle, "<", $file) {
2379 Carp::my_carp("Can't open $file. Skipping: $!");
2382 $handle{$addr} = $file_handle; # Cache the open file handle
2385 if ($verbosity >= $PROGRESS) {
2386 if ($progress_message{$addr}) {
2387 print "$progress_message{$addr}\n";
2390 # If using a virtual file, say so.
2391 print "Processing ", (-e $file)
2393 : "substitute $file",
2399 # Call any special handler for before the file.
2400 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2402 # Then the main handler
2403 &{$handler{$addr}}($self);
2405 # Then any special post-file handler.
2406 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2408 # If any errors have been accumulated, output the counts (as the first
2409 # error message in each class was output when it was encountered).
2410 if ($errors{$addr}) {
2413 foreach my $error (keys %{$errors{$addr}}) {
2414 $total += $errors{$addr}->{$error};
2415 delete $errors{$addr}->{$error};
2420 = "A total of $total lines had errors in $file. ";
2422 $message .= ($types == 1)
2423 ? '(Only the first one was displayed.)'
2424 : '(Only the first of each type was displayed.)';
2425 Carp::my_carp($message);
2429 if (@{$missings{$addr}}) {
2430 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2433 # If a real file handle, close it.
2434 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2436 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2437 # the file, as opposed to undef
2442 # Sets $_ to be the next logical input line, if any. Returns non-zero
2443 # if such a line exists. 'logical' means that any lines that have
2444 # been added via insert_lines() will be returned in $_ before the file
2448 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2450 my $addr = do { no overloading; pack 'J', $self; };
2452 # Here the file is open (or if the handle is not a ref, is an open
2453 # 'virtual' file). Get the next line; any inserted lines get priority
2454 # over the file itself.
2458 while (1) { # Loop until find non-comment, non-empty line
2459 #local $to_trace = 1 if main::DEBUG;
2460 my $inserted_ref = shift @{$added_lines{$addr}};
2461 if (defined $inserted_ref) {
2462 ($adjusted, $_) = @{$inserted_ref};
2463 trace $adjusted, $_ if main::DEBUG && $to_trace;
2464 return 1 if $adjusted;
2467 last if ! ref $handle{$addr}; # Don't read unless is real file
2468 last if ! defined ($_ = readline $handle{$addr});
2471 trace $_ if main::DEBUG && $to_trace;
2473 # See if this line is the comment line that defines what property
2474 # value that code points that are not listed in the file should
2475 # have. The format or existence of these lines is not guaranteed
2476 # by Unicode since they are comments, but the documentation says
2477 # that this was added for machine-readability, so probably won't
2478 # change. This works starting in Unicode Version 5.0. They look
2481 # @missing: 0000..10FFFF; Not_Reordered
2482 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2483 # @missing: 0000..10FFFF; ; NaN
2485 # Save the line for a later get_missings() call.
2486 if (/$missing_defaults_prefix/) {
2487 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2488 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2490 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2491 my @defaults = split /\s* ; \s*/x, $_;
2493 # The first field is the @missing, which ends in a
2494 # semi-colon, so can safely shift.
2497 # Some of these lines may have empty field placeholders
2498 # which get in the way. An example is:
2499 # @missing: 0000..10FFFF; ; NaN
2500 # Remove them. Process starting from the top so the
2501 # splice doesn't affect things still to be looked at.
2502 for (my $i = @defaults - 1; $i >= 0; $i--) {
2503 next if $defaults[$i] ne "";
2504 splice @defaults, $i, 1;
2507 # What's left should be just the property (maybe) and the
2508 # default. Having only one element means it doesn't have
2512 if (@defaults >= 1) {
2513 if (@defaults == 1) {
2514 $default = $defaults[0];
2517 $property = $defaults[0];
2518 $default = $defaults[1];
2524 || ($default =~ /^</
2525 && $default !~ /^<code *point>$/i
2526 && $default !~ /^<none>$/i
2527 && $default !~ /^<script>$/i))
2529 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2533 # If the property is missing from the line, it should
2534 # be the one for the whole file
2535 $property = $property{$addr} if ! defined $property;
2537 # Change <none> to the null string, which is what it
2538 # really means. If the default is the code point
2539 # itself, set it to <code point>, which is what
2540 # Unicode uses (but sometimes they've forgotten the
2542 if ($default =~ /^<none>$/i) {
2545 elsif ($default =~ /^<code *point>$/i) {
2546 $default = $CODE_POINT;
2548 elsif ($default =~ /^<script>$/i) {
2550 # Special case this one. Currently is from
2551 # ScriptExtensions.txt, and means for all unlisted
2552 # code points, use their Script property values.
2553 # For the code points not listed in that file, the
2554 # default value is 'Unknown'.
2555 $default = "Unknown";
2558 # Store them as a sub-arrays with both components.
2559 push @{$missings{$addr}}, [ $default, $property ];
2563 # There is nothing for the caller to process on this comment
2568 # Remove comments and trailing space, and skip this line if the
2574 # Call any handlers for this line, and skip further processing of
2575 # the line if the handler sets the line to null.
2576 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
2581 # Here the line is ok. return success.
2583 } # End of looping through lines.
2585 # If there is an EOF handler, call it (only once) and if it generates
2586 # more lines to process go back in the loop to handle them.
2587 if ($eof_handler{$addr}) {
2588 &{$eof_handler{$addr}}($self);
2589 $eof_handler{$addr} = ""; # Currently only get one shot at it.
2590 goto LINE if $added_lines{$addr};
2593 # Return failure -- no more lines.
2598 # Not currently used, not fully tested.
2600 # # Non-destructive look-ahead one non-adjusted, non-comment, non-blank
2601 # # record. Not callable from an each_line_handler(), nor does it call
2602 # # an each_line_handler() on the line.
2605 # my $addr = do { no overloading; pack 'J', $self; };
2607 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
2608 # my ($adjusted, $line) = @{$inserted_ref};
2609 # next if $adjusted;
2611 # # Remove comments and trailing space, and return a non-empty
2614 # $line =~ s/\s+$//;
2615 # return $line if $line ne "";
2618 # return if ! ref $handle{$addr}; # Don't read unless is real file
2619 # while (1) { # Loop until find non-comment, non-empty line
2620 # local $to_trace = 1 if main::DEBUG;
2621 # trace $_ if main::DEBUG && $to_trace;
2622 # return if ! defined (my $line = readline $handle{$addr});
2624 # push @{$added_lines{$addr}}, [ 0, $line ];
2627 # $line =~ s/\s+$//;
2628 # return $line if $line ne "";
2636 # Lines can be inserted so that it looks like they were in the input
2637 # file at the place it was when this routine is called. See also
2638 # insert_adjusted_lines(). Lines inserted via this routine go through
2639 # any each_line_handler()
2643 # Each inserted line is an array, with the first element being 0 to
2644 # indicate that this line hasn't been adjusted, and needs to be
2647 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
2651 sub insert_adjusted_lines {
2652 # Lines can be inserted so that it looks like they were in the input
2653 # file at the place it was when this routine is called. See also
2654 # insert_lines(). Lines inserted via this routine are already fully
2655 # adjusted, ready to be processed; each_line_handler()s handlers will
2656 # not be called. This means this is not a completely general
2657 # facility, as only the last each_line_handler on the stack should
2658 # call this. It could be made more general, by passing to each of the
2659 # line_handlers their position on the stack, which they would pass on
2660 # to this routine, and that would replace the boolean first element in
2661 # the anonymous array pushed here, so that the next_line routine could
2662 # use that to call only those handlers whose index is after it on the
2663 # stack. But this is overkill for what is needed now.
2666 trace $_[0] if main::DEBUG && $to_trace;
2668 # Each inserted line is an array, with the first element being 1 to
2669 # indicate that this line has been adjusted
2671 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
2676 # Returns the stored up @missings lines' values, and clears the list.
2677 # The values are in an array, consisting of the default in the first
2678 # element, and the property in the 2nd. However, since these lines
2679 # can be stacked up, the return is an array of all these arrays.
2682 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2684 my $addr = do { no overloading; pack 'J', $self; };
2686 # If not accepting a list return, just return the first one.
2687 return shift @{$missings{$addr}} unless wantarray;
2689 my @return = @{$missings{$addr}};
2690 undef @{$missings{$addr}};
2694 sub _insert_property_into_line {
2695 # Add a property field to $_, if this file requires it.
2698 my $addr = do { no overloading; pack 'J', $self; };
2699 my $property = $property{$addr};
2700 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2702 $_ =~ s/(;|$)/; $property$1/;
2707 # Output consistent error messages, using either a generic one, or the
2708 # one given by the optional parameter. To avoid gazillions of the
2709 # same message in case the syntax of a file is way off, this routine
2710 # only outputs the first instance of each message, incrementing a
2711 # count so the totals can be output at the end of the file.
2714 my $message = shift;
2715 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2717 my $addr = do { no overloading; pack 'J', $self; };
2719 $message = 'Unexpected line' unless $message;
2721 # No trailing punctuation so as to fit with our addenda.
2722 $message =~ s/[.:;,]$//;
2724 # If haven't seen this exact message before, output it now. Otherwise
2725 # increment the count of how many times it has occurred
2726 unless ($errors{$addr}->{$message}) {
2727 Carp::my_carp("$message in '$_' in "
2729 . " at line $.. Skipping this line;");
2730 $errors{$addr}->{$message} = 1;
2733 $errors{$addr}->{$message}++;
2736 # Clear the line to prevent any further (meaningful) processing of it.
2743 package Multi_Default;
2745 # Certain properties in early versions of Unicode had more than one possible
2746 # default for code points missing from the files. In these cases, one
2747 # default applies to everything left over after all the others are applied,
2748 # and for each of the others, there is a description of which class of code
2749 # points applies to it. This object helps implement this by storing the
2750 # defaults, and for all but that final default, an eval string that generates
2751 # the class that it applies to.
2756 main::setup_package();
2759 # The defaults structure for the classes
2760 main::set_access('class_defaults', \%class_defaults);
2763 # The default that applies to everything left over.
2764 main::set_access('other_default', \%other_default, 'r');
2768 # The constructor is called with default => eval pairs, terminated by
2769 # the left-over default. e.g.
2770 # Multi_Default->new(
2771 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
2773 # 'R' => 'some other expression that evaluates to code points',
2781 my $self = bless \do{my $anonymous_scalar}, $class;
2782 my $addr = do { no overloading; pack 'J', $self; };
2785 my $default = shift;
2787 $class_defaults{$addr}->{$default} = $eval;
2790 $other_default{$addr} = shift;
2795 sub get_next_defaults {
2796 # Iterates and returns the next class of defaults.
2798 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2800 my $addr = do { no overloading; pack 'J', $self; };
2802 return each %{$class_defaults{$addr}};
2808 # An alias is one of the names that a table goes by. This class defines them
2809 # including some attributes. Everything is currently setup in the
2815 main::setup_package();
2818 main::set_access('name', \%name, 'r');
2821 # Should this name match loosely or not.
2822 main::set_access('loose_match', \%loose_match, 'r');
2824 my %make_re_pod_entry;
2825 # Some aliases should not get their own entries in the re section of the
2826 # pod, because they are covered by a wild-card, and some we want to
2827 # discourage use of. Binary
2828 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
2831 # Is this documented to be accessible via Unicode::UCD
2832 main::set_access('ucd', \%ucd, 'r', 's');
2835 # Aliases have a status, like deprecated, or even suppressed (which means
2836 # they don't appear in documentation). Enum
2837 main::set_access('status', \%status, 'r');
2840 # Similarly, some aliases should not be considered as usable ones for
2841 # external use, such as file names, or we don't want documentation to
2842 # recommend them. Boolean
2843 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
2848 my $self = bless \do { my $anonymous_scalar }, $class;
2849 my $addr = do { no overloading; pack 'J', $self; };
2851 $name{$addr} = shift;
2852 $loose_match{$addr} = shift;
2853 $make_re_pod_entry{$addr} = shift;
2854 $ok_as_filename{$addr} = shift;
2855 $status{$addr} = shift;
2856 $ucd{$addr} = shift;
2858 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2860 # Null names are never ok externally
2861 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
2869 # A range is the basic unit for storing code points, and is described in the
2870 # comments at the beginning of the program. Each range has a starting code
2871 # point; an ending code point (not less than the starting one); a value
2872 # that applies to every code point in between the two end-points, inclusive;
2873 # and an enum type that applies to the value. The type is for the user's
2874 # convenience, and has no meaning here, except that a non-zero type is
2875 # considered to not obey the normal Unicode rules for having standard forms.
2877 # The same structure is used for both map and match tables, even though in the
2878 # latter, the value (and hence type) is irrelevant and could be used as a
2879 # comment. In map tables, the value is what all the code points in the range
2880 # map to. Type 0 values have the standardized version of the value stored as
2881 # well, so as to not have to recalculate it a lot.
2883 sub trace { return main::trace(@_); }
2887 main::setup_package();
2890 main::set_access('start', \%start, 'r', 's');
2893 main::set_access('end', \%end, 'r', 's');
2896 main::set_access('value', \%value, 'r');
2899 main::set_access('type', \%type, 'r');
2902 # The value in internal standard form. Defined only if the type is 0.
2903 main::set_access('standard_form', \%standard_form);
2905 # Note that if these fields change, the dump() method should as well
2908 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
2911 my $self = bless \do { my $anonymous_scalar }, $class;
2912 my $addr = do { no overloading; pack 'J', $self; };
2914 $start{$addr} = shift;
2915 $end{$addr} = shift;
2919 my $value = delete $args{'Value'}; # Can be 0
2920 $value = "" unless defined $value;
2921 $value{$addr} = $value;
2923 $type{$addr} = delete $args{'Type'} || 0;
2925 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
2932 qw("") => "_operator_stringify",
2933 "." => \&main::_operator_dot,
2934 ".=" => \&main::_operator_dot_equal,
2937 sub _operator_stringify {
2939 my $addr = do { no overloading; pack 'J', $self; };
2941 # Output it like '0041..0065 (value)'
2942 my $return = sprintf("%04X", $start{$addr})
2944 . sprintf("%04X", $end{$addr});
2945 my $value = $value{$addr};
2946 my $type = $type{$addr};
2948 $return .= "$value";
2949 $return .= ", Type=$type" if $type != 0;
2956 # Calculate the standard form only if needed, and cache the result.
2957 # The standard form is the value itself if the type is special.
2958 # This represents a considerable CPU and memory saving - at the time
2959 # of writing there are 368676 non-special objects, but the standard
2960 # form is only requested for 22047 of them - ie about 6%.
2963 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2965 my $addr = do { no overloading; pack 'J', $self; };
2967 return $standard_form{$addr} if defined $standard_form{$addr};
2969 my $value = $value{$addr};
2970 return $value if $type{$addr};
2971 return $standard_form{$addr} = main::standardize($value);
2975 # Human, not machine readable. For machine readable, comment out this
2976 # entire routine and let the standard one take effect.
2979 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2981 my $addr = do { no overloading; pack 'J', $self; };
2983 my $return = $indent
2984 . sprintf("%04X", $start{$addr})
2986 . sprintf("%04X", $end{$addr})
2987 . " '$value{$addr}';";
2988 if (! defined $standard_form{$addr}) {
2989 $return .= "(type=$type{$addr})";
2991 elsif ($standard_form{$addr} ne $value{$addr}) {
2992 $return .= "(standard '$standard_form{$addr}')";
2998 package _Range_List_Base;
3000 # Base class for range lists. A range list is simply an ordered list of
3001 # ranges, so that the ranges with the lowest starting numbers are first in it.
3003 # When a new range is added that is adjacent to an existing range that has the
3004 # same value and type, it merges with it to form a larger range.
3006 # Ranges generally do not overlap, except that there can be multiple entries
3007 # of single code point ranges. This is because of NameAliases.txt.
3009 # In this program, there is a standard value such that if two different
3010 # values, have the same standard value, they are considered equivalent. This
3011 # value was chosen so that it gives correct results on Unicode data
3013 # There are a number of methods to manipulate range lists, and some operators
3014 # are overloaded to handle them.
3016 sub trace { return main::trace(@_); }
3022 # Max is initialized to a negative value that isn't adjacent to 0, for
3026 main::setup_package();
3029 # The list of ranges
3030 main::set_access('ranges', \%ranges, 'readable_array');
3033 # The highest code point in the list. This was originally a method, but
3034 # actual measurements said it was used a lot.
3035 main::set_access('max', \%max, 'r');
3037 my %each_range_iterator;
3038 # Iterator position for each_range()
3039 main::set_access('each_range_iterator', \%each_range_iterator);
3042 # Name of parent this is attached to, if any. Solely for better error
3044 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3046 my %_search_ranges_cache;
3047 # A cache of the previous result from _search_ranges(), for better
3049 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3055 # Optional initialization data for the range list.
3056 my $initialize = delete $args{'Initialize'};
3060 # Use _union() to initialize. _union() returns an object of this
3061 # class, which means that it will call this constructor recursively.
3062 # But it won't have this $initialize parameter so that it won't
3063 # infinitely loop on this.
3064 return _union($class, $initialize, %args) if defined $initialize;
3066 $self = bless \do { my $anonymous_scalar }, $class;
3067 my $addr = do { no overloading; pack 'J', $self; };
3069 # Optional parent object, only for debug info.
3070 $owner_name_of{$addr} = delete $args{'Owner'};
3071 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3073 # Stringify, in case it is an object.
3074 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3076 # This is used only for error messages, and so a colon is added
3077 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3079 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3081 $max{$addr} = $max_init;
3083 $_search_ranges_cache{$addr} = 0;
3084 $ranges{$addr} = [];
3091 qw("") => "_operator_stringify",
3092 "." => \&main::_operator_dot,
3093 ".=" => \&main::_operator_dot_equal,
3096 sub _operator_stringify {
3098 my $addr = do { no overloading; pack 'J', $self; };
3100 return "Range_List attached to '$owner_name_of{$addr}'"
3101 if $owner_name_of{$addr};
3102 return "anonymous Range_List " . \$self;
3106 # Returns the union of the input code points. It can be called as
3107 # either a constructor or a method. If called as a method, the result
3108 # will be a new() instance of the calling object, containing the union
3109 # of that object with the other parameter's code points; if called as
3110 # a constructor, the first parameter gives the class that the new object
3111 # should be, and the second parameter gives the code points to go into
3113 # In either case, there are two parameters looked at by this routine;
3114 # any additional parameters are passed to the new() constructor.
3116 # The code points can come in the form of some object that contains
3117 # ranges, and has a conventionally named method to access them; or
3118 # they can be an array of individual code points (as integers); or
3119 # just a single code point.
3121 # If they are ranges, this routine doesn't make any effort to preserve
3122 # the range values and types of one input over the other. Therefore
3123 # this base class should not allow _union to be called from other than
3124 # initialization code, so as to prevent two tables from being added
3125 # together where the range values matter. The general form of this
3126 # routine therefore belongs in a derived class, but it was moved here
3127 # to avoid duplication of code. The failure to overload this in this
3128 # class keeps it safe.
3130 # It does make the effort during initialization to accept tables with
3131 # multiple values for the same code point, and to preserve the order
3132 # of these. If there is only one input range or range set, it doesn't
3133 # sort (as it should already be sorted to the desired order), and will
3134 # accept multiple values per code point. Otherwise it will merge
3135 # multiple values into a single one.
3138 my @args; # Arguments to pass to the constructor
3142 # If a method call, will start the union with the object itself, and
3143 # the class of the new object will be the same as self.
3150 # Add the other required parameter.
3152 # Rest of parameters are passed on to the constructor
3154 # Accumulate all records from both lists.
3156 my $input_count = 0;
3157 for my $arg (@args) {
3158 #local $to_trace = 0 if main::DEBUG;
3159 trace "argument = $arg" if main::DEBUG && $to_trace;
3160 if (! defined $arg) {
3162 if (defined $self) {
3164 $message .= $owner_name_of{pack 'J', $self};
3166 Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
3170 $arg = [ $arg ] if ! ref $arg;
3171 my $type = ref $arg;
3172 if ($type eq 'ARRAY') {
3173 foreach my $element (@$arg) {
3174 push @records, Range->new($element, $element);
3178 elsif ($arg->isa('Range')) {
3179 push @records, $arg;
3182 elsif ($arg->can('ranges')) {
3183 push @records, $arg->ranges;
3188 if (defined $self) {
3190 $message .= $owner_name_of{pack 'J', $self};
3192 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3197 # Sort with the range containing the lowest ordinal first, but if
3198 # two ranges start at the same code point, sort with the bigger range
3199 # of the two first, because it takes fewer cycles.
3200 if ($input_count > 1) {
3201 @records = sort { ($a->start <=> $b->start)
3203 # if b is shorter than a, b->end will be
3204 # less than a->end, and we want to select
3205 # a, so want to return -1
3206 ($b->end <=> $a->end)
3210 my $new = $class->new(@_);
3212 # Fold in records so long as they add new information.
3213 for my $set (@records) {
3214 my $start = $set->start;
3215 my $end = $set->end;
3216 my $value = $set->value;
3217 my $type = $set->type;
3218 if ($start > $new->max) {
3219 $new->_add_delete('+', $start, $end, $value, Type => $type);
3221 elsif ($end > $new->max) {
3222 $new->_add_delete('+', $new->max +1, $end, $value,
3225 elsif ($input_count == 1) {
3226 # Here, overlaps existing range, but is from a single input,
3227 # so preserve the multiple values from that input.
3228 $new->_add_delete('+', $start, $end, $value, Type => $type,
3229 Replace => $MULTIPLE_AFTER);
3236 sub range_count { # Return the number of ranges in the range list
3238 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3241 return scalar @{$ranges{pack 'J', $self}};
3245 # Returns the minimum code point currently in the range list, or if
3246 # the range list is empty, 2 beyond the max possible. This is a
3247 # method because used so rarely, that not worth saving between calls,
3248 # and having to worry about changing it as ranges are added and
3252 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3254 my $addr = do { no overloading; pack 'J', $self; };
3256 # If the range list is empty, return a large value that isn't adjacent
3257 # to any that could be in the range list, for simpler tests
3258 return $MAX_UNICODE_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3259 return $ranges{$addr}->[0]->start;
3263 # Boolean: Is argument in the range list? If so returns $i such that:
3264 # range[$i]->end < $codepoint <= range[$i+1]->end
3265 # which is one beyond what you want; this is so that the 0th range
3266 # doesn't return false
3268 my $codepoint = shift;
3269 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3271 my $i = $self->_search_ranges($codepoint);
3272 return 0 unless defined $i;
3274 # The search returns $i, such that
3275 # range[$i-1]->end < $codepoint <= range[$i]->end
3276 # So is in the table if and only iff it is at least the start position
3279 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3283 sub containing_range {
3284 # Returns the range object that contains the code point, undef if none
3287 my $codepoint = shift;
3288 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3290 my $i = $self->contains($codepoint);
3293 # contains() returns 1 beyond where we should look
3295 return $ranges{pack 'J', $self}->[$i-1];
3299 # Returns the value associated with the code point, undef if none
3302 my $codepoint = shift;
3303 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3305 my $range = $self->containing_range($codepoint);
3306 return unless defined $range;
3308 return $range->value;
3312 # Returns the type of the range containing the code point, undef if
3313 # the code point is not in the table
3316 my $codepoint = shift;
3317 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3319 my $range = $self->containing_range($codepoint);
3320 return unless defined $range;
3322 return $range->type;
3325 sub _search_ranges {
3326 # Find the range in the list which contains a code point, or where it
3327 # should go if were to add it. That is, it returns $i, such that:
3328 # range[$i-1]->end < $codepoint <= range[$i]->end
3329 # Returns undef if no such $i is possible (e.g. at end of table), or
3330 # if there is an error.
3333 my $code_point = shift;
3334 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3336 my $addr = do { no overloading; pack 'J', $self; };
3338 return if $code_point > $max{$addr};
3339 my $r = $ranges{$addr}; # The current list of ranges
3340 my $range_list_size = scalar @$r;
3343 use integer; # want integer division
3345 # Use the cached result as the starting guess for this one, because,
3346 # an experiment on 5.1 showed that 90% of the time the cache was the
3347 # same as the result on the next call (and 7% it was one less).
3348 $i = $_search_ranges_cache{$addr};
3349 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3350 # from an intervening deletion
3351 #local $to_trace = 1 if main::DEBUG;
3352 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3353 return $i if $code_point <= $r->[$i]->end
3354 && ($i == 0 || $r->[$i-1]->end < $code_point);
3356 # Here the cache doesn't yield the correct $i. Try adding 1.
3357 if ($i < $range_list_size - 1
3358 && $r->[$i]->end < $code_point &&
3359 $code_point <= $r->[$i+1]->end)
3362 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3363 $_search_ranges_cache{$addr} = $i;
3367 # Here, adding 1 also didn't work. We do a binary search to
3368 # find the correct position, starting with current $i
3370 my $upper = $range_list_size - 1;
3372 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3374 if ($code_point <= $r->[$i]->end) {
3376 # Here we have met the upper constraint. We can quit if we
3377 # also meet the lower one.
3378 last if $i == 0 || $r->[$i-1]->end < $code_point;
3380 $upper = $i; # Still too high.
3385 # Here, $r[$i]->end < $code_point, so look higher up.
3389 # Split search domain in half to try again.
3390 my $temp = ($upper + $lower) / 2;
3392 # No point in continuing unless $i changes for next time
3396 # We can't reach the highest element because of the averaging.
3397 # So if one below the upper edge, force it there and try one
3399 if ($i == $range_list_size - 2) {
3401 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
3402 $i = $range_list_size - 1;
3404 # Change $lower as well so if fails next time through,
3405 # taking the average will yield the same $i, and we will
3406 # quit with the error message just below.
3410 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
3414 } # End of while loop
3416 if (main::DEBUG && $to_trace) {
3417 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
3418 trace "i= [ $i ]", $r->[$i];
3419 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
3422 # Here we have found the offset. Cache it as a starting point for the
3424 $_search_ranges_cache{$addr} = $i;
3429 # Add, replace or delete ranges to or from a list. The $type
3430 # parameter gives which:
3431 # '+' => insert or replace a range, returning a list of any changed
3433 # '-' => delete a range, returning a list of any deleted ranges.
3435 # The next three parameters give respectively the start, end, and
3436 # value associated with the range. 'value' should be null unless the
3439 # The range list is kept sorted so that the range with the lowest
3440 # starting position is first in the list, and generally, adjacent
3441 # ranges with the same values are merged into a single larger one (see
3442 # exceptions below).
3444 # There are more parameters; all are key => value pairs:
3445 # Type gives the type of the value. It is only valid for '+'.
3446 # All ranges have types; if this parameter is omitted, 0 is
3447 # assumed. Ranges with type 0 are assumed to obey the
3448 # Unicode rules for casing, etc; ranges with other types are
3449 # not. Otherwise, the type is arbitrary, for the caller's
3450 # convenience, and looked at only by this routine to keep
3451 # adjacent ranges of different types from being merged into
3452 # a single larger range, and when Replace =>
3453 # $IF_NOT_EQUIVALENT is specified (see just below).
3454 # Replace determines what to do if the range list already contains
3455 # ranges which coincide with all or portions of the input
3456 # range. It is only valid for '+':
3457 # => $NO means that the new value is not to replace
3458 # any existing ones, but any empty gaps of the
3459 # range list coinciding with the input range
3460 # will be filled in with the new value.
3461 # => $UNCONDITIONALLY means to replace the existing values with
3462 # this one unconditionally. However, if the
3463 # new and old values are identical, the
3464 # replacement is skipped to save cycles
3465 # => $IF_NOT_EQUIVALENT means to replace the existing values
3466 # (the default) with this one if they are not equivalent.
3467 # Ranges are equivalent if their types are the
3468 # same, and they are the same string; or if
3469 # both are type 0 ranges, if their Unicode
3470 # standard forms are identical. In this last
3471 # case, the routine chooses the more "modern"
3472 # one to use. This is because some of the
3473 # older files are formatted with values that
3474 # are, for example, ALL CAPs, whereas the
3475 # derived files have a more modern style,
3476 # which looks better. By looking for this
3477 # style when the pre-existing and replacement
3478 # standard forms are the same, we can move to
3480 # => $MULTIPLE_BEFORE means that if this range duplicates an
3481 # existing one, but has a different value,
3482 # don't replace the existing one, but insert
3483 # this, one so that the same range can occur
3484 # multiple times. They are stored LIFO, so
3485 # that the final one inserted is the first one
3486 # returned in an ordered search of the table.
3487 # If this is an exact duplicate, including the
3488 # value, the original will be moved to be
3489 # first, before any other duplicate ranges
3490 # with different values.
3491 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
3492 # FIFO, so that this one is inserted after all
3493 # others that currently exist. If this is an
3494 # exact duplicate, including value, of an
3495 # existing range, this one is discarded
3496 # (leaving the existing one in its original,
3497 # higher priority position
3498 # => anything else is the same as => $IF_NOT_EQUIVALENT
3500 # "same value" means identical for non-type-0 ranges, and it means
3501 # having the same standard forms for type-0 ranges.
3503 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
3506 my $operation = shift; # '+' for add/replace; '-' for delete;
3513 $value = "" if not defined $value; # warning: $value can be "0"
3515 my $replace = delete $args{'Replace'};
3516 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
3518 my $type = delete $args{'Type'};
3519 $type = 0 unless defined $type;
3521 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3523 my $addr = do { no overloading; pack 'J', $self; };
3525 if ($operation ne '+' && $operation ne '-') {
3526 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
3529 unless (defined $start && defined $end) {
3530 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
3533 unless ($end >= $start) {
3534 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
3537 if ($end > $MAX_UNICODE_CODEPOINT && $operation eq '+') {
3538 Carp::my_carp("$owner_name_of{$addr}Warning: Range '" . sprintf("%04X..%04X", $start, $end) . ") is above the Unicode maximum of " . sprintf("%04X", $MAX_UNICODE_CODEPOINT) . ". Adding it anyway");
3540 #local $to_trace = 1 if main::DEBUG;
3542 if ($operation eq '-') {
3543 if ($replace != $IF_NOT_EQUIVALENT) {
3544 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
3545 $replace = $IF_NOT_EQUIVALENT;
3548 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
3552 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
3557 my $r = $ranges{$addr}; # The current list of ranges
3558 my $range_list_size = scalar @$r; # And its size
3559 my $max = $max{$addr}; # The current high code point in
3560 # the list of ranges
3562 # Do a special case requiring fewer machine cycles when the new range
3563 # starts after the current highest point. The Unicode input data is
3564 # structured so this is common.
3565 if ($start > $max) {
3567 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
3568 return if $operation eq '-'; # Deleting a non-existing range is a
3571 # If the new range doesn't logically extend the current final one
3572 # in the range list, create a new range at the end of the range
3573 # list. (max cleverly is initialized to a negative number not
3574 # adjacent to 0 if the range list is empty, so even adding a range
3575 # to an empty range list starting at 0 will have this 'if'
3577 if ($start > $max + 1 # non-adjacent means can't extend.
3578 || @{$r}[-1]->value ne $value # values differ, can't extend.
3579 || @{$r}[-1]->type != $type # types differ, can't extend.
3581 push @$r, Range->new($start, $end,
3587 # Here, the new range starts just after the current highest in
3588 # the range list, and they have the same type and value.
3589 # Extend the current range to incorporate the new one.
3590 @{$r}[-1]->set_end($end);
3593 # This becomes the new maximum.
3598 #local $to_trace = 0 if main::DEBUG;
3600 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
3602 # Here, the input range isn't after the whole rest of the range list.
3603 # Most likely 'splice' will be needed. The rest of the routine finds
3604 # the needed splice parameters, and if necessary, does the splice.
3605 # First, find the offset parameter needed by the splice function for
3606 # the input range. Note that the input range may span multiple
3607 # existing ones, but we'll worry about that later. For now, just find
3608 # the beginning. If the input range is to be inserted starting in a
3609 # position not currently in the range list, it must (obviously) come
3610 # just after the range below it, and just before the range above it.
3611 # Slightly less obviously, it will occupy the position currently
3612 # occupied by the range that is to come after it. More formally, we
3613 # are looking for the position, $i, in the array of ranges, such that:
3615 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
3617 # (The ordered relationships within existing ranges are also shown in
3618 # the equation above). However, if the start of the input range is
3619 # within an existing range, the splice offset should point to that
3620 # existing range's position in the list; that is $i satisfies a
3621 # somewhat different equation, namely:
3623 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
3625 # More briefly, $start can come before or after r[$i]->start, and at
3626 # this point, we don't know which it will be. However, these
3627 # two equations share these constraints:
3629 # r[$i-1]->end < $start <= r[$i]->end
3631 # And that is good enough to find $i.
3633 my $i = $self->_search_ranges($start);
3635 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
3639 # The search function returns $i such that:
3641 # r[$i-1]->end < $start <= r[$i]->end
3643 # That means that $i points to the first range in the range list
3644 # that could possibly be affected by this operation. We still don't
3645 # know if the start of the input range is within r[$i], or if it
3646 # points to empty space between r[$i-1] and r[$i].
3647 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
3649 # Special case the insertion of data that is not to replace any
3651 if ($replace == $NO) { # If $NO, has to be operation '+'
3652 #local $to_trace = 1 if main::DEBUG;
3653 trace "Doesn't replace" if main::DEBUG && $to_trace;
3655 # Here, the new range is to take effect only on those code points
3656 # that aren't already in an existing range. This can be done by
3657 # looking through the existing range list and finding the gaps in
3658 # the ranges that this new range affects, and then calling this
3659 # function recursively on each of those gaps, leaving untouched
3660 # anything already in the list. Gather up a list of the changed
3661 # gaps first so that changes to the internal state as new ranges
3662 # are added won't be a problem.
3665 # First, if the starting point of the input range is outside an
3666 # existing one, there is a gap from there to the beginning of the
3667 # existing range -- add a span to fill the part that this new
3669 if ($start < $r->[$i]->start) {
3670 push @gap_list, Range->new($start,
3672 $r->[$i]->start - 1),
3674 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
3677 # Then look through the range list for other gaps until we reach
3678 # the highest range affected by the input one.
3680 for ($j = $i+1; $j < $range_list_size; $j++) {
3681 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
3682 last if $end < $r->[$j]->start;
3684 # If there is a gap between when this range starts and the
3685 # previous one ends, add a span to fill it. Note that just
3686 # because there are two ranges doesn't mean there is a
3687 # non-zero gap between them. It could be that they have
3688 # different values or types
3689 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
3691 Range->new($r->[$j-1]->end + 1,
3692 $r->[$j]->start - 1,
3694 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
3698 # Here, we have either found an existing range in the range list,
3699 # beyond the area affected by the input one, or we fell off the
3700 # end of the loop because the input range affects the whole rest
3701 # of the range list. In either case, $j is 1 higher than the
3702 # highest affected range. If $j == $i, it means that there are no
3703 # affected ranges, that the entire insertion is in the gap between
3704 # r[$i-1], and r[$i], which we already have taken care of before
3706 # On the other hand, if there are affected ranges, it might be
3707 # that there is a gap that needs filling after the final such
3708 # range to the end of the input range
3709 if ($r->[$j-1]->end < $end) {
3710 push @gap_list, Range->new(main::max($start,
3711 $r->[$j-1]->end + 1),
3714 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
3717 # Call recursively to fill in all the gaps.
3718 foreach my $gap (@gap_list) {
3719 $self->_add_delete($operation,
3729 # Here, we have taken care of the case where $replace is $NO.
3730 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
3731 # If inserting a multiple record, this is where it goes, before the
3732 # first (if any) existing one if inserting LIFO. (If this is to go
3733 # afterwards, FIFO, we below move the pointer to there.) These imply
3734 # an insertion, and no change to any existing ranges. Note that $i
3735 # can be -1 if this new range doesn't actually duplicate any existing,
3736 # and comes at the beginning of the list.
3737 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
3739 if ($start != $end) {
3740 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
3744 # If the new code point is within a current range ...
3745 if ($end >= $r->[$i]->start) {
3747 # Don't add an exact duplicate, as it isn't really a multiple
3748 my $existing_value = $r->[$i]->value;
3749 my $existing_type = $r->[$i]->type;
3750 return if $value eq $existing_value && $type eq $existing_type;
3752 # If the multiple value is part of an existing range, we want
3753 # to split up that range, so that only the single code point
3754 # is affected. To do this, we first call ourselves
3755 # recursively to delete that code point from the table, having
3756 # preserved its current data above. Then we call ourselves
3757 # recursively again to add the new multiple, which we know by
3758 # the test just above is different than the current code
3759 # point's value, so it will become a range containing a single
3760 # code point: just itself. Finally, we add back in the
3761 # pre-existing code point, which will again be a single code
3762 # point range. Because 'i' likely will have changed as a
3763 # result of these operations, we can't just continue on, but
3764 # do this operation recursively as well. If we are inserting
3765 # LIFO, the pre-existing code point needs to go after the new
3766 # one, so use MULTIPLE_AFTER; and vice versa.
3767 if ($r->[$i]->start != $r->[$i]->end) {
3768 $self->_add_delete('-', $start, $end, "");
3769 $self->_add_delete('+', $start, $end, $value, Type => $type);
3770 return $self->_add_delete('+',
3773 Type => $existing_type,
3774 Replace => ($replace == $MULTIPLE_BEFORE)
3776 : $MULTIPLE_BEFORE);
3780 # If to place this new record after, move to beyond all existing
3781 # ones; but don't add this one if identical to any of them, as it
3782 # isn't really a multiple. This leaves the original order, so
3783 # that the current request is ignored. The reasoning is that the
3784 # previous request that wanted this record to have high priority
3785 # should have precedence.
3786 if ($replace == $MULTIPLE_AFTER) {
3787 while ($i < @$r && $r->[$i]->start == $start) {
3788 return if $value eq $r->[$i]->value
3789 && $type eq $r->[$i]->type;
3794 # If instead we are to place this new record before any
3795 # existing ones, remove any identical ones that come after it.
3796 # This changes the existing order so that the new one is
3797 # first, as is being requested.
3798 for (my $j = $i + 1;
3799 $j < @$r && $r->[$j]->start == $start;
3802 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
3804 last; # There should only be one instance, so no
3805 # need to keep looking
3810 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
3811 my @return = splice @$r,
3818 if (main::DEBUG && $to_trace) {
3819 trace "After splice:";
3820 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
3821 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
3822 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
3823 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
3824 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
3825 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
3830 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
3831 # leaves delete, insert, and replace either unconditionally or if not
3832 # equivalent. $i still points to the first potential affected range.
3833 # Now find the highest range affected, which will determine the length
3834 # parameter to splice. (The input range can span multiple existing
3835 # ones.) If this isn't a deletion, while we are looking through the
3836 # range list, see also if this is a replacement rather than a clean
3837 # insertion; that is if it will change the values of at least one
3838 # existing range. Start off assuming it is an insert, until find it
3840 my $clean_insert = $operation eq '+';
3841 my $j; # This will point to the highest affected range
3843 # For non-zero types, the standard form is the value itself;
3844 my $standard_form = ($type) ? $value : main::standardize($value);
3846 for ($j = $i; $j < $range_list_size; $j++) {
3847 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
3849 # If find a range that it doesn't overlap into, we can stop
3851 last if $end < $r->[$j]->start;
3853 # Here, overlaps the range at $j. If the values don't match,
3854 # and so far we think this is a clean insertion, it becomes a
3855 # non-clean insertion, i.e., a 'change' or 'replace' instead.
3856 if ($clean_insert) {
3857 if ($r->[$j]->standard_form ne $standard_form) {
3859 if ($replace == $CROAK) {
3860 main::croak("The range to add "
3861 . sprintf("%04X", $start)
3863 . sprintf("%04X", $end)
3864 . " with value '$value' overlaps an existing range $r->[$j]");
3869 # Here, the two values are essentially the same. If the
3870 # two are actually identical, replacing wouldn't change
3871 # anything so skip it.
3872 my $pre_existing = $r->[$j]->value;
3873 if ($pre_existing ne $value) {
3875 # Here the new and old standardized values are the
3876 # same, but the non-standardized values aren't. If
3877 # replacing unconditionally, then replace
3878 if( $replace == $UNCONDITIONALLY) {
3883 # Here, are replacing conditionally. Decide to
3884 # replace or not based on which appears to look
3885 # the "nicest". If one is mixed case and the
3886 # other isn't, choose the mixed case one.
3887 my $new_mixed = $value =~ /[A-Z]/
3888 && $value =~ /[a-z]/;
3889 my $old_mixed = $pre_existing =~ /[A-Z]/
3890 && $pre_existing =~ /[a-z]/;
3892 if ($old_mixed != $new_mixed) {
3893 $clean_insert = 0 if $new_mixed;
3894 if (main::DEBUG && $to_trace) {
3895 if ($clean_insert) {
3896 trace "Retaining $pre_existing over $value";
3899 trace "Replacing $pre_existing with $value";
3905 # Here casing wasn't different between the two.
3906 # If one has hyphens or underscores and the
3907 # other doesn't, choose the one with the
3909 my $new_punct = $value =~ /[-_]/;
3910 my $old_punct = $pre_existing =~ /[-_]/;
3912 if ($old_punct != $new_punct) {
3913 $clean_insert = 0 if $new_punct;
3914 if (main::DEBUG && $to_trace) {
3915 if ($clean_insert) {
3916 trace "Retaining $pre_existing over $value";
3919 trace "Replacing $pre_existing with $value";
3922 } # else existing one is just as "good";
3923 # retain it to save cycles.
3929 } # End of loop looking for highest affected range.
3931 # Here, $j points to one beyond the highest range that this insertion
3932 # affects (hence to beyond the range list if that range is the final
3933 # one in the range list).
3935 # The splice length is all the affected ranges. Get it before
3936 # subtracting, for efficiency, so we don't have to later add 1.
3937 my $length = $j - $i;
3939 $j--; # $j now points to the highest affected range.
3940 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
3942 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
3943 # $j points to the highest affected range. But it can be < $i or even
3944 # -1. These happen only if the insertion is entirely in the gap
3945 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
3946 # above exited first time through with $end < $r->[$i]->start. (And
3947 # then we subtracted one from j) This implies also that $start <
3948 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
3949 # $start, so the entire input range is in the gap.
3952 # Here the entire input range is in the gap before $i.
3954 if (main::DEBUG && $to_trace) {
3956 trace "Entire range is between $r->[$i-1] and $r->[$i]";
3959 trace "Entire range is before $r->[$i]";
3962 return if $operation ne '+'; # Deletion of a non-existent range is
3967 # Here part of the input range is not in the gap before $i. Thus,
3968 # there is at least one affected one, and $j points to the highest
3971 # At this point, here is the situation:
3972 # This is not an insertion of a multiple, nor of tentative ($NO)
3974 # $i points to the first element in the current range list that
3975 # may be affected by this operation. In fact, we know
3976 # that the range at $i is affected because we are in
3977 # the else branch of this 'if'
3978 # $j points to the highest affected range.
3980 # r[$i-1]->end < $start <= r[$i]->end
3982 # r[$i-1]->end < $start <= $end <= r[$j]->end
3985 # $clean_insert is a boolean which is set true if and only if
3986 # this is a "clean insertion", i.e., not a change nor a
3987 # deletion (multiple was handled above).
3989 # We now have enough information to decide if this call is a no-op
3990 # or not. It is a no-op if this is an insertion of already
3993 if (main::DEBUG && $to_trace && $clean_insert
3995 && $start >= $r->[$i]->start)
3999 return if $clean_insert
4000 && $i == $j # more than one affected range => not no-op
4002 # Here, r[$i-1]->end < $start <= $end <= r[$i]->end
4003 # Further, $start and/or $end is >= r[$i]->start
4004 # The test below hence guarantees that
4005 # r[$i]->start < $start <= $end <= r[$i]->end
4006 # This means the input range is contained entirely in
4007 # the one at $i, so is a no-op
4008 && $start >= $r->[$i]->start;
4011 # Here, we know that some action will have to be taken. We have
4012 # calculated the offset and length (though adjustments may be needed)
4013 # for the splice. Now start constructing the replacement list.
4015 my $splice_start = $i;
4020 # See if should extend any adjacent ranges.
4021 if ($operation eq '-') { # Don't extend deletions
4022 $extends_below = $extends_above = 0;
4024 else { # Here, should extend any adjacent ranges. See if there are
4026 $extends_below = ($i > 0
4027 # can't extend unless adjacent
4028 && $r->[$i-1]->end == $start -1
4029 # can't extend unless are same standard value
4030 && $r->[$i-1]->standard_form eq $standard_form
4031 # can't extend unless share type
4032 && $r->[$i-1]->type == $type);
4033 $extends_above = ($j+1 < $range_list_size
4034 && $r->[$j+1]->start == $end +1
4035 && $r->[$j+1]->standard_form eq $standard_form
4036 && $r->[$j+1]->type == $type);
4038 if ($extends_below && $extends_above) { # Adds to both
4039 $splice_start--; # start replace at element below
4040 $length += 2; # will replace on both sides
4041 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4043 # The result will fill in any gap, replacing both sides, and
4044 # create one large range.
4045 @replacement = Range->new($r->[$i-1]->start,
4052 # Here we know that the result won't just be the conglomeration of
4053 # a new range with both its adjacent neighbors. But it could
4054 # extend one of them.
4056 if ($extends_below) {
4058 # Here the new element adds to the one below, but not to the
4059 # one above. If inserting, and only to that one range, can
4060 # just change its ending to include the new one.
4061 if ($length == 0 && $clean_insert) {
4062 $r->[$i-1]->set_end($end);
4063 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4067 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4068 $splice_start--; # start replace at element below
4069 $length++; # will replace the element below
4070 $start = $r->[$i-1]->start;
4073 elsif ($extends_above) {
4075 # Here the new element adds to the one above, but not below.
4076 # Mirror the code above
4077 if ($length == 0 && $clean_insert) {
4078 $r->[$j+1]->set_start($start);
4079 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4083 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4084 $length++; # will replace the element above
4085 $end = $r->[$j+1]->end;
4089 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4091 # Finally, here we know there will have to be a splice.
4092 # If the change or delete affects only the highest portion of the
4093 # first affected range, the range will have to be split. The
4094 # splice will remove the whole range, but will replace it by a new
4095 # range containing just the unaffected part. So, in this case,
4096 # add to the replacement list just this unaffected portion.
4097 if (! $extends_below
4098 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4101 Range->new($r->[$i]->start,
4103 Value => $r->[$i]->value,
4104 Type => $r->[$i]->type);
4107 # In the case of an insert or change, but not a delete, we have to
4108 # put in the new stuff; this comes next.
4109 if ($operation eq '+') {
4110 push @replacement, Range->new($start,
4116 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4117 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4119 # And finally, if we're changing or deleting only a portion of the
4120 # highest affected range, it must be split, as the lowest one was.
4121 if (! $extends_above
4122 && $j >= 0 # Remember that j can be -1 if before first
4124 && $end >= $r->[$j]->start
4125 && $end < $r->[$j]->end)
4128 Range->new($end + 1,
4130 Value => $r->[$j]->value,
4131 Type => $r->[$j]->type);
4135 # And do the splice, as calculated above
4136 if (main::DEBUG && $to_trace) {
4137 trace "replacing $length element(s) at $i with ";
4138 foreach my $replacement (@replacement) {
4139 trace " $replacement";
4141 trace "Before splice:";
4142 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4143 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4144 trace "i =[", $i, "]", $r->[$i];
4145 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4146 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4149 my @return = splice @$r, $splice_start, $length, @replacement;
4151 if (main::DEBUG && $to_trace) {
4152 trace "After splice:";
4153 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4154 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4155 trace "i =[", $i, "]", $r->[$i];
4156 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4157 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4158 trace "removed ", @return if @return;
4161 # An actual deletion could have changed the maximum in the list.
4162 # There was no deletion if the splice didn't return something, but
4163 # otherwise recalculate it. This is done too rarely to worry about
4165 if ($operation eq '-' && @return) {
4167 $max{$addr} = $r->[-1]->end;
4170 $max{$addr} = $max_init;
4176 sub reset_each_range { # reset the iterator for each_range();
4178 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4181 undef $each_range_iterator{pack 'J', $self};
4186 # Iterate over each range in a range list. Results are undefined if
4187 # the range list is changed during the iteration.
4190 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4192 my $addr = do { no overloading; pack 'J', $self; };
4194 return if $self->is_empty;
4196 $each_range_iterator{$addr} = -1
4197 if ! defined $each_range_iterator{$addr};
4198 $each_range_iterator{$addr}++;
4199 return $ranges{$addr}->[$each_range_iterator{$addr}]
4200 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4201 undef $each_range_iterator{$addr};
4205 sub count { # Returns count of code points in range list
4207 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4209 my $addr = do { no overloading; pack 'J', $self; };
4212 foreach my $range (@{$ranges{$addr}}) {
4213 $count += $range->end - $range->start + 1;
4218 sub delete_range { # Delete a range
4223 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4225 return $self->_add_delete('-', $start, $end, "");
4228 sub is_empty { # Returns boolean as to if a range list is empty
4230 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4233 return scalar @{$ranges{pack 'J', $self}} == 0;
4237 # Quickly returns a scalar suitable for separating tables into
4238 # buckets, i.e. it is a hash function of the contents of a table, so
4239 # there are relatively few conflicts.
4242 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4244 my $addr = do { no overloading; pack 'J', $self; };
4246 # These are quickly computable. Return looks like 'min..max;count'
4247 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4249 } # End closure for _Range_List_Base
4252 use base '_Range_List_Base';
4254 # A Range_List is a range list for match tables; i.e. the range values are
4255 # not significant. Thus a number of operations can be safely added to it,
4256 # such as inversion, intersection. Note that union is also an unsafe
4257 # operation when range values are cared about, and that method is in the base
4258 # class, not here. But things are set up so that that method is callable only
4259 # during initialization. Only in this derived class, is there an operation
4260 # that combines two tables. A Range_Map can thus be used to initialize a
4261 # Range_List, and its mappings will be in the list, but are not significant to
4264 sub trace { return main::trace(@_); }
4270 '+' => sub { my $self = shift;
4273 return $self->_union($other)
4275 '+=' => sub { my $self = shift;
4277 my $reversed = shift;
4280 Carp::my_carp_bug("Bad news. Can't cope with '"
4284 . "'. undef returned.");
4288 return $self->_union($other)
4290 '&' => sub { my $self = shift;
4293 return $self->_intersect($other, 0);
4295 '&=' => sub { my $self = shift;
4297 my $reversed = shift;
4300 Carp::my_carp_bug("Bad news. Can't cope with '"
4304 . "'. undef returned.");
4308 return $self->_intersect($other, 0);
4315 # Returns a new Range_List that gives all code points not in $self.
4319 my $new = Range_List->new;
4321 # Go through each range in the table, finding the gaps between them
4322 my $max = -1; # Set so no gap before range beginning at 0
4323 for my $range ($self->ranges) {
4324 my $start = $range->start;
4325 my $end = $range->end;
4327 # If there is a gap before this range, the inverse will contain
4329 if ($start > $max + 1) {
4330 $new->add_range($max + 1, $start - 1);
4335 # And finally, add the gap from the end of the table to the max
4336 # possible code point
4337 if ($max < $MAX_UNICODE_CODEPOINT) {
4338 $new->add_range($max + 1, $MAX_UNICODE_CODEPOINT);
4344 # Returns a new Range_List with the argument deleted from it. The
4345 # argument can be a single code point, a range, or something that has
4346 # a range, with the _range_list() method on it returning them
4350 my $reversed = shift;
4351 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4354 Carp::my_carp_bug("Bad news. Can't cope with '"
4358 . "'. undef returned.");
4362 my $new = Range_List->new(Initialize => $self);
4364 if (! ref $other) { # Single code point
4365 $new->delete_range($other, $other);
4367 elsif ($other->isa('Range')) {
4368 $new->delete_range($other->start, $other->end);
4370 elsif ($other->can('_range_list')) {
4371 foreach my $range ($other->_range_list->ranges) {
4372 $new->delete_range($range->start, $range->end);
4376 Carp::my_carp_bug("Can't cope with a "
4378 . " argument to '-'. Subtraction ignored."
4387 # Returns either a boolean giving whether the two inputs' range lists
4388 # intersect (overlap), or a new Range_List containing the intersection
4389 # of the two lists. The optional final parameter being true indicates
4390 # to do the check instead of the intersection.
4392 my $a_object = shift;
4393 my $b_object = shift;
4394 my $check_if_overlapping = shift;
4395 $check_if_overlapping = 0 unless defined $check_if_overlapping;
4396 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4398 if (! defined $b_object) {
4400 $message .= $a_object->_owner_name_of if defined $a_object;
4401 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4405 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4406 # Thus the intersection could be much more simply be written:
4407 # return ~(~$a_object + ~$b_object);
4408 # But, this is slower, and when taking the inverse of a large
4409 # range_size_1 table, back when such tables were always stored that
4410 # way, it became prohibitively slow, hence the code was changed to the
4413 if ($b_object->isa('Range')) {
4414 $b_object = Range_List->new(Initialize => $b_object,
4415 Owner => $a_object->_owner_name_of);
4417 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4419 my @a_ranges = $a_object->ranges;
4420 my @b_ranges = $b_object->ranges;
4422 #local $to_trace = 1 if main::DEBUG;
4423 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4425 # Start with the first range in each list
4427 my $range_a = $a_ranges[$a_i];
4429 my $range_b = $b_ranges[$b_i];
4431 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4432 if ! $check_if_overlapping;
4434 # If either list is empty, there is no intersection and no overlap
4435 if (! defined $range_a || ! defined $range_b) {
4436 return $check_if_overlapping ? 0 : $new;
4438 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4440 # Otherwise, must calculate the intersection/overlap. Start with the
4441 # very first code point in each list
4442 my $a = $range_a->start;
4443 my $b = $range_b->start;
4445 # Loop through all the ranges of each list; in each iteration, $a and
4446 # $b are the current code points in their respective lists
4449 # If $a and $b are the same code point, ...
4452 # it means the lists overlap. If just checking for overlap
4453 # know the answer now,
4454 return 1 if $check_if_overlapping;
4456 # The intersection includes this code point plus anything else
4457 # common to both current ranges.
4459 my $end = main::min($range_a->end, $range_b->end);
4460 if (! $check_if_overlapping) {
4461 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
4462 $new->add_range($start, $end);
4465 # Skip ahead to the end of the current intersect
4468 # If the current intersect ends at the end of either range (as
4469 # it must for at least one of them), the next possible one
4470 # will be the beginning code point in it's list's next range.
4471 if ($a == $range_a->end) {
4472 $range_a = $a_ranges[++$a_i];
4473 last unless defined $range_a;
4474 $a = $range_a->start;
4476 if ($b == $range_b->end) {
4477 $range_b = $b_ranges[++$b_i];
4478 last unless defined $range_b;
4479 $b = $range_b->start;
4482 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4486 # Not equal, but if the range containing $a encompasses $b,
4487 # change $a to be the middle of the range where it does equal
4488 # $b, so the next iteration will get the intersection
4489 if ($range_a->end >= $b) {
4494 # Here, the current range containing $a is entirely below
4495 # $b. Go try to find a range that could contain $b.
4496 $a_i = $a_object->_search_ranges($b);
4498 # If no range found, quit.
4499 last unless defined $a_i;
4501 # The search returns $a_i, such that
4502 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
4503 # Set $a to the beginning of this new range, and repeat.
4504 $range_a = $a_ranges[$a_i];
4505 $a = $range_a->start;
4508 else { # Here, $b < $a.
4510 # Mirror image code to the leg just above
4511 if ($range_b->end >= $a) {
4515 $b_i = $b_object->_search_ranges($a);
4516 last unless defined $b_i;
4517 $range_b = $b_ranges[$b_i];
4518 $b = $range_b->start;
4521 } # End of looping through ranges.
4523 # Intersection fully computed, or now know that there is no overlap
4524 return $check_if_overlapping ? 0 : $new;
4528 # Returns boolean giving whether the two arguments overlap somewhere
4532 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4534 return $self->_intersect($other, 1);
4538 # Add a range to the list.
4543 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4545 return $self->_add_delete('+', $start, $end, "");
4548 sub matches_identically_to {
4549 # Return a boolean as to whether or not two Range_Lists match identical
4550 # sets of code points.
4554 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4556 # These are ordered in increasing real time to figure out (at least
4557 # until a patch changes that and doesn't change this)
4558 return 0 if $self->max != $other->max;
4559 return 0 if $self->min != $other->min;
4560 return 0 if $self->range_count != $other->range_count;
4561 return 0 if $self->count != $other->count;
4563 # Here they could be identical because all the tests above passed.
4564 # The loop below is somewhat simpler since we know they have the same
4565 # number of elements. Compare range by range, until reach the end or
4566 # find something that differs.
4567 my @a_ranges = $self->ranges;
4568 my @b_ranges = $other->ranges;
4569 for my $i (0 .. @a_ranges - 1) {
4570 my $a = $a_ranges[$i];
4571 my $b = $b_ranges[$i];
4572 trace "self $a; other $b" if main::DEBUG && $to_trace;
4573 return 0 if ! defined $b
4574 || $a->start != $b->start
4575 || $a->end != $b->end;
4580 sub is_code_point_usable {
4581 # This used only for making the test script. See if the input
4582 # proposed trial code point is one that Perl will handle. If second
4583 # parameter is 0, it won't select some code points for various
4584 # reasons, noted below.
4587 my $try_hard = shift;
4588 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4590 return 0 if $code < 0; # Never use a negative
4592 # shun null. I'm (khw) not sure why this was done, but NULL would be
4593 # the character very frequently used.
4594 return $try_hard if $code == 0x0000;
4596 # shun non-character code points.
4597 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
4598 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
4600 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
4601 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
4606 sub get_valid_code_point {
4607 # Return a code point that's part of the range list. Returns nothing
4608 # if the table is empty or we can't find a suitable code point. This
4609 # used only for making the test script.
4612 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4614 my $addr = do { no overloading; pack 'J', $self; };
4616 # On first pass, don't choose less desirable code points; if no good
4617 # one is found, repeat, allowing a less desirable one to be selected.
4618 for my $try_hard (0, 1) {
4620 # Look through all the ranges for a usable code point.
4621 for my $set (reverse $self->ranges) {
4623 # Try the edge cases first, starting with the end point of the
4625 my $end = $set->end;
4626 return $end if is_code_point_usable($end, $try_hard);
4628 # End point didn't, work. Start at the beginning and try
4629 # every one until find one that does work.
4630 for my $trial ($set->start .. $end - 1) {
4631 return $trial if is_code_point_usable($trial, $try_hard);
4635 return (); # If none found, give up.
4638 sub get_invalid_code_point {
4639 # Return a code point that's not part of the table. Returns nothing
4640 # if the table covers all code points or a suitable code point can't
4641 # be found. This used only for making the test script.
4644 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4646 # Just find a valid code point of the inverse, if any.
4647 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
4649 } # end closure for Range_List
4652 use base '_Range_List_Base';
4654 # A Range_Map is a range list in which the range values (called maps) are
4655 # significant, and hence shouldn't be manipulated by our other code, which
4656 # could be ambiguous or lose things. For example, in taking the union of two
4657 # lists, which share code points, but which have differing values, which one
4658 # has precedence in the union?
4659 # It turns out that these operations aren't really necessary for map tables,
4660 # and so this class was created to make sure they aren't accidentally
4666 # Add a range containing a mapping value to the list
4669 # Rest of parameters passed on
4671 return $self->_add_delete('+', @_);
4675 # Adds entry to a range list which can duplicate an existing entry
4678 my $code_point = shift;
4681 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
4682 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4684 return $self->add_map($code_point, $code_point,
4685 $value, Replace => $replace);
4687 } # End of closure for package Range_Map
4689 package _Base_Table;
4691 # A table is the basic data structure that gets written out into a file for
4692 # use by the Perl core. This is the abstract base class implementing the
4693 # common elements from the derived ones. A list of the methods to be
4694 # furnished by an implementing class is just after the constructor.
4696 sub standardize { return main::standardize($_[0]); }
4697 sub trace { return main::trace(@_); }
4701 main::setup_package();
4704 # Object containing the ranges of the table.
4705 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
4708 # The full table name.
4709 main::set_access('full_name', \%full_name, 'r');
4712 # The table name, almost always shorter
4713 main::set_access('name', \%name, 'r');
4716 # The shortest of all the aliases for this table, with underscores removed
4717 main::set_access('short_name', \%short_name);
4719 my %nominal_short_name_length;
4720 # The length of short_name before removing underscores
4721 main::set_access('nominal_short_name_length',
4722 \%nominal_short_name_length);
4725 # The complete name, including property.
4726 main::set_access('complete_name', \%complete_name, 'r');
4729 # Parent property this table is attached to.
4730 main::set_access('property', \%property, 'r');
4733 # Ordered list of alias objects of the table's name. The first ones in
4734 # the list are output first in comments
4735 main::set_access('aliases', \%aliases, 'readable_array');
4738 # A comment associated with the table for human readers of the files
4739 main::set_access('comment', \%comment, 's');
4742 # A comment giving a short description of the table's meaning for human
4743 # readers of the files.
4744 main::set_access('description', \%description, 'readable_array');
4747 # A comment giving a short note about the table for human readers of the
4749 main::set_access('note', \%note, 'readable_array');
4752 # Enum; there are a number of possibilities for what happens to this
4753 # table: it could be normal, or suppressed, or not for external use. See
4754 # values at definition for $SUPPRESSED.
4755 main::set_access('fate', \%fate, 'r');
4757 my %find_table_from_alias;
4758 # The parent property passes this pointer to a hash which this class adds
4759 # all its aliases to, so that the parent can quickly take an alias and
4761 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
4764 # After this table is made equivalent to another one; we shouldn't go
4765 # changing the contents because that could mean it's no longer equivalent
4766 main::set_access('locked', \%locked, 'r');
4769 # This gives the final path to the file containing the table. Each
4770 # directory in the path is an element in the array
4771 main::set_access('file_path', \%file_path, 'readable_array');
4774 # What is the table's status, normal, $OBSOLETE, etc. Enum
4775 main::set_access('status', \%status, 'r');
4778 # A comment about its being obsolete, or whatever non normal status it has
4779 main::set_access('status_info', \%status_info, 'r');
4781 my %caseless_equivalent;
4782 # The table this is equivalent to under /i matching, if any.
4783 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
4786 # Is the table to be output with each range only a single code point?
4787 # This is done to avoid breaking existing code that may have come to rely
4788 # on this behavior in previous versions of this program.)
4789 main::set_access('range_size_1', \%range_size_1, 'r', 's');
4792 # A boolean set iff this table is a Perl extension to the Unicode
4794 main::set_access('perl_extension', \%perl_extension, 'r');
4796 my %output_range_counts;
4797 # A boolean set iff this table is to have comments written in the
4798 # output file that contain the number of code points in the range.
4799 # The constructor can override the global flag of the same name.
4800 main::set_access('output_range_counts', \%output_range_counts, 'r');
4803 # The format of the entries of the table. This is calculated from the
4804 # data in the table (or passed in the constructor). This is an enum e.g.,
4805 # $STRING_FORMAT. It is marked protected as it should not be generally
4806 # used to override calculations.
4807 main::set_access('format', \%format, 'r', 'p_s');
4810 # All arguments are key => value pairs, which you can see below, most
4811 # of which match fields documented above. Otherwise: Re_Pod_Entry,
4812 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
4813 # documented in the Alias package
4815 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
4819 my $self = bless \do { my $anonymous_scalar }, $class;
4820 my $addr = do { no overloading; pack 'J', $self; };
4824 $name{$addr} = delete $args{'Name'};
4825 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
4826 $full_name{$addr} = delete $args{'Full_Name'};
4827 my $complete_name = $complete_name{$addr}
4828 = delete $args{'Complete_Name'};
4829 $format{$addr} = delete $args{'Format'};
4830 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
4831 $property{$addr} = delete $args{'_Property'};
4832 $range_list{$addr} = delete $args{'_Range_List'};
4833 $status{$addr} = delete $args{'Status'} || $NORMAL;
4834 $status_info{$addr} = delete $args{'_Status_Info'} || "";
4835 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
4836 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
4837 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
4838 my $ucd = delete $args{'UCD'};
4840 my $description = delete $args{'Description'};
4841 my $ok_as_filename = delete $args{'OK_as_Filename'};
4842 my $loose_match = delete $args{'Fuzzy'};
4843 my $note = delete $args{'Note'};
4844 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
4845 my $perl_extension = delete $args{'Perl_Extension'};
4847 # Shouldn't have any left over
4848 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4850 # Can't use || above because conceivably the name could be 0, and
4851 # can't use // operator in case this program gets used in Perl 5.8
4852 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
4853 $output_range_counts{$addr} = $output_range_counts if
4854 ! defined $output_range_counts{$addr};
4856 $aliases{$addr} = [ ];
4857 $comment{$addr} = [ ];
4858 $description{$addr} = [ ];
4860 $file_path{$addr} = [ ];
4861 $locked{$addr} = "";
4863 push @{$description{$addr}}, $description if $description;
4864 push @{$note{$addr}}, $note if $note;
4866 if ($fate{$addr} == $PLACEHOLDER) {
4868 # A placeholder table doesn't get documented, is a perl extension,
4869 # and quite likely will be empty
4870 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4871 $perl_extension = 1 if ! defined $perl_extension;
4872 $ucd = 0 if ! defined $ucd;
4873 push @tables_that_may_be_empty, $complete_name{$addr};
4874 $self->add_comment(<<END);
4875 This is a placeholder because it is not in Version $string_version of Unicode,
4876 but is needed by the Perl core to work gracefully. Because it is not in this
4877 version of Unicode, it will not be listed in $pod_file.pod
4880 elsif (exists $why_suppressed{$complete_name}
4881 # Don't suppress if overridden
4882 && ! grep { $_ eq $complete_name{$addr} }
4883 @output_mapped_properties)
4885 $fate{$addr} = $SUPPRESSED;
4887 elsif ($fate{$addr} == $SUPPRESSED
4888 && ! exists $why_suppressed{$property{$addr}->complete_name})
4890 Carp::my_carp_bug("There is no current capability to set the reason for suppressing.");
4891 # perhaps Fate => [ $SUPPRESSED, "reason" ]
4894 # If hasn't set its status already, see if it is on one of the
4895 # lists of properties or tables that have particular statuses; if
4896 # not, is normal. The lists are prioritized so the most serious
4897 # ones are checked first
4898 if (! $status{$addr}) {
4899 if (exists $why_deprecated{$complete_name}) {
4900 $status{$addr} = $DEPRECATED;
4902 elsif (exists $why_stabilized{$complete_name}) {
4903 $status{$addr} = $STABILIZED;
4905 elsif (exists $why_obsolete{$complete_name}) {
4906 $status{$addr} = $OBSOLETE;
4909 # Existence above doesn't necessarily mean there is a message
4910 # associated with it. Use the most serious message.
4911 if ($status{$addr}) {
4912 if ($why_deprecated{$complete_name}) {
4914 = $why_deprecated{$complete_name};
4916 elsif ($why_stabilized{$complete_name}) {
4918 = $why_stabilized{$complete_name};
4920 elsif ($why_obsolete{$complete_name}) {
4922 = $why_obsolete{$complete_name};
4927 $perl_extension{$addr} = $perl_extension || 0;
4929 # Don't list a property by default that is internal only
4930 if ($fate{$addr} > $MAP_PROXIED) {
4931 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
4932 $ucd = 0 if ! defined $ucd;
4935 $ucd = 1 if ! defined $ucd;
4938 # By convention what typically gets printed only or first is what's
4939 # first in the list, so put the full name there for good output
4940 # clarity. Other routines rely on the full name being first on the
4942 $self->add_alias($full_name{$addr},
4943 OK_as_Filename => $ok_as_filename,
4944 Fuzzy => $loose_match,
4945 Re_Pod_Entry => $make_re_pod_entry,
4946 Status => $status{$addr},
4950 # Then comes the other name, if meaningfully different.
4951 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
4952 $self->add_alias($name{$addr},
4953 OK_as_Filename => $ok_as_filename,
4954 Fuzzy => $loose_match,
4955 Re_Pod_Entry => $make_re_pod_entry,
4956 Status => $status{$addr},
4964 # Here are the methods that are required to be defined by any derived
4967 handle_special_range
4971 # write() knows how to write out normal ranges, but it calls
4972 # handle_special_range() when it encounters a non-normal one.
4973 # append_to_body() is called by it after it has handled all
4974 # ranges to add anything after the main portion of the table.
4975 # And finally, pre_body() is called after all this to build up
4976 # anything that should appear before the main portion of the
4977 # table. Doing it this way allows things in the middle to
4978 # affect what should appear before the main portion of the
4983 Carp::my_carp_bug( __LINE__
4984 . ": Must create method '$sub()' for "
4992 "." => \&main::_operator_dot,
4993 ".=" => \&main::_operator_dot_equal,
4994 '!=' => \&main::_operator_not_equal,
4995 '==' => \&main::_operator_equal,
4999 # Returns the array of ranges associated with this table.
5002 return $range_list{pack 'J', shift}->ranges;
5006 # Add a synonym for this table.
5008 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5011 my $name = shift; # The name to add.
5012 my $pointer = shift; # What the alias hash should point to. For
5013 # map tables, this is the parent property;
5014 # for match tables, it is the table itself.
5017 my $loose_match = delete $args{'Fuzzy'};
5019 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5020 $make_re_pod_entry = $YES unless defined $make_re_pod_entry;
5022 my $ok_as_filename = delete $args{'OK_as_Filename'};
5023 $ok_as_filename = 1 unless defined $ok_as_filename;
5025 my $status = delete $args{'Status'};
5026 $status = $NORMAL unless defined $status;
5028 # An internal name does not get documented, unless overridden by the
5030 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5032 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5034 # Capitalize the first letter of the alias unless it is one of the CJK
5035 # ones which specifically begins with a lower 'k'. Do this because
5036 # Unicode has varied whether they capitalize first letters or not, and
5037 # have later changed their minds and capitalized them, but not the
5038 # other way around. So do it always and avoid changes from release to
5040 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5042 my $addr = do { no overloading; pack 'J', $self; };
5044 # Figure out if should be loosely matched if not already specified.
5045 if (! defined $loose_match) {
5047 # Is a loose_match if isn't null, and doesn't begin with an
5048 # underscore and isn't just a number
5050 && substr($name, 0, 1) ne '_'
5051 && $name !~ qr{^[0-9_.+-/]+$})
5060 # If this alias has already been defined, do nothing.
5061 return if defined $find_table_from_alias{$addr}->{$name};
5063 # That includes if it is standardly equivalent to an existing alias,
5064 # in which case, add this name to the list, so won't have to search
5066 my $standard_name = main::standardize($name);
5067 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5068 $find_table_from_alias{$addr}->{$name}
5069 = $find_table_from_alias{$addr}->{$standard_name};
5073 # Set the index hash for this alias for future quick reference.
5074 $find_table_from_alias{$addr}->{$name} = $pointer;
5075 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5076 local $to_trace = 0 if main::DEBUG;
5077 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5078 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5081 # Put the new alias at the end of the list of aliases unless the final
5082 # element begins with an underscore (meaning it is for internal perl
5083 # use) or is all numeric, in which case, put the new one before that
5084 # one. This floats any all-numeric or underscore-beginning aliases to
5085 # the end. This is done so that they are listed last in output lists,
5086 # to encourage the user to use a better name (either more descriptive
5087 # or not an internal-only one) instead. This ordering is relied on
5088 # implicitly elsewhere in this program, like in short_name()
5089 my $list = $aliases{$addr};
5090 my $insert_position = (@$list == 0
5091 || (substr($list->[-1]->name, 0, 1) ne '_'
5092 && $list->[-1]->name =~ /\D/))
5098 Alias->new($name, $loose_match, $make_re_pod_entry,
5099 $ok_as_filename, $status, $ucd);
5101 # This name may be shorter than any existing ones, so clear the cache
5102 # of the shortest, so will have to be recalculated.
5104 undef $short_name{pack 'J', $self};
5109 # Returns a name suitable for use as the base part of a file name.
5110 # That is, shorter wins. It can return undef if there is no suitable
5111 # name. The name has all non-essential underscores removed.
5113 # The optional second parameter is a reference to a scalar in which
5114 # this routine will store the length the returned name had before the
5115 # underscores were removed, or undef if the return is undef.
5117 # The shortest name can change if new aliases are added. So using
5118 # this should be deferred until after all these are added. The code
5119 # that does that should clear this one's cache.
5120 # Any name with alphabetics is preferred over an all numeric one, even
5124 my $nominal_length_ptr = shift;
5125 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5127 my $addr = do { no overloading; pack 'J', $self; };
5129 # For efficiency, don't recalculate, but this means that adding new
5130 # aliases could change what the shortest is, so the code that does
5131 # that needs to undef this.
5132 if (defined $short_name{$addr}) {
5133 if ($nominal_length_ptr) {
5134 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5136 return $short_name{$addr};
5139 # Look at each alias
5140 foreach my $alias ($self->aliases()) {
5142 # Don't use an alias that isn't ok to use for an external name.
5143 next if ! $alias->ok_as_filename;
5145 my $name = main::Standardize($alias->name);
5146 trace $self, $name if main::DEBUG && $to_trace;
5148 # Take the first one, or a shorter one that isn't numeric. This
5149 # relies on numeric aliases always being last in the array
5150 # returned by aliases(). Any alpha one will have precedence.
5151 if (! defined $short_name{$addr}
5153 && length($name) < length($short_name{$addr})))
5155 # Remove interior underscores.
5156 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5158 $nominal_short_name_length{$addr} = length $name;
5162 # If the short name isn't a nice one, perhaps an equivalent table has
5164 if (! defined $short_name{$addr}
5165 || $short_name{$addr} eq ""
5166 || $short_name{$addr} eq "_")
5169 foreach my $follower ($self->children) { # All equivalents
5170 my $follower_name = $follower->short_name;
5171 next unless defined $follower_name;
5173 # Anything (except undefined) is better than underscore or
5175 if (! defined $return || $return eq "_") {
5176 $return = $follower_name;
5180 # If the new follower name isn't "_" and is shorter than the
5181 # current best one, prefer the new one.
5182 next if $follower_name eq "_";
5183 next if length $follower_name > length $return;
5184 $return = $follower_name;
5186 $short_name{$addr} = $return if defined $return;
5189 # If no suitable external name return undef
5190 if (! defined $short_name{$addr}) {
5191 $$nominal_length_ptr = undef if $nominal_length_ptr;
5195 # Don't allow a null short name.
5196 if ($short_name{$addr} eq "") {
5197 $short_name{$addr} = '_';
5198 $nominal_short_name_length{$addr} = 1;
5201 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5203 if ($nominal_length_ptr) {
5204 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5206 return $short_name{$addr};
5210 # Returns the external name that this table should be known by. This
5211 # is usually the short_name, but not if the short_name is undefined,
5212 # in which case the external_name is arbitrarily set to the
5216 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5218 my $short = $self->short_name;
5219 return $short if defined $short;
5224 sub add_description { # Adds the parameter as a short description.
5227 my $description = shift;
5229 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5232 push @{$description{pack 'J', $self}}, $description;
5237 sub add_note { # Adds the parameter as a short note.
5242 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5245 push @{$note{pack 'J', $self}}, $note;
5250 sub add_comment { # Adds the parameter as a comment.
5252 return unless $debugging_build;
5255 my $comment = shift;
5256 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5261 push @{$comment{pack 'J', $self}}, $comment;
5267 # Return the current comment for this table. If called in list
5268 # context, returns the array of comments. In scalar, returns a string
5269 # of each element joined together with a period ending each.
5272 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5274 my $addr = do { no overloading; pack 'J', $self; };
5275 my @list = @{$comment{$addr}};
5276 return @list if wantarray;
5278 foreach my $sentence (@list) {
5279 $return .= '. ' if $return;
5280 $return .= $sentence;
5283 $return .= '.' if $return;
5288 # Initialize the table with the argument which is any valid
5289 # initialization for range lists.
5292 my $addr = do { no overloading; pack 'J', $self; };
5293 my $initialization = shift;
5294 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5296 # Replace the current range list with a new one of the same exact
5298 my $class = ref $range_list{$addr};
5299 $range_list{$addr} = $class->new(Owner => $self,
5300 Initialize => $initialization);
5306 # The header that is output for the table in the file it is written
5310 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5313 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5319 # Write a representation of the table to its file. It calls several
5320 # functions furnished by sub-classes of this abstract base class to
5321 # handle non-normal ranges, to add stuff before the table, and at its
5322 # end. If the table is to be written so that adjustments are
5323 # required, this does that conversion.
5326 my $use_adjustments = shift; # ? output in adjusted format or not
5327 my $tab_stops = shift; # The number of tab stops over to put any
5329 my $suppress_value = shift; # Optional, if the value associated with
5330 # a range equals this one, don't write
5332 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5334 my $addr = do { no overloading; pack 'J', $self; };
5336 # Start with the header
5337 my @HEADER = $self->header;
5340 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5343 # Things discovered processing the main body of the document may
5344 # affect what gets output before it, therefore pre_body() isn't called
5345 # until after all other processing of the table is done.
5347 # The main body looks like a 'here' document. If annotating, get rid
5348 # of the comments before passing to the caller, as some callers, such
5349 # as charnames.pm, can't cope with them. (Outputting range counts
5350 # also introduces comments, but these don't show up in the tables that
5351 # can't cope with comments, and there aren't that many of them that
5352 # it's worth the extra real time to get rid of them).
5355 # Use the line below in Perls that don't have /r
5356 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5357 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5359 push @OUT, "return <<'END';\n";
5362 if ($range_list{$addr}->is_empty) {
5364 # This is a kludge for empty tables to silence a warning in
5365 # utf8.c, which can't really deal with empty tables, but it can
5366 # deal with a table that matches nothing, as the inverse of 'Any'
5368 push @OUT, "!utf8::Any\n";
5370 elsif ($self->name eq 'N'
5372 # To save disk space and table cache space, avoid putting out
5373 # binary N tables, but instead create a file which just inverts
5374 # the Y table. Since the file will still exist and occupy a
5375 # certain number of blocks, might as well output the whole
5376 # thing if it all will fit in one block. The number of
5377 # ranges below is an approximate number for that.
5378 && ($self->property->type == $BINARY
5379 || $self->property->type == $FORCED_BINARY)
5380 # && $self->property->tables == 2 Can't do this because the
5381 # non-binary properties, like NFDQC aren't specifiable
5383 && $range_list{$addr}->ranges > 15
5384 && ! $annotate) # Under --annotate, want to see everything
5386 push @OUT, "!utf8::" . $self->property->name . "\n";
5389 my $range_size_1 = $range_size_1{$addr};
5390 my $format; # Used only in $annotate option
5391 my $include_name; # Used only in $annotate option
5395 # If annotating each code point, must print 1 per line.
5396 # The variable could point to a subroutine, and we don't want
5397 # to lose that fact, so only set if not set already
5398 $range_size_1 = 1 if ! $range_size_1;
5400 $format = $self->format;
5402 # The name of the character is output only for tables that
5403 # don't already include the name in the output.
5404 my $property = $self->property;
5406 ! ($property == $perl_charname
5407 || $property == main::property_ref('Unicode_1_Name')
5408 || $property == main::property_ref('Name')
5409 || $property == main::property_ref('Name_Alias')
5413 # Values for previous time through the loop. Initialize to
5414 # something that won't be adjacent to the first iteration;
5415 # only $previous_end matters for that.
5417 my $previous_end = -2;
5420 # Values for next time through the portion of the loop that splits
5421 # the range. 0 in $next_start means there is no remaining portion
5428 # Output each range as part of the here document.
5430 for my $set ($range_list{$addr}->ranges) {
5431 if ($set->type != 0) {
5432 $self->handle_special_range($set);
5435 my $start = $set->start;
5436 my $end = $set->end;
5437 my $value = $set->value;
5439 # Don't output ranges whose value is the one to suppress
5440 next RANGE if defined $suppress_value
5441 && $value eq $suppress_value;
5443 { # This bare block encloses the scope where we may need to
5444 # split a range (when outputting adjusteds), and each time
5445 # through we handle the next portion of the original by
5446 # ending the block with a 'redo'. The values to use for
5447 # that next time through are set up just below in the
5448 # scalars whose names begin with '$next_'.
5450 if ($use_adjustments) {
5452 # When converting to use adjustments, we can handle
5453 # only single element ranges. Set up so that this
5454 # time through the loop, we look at the first element,
5455 # and the next time through, we start off with the
5456 # remainder. Thus each time through we look at the
5457 # first element of the range
5458 if ($end != $start) {
5459 $next_start = $start + 1;
5461 $next_value = $value;
5465 # The values for some of these tables are stored as
5466 # hex strings. Convert those to decimal
5467 $value = hex($value)
5468 if $self->default_map eq $CODE_POINT
5469 && $value =~ / ^ [A-Fa-f0-9]+ $ /x;
5471 # If this range is adjacent to the previous one, and
5472 # the values in each are integers that are also
5473 # adjacent (differ by 1), then this range really
5474 # extends the previous one that is already in element
5475 # $OUT[-1]. So we pop that element, and pretend that
5476 # the range starts with whatever it started with.
5477 # $offset is incremented by 1 each time so that it
5478 # gives the current offset from the first element in
5479 # the accumulating range, and we keep in $value the
5480 # value of that first element.
5481 if ($start == $previous_end + 1
5482 && $value =~ /^ -? \d+ $/xa
5483 && $previous_value =~ /^ -? \d+ $/xa
5484 && ($value == ($previous_value + ++$offset)))
5487 $start = $previous_start;
5488 $value = $previous_value;
5494 # Save the current values for the next time through
5496 $previous_start = $start;
5497 $previous_end = $end;
5498 $previous_value = $value;
5501 # If there is a range and doesn't need a single point range
5503 if ($start != $end && ! $range_size_1) {
5504 push @OUT, sprintf "%04X\t%04X", $start, $end;
5505 $OUT[-1] .= "\t$value" if $value ne "";
5507 # Add a comment with the size of the range, if
5508 # requested. Expand Tabs to make sure they all start
5509 # in the same column, and then unexpand to use mostly
5511 if (! $output_range_counts{$addr}) {
5515 $OUT[-1] = Text::Tabs::expand($OUT[-1]);
5516 my $count = main::clarify_number($end - $start + 1);
5519 my $width = $tab_stops * 8 - 1;
5520 $OUT[-1] = sprintf("%-*s # [%s]\n",
5524 $OUT[-1] = Text::Tabs::unexpand($OUT[-1]);
5528 # Here to output a single code point per line.
5529 # If not to annotate, use the simple formats
5530 elsif (! $annotate) {
5532 # Use any passed in subroutine to output.
5533 if (ref $range_size_1 eq 'CODE') {
5534 for my $i ($start .. $end) {
5535 push @OUT, &{$range_size_1}($i, $value);
5540 # Here, caller is ok with default output.
5541 for (my $i = $start; $i <= $end; $i++) {
5542 push @OUT, sprintf "%04X\t\t%s\n", $i, $value;
5548 # Here, wants annotation.
5549 for (my $i = $start; $i <= $end; $i++) {
5551 # Get character information if don't have it already
5552 main::populate_char_info($i)
5553 if ! defined $viacode[$i];
5554 my $type = $annotate_char_type[$i];
5556 # Figure out if should output the next code points
5557 # as part of a range or not. If this is not in an
5558 # annotation range, then won't output as a range,
5559 # so returns $i. Otherwise use the end of the
5560 # annotation range, but no further than the
5561 # maximum possible end point of the loop.
5562 my $range_end = main::min(
5563 $annotate_ranges->value_of($i) || $i,
5566 # Use a range if it is a range, and either is one
5567 # of the special annotation ranges, or the range
5568 # is at most 3 long. This last case causes the
5569 # algorithmically named code points to be output
5570 # individually in spans of at most 3, as they are
5571 # the ones whose $type is > 0.
5572 if ($range_end != $i
5573 && ( $type < 0 || $range_end - $i > 2))
5575 # Here is to output a range. We don't allow a
5576 # caller-specified output format--just use the
5578 push @OUT, sprintf "%04X\t%04X\t%s\t#", $i,
5581 my $range_name = $viacode[$i];
5583 # For the code points which end in their hex
5584 # value, we eliminate that from the output
5585 # annotation, and capitalize only the first
5586 # letter of each word.
5587 if ($type == $CP_IN_NAME) {
5588 my $hex = sprintf "%04X", $i;
5589 $range_name =~ s/-$hex$//;
5590 my @words = split " ", $range_name;
5591 for my $word (@words) {
5593 ucfirst(lc($word)) if $word ne 'CJK';
5595 $range_name = join " ", @words;
5597 elsif ($type == $HANGUL_SYLLABLE) {
5598 $range_name = "Hangul Syllable";
5601 $OUT[-1] .= " $range_name" if $range_name;
5603 # Include the number of code points in the
5606 main::clarify_number($range_end - $i + 1);
5607 $OUT[-1] .= " [$count]\n";
5609 # Skip to the end of the range
5612 else { # Not in a range.
5615 # When outputting the names of each character,
5616 # use the character itself if printable
5617 $comment .= "'" . chr($i) . "' "
5620 # To make it more readable, use a minimum
5624 # Determine the annotation
5625 if ($format eq $DECOMP_STRING_FORMAT) {
5627 # This is very specialized, with the type
5628 # of decomposition beginning the line
5629 # enclosed in <...>, and the code points
5630 # that the code point decomposes to
5631 # separated by blanks. Create two
5632 # strings, one of the printable
5633 # characters, and one of their official
5635 (my $map = $value) =~ s/ \ * < .*? > \ +//x;
5639 foreach my $to (split " ", $map) {
5640 $to = CORE::hex $to;
5641 $to_name .= " + " if $to_name;
5642 $to_chr .= chr($to);
5643 main::populate_char_info($to)
5644 if ! defined $viacode[$to];
5645 $to_name .= $viacode[$to];
5649 "=> '$to_chr'; $viacode[$i] => $to_name";
5650 $comment_indent = 25; # Determined by
5655 # Assume that any table that has hex
5656 # format is a mapping of one code point to
5658 if ($format eq $HEX_FORMAT) {
5659 my $decimal_value = CORE::hex $value;
5660 main::populate_char_info($decimal_value)
5661 if ! defined $viacode[$decimal_value];
5663 . chr($decimal_value)
5664 . "'; " if $printable[$decimal_value];
5666 $comment .= $viacode[$i] if $include_name
5668 if ($format eq $HEX_FORMAT) {
5669 my $decimal_value = CORE::hex $value;
5671 " => $viacode[$decimal_value]"
5672 if $viacode[$decimal_value];
5675 # If including the name, no need to
5676 # indent, as the name will already be way
5678 $comment_indent = ($include_name) ? 0 : 60;
5681 # Use any passed in routine to output the base
5683 if (ref $range_size_1 eq 'CODE') {
5684 my $base_part=&{$range_size_1}($i, $value);
5686 push @OUT, $base_part;
5689 push @OUT, sprintf "%04X\t\t%s", $i, $value;
5692 # And add the annotation.
5693 $OUT[-1] = sprintf "%-*s\t# %s",
5703 # If we split the range, set up so the next time through
5704 # we get the remainder, and redo.
5706 $start = $next_start;
5708 $value = $next_value;
5713 } # End of loop through all the table's ranges
5716 # Add anything that goes after the main body, but within the here
5718 my $append_to_body = $self->append_to_body;
5719 push @OUT, $append_to_body if $append_to_body;
5721 # And finish the here document.
5724 # Done with the main portion of the body. Can now figure out what
5725 # should appear before it in the file.
5726 my $pre_body = $self->pre_body;
5727 push @HEADER, $pre_body, "\n" if $pre_body;
5729 # All these files should have a .pl suffix added to them.
5730 my @file_with_pl = @{$file_path{$addr}};
5731 $file_with_pl[-1] .= '.pl';
5733 main::write(\@file_with_pl,
5734 $annotate, # utf8 iff annotating
5740 sub set_status { # Set the table's status
5742 my $status = shift; # The status enum value
5743 my $info = shift; # Any message associated with it.
5744 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5746 my $addr = do { no overloading; pack 'J', $self; };
5748 $status{$addr} = $status;
5749 $status_info{$addr} = $info;
5753 sub set_fate { # Set the fate of a table
5757 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5759 my $addr = do { no overloading; pack 'J', $self; };
5761 return if $fate{$addr} == $fate; # If no-op
5763 # Can only change the ordinary fate, except if going to $MAP_PROXIED
5764 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
5766 $fate{$addr} = $fate;
5768 # Don't document anything to do with a non-normal fated table
5769 if ($fate != $ORDINARY) {
5770 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
5771 foreach my $alias ($self->aliases) {
5772 $alias->set_ucd($put_in_pod);
5774 # MAP_PROXIED doesn't affect the match tables
5775 next if $fate == $MAP_PROXIED;
5776 $alias->set_make_re_pod_entry($put_in_pod);
5780 # Save the reason for suppression for output
5781 if ($fate == $SUPPRESSED && defined $reason) {
5782 $why_suppressed{$complete_name{$addr}} = $reason;
5789 # Don't allow changes to the table from now on. This stores a stack
5790 # trace of where it was called, so that later attempts to modify it
5791 # can immediately show where it got locked.
5794 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5796 my $addr = do { no overloading; pack 'J', $self; };
5798 $locked{$addr} = "";
5800 my $line = (caller(0))[2];
5803 # Accumulate the stack trace
5805 my ($pkg, $file, $caller_line, $caller) = caller $i++;
5807 last unless defined $caller;
5809 $locked{$addr} .= " called from $caller() at line $line\n";
5810 $line = $caller_line;
5812 $locked{$addr} .= " called from main at line $line\n";
5817 sub carp_if_locked {
5818 # Return whether a table is locked or not, and, by the way, complain
5822 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5824 my $addr = do { no overloading; pack 'J', $self; };
5826 return 0 if ! $locked{$addr};
5827 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
5831 sub set_file_path { # Set the final directory path for this table
5833 # Rest of parameters passed on
5836 @{$file_path{pack 'J', $self}} = @_;
5840 # Accessors for the range list stored in this table. First for
5849 matches_identically_to
5862 return $self->_range_list->$sub(@_);
5866 # Then for ones that should fail if locked
5876 return if $self->carp_if_locked;
5878 return $self->_range_list->$sub(@_);
5885 use base '_Base_Table';
5887 # A Map Table is a table that contains the mappings from code points to
5888 # values. There are two weird cases:
5889 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
5890 # are written in the table's file at the end of the table nonetheless. It
5891 # requires specially constructed code to handle these; utf8.c can not read
5892 # these in, so they should not go in $map_directory. As of this writing,
5893 # the only case that these happen is for named sequences used in
5894 # charnames.pm. But this code doesn't enforce any syntax on these, so
5895 # something else could come along that uses it.
5896 # 2) Specials are anything that doesn't fit syntactically into the body of the
5897 # table. The ranges for these have a map type of non-zero. The code below
5898 # knows about and handles each possible type. In most cases, these are
5899 # written as part of the header.
5901 # A map table deliberately can't be manipulated at will unlike match tables.
5902 # This is because of the ambiguities having to do with what to do with
5903 # overlapping code points. And there just isn't a need for those things;
5904 # what one wants to do is just query, add, replace, or delete mappings, plus
5905 # write the final result.
5906 # However, there is a method to get the list of possible ranges that aren't in
5907 # this table to use for defaulting missing code point mappings. And,
5908 # map_add_or_replace_non_nulls() does allow one to add another table to this
5909 # one, but it is clearly very specialized, and defined that the other's
5910 # non-null values replace this one's if there is any overlap.
5912 sub trace { return main::trace(@_); }
5916 main::setup_package();
5919 # Many input files omit some entries; this gives what the mapping for the
5920 # missing entries should be
5921 main::set_access('default_map', \%default_map, 'r');
5923 my %anomalous_entries;
5924 # Things that go in the body of the table which don't fit the normal
5925 # scheme of things, like having a range. Not much can be done with these
5926 # once there except to output them. This was created to handle named
5928 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
5929 main::set_access('anomalous_entries', # Append singular, read plural
5930 \%anomalous_entries,
5934 # Enum as to whether or not to write out this map table, and how:
5936 # $EXTERNAL_MAP means its existence is noted in the documentation, and
5937 # it should not be removed nor its format changed. This
5938 # is done for those files that have traditionally been
5940 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
5942 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
5943 # outputting the actual mappings as-is, we adjust things
5944 # to create a much more compact table. Only those few
5945 # tables where the mapping is convertible at least to an
5946 # integer and compacting makes a big difference should
5947 # have this. Hence, the default is to not do this
5948 # unless the table's default mapping is to $CODE_POINT,
5949 # and the range size is not 1.
5950 main::set_access('to_output_map', \%to_output_map, 's');
5958 # Optional initialization data for the table.
5959 my $initialize = delete $args{'Initialize'};
5961 my $default_map = delete $args{'Default_Map'};
5962 my $property = delete $args{'_Property'};
5963 my $full_name = delete $args{'Full_Name'};
5964 my $to_output_map = delete $args{'To_Output_Map'};
5966 # Rest of parameters passed on
5968 my $range_list = Range_Map->new(Owner => $property);
5970 my $self = $class->SUPER::new(
5972 Complete_Name => $full_name,
5973 Full_Name => $full_name,
5974 _Property => $property,
5975 _Range_List => $range_list,
5978 my $addr = do { no overloading; pack 'J', $self; };
5980 $anomalous_entries{$addr} = [];
5981 $default_map{$addr} = $default_map;
5982 $to_output_map{$addr} = $to_output_map;
5984 $self->initialize($initialize) if defined $initialize;
5991 qw("") => "_operator_stringify",
5994 sub _operator_stringify {
5997 my $name = $self->property->full_name;
5998 $name = '""' if $name eq "";
5999 return "Map table for Property '$name'";
6003 # Add a synonym for this table (which means the property itself)
6006 # Rest of parameters passed on.
6008 $self->SUPER::add_alias($name, $self->property, @_);
6013 # Add a range of code points to the list of specially-handled code
6014 # points. $MULTI_CP is assumed if the type of special is not passed
6023 my $type = delete $args{'Type'} || 0;
6024 # Rest of parameters passed on
6026 # Can't change the table if locked.
6027 return if $self->carp_if_locked;
6029 my $addr = do { no overloading; pack 'J', $self; };
6031 $self->_range_list->add_map($lower, $upper,
6038 sub append_to_body {
6039 # Adds to the written HERE document of the table's body any anomalous
6040 # entries in the table..
6043 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6045 my $addr = do { no overloading; pack 'J', $self; };
6047 return "" unless @{$anomalous_entries{$addr}};
6048 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6051 sub map_add_or_replace_non_nulls {
6052 # This adds the mappings in the table $other to $self. Non-null
6053 # mappings from $other override those in $self. It essentially merges
6054 # the two tables, with the second having priority except for null
6059 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6061 return if $self->carp_if_locked;
6063 if (! $other->isa(__PACKAGE__)) {
6064 Carp::my_carp_bug("$other should be a "
6072 my $addr = do { no overloading; pack 'J', $self; };
6073 my $other_addr = do { no overloading; pack 'J', $other; };
6075 local $to_trace = 0 if main::DEBUG;
6077 my $self_range_list = $self->_range_list;
6078 my $other_range_list = $other->_range_list;
6079 foreach my $range ($other_range_list->ranges) {
6080 my $value = $range->value;
6081 next if $value eq "";
6082 $self_range_list->_add_delete('+',
6086 Type => $range->type,
6087 Replace => $UNCONDITIONALLY);
6093 sub set_default_map {
6094 # Define what code points that are missing from the input files should
6099 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6101 my $addr = do { no overloading; pack 'J', $self; };
6103 # Convert the input to the standard equivalent, if any (won't have any
6104 # for $STRING properties)
6105 my $standard = $self->_find_table_from_alias->{$map};
6106 $map = $standard->name if defined $standard;
6108 # Warn if there already is a non-equivalent default map for this
6109 # property. Note that a default map can be a ref, which means that
6110 # what it actually means is delayed until later in the program, and it
6111 # IS permissible to override it here without a message.
6112 my $default_map = $default_map{$addr};
6113 if (defined $default_map
6114 && ! ref($default_map)
6115 && $default_map ne $map
6116 && main::Standardize($map) ne $default_map)
6118 my $property = $self->property;
6119 my $map_table = $property->table($map);
6120 my $default_table = $property->table($default_map);
6121 if (defined $map_table
6122 && defined $default_table
6123 && $map_table != $default_table)
6125 Carp::my_carp("Changing the default mapping for "
6127 . " from $default_map to $map'");
6131 $default_map{$addr} = $map;
6133 # Don't also create any missing table for this map at this point,
6134 # because if we did, it could get done before the main table add is
6135 # done for PropValueAliases.txt; instead the caller will have to make
6136 # sure it exists, if desired.
6141 # Returns boolean: should we write this map table?
6144 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6146 my $addr = do { no overloading; pack 'J', $self; };
6148 # If overridden, use that
6149 return $to_output_map{$addr} if defined $to_output_map{$addr};
6151 my $full_name = $self->full_name;
6152 return $global_to_output_map{$full_name}
6153 if defined $global_to_output_map{$full_name};
6155 # If table says to output, do so; if says to suppress it, do so.
6156 my $fate = $self->fate;
6157 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6158 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6159 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6161 my $type = $self->property->type;
6163 # Don't want to output binary map tables even for debugging.
6164 return 0 if $type == $BINARY;
6166 # But do want to output string ones. All the ones that remain to
6167 # be dealt with (i.e. which haven't explicitly been set to external)
6168 # are for internal Perl use only. The default for those that map to
6169 # $CODE_POINT and haven't been restricted to a single element range
6170 # is to use the adjusted form.
6171 if ($type == $STRING) {
6172 return $INTERNAL_MAP if $self->range_size_1
6173 || $default_map{$addr} ne $CODE_POINT;
6174 return $OUTPUT_ADJUSTED;
6177 # Otherwise is an $ENUM, do output it, for Perl's purposes
6178 return $INTERNAL_MAP;
6182 # Returns a Range_List that is gaps of the current table. That is,
6186 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6188 my $current = Range_List->new(Initialize => $self->_range_list,
6189 Owner => $self->property);
6195 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6197 my $return = $self->SUPER::header();
6199 if ($self->to_output_map >= $INTERNAL_MAP) {
6200 $return .= $INTERNAL_ONLY_HEADER;
6203 my $property_name = $self->property->full_name =~ s/Legacy_//r;
6206 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
6208 # This file is for internal use by core Perl only. It is retained for
6209 # backwards compatibility with applications that may have come to rely on it,
6210 # but its format and even its name or existence are subject to change without
6211 # notice in a future Perl version. Don't use it directly. Instead, its
6212 # contents are now retrievable through a stable API in the Unicode::UCD
6213 # module: Unicode::UCD::prop_invmap('$property_name').
6219 sub set_final_comment {
6220 # Just before output, create the comment that heads the file
6221 # containing this table.
6223 return unless $debugging_build;
6226 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6228 # No sense generating a comment if aren't going to write it out.
6229 return if ! $self->to_output_map;
6231 my $addr = do { no overloading; pack 'J', $self; };
6233 my $property = $self->property;
6235 # Get all the possible names for this property. Don't use any that
6236 # aren't ok for use in a file name, etc. This is perhaps causing that
6237 # flag to do double duty, and may have to be changed in the future to
6238 # have our own flag for just this purpose; but it works now to exclude
6239 # Perl generated synonyms from the lists for properties, where the
6240 # name is always the proper Unicode one.
6241 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6243 my $count = $self->count;
6244 my $default_map = $default_map{$addr};
6246 # The ranges that map to the default aren't output, so subtract that
6247 # to get those actually output. A property with matching tables
6248 # already has the information calculated.
6249 if ($property->type != $STRING) {
6250 $count -= $property->table($default_map)->count;
6252 elsif (defined $default_map) {
6254 # But for $STRING properties, must calculate now. Subtract the
6255 # count from each range that maps to the default.
6256 foreach my $range ($self->_range_list->ranges) {
6257 if ($range->value eq $default_map) {
6258 $count -= $range->end +1 - $range->start;
6264 # Get a string version of $count with underscores in large numbers,
6266 my $string_count = main::clarify_number($count);
6268 my $code_points = ($count == 1)
6269 ? 'single code point'
6270 : "$string_count code points";
6275 if (@property_aliases <= 1) {
6276 $mapping = 'mapping';
6277 $these_mappings = 'this mapping';
6281 $mapping = 'synonymous mappings';
6282 $these_mappings = 'these mappings';
6286 if ($count >= $MAX_UNICODE_CODEPOINTS) {
6287 $cp = "any code point in Unicode Version $string_version";
6291 if ($default_map eq "") {
6292 $map_to = 'the null string';
6294 elsif ($default_map eq $CODE_POINT) {
6298 $map_to = "'$default_map'";
6301 $cp = "the single code point";
6304 $cp = "one of the $code_points";
6306 $cp .= " in Unicode Version $string_version for which the mapping is not to $map_to";
6311 my $status = $self->status;
6312 if ($status && $status ne $PLACEHOLDER) {
6313 my $warn = uc $status_past_participles{$status};
6316 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
6317 All property or property=value combinations contained in this file are $warn.
6318 See $unicode_reference_url for what this means.
6322 $comment .= "This file returns the $mapping:\n";
6324 my $ucd_accessible_name = "";
6325 my $full_name = $self->property->full_name;
6326 for my $i (0 .. @property_aliases - 1) {
6327 my $name = $property_aliases[$i]->name;
6328 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
6329 if ($property_aliases[$i]->ucd) {
6330 if ($name eq $full_name) {
6331 $ucd_accessible_name = $full_name;
6333 elsif (! $ucd_accessible_name) {
6334 $ucd_accessible_name = $name;
6338 $comment .= "\nwhere 'cp' is $cp.";
6339 if ($ucd_accessible_name) {
6340 $comment .= " Note that $these_mappings $are accessible via the function prop_invmap('$full_name') in Unicode::UCD";
6343 # And append any commentary already set from the actual property.
6344 $comment .= "\n\n" . $self->comment if $self->comment;
6345 if ($self->description) {
6346 $comment .= "\n\n" . join " ", $self->description;
6349 $comment .= "\n\n" . join " ", $self->note;
6353 if (! $self->perl_extension) {
6356 For information about what this property really means, see:
6357 $unicode_reference_url
6361 if ($count) { # Format differs for empty table
6362 $comment.= "\nThe format of the ";
6363 if ($self->range_size_1) {
6365 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
6366 is in hex; MAPPING is what CODE_POINT maps to.
6371 # There are tables which end up only having one element per
6372 # range, but it is not worth keeping track of for making just
6373 # this comment a little better.
6375 non-comment portions of the main body of lines of this file is:
6376 START\\tSTOP\\tMAPPING where START is the starting code point of the
6377 range, in hex; STOP is the ending point, or if omitted, the range has just one
6378 code point; MAPPING is what each code point between START and STOP maps to.
6380 if ($self->output_range_counts) {
6382 Numbers in comments in [brackets] indicate how many code points are in the
6383 range (omitted when the range is a single code point or if the mapping is to
6389 $self->set_comment(main::join_lines($comment));
6393 my %swash_keys; # Makes sure don't duplicate swash names.
6395 # The remaining variables are temporaries used while writing each table,
6396 # to output special ranges.
6397 my @multi_code_point_maps; # Map is to more than one code point.
6399 sub handle_special_range {
6400 # Called in the middle of write when it finds a range it doesn't know
6405 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6407 my $addr = do { no overloading; pack 'J', $self; };
6409 my $type = $range->type;
6411 my $low = $range->start;
6412 my $high = $range->end;
6413 my $map = $range->value;
6415 # No need to output the range if it maps to the default.
6416 return if $map eq $default_map{$addr};
6418 my $property = $self->property;
6420 # Switch based on the map type...
6421 if ($type == $HANGUL_SYLLABLE) {
6423 # These are entirely algorithmically determinable based on
6424 # some constants furnished by Unicode; for now, just set a
6425 # flag to indicate that have them. After everything is figured
6426 # out, we will output the code that does the algorithm. (Don't
6427 # output them if not needed because we are suppressing this
6429 $has_hangul_syllables = 1 if $property->to_output_map;
6431 elsif ($type == $CP_IN_NAME) {
6433 # Code points whose name ends in their code point are also
6434 # algorithmically determinable, but need information about the map
6435 # to do so. Both the map and its inverse are stored in data
6436 # structures output in the file. They are stored in the mean time
6437 # in global lists The lists will be written out later into Name.pm,
6438 # which is created only if needed. In order to prevent duplicates
6439 # in the list, only add to them for one property, should multiple
6441 if ($needing_code_points_ending_in_code_point == 0) {
6442 $needing_code_points_ending_in_code_point = $property;
6444 if ($property == $needing_code_points_ending_in_code_point) {
6445 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
6446 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
6448 my $squeezed = $map =~ s/[-\s]+//gr;
6449 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
6451 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
6454 push @code_points_ending_in_code_point, { low => $low,
6460 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
6462 # Multi-code point maps and null string maps have an entry
6463 # for each code point in the range. They use the same
6465 for my $code_point ($low .. $high) {
6467 # The pack() below can't cope with surrogates. XXX This may
6469 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
6470 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
6474 # Generate the hash entries for these in the form that
6475 # utf8.c understands.
6479 foreach my $to (split " ", $map) {
6480 if ($to !~ /^$code_point_re$/) {
6481 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
6484 $tostr .= sprintf "\\x{%s}", $to;
6485 $to = CORE::hex $to;
6487 $to_name .= " + " if $to_name;
6488 $to_chr .= chr($to);
6489 main::populate_char_info($to)
6490 if ! defined $viacode[$to];
6491 $to_name .= $viacode[$to];
6495 # I (khw) have never waded through this line to
6496 # understand it well enough to comment it.
6497 my $utf8 = sprintf(qq["%s" => "$tostr",],
6498 join("", map { sprintf "\\x%02X", $_ }
6499 unpack("U0C*", pack("U", $code_point))));
6501 # Add a comment so that a human reader can more easily
6502 # see what's going on.
6503 push @multi_code_point_maps,
6504 sprintf("%-45s # U+%04X", $utf8, $code_point);
6506 $multi_code_point_maps[-1] .= " => $map";
6509 main::populate_char_info($code_point)
6510 if ! defined $viacode[$code_point];
6511 $multi_code_point_maps[-1] .= " '"
6513 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
6518 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
6525 # Returns the string that should be output in the file before the main
6526 # body of this table. It isn't called until the main body is
6527 # calculated, saving a pass. The string includes some hash entries
6528 # identifying the format of the body, and what the single value should
6529 # be for all ranges missing from it. It also includes any code points
6530 # which have map_types that don't go in the main table.
6533 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6535 my $addr = do { no overloading; pack 'J', $self; };
6537 my $name = $self->property->swash_name;
6539 # Currently there is nothing in the pre_body unless a swash is being
6541 return unless defined $name;
6543 if (defined $swash_keys{$name}) {
6544 Carp::my_carp(main::join_lines(<<END
6545 Already created a swash name '$name' for $swash_keys{$name}. This means that
6546 the same name desired for $self shouldn't be used. Bad News. This must be
6547 fixed before production use, but proceeding anyway
6551 $swash_keys{$name} = "$self";
6555 # Here we assume we were called after have gone through the whole
6556 # file. If we actually generated anything for each map type, add its
6557 # respective header and trailer
6558 my $specials_name = "";
6559 if (@multi_code_point_maps) {
6560 $specials_name = "utf8::ToSpec$name";
6563 # Some code points require special handling because their mappings are each to
6564 # multiple code points. These do not appear in the main body, but are defined
6565 # in the hash below.
6567 # Each key is the string of N bytes that together make up the UTF-8 encoding
6568 # for the code point. (i.e. the same as looking at the code point's UTF-8
6569 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
6570 \%$specials_name = (
6572 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
6575 my $format = $self->format;
6579 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6580 if ($output_adjusted) {
6581 if ($specials_name) {
6583 # The mappings in the non-hash portion of this file must be modified to get the
6584 # correct values by adding the code point ordinal number to each one that is
6590 # The mappings must be modified to get the correct values by adding the code
6591 # point ordinal number to each one that is numeric.
6598 # The name this swash is to be known by, with the format of the mappings in
6599 # the main body of the table, and what all code points missing from this file
6601 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
6603 if ($specials_name) {
6605 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
6608 my $default_map = $default_map{$addr};
6610 # For $CODE_POINT default maps and using adjustments, instead the default
6612 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
6613 . (($output_adjusted && $default_map eq $CODE_POINT)
6618 if ($default_map eq $CODE_POINT) {
6619 $return .= ' # code point maps to itself';
6621 elsif ($default_map eq "") {
6622 $return .= ' # code point maps to the null string';
6626 $return .= $pre_body;
6632 # Write the table to the file.
6635 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6637 my $addr = do { no overloading; pack 'J', $self; };
6639 # Clear the temporaries
6640 undef @multi_code_point_maps;
6642 # Calculate the format of the table if not already done.
6643 my $format = $self->format;
6644 my $type = $self->property->type;
6645 my $default_map = $self->default_map;
6646 if (! defined $format) {
6647 if ($type == $BINARY) {
6649 # Don't bother checking the values, because we elsewhere
6650 # verify that a binary table has only 2 values.
6651 $format = $BINARY_FORMAT;
6654 my @ranges = $self->_range_list->ranges;
6656 # default an empty table based on its type and default map
6659 # But it turns out that the only one we can say is a
6660 # non-string (besides binary, handled above) is when the
6661 # table is a string and the default map is to a code point
6662 if ($type == $STRING && $default_map eq $CODE_POINT) {
6663 $format = $HEX_FORMAT;
6666 $format = $STRING_FORMAT;
6671 # Start with the most restrictive format, and as we find
6672 # something that doesn't fit with that, change to the next
6673 # most restrictive, and so on.
6674 $format = $DECIMAL_FORMAT;
6675 foreach my $range (@ranges) {
6676 next if $range->type != 0; # Non-normal ranges don't
6677 # affect the main body
6678 my $map = $range->value;
6679 if ($map ne $default_map) {
6680 last if $format eq $STRING_FORMAT; # already at
6683 $format = $INTEGER_FORMAT
6684 if $format eq $DECIMAL_FORMAT
6685 && $map !~ / ^ [0-9] $ /x;
6686 $format = $FLOAT_FORMAT
6687 if $format eq $INTEGER_FORMAT
6688 && $map !~ / ^ -? [0-9]+ $ /x;
6689 $format = $RATIONAL_FORMAT
6690 if $format eq $FLOAT_FORMAT
6691 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
6692 $format = $HEX_FORMAT
6693 if ($format eq $RATIONAL_FORMAT
6695 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
6696 # Assume a leading zero means hex,
6697 # even if all digits are 0-9
6698 || ($format eq $INTEGER_FORMAT
6699 && $map =~ /^0[0-9A-F]/);
6700 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
6701 && $map =~ /[^0-9A-F]/;
6706 } # end of calculating format
6708 if ($default_map eq $CODE_POINT
6709 && $format ne $HEX_FORMAT
6710 && ! defined $self->format) # manual settings are always
6713 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
6716 # If the output is to be adjusted, the format of the table that gets
6717 # output is actually 'a' instead of whatever it is stored internally
6719 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
6720 if ($output_adjusted) {
6721 $format = $ADJUST_FORMAT;
6724 $self->_set_format($format);
6726 return $self->SUPER::write(
6728 ($self->property == $block)
6729 ? 7 # block file needs more tab stops
6731 $default_map); # don't write defaulteds
6734 # Accessors for the underlying list that should fail if locked.
6744 return if $self->carp_if_locked;
6745 return $self->_range_list->$sub(@_);
6748 } # End closure for Map_Table
6750 package Match_Table;
6751 use base '_Base_Table';
6753 # A Match table is one which is a list of all the code points that have
6754 # the same property and property value, for use in \p{property=value}
6755 # constructs in regular expressions. It adds very little data to the base
6756 # structure, but many methods, as these lists can be combined in many ways to
6758 # There are only a few concepts added:
6759 # 1) Equivalents and Relatedness.
6760 # Two tables can match the identical code points, but have different names.
6761 # This always happens when there is a perl single form extension
6762 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
6763 # tables are set to be related, with the Perl extension being a child, and
6764 # the Unicode property being the parent.
6766 # It may be that two tables match the identical code points and we don't
6767 # know if they are related or not. This happens most frequently when the
6768 # Block and Script properties have the exact range. But note that a
6769 # revision to Unicode could add new code points to the script, which would
6770 # now have to be in a different block (as the block was filled, or there
6771 # would have been 'Unknown' script code points in it and they wouldn't have
6772 # been identical). So we can't rely on any two properties from Unicode
6773 # always matching the same code points from release to release, and thus
6774 # these tables are considered coincidentally equivalent--not related. When
6775 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
6776 # 'leader', and the others are 'equivalents'. This concept is useful
6777 # to minimize the number of tables written out. Only one file is used for
6778 # any identical set of code points, with entries in Heavy.pl mapping all
6779 # the involved tables to it.
6781 # Related tables will always be identical; we set them up to be so. Thus
6782 # if the Unicode one is deprecated, the Perl one will be too. Not so for
6783 # unrelated tables. Relatedness makes generating the documentation easier.
6786 # Like equivalents, two tables may be the inverses of each other, the
6787 # intersection between them is null, and the union is every Unicode code
6788 # point. The two tables that occupy a binary property are necessarily like
6789 # this. By specifying one table as the complement of another, we can avoid
6790 # storing it on disk (using the other table and performing a fast
6791 # transform), and some memory and calculations.
6793 # 3) Conflicting. It may be that there will eventually be name clashes, with
6794 # the same name meaning different things. For a while, there actually were
6795 # conflicts, but they have so far been resolved by changing Perl's or
6796 # Unicode's definitions to match the other, but when this code was written,
6797 # it wasn't clear that that was what was going to happen. (Unicode changed
6798 # because of protests during their beta period.) Name clashes are warned
6799 # about during compilation, and the documentation. The generated tables
6800 # are sane, free of name clashes, because the code suppresses the Perl
6801 # version. But manual intervention to decide what the actual behavior
6802 # should be may be required should this happen. The introductory comments
6803 # have more to say about this.
6805 sub standardize { return main::standardize($_[0]); }
6806 sub trace { return main::trace(@_); }
6811 main::setup_package();
6814 # The leader table of this one; initially $self.
6815 main::set_access('leader', \%leader, 'r');
6818 # An array of any tables that have this one as their leader
6819 main::set_access('equivalents', \%equivalents, 'readable_array');
6822 # The parent table to this one, initially $self. This allows us to
6823 # distinguish between equivalent tables that are related (for which this
6824 # is set to), and those which may not be, but share the same output file
6825 # because they match the exact same set of code points in the current
6827 main::set_access('parent', \%parent, 'r');
6830 # An array of any tables that have this one as their parent
6831 main::set_access('children', \%children, 'readable_array');
6834 # Array of any tables that would have the same name as this one with
6835 # a different meaning. This is used for the generated documentation.
6836 main::set_access('conflicting', \%conflicting, 'readable_array');
6839 # Set in the constructor for tables that are expected to match all code
6841 main::set_access('matches_all', \%matches_all, 'r');
6844 # Points to the complement that this table is expressed in terms of; 0 if
6846 main::set_access('complement', \%complement, 'r');
6853 # The property for which this table is a listing of property values.
6854 my $property = delete $args{'_Property'};
6856 my $name = delete $args{'Name'};
6857 my $full_name = delete $args{'Full_Name'};
6858 $full_name = $name if ! defined $full_name;
6861 my $initialize = delete $args{'Initialize'};
6862 my $matches_all = delete $args{'Matches_All'} || 0;
6863 my $format = delete $args{'Format'};
6864 # Rest of parameters passed on.
6866 my $range_list = Range_List->new(Initialize => $initialize,
6867 Owner => $property);
6869 my $complete = $full_name;
6870 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
6871 # but this helps debug if it
6873 # The complete name for a match table includes it's property in a
6874 # compound form 'property=table', except if the property is the
6875 # pseudo-property, perl, in which case it is just the single form,
6876 # 'table' (If you change the '=' must also change the ':' in lots of
6877 # places in this program that assume an equal sign)
6878 $complete = $property->full_name . "=$complete" if $property != $perl;
6880 my $self = $class->SUPER::new(%args,
6882 Complete_Name => $complete,
6883 Full_Name => $full_name,
6884 _Property => $property,
6885 _Range_List => $range_list,
6886 Format => $EMPTY_FORMAT,
6888 my $addr = do { no overloading; pack 'J', $self; };
6890 $conflicting{$addr} = [ ];
6891 $equivalents{$addr} = [ ];
6892 $children{$addr} = [ ];
6893 $matches_all{$addr} = $matches_all;
6894 $leader{$addr} = $self;
6895 $parent{$addr} = $self;
6896 $complement{$addr} = 0;
6898 if (defined $format && $format ne $EMPTY_FORMAT) {
6899 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
6905 # See this program's beginning comment block about overloading these.
6908 qw("") => "_operator_stringify",
6912 return if $self->carp_if_locked;
6920 return $self->_range_list + $other;
6926 return $self->_range_list & $other;
6931 my $reversed = shift;
6934 Carp::my_carp_bug("Bad news. Can't cope with '"
6938 . "'. undef returned.");
6942 return if $self->carp_if_locked;
6944 my $addr = do { no overloading; pack 'J', $self; };
6948 # Change the range list of this table to be the
6950 $self->_set_range_list($self->_range_list
6953 else { # $other is just a simple value
6954 $self->add_range($other, $other);
6961 my $reversed = shift;
6964 Carp::my_carp_bug("Bad news. Can't cope with '"
6968 . "'. undef returned.");
6972 return if $self->carp_if_locked;
6973 $self->_set_range_list($self->_range_list & $other);
6976 '-' => sub { my $self = shift;
6978 my $reversed = shift;
6980 Carp::my_carp_bug("Bad news. Can't cope with '"
6984 . "'. undef returned.");
6988 return $self->_range_list - $other;
6990 '~' => sub { my $self = shift;
6991 return ~ $self->_range_list;
6995 sub _operator_stringify {
6998 my $name = $self->complete_name;
6999 return "Table '$name'";
7003 # Returns the range list associated with this table, which will be the
7004 # complement's if it has one.
7008 if (($complement = $self->complement) != 0) {
7009 return ~ $complement->_range_list;
7012 return $self->SUPER::_range_list;
7017 # Add a synonym for this table. See the comments in the base class
7021 # Rest of parameters passed on.
7023 $self->SUPER::add_alias($name, $self, @_);
7027 sub add_conflicting {
7028 # Add the name of some other object to the list of ones that name
7029 # clash with this match table.
7032 my $conflicting_name = shift; # The name of the conflicting object
7033 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
7034 my $conflicting_object = shift; # Optional, the conflicting object
7035 # itself. This is used to
7036 # disambiguate the text if the input
7037 # name is identical to any of the
7038 # aliases $self is known by.
7039 # Sometimes the conflicting object is
7040 # merely hypothetical, so this has to
7041 # be an optional parameter.
7042 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7044 my $addr = do { no overloading; pack 'J', $self; };
7046 # Check if the conflicting name is exactly the same as any existing
7047 # alias in this table (as long as there is a real object there to
7048 # disambiguate with).
7049 if (defined $conflicting_object) {
7050 foreach my $alias ($self->aliases) {
7051 if ($alias->name eq $conflicting_name) {
7053 # Here, there is an exact match. This results in
7054 # ambiguous comments, so disambiguate by changing the
7055 # conflicting name to its object's complete equivalent.
7056 $conflicting_name = $conflicting_object->complete_name;
7062 # Convert to the \p{...} final name
7063 $conflicting_name = "\\$p" . "{$conflicting_name}";
7066 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7068 push @{$conflicting{$addr}}, $conflicting_name;
7073 sub is_set_equivalent_to {
7074 # Return boolean of whether or not the other object is a table of this
7075 # type and has been marked equivalent to this one.
7079 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7081 return 0 if ! defined $other; # Can happen for incomplete early
7083 unless ($other->isa(__PACKAGE__)) {
7084 my $ref_other = ref $other;
7085 my $ref_self = ref $self;
7086 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
7090 # Two tables are equivalent if they have the same leader.
7092 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
7096 sub set_equivalent_to {
7097 # Set $self equivalent to the parameter table.
7098 # The required Related => 'x' parameter is a boolean indicating
7099 # whether these tables are related or not. If related, $other becomes
7100 # the 'parent' of $self; if unrelated it becomes the 'leader'
7102 # Related tables share all characteristics except names; equivalents
7103 # not quite so many.
7104 # If they are related, one must be a perl extension. This is because
7105 # we can't guarantee that Unicode won't change one or the other in a
7106 # later release even if they are identical now.
7112 my $related = delete $args{'Related'};
7114 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7116 return if ! defined $other; # Keep on going; happens in some early
7119 if (! defined $related) {
7120 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
7124 # If already are equivalent, no need to re-do it; if subroutine
7125 # returns null, it found an error, also do nothing
7126 my $are_equivalent = $self->is_set_equivalent_to($other);
7127 return if ! defined $are_equivalent || $are_equivalent;
7129 my $addr = do { no overloading; pack 'J', $self; };
7130 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7133 if ($current_leader->perl_extension) {
7134 if ($other->perl_extension) {
7135 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7138 } elsif ($self->property != $other->property # Depending on
7144 && ! $other->perl_extension)
7146 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
7151 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7152 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
7156 my $leader = do { no overloading; pack 'J', $current_leader; };
7157 my $other_addr = do { no overloading; pack 'J', $other; };
7159 # Any tables that are equivalent to or children of this table must now
7160 # instead be equivalent to or (children) to the new leader (parent),
7161 # still equivalent. The equivalency includes their matches_all info,
7162 # and for related tables, their fate and status.
7163 # All related tables are of necessity equivalent, but the converse
7164 # isn't necessarily true
7165 my $status = $other->status;
7166 my $status_info = $other->status_info;
7167 my $fate = $other->fate;
7168 my $matches_all = $matches_all{other_addr};
7169 my $caseless_equivalent = $other->caseless_equivalent;
7170 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7171 next if $table == $other;
7172 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7174 my $table_addr = do { no overloading; pack 'J', $table; };
7175 $leader{$table_addr} = $other;
7176 $matches_all{$table_addr} = $matches_all;
7177 $self->_set_range_list($other->_range_list);
7178 push @{$equivalents{$other_addr}}, $table;
7180 $parent{$table_addr} = $other;
7181 push @{$children{$other_addr}}, $table;
7182 $table->set_status($status, $status_info);
7184 # This reason currently doesn't get exposed outside; otherwise
7185 # would have to look up the parent's reason and use it instead.
7186 $table->set_fate($fate, "Parent's fate");
7188 $self->set_caseless_equivalent($caseless_equivalent);
7192 # Now that we've declared these to be equivalent, any changes to one
7193 # of the tables would invalidate that equivalency.
7199 sub set_complement {
7200 # Set $self to be the complement of the parameter table. $self is
7201 # locked, as what it contains should all come from the other table.
7207 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7209 if ($other->complement != 0) {
7210 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7213 my $addr = do { no overloading; pack 'J', $self; };
7214 $complement{$addr} = $other;
7219 sub add_range { # Add a range to the list for this table.
7221 # Rest of parameters passed on
7223 return if $self->carp_if_locked;
7224 return $self->_range_list->add_range(@_);
7229 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7231 # All match tables are to be used only by the Perl core.
7232 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
7235 sub pre_body { # Does nothing for match tables.
7239 sub append_to_body { # Does nothing for match tables.
7247 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7249 $self->SUPER::set_fate($fate, $reason);
7251 # All children share this fate
7252 foreach my $child ($self->children) {
7253 $child->set_fate($fate, $reason);
7260 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7262 return $self->SUPER::write(0, 2); # No adjustments; 2 tab stops
7265 sub set_final_comment {
7266 # This creates a comment for the file that is to hold the match table
7267 # $self. It is somewhat convoluted to make the English read nicely,
7268 # but, heh, it's just a comment.
7269 # This should be called only with the leader match table of all the
7270 # ones that share the same file. It lists all such tables, ordered so
7271 # that related ones are together.
7273 return unless $debugging_build;
7275 my $leader = shift; # Should only be called on the leader table of
7276 # an equivalent group
7277 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7279 my $addr = do { no overloading; pack 'J', $leader; };
7281 if ($leader{$addr} != $leader) {
7282 Carp::my_carp_bug(<<END
7283 set_final_comment() must be called on a leader table, which $leader is not.
7284 It is equivalent to $leader{$addr}. No comment created
7290 # Get the number of code points matched by each of the tables in this
7291 # file, and add underscores for clarity.
7292 my $count = $leader->count;
7293 my $string_count = main::clarify_number($count);
7295 my $loose_count = 0; # how many aliases loosely matched
7296 my $compound_name = ""; # ? Are any names compound?, and if so, an
7298 my $properties_with_compound_names = 0; # count of these
7301 my %flags; # The status flags used in the file
7302 my $total_entries = 0; # number of entries written in the comment
7303 my $matches_comment = ""; # The portion of the comment about the
7305 my @global_comments; # List of all the tables' comments that are
7306 # there before this routine was called.
7307 my $has_ucd_alias = 0; # If there is an alias that is accessible via
7308 # Unicode::UCD. If not, then don't say it is
7311 # Get list of all the parent tables that are equivalent to this one
7312 # (including itself).
7313 my @parents = grep { $parent{main::objaddr $_} == $_ }
7314 main::uniques($leader, @{$equivalents{$addr}});
7315 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
7318 for my $parent (@parents) {
7320 my $property = $parent->property;
7322 # Special case 'N' tables in properties with two match tables when
7323 # the other is a 'Y' one. These are likely to be binary tables,
7324 # but not necessarily. In either case, \P{} will match the
7325 # complement of \p{}, and so if something is a synonym of \p, the
7326 # complement of that something will be the synonym of \P. This
7327 # would be true of any property with just two match tables, not
7328 # just those whose values are Y and N; but that would require a
7329 # little extra work, and there are none such so far in Unicode.
7330 my $perl_p = 'p'; # which is it? \p{} or \P{}
7331 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
7333 if (scalar $property->tables == 2
7334 && $parent == $property->table('N')
7335 && defined (my $yes = $property->table('Y')))
7337 my $yes_addr = do { no overloading; pack 'J', $yes; };
7339 = grep { $_->property == $perl }
7342 $parent{$yes_addr}->children);
7344 # But these synonyms are \P{} ,not \p{}
7348 my @description; # Will hold the table description
7349 my @note; # Will hold the table notes.
7350 my @conflicting; # Will hold the table conflicts.
7352 # Look at the parent, any yes synonyms, and all the children
7353 my $parent_addr = do { no overloading; pack 'J', $parent; };
7354 for my $table ($parent,
7356 @{$children{$parent_addr}})
7358 my $table_addr = do { no overloading; pack 'J', $table; };
7359 my $table_property = $table->property;
7361 # Tables are separated by a blank line to create a grouping.
7362 $matches_comment .= "\n" if $matches_comment;
7364 # The table is named based on the property and value
7365 # combination it is for, like script=greek. But there may be
7366 # a number of synonyms for each side, like 'sc' for 'script',
7367 # and 'grek' for 'greek'. Any combination of these is a valid
7368 # name for this table. In this case, there are three more,
7369 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
7370 # listing all possible combinations in the comment, we make
7371 # sure that each synonym occurs at least once, and add
7372 # commentary that the other combinations are possible.
7373 # Because regular expressions don't recognize things like
7374 # \p{jsn=}, only look at non-null right-hand-sides
7375 my @property_aliases = $table_property->aliases;
7376 my @table_aliases = grep { $_->name ne "" } $table->aliases;
7378 # The alias lists above are already ordered in the order we
7379 # want to output them. To ensure that each synonym is listed,
7380 # we must use the max of the two numbers. But if there are no
7381 # legal synonyms (nothing in @table_aliases), then we don't
7383 my $listed_combos = (@table_aliases)
7384 ? main::max(scalar @table_aliases,
7385 scalar @property_aliases)
7387 trace "$listed_combos, tables=", scalar @table_aliases, "; names=", scalar @property_aliases if main::DEBUG;
7390 my $property_had_compound_name = 0;
7392 for my $i (0 .. $listed_combos - 1) {
7395 # The current alias for the property is the next one on
7396 # the list, or if beyond the end, start over. Similarly
7397 # for the table (\p{prop=table})
7398 my $property_alias = $property_aliases
7399 [$i % @property_aliases]->name;
7400 my $table_alias_object = $table_aliases
7401 [$i % @table_aliases];
7402 my $table_alias = $table_alias_object->name;
7403 my $loose_match = $table_alias_object->loose_match;
7404 $has_ucd_alias |= $table_alias_object->ucd;
7406 if ($table_alias !~ /\D/) { # Clarify large numbers.
7407 $table_alias = main::clarify_number($table_alias)
7410 # Add a comment for this alias combination
7411 my $current_match_comment;
7412 if ($table_property == $perl) {
7413 $current_match_comment = "\\$perl_p"
7417 $current_match_comment
7418 = "\\p{$property_alias=$table_alias}";
7419 $property_had_compound_name = 1;
7422 # Flag any abnormal status for this table.
7423 my $flag = $property->status
7425 || $table_alias_object->status;
7426 if ($flag && $flag ne $PLACEHOLDER) {
7427 $flags{$flag} = $status_past_participles{$flag};
7432 # Pretty up the comment. Note the \b; it says don't make
7433 # this line a continuation.
7434 $matches_comment .= sprintf("\b%-1s%-s%s\n",
7437 $current_match_comment);
7438 } # End of generating the entries for this table.
7440 # Save these for output after this group of related tables.
7441 push @description, $table->description;
7442 push @note, $table->note;
7443 push @conflicting, $table->conflicting;
7445 # And this for output after all the tables.
7446 push @global_comments, $table->comment;
7448 # Compute an alternate compound name using the final property
7449 # synonym and the first table synonym with a colon instead of
7450 # the equal sign used elsewhere.
7451 if ($property_had_compound_name) {
7452 $properties_with_compound_names ++;
7453 if (! $compound_name || @property_aliases > 1) {
7454 $compound_name = $property_aliases[-1]->name
7456 . $table_aliases[0]->name;
7459 } # End of looping through all children of this table
7461 # Here have assembled in $matches_comment all the related tables
7462 # to the current parent (preceded by the same info for all the
7463 # previous parents). Put out information that applies to all of
7464 # the current family.
7467 # But output the conflicting information now, as it applies to
7469 my $conflicting = join ", ", @conflicting;
7471 $matches_comment .= <<END;
7473 Note that contrary to what you might expect, the above is NOT the same as
7475 $matches_comment .= "any of: " if @conflicting > 1;
7476 $matches_comment .= "$conflicting\n";
7480 $matches_comment .= "\n Meaning: "
7481 . join('; ', @description)
7485 $matches_comment .= "\n Note: "
7486 . join("\n ", @note)
7489 } # End of looping through all tables
7497 $code_points = 'single code point';
7501 $code_points = "$string_count code points";
7506 if ($total_entries == 1) {
7509 $any_of_these = 'this'
7512 $synonyms = " any of the following regular expression constructs";
7513 $entries = 'entries';
7514 $any_of_these = 'any of these'
7518 if ($has_ucd_alias) {
7519 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
7521 if ($has_unrelated) {
7523 This file is for tables that are not necessarily related: To conserve
7524 resources, every table that matches the identical set of code points in this
7525 version of Unicode uses this file. Each one is listed in a separate group
7526 below. It could be that the tables will match the same set of code points in
7527 other Unicode releases, or it could be purely coincidence that they happen to
7528 be the same in Unicode $string_version, and hence may not in other versions.
7534 foreach my $flag (sort keys %flags) {
7536 '$flag' below means that this form is $flags{$flag}.
7537 Consult $pod_file.pod
7543 if ($total_entries == 0) {
7544 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
7546 This file returns the $code_points in Unicode Version $string_version for
7547 $leader, but it is inaccessible through Perl regular expressions, as
7548 "\\p{prop=}" is not recognized.
7553 This file returns the $code_points in Unicode Version $string_version that
7557 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
7558 including if adding or subtracting white space, underscore, and hyphen
7559 characters matters or doesn't matter, and other permissible syntactic
7560 variants. Upper/lower case distinctions never matter.
7564 if ($compound_name) {
7567 A colon can be substituted for the equals sign, and
7569 if ($properties_with_compound_names > 1) {
7571 within each group above,
7574 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
7576 # Note the \b below, it says don't make that line a continuation.
7578 anything to the left of the equals (or colon) can be combined with anything to
7579 the right. Thus, for example,
7585 # And append any comment(s) from the actual tables. They are all
7586 # gathered here, so may not read all that well.
7587 if (@global_comments) {
7588 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
7591 if ($count) { # The format differs if no code points, and needs no
7592 # explanation in that case
7595 The format of the lines of this file is:
7598 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
7599 STOP is the ending point, or if omitted, the range has just one code point.
7601 if ($leader->output_range_counts) {
7603 Numbers in comments in [brackets] indicate how many code points are in the
7609 $leader->set_comment(main::join_lines($comment));
7613 # Accessors for the underlying list
7615 get_valid_code_point
7616 get_invalid_code_point
7624 return $self->_range_list->$sub(@_);
7627 } # End closure for Match_Table
7631 # The Property class represents a Unicode property, or the $perl
7632 # pseudo-property. It contains a map table initialized empty at construction
7633 # time, and for properties accessible through regular expressions, various
7634 # match tables, created through the add_match_table() method, and referenced
7635 # by the table('NAME') or tables() methods, the latter returning a list of all
7636 # of the match tables. Otherwise table operations implicitly are for the map
7639 # Most of the data in the property is actually about its map table, so it
7640 # mostly just uses that table's accessors for most methods. The two could
7641 # have been combined into one object, but for clarity because of their
7642 # differing semantics, they have been kept separate. It could be argued that
7643 # the 'file' and 'directory' fields should be kept with the map table.
7645 # Each property has a type. This can be set in the constructor, or in the
7646 # set_type accessor, but mostly it is figured out by the data. Every property
7647 # starts with unknown type, overridden by a parameter to the constructor, or
7648 # as match tables are added, or ranges added to the map table, the data is
7649 # inspected, and the type changed. After the table is mostly or entirely
7650 # filled, compute_type() should be called to finalize they analysis.
7652 # There are very few operations defined. One can safely remove a range from
7653 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
7654 # table to this one, replacing any in the intersection of the two.
7656 sub standardize { return main::standardize($_[0]); }
7657 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
7661 # This hash will contain as keys, all the aliases of all properties, and
7662 # as values, pointers to their respective property objects. This allows
7663 # quick look-up of a property from any of its names.
7664 my %alias_to_property_of;
7666 sub dump_alias_to_property_of {
7669 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
7674 # This is a package subroutine, not called as a method.
7675 # If the single parameter is a literal '*' it returns a list of all
7676 # defined properties.
7677 # Otherwise, the single parameter is a name, and it returns a pointer
7678 # to the corresponding property object, or undef if none.
7680 # Properties can have several different names. The 'standard' form of
7681 # each of them is stored in %alias_to_property_of as they are defined.
7682 # But it's possible that this subroutine will be called with some
7683 # variant, so if the initial lookup fails, it is repeated with the
7684 # standardized form of the input name. If found, besides returning the
7685 # result, the input name is added to the list so future calls won't
7686 # have to do the conversion again.
7690 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7692 if (! defined $name) {
7693 Carp::my_carp_bug("Undefined input property. No action taken.");
7697 return main::uniques(values %alias_to_property_of) if $name eq '*';
7699 # Return cached result if have it.
7700 my $result = $alias_to_property_of{$name};
7701 return $result if defined $result;
7703 # Convert the input to standard form.
7704 my $standard_name = standardize($name);
7706 $result = $alias_to_property_of{$standard_name};
7707 return unless defined $result; # Don't cache undefs
7709 # Cache the result before returning it.
7710 $alias_to_property_of{$name} = $result;
7715 main::setup_package();
7718 # A pointer to the map table object for this property
7719 main::set_access('map', \%map);
7722 # The property's full name. This is a duplicate of the copy kept in the
7723 # map table, but is needed because stringify needs it during
7724 # construction of the map table, and then would have a chicken before egg
7726 main::set_access('full_name', \%full_name, 'r');
7729 # This hash will contain as keys, all the aliases of any match tables
7730 # attached to this property, and as values, the pointers to their
7731 # respective tables. This allows quick look-up of a table from any of its
7733 main::set_access('table_ref', \%table_ref);
7736 # The type of the property, $ENUM, $BINARY, etc
7737 main::set_access('type', \%type, 'r');
7740 # The filename where the map table will go (if actually written).
7741 # Normally defaulted, but can be overridden.
7742 main::set_access('file', \%file, 'r', 's');
7745 # The directory where the map table will go (if actually written).
7746 # Normally defaulted, but can be overridden.
7747 main::set_access('directory', \%directory, 's');
7749 my %pseudo_map_type;
7750 # This is used to affect the calculation of the map types for all the
7751 # ranges in the table. It should be set to one of the values that signify
7752 # to alter the calculation.
7753 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
7755 my %has_only_code_point_maps;
7756 # A boolean used to help in computing the type of data in the map table.
7757 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
7760 # A list of the first few distinct mappings this property has. This is
7761 # used to disambiguate between binary and enum property types, so don't
7762 # have to keep more than three.
7763 main::set_access('unique_maps', \%unique_maps);
7765 my %pre_declared_maps;
7766 # A boolean that gives whether the input data should declare all the
7767 # tables used, or not. If the former, unknown ones raise a warning.
7768 main::set_access('pre_declared_maps',
7769 \%pre_declared_maps, 'r', 's');
7772 # The only required parameter is the positionally first, name. All
7773 # other parameters are key => value pairs. See the documentation just
7774 # above for the meanings of the ones not passed directly on to the map
7775 # table constructor.
7778 my $name = shift || "";
7780 my $self = property_ref($name);
7781 if (defined $self) {
7782 my $options_string = join ", ", @_;
7783 $options_string = ". Ignoring options $options_string" if $options_string;
7784 Carp::my_carp("$self is already in use. Using existing one$options_string;");
7790 $self = bless \do { my $anonymous_scalar }, $class;
7791 my $addr = do { no overloading; pack 'J', $self; };
7793 $directory{$addr} = delete $args{'Directory'};
7794 $file{$addr} = delete $args{'File'};
7795 $full_name{$addr} = delete $args{'Full_Name'} || $name;
7796 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
7797 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
7798 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
7799 # Starting in this release, property
7800 # values should be defined for all
7801 # properties, except those overriding this
7802 // $v_version ge v5.1.0;
7804 # Rest of parameters passed on.
7806 $has_only_code_point_maps{$addr} = 1;
7807 $table_ref{$addr} = { };
7808 $unique_maps{$addr} = { };
7810 $map{$addr} = Map_Table->new($name,
7811 Full_Name => $full_name{$addr},
7812 _Alias_Hash => \%alias_to_property_of,
7818 # See this program's beginning comment block about overloading the copy
7819 # constructor. Few operations are defined on properties, but a couple are
7820 # useful. It is safe to take the inverse of a property, and to remove a
7821 # single code point from it.
7824 qw("") => "_operator_stringify",
7825 "." => \&main::_operator_dot,
7826 ".=" => \&main::_operator_dot_equal,
7827 '==' => \&main::_operator_equal,
7828 '!=' => \&main::_operator_not_equal,
7829 '=' => sub { return shift },
7830 '-=' => "_minus_and_equal",
7833 sub _operator_stringify {
7834 return "Property '" . shift->full_name . "'";
7837 sub _minus_and_equal {
7838 # Remove a single code point from the map table of a property.
7842 my $reversed = shift;
7843 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7846 Carp::my_carp_bug("Bad news. Can't cope with a "
7848 . " argument to '-='. Subtraction ignored.");
7851 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
7852 Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
7854 . " from a non-object. undef returned.");
7859 $map{pack 'J', $self}->delete_range($other, $other);
7864 sub add_match_table {
7865 # Add a new match table for this property, with name given by the
7866 # parameter. It returns a pointer to the table.
7872 my $addr = do { no overloading; pack 'J', $self; };
7874 my $table = $table_ref{$addr}{$name};
7875 my $standard_name = main::standardize($name);
7877 || (defined ($table = $table_ref{$addr}{$standard_name})))
7879 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
7880 $table_ref{$addr}{$name} = $table;
7885 # See if this is a perl extension, if not passed in.
7886 my $perl_extension = delete $args{'Perl_Extension'};
7888 = $self->perl_extension if ! defined $perl_extension;
7890 $table = Match_Table->new(
7892 Perl_Extension => $perl_extension,
7893 _Alias_Hash => $table_ref{$addr},
7896 # gets property's fate and status by default
7897 Fate => $self->fate,
7898 Status => $self->status,
7899 _Status_Info => $self->status_info,
7901 return unless defined $table;
7904 # Save the names for quick look up
7905 $table_ref{$addr}{$standard_name} = $table;
7906 $table_ref{$addr}{$name} = $table;
7908 # Perhaps we can figure out the type of this property based on the
7909 # fact of adding this match table. First, string properties don't
7910 # have match tables; second, a binary property can't have 3 match
7912 if ($type{$addr} == $UNKNOWN) {
7913 $type{$addr} = $NON_STRING;
7915 elsif ($type{$addr} == $STRING) {
7916 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
7917 $type{$addr} = $NON_STRING;
7919 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
7920 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2
7921 && $type{$addr} == $BINARY)
7923 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
7924 $type{$addr} = $ENUM;
7931 sub delete_match_table {
7932 # Delete the table referred to by $2 from the property $1.
7935 my $table_to_remove = shift;
7936 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7938 my $addr = do { no overloading; pack 'J', $self; };
7940 # Remove all names that refer to it.
7941 foreach my $key (keys %{$table_ref{$addr}}) {
7942 delete $table_ref{$addr}{$key}
7943 if $table_ref{$addr}{$key} == $table_to_remove;
7946 $table_to_remove->DESTROY;
7951 # Return a pointer to the match table (with name given by the
7952 # parameter) associated with this property; undef if none.
7956 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7958 my $addr = do { no overloading; pack 'J', $self; };
7960 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
7962 # If quick look-up failed, try again using the standard form of the
7963 # input name. If that succeeds, cache the result before returning so
7964 # won't have to standardize this input name again.
7965 my $standard_name = main::standardize($name);
7966 return unless defined $table_ref{$addr}{$standard_name};
7968 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
7969 return $table_ref{$addr}{$name};
7973 # Return a list of pointers to all the match tables attached to this
7977 return main::uniques(values %{$table_ref{pack 'J', shift}});
7981 # Returns the directory the map table for this property should be
7982 # output in. If a specific directory has been specified, that has
7983 # priority; 'undef' is returned if the type isn't defined;
7984 # or $map_directory for everything else.
7986 my $addr = do { no overloading; pack 'J', shift; };
7988 return $directory{$addr} if defined $directory{$addr};
7989 return undef if $type{$addr} == $UNKNOWN;
7990 return $map_directory;
7994 # Return the name that is used to both:
7995 # 1) Name the file that the map table is written to.
7996 # 2) The name of swash related stuff inside that file.
7997 # The reason for this is that the Perl core historically has used
7998 # certain names that aren't the same as the Unicode property names.
7999 # To continue using these, $file is hard-coded in this file for those,
8000 # but otherwise the standard name is used. This is different from the
8001 # external_name, so that the rest of the files, like in lib can use
8002 # the standard name always, without regard to historical precedent.
8005 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8007 my $addr = do { no overloading; pack 'J', $self; };
8009 # Swash names are used only on regular map tables; otherwise there
8010 # should be no access to the property map table from other parts of
8012 return if $map{$addr}->fate != $ORDINARY;
8014 return $file{$addr} if defined $file{$addr};
8015 return $map{$addr}->external_name;
8018 sub to_create_match_tables {
8019 # Returns a boolean as to whether or not match tables should be
8020 # created for this property.
8023 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8025 # The whole point of this pseudo property is match tables.
8026 return 1 if $self == $perl;
8028 my $addr = do { no overloading; pack 'J', $self; };
8030 # Don't generate tables of code points that match the property values
8031 # of a string property. Such a list would most likely have many
8032 # property values, each with just one or very few code points mapping
8034 return 0 if $type{$addr} == $STRING;
8036 # Don't generate anything for unimplemented properties.
8037 return 0 if grep { $self->complete_name eq $_ }
8038 @unimplemented_properties;
8043 sub property_add_or_replace_non_nulls {
8044 # This adds the mappings in the property $other to $self. Non-null
8045 # mappings from $other override those in $self. It essentially merges
8046 # the two properties, with the second having priority except for null
8051 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8053 if (! $other->isa(__PACKAGE__)) {
8054 Carp::my_carp_bug("$other should be a "
8063 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
8067 # Certain tables are not generally written out to files, but
8068 # Unicode::UCD has the intelligence to know that the file for $self
8069 # can be used to reconstruct those tables. This routine just changes
8070 # things so that UCD pod entries for those suppressed tables are
8071 # generated, so the fact that a proxy is used is invisible to the
8076 foreach my $property_name (@_) {
8077 my $ref = property_ref($property_name);
8078 next if $ref->to_output_map;
8079 $ref->set_fate($MAP_PROXIED);
8084 # Set the type of the property. Mostly this is figured out by the
8085 # data in the table. But this is used to set it explicitly. The
8086 # reason it is not a standard accessor is that when setting a binary
8087 # property, we need to make sure that all the true/false aliases are
8088 # present, as they were omitted in early Unicode releases.
8092 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8096 && $type != $FORCED_BINARY
8097 && $type != $STRING)
8099 Carp::my_carp("Unrecognized type '$type'. Type not set");
8103 { no overloading; $type{pack 'J', $self} = $type; }
8104 return if $type != $BINARY && $type != $FORCED_BINARY;
8106 my $yes = $self->table('Y');
8107 $yes = $self->table('Yes') if ! defined $yes;
8108 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
8111 # Add aliases in order wanted, duplicates will be ignored. We use a
8112 # binary property present in all releases for its ordered lists of
8113 # true/false aliases. Note, that could run into problems in
8114 # outputting things in that we don't distinguish between the name and
8115 # full name of these. Hopefully, if the table was already created
8116 # before this code is executed, it was done with these set properly.
8117 my $bm = property_ref("Bidi_Mirrored");
8118 foreach my $alias ($bm->table("Y")->aliases) {
8119 $yes->add_alias($alias->name);
8121 my $no = $self->table('N');
8122 $no = $self->table('No') if ! defined $no;
8123 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
8124 foreach my $alias ($bm->table("N")->aliases) {
8125 $no->add_alias($alias->name);
8132 # Add a map to the property's map table. This also keeps
8133 # track of the maps so that the property type can be determined from
8137 my $start = shift; # First code point in range
8138 my $end = shift; # Final code point in range
8139 my $map = shift; # What the range maps to.
8140 # Rest of parameters passed on.
8142 my $addr = do { no overloading; pack 'J', $self; };
8144 # If haven't the type of the property, gather information to figure it
8146 if ($type{$addr} == $UNKNOWN) {
8148 # If the map contains an interior blank or dash, or most other
8149 # nonword characters, it will be a string property. This
8150 # heuristic may actually miss some string properties. If so, they
8151 # may need to have explicit set_types called for them. This
8152 # happens in the Unihan properties.
8153 if ($map =~ / (?<= . ) [ -] (?= . ) /x
8154 || $map =~ / [^\w.\/\ -] /x)
8156 $self->set_type($STRING);
8158 # $unique_maps is used for disambiguating between ENUM and
8159 # BINARY later; since we know the property is not going to be
8160 # one of those, no point in keeping the data around
8161 undef $unique_maps{$addr};
8165 # Not necessarily a string. The final decision has to be
8166 # deferred until all the data are in. We keep track of if all
8167 # the values are code points for that eventual decision.
8168 $has_only_code_point_maps{$addr} &=
8169 $map =~ / ^ $code_point_re $/x;
8171 # For the purposes of disambiguating between binary and other
8172 # enumerations at the end, we keep track of the first three
8173 # distinct property values. Once we get to three, we know
8174 # it's not going to be binary, so no need to track more.
8175 if (scalar keys %{$unique_maps{$addr}} < 3) {
8176 $unique_maps{$addr}{main::standardize($map)} = 1;
8181 # Add the mapping by calling our map table's method
8182 return $map{$addr}->add_map($start, $end, $map, @_);
8186 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
8187 # should be called after the property is mostly filled with its maps.
8188 # We have been keeping track of what the property values have been,
8189 # and now have the necessary information to figure out the type.
8192 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8194 my $addr = do { no overloading; pack 'J', $self; };
8196 my $type = $type{$addr};
8198 # If already have figured these out, no need to do so again, but we do
8199 # a double check on ENUMS to make sure that a string property hasn't
8200 # improperly been classified as an ENUM, so continue on with those.
8201 return if $type == $STRING
8203 || $type == $FORCED_BINARY;
8205 # If every map is to a code point, is a string property.
8206 if ($type == $UNKNOWN
8207 && ($has_only_code_point_maps{$addr}
8208 || (defined $map{$addr}->default_map
8209 && $map{$addr}->default_map eq "")))
8211 $self->set_type($STRING);
8215 # Otherwise, it is to some sort of enumeration. (The case where
8216 # it is a Unicode miscellaneous property, and treated like a
8217 # string in this program is handled in add_map()). Distinguish
8218 # between binary and some other enumeration type. Of course, if
8219 # there are more than two values, it's not binary. But more
8220 # subtle is the test that the default mapping is defined means it
8221 # isn't binary. This in fact may change in the future if Unicode
8222 # changes the way its data is structured. But so far, no binary
8223 # properties ever have @missing lines for them, so the default map
8224 # isn't defined for them. The few properties that are two-valued
8225 # and aren't considered binary have the default map defined
8226 # starting in Unicode 5.0, when the @missing lines appeared; and
8227 # this program has special code to put in a default map for them
8228 # for earlier than 5.0 releases.
8230 || scalar keys %{$unique_maps{$addr}} > 2
8231 || defined $self->default_map)
8233 my $tables = $self->tables;
8234 my $count = $self->count;
8235 if ($verbosity && $count > 500 && $tables/$count > .1) {
8236 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $count\n");
8238 $self->set_type($ENUM);
8241 $self->set_type($BINARY);
8244 undef $unique_maps{$addr}; # Garbage collect
8251 my $reason = shift; # Ignored unless suppressing
8252 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8254 my $addr = do { no overloading; pack 'J', $self; };
8255 if ($fate == $SUPPRESSED) {
8256 $why_suppressed{$self->complete_name} = $reason;
8259 # Each table shares the property's fate, except that MAP_PROXIED
8260 # doesn't affect match tables
8261 $map{$addr}->set_fate($fate, $reason);
8262 if ($fate != $MAP_PROXIED) {
8263 foreach my $table ($map{$addr}, $self->tables) {
8264 $table->set_fate($fate, $reason);
8271 # Most of the accessors for a property actually apply to its map table.
8272 # Setup up accessor functions for those, referring to %map
8321 # 'property' above is for symmetry, so that one can take
8322 # the property of a property and get itself, and so don't
8323 # have to distinguish between properties and tables in
8331 return $map{pack 'J', $self}->$sub(@_);
8341 # Returns lines of the input joined together, so that they can be folded
8343 # This causes continuation lines to be joined together into one long line
8344 # for folding. A continuation line is any line that doesn't begin with a
8345 # space or "\b" (the latter is stripped from the output). This is so
8346 # lines can be be in a HERE document so as to fit nicely in the terminal
8347 # width, but be joined together in one long line, and then folded with
8348 # indents, '#' prefixes, etc, properly handled.
8349 # A blank separates the joined lines except if there is a break; an extra
8350 # blank is inserted after a period ending a line.
8352 # Initialize the return with the first line.
8353 my ($return, @lines) = split "\n", shift;
8355 # If the first line is null, it was an empty line, add the \n back in
8356 $return = "\n" if $return eq "";
8358 # Now join the remainder of the physical lines.
8359 for my $line (@lines) {
8361 # An empty line means wanted a blank line, so add two \n's to get that
8362 # effect, and go to the next line.
8363 if (length $line == 0) {
8368 # Look at the last character of what we have so far.
8369 my $previous_char = substr($return, -1, 1);
8371 # And at the next char to be output.
8372 my $next_char = substr($line, 0, 1);
8374 if ($previous_char ne "\n") {
8376 # Here didn't end wth a nl. If the next char a blank or \b, it
8377 # means that here there is a break anyway. So add a nl to the
8379 if ($next_char eq " " || $next_char eq "\b") {
8380 $previous_char = "\n";
8381 $return .= $previous_char;
8384 # Add an extra space after periods.
8385 $return .= " " if $previous_char eq '.';
8388 # Here $previous_char is still the latest character to be output. If
8389 # it isn't a nl, it means that the next line is to be a continuation
8390 # line, with a blank inserted between them.
8391 $return .= " " if $previous_char ne "\n";
8394 substr($line, 0, 1) = "" if $next_char eq "\b";
8396 # And append this next line.
8403 sub simple_fold($;$$$) {
8404 # Returns a string of the input (string or an array of strings) folded
8405 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
8407 # This is tailored for the kind of text written by this program,
8408 # especially the pod file, which can have very long names with
8409 # underscores in the middle, or words like AbcDefgHij.... We allow
8410 # breaking in the middle of such constructs if the line won't fit
8411 # otherwise. The break in such cases will come either just after an
8412 # underscore, or just before one of the Capital letters.
8414 local $to_trace = 0 if main::DEBUG;
8417 my $prefix = shift; # Optional string to prepend to each output
8419 $prefix = "" unless defined $prefix;
8421 my $hanging_indent = shift; # Optional number of spaces to indent
8422 # continuation lines
8423 $hanging_indent = 0 unless $hanging_indent;
8425 my $right_margin = shift; # Optional number of spaces to narrow the
8427 $right_margin = 0 unless defined $right_margin;
8429 # Call carp with the 'nofold' option to avoid it from trying to call us
8431 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
8433 # The space available doesn't include what's automatically prepended
8434 # to each line, or what's reserved on the right.
8435 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
8436 # XXX Instead of using the 'nofold' perhaps better to look up the stack
8438 if (DEBUG && $hanging_indent >= $max) {
8439 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
8440 $hanging_indent = 0;
8443 # First, split into the current physical lines.
8445 if (ref $line) { # Better be an array, because not bothering to
8447 foreach my $line (@{$line}) {
8448 push @line, split /\n/, $line;
8452 @line = split /\n/, $line;
8455 #local $to_trace = 1 if main::DEBUG;
8456 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
8458 # Look at each current physical line.
8459 for (my $i = 0; $i < @line; $i++) {
8460 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
8461 #local $to_trace = 1 if main::DEBUG;
8462 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
8464 # Remove prefix, because will be added back anyway, don't want
8466 $line[$i] =~ s/^$prefix//;
8468 # Remove trailing space
8469 $line[$i] =~ s/\s+\Z//;
8471 # If the line is too long, fold it.
8472 if (length $line[$i] > $max) {
8475 # Here needs to fold. Save the leading space in the line for
8477 $line[$i] =~ /^ ( \s* )/x;
8478 my $leading_space = $1;
8479 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
8481 # If character at final permissible position is white space,
8482 # fold there, which will delete that white space
8483 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
8484 $remainder = substr($line[$i], $max);
8485 $line[$i] = substr($line[$i], 0, $max - 1);
8489 # Otherwise fold at an acceptable break char closest to
8490 # the max length. Look at just the maximal initial
8491 # segment of the line
8492 my $segment = substr($line[$i], 0, $max - 1);
8494 /^ ( .{$hanging_indent} # Don't look before the
8496 \ * # Don't look in leading
8497 # blanks past the indent
8498 [^ ] .* # Find the right-most
8499 (?: # acceptable break:
8500 [ \s = ] # space or equal
8501 | - (?! [.0-9] ) # or non-unary minus.
8502 ) # $1 includes the character
8505 # Split into the initial part that fits, and remaining
8507 $remainder = substr($line[$i], length $1);
8509 trace $line[$i] if DEBUG && $to_trace;
8510 trace $remainder if DEBUG && $to_trace;
8513 # If didn't find a good breaking spot, see if there is a
8514 # not-so-good breaking spot. These are just after
8515 # underscores or where the case changes from lower to
8516 # upper. Use \a as a soft hyphen, but give up
8517 # and don't break the line if there is actually a \a
8518 # already in the input. We use an ascii character for the
8519 # soft-hyphen to avoid any attempt by miniperl to try to
8520 # access the files that this program is creating.
8521 elsif ($segment !~ /\a/
8522 && ($segment =~ s/_/_\a/g
8523 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
8525 # Here were able to find at least one place to insert
8526 # our substitute soft hyphen. Find the right-most one
8527 # and replace it by a real hyphen.
8528 trace $segment if DEBUG && $to_trace;
8530 rindex($segment, "\a"),
8533 # Then remove the soft hyphen substitutes.
8534 $segment =~ s/\a//g;
8535 trace $segment if DEBUG && $to_trace;
8537 # And split into the initial part that fits, and
8538 # remainder of the line
8539 my $pos = rindex($segment, '-');
8540 $remainder = substr($line[$i], $pos);
8541 trace $remainder if DEBUG && $to_trace;
8542 $line[$i] = substr($segment, 0, $pos + 1);
8546 # Here we know if we can fold or not. If we can, $remainder
8547 # is what remains to be processed in the next iteration.
8548 if (defined $remainder) {
8549 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
8551 # Insert the folded remainder of the line as a new element
8552 # of the array. (It may still be too long, but we will
8553 # deal with that next time through the loop.) Omit any
8554 # leading space in the remainder.
8555 $remainder =~ s/^\s+//;
8556 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
8558 # But then indent by whichever is larger of:
8559 # 1) the leading space on the input line;
8560 # 2) the hanging indent.
8561 # This preserves indentation in the original line.
8562 my $lead = ($leading_space)
8563 ? length $leading_space
8565 $lead = max($lead, $hanging_indent);
8566 splice @line, $i+1, 0, (" " x $lead) . $remainder;
8570 # Ready to output the line. Get rid of any trailing space
8571 # And prefix by the required $prefix passed in.
8572 $line[$i] =~ s/\s+$//;
8573 $line[$i] = "$prefix$line[$i]\n";
8574 } # End of looping through all the lines.
8576 return join "", @line;
8579 sub property_ref { # Returns a reference to a property object.
8580 return Property::property_ref(@_);
8583 sub force_unlink ($) {
8584 my $filename = shift;
8585 return unless file_exists($filename);
8586 return if CORE::unlink($filename);
8588 # We might need write permission
8589 chmod 0777, $filename;
8590 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
8595 # Given a filename and references to arrays of lines, write the lines of
8596 # each array to the file
8597 # Filename can be given as an arrayref of directory names
8599 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
8602 my $use_utf8 = shift;
8604 # Get into a single string if an array, and get rid of, in Unix terms, any
8606 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
8607 $file = File::Spec->canonpath($file);
8609 # If has directories, make sure that they all exist
8610 (undef, my $directories, undef) = File::Spec->splitpath($file);
8611 File::Path::mkpath($directories) if $directories && ! -d $directories;
8613 push @files_actually_output, $file;
8615 force_unlink ($file);
8618 if (not open $OUT, ">", $file) {
8619 Carp::my_carp("can't open $file for output. Skipping this file: $!");
8623 binmode $OUT, ":utf8" if $use_utf8;
8625 while (defined (my $lines_ref = shift)) {
8626 unless (@$lines_ref) {
8627 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
8630 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
8632 close $OUT or die Carp::my_carp("close '$file' failed: $!");
8634 print "$file written.\n" if $verbosity >= $VERBOSE;
8640 sub Standardize($) {
8641 # This converts the input name string into a standardized equivalent to
8645 unless (defined $name) {
8646 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
8650 # Remove any leading or trailing white space
8654 # Convert interior white space and hyphens into underscores.
8655 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
8657 # Capitalize the letter following an underscore, and convert a sequence of
8658 # multiple underscores to a single one
8659 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
8661 # And capitalize the first letter, but not for the special cjk ones.
8662 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
8666 sub standardize ($) {
8667 # Returns a lower-cased standardized name, without underscores. This form
8668 # is chosen so that it can distinguish between any real versus superficial
8669 # Unicode name differences. It relies on the fact that Unicode doesn't
8670 # have interior underscores, white space, nor dashes in any
8671 # stricter-matched name. It should not be used on Unicode code point
8672 # names (the Name property), as they mostly, but not always follow these
8675 my $name = Standardize(shift);
8676 return if !defined $name;
8678 $name =~ s/ (?<= .) _ (?= . ) //xg;
8682 sub utf8_heavy_name ($$) {
8683 # Returns the name that utf8_heavy.pl will use to find a table. XXX
8684 # perhaps this function should be placed somewhere, like Heavy.pl so that
8685 # utf8_heavy can use it directly without duplicating code that can get
8690 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8692 my $property = $table->property;
8693 $property = ($property == $perl)
8694 ? "" # 'perl' is never explicitly stated
8695 : standardize($property->name) . '=';
8696 if ($alias->loose_match) {
8697 return $property . standardize($alias->name);
8700 return lc ($property . $alias->name);
8708 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
8711 $main::simple_dumper_nesting = 0;
8714 # Like Simple Data::Dumper. Good enough for our needs. We can't use
8715 # the real thing as we have to run under miniperl.
8717 # It is designed so that on input it is at the beginning of a line,
8718 # and the final thing output in any call is a trailing ",\n".
8722 $indent = "" if ! $debugging_build || ! defined $indent;
8724 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8726 # nesting level is localized, so that as the call stack pops, it goes
8727 # back to the prior value.
8728 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
8729 undef %already_output if $main::simple_dumper_nesting == 0;
8730 $main::simple_dumper_nesting++;
8731 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
8733 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8735 # Determine the indent for recursive calls.
8736 my $next_indent = $indent . $indent_increment;
8741 # Dump of scalar: just output it in quotes if not a number. To do
8742 # so we must escape certain characters, and therefore need to
8743 # operate on a copy to avoid changing the original
8745 $copy = $UNDEF unless defined $copy;
8747 # Quote non-integers (integers also have optional leading '-')
8748 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
8750 # Escape apostrophe and backslash
8751 $copy =~ s/ ( ['\\] ) /\\$1/xg;
8754 $output = "$indent$copy,\n";
8758 # Keep track of cycles in the input, and refuse to infinitely loop
8759 my $addr = do { no overloading; pack 'J', $item; };
8760 if (defined $already_output{$addr}) {
8761 return "${indent}ALREADY OUTPUT: $item\n";
8763 $already_output{$addr} = $item;
8765 if (ref $item eq 'ARRAY') {
8768 if ($main::simple_dumper_nesting > 1) {
8770 $using_brackets = 1;
8773 $using_brackets = 0;
8776 # If the array is empty, put the closing bracket on the same
8777 # line. Otherwise, recursively add each array element
8783 for (my $i = 0; $i < @$item; $i++) {
8785 # Indent array elements one level
8786 $output .= &simple_dumper($item->[$i], $next_indent);
8787 next if ! $debugging_build;
8788 $output =~ s/\n$//; # Remove any trailing nl so
8789 $output .= " # [$i]\n"; # as to add a comment giving
8792 $output .= $indent; # Indent closing ']' to orig level
8794 $output .= ']' if $using_brackets;
8797 elsif (ref $item eq 'HASH') {
8802 # No surrounding braces at top level
8804 if ($main::simple_dumper_nesting > 1) {
8807 $body_indent = $next_indent;
8808 $next_indent .= $indent_increment;
8813 $body_indent = $indent;
8817 # Output hashes sorted alphabetically instead of apparently
8818 # random. Use caseless alphabetic sort
8819 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
8821 if ($is_first_line) {
8825 $output .= "$body_indent";
8828 # The key must be a scalar, but this recursive call quotes
8830 $output .= &simple_dumper($key);
8832 # And change the trailing comma and nl to the hash fat
8833 # comma for clarity, and so the value can be on the same
8835 $output =~ s/,\n$/ => /;
8837 # Recursively call to get the value's dump.
8838 my $next = &simple_dumper($item->{$key}, $next_indent);
8840 # If the value is all on one line, remove its indent, so
8841 # will follow the => immediately. If it takes more than
8842 # one line, start it on a new line.
8843 if ($next !~ /\n.*\n/) {
8852 $output .= "$indent},\n" if $using_braces;
8854 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
8855 $output = $indent . ref($item) . "\n";
8856 # XXX see if blessed
8858 elsif ($item->can('dump')) {
8860 # By convention in this program, objects furnish a 'dump'
8861 # method. Since not doing any output at this level, just pass
8862 # on the input indent
8863 $output = $item->dump($indent);
8866 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
8873 sub dump_inside_out {
8874 # Dump inside-out hashes in an object's state by converting them to a
8875 # regular hash and then calling simple_dumper on that.
8878 my $fields_ref = shift;
8879 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8881 my $addr = do { no overloading; pack 'J', $object; };
8884 foreach my $key (keys %$fields_ref) {
8885 $hash{$key} = $fields_ref->{$key}{$addr};
8888 return simple_dumper(\%hash, @_);
8892 # Overloaded '.' method that is common to all packages. It uses the
8893 # package's stringify method.
8897 my $reversed = shift;
8898 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8900 $other = "" unless defined $other;
8902 foreach my $which (\$self, \$other) {
8903 next unless ref $$which;
8904 if ($$which->can('_operator_stringify')) {
8905 $$which = $$which->_operator_stringify;
8908 my $ref = ref $$which;
8909 my $addr = do { no overloading; pack 'J', $$which; };
8910 $$which = "$ref ($addr)";
8918 sub _operator_dot_equal {
8919 # Overloaded '.=' method that is common to all packages.
8923 my $reversed = shift;
8924 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8926 $other = "" unless defined $other;
8929 return $other .= "$self";
8932 return "$self" . "$other";
8936 sub _operator_equal {
8937 # Generic overloaded '==' routine. To be equal, they must be the exact
8943 return 0 unless defined $other;
8944 return 0 unless ref $other;
8946 return $self == $other;
8949 sub _operator_not_equal {
8953 return ! _operator_equal($self, $other);
8956 sub process_PropertyAliases($) {
8957 # This reads in the PropertyAliases.txt file, which contains almost all
8958 # the character properties in Unicode and their equivalent aliases:
8959 # scf ; Simple_Case_Folding ; sfc
8961 # Field 0 is the preferred short name for the property.
8962 # Field 1 is the full name.
8963 # Any succeeding ones are other accepted names.
8966 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8968 # This whole file was non-existent in early releases, so use our own
8970 $file->insert_lines(get_old_property_aliases())
8971 if ! -e 'PropertyAliases.txt';
8973 # Add any cjk properties that may have been defined.
8974 $file->insert_lines(@cjk_properties);
8976 while ($file->next_line) {
8978 my @data = split /\s*;\s*/;
8980 my $full = $data[1];
8982 my $this = Property->new($data[0], Full_Name => $full);
8984 # Start looking for more aliases after these two.
8985 for my $i (2 .. @data - 1) {
8986 $this->add_alias($data[$i]);
8991 my $scf = property_ref("Simple_Case_Folding");
8992 $scf->add_alias("scf");
8993 $scf->add_alias("sfc");
8998 sub finish_property_setup {
8999 # Finishes setting up after PropertyAliases.
9002 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9004 # This entry was missing from this file in earlier Unicode versions
9005 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9006 Property->new('JSN', Full_Name => 'Jamo_Short_Name');
9009 # These two properties must be defined in all releases so we can generate
9010 # the tables from them to make regex \X work, but suppress their output so
9011 # aren't application visible prior to releases where they should be
9012 if (! defined property_ref('GCB')) {
9013 Property->new('GCB', Full_Name => 'Grapheme_Cluster_Break',
9014 Fate => $PLACEHOLDER);
9016 if (! defined property_ref('hst')) {
9017 Property->new('hst', Full_Name => 'Hangul_Syllable_Type',
9018 Fate => $PLACEHOLDER);
9021 # These are used so much, that we set globals for them.
9022 $gc = property_ref('General_Category');
9023 $block = property_ref('Block');
9024 $script = property_ref('Script');
9026 # Perl adds this alias.
9027 $gc->add_alias('Category');
9029 # Unicode::Normalize expects this file with this name and directory.
9030 my $ccc = property_ref('Canonical_Combining_Class');
9032 $ccc->set_file('CombiningClass');
9033 $ccc->set_directory(File::Spec->curdir());
9036 # These two properties aren't actually used in the core, but unfortunately
9037 # the names just above that are in the core interfere with these, so
9038 # choose different names. These aren't a problem unless the map tables
9039 # for these files get written out.
9040 my $lowercase = property_ref('Lowercase');
9041 $lowercase->set_file('IsLower') if defined $lowercase;
9042 my $uppercase = property_ref('Uppercase');
9043 $uppercase->set_file('IsUpper') if defined $uppercase;
9045 # Set up the hard-coded default mappings, but only on properties defined
9047 foreach my $property (keys %default_mapping) {
9048 my $property_object = property_ref($property);
9049 next if ! defined $property_object;
9050 my $default_map = $default_mapping{$property};
9051 $property_object->set_default_map($default_map);
9053 # A map of <code point> implies the property is string.
9054 if ($property_object->type == $UNKNOWN
9055 && $default_map eq $CODE_POINT)
9057 $property_object->set_type($STRING);
9061 # The following use the Multi_Default class to create objects for
9064 # Bidi class has a complicated default, but the derived file takes care of
9065 # the complications, leaving just 'L'.
9066 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
9067 property_ref('Bidi_Class')->set_default_map('L');
9072 # The derived file was introduced in 3.1.1. The values below are
9073 # taken from table 3-8, TUS 3.0
9075 'my $default = Range_List->new;
9076 $default->add_range(0x0590, 0x05FF);
9077 $default->add_range(0xFB1D, 0xFB4F);'
9080 # The defaults apply only to unassigned characters
9081 $default_R .= '$gc->table("Unassigned") & $default;';
9083 if ($v_version lt v3.0.0) {
9084 $default = Multi_Default->new(R => $default_R, 'L');
9088 # AL apparently not introduced until 3.0: TUS 2.x references are
9089 # not on-line to check it out
9091 'my $default = Range_List->new;
9092 $default->add_range(0x0600, 0x07BF);
9093 $default->add_range(0xFB50, 0xFDFF);
9094 $default->add_range(0xFE70, 0xFEFF);'
9097 # Non-character code points introduced in this release; aren't AL
9098 if ($v_version ge 3.1.0) {
9099 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
9101 $default_AL .= '$gc->table("Unassigned") & $default';
9102 $default = Multi_Default->new(AL => $default_AL,
9106 property_ref('Bidi_Class')->set_default_map($default);
9109 # Joining type has a complicated default, but the derived file takes care
9110 # of the complications, leaving just 'U' (or Non_Joining), except the file
9112 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
9113 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
9114 property_ref('Joining_Type')->set_default_map('Non_Joining');
9118 # Otherwise, there are not one, but two possibilities for the
9119 # missing defaults: T and U.
9120 # The missing defaults that evaluate to T are given by:
9121 # T = Mn + Cf - ZWNJ - ZWJ
9122 # where Mn and Cf are the general category values. In other words,
9123 # any non-spacing mark or any format control character, except
9124 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
9125 # WIDTH JOINER (joining type C).
9126 my $default = Multi_Default->new(
9127 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
9129 property_ref('Joining_Type')->set_default_map($default);
9133 # Line break has a complicated default in early releases. It is 'Unknown'
9134 # for non-assigned code points; 'AL' for assigned.
9135 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
9136 my $lb = property_ref('Line_Break');
9137 if ($v_version gt 3.2.0) {
9138 $lb->set_default_map('Unknown');
9141 my $default = Multi_Default->new( 'Unknown' => '$gc->table("Cn")',
9143 $lb->set_default_map($default);
9146 # If has the URS property, make sure that the standard aliases are in
9147 # it, since not in the input tables in some versions.
9148 my $urs = property_ref('Unicode_Radical_Stroke');
9150 $urs->add_alias('cjkRSUnicode');
9151 $urs->add_alias('kRSUnicode');
9155 # For backwards compatibility with applications that may read the mapping
9156 # file directly (it was documented in 5.12 and 5.14 as being thusly
9157 # usable), keep it from being adjusted. (range_size_1 is
9158 # used to force the traditional format.)
9159 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
9160 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
9161 $nfkc_cf->set_range_size_1(1);
9163 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
9164 $bmg->set_to_output_map($EXTERNAL_MAP);
9165 $bmg->set_range_size_1(1);
9168 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
9173 sub get_old_property_aliases() {
9174 # Returns what would be in PropertyAliases.txt if it existed in very old
9175 # versions of Unicode. It was derived from the one in 3.2, and pared
9176 # down based on the data that was actually in the older releases.
9177 # An attempt was made to use the existence of files to mean inclusion or
9178 # not of various aliases, but if this was not sufficient, using version
9179 # numbers was resorted to.
9183 # These are to be used in all versions (though some are constructed by
9184 # this program if missing)
9185 push @return, split /\n/, <<'END';
9187 Bidi_M ; Bidi_Mirrored
9189 ccc ; Canonical_Combining_Class
9190 dm ; Decomposition_Mapping
9191 dt ; Decomposition_Type
9192 gc ; General_Category
9194 lc ; Lowercase_Mapping
9196 na1 ; Unicode_1_Name
9199 scf ; Simple_Case_Folding
9200 slc ; Simple_Lowercase_Mapping
9201 stc ; Simple_Titlecase_Mapping
9202 suc ; Simple_Uppercase_Mapping
9203 tc ; Titlecase_Mapping
9204 uc ; Uppercase_Mapping
9207 if (-e 'Blocks.txt') {
9208 push @return, "blk ; Block\n";
9210 if (-e 'ArabicShaping.txt') {
9211 push @return, split /\n/, <<'END';
9216 if (-e 'PropList.txt') {
9218 # This first set is in the original old-style proplist.
9219 push @return, split /\n/, <<'END';
9220 Bidi_C ; Bidi_Control
9228 Join_C ; Join_Control
9230 QMark ; Quotation_Mark
9231 Term ; Terminal_Punctuation
9232 WSpace ; White_Space
9234 # The next sets were added later
9235 if ($v_version ge v3.0.0) {
9236 push @return, split /\n/, <<'END';
9241 if ($v_version ge v3.0.1) {
9242 push @return, split /\n/, <<'END';
9243 NChar ; Noncharacter_Code_Point
9246 # The next sets were added in the new-style
9247 if ($v_version ge v3.1.0) {
9248 push @return, split /\n/, <<'END';
9249 OAlpha ; Other_Alphabetic
9250 OLower ; Other_Lowercase
9252 OUpper ; Other_Uppercase
9255 if ($v_version ge v3.1.1) {
9256 push @return, "AHex ; ASCII_Hex_Digit\n";
9259 if (-e 'EastAsianWidth.txt') {
9260 push @return, "ea ; East_Asian_Width\n";
9262 if (-e 'CompositionExclusions.txt') {
9263 push @return, "CE ; Composition_Exclusion\n";
9265 if (-e 'LineBreak.txt') {
9266 push @return, "lb ; Line_Break\n";
9268 if (-e 'BidiMirroring.txt') {
9269 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
9271 if (-e 'Scripts.txt') {
9272 push @return, "sc ; Script\n";
9274 if (-e 'DNormalizationProps.txt') {
9275 push @return, split /\n/, <<'END';
9276 Comp_Ex ; Full_Composition_Exclusion
9277 FC_NFKC ; FC_NFKC_Closure
9278 NFC_QC ; NFC_Quick_Check
9279 NFD_QC ; NFD_Quick_Check
9280 NFKC_QC ; NFKC_Quick_Check
9281 NFKD_QC ; NFKD_Quick_Check
9282 XO_NFC ; Expands_On_NFC
9283 XO_NFD ; Expands_On_NFD
9284 XO_NFKC ; Expands_On_NFKC
9285 XO_NFKD ; Expands_On_NFKD
9288 if (-e 'DCoreProperties.txt') {
9289 push @return, split /\n/, <<'END';
9295 # These can also appear in some versions of PropList.txt
9296 push @return, "Lower ; Lowercase\n"
9297 unless grep { $_ =~ /^Lower\b/} @return;
9298 push @return, "Upper ; Uppercase\n"
9299 unless grep { $_ =~ /^Upper\b/} @return;
9302 # This flag requires the DAge.txt file to be copied into the directory.
9303 if (DEBUG && $compare_versions) {
9304 push @return, 'age ; Age';
9310 sub process_PropValueAliases {
9311 # This file contains values that properties look like:
9312 # bc ; AL ; Arabic_Letter
9313 # blk; n/a ; Greek_And_Coptic ; Greek
9315 # Field 0 is the property.
9316 # Field 1 is the short name of a property value or 'n/a' if no
9317 # short name exists;
9318 # Field 2 is the full property value name;
9319 # Any other fields are more synonyms for the property value.
9320 # Purely numeric property values are omitted from the file; as are some
9321 # others, fewer and fewer in later releases
9323 # Entries for the ccc property have an extra field before the
9325 # ccc; 0; NR ; Not_Reordered
9326 # It is the numeric value that the names are synonyms for.
9328 # There are comment entries for values missing from this file:
9329 # # @missing: 0000..10FFFF; ISO_Comment; <none>
9330 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
9333 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9335 # This whole file was non-existent in early releases, so use our own
9336 # internal one if necessary.
9337 if (! -e 'PropValueAliases.txt') {
9338 $file->insert_lines(get_old_property_value_aliases());
9341 if ($v_version lt 4.0.0) {
9342 $file->insert_lines(split /\n/, <<'END'
9343 hst; L ; Leading_Jamo
9344 hst; LV ; LV_Syllable
9345 hst; LVT ; LVT_Syllable
9346 hst; NA ; Not_Applicable
9347 hst; T ; Trailing_Jamo
9352 if ($v_version lt 4.1.0) {
9353 $file->insert_lines(split /\n/, <<'END'
9369 # Add any explicit cjk values
9370 $file->insert_lines(@cjk_property_values);
9372 # This line is used only for testing the code that checks for name
9373 # conflicts. There is a script Inherited, and when this line is executed
9374 # it causes there to be a name conflict with the 'Inherited' that this
9375 # program generates for this block property value
9376 #$file->insert_lines('blk; n/a; Herited');
9379 # Process each line of the file ...
9380 while ($file->next_line) {
9382 # Fix typo in input file
9383 s/CCC133/CCC132/g if $v_version eq v6.1.0;
9385 my ($property, @data) = split /\s*;\s*/;
9387 # The ccc property has an extra field at the beginning, which is the
9388 # numeric value. Move it to be after the other two, mnemonic, fields,
9389 # so that those will be used as the property value's names, and the
9390 # number will be an extra alias. (Rightmost splice removes field 1-2,
9391 # returning them in a slice; left splice inserts that before anything,
9392 # thus shifting the former field 0 to after them.)
9393 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
9395 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
9396 # there is no short name, use the full one in element 1
9397 if ($data[0] eq "n/a") {
9398 $data[0] = $data[1];
9400 elsif ($data[0] ne $data[1]
9401 && standardize($data[0]) eq standardize($data[1])
9402 && $data[1] !~ /[[:upper:]]/)
9404 # Also, there is a bug in the file in which "n/a" is omitted, and
9405 # the two fields are identical except for case, and the full name
9406 # is all lower case. Copy the "short" name unto the full one to
9407 # give it some upper case.
9409 $data[1] = $data[0];
9412 # Earlier releases had the pseudo property 'qc' that should expand to
9413 # the ones that replace it below.
9414 if ($property eq 'qc') {
9415 if (lc $data[0] eq 'y') {
9416 $file->insert_lines('NFC_QC; Y ; Yes',
9422 elsif (lc $data[0] eq 'n') {
9423 $file->insert_lines('NFC_QC; N ; No',
9429 elsif (lc $data[0] eq 'm') {
9430 $file->insert_lines('NFC_QC; M ; Maybe',
9431 'NFKC_QC; M ; Maybe',
9435 $file->carp_bad_line("qc followed by unexpected '$data[0]");
9440 # The first field is the short name, 2nd is the full one.
9441 my $property_object = property_ref($property);
9442 my $table = $property_object->add_match_table($data[0],
9443 Full_Name => $data[1]);
9445 # Start looking for more aliases after these two.
9446 for my $i (2 .. @data - 1) {
9447 $table->add_alias($data[$i]);
9449 } # End of looping through the file
9451 # As noted in the comments early in the program, it generates tables for
9452 # the default values for all releases, even those for which the concept
9453 # didn't exist at the time. Here we add those if missing.
9454 my $age = property_ref('age');
9455 if (defined $age && ! defined $age->table('Unassigned')) {
9456 $age->add_match_table('Unassigned');
9458 $block->add_match_table('No_Block') if -e 'Blocks.txt'
9459 && ! defined $block->table('No_Block');
9462 # Now set the default mappings of the properties from the file. This is
9463 # done after the loop because a number of properties have only @missings
9464 # entries in the file, and may not show up until the end.
9465 my @defaults = $file->get_missings;
9466 foreach my $default_ref (@defaults) {
9467 my $default = $default_ref->[0];
9468 my $property = property_ref($default_ref->[1]);
9469 $property->set_default_map($default);
9474 sub get_old_property_value_aliases () {
9475 # Returns what would be in PropValueAliases.txt if it existed in very old
9476 # versions of Unicode. It was derived from the one in 3.2, and pared
9477 # down. An attempt was made to use the existence of files to mean
9478 # inclusion or not of various aliases, but if this was not sufficient,
9479 # using version numbers was resorted to.
9481 my @return = split /\n/, <<'END';
9482 bc ; AN ; Arabic_Number
9483 bc ; B ; Paragraph_Separator
9484 bc ; CS ; Common_Separator
9485 bc ; EN ; European_Number
9486 bc ; ES ; European_Separator
9487 bc ; ET ; European_Terminator
9488 bc ; L ; Left_To_Right
9489 bc ; ON ; Other_Neutral
9490 bc ; R ; Right_To_Left
9491 bc ; WS ; White_Space
9493 Bidi_M; N; No; F; False
9494 Bidi_M; Y; Yes; T; True
9496 # The standard combining classes are very much different in v1, so only use
9497 # ones that look right (not checked thoroughly)
9498 ccc; 0; NR ; Not_Reordered
9499 ccc; 1; OV ; Overlay
9501 ccc; 8; KV ; Kana_Voicing
9503 ccc; 202; ATBL ; Attached_Below_Left
9504 ccc; 216; ATAR ; Attached_Above_Right
9505 ccc; 218; BL ; Below_Left
9507 ccc; 222; BR ; Below_Right
9509 ccc; 228; AL ; Above_Left
9511 ccc; 232; AR ; Above_Right
9512 ccc; 234; DA ; Double_Above
9514 dt ; can ; canonical
9528 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
9530 gc ; Cn ; Unassigned
9531 gc ; Co ; Private_Use
9532 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
9533 gc ; LC ; Cased_Letter # Ll | Lt | Lu
9534 gc ; Ll ; Lowercase_Letter
9535 gc ; Lm ; Modifier_Letter
9536 gc ; Lo ; Other_Letter
9537 gc ; Lu ; Uppercase_Letter
9538 gc ; M ; Mark # Mc | Me | Mn
9539 gc ; Mc ; Spacing_Mark
9540 gc ; Mn ; Nonspacing_Mark
9541 gc ; N ; Number # Nd | Nl | No
9542 gc ; Nd ; Decimal_Number
9543 gc ; No ; Other_Number
9544 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
9545 gc ; Pd ; Dash_Punctuation
9546 gc ; Pe ; Close_Punctuation
9547 gc ; Po ; Other_Punctuation
9548 gc ; Ps ; Open_Punctuation
9549 gc ; S ; Symbol # Sc | Sk | Sm | So
9550 gc ; Sc ; Currency_Symbol
9551 gc ; Sm ; Math_Symbol
9552 gc ; So ; Other_Symbol
9553 gc ; Z ; Separator # Zl | Zp | Zs
9554 gc ; Zl ; Line_Separator
9555 gc ; Zp ; Paragraph_Separator
9556 gc ; Zs ; Space_Separator
9564 if (-e 'ArabicShaping.txt') {
9565 push @return, split /\n/, <<'END';
9572 jg ; n/a ; NO_JOINING_GROUP
9580 jt ; C ; Join_Causing
9581 jt ; D ; Dual_Joining
9582 jt ; L ; Left_Joining
9583 jt ; R ; Right_Joining
9584 jt ; U ; Non_Joining
9585 jt ; T ; Transparent
9587 if ($v_version ge v3.0.0) {
9588 push @return, split /\n/, <<'END';
9592 jg ; n/a ; DALATH_RISH
9595 jg ; n/a ; FINAL_SEMKATH
9598 jg ; n/a ; HAMZA_ON_HEH_GOAL
9605 jg ; n/a ; KNOTTED_HEH
9612 jg ; n/a ; REVERSED_PE
9616 jg ; n/a ; SWASH_KAF
9618 jg ; n/a ; TEH_MARBUTA
9621 jg ; n/a ; YEH_BARREE
9622 jg ; n/a ; YEH_WITH_TAIL
9631 if (-e 'EastAsianWidth.txt') {
9632 push @return, split /\n/, <<'END';
9642 if (-e 'LineBreak.txt') {
9643 push @return, split /\n/, <<'END';
9645 lb ; AL ; Alphabetic
9646 lb ; B2 ; Break_Both
9647 lb ; BA ; Break_After
9648 lb ; BB ; Break_Before
9649 lb ; BK ; Mandatory_Break
9650 lb ; CB ; Contingent_Break
9651 lb ; CL ; Close_Punctuation
9652 lb ; CM ; Combining_Mark
9653 lb ; CR ; Carriage_Return
9654 lb ; EX ; Exclamation
9657 lb ; ID ; Ideographic
9658 lb ; IN ; Inseperable
9659 lb ; IS ; Infix_Numeric
9661 lb ; NS ; Nonstarter
9663 lb ; OP ; Open_Punctuation
9664 lb ; PO ; Postfix_Numeric
9665 lb ; PR ; Prefix_Numeric
9667 lb ; SA ; Complex_Context
9670 lb ; SY ; Break_Symbols
9676 if (-e 'DNormalizationProps.txt') {
9677 push @return, split /\n/, <<'END';
9684 if (-e 'Scripts.txt') {
9685 push @return, split /\n/, <<'END';
9687 sc ; Armn ; Armenian
9689 sc ; Bopo ; Bopomofo
9690 sc ; Cans ; Canadian_Aboriginal
9691 sc ; Cher ; Cherokee
9692 sc ; Cyrl ; Cyrillic
9693 sc ; Deva ; Devanagari
9695 sc ; Ethi ; Ethiopic
9696 sc ; Geor ; Georgian
9699 sc ; Gujr ; Gujarati
9700 sc ; Guru ; Gurmukhi
9704 sc ; Hira ; Hiragana
9705 sc ; Ital ; Old_Italic
9706 sc ; Kana ; Katakana
9711 sc ; Mlym ; Malayalam
9712 sc ; Mong ; Mongolian
9716 sc ; Qaai ; Inherited
9730 if ($v_version ge v2.0.0) {
9731 push @return, split /\n/, <<'END';
9735 dt ; vert ; vertical
9740 gc ; Lt ; Titlecase_Letter
9741 gc ; Me ; Enclosing_Mark
9742 gc ; Nl ; Letter_Number
9743 gc ; Pc ; Connector_Punctuation
9744 gc ; Sk ; Modifier_Symbol
9747 if ($v_version ge v2.1.2) {
9748 push @return, "bc ; S ; Segment_Separator\n";
9750 if ($v_version ge v2.1.5) {
9751 push @return, split /\n/, <<'END';
9752 gc ; Pf ; Final_Punctuation
9753 gc ; Pi ; Initial_Punctuation
9756 if ($v_version ge v2.1.8) {
9757 push @return, "ccc; 240; IS ; Iota_Subscript\n";
9760 if ($v_version ge v3.0.0) {
9761 push @return, split /\n/, <<'END';
9762 bc ; AL ; Arabic_Letter
9763 bc ; BN ; Boundary_Neutral
9764 bc ; LRE ; Left_To_Right_Embedding
9765 bc ; LRO ; Left_To_Right_Override
9766 bc ; NSM ; Nonspacing_Mark
9767 bc ; PDF ; Pop_Directional_Format
9768 bc ; RLE ; Right_To_Left_Embedding
9769 bc ; RLO ; Right_To_Left_Override
9771 ccc; 233; DB ; Double_Below
9775 if ($v_version ge v3.1.0) {
9776 push @return, "ccc; 226; R ; Right\n";
9782 sub process_NormalizationsTest {
9784 # Each line looks like:
9785 # source code point; NFC; NFD; NFKC; NFKD
9787 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
9790 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9792 # Process each line of the file ...
9793 while ($file->next_line) {
9797 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
9799 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
9800 $$var = pack "U0U*", map { hex } split " ", $$var;
9801 $$var =~ s/(\\)/$1$1/g;
9804 push @normalization_tests,
9805 "Test_N(q
\a$c1
\a, q
\a$c2
\a, q
\a$c3
\a, q
\a$c4
\a, q
\a$c5
\a);\n";
9806 } # End of looping through the file
9809 sub output_perl_charnames_line ($$) {
9811 # Output the entries in Perl_charnames specially, using 5 digits instead
9812 # of four. This makes the entries a constant length, and simplifies
9813 # charnames.pm which this table is for. Unicode can have 6 digit
9814 # ordinals, but they are all private use or noncharacters which do not
9815 # have names, so won't be in this table.
9817 return sprintf "%05X\t%s\n", $_[0], $_[1];
9821 # This is used to store the range list of all the code points usable when
9822 # the little used $compare_versions feature is enabled.
9823 my $compare_versions_range_list;
9825 # These are constants to the $property_info hash in this subroutine, to
9826 # avoid using a quoted-string which might have a typo.
9828 my $DEFAULT_MAP = 'default_map';
9829 my $DEFAULT_TABLE = 'default_table';
9830 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
9831 my $MISSINGS = 'missings';
9833 sub process_generic_property_file {
9834 # This processes a file containing property mappings and puts them
9835 # into internal map tables. It should be used to handle any property
9836 # files that have mappings from a code point or range thereof to
9837 # something else. This means almost all the UCD .txt files.
9838 # each_line_handlers() should be set to adjust the lines of these
9839 # files, if necessary, to what this routine understands:
9844 # the fields are: "codepoint-range ; property; map"
9846 # meaning the codepoints in the range all have the value 'map' under
9848 # Beginning and trailing white space in each field are not significant.
9849 # Note there is not a trailing semi-colon in the above. A trailing
9850 # semi-colon means the map is a null-string. An omitted map, as
9851 # opposed to a null-string, is assumed to be 'Y', based on Unicode
9852 # table syntax. (This could have been hidden from this routine by
9853 # doing it in the $file object, but that would require parsing of the
9854 # line there, so would have to parse it twice, or change the interface
9855 # to pass this an array. So not done.)
9857 # The map field may begin with a sequence of commands that apply to
9858 # this range. Each such command begins and ends with $CMD_DELIM.
9859 # These are used to indicate, for example, that the mapping for a
9860 # range has a non-default type.
9862 # This loops through the file, calling its next_line() method, and
9863 # then taking the map and adding it to the property's table.
9864 # Complications arise because any number of properties can be in the
9865 # file, in any order, interspersed in any way. The first time a
9866 # property is seen, it gets information about that property and
9867 # caches it for quick retrieval later. It also normalizes the maps
9868 # so that only one of many synonyms is stored. The Unicode input
9869 # files do use some multiple synonyms.
9872 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9874 my %property_info; # To keep track of what properties
9875 # have already had entries in the
9876 # current file, and info about each,
9877 # so don't have to recompute.
9878 my $property_name; # property currently being worked on
9879 my $property_type; # and its type
9880 my $previous_property_name = ""; # name from last time through loop
9881 my $property_object; # pointer to the current property's
9883 my $property_addr; # the address of that object
9884 my $default_map; # the string that code points missing
9885 # from the file map to
9886 my $default_table; # For non-string properties, a
9887 # reference to the match table that
9888 # will contain the list of code
9889 # points that map to $default_map.
9891 # Get the next real non-comment line
9893 while ($file->next_line) {
9895 # Default replacement type; means that if parts of the range have
9896 # already been stored in our tables, the new map overrides them if
9897 # they differ more than cosmetically
9898 my $replace = $IF_NOT_EQUIVALENT;
9899 my $map_type; # Default type for the map of this range
9901 #local $to_trace = 1 if main::DEBUG;
9902 trace $_ if main::DEBUG && $to_trace;
9904 # Split the line into components
9905 my ($range, $property_name, $map, @remainder)
9906 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
9908 # If more or less on the line than we are expecting, warn and skip
9911 $file->carp_bad_line('Extra fields');
9914 elsif ( ! defined $property_name) {
9915 $file->carp_bad_line('Missing property');
9919 # Examine the range.
9920 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
9922 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
9926 my $high = (defined $2) ? hex $2 : $low;
9928 # For the very specialized case of comparing two Unicode
9930 if (DEBUG && $compare_versions) {
9931 if ($property_name eq 'Age') {
9933 # Only allow code points at least as old as the version
9935 my $age = pack "C*", split(/\./, $map); # v string
9936 next LINE if $age gt $compare_versions;
9940 # Again, we throw out code points younger than those of
9941 # the specified version. By now, the Age property is
9942 # populated. We use the intersection of each input range
9943 # with this property to find what code points in it are
9944 # valid. To do the intersection, we have to convert the
9945 # Age property map to a Range_list. We only have to do
9947 if (! defined $compare_versions_range_list) {
9948 my $age = property_ref('Age');
9949 if (! -e 'DAge.txt') {
9950 croak "Need to have 'DAge.txt' file to do version comparison";
9952 elsif ($age->count == 0) {
9953 croak "The 'Age' table is empty, but its file exists";
9955 $compare_versions_range_list
9956 = Range_List->new(Initialize => $age);
9959 # An undefined map is always 'Y'
9960 $map = 'Y' if ! defined $map;
9962 # Calculate the intersection of the input range with the
9963 # code points that are known in the specified version
9964 my @ranges = ($compare_versions_range_list
9965 & Range->new($low, $high))->ranges;
9967 # If the intersection is empty, throw away this range
9968 next LINE unless @ranges;
9970 # Only examine the first range this time through the loop.
9971 my $this_range = shift @ranges;
9973 # Put any remaining ranges in the queue to be processed
9974 # later. Note that there is unnecessary work here, as we
9975 # will do the intersection again for each of these ranges
9976 # during some future iteration of the LINE loop, but this
9977 # code is not used in production. The later intersections
9978 # are guaranteed to not splinter, so this will not become
9980 my $line = join ';', $property_name, $map;
9981 foreach my $range (@ranges) {
9982 $file->insert_adjusted_lines(sprintf("%04X..%04X; %s",
9988 # And process the first range, like any other.
9989 $low = $this_range->start;
9990 $high = $this_range->end;
9992 } # End of $compare_versions
9994 # If changing to a new property, get the things constant per
9996 if ($previous_property_name ne $property_name) {
9998 $property_object = property_ref($property_name);
9999 if (! defined $property_object) {
10000 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
10003 { no overloading; $property_addr = pack 'J', $property_object; }
10005 # Defer changing names until have a line that is acceptable
10006 # (the 'next' statement above means is unacceptable)
10007 $previous_property_name = $property_name;
10009 # If not the first time for this property, retrieve info about
10010 # it from the cache
10011 if (defined ($property_info{$property_addr}{$TYPE})) {
10012 $property_type = $property_info{$property_addr}{$TYPE};
10013 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
10015 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
10017 = $property_info{$property_addr}{$DEFAULT_TABLE};
10021 # Here, is the first time for this property. Set up the
10023 $property_type = $property_info{$property_addr}{$TYPE}
10024 = $property_object->type;
10026 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
10027 = $property_object->pseudo_map_type;
10029 # The Unicode files are set up so that if the map is not
10030 # defined, it is a binary property
10031 if (! defined $map && $property_type != $BINARY) {
10032 if ($property_type != $UNKNOWN
10033 && $property_type != $NON_STRING)
10035 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
10038 $property_object->set_type($BINARY);
10040 = $property_info{$property_addr}{$TYPE}
10045 # Get any @missings default for this property. This
10046 # should precede the first entry for the property in the
10047 # input file, and is located in a comment that has been
10048 # stored by the Input_file class until we access it here.
10049 # It's possible that there is more than one such line
10050 # waiting for us; collect them all, and parse
10051 my @missings_list = $file->get_missings
10052 if $file->has_missings_defaults;
10053 foreach my $default_ref (@missings_list) {
10054 my $default = $default_ref->[0];
10055 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
10057 # For string properties, the default is just what the
10058 # file says, but non-string properties should already
10059 # have set up a table for the default property value;
10060 # use the table for these, so can resolve synonyms
10061 # later to a single standard one.
10062 if ($property_type == $STRING
10063 || $property_type == $UNKNOWN)
10065 $property_info{$addr}{$MISSINGS} = $default;
10068 $property_info{$addr}{$MISSINGS}
10069 = $property_object->table($default);
10073 # Finished storing all the @missings defaults in the input
10074 # file so far. Get the one for the current property.
10075 my $missings = $property_info{$property_addr}{$MISSINGS};
10077 # But we likely have separately stored what the default
10078 # should be. (This is to accommodate versions of the
10079 # standard where the @missings lines are absent or
10080 # incomplete.) Hopefully the two will match. But check
10082 $default_map = $property_object->default_map;
10084 # If the map is a ref, it means that the default won't be
10085 # processed until later, so undef it, so next few lines
10086 # will redefine it to something that nothing will match
10087 undef $default_map if ref $default_map;
10089 # Create a $default_map if don't have one; maybe a dummy
10090 # that won't match anything.
10091 if (! defined $default_map) {
10093 # Use any @missings line in the file.
10094 if (defined $missings) {
10095 if (ref $missings) {
10096 $default_map = $missings->full_name;
10097 $default_table = $missings;
10100 $default_map = $missings;
10103 # And store it with the property for outside use.
10104 $property_object->set_default_map($default_map);
10108 # Neither an @missings nor a default map. Create
10109 # a dummy one, so won't have to test definedness
10110 # in the main loop.
10111 $default_map = '_Perl This will never be in a file
10116 # Here, we have $default_map defined, possibly in terms of
10117 # $missings, but maybe not, and possibly is a dummy one.
10118 if (defined $missings) {
10120 # Make sure there is no conflict between the two.
10121 # $missings has priority.
10122 if (ref $missings) {
10124 = $property_object->table($default_map);
10125 if (! defined $default_table
10126 || $default_table != $missings)
10128 if (! defined $default_table) {
10129 $default_table = $UNDEF;
10131 $file->carp_bad_line(<<END
10132 The \@missings line for $property_name in $file says that missings default to
10133 $missings, but we expect it to be $default_table. $missings used.
10136 $default_table = $missings;
10137 $default_map = $missings->full_name;
10139 $property_info{$property_addr}{$DEFAULT_TABLE}
10142 elsif ($default_map ne $missings) {
10143 $file->carp_bad_line(<<END
10144 The \@missings line for $property_name in $file says that missings default to
10145 $missings, but we expect it to be $default_map. $missings used.
10148 $default_map = $missings;
10152 $property_info{$property_addr}{$DEFAULT_MAP}
10155 # If haven't done so already, find the table corresponding
10156 # to this map for non-string properties.
10157 if (! defined $default_table
10158 && $property_type != $STRING
10159 && $property_type != $UNKNOWN)
10161 $default_table = $property_info{$property_addr}
10163 = $property_object->table($default_map);
10165 } # End of is first time for this property
10166 } # End of switching properties.
10168 # Ready to process the line.
10169 # The Unicode files are set up so that if the map is not defined,
10170 # it is a binary property with value 'Y'
10171 if (! defined $map) {
10176 # If the map begins with a special command to us (enclosed in
10177 # delimiters), extract the command(s).
10178 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
10180 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
10183 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
10187 $file->carp_bad_line("Unknown command line: '$1'");
10193 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
10196 # Here, we have a map to a particular code point, and the
10197 # default map is to a code point itself. If the range
10198 # includes the particular code point, change that portion of
10199 # the range to the default. This makes sure that in the final
10200 # table only the non-defaults are listed.
10201 my $decimal_map = hex $map;
10202 if ($low <= $decimal_map && $decimal_map <= $high) {
10204 # If the range includes stuff before or after the map
10205 # we're changing, split it and process the split-off parts
10207 if ($low < $decimal_map) {
10208 $file->insert_adjusted_lines(
10209 sprintf("%04X..%04X; %s; %s",
10215 if ($high > $decimal_map) {
10216 $file->insert_adjusted_lines(
10217 sprintf("%04X..%04X; %s; %s",
10223 $low = $high = $decimal_map;
10224 $map = $CODE_POINT;
10228 # If we can tell that this is a synonym for the default map, use
10229 # the default one instead.
10230 if ($property_type != $STRING
10231 && $property_type != $UNKNOWN)
10233 my $table = $property_object->table($map);
10234 if (defined $table && $table == $default_table) {
10235 $map = $default_map;
10239 # And figure out the map type if not known.
10240 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
10241 if ($map eq "") { # Nulls are always $NULL map type
10243 } # Otherwise, non-strings, and those that don't allow
10244 # $MULTI_CP, and those that aren't multiple code points are
10247 (($property_type != $STRING && $property_type != $UNKNOWN)
10248 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
10249 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
10254 $map_type = $MULTI_CP;
10258 $property_object->add_map($low, $high,
10261 Replace => $replace);
10262 } # End of loop through file's lines
10268 { # Closure for UnicodeData.txt handling
10270 # This file was the first one in the UCD; its design leads to some
10271 # awkwardness in processing. Here is a sample line:
10272 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
10273 # The fields in order are:
10274 my $i = 0; # The code point is in field 0, and is shifted off.
10275 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
10276 my $CATEGORY = $i++; # category (e.g. "Lu")
10277 my $CCC = $i++; # Canonical combining class (e.g. "230")
10278 my $BIDI = $i++; # directional class (e.g. "L")
10279 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
10280 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
10281 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
10282 # Dual-use in this program; see below
10283 my $NUMERIC = $i++; # numeric value
10284 my $MIRRORED = $i++; # ? mirrored
10285 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
10286 my $COMMENT = $i++; # iso comment
10287 my $UPPER = $i++; # simple uppercase mapping
10288 my $LOWER = $i++; # simple lowercase mapping
10289 my $TITLE = $i++; # simple titlecase mapping
10290 my $input_field_count = $i;
10292 # This routine in addition outputs these extra fields:
10294 my $DECOMP_TYPE = $i++; # Decomposition type
10296 # These fields are modifications of ones above, and are usually
10297 # suppressed; they must come last, as for speed, the loop upper bound is
10298 # normally set to ignore them
10299 my $NAME = $i++; # This is the strict name field, not the one that
10301 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
10302 # by Unicode::Normalize
10303 my $last_field = $i - 1;
10305 # All these are read into an array for each line, with the indices defined
10306 # above. The empty fields in the example line above indicate that the
10307 # value is defaulted. The handler called for each line of the input
10308 # changes these to their defaults.
10310 # Here are the official names of the properties, in a parallel array:
10312 $field_names[$BIDI] = 'Bidi_Class';
10313 $field_names[$CATEGORY] = 'General_Category';
10314 $field_names[$CCC] = 'Canonical_Combining_Class';
10315 $field_names[$CHARNAME] = 'Perl_Charnames';
10316 $field_names[$COMMENT] = 'ISO_Comment';
10317 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
10318 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
10319 $field_names[$LOWER] = 'Lowercase_Mapping';
10320 $field_names[$MIRRORED] = 'Bidi_Mirrored';
10321 $field_names[$NAME] = 'Name';
10322 $field_names[$NUMERIC] = 'Numeric_Value';
10323 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
10324 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
10325 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
10326 $field_names[$TITLE] = 'Titlecase_Mapping';
10327 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
10328 $field_names[$UPPER] = 'Uppercase_Mapping';
10330 # Some of these need a little more explanation:
10331 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
10332 # property, but is used in calculating the Numeric_Type. Perl however,
10333 # creates a file from this field, so a Perl property is created from it.
10334 # Similarly, the Other_Digit field is used only for calculating the
10335 # Numeric_Type, and so it can be safely re-used as the place to store
10336 # the value for Numeric_Type; hence it is referred to as
10337 # $NUMERIC_TYPE_OTHER_DIGIT.
10338 # The input field named $PERL_DECOMPOSITION is a combination of both the
10339 # decomposition mapping and its type. Perl creates a file containing
10340 # exactly this field, so it is used for that. The two properties are
10341 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
10342 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
10343 # output it), as Perl doesn't use it directly.
10344 # The input field named here $CHARNAME is used to construct the
10345 # Perl_Charnames property, which is a combination of the Name property
10346 # (which the input field contains), and the Unicode_1_Name property, and
10347 # others from other files. Since, the strict Name property is not used
10348 # by Perl, this field is used for the table that Perl does use. The
10349 # strict Name property table is usually suppressed (unless the lists are
10350 # changed to output it), so it is accumulated in a separate field,
10351 # $NAME, which to save time is discarded unless the table is actually to
10354 # This file is processed like most in this program. Control is passed to
10355 # process_generic_property_file() which calls filter_UnicodeData_line()
10356 # for each input line. This filter converts the input into line(s) that
10357 # process_generic_property_file() understands. There is also a setup
10358 # routine called before any of the file is processed, and a handler for
10359 # EOF processing, all in this closure.
10361 # A huge speed-up occurred at the cost of some added complexity when these
10362 # routines were altered to buffer the outputs into ranges. Almost all the
10363 # lines of the input file apply to just one code point, and for most
10364 # properties, the map for the next code point up is the same as the
10365 # current one. So instead of creating a line for each property for each
10366 # input line, filter_UnicodeData_line() remembers what the previous map
10367 # of a property was, and doesn't generate a line to pass on until it has
10368 # to, as when the map changes; and that passed-on line encompasses the
10369 # whole contiguous range of code points that have the same map for that
10370 # property. This means a slight amount of extra setup, and having to
10371 # flush these buffers on EOF, testing if the maps have changed, plus
10372 # remembering state information in the closure. But it means a lot less
10373 # real time in not having to change the data base for each property on
10376 # Another complication is that there are already a few ranges designated
10377 # in the input. There are two lines for each, with the same maps except
10378 # the code point and name on each line. This was actually the hardest
10379 # thing to design around. The code points in those ranges may actually
10380 # have real maps not given by these two lines. These maps will either
10381 # be algorithmically determinable, or be in the extracted files furnished
10382 # with the UCD. In the event of conflicts between these extracted files,
10383 # and this one, Unicode says that this one prevails. But it shouldn't
10384 # prevail for conflicts that occur in these ranges. The data from the
10385 # extracted files prevails in those cases. So, this program is structured
10386 # so that those files are processed first, storing maps. Then the other
10387 # files are processed, generally overwriting what the extracted files
10388 # stored. But just the range lines in this input file are processed
10389 # without overwriting. This is accomplished by adding a special string to
10390 # the lines output to tell process_generic_property_file() to turn off the
10391 # overwriting for just this one line.
10392 # A similar mechanism is used to tell it that the map is of a non-default
10395 sub setup_UnicodeData { # Called before any lines of the input are read
10397 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10399 # Create a new property specially located that is a combination of the
10400 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
10401 # Name_Alias properties. (The final duplicates elements of the
10402 # first.) A comment for it will later be constructed based on the
10403 # actual properties present and used
10404 $perl_charname = Property->new('Perl_Charnames',
10406 Directory => File::Spec->curdir(),
10408 Fate => $INTERNAL_ONLY,
10409 Perl_Extension => 1,
10410 Range_Size_1 => \&output_perl_charnames_line,
10413 $perl_charname->set_proxy_for('Name');
10415 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
10416 Directory => File::Spec->curdir(),
10417 File => 'Decomposition',
10418 Format => $DECOMP_STRING_FORMAT,
10419 Fate => $INTERNAL_ONLY,
10420 Perl_Extension => 1,
10421 Default_Map => $CODE_POINT,
10423 # normalize.pm can't cope with these
10424 Output_Range_Counts => 0,
10426 # This is a specially formatted table
10427 # explicitly for normalize.pm, which
10428 # is expecting a particular format,
10429 # which means that mappings containing
10430 # multiple code points are in the main
10431 # body of the table
10432 Map_Type => $COMPUTE_NO_MULTI_CP,
10434 To_Output_Map => $INTERNAL_MAP,
10436 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
10437 $Perl_decomp->add_comment(join_lines(<<END
10438 This mapping is a combination of the Unicode 'Decomposition_Type' and
10439 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
10440 identical to the official Unicode 'Decomposition_Mapping' property except for
10442 1) It omits the algorithmically determinable Hangul syllable decompositions,
10443 which normalize.pm handles algorithmically.
10444 2) It contains the decomposition type as well. Non-canonical decompositions
10445 begin with a word in angle brackets, like <super>, which denotes the
10446 compatible decomposition type. If the map does not begin with the <angle
10447 brackets>, the decomposition is canonical.
10451 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
10453 Perl_Extension => 1,
10454 Directory => $map_directory,
10456 To_Output_Map => $OUTPUT_ADJUSTED,
10458 $Decimal_Digit->add_comment(join_lines(<<END
10459 This file gives the mapping of all code points which represent a single
10460 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
10461 points, and the mapping of each non-initial element of each range is actually
10462 not to "0", but to the offset that element has from its corresponding DIGIT 0.
10463 These code points are those that have Numeric_Type=Decimal; not special
10464 things, like subscripts nor Roman numerals.
10468 # These properties are not used for generating anything else, and are
10469 # usually not output. By making them last in the list, we can just
10470 # change the high end of the loop downwards to avoid the work of
10471 # generating a table(s) that is/are just going to get thrown away.
10472 if (! property_ref('Decomposition_Mapping')->to_output_map
10473 && ! property_ref('Name')->to_output_map)
10475 $last_field = min($NAME, $DECOMP_MAP) - 1;
10476 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
10477 $last_field = $DECOMP_MAP;
10478 } elsif (property_ref('Name')->to_output_map) {
10479 $last_field = $NAME;
10484 my $first_time = 1; # ? Is this the first line of the file
10485 my $in_range = 0; # ? Are we in one of the file's ranges
10486 my $previous_cp; # hex code point of previous line
10487 my $decimal_previous_cp = -1; # And its decimal equivalent
10488 my @start; # For each field, the current starting
10489 # code point in hex for the range
10490 # being accumulated.
10491 my @fields; # The input fields;
10492 my @previous_fields; # And those from the previous call
10494 sub filter_UnicodeData_line {
10495 # Handle a single input line from UnicodeData.txt; see comments above
10496 # Conceptually this takes a single line from the file containing N
10497 # properties, and converts it into N lines with one property per line,
10498 # which is what the final handler expects. But there are
10499 # complications due to the quirkiness of the input file, and to save
10500 # time, it accumulates ranges where the property values don't change
10501 # and only emits lines when necessary. This is about an order of
10502 # magnitude fewer lines emitted.
10505 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10507 # $_ contains the input line.
10508 # -1 in split means retain trailing null fields
10509 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
10511 #local $to_trace = 1 if main::DEBUG;
10512 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
10513 if (@fields > $input_field_count) {
10514 $file->carp_bad_line('Extra fields');
10519 my $decimal_cp = hex $cp;
10521 # We have to output all the buffered ranges when the next code point
10522 # is not exactly one after the previous one, which means there is a
10523 # gap in the ranges.
10524 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
10526 # The decomposition mapping field requires special handling. It looks
10529 # <compat> 0032 0020
10532 # The decomposition type is enclosed in <brackets>; if missing, it
10533 # means the type is canonical. There are two decomposition mapping
10534 # tables: the one for use by Perl's normalize.pm has a special format
10535 # which is this field intact; the other, for general use is of
10536 # standard format. In either case we have to find the decomposition
10537 # type. Empty fields have None as their type, and map to the code
10539 if ($fields[$PERL_DECOMPOSITION] eq "") {
10540 $fields[$DECOMP_TYPE] = 'None';
10541 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
10544 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
10545 =~ / < ( .+? ) > \s* ( .+ ) /x;
10546 if (! defined $fields[$DECOMP_TYPE]) {
10547 $fields[$DECOMP_TYPE] = 'Canonical';
10548 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
10551 $fields[$DECOMP_MAP] = $map;
10555 # The 3 numeric fields also require special handling. The 2 digit
10556 # fields must be either empty or match the number field. This means
10557 # that if it is empty, they must be as well, and the numeric type is
10558 # None, and the numeric value is 'Nan'.
10559 # The decimal digit field must be empty or match the other digit
10560 # field. If the decimal digit field is non-empty, the code point is
10561 # a decimal digit, and the other two fields will have the same value.
10562 # If it is empty, but the other digit field is non-empty, the code
10563 # point is an 'other digit', and the number field will have the same
10564 # value as the other digit field. If the other digit field is empty,
10565 # but the number field is non-empty, the code point is a generic
10567 if ($fields[$NUMERIC] eq "") {
10568 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
10569 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
10571 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
10573 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
10574 $fields[$NUMERIC] = 'NaN';
10577 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
10578 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
10579 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
10580 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
10581 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
10583 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
10584 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
10585 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
10588 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
10590 # Rationals require extra effort.
10591 register_fraction($fields[$NUMERIC])
10592 if $fields[$NUMERIC] =~ qr{/};
10596 # For the properties that have empty fields in the file, and which
10597 # mean something different from empty, change them to that default.
10598 # Certain fields just haven't been empty so far in any Unicode
10599 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
10600 # $CATEGORY. This leaves just the two fields, and so we hard-code in
10601 # the defaults; which are very unlikely to ever change.
10602 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
10603 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
10605 # UAX44 says that if title is empty, it is the same as whatever upper
10607 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
10609 # There are a few pairs of lines like:
10610 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
10611 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
10612 # that define ranges. These should be processed after the fields are
10613 # adjusted above, as they may override some of them; but mostly what
10614 # is left is to possibly adjust the $CHARNAME field. The names of all the
10615 # paired lines start with a '<', but this is also true of '<control>,
10616 # which isn't one of these special ones.
10617 if ($fields[$CHARNAME] eq '<control>') {
10619 # Some code points in this file have the pseudo-name
10620 # '<control>', but the official name for such ones is the null
10622 $fields[$NAME] = $fields[$CHARNAME] = "";
10624 # We had better not be in between range lines.
10626 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10630 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
10632 # Here is a non-range line. We had better not be in between range
10635 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
10638 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
10640 # These are code points whose names end in their code points,
10641 # which means the names are algorithmically derivable from the
10642 # code points. To shorten the output Name file, the algorithm
10643 # for deriving these is placed in the file instead of each
10644 # code point, so they have map type $CP_IN_NAME
10645 $fields[$CHARNAME] = $CMD_DELIM
10650 . $fields[$CHARNAME];
10652 $fields[$NAME] = $fields[$CHARNAME];
10654 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
10655 $fields[$CHARNAME] = $fields[$NAME] = $1;
10657 # Here we are at the beginning of a range pair.
10659 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
10663 # Because the properties in the range do not overwrite any already
10664 # in the db, we must flush the buffers of what's already there, so
10665 # they get handled in the normal scheme.
10669 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
10670 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
10674 else { # Here, we are at the last line of a range pair.
10677 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
10683 $fields[$NAME] = $fields[$CHARNAME];
10685 # Check that the input is valid: that the closing of the range is
10686 # the same as the beginning.
10687 foreach my $i (0 .. $last_field) {
10688 next if $fields[$i] eq $previous_fields[$i];
10689 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
10692 # The processing differs depending on the type of range,
10693 # determined by its $CHARNAME
10694 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
10696 # Check that the data looks right.
10697 if ($decimal_previous_cp != $SBase) {
10698 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
10700 if ($decimal_cp != $SBase + $SCount - 1) {
10701 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
10704 # The Hangul syllable range has a somewhat complicated name
10705 # generation algorithm. Each code point in it has a canonical
10706 # decomposition also computable by an algorithm. The
10707 # perl decomposition map table built from these is used only
10708 # by normalize.pm, which has the algorithm built in it, so the
10709 # decomposition maps are not needed, and are large, so are
10710 # omitted from it. If the full decomposition map table is to
10711 # be output, the decompositions are generated for it, in the
10712 # EOF handling code for this input file.
10714 $previous_fields[$DECOMP_TYPE] = 'Canonical';
10716 # This range is stored in our internal structure with its
10717 # own map type, different from all others.
10718 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10724 . $fields[$CHARNAME];
10726 elsif ($fields[$CHARNAME] =~ /^CJK/) {
10728 # The name for these contains the code point itself, and all
10729 # are defined to have the same base name, regardless of what
10730 # is in the file. They are stored in our internal structure
10731 # with a map type of $CP_IN_NAME
10732 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
10738 . 'CJK UNIFIED IDEOGRAPH';
10741 elsif ($fields[$CATEGORY] eq 'Co'
10742 || $fields[$CATEGORY] eq 'Cs')
10744 # The names of all the code points in these ranges are set to
10745 # null, as there are no names for the private use and
10746 # surrogate code points.
10748 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
10751 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
10754 # The first line of the range caused everything else to be output,
10755 # and then its values were stored as the beginning values for the
10756 # next set of ranges, which this one ends. Now, for each value,
10757 # add a command to tell the handler that these values should not
10758 # replace any existing ones in our database.
10759 foreach my $i (0 .. $last_field) {
10760 $previous_fields[$i] = $CMD_DELIM
10765 . $previous_fields[$i];
10768 # And change things so it looks like the entire range has been
10769 # gone through with this being the final part of it. Adding the
10770 # command above to each field will cause this range to be flushed
10771 # during the next iteration, as it guaranteed that the stored
10772 # field won't match whatever value the next one has.
10773 $previous_cp = $cp;
10774 $decimal_previous_cp = $decimal_cp;
10776 # We are now set up for the next iteration; so skip the remaining
10777 # code in this subroutine that does the same thing, but doesn't
10778 # know about these ranges.
10784 # On the very first line, we fake it so the code below thinks there is
10785 # nothing to output, and initialize so that when it does get output it
10786 # uses the first line's values for the lowest part of the range.
10787 # (One could avoid this by using peek(), but then one would need to
10788 # know the adjustments done above and do the same ones in the setup
10789 # routine; not worth it)
10792 @previous_fields = @fields;
10793 @start = ($cp) x scalar @fields;
10794 $decimal_previous_cp = $decimal_cp - 1;
10797 # For each field, output the stored up ranges that this code point
10798 # doesn't fit in. Earlier we figured out if all ranges should be
10799 # terminated because of changing the replace or map type styles, or if
10800 # there is a gap between this new code point and the previous one, and
10801 # that is stored in $force_output. But even if those aren't true, we
10802 # need to output the range if this new code point's value for the
10803 # given property doesn't match the stored range's.
10804 #local $to_trace = 1 if main::DEBUG;
10805 foreach my $i (0 .. $last_field) {
10806 my $field = $fields[$i];
10807 if ($force_output || $field ne $previous_fields[$i]) {
10809 # Flush the buffer of stored values.
10810 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10812 # Start a new range with this code point and its value
10814 $previous_fields[$i] = $field;
10818 # Set the values for the next time.
10819 $previous_cp = $cp;
10820 $decimal_previous_cp = $decimal_cp;
10822 # The input line has generated whatever adjusted lines are needed, and
10823 # should not be looked at further.
10828 sub EOF_UnicodeData {
10829 # Called upon EOF to flush the buffers, and create the Hangul
10830 # decomposition mappings if needed.
10833 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10835 # Flush the buffers.
10836 foreach my $i (0 .. $last_field) {
10837 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
10840 if (-e 'Jamo.txt') {
10842 # The algorithm is published by Unicode, based on values in
10843 # Jamo.txt, (which should have been processed before this
10844 # subroutine), and the results left in %Jamo
10846 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
10850 # If the full decomposition map table is being output, insert
10851 # into it the Hangul syllable mappings. This is to avoid having
10852 # to publish a subroutine in it to compute them. (which would
10853 # essentially be this code.) This uses the algorithm published by
10854 # Unicode. (No hangul syllables in version 1)
10855 if ($v_version ge v2.0.0
10856 && property_ref('Decomposition_Mapping')->to_output_map) {
10857 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
10859 my $SIndex = $S - $SBase;
10860 my $L = $LBase + $SIndex / $NCount;
10861 my $V = $VBase + ($SIndex % $NCount) / $TCount;
10862 my $T = $TBase + $SIndex % $TCount;
10864 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
10865 my $decomposition = sprintf("%04X %04X", $L, $V);
10866 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
10867 $file->insert_adjusted_lines(
10868 sprintf("%04X; Decomposition_Mapping; %s",
10878 sub filter_v1_ucd {
10879 # Fix UCD lines in version 1. This is probably overkill, but this
10880 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
10881 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
10882 # removed. This program retains them
10883 # 2) didn't include ranges, which it should have, and which are now
10884 # added in @corrected_lines below. It was hand populated by
10885 # taking the data from Version 2, verified by analyzing
10887 # 3) There is a syntax error in the entry for U+09F8 which could
10888 # cause problems for utf8_heavy, and so is changed. It's
10889 # numeric value was simply a minus sign, without any number.
10890 # (Eventually Unicode changed the code point to non-numeric.)
10891 # 4) The decomposition types often don't match later versions
10892 # exactly, and the whole syntax of that field is different; so
10893 # the syntax is changed as well as the types to their later
10894 # terminology. Otherwise normalize.pm would be very unhappy
10895 # 5) Many ccc classes are different. These are left intact.
10896 # 6) U+FF10..U+FF19 are missing their numeric values in all three
10897 # fields. These are unchanged because it doesn't really cause
10898 # problems for Perl.
10899 # 7) A number of code points, such as controls, don't have their
10900 # Unicode Version 1 Names in this file. These are added.
10901 # 8) A number of Symbols were marked as Lm. This changes those in
10902 # the Latin1 range, so that regexes work.
10903 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
10904 # referred to by their lc equivalents. Not fixed.
10906 my @corrected_lines = split /\n/, <<'END';
10907 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
10908 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10909 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
10910 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
10911 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
10912 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
10916 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10918 #local $to_trace = 1 if main::DEBUG;
10919 trace $_ if main::DEBUG && $to_trace;
10921 # -1 => retain trailing null fields
10922 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10924 # At the first place that is wrong in the input, insert all the
10925 # corrections, replacing the wrong line.
10926 if ($code_point eq '4E00') {
10927 my @copy = @corrected_lines;
10929 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
10931 $file->insert_lines(@copy);
10933 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
10935 # There are no Lm characters in Latin1; these should be 'Sk', but
10936 # there isn't that in V1.
10937 $fields[$CATEGORY] = 'So';
10940 if ($fields[$NUMERIC] eq '-') {
10941 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
10944 if ($fields[$PERL_DECOMPOSITION] ne "") {
10946 # Several entries have this change to superscript 2 or 3 in the
10947 # middle. Convert these to the modern version, which is to use
10948 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
10949 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
10950 # 'HHHH HHHH 00B3 HHHH'.
10951 # It turns out that all of these that don't have another
10952 # decomposition defined at the beginning of the line have the
10953 # <square> decomposition in later releases.
10954 if ($code_point ne '00B2' && $code_point ne '00B3') {
10955 if ($fields[$PERL_DECOMPOSITION]
10956 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
10958 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
10959 $fields[$PERL_DECOMPOSITION] = '<square> '
10960 . $fields[$PERL_DECOMPOSITION];
10965 # If is like '<+circled> 0052 <-circled>', convert to
10967 $fields[$PERL_DECOMPOSITION] =~
10968 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
10970 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
10971 $fields[$PERL_DECOMPOSITION] =~
10972 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
10973 or $fields[$PERL_DECOMPOSITION] =~
10974 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
10975 or $fields[$PERL_DECOMPOSITION] =~
10976 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
10977 or $fields[$PERL_DECOMPOSITION] =~
10978 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
10980 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
10981 $fields[$PERL_DECOMPOSITION] =~
10982 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
10984 # Change names to modern form.
10985 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
10986 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
10987 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
10988 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
10990 # One entry has weird braces
10991 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
10993 # One entry at U+2116 has an extra <sup>
10994 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
10997 $_ = join ';', $code_point, @fields;
10998 trace $_ if main::DEBUG && $to_trace;
11002 sub filter_bad_Nd_ucd {
11003 # Early versions specified a value in the decimal digit field even
11004 # though the code point wasn't a decimal digit. Clear the field in
11005 # that situation, so that the main code doesn't think it is a decimal
11008 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11009 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
11010 $fields[$PERL_DECIMAL_DIGIT] = "";
11011 $_ = join ';', $code_point, @fields;
11016 my @U1_control_names = split /\n/, <<'END';
11021 END OF TRANSMISSION
11026 HORIZONTAL TABULATION
11028 VERTICAL TABULATION
11036 DEVICE CONTROL THREE
11037 DEVICE CONTROL FOUR
11038 NEGATIVE ACKNOWLEDGE
11040 END OF TRANSMISSION BLOCK
11050 BREAK PERMITTED HERE
11054 START OF SELECTED AREA
11055 END OF SELECTED AREA
11056 CHARACTER TABULATION SET
11057 CHARACTER TABULATION WITH JUSTIFICATION
11058 LINE TABULATION SET
11064 DEVICE CONTROL STRING
11070 START OF GUARDED AREA
11071 END OF GUARDED AREA
11073 SINGLE CHARACTER INTRODUCER
11074 CONTROL SEQUENCE INTRODUCER
11076 OPERATING SYSTEM COMMAND
11078 APPLICATION PROGRAM COMMAND
11081 sub filter_early_U1_names {
11082 # Very early versions did not have the Unicode_1_name field specified.
11083 # They differed in which ones were present; make sure a U1 name
11084 # exists, so that Unicode::UCD::charinfo will work
11086 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11089 # @U1_control names above are entirely positional, so we pull them out
11090 # in the exact order required, with gaps for the ones that don't have
11092 if ($code_point =~ /^00[01]/
11093 || $code_point eq '007F'
11094 || $code_point =~ /^008[2-9A-F]/
11095 || $code_point =~ /^009[0-8A-F]/)
11097 my $u1_name = shift @U1_control_names;
11098 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
11099 $_ = join ';', $code_point, @fields;
11104 sub filter_v2_1_5_ucd {
11105 # A dozen entries in this 2.1.5 file had the mirrored and numeric
11106 # columns swapped; These all had mirrored be 'N'. So if the numeric
11107 # column appears to be N, swap it back.
11109 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11110 if ($fields[$NUMERIC] eq 'N') {
11111 $fields[$NUMERIC] = $fields[$MIRRORED];
11112 $fields[$MIRRORED] = 'N';
11113 $_ = join ';', $code_point, @fields;
11118 sub filter_v6_ucd {
11120 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
11121 # it wasn't accepted, to allow for some deprecation cycles. This
11122 # function is not called after 5.16
11124 return if $_ !~ /^(?:0007|1F514|070F);/;
11126 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11127 if ($code_point eq '0007') {
11128 $fields[$CHARNAME] = "";
11130 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
11131 # http://www.unicode.org/versions/corrigendum8.html
11132 $fields[$BIDI] = "AL";
11134 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
11135 $fields[$CHARNAME] = "";
11138 $_ = join ';', $code_point, @fields;
11142 } # End closure for UnicodeData
11144 sub process_GCB_test {
11147 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11149 while ($file->next_line) {
11150 push @backslash_X_tests, $_;
11156 sub process_NamedSequences {
11157 # NamedSequences.txt entries are just added to an array. Because these
11158 # don't look like the other tables, they have their own handler.
11160 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
11162 # This just adds the sequence to an array for later handling
11165 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11167 while ($file->next_line) {
11168 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
11170 $file->carp_bad_line(
11171 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
11175 # Note single \t in keeping with special output format of
11176 # Perl_charnames. But it turns out that the code points don't have to
11177 # be 5 digits long, like the rest, based on the internal workings of
11178 # charnames.pm. This could be easily changed for consistency.
11179 push @named_sequences, "$sequence\t$name";
11188 sub filter_early_ea_lb {
11189 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
11190 # third field be the name of the code point, which can be ignored in
11191 # most cases. But it can be meaningful if it marks a range:
11192 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
11193 # 3400;W;<CJK Ideograph Extension A, First>
11195 # We need to see the First in the example above to know it's a range.
11196 # They did not use the later range syntaxes. This routine changes it
11197 # to use the modern syntax.
11198 # $1 is the Input_file object.
11200 my @fields = split /\s*;\s*/;
11201 if ($fields[2] =~ /^<.*, First>/) {
11202 $first_range = $fields[0];
11205 elsif ($fields[2] =~ /^<.*, Last>/) {
11206 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
11209 undef $first_range;
11210 $_ = "$fields[0]; $fields[1]";
11217 sub filter_old_style_arabic_shaping {
11218 # Early versions used a different term for the later one.
11220 my @fields = split /\s*;\s*/;
11221 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
11222 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
11223 $_ = join ';', @fields;
11227 sub filter_arabic_shaping_line {
11228 # ArabicShaping.txt has entries that look like:
11229 # 062A; TEH; D; BEH
11230 # The field containing 'TEH' is not used. The next field is Joining_Type
11231 # and the last is Joining_Group
11232 # This generates two lines to pass on, one for each property on the input
11236 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11238 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11241 $file->carp_bad_line('Extra fields');
11246 $file->insert_adjusted_lines("$fields[0]; Joining_Group; $fields[3]");
11247 $_ = "$fields[0]; Joining_Type; $fields[2]";
11253 my $lc; # Table for lowercase mapping
11256 my %special_casing_code_points;
11258 sub setup_special_casing {
11259 # SpecialCasing.txt contains the non-simple case change mappings. The
11260 # simple ones are in UnicodeData.txt, which should already have been
11261 # read in to the full property data structures, so as to initialize
11262 # these with the simple ones. Then the SpecialCasing.txt entries
11263 # add or overwrite the ones which have different full mappings.
11265 # This routine sees if the simple mappings are to be output, and if
11266 # so, copies what has already been put into the full mapping tables,
11267 # while they still contain only the simple mappings.
11269 # The reason it is done this way is that the simple mappings are
11270 # probably not going to be output, so it saves work to initialize the
11271 # full tables with the simple mappings, and then overwrite those
11272 # relatively few entries in them that have different full mappings,
11273 # and thus skip the simple mapping tables altogether.
11276 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11278 $lc = property_ref('lc');
11279 $tc = property_ref('tc');
11280 $uc = property_ref('uc');
11282 # For each of the case change mappings...
11283 foreach my $full_table ($lc, $tc, $uc) {
11284 my $full_name = $full_table->name;
11285 unless (defined $full_table && ! $full_table->is_empty) {
11286 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
11289 # Create a table in the old-style format and with the original
11290 # file name for backwards compatibility with applications that
11291 # read it directly. The new tables contain both the simple and
11292 # full maps, and the old are missing simple maps when there is a
11293 # conflicting full one. Probably it would have been ok to add
11294 # those to the legacy version, as was already done in 5.14 to the
11295 # case folding one, but this was not done, out of an abundance of
11296 # caution. The tables are set up here before we deal with the
11297 # full maps so that as we handle those, we can override the simple
11298 # maps for them in the legacy table, and merely add them in the
11300 my $legacy = Property->new("Legacy_" . $full_table->full_name,
11301 File => $full_table->full_name =~
11304 Format => $HEX_FORMAT,
11305 Default_Map => $CODE_POINT,
11307 Initialize => $full_table,
11308 To_Output_Map => $EXTERNAL_MAP,
11311 $full_table->add_comment(join_lines( <<END
11312 This file includes both the simple and full case changing maps. The simple
11313 ones are in the main body of the table below, and the full ones adding to or
11314 overriding them are in the hash.
11318 # The simple version's name in each mapping merely has an 's' in
11319 # front of the full one's
11320 my $simple_name = 's' . $full_name;
11321 my $simple = property_ref($simple_name);
11322 $simple->initialize($full_table) if $simple->to_output_map();
11328 sub filter_2_1_8_special_casing_line {
11330 # This version had duplicate entries in this file. Delete all but the
11332 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11334 if (exists $special_casing_code_points{$fields[0]}) {
11339 $special_casing_code_points{$fields[0]} = 1;
11340 filter_special_casing_line(@_);
11343 sub filter_special_casing_line {
11344 # Change the format of $_ from SpecialCasing.txt into something that
11345 # the generic handler understands. Each input line contains three
11346 # case mappings. This will generate three lines to pass to the
11347 # generic handler for each of those.
11349 # The input syntax (after stripping comments and trailing white space
11350 # is like one of the following (with the final two being entries that
11352 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
11353 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
11354 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
11355 # Note the trailing semi-colon, unlike many of the input files. That
11356 # means that there will be an extra null field generated by the split
11359 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11361 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
11364 # field #4 is when this mapping is conditional. If any of these get
11365 # implemented, it would be by hard-coding in the casing functions in
11366 # the Perl core, not through tables. But if there is a new condition
11367 # we don't know about, output a warning. We know about all the
11368 # conditions through 6.0
11369 if ($fields[4] ne "") {
11370 my @conditions = split ' ', $fields[4];
11371 if ($conditions[0] ne 'tr' # We know that these languages have
11372 # conditions, and some are multiple
11373 && $conditions[0] ne 'az'
11374 && $conditions[0] ne 'lt'
11376 # And, we know about a single condition Final_Sigma, but
11378 && ($v_version gt v5.2.0
11379 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
11381 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
11383 elsif ($conditions[0] ne 'Final_Sigma') {
11385 # Don't print out a message for Final_Sigma, because we
11386 # have hard-coded handling for it. (But the standard
11387 # could change what the rule should be, but it wouldn't
11388 # show up here anyway.
11390 print "# SKIPPING Special Casing: $_\n"
11391 if $verbosity >= $VERBOSE;
11396 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
11397 $file->carp_bad_line('Extra fields');
11402 my $decimal_code_point = hex $fields[0];
11404 # Loop to handle each of the three mappings in the input line, in
11405 # order, with $i indicating the current field number.
11407 for my $object ($lc, $tc, $uc) {
11408 $i++; # First time through, $i = 0 ... 3rd time = 3
11410 my $value = $object->value_of($decimal_code_point);
11411 $value = ($value eq $CODE_POINT)
11412 ? $decimal_code_point
11415 # If this isn't a multi-character mapping, it should already have
11417 if ($fields[$i] !~ / /) {
11418 if ($value != hex $fields[$i]) {
11419 Carp::my_carp("Bad news. UnicodeData.txt thinks "
11421 . "(0x$fields[0]) is $value"
11422 . " and SpecialCasing.txt thinks it is "
11424 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
11429 # The mapping goes into both the legacy table, in which it
11430 # replaces the simple one...
11431 $file->insert_adjusted_lines("$fields[0]; Legacy_"
11432 . $object->full_name
11433 . "; $fields[$i]");
11435 # ... and, the The regular table, in which it is additional,
11436 # beyond the simple mapping.
11437 $file->insert_adjusted_lines("$fields[0]; "
11441 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
11447 # Everything has been handled by the insert_adjusted_lines()
11454 sub filter_old_style_case_folding {
11455 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
11456 # and later style. Different letters were used in the earlier.
11459 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11461 my @fields = split /\s*;\s*/;
11462 if ($fields[0] =~ /^ 013 [01] $/x) { # The two turkish fields
11465 elsif ($fields[1] eq 'L') {
11466 $fields[1] = 'C'; # L => C always
11468 elsif ($fields[1] eq 'E') {
11469 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
11477 $file->carp_bad_line("Expecting L or E in second field");
11481 $_ = join("; ", @fields) . ';';
11485 { # Closure for case folding
11487 # Create the map for simple only if are going to output it, for otherwise
11488 # it takes no part in anything we do.
11489 my $to_output_simple;
11492 sub setup_case_folding($) {
11493 # Read in the case foldings in CaseFolding.txt. This handles both
11494 # simple and full case folding.
11497 = property_ref('Simple_Case_Folding')->to_output_map;
11499 if (! $to_output_simple) {
11500 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
11503 $all_folds = $perl->add_match_table("_Perl_Any_Folds",
11504 Perl_Extension => 1,
11505 Fate => $INTERNAL_ONLY,
11506 Description => "Code points that particpate in some fold",
11509 # If we ever wanted to show that these tables were combined, a new
11510 # property method could be created, like set_combined_props()
11511 property_ref('Case_Folding')->add_comment(join_lines( <<END
11512 This file includes both the simple and full case folding maps. The simple
11513 ones are in the main body of the table below, and the full ones adding to or
11514 overriding them are in the hash.
11520 sub filter_case_folding_line {
11521 # Called for each line in CaseFolding.txt
11522 # Input lines look like:
11523 # 0041; C; 0061; # LATIN CAPITAL LETTER A
11524 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
11525 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
11527 # 'C' means that folding is the same for both simple and full
11528 # 'F' that it is only for full folding
11529 # 'S' that it is only for simple folding
11530 # 'T' is locale-dependent, and ignored
11531 # 'I' is a type of 'F' used in some early releases.
11532 # Note the trailing semi-colon, unlike many of the input files. That
11533 # means that there will be an extra null field generated by the split
11534 # below, which we ignore and hence is not an error.
11537 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11539 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
11540 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
11541 $file->carp_bad_line('Extra fields');
11546 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
11551 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
11552 # I are all full foldings; S is single-char. For S, there is always
11553 # an F entry, so we must allow multiple values for the same code
11554 # point. Fortunately this table doesn't need further manipulation
11555 # which would preclude using multiple-values. The S is now included
11556 # so that _swash_inversion_hash() is able to construct closures
11557 # without having to worry about F mappings.
11558 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
11559 my $from = hex $range; # Assumes range is single
11560 $all_folds->add_range($from, $from);
11561 $_ = "$range; Case_Folding; "
11562 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
11564 if ($type eq 'F') {
11565 my @string = split " ", $map;
11566 for my $i (0 .. @string - 1 -1) {
11567 my $decimal = hex $string[$i];
11568 $all_folds->add_range($decimal, $decimal);
11572 $all_folds->add_range(hex $map, hex $map);
11577 $file->carp_bad_line('Expecting C F I S or T in second field');
11580 # C and S are simple foldings, but simple case folding is not needed
11581 # unless we explicitly want its map table output.
11582 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
11583 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
11589 } # End case fold closure
11591 sub filter_jamo_line {
11592 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
11593 # from this file that is used in generating the Name property for Jamo
11594 # code points. But, it also is used to convert early versions' syntax
11595 # into the modern form. Here are two examples:
11596 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
11597 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
11599 # The input is $_, the output is $_ filtered.
11601 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11603 # Let the caller handle unexpected input. In earlier versions, there was
11604 # a third field which is supposed to be a comment, but did not have a '#'
11606 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
11608 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
11611 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
11612 $fields[1] = 'R' if $fields[0] eq '1105';
11614 # Add to structure so can generate Names from it.
11615 my $cp = hex $fields[0];
11616 my $short_name = $fields[1];
11617 $Jamo{$cp} = $short_name;
11618 if ($cp <= $LBase + $LCount) {
11619 $Jamo_L{$short_name} = $cp - $LBase;
11621 elsif ($cp <= $VBase + $VCount) {
11622 $Jamo_V{$short_name} = $cp - $VBase;
11624 elsif ($cp <= $TBase + $TCount) {
11625 $Jamo_T{$short_name} = $cp - $TBase;
11628 Carp::my_carp_bug("Unexpected Jamo code point in $_");
11632 # Reassemble using just the first two fields to look like a typical
11633 # property file line
11634 $_ = "$fields[0]; $fields[1]";
11639 sub register_fraction($) {
11640 # This registers the input rational number so that it can be passed on to
11641 # utf8_heavy.pl, both in rational and floating forms.
11643 my $rational = shift;
11645 my $float = eval $rational;
11646 $nv_floating_to_rational{$float} = $rational;
11650 sub filter_numeric_value_line {
11651 # DNumValues contains lines of a different syntax than the typical
11653 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
11655 # This routine transforms $_ containing the anomalous syntax to the
11656 # typical, by filtering out the extra columns, and convert early version
11657 # decimal numbers to strings that look like rational numbers.
11660 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11662 # Starting in 5.1, there is a rational field. Just use that, omitting the
11663 # extra columns. Otherwise convert the decimal number in the second field
11664 # to a rational, and omit extraneous columns.
11665 my @fields = split /\s*;\s*/, $_, -1;
11668 if ($v_version ge v5.1.0) {
11669 if (@fields != 4) {
11670 $file->carp_bad_line('Not 4 semi-colon separated fields');
11674 $rational = $fields[3];
11675 $_ = join '; ', @fields[ 0, 3 ];
11679 # Here, is an older Unicode file, which has decimal numbers instead of
11680 # rationals in it. Use the fraction to calculate the denominator and
11681 # convert to rational.
11683 if (@fields != 2 && @fields != 3) {
11684 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
11689 my $codepoints = $fields[0];
11690 my $decimal = $fields[1];
11691 if ($decimal =~ s/\.0+$//) {
11693 # Anything ending with a decimal followed by nothing but 0's is an
11695 $_ = "$codepoints; $decimal";
11696 $rational = $decimal;
11701 if ($decimal =~ /\.50*$/) {
11705 # Here have the hardcoded repeating decimals in the fraction, and
11706 # the denominator they imply. There were only a few denominators
11707 # in the older Unicode versions of this file which this code
11708 # handles, so it is easy to convert them.
11710 # The 4 is because of a round-off error in the Unicode 3.2 files
11711 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
11714 elsif ($decimal =~ /\.[27]50*$/) {
11717 elsif ($decimal =~ /\.[2468]0*$/) {
11720 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
11723 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
11726 if ($denominator) {
11727 my $sign = ($decimal < 0) ? "-" : "";
11728 my $numerator = int((abs($decimal) * $denominator) + .5);
11729 $rational = "$sign$numerator/$denominator";
11730 $_ = "$codepoints; $rational";
11733 $file->carp_bad_line("Can't cope with number '$decimal'.");
11740 register_fraction($rational) if $rational =~ qr{/};
11745 my %unihan_properties;
11748 # Do any special setup for Unihan properties.
11750 # This property gives the wrong computed type, so override.
11751 my $usource = property_ref('kIRG_USource');
11752 $usource->set_type($STRING) if defined $usource;
11754 # This property is to be considered binary (it says so in
11755 # http://www.unicode.org/reports/tr38/)
11756 my $iicore = property_ref('kIICore');
11757 if (defined $iicore) {
11758 $iicore->set_type($FORCED_BINARY);
11759 $iicore->table("Y")->add_note("Forced to a binary property as per unicode.org UAX #38.");
11761 # Unicode doesn't include the maps for this property, so don't
11762 # warn that they are missing.
11763 $iicore->set_pre_declared_maps(0);
11764 $iicore->add_comment(join_lines( <<END
11765 This property contains enum values, but Unicode UAX #38 says it should be
11766 interpreted as binary, so Perl creates tables for both 1) its enum values,
11767 plus 2) true/false tables in which it is considered true for all code points
11768 that have a non-null value
11776 sub filter_unihan_line {
11777 # Change unihan db lines to look like the others in the db. Here is
11779 # U+341C kCangjie IEKN
11781 # Tabs are used instead of semi-colons to separate fields; therefore
11782 # they may have semi-colons embedded in them. Change these to periods
11783 # so won't screw up the rest of the code.
11786 # Remove lines that don't look like ones we accept.
11787 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
11792 # Extract the property, and save a reference to its object.
11794 if (! exists $unihan_properties{$property}) {
11795 $unihan_properties{$property} = property_ref($property);
11798 # Don't do anything unless the property is one we're handling, which
11799 # we determine by seeing if there is an object defined for it or not
11800 if (! defined $unihan_properties{$property}) {
11805 # Convert the tab separators to our standard semi-colons, and convert
11806 # the U+HHHH notation to the rest of the standard's HHHH
11808 s/\b U \+ (?= $code_point_re )//xg;
11810 #local $to_trace = 1 if main::DEBUG;
11811 trace $_ if main::DEBUG && $to_trace;
11817 sub filter_blocks_lines {
11818 # In the Blocks.txt file, the names of the blocks don't quite match the
11819 # names given in PropertyValueAliases.txt, so this changes them so they
11820 # do match: Blanks and hyphens are changed into underscores. Also makes
11821 # early release versions look like later ones
11823 # $_ is transformed to the correct value.
11826 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11828 if ($v_version lt v3.2.0) {
11829 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
11834 # Old versions used a different syntax to mark the range.
11835 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
11838 my @fields = split /\s*;\s*/, $_, -1;
11839 if (@fields != 2) {
11840 $file->carp_bad_line("Expecting exactly two fields");
11845 # Change hyphens and blanks in the block name field only
11846 $fields[1] =~ s/[ -]/_/g;
11847 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/g; # Capitalize first letter of word
11849 $_ = join("; ", @fields);
11854 my $current_property;
11856 sub filter_old_style_proplist {
11857 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
11858 # was in a completely different syntax. Ken Whistler of Unicode says
11859 # that it was something he used as an aid for his own purposes, but
11860 # was never an official part of the standard. Many of the properties
11861 # in it were incorporated into the later PropList.txt, but some were
11862 # not. This program uses this early file to generate property tables
11863 # that are otherwise not accessible in the early UCD's. It does this
11864 # for the ones that eventually became official, and don't appear to be
11865 # too different in their contents from the later official version, and
11866 # throws away the rest. It could be argued that the ones it generates
11867 # were probably not really official at that time, so should be
11868 # ignored. You can easily modify things to skip all of them by
11869 # changing this function to just set $_ to "", and return; and to skip
11870 # certain of them by by simply removing their declarations from
11871 # get_old_property_aliases().
11873 # Here is a list of all the ones that are thrown away:
11874 # Alphabetic The definitions for this are very
11875 # defective, so better to not mislead
11876 # people into thinking it works.
11877 # Instead the Perl extension of the
11878 # same name is constructed from first
11880 # Bidi=* duplicates UnicodeData.txt
11881 # Combining never made into official property;
11883 # Composite never made into official property.
11884 # Currency Symbol duplicates UnicodeData.txt: gc=sc
11885 # Decimal Digit duplicates UnicodeData.txt: gc=nd
11886 # Delimiter never made into official property;
11888 # Format Control never made into official property;
11890 # High Surrogate duplicates Blocks.txt
11891 # Ignorable Control never made into official property;
11893 # ISO Control duplicates UnicodeData.txt: gc=cc
11894 # Left of Pair never made into official property;
11895 # Line Separator duplicates UnicodeData.txt: gc=zl
11896 # Low Surrogate duplicates Blocks.txt
11897 # Non-break was actually listed as a property
11898 # in 3.2, but without any code
11899 # points. Unicode denies that this
11900 # was ever an official property
11901 # Non-spacing duplicate UnicodeData.txt: gc=mn
11902 # Numeric duplicates UnicodeData.txt: gc=cc
11903 # Paired Punctuation never made into official property;
11904 # appears to be gc=ps + gc=pe
11905 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
11906 # Private Use duplicates UnicodeData.txt: gc=co
11907 # Private Use High Surrogate duplicates Blocks.txt
11908 # Punctuation duplicates UnicodeData.txt: gc=p
11909 # Space different definition than eventual
11911 # Titlecase duplicates UnicodeData.txt: gc=lt
11912 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
11913 # Zero-width never made into official property;
11915 # Most of the properties have the same names in this file as in later
11916 # versions, but a couple do not.
11918 # This subroutine filters $_, converting it from the old style into
11919 # the new style. Here's a sample of the old-style
11921 # *******************************************
11923 # Property dump for: 0x100000A0 (Join Control)
11925 # 200C..200D (2 chars)
11927 # In the example, the property is "Join Control". It is kept in this
11928 # closure between calls to the subroutine. The numbers beginning with
11929 # 0x were internal to Ken's program that generated this file.
11931 # If this line contains the property name, extract it.
11932 if (/^Property dump for: [^(]*\((.*)\)/) {
11935 # Convert white space to underscores.
11938 # Convert the few properties that don't have the same name as
11939 # their modern counterparts
11940 s/Identifier_Part/ID_Continue/
11941 or s/Not_a_Character/NChar/;
11943 # If the name matches an existing property, use it.
11944 if (defined property_ref($_)) {
11945 trace "new property=", $_ if main::DEBUG && $to_trace;
11946 $current_property = $_;
11948 else { # Otherwise discard it
11949 trace "rejected property=", $_ if main::DEBUG && $to_trace;
11950 undef $current_property;
11952 $_ = ""; # The property is saved for the next lines of the
11953 # file, but this defining line is of no further use,
11954 # so clear it so that the caller won't process it
11957 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
11959 # Here, the input line isn't a header defining a property for the
11960 # following section, and either we aren't in such a section, or
11961 # the line doesn't look like one that defines the code points in
11962 # such a section. Ignore this line.
11967 # Here, we have a line defining the code points for the current
11968 # stashed property. Anything starting with the first blank is
11969 # extraneous. Otherwise, it should look like a normal range to
11970 # the caller. Append the property name so that it looks just like
11971 # a modern PropList entry.
11974 $_ .= "; $current_property";
11976 trace $_ if main::DEBUG && $to_trace;
11979 } # End closure for old style proplist
11981 sub filter_old_style_normalization_lines {
11982 # For early releases of Unicode, the lines were like:
11983 # 74..2A76 ; NFKD_NO
11984 # For later releases this became:
11985 # 74..2A76 ; NFKD_QC; N
11986 # Filter $_ to look like those in later releases.
11987 # Similarly for MAYBEs
11989 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
11991 # Also, the property FC_NFKC was abbreviated to FNC
11996 sub setup_script_extensions {
11997 # The Script_Extensions property starts out with a clone of the Script
12000 my $scx = property_ref("Script_Extensions");
12001 $scx = Property->new("scx", Full_Name => "Script_Extensions")
12003 $scx->_set_format($STRING_WHITE_SPACE_LIST);
12004 $scx->initialize($script);
12005 $scx->set_default_map($script->default_map);
12006 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
12007 $scx->add_comment(join_lines( <<END
12008 The values for code points that appear in one script are just the same as for
12009 the 'Script' property. Likewise the values for those that appear in many
12010 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
12011 values of code points that appear in a few scripts are a space separated list
12016 # Initialize scx's tables and the aliases for them to be the same as sc's
12017 foreach my $table ($script->tables) {
12018 my $scx_table = $scx->add_match_table($table->name,
12019 Full_Name => $table->full_name);
12020 foreach my $alias ($table->aliases) {
12021 $scx_table->add_alias($alias->name);
12026 sub filter_script_extensions_line {
12027 # The Scripts file comes with the full name for the scripts; the
12028 # ScriptExtensions, with the short name. The final mapping file is a
12029 # combination of these, and without adjustment, would have inconsistent
12030 # entries. This filters the latter file to convert to full names.
12031 # Entries look like this:
12032 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
12034 my @fields = split /\s*;\s*/;
12036 # This script was erroneously omitted in this Unicode version.
12037 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
12040 foreach my $short_name (split " ", $fields[1]) {
12041 push @full_names, $script->table($short_name)->full_name;
12043 $fields[1] = join " ", @full_names;
12044 $_ = join "; ", @fields;
12051 # Populates the Hangul Syllable Type property from first principles
12054 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12056 # These few ranges are hard-coded in.
12057 $file->insert_lines(split /\n/, <<'END'
12065 # The Hangul syllables in version 1 are completely different than what came
12066 # after, so just ignore them there.
12067 if ($v_version lt v2.0.0) {
12068 my $property = property_ref($file->property);
12069 push @tables_that_may_be_empty, $property->table('LV')->complete_name;
12070 push @tables_that_may_be_empty, $property->table('LVT')->complete_name;
12074 # The algorithmically derived syllables are almost all LVT ones, so
12075 # initialize the whole range with that.
12076 $file->insert_lines(sprintf "%04X..%04X; LVT\n",
12077 $SBase, $SBase + $SCount -1);
12079 # Those ones that aren't LVT are LV, and they occur at intervals of
12080 # $TCount code points, starting with the first code point, at $SBase.
12081 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
12082 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
12090 # Populates the Grapheme Cluster Break property from first principles
12093 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12095 # All these definitions are from
12096 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
12097 # from http://www.unicode.org/reports/tr29/tr29-4.html
12099 foreach my $range ($gc->ranges) {
12101 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
12103 if ($range->value =~ / ^ M [en] $ /x) {
12104 $file->insert_lines(sprintf "%04X..%04X; Extend",
12105 $range->start, $range->end);
12107 elsif ($range->value =~ / ^ C [cf] $ /x) {
12108 $file->insert_lines(sprintf "%04X..%04X; Control",
12109 $range->start, $range->end);
12112 $file->insert_lines("2028; Control"); # Line Separator
12113 $file->insert_lines("2029; Control"); # Paragraph Separator
12115 $file->insert_lines("000D; CR");
12116 $file->insert_lines("000A; LF");
12118 # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
12119 foreach my $code_point ( qw{
12121 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
12122 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
12125 my $category = $gc->value_of(hex $code_point);
12126 next if ! defined $category || $category eq 'Cn'; # But not if
12127 # unassigned in this
12129 $file->insert_lines("$code_point; Extend");
12132 my $hst = property_ref('Hangul_Syllable_Type');
12133 if ($hst->count > 0) {
12134 foreach my $range ($hst->ranges) {
12135 $file->insert_lines(sprintf "%04X..%04X; %s",
12136 $range->start, $range->end, $range->value);
12140 generate_hst($file);
12146 sub setup_early_name_alias {
12148 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12150 # This has the effect of pretending that the Name_Alias property was
12151 # available in all Unicode releases. Strictly speaking, this property
12152 # should not be availabe in early releases, but doing this allows
12153 # charnames.pm to work on older releases without change. Prior to v5.16
12154 # it had these names hard-coded inside it. Unicode 6.1 came along and
12155 # created these names, and so they were removed from charnames.
12157 my $aliases = property_ref('Name_Alias');
12158 if (! defined $aliases) {
12159 $aliases = Property->new('Name_Alias', Default_Map => "");
12162 $file->insert_lines(get_old_name_aliases());
12167 sub get_old_name_aliases () {
12169 # The Unicode_1_Name field, contains most of these names. One would
12170 # expect, given the field's name, that its values would be fixed across
12171 # versions, giving the true Unicode version 1 name for the character.
12172 # Sadly, this is not the case. Actually Version 1.1.5 had no names for
12173 # any of the controls; Version 2.0 introduced names for the C0 controls,
12174 # and 3.0 introduced C1 names. 3.0.1 removed the name INDEX; and 3.2
12175 # changed some names: it
12176 # changed to parenthesized versions like "NEXT LINE" to
12177 # "NEXT LINE (NEL)";
12178 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
12179 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
12180 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
12181 # This list contains all the names that were defined so that
12182 # charnames::vianame(), etc. understand them all EVEN if this version of
12183 # Unicode didn't specify them (this could be construed as a bug).
12184 # mktables elsewhere gives preference to the Unicode_1_Name field over
12185 # these names, so that viacode() will return the correct value for that
12186 # version of Unicode, except when that version doesn't define a name,
12187 # viacode() will return one anyway (this also could be construed as a
12188 # bug). But these potential "bugs" allow for the smooth working of code
12189 # on earlier Unicode releases.
12191 my @return = split /\n/, <<'END';
12193 0000;NUL;abbreviation
12194 0001;START OF HEADING;control
12195 0001;SOH;abbreviation
12196 0002;START OF TEXT;control
12197 0002;STX;abbreviation
12198 0003;END OF TEXT;control
12199 0003;ETX;abbreviation
12200 0004;END OF TRANSMISSION;control
12201 0004;EOT;abbreviation
12202 0005;ENQUIRY;control
12203 0005;ENQ;abbreviation
12204 0006;ACKNOWLEDGE;control
12205 0006;ACK;abbreviation
12207 0007;BEL;abbreviation
12208 0008;BACKSPACE;control
12209 0008;BS;abbreviation
12210 0009;CHARACTER TABULATION;control
12211 0009;HORIZONTAL TABULATION;control
12212 0009;HT;abbreviation
12213 0009;TAB;abbreviation
12214 000A;LINE FEED;control
12215 000A;LINE FEED (LF);control
12216 000A;NEW LINE;control
12217 000A;END OF LINE;control
12218 000A;LF;abbreviation
12219 000A;NL;abbreviation
12220 000A;EOL;abbreviation
12221 000B;LINE TABULATION;control
12222 000B;VERTICAL TABULATION;control
12223 000B;VT;abbreviation
12224 000C;FORM FEED;control
12225 000C;FORM FEED (FF);control
12226 000C;FF;abbreviation
12227 000D;CARRIAGE RETURN;control
12228 000D;CARRIAGE RETURN (CR);control
12229 000D;CR;abbreviation
12230 000E;SHIFT OUT;control
12231 000E;LOCKING-SHIFT ONE;control
12232 000E;SO;abbreviation
12233 000F;SHIFT IN;control
12234 000F;LOCKING-SHIFT ZERO;control
12235 000F;SI;abbreviation
12236 0010;DATA LINK ESCAPE;control
12237 0010;DLE;abbreviation
12238 0011;DEVICE CONTROL ONE;control
12239 0011;DC1;abbreviation
12240 0012;DEVICE CONTROL TWO;control
12241 0012;DC2;abbreviation
12242 0013;DEVICE CONTROL THREE;control
12243 0013;DC3;abbreviation
12244 0014;DEVICE CONTROL FOUR;control
12245 0014;DC4;abbreviation
12246 0015;NEGATIVE ACKNOWLEDGE;control
12247 0015;NAK;abbreviation
12248 0016;SYNCHRONOUS IDLE;control
12249 0016;SYN;abbreviation
12250 0017;END OF TRANSMISSION BLOCK;control
12251 0017;ETB;abbreviation
12252 0018;CANCEL;control
12253 0018;CAN;abbreviation
12254 0019;END OF MEDIUM;control
12255 0019;EOM;abbreviation
12256 001A;SUBSTITUTE;control
12257 001A;SUB;abbreviation
12258 001B;ESCAPE;control
12259 001B;ESC;abbreviation
12260 001C;INFORMATION SEPARATOR FOUR;control
12261 001C;FILE SEPARATOR;control
12262 001C;FS;abbreviation
12263 001D;INFORMATION SEPARATOR THREE;control
12264 001D;GROUP SEPARATOR;control
12265 001D;GS;abbreviation
12266 001E;INFORMATION SEPARATOR TWO;control
12267 001E;RECORD SEPARATOR;control
12268 001E;RS;abbreviation
12269 001F;INFORMATION SEPARATOR ONE;control
12270 001F;UNIT SEPARATOR;control
12271 001F;US;abbreviation
12272 0020;SP;abbreviation
12273 007F;DELETE;control
12274 007F;DEL;abbreviation
12275 0080;PADDING CHARACTER;figment
12276 0080;PAD;abbreviation
12277 0081;HIGH OCTET PRESET;figment
12278 0081;HOP;abbreviation
12279 0082;BREAK PERMITTED HERE;control
12280 0082;BPH;abbreviation
12281 0083;NO BREAK HERE;control
12282 0083;NBH;abbreviation
12284 0084;IND;abbreviation
12285 0085;NEXT LINE;control
12286 0085;NEXT LINE (NEL);control
12287 0085;NEL;abbreviation
12288 0086;START OF SELECTED AREA;control
12289 0086;SSA;abbreviation
12290 0087;END OF SELECTED AREA;control
12291 0087;ESA;abbreviation
12292 0088;CHARACTER TABULATION SET;control
12293 0088;HORIZONTAL TABULATION SET;control
12294 0088;HTS;abbreviation
12295 0089;CHARACTER TABULATION WITH JUSTIFICATION;control
12296 0089;HORIZONTAL TABULATION WITH JUSTIFICATION;control
12297 0089;HTJ;abbreviation
12298 008A;LINE TABULATION SET;control
12299 008A;VERTICAL TABULATION SET;control
12300 008A;VTS;abbreviation
12301 008B;PARTIAL LINE FORWARD;control
12302 008B;PARTIAL LINE DOWN;control
12303 008B;PLD;abbreviation
12304 008C;PARTIAL LINE BACKWARD;control
12305 008C;PARTIAL LINE UP;control
12306 008C;PLU;abbreviation
12307 008D;REVERSE LINE FEED;control
12308 008D;REVERSE INDEX;control
12309 008D;RI;abbreviation
12310 008E;SINGLE SHIFT TWO;control
12311 008E;SINGLE-SHIFT-2;control
12312 008E;SS2;abbreviation
12313 008F;SINGLE SHIFT THREE;control
12314 008F;SINGLE-SHIFT-3;control
12315 008F;SS3;abbreviation
12316 0090;DEVICE CONTROL STRING;control
12317 0090;DCS;abbreviation
12318 0091;PRIVATE USE ONE;control
12319 0091;PRIVATE USE-1;control
12320 0091;PU1;abbreviation
12321 0092;PRIVATE USE TWO;control
12322 0092;PRIVATE USE-2;control
12323 0092;PU2;abbreviation
12324 0093;SET TRANSMIT STATE;control
12325 0093;STS;abbreviation
12326 0094;CANCEL CHARACTER;control
12327 0094;CCH;abbreviation
12328 0095;MESSAGE WAITING;control
12329 0095;MW;abbreviation
12330 0096;START OF GUARDED AREA;control
12331 0096;START OF PROTECTED AREA;control
12332 0096;SPA;abbreviation
12333 0097;END OF GUARDED AREA;control
12334 0097;END OF PROTECTED AREA;control
12335 0097;EPA;abbreviation
12336 0098;START OF STRING;control
12337 0098;SOS;abbreviation
12338 0099;SINGLE GRAPHIC CHARACTER INTRODUCER;figment
12339 0099;SGC;abbreviation
12340 009A;SINGLE CHARACTER INTRODUCER;control
12341 009A;SCI;abbreviation
12342 009B;CONTROL SEQUENCE INTRODUCER;control
12343 009B;CSI;abbreviation
12344 009C;STRING TERMINATOR;control
12345 009C;ST;abbreviation
12346 009D;OPERATING SYSTEM COMMAND;control
12347 009D;OSC;abbreviation
12348 009E;PRIVACY MESSAGE;control
12349 009E;PM;abbreviation
12350 009F;APPLICATION PROGRAM COMMAND;control
12351 009F;APC;abbreviation
12352 00A0;NBSP;abbreviation
12353 00AD;SHY;abbreviation
12354 200B;ZWSP;abbreviation
12355 200C;ZWNJ;abbreviation
12356 200D;ZWJ;abbreviation
12357 200E;LRM;abbreviation
12358 200F;RLM;abbreviation
12359 202A;LRE;abbreviation
12360 202B;RLE;abbreviation
12361 202C;PDF;abbreviation
12362 202D;LRO;abbreviation
12363 202E;RLO;abbreviation
12364 FEFF;BYTE ORDER MARK;alternate
12365 FEFF;BOM;abbreviation
12366 FEFF;ZWNBSP;abbreviation
12369 if ($v_version ge v3.0.0) {
12370 push @return, split /\n/, <<'END';
12371 180B; FVS1; abbreviation
12372 180C; FVS2; abbreviation
12373 180D; FVS3; abbreviation
12374 180E; MVS; abbreviation
12375 202F; NNBSP; abbreviation
12379 if ($v_version ge v3.2.0) {
12380 push @return, split /\n/, <<'END';
12381 034F; CGJ; abbreviation
12382 205F; MMSP; abbreviation
12383 2060; WJ; abbreviation
12386 my $cp = 0xFE00 - 1;
12387 for my $i (1..16) {
12388 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12391 if ($v_version ge v4.0.0) { # Add in VS17..VS256
12392 my $cp = 0xE0100 - 17;
12393 for my $i (17..256) {
12394 push @return, sprintf("%04X; VS%d; abbreviation", $cp + $i, $i);
12398 # ALERT did not come along until 6.0, at which point it became preferred
12399 # over BELL, and was never in the Unicode_1_Name field. For the same
12400 # reasons, that the other names are made known to all releases by this
12401 # function, we make ALERT known too. By inserting it
12402 # last in early releases, BELL is preferred over it; and vice-vers in 6.0
12403 my $alert = '0007; ALERT; control';
12404 if ($v_version lt v6.0.0) {
12405 push @return, $alert;
12408 unshift @return, $alert;
12414 sub filter_later_version_name_alias_line {
12416 # This file has an extra entry per line for the alias type. This is
12417 # handled by creating a compound entry: "$alias: $type"; First, split
12418 # the line into components.
12419 my ($range, $alias, $type, @remainder)
12420 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12422 # This file contains multiple entries for some components, so tell the
12423 # downstream code to allow this in our internal tables; the
12424 # $MULTIPLE_AFTER preserves the input ordering.
12425 $_ = join ";", $range, $CMD_DELIM
12435 sub filter_early_version_name_alias_line {
12437 # Early versions did not have the trailing alias type field; implicitly it
12438 # was 'correction'. But our synthetic lines we add in this program do
12439 # have it, so test for the type field.
12440 $_ .= "; correction" if $_ !~ /;.*;/;
12442 filter_later_version_name_alias_line;
12446 sub finish_Unicode() {
12447 # This routine should be called after all the Unicode files have been read
12449 # 1) Creates properties that are missing from the version of Unicode being
12450 # compiled, and which, for whatever reason, are needed for the Perl
12451 # core to function properly. These are minimally populated as
12453 # 2) Adds the mappings for code points missing from the files which have
12454 # defaults specified for them.
12455 # 3) At this this point all mappings are known, so it computes the type of
12456 # each property whose type hasn't been determined yet.
12457 # 4) Calculates all the regular expression match tables based on the
12459 # 5) Calculates and adds the tables which are defined by Unicode, but
12460 # which aren't derived by them, and certain derived tables that Perl
12463 # Folding information was introduced later into Unicode data. To get
12464 # Perl's case ignore (/i) to work at all in releases that don't have
12465 # folding, use the best available alternative, which is lower casing.
12466 my $fold = property_ref('Case_Folding');
12467 if ($fold->is_empty) {
12468 $fold->initialize(property_ref('Lowercase_Mapping'));
12469 $fold->add_note(join_lines(<<END
12470 WARNING: This table uses lower case as a substitute for missing fold
12476 # Multiple-character mapping was introduced later into Unicode data, so it
12477 # is by default the simple version. If to output the simple versions and
12478 # not present, just use the regular (which in these Unicode versions is
12479 # the simple as well).
12480 foreach my $map (qw { Uppercase_Mapping
12486 my $simple = property_ref("Simple_$map");
12487 next if ! $simple->is_empty;
12488 if ($simple->to_output_map) {
12489 $simple->initialize(property_ref($map));
12492 property_ref($map)->set_proxy_for($simple->name);
12496 # For each property, fill in any missing mappings, and calculate the re
12497 # match tables. If a property has more than one missing mapping, the
12498 # default is a reference to a data structure, and requires data from other
12499 # properties to resolve. The sort is used to cause these to be processed
12500 # last, after all the other properties have been calculated.
12501 # (Fortunately, the missing properties so far don't depend on each other.)
12502 foreach my $property
12503 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
12506 # $perl has been defined, but isn't one of the Unicode properties that
12507 # need to be finished up.
12508 next if $property == $perl;
12510 # Nor do we need to do anything with properties that aren't going to
12512 next if $property->fate == $SUPPRESSED;
12514 # Handle the properties that have more than one possible default
12515 if (ref $property->default_map) {
12516 my $default_map = $property->default_map;
12518 # These properties have stored in the default_map:
12520 # 1) A default map which applies to all code points in a
12522 # 2) an expression which will evaluate to the list of code
12523 # points in that class
12525 # 3) the default map which applies to every other missing code
12528 # Go through each list.
12529 while (my ($default, $eval) = $default_map->get_next_defaults) {
12531 # Get the class list, and intersect it with all the so-far
12532 # unspecified code points yielding all the code points
12533 # in the class that haven't been specified.
12534 my $list = eval $eval;
12536 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
12540 # Narrow down the list to just those code points we don't have
12542 $list = $list & $property->inverse_list;
12544 # Add mappings to the property for each code point in the list
12545 foreach my $range ($list->ranges) {
12546 $property->add_map($range->start, $range->end, $default,
12547 Replace => $CROAK);
12551 # All remaining code points have the other mapping. Set that up
12552 # so the normal single-default mapping code will work on them
12553 $property->set_default_map($default_map->other_default);
12555 # And fall through to do that
12558 # We should have enough data now to compute the type of the property.
12559 $property->compute_type;
12560 my $property_type = $property->type;
12562 next if ! $property->to_create_match_tables;
12564 # Here want to create match tables for this property
12566 # The Unicode db always (so far, and they claim into the future) have
12567 # the default for missing entries in binary properties be 'N' (unless
12568 # there is a '@missing' line that specifies otherwise)
12569 if ($property_type == $BINARY && ! defined $property->default_map) {
12570 $property->set_default_map('N');
12573 # Add any remaining code points to the mapping, using the default for
12574 # missing code points.
12576 if (defined (my $default_map = $property->default_map)) {
12578 # Make sure there is a match table for the default
12579 if (! defined ($default_table = $property->table($default_map))) {
12580 $default_table = $property->add_match_table($default_map);
12583 # And, if the property is binary, the default table will just
12584 # be the complement of the other table.
12585 if ($property_type == $BINARY) {
12586 my $non_default_table;
12588 # Find the non-default table.
12589 for my $table ($property->tables) {
12590 next if $table == $default_table;
12591 $non_default_table = $table;
12593 $default_table->set_complement($non_default_table);
12597 # This fills in any missing values with the default. It's not
12598 # necessary to do this with binary properties, as the default
12599 # is defined completely in terms of the Y table.
12600 $property->add_map(0, $MAX_UNICODE_CODEPOINT,
12601 $default_map, Replace => $NO);
12605 # Have all we need to populate the match tables.
12606 my $property_name = $property->name;
12607 my $maps_should_be_defined = $property->pre_declared_maps;
12608 foreach my $range ($property->ranges) {
12609 my $map = $range->value;
12610 my $table = $property->table($map);
12611 if (! defined $table) {
12613 # Integral and rational property values are not necessarily
12614 # defined in PropValueAliases, but whether all the other ones
12615 # should be depends on the property.
12616 if ($maps_should_be_defined
12617 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
12619 Carp::my_carp("Table '$property_name=$map' should have been defined. Defining it now.")
12621 $table = $property->add_match_table($map);
12624 next if $table->complement != 0; # Don't need to populate these
12625 $table->add_range($range->start, $range->end);
12628 # A forced binary property has additional true/false tables which
12629 # should have been set up when it was forced into binary. The false
12630 # table matches exactly the same set as the property's default table.
12631 # The true table matches the complement of that. The false table is
12632 # not the same as an additional set of aliases on top of the default
12633 # table, so use 'set_equivalent_to'. If it were implemented as
12634 # additional aliases, various things would have to be adjusted, but
12635 # especially, if the user wants to get a list of names for the table
12636 # using Unicode::UCD::prop_value_aliases(), s/he should get a
12637 # different set depending on whether they want the default table or
12639 if ($property_type == $FORCED_BINARY) {
12640 $property->table('N')->set_equivalent_to($default_table,
12642 $property->table('Y')->set_complement($default_table);
12645 # For Perl 5.6 compatibility, all properties matchable in regexes can
12646 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
12647 # But warn if this creates a conflict with a (new) Unicode property
12648 # name, although it appears that Unicode has made a decision never to
12649 # begin a property name with 'Is_', so this shouldn't happen.
12650 foreach my $alias ($property->aliases) {
12651 my $Is_name = 'Is_' . $alias->name;
12652 if (defined (my $pre_existing = property_ref($Is_name))) {
12653 Carp::my_carp(<<END
12654 There is already an alias named $Is_name (from " . $pre_existing . "), so
12655 creating one for $property won't work. This is bad news. If it is not too
12656 late, get Unicode to back off. Otherwise go back to the old scheme (findable
12657 from the git blame log for this area of the code that suppressed individual
12658 aliases that conflict with the new Unicode names. Proceeding anyway.
12662 } # End of loop through aliases for this property
12663 } # End of loop through all Unicode properties.
12665 # Fill in the mappings that Unicode doesn't completely furnish. First the
12666 # single letter major general categories. If Unicode were to start
12667 # delivering the values, this would be redundant, but better that than to
12668 # try to figure out if should skip and not get it right. Ths could happen
12669 # if a new major category were to be introduced, and the hard-coded test
12670 # wouldn't know about it.
12671 # This routine depends on the standard names for the general categories
12672 # being what it thinks they are, like 'Cn'. The major categories are the
12673 # union of all the general category tables which have the same first
12674 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
12675 foreach my $minor_table ($gc->tables) {
12676 my $minor_name = $minor_table->name;
12677 next if length $minor_name == 1;
12678 if (length $minor_name != 2) {
12679 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
12683 my $major_name = uc(substr($minor_name, 0, 1));
12684 my $major_table = $gc->table($major_name);
12685 $major_table += $minor_table;
12688 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
12689 # defines it as LC)
12690 my $LC = $gc->table('LC');
12691 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
12692 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
12695 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
12696 # deliver the correct values in it
12697 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
12699 # Lt not in release 1.
12700 if (defined $gc->table('Lt')) {
12701 $LC += $gc->table('Lt');
12702 $gc->table('Lt')->set_caseless_equivalent($LC);
12705 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
12707 $gc->table('Ll')->set_caseless_equivalent($LC);
12708 $gc->table('Lu')->set_caseless_equivalent($LC);
12710 my $Cs = $gc->table('Cs');
12712 # Create digit and case fold tables with the original file names for
12713 # backwards compatibility with applications that read them directly.
12714 my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
12716 Perl_Extension => 1,
12717 File => 'Digit', # Trad. location
12718 Directory => $map_directory,
12721 To_Output_Map => $EXTERNAL_MAP,
12723 Initialize => property_ref('Perl_Decimal_Digit'),
12725 $Digit->add_comment(join_lines(<<END
12726 This file gives the mapping of all code points which represent a single
12727 decimal digit [0-9] to their respective digits. For example, the code point
12728 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
12729 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
12734 Property->new('Legacy_Case_Folding',
12736 Directory => $map_directory,
12737 Default_Map => $CODE_POINT,
12741 To_Output_Map => $EXTERNAL_MAP,
12742 Format => $HEX_FORMAT,
12743 Initialize => property_ref('cf'),
12746 # The Script_Extensions property started out as a clone of the Script
12747 # property. But processing its data file caused some elements to be
12748 # replaced with different data. (These elements were for the Common and
12749 # Inherited properties.) This data is a qw() list of all the scripts that
12750 # the code points in the given range are in. An example line is:
12751 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
12753 # The code above has created a new match table named "Arab Syrc Thaa"
12754 # which contains 060C. (The cloned table started out with this code point
12755 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
12756 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
12757 # match table. This is repeated for all these tables and ranges. The map
12758 # data is retained in the map table for reference, but the spurious match
12759 # tables are deleted.
12761 my $scx = property_ref("Script_Extensions");
12762 if (defined $scx) {
12763 foreach my $table ($scx->tables) {
12764 next unless $table->name =~ /\s/; # All the new and only the new
12765 # tables have a space in their
12767 my @scripts = split /\s+/, $table->name;
12768 foreach my $script (@scripts) {
12769 my $script_table = $scx->table($script);
12770 $script_table += $table;
12772 $scx->delete_match_table($table);
12779 sub pre_3_dot_1_Nl () {
12781 # Return a range list for gc=nl for Unicode versions prior to 3.1, which
12782 # is when Unicode's became fully usable. These code points were
12783 # determined by inspection and experimentation. gc=nl is important for
12784 # certain Perl-extension properties that should be available in all
12787 my $Nl = Range_List->new();
12788 if (defined (my $official = $gc->table('Nl'))) {
12792 $Nl->add_range(0x2160, 0x2182);
12793 $Nl->add_range(0x3007, 0x3007);
12794 $Nl->add_range(0x3021, 0x3029);
12796 $Nl->add_range(0xFE20, 0xFE23);
12797 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
12802 sub compile_perl() {
12803 # Create perl-defined tables. Almost all are part of the pseudo-property
12804 # named 'perl' internally to this program. Many of these are recommended
12805 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
12806 # on those found there.
12807 # Almost all of these are equivalent to some Unicode property.
12808 # A number of these properties have equivalents restricted to the ASCII
12809 # range, with their names prefaced by 'Posix', to signify that these match
12810 # what the Posix standard says they should match. A couple are
12811 # effectively this, but the name doesn't have 'Posix' in it because there
12812 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
12813 # to the full Unicode range, by our guesses as to what is appropriate.
12815 # 'Any' is all code points. As an error check, instead of just setting it
12816 # to be that, construct it to be the union of all the major categories
12817 $Any = $perl->add_match_table('Any',
12818 Description => "[\\x{0000}-\\x{$MAX_UNICODE_CODEPOINT_STRING}]",
12821 foreach my $major_table ($gc->tables) {
12823 # Major categories are the ones with single letter names.
12824 next if length($major_table->name) != 1;
12826 $Any += $major_table;
12829 if ($Any->max != $MAX_UNICODE_CODEPOINT) {
12830 Carp::my_carp_bug("Generated highest code point ("
12831 . sprintf("%X", $Any->max)
12832 . ") doesn't match expected value $MAX_UNICODE_CODEPOINT_STRING.")
12834 if ($Any->range_count != 1 || $Any->min != 0) {
12835 Carp::my_carp_bug("Generated table 'Any' doesn't match all code points.")
12838 $Any->add_alias('All');
12840 # Assigned is the opposite of gc=unassigned
12841 my $Assigned = $perl->add_match_table('Assigned',
12842 Description => "All assigned code points",
12843 Initialize => ~ $gc->table('Unassigned'),
12846 # Our internal-only property should be treated as more than just a
12847 # synonym; grandfather it in to the pod.
12848 $perl->add_match_table('_CombAbove', Re_Pod_Entry => 1,
12849 Fate => $INTERNAL_ONLY, Status => $DISCOURAGED)
12850 ->set_equivalent_to(property_ref('ccc')->table('Above'),
12853 my $ASCII = $perl->add_match_table('ASCII', Description => '[[:ASCII:]]');
12854 if (defined $block) { # This is equivalent to the block if have it.
12855 my $Unicode_ASCII = $block->table('Basic_Latin');
12856 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
12857 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
12861 # Very early releases didn't have blocks, so initialize ASCII ourselves if
12863 if ($ASCII->is_empty) {
12864 $ASCII->add_range(0, 127);
12867 # Get the best available case definitions. Early Unicode versions didn't
12868 # have Uppercase and Lowercase defined, so use the general category
12869 # instead for them, modified by hard-coding in the code points each is
12871 my $Lower = $perl->add_match_table('Lower');
12872 my $Unicode_Lower = property_ref('Lowercase');
12873 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
12874 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
12878 $Lower += $gc->table('Lowercase_Letter');
12880 # There are quite a few code points in Lower, that aren't in gc=lc,
12881 # and not all are in all releases.
12882 foreach my $code_point ( 0x00AA,
12901 # Don't include the code point unless it is assigned in this
12903 my $category = $gc->value_of(hex $code_point);
12904 next if ! defined $category || $category eq 'Cn';
12906 $Lower += $code_point;
12909 $Lower->add_alias('XPosixLower');
12910 my $Posix_Lower = $perl->add_match_table("PosixLower",
12911 Description => "[a-z]",
12912 Initialize => $Lower & $ASCII,
12915 my $Upper = $perl->add_match_table('Upper');
12916 my $Unicode_Upper = property_ref('Uppercase');
12917 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
12918 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
12922 # Unlike Lower, there are only two ranges in Upper that aren't in
12923 # gc=Lu, and all code points were assigned in all releases.
12924 $Upper += $gc->table('Uppercase_Letter');
12925 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
12926 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
12928 $Upper->add_alias('XPosixUpper');
12929 my $Posix_Upper = $perl->add_match_table("PosixUpper",
12930 Description => "[A-Z]",
12931 Initialize => $Upper & $ASCII,
12934 # Earliest releases didn't have title case. Initialize it to empty if not
12935 # otherwise present
12936 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
12937 Description => '(= \p{Gc=Lt})');
12938 my $lt = $gc->table('Lt');
12940 # Earlier versions of mktables had this related to $lt since they have
12941 # identical code points, but their caseless equivalents are not the same,
12942 # one being 'Cased' and the other being 'LC', and so now must be kept as
12943 # separate entities.
12948 push @tables_that_may_be_empty, $Title->complete_name;
12951 my $Unicode_Cased = property_ref('Cased');
12952 if (defined $Unicode_Cased) {
12953 my $yes = $Unicode_Cased->table('Y');
12954 my $no = $Unicode_Cased->table('N');
12955 $Title->set_caseless_equivalent($yes);
12956 if (defined $Unicode_Upper) {
12957 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
12958 $Unicode_Upper->table('N')->set_caseless_equivalent($no);
12960 $Upper->set_caseless_equivalent($yes);
12961 if (defined $Unicode_Lower) {
12962 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
12963 $Unicode_Lower->table('N')->set_caseless_equivalent($no);
12965 $Lower->set_caseless_equivalent($yes);
12968 # If this Unicode version doesn't have Cased, set up the Perl
12969 # extension from first principles. From Unicode 5.1: Definition D120:
12970 # A character C is defined to be cased if and only if C has the
12971 # Lowercase or Uppercase property or has a General_Category value of
12972 # Titlecase_Letter.
12973 my $cased = $perl->add_match_table('Cased',
12974 Initialize => $Lower + $Upper + $Title,
12975 Description => 'Uppercase or Lowercase or Titlecase',
12977 # $notcased is purely for the caseless equivalents below
12978 my $notcased = $perl->add_match_table('_Not_Cased',
12979 Initialize => ~ $cased,
12980 Fate => $INTERNAL_ONLY,
12981 Description => 'All not-cased code points');
12982 $Title->set_caseless_equivalent($cased);
12983 if (defined $Unicode_Upper) {
12984 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
12985 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
12987 $Upper->set_caseless_equivalent($cased);
12988 if (defined $Unicode_Lower) {
12989 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
12990 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
12992 $Lower->set_caseless_equivalent($cased);
12995 # Similarly, set up our own Case_Ignorable property if this Unicode
12996 # version doesn't have it. From Unicode 5.1: Definition D121: A character
12997 # C is defined to be case-ignorable if C has the value MidLetter or the
12998 # value MidNumLet for the Word_Break property or its General_Category is
12999 # one of Nonspacing_Mark (Mn), Enclosing_Mark (Me), Format (Cf),
13000 # Modifier_Letter (Lm), or Modifier_Symbol (Sk).
13002 # Perl has long had an internal-only alias for this property; grandfather
13003 # it in to the pod, but discourage its use.
13004 my $perl_case_ignorable = $perl->add_match_table('_Case_Ignorable',
13006 Fate => $INTERNAL_ONLY,
13007 Status => $DISCOURAGED);
13008 my $case_ignorable = property_ref('Case_Ignorable');
13009 if (defined $case_ignorable && ! $case_ignorable->is_empty) {
13010 $perl_case_ignorable->set_equivalent_to($case_ignorable->table('Y'),
13015 $perl_case_ignorable->initialize($gc->table('Mn') + $gc->table('Lm'));
13017 # The following three properties are not in early releases
13018 $perl_case_ignorable += $gc->table('Me') if defined $gc->table('Me');
13019 $perl_case_ignorable += $gc->table('Cf') if defined $gc->table('Cf');
13020 $perl_case_ignorable += $gc->table('Sk') if defined $gc->table('Sk');
13022 # For versions 4.1 - 5.0, there is no MidNumLet property, and
13023 # correspondingly the case-ignorable definition lacks that one. For
13024 # 4.0, it appears that it was meant to be the same definition, but was
13025 # inadvertently omitted from the standard's text, so add it if the
13026 # property actually is there
13027 my $wb = property_ref('Word_Break');
13029 my $midlet = $wb->table('MidLetter');
13030 $perl_case_ignorable += $midlet if defined $midlet;
13031 my $midnumlet = $wb->table('MidNumLet');
13032 $perl_case_ignorable += $midnumlet if defined $midnumlet;
13036 # In earlier versions of the standard, instead of the above two
13037 # properties , just the following characters were used:
13038 $perl_case_ignorable += 0x0027 # APOSTROPHE
13039 + 0x00AD # SOFT HYPHEN (SHY)
13040 + 0x2019; # RIGHT SINGLE QUOTATION MARK
13044 # The remaining perl defined tables are mostly based on Unicode TR 18,
13045 # "Annex C: Compatibility Properties". All of these have two versions,
13046 # one whose name generally begins with Posix that is posix-compliant, and
13047 # one that matches Unicode characters beyond the Posix, ASCII range
13049 my $Alpha = $perl->add_match_table('Alpha');
13051 # Alphabetic was not present in early releases
13052 my $Alphabetic = property_ref('Alphabetic');
13053 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
13054 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
13058 # The Alphabetic property doesn't exist for early releases, so
13059 # generate it. The actual definition, in 5.2 terms is:
13061 # gc=L + gc=Nl + Other_Alphabetic
13063 # Other_Alphabetic is also not defined in these early releases, but it
13064 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
13065 # those last two as well, then subtract the relatively few of them that
13066 # shouldn't have been added. (The gc=So range is the circled capital
13067 # Latin characters. Early releases mistakenly didn't also include the
13068 # lower-case versions of these characters, and so we don't either, to
13069 # maintain consistency with those releases that first had this
13071 $Alpha->initialize($gc->table('Letter')
13076 $Alpha->add_range(0x24D0, 0x24E9); # gc=So
13077 foreach my $range ( [ 0x0300, 0x0344 ],
13078 [ 0x0346, 0x034E ],
13079 [ 0x0360, 0x0362 ],
13080 [ 0x0483, 0x0486 ],
13081 [ 0x0591, 0x05AF ],
13082 [ 0x06DF, 0x06E0 ],
13083 [ 0x06EA, 0x06EC ],
13084 [ 0x0740, 0x074A ],
13087 [ 0x0951, 0x0954 ],
13101 [ 0x0E47, 0x0E4C ],
13103 [ 0x0EC8, 0x0ECC ],
13104 [ 0x0F18, 0x0F19 ],
13108 [ 0x0F3E, 0x0F3F ],
13109 [ 0x0F82, 0x0F84 ],
13110 [ 0x0F86, 0x0F87 ],
13114 [ 0x17C9, 0x17D3 ],
13115 [ 0x20D0, 0x20DC ],
13117 [ 0x302A, 0x302F ],
13118 [ 0x3099, 0x309A ],
13119 [ 0xFE20, 0xFE23 ],
13120 [ 0x1D165, 0x1D169 ],
13121 [ 0x1D16D, 0x1D172 ],
13122 [ 0x1D17B, 0x1D182 ],
13123 [ 0x1D185, 0x1D18B ],
13124 [ 0x1D1AA, 0x1D1AD ],
13127 $Alpha->delete_range($range->[0], $range->[1]);
13130 $Alpha->delete_range($range, $range);
13133 $Alpha->add_description('Alphabetic');
13134 $Alpha->add_alias('Alphabetic');
13136 $Alpha->add_alias('XPosixAlpha');
13137 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
13138 Description => "[A-Za-z]",
13139 Initialize => $Alpha & $ASCII,
13141 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
13142 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
13144 my $Alnum = $perl->add_match_table('Alnum',
13145 Description => 'Alphabetic and (decimal) Numeric',
13146 Initialize => $Alpha + $gc->table('Decimal_Number'),
13148 $Alnum->add_alias('XPosixAlnum');
13149 $perl->add_match_table("PosixAlnum",
13150 Description => "[A-Za-z0-9]",
13151 Initialize => $Alnum & $ASCII,
13154 my $Word = $perl->add_match_table('Word',
13155 Description => '\w, including beyond ASCII;'
13156 . ' = \p{Alnum} + \pM + \p{Pc}',
13157 Initialize => $Alnum + $gc->table('Mark'),
13159 $Word->add_alias('XPosixWord');
13160 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
13165 $Word += ord('_'); # Make sure this is a $Word
13167 my $JC = property_ref('Join_Control'); # Wasn't in release 1
13169 $Word += $JC->table('Y');
13172 $Word += 0x200C + 0x200D;
13175 # This is a Perl extension, so the name doesn't begin with Posix.
13176 my $PerlWord = $perl->add_match_table('PerlWord',
13177 Description => '\w, restricted to ASCII = [A-Za-z0-9_]',
13178 Initialize => $Word & $ASCII,
13180 $PerlWord->add_alias('PosixWord');
13182 my $Blank = $perl->add_match_table('Blank',
13183 Description => '\h, Horizontal white space',
13185 # 200B is Zero Width Space which is for line
13186 # break control, and was listed as
13187 # Space_Separator in early releases
13188 Initialize => $gc->table('Space_Separator')
13192 $Blank->add_alias('HorizSpace'); # Another name for it.
13193 $Blank->add_alias('XPosixBlank');
13194 $perl->add_match_table("PosixBlank",
13195 Description => "\\t and ' '",
13196 Initialize => $Blank & $ASCII,
13199 my $VertSpace = $perl->add_match_table('VertSpace',
13200 Description => '\v',
13201 Initialize => $gc->table('Line_Separator')
13202 + $gc->table('Paragraph_Separator')
13203 + 0x000A # LINE FEED
13204 + 0x000B # VERTICAL TAB
13205 + 0x000C # FORM FEED
13206 + 0x000D # CARRIAGE RETURN
13209 # No Posix equivalent for vertical space
13211 my $Space = $perl->add_match_table('Space',
13212 Description => '\s including beyond ASCII and vertical tab',
13213 Initialize => $Blank + $VertSpace,
13215 $Space->add_alias('XPosixSpace');
13216 my $posix_space = $perl->add_match_table("PosixSpace",
13217 Description => "\\t, \\n, \\cK, \\f, \\r, and ' '. (\\cK is vertical tab)",
13218 Initialize => $Space & $ASCII,
13221 # Perl's traditional space doesn't include Vertical Tab prior to v5.18
13222 my $XPerlSpace = $perl->add_match_table('XPerlSpace',
13223 Description => '\s, including beyond ASCII',
13224 #Initialize => $Space - 0x000B,
13225 Initialize => $Space,
13227 $XPerlSpace->add_alias('SpacePerl'); # A pre-existing synonym
13228 my $PerlSpace = $perl->add_match_table('PerlSpace',
13229 Description => '\s, restricted to ASCII = [ \f\n\r\t] plus vertical tab',
13230 Initialize => $XPerlSpace & $ASCII,
13234 my $Cntrl = $perl->add_match_table('Cntrl',
13235 Description => 'Control characters');
13236 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
13237 $Cntrl->add_alias('XPosixCntrl');
13238 $perl->add_match_table("PosixCntrl",
13239 Description => "ASCII control characters: NUL, SOH, STX, ETX, EOT, ENQ, ACK, BEL, BS, HT, LF, VT, FF, CR, SO, SI, DLE, DC1, DC2, DC3, DC4, NAK, SYN, ETB, CAN, EOM, SUB, ESC, FS, GS, RS, US, and DEL",
13240 Initialize => $Cntrl & $ASCII,
13243 # $controls is a temporary used to construct Graph.
13244 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
13245 + $gc->table('Control'));
13246 # Cs not in release 1
13247 $controls += $gc->table('Surrogate') if defined $gc->table('Surrogate');
13249 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
13250 my $Graph = $perl->add_match_table('Graph',
13251 Description => 'Characters that are graphical',
13252 Initialize => ~ ($Space + $controls),
13254 $Graph->add_alias('XPosixGraph');
13255 $perl->add_match_table("PosixGraph",
13257 '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~0-9A-Za-z]',
13258 Initialize => $Graph & $ASCII,
13261 $print = $perl->add_match_table('Print',
13262 Description => 'Characters that are graphical plus space characters (but no controls)',
13263 Initialize => $Blank + $Graph - $gc->table('Control'),
13265 $print->add_alias('XPosixPrint');
13266 $perl->add_match_table("PosixPrint",
13268 '[- 0-9A-Za-z!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
13269 Initialize => $print & $ASCII,
13272 my $Punct = $perl->add_match_table('Punct');
13273 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
13275 # \p{punct} doesn't include the symbols, which posix does
13276 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
13277 Description => '\p{Punct} + ASCII-range \p{Symbol}',
13278 Initialize => $gc->table('Punctuation')
13279 + ($ASCII & $gc->table('Symbol')),
13280 Perl_Extension => 1
13282 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
13283 Description => '[-!"#$%&\'()*+,./:;<>?@[\\\]^_`{|}~]',
13284 Initialize => $ASCII & $XPosixPunct,
13287 my $Digit = $perl->add_match_table('Digit',
13288 Description => '[0-9] + all other decimal digits');
13289 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
13290 $Digit->add_alias('XPosixDigit');
13291 my $PosixDigit = $perl->add_match_table("PosixDigit",
13292 Description => '[0-9]',
13293 Initialize => $Digit & $ASCII,
13296 # Hex_Digit was not present in first release
13297 my $Xdigit = $perl->add_match_table('XDigit');
13298 $Xdigit->add_alias('XPosixXDigit');
13299 my $Hex = property_ref('Hex_Digit');
13300 if (defined $Hex && ! $Hex->is_empty) {
13301 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
13304 # (Have to use hex instead of e.g. '0', because could be running on an
13305 # non-ASCII machine, and we want the Unicode (ASCII) values)
13306 $Xdigit->initialize([ 0x30..0x39, 0x41..0x46, 0x61..0x66,
13307 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
13308 $Xdigit->add_description('[0-9A-Fa-f] and corresponding fullwidth versions, like U+FF10: FULLWIDTH DIGIT ZERO');
13311 # AHex was not present in early releases
13312 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
13313 my $AHex = property_ref('ASCII_Hex_Digit');
13314 if (defined $AHex && ! $AHex->is_empty) {
13315 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
13318 $PosixXDigit->initialize($Xdigit & $ASCII);
13319 $PosixXDigit->add_alias('AHex');
13320 $PosixXDigit->add_alias('Ascii_Hex_Digit');
13322 $PosixXDigit->add_description('[0-9A-Fa-f]');
13324 my $dt = property_ref('Decomposition_Type');
13325 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
13326 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
13327 Perl_Extension => 1,
13328 Note => 'Union of all non-canonical decompositions',
13331 # _CanonDCIJ is equivalent to Soft_Dotted, but if on a release earlier
13332 # than SD appeared, construct it ourselves, based on the first release SD
13333 # was in. A pod entry is grandfathered in for it
13334 my $CanonDCIJ = $perl->add_match_table('_CanonDCIJ', Re_Pod_Entry => 1,
13335 Perl_Extension => 1,
13336 Fate => $INTERNAL_ONLY,
13337 Status => $DISCOURAGED);
13338 my $soft_dotted = property_ref('Soft_Dotted');
13339 if (defined $soft_dotted && ! $soft_dotted->is_empty) {
13340 $CanonDCIJ->set_equivalent_to($soft_dotted->table('Y'), Related => 1);
13344 # This list came from 3.2 Soft_Dotted; all of these code points are in
13346 $CanonDCIJ->initialize([ 0x0069,
13355 $CanonDCIJ = $CanonDCIJ & $Assigned;
13358 # For backward compatibility, Perl has its own definition for IDStart.
13359 # It is regular XID_Start plus the underscore, but all characters must be
13360 # Word characters as well
13361 my $XID_Start = property_ref('XID_Start');
13362 my $perl_xids = $perl->add_match_table('_Perl_IDStart',
13363 Perl_Extension => 1,
13364 Fate => $INTERNAL_ONLY,
13365 Initialize => ord('_')
13367 if (defined $XID_Start
13368 || defined ($XID_Start = property_ref('ID_Start')))
13370 $perl_xids += $XID_Start->table('Y');
13373 # For Unicode versions that don't have the property, construct our own
13374 # from first principles. The actual definition is:
13376 # + letter numbers (Nl)
13378 # - Pattern_White_Space
13379 # + stability extensions
13380 # - NKFC modifications
13382 # What we do in the code below is to include the identical code points
13383 # that are in the first release that had Unicode's version of this
13384 # property, essentially extrapolating backwards. There were no
13385 # stability extensions until v4.1, so none are included; likewise in
13386 # no Unicode version so far do subtracting PatSyn and PatWS make any
13387 # difference, so those also are ignored.
13388 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
13390 # We do subtract the NFKC modifications that are in the first version
13391 # that had this property. We don't bother to test if they are in the
13392 # version in question, because if they aren't, the operation is a
13393 # no-op. The NKFC modifications are discussed in
13394 # http://www.unicode.org/reports/tr31/#NFKC_Modifications
13395 foreach my $range ( 0x037A,
13398 [ 0xFC5E, 0xFC63 ],
13399 [ 0xFDFA, 0xFE70 ],
13400 [ 0xFE72, 0xFE76 ],
13405 [ 0xFF9E, 0xFF9F ],
13408 $perl_xids->delete_range($range->[0], $range->[1]);
13411 $perl_xids->delete_range($range, $range);
13416 $perl_xids &= $Word;
13418 my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
13419 Perl_Extension => 1,
13420 Fate => $INTERNAL_ONLY);
13421 my $XIDC = property_ref('XID_Continue');
13423 || defined ($XIDC = property_ref('ID_Continue')))
13425 $perl_xidc += $XIDC->table('Y');
13428 # Similarly, we construct our own XIDC if necessary for early Unicode
13429 # versions. The definition is:
13430 # everything in XIDS
13436 # - Pattern_White_Space
13437 # + stability extensions
13438 # - NFKC modifications
13440 # The same thing applies to this as with XIDS for the PatSyn, PatWS,
13441 # and stability extensions. There is a somewhat different set of NFKC
13442 # mods to remove (and add in this case). The ones below make this
13443 # have identical code points as in the first release that defined it.
13444 $perl_xidc += $perl_xids
13451 if (defined (my $pc = $gc->table('Pc'))) {
13454 else { # 1.1.5 didn't have Pc, but these should have been in it
13455 $perl_xidc += 0xFF3F;
13456 $perl_xidc->add_range(0x203F, 0x2040);
13457 $perl_xidc->add_range(0xFE33, 0xFE34);
13458 $perl_xidc->add_range(0xFE4D, 0xFE4F);
13461 # Subtract the NFKC mods
13462 foreach my $range ( 0x037A,
13463 [ 0xFC5E, 0xFC63 ],
13464 [ 0xFDFA, 0xFE1F ],
13466 [ 0xFE72, 0xFE76 ],
13473 $perl_xidc->delete_range($range->[0], $range->[1]);
13476 $perl_xidc->delete_range($range, $range);
13481 $perl_xidc &= $Word;
13483 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
13484 Perl_Extension => 1,
13485 Fate => $INTERNAL_ONLY,
13486 Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
13489 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
13490 Perl_Extension => 1,
13491 Fate => $INTERNAL_ONLY,
13492 Initialize => $perl_xidc
13500 # These two tables are for matching \X, which is based on the 'extended'
13501 # grapheme cluster, which came in 5.1; create empty ones if not already
13502 # present. The straight 'grapheme cluster' (non-extended) is used prior
13503 # to 5.1, and differs from the extended (see
13504 # http://www.unicode.org/reports/tr29/) only by these two tables, so we
13505 # get the older definition automatically when they are empty.
13506 my $gcb = property_ref('Grapheme_Cluster_Break');
13507 my $perl_prepend = $perl->add_match_table('_X_GCB_Prepend',
13508 Perl_Extension => 1,
13509 Fate => $INTERNAL_ONLY);
13510 if (defined (my $gcb_prepend = $gcb->table('Prepend'))) {
13511 $perl_prepend->set_equivalent_to($gcb_prepend, Related => 1);
13514 push @tables_that_may_be_empty, $perl_prepend->complete_name;
13517 # All the tables with _X_ in their names are used in defining \X handling,
13518 # and are based on the Unicode GCB property. Basically, \X matches:
13520 # | Prepend* Begin Extend*
13522 # Begin is: ( Special_Begin | ! Control )
13523 # Begin is also: ( Regular_Begin | Special_Begin )
13524 # where Regular_Begin is defined as ( ! Control - Special_Begin )
13525 # Special_Begin is: ( Regional-Indicator+ | Hangul-syllable )
13526 # Extend is: ( Grapheme_Extend | Spacing_Mark )
13527 # Control is: [ GCB_Control | CR | LF ]
13528 # Hangul-syllable is: ( T+ | ( L* ( L | ( LVT | ( V | LV ) V* ) T* ) ))
13530 foreach my $gcb_name (qw{ L V T LV LVT }) {
13532 # The perl internal extension's name is the gcb table name prepended
13534 my $perl_table = $perl->add_match_table('_X_GCB_' . $gcb_name,
13535 Perl_Extension => 1,
13536 Fate => $INTERNAL_ONLY,
13537 Initialize => $gcb->table($gcb_name),
13539 # Version 1 had mostly different Hangul syllables that were removed
13540 # from later versions, so some of the tables may not apply.
13541 if ($v_version lt v2.0) {
13542 push @tables_that_may_be_empty, $perl_table->complete_name;
13546 # More GCB. Populate a combined hangul syllables table
13547 my $lv_lvt_v = $perl->add_match_table('_X_LV_LVT_V',
13548 Perl_Extension => 1,
13549 Fate => $INTERNAL_ONLY);
13550 $lv_lvt_v += $gcb->table('LV') + $gcb->table('LVT') + $gcb->table('V');
13551 $lv_lvt_v->add_comment('For use in \X; matches: gcb=LV | gcb=LVT | gcb=V');
13553 my $ri = $perl->add_match_table('_X_RI', Perl_Extension => 1,
13554 Fate => $INTERNAL_ONLY);
13555 if ($v_version ge v6.2) {
13556 $ri += $gcb->table('RI');
13559 push @tables_that_may_be_empty, $ri->full_name;
13562 my $specials_begin = $perl->add_match_table('_X_Special_Begin_Start',
13563 Perl_Extension => 1,
13564 Fate => $INTERNAL_ONLY,
13565 Initialize => $lv_lvt_v
13570 $specials_begin->add_comment(join_lines( <<END
13571 For use in \\X; matches first (perhaps only) character of potential
13572 multi-character sequences that can begin an extended grapheme cluster. They
13573 need special handling because of their complicated nature.
13576 my $regular_begin = $perl->add_match_table('_X_Regular_Begin',
13577 Perl_Extension => 1,
13578 Fate => $INTERNAL_ONLY,
13579 Initialize => ~ $gcb->table('Control')
13581 - $gcb->table('CR')
13582 - $gcb->table('LF')
13584 $regular_begin->add_comment(join_lines( <<END
13585 For use in \\X; matches first character of anything that can begin an extended
13586 grapheme cluster, except those that require special handling.
13590 my $extend = $perl->add_match_table('_X_Extend', Perl_Extension => 1,
13591 Fate => $INTERNAL_ONLY,
13592 Initialize => $gcb->table('Extend')
13594 if (defined (my $sm = $gcb->table('SpacingMark'))) {
13597 $extend->add_comment('For use in \X; matches: Extend | SpacingMark');
13599 # End of GCB \X processing
13601 my @composition = ('Name', 'Unicode_1_Name', 'Name_Alias');
13603 if (@named_sequences) {
13604 push @composition, 'Named_Sequence';
13605 foreach my $sequence (@named_sequences) {
13606 $perl_charname->add_anomalous_entry($sequence);
13610 my $alias_sentence = "";
13612 my $alias = property_ref('Name_Alias');
13613 $perl_charname->set_proxy_for('Name_Alias');
13615 # Add each entry in Name_Alias to Perl_Charnames. Where these go with
13616 # respect to any existing entry depends on the entry type. Corrections go
13617 # before said entry, as they should be returned in preference over the
13618 # existing entry. (A correction to a correction should be later in the
13619 # Name_Alias table, so it will correctly precede the erroneous correction
13620 # in Perl_Charnames.)
13622 # Abbreviations go after everything else, so they are saved temporarily in
13623 # a hash for later.
13625 # Everything else is added added afterwards, which preserves the input
13628 foreach my $range ($alias->ranges) {
13629 next if $range->value eq "";
13630 my $code_point = $range->start;
13631 if ($code_point != $range->end) {
13632 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
13634 my ($value, $type) = split ': ', $range->value;
13636 if ($type eq 'correction') {
13637 $replace_type = $MULTIPLE_BEFORE;
13639 elsif ($type eq 'abbreviation') {
13642 $abbreviations{$value} = $code_point;
13646 $replace_type = $MULTIPLE_AFTER;
13649 # Actually add; before or after current entry(ies) as determined
13652 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
13654 $alias_sentence = <<END;
13655 The Name_Alias property adds duplicate code point entries that are
13656 alternatives to the original name. If an addition is a corrected
13657 name, it will be physically first in the table. The original (less correct,
13658 but still valid) name will be next; then any alternatives, in no particular
13659 order; and finally any abbreviations, again in no particular order.
13662 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
13663 # precedence before 6.1, so should be first in the file; the other names
13664 # have precedence starting in 6.1,
13665 my $before_or_after = ($v_version lt v6.1.0)
13669 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
13670 my $code_point = $range->start;
13671 my $unicode_1_value = $range->value;
13672 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
13674 if ($code_point != $range->end) {
13675 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
13678 # To handle EBCDIC, we don't hard code in the code points of the
13679 # controls; instead realizing that all of them are below 256.
13680 last if $code_point > 255;
13682 # We only add in the controls.
13683 next if $gc->value_of($code_point) ne 'Cc';
13685 # We reject this Unicode1 name for later Perls, as it is used for
13686 # another code point
13687 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
13689 # This won't add an exact duplicate.
13690 $perl_charname->add_duplicate($code_point, $unicode_1_value,
13691 Replace => $before_or_after);
13694 # But in this version only, the ALERT has precedence over BELL, the
13695 # Unicode_1_Name that would otherwise have precedence.
13696 if ($v_version eq v6.0.0) {
13697 $perl_charname->add_duplicate(7, 'ALERT', Replace => $MULTIPLE_BEFORE);
13700 # Now that have everything added, add in abbreviations after
13701 # everything else. Sort so results don't change between runs of this
13703 foreach my $value (sort keys %abbreviations) {
13704 $perl_charname->add_duplicate($abbreviations{$value}, $value,
13705 Replace => $MULTIPLE_AFTER);
13709 if (@composition <= 2) { # Always at least 2
13710 $comment = join " and ", @composition;
13713 $comment = join ", ", @composition[0 .. scalar @composition - 2];
13714 $comment .= ", and $composition[-1]";
13717 $perl_charname->add_comment(join_lines( <<END
13718 This file is for charnames.pm. It is the union of the $comment properties.
13719 Unicode_1_Name entries are used only for nameless code points in the Name
13722 This file doesn't include the algorithmically determinable names. For those,
13723 use 'unicore/Name.pm'
13726 property_ref('Name')->add_comment(join_lines( <<END
13727 This file doesn't include the algorithmically determinable names. For those,
13728 use 'unicore/Name.pm'
13732 # Construct the Present_In property from the Age property.
13733 if (-e 'DAge.txt' && defined (my $age = property_ref('Age'))) {
13734 my $default_map = $age->default_map;
13735 my $in = Property->new('In',
13736 Default_Map => $default_map,
13737 Full_Name => "Present_In",
13738 Perl_Extension => 1,
13740 Initialize => $age,
13742 $in->add_comment(join_lines(<<END
13743 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
13744 same as for $age, and not for what $in really means. This is because anything
13745 defined in a given release should have multiple values: that release and all
13746 higher ones. But only one value per code point can be represented in a table
13751 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
13752 # lowest numbered (earliest) come first, with the non-numeric one
13754 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
13756 : ($b->name !~ /^[\d.]*$/)
13758 : $a->name <=> $b->name
13761 # The Present_In property is the cumulative age properties. The first
13762 # one hence is identical to the first age one.
13763 my $previous_in = $in->add_match_table($first_age->name);
13764 $previous_in->set_equivalent_to($first_age, Related => 1);
13766 my $description_start = "Code point's usage introduced in version ";
13767 $first_age->add_description($description_start . $first_age->name);
13769 # To construct the accumulated values, for each of the age tables
13770 # starting with the 2nd earliest, merge the earliest with it, to get
13771 # all those code points existing in the 2nd earliest. Repeat merging
13772 # the new 2nd earliest with the 3rd earliest to get all those existing
13773 # in the 3rd earliest, and so on.
13774 foreach my $current_age (@rest_ages) {
13775 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
13777 my $current_in = $in->add_match_table(
13778 $current_age->name,
13779 Initialize => $current_age + $previous_in,
13780 Description => $description_start
13781 . $current_age->name
13784 $previous_in = $current_in;
13786 # Add clarifying material for the corresponding age file. This is
13787 # in part because of the confusing and contradictory information
13788 # given in the Standard's documentation itself, as of 5.2.
13789 $current_age->add_description(
13790 "Code point's usage was introduced in version "
13791 . $current_age->name);
13792 $current_age->add_note("See also $in");
13796 # And finally the code points whose usages have yet to be decided are
13797 # the same in both properties. Note that permanently unassigned code
13798 # points actually have their usage assigned (as being permanently
13799 # unassigned), so that these tables are not the same as gc=cn.
13800 my $unassigned = $in->add_match_table($default_map);
13801 my $age_default = $age->table($default_map);
13802 $age_default->add_description(<<END
13803 Code point's usage has not been assigned in any Unicode release thus far.
13806 $unassigned->set_equivalent_to($age_default, Related => 1);
13809 # See L<perlfunc/quotemeta>
13810 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
13811 Perl_Extension => 1,
13812 Fate => $INTERNAL_ONLY,
13814 # Initialize to what's common in
13815 # all Unicode releases.
13818 + $gc->table('Control')
13821 # In early releases without the proper Unicode properties, just set to \W.
13822 if (! defined (my $patsyn = property_ref('Pattern_Syntax'))
13823 || ! defined (my $patws = property_ref('Pattern_White_Space'))
13824 || ! defined (my $di = property_ref('Default_Ignorable_Code_Point')))
13826 $quotemeta += ~ $Word;
13829 $quotemeta += $patsyn->table('Y')
13830 + $patws->table('Y')
13832 + ((~ $Word) & $ASCII);
13835 # Finished creating all the perl properties. All non-internal non-string
13836 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
13837 # an underscore.) These do not get a separate entry in the pod file
13838 foreach my $table ($perl->tables) {
13839 foreach my $alias ($table->aliases) {
13840 next if $alias->name =~ /^_/;
13841 $table->add_alias('Is_' . $alias->name,
13844 Status => $alias->status,
13845 OK_as_Filename => 0);
13849 # Here done with all the basic stuff. Ready to populate the information
13850 # about each character if annotating them.
13853 # See comments at its declaration
13854 $annotate_ranges = Range_Map->new;
13856 # This separates out the non-characters from the other unassigneds, so
13857 # can give different annotations for each.
13858 $unassigned_sans_noncharacters = Range_List->new(
13859 Initialize => $gc->table('Unassigned'));
13860 if (defined (my $nonchars = property_ref('Noncharacter_Code_Point'))) {
13861 $unassigned_sans_noncharacters &= $nonchars->table('N');
13864 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT; $i++ ) {
13865 $i = populate_char_info($i); # Note sets $i so may cause skips
13872 sub add_perl_synonyms() {
13873 # A number of Unicode tables have Perl synonyms that are expressed in
13874 # the single-form, \p{name}. These are:
13875 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
13876 # \p{Is_Name} as synonyms
13877 # \p{Script=Value} gets \p{Value}, \p{Is_Value} as synonyms
13878 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
13879 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
13880 # conflict, \p{Value} and \p{Is_Value} as well
13882 # This routine generates these synonyms, warning of any unexpected
13885 # Construct the list of tables to get synonyms for. Start with all the
13886 # binary and the General_Category ones.
13887 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
13889 push @tables, $gc->tables;
13891 # If the version of Unicode includes the Script property, add its tables
13892 push @tables, $script->tables if defined $script;
13894 # The Block tables are kept separate because they are treated differently.
13895 # And the earliest versions of Unicode didn't include them, so add only if
13898 push @blocks, $block->tables if defined $block;
13900 # Here, have the lists of tables constructed. Process blocks last so that
13901 # if there are name collisions with them, blocks have lowest priority.
13902 # Should there ever be other collisions, manual intervention would be
13903 # required. See the comments at the beginning of the program for a
13904 # possible way to handle those semi-automatically.
13905 foreach my $table (@tables, @blocks) {
13907 # For non-binary properties, the synonym is just the name of the
13908 # table, like Greek, but for binary properties the synonym is the name
13909 # of the property, and means the code points in its 'Y' table.
13910 my $nominal = $table;
13911 my $nominal_property = $nominal->property;
13913 if (! $nominal->isa('Property')) {
13918 # Here is a binary property. Use the 'Y' table. Verify that is
13920 my $yes = $nominal->table('Y');
13921 unless (defined $yes) { # Must be defined, but is permissible to
13923 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
13929 foreach my $alias ($nominal->aliases) {
13931 # Attempt to create a table in the perl directory for the
13932 # candidate table, using whatever aliases in it that don't
13933 # conflict. Also add non-conflicting aliases for all these
13934 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
13936 foreach my $prefix ("", 'Is_', 'In_') {
13938 # Only Block properties can have added 'In_' aliases.
13939 next if $prefix eq 'In_' and $nominal_property != $block;
13941 my $proposed_name = $prefix . $alias->name;
13943 # No Is_Is, In_In, nor combinations thereof
13944 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
13945 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
13947 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
13949 # Get a reference to any existing table in the perl
13950 # directory with the desired name.
13951 my $pre_existing = $perl->table($proposed_name);
13953 if (! defined $pre_existing) {
13955 # No name collision, so ok to add the perl synonym.
13957 my $make_re_pod_entry;
13958 my $ok_as_filename;
13959 my $status = $alias->status;
13960 if ($nominal_property == $block) {
13962 # For block properties, the 'In' form is preferred for
13963 # external use; the pod file contains wild cards for
13964 # this and the 'Is' form so no entries for those; and
13965 # we don't want people using the name without the
13966 # 'In', so discourage that.
13967 if ($prefix eq "") {
13968 $make_re_pod_entry = 1;
13969 $status = $status || $DISCOURAGED;
13970 $ok_as_filename = 0;
13972 elsif ($prefix eq 'In_') {
13973 $make_re_pod_entry = 0;
13974 $status = $status || $NORMAL;
13975 $ok_as_filename = 1;
13978 $make_re_pod_entry = 0;
13979 $status = $status || $DISCOURAGED;
13980 $ok_as_filename = 0;
13983 elsif ($prefix ne "") {
13985 # The 'Is' prefix is handled in the pod by a wild
13986 # card, and we won't use it for an external name
13987 $make_re_pod_entry = 0;
13988 $status = $status || $NORMAL;
13989 $ok_as_filename = 0;
13993 # Here, is an empty prefix, non block. This gets its
13994 # own pod entry and can be used for an external name.
13995 $make_re_pod_entry = 1;
13996 $status = $status || $NORMAL;
13997 $ok_as_filename = 1;
14000 # Here, there isn't a perl pre-existing table with the
14001 # name. Look through the list of equivalents of this
14002 # table to see if one is a perl table.
14003 foreach my $equivalent ($actual->leader->equivalents) {
14004 next if $equivalent->property != $perl;
14006 # Here, have found a table for $perl. Add this alias
14007 # to it, and are done with this prefix.
14008 $equivalent->add_alias($proposed_name,
14009 Re_Pod_Entry => $make_re_pod_entry,
14011 # Currently don't output these in the
14012 # ucd pod, as are strongly discouraged
14017 OK_as_Filename => $ok_as_filename);
14018 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
14022 # Here, $perl doesn't already have a table that is a
14023 # synonym for this property, add one.
14024 my $added_table = $perl->add_match_table($proposed_name,
14025 Re_Pod_Entry => $make_re_pod_entry,
14027 # See UCD comment just above
14031 OK_as_Filename => $ok_as_filename);
14032 # And it will be related to the actual table, since it is
14034 $added_table->set_equivalent_to($actual, Related => 1);
14035 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
14037 } # End of no pre-existing.
14039 # Here, there is a pre-existing table that has the proposed
14040 # name. We could be in trouble, but not if this is just a
14041 # synonym for another table that we have already made a child
14042 # of the pre-existing one.
14043 if ($pre_existing->is_set_equivalent_to($actual)) {
14044 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
14045 $pre_existing->add_alias($proposed_name);
14049 # Here, there is a name collision, but it still could be ok if
14050 # the tables match the identical set of code points, in which
14051 # case, we can combine the names. Compare each table's code
14052 # point list to see if they are identical.
14053 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
14054 if ($pre_existing->matches_identically_to($actual)) {
14056 # Here, they do match identically. Not a real conflict.
14057 # Make the perl version a child of the Unicode one, except
14058 # in the non-obvious case of where the perl name is
14059 # already a synonym of another Unicode property. (This is
14060 # excluded by the test for it being its own parent.) The
14061 # reason for this exclusion is that then the two Unicode
14062 # properties become related; and we don't really know if
14063 # they are or not. We generate documentation based on
14064 # relatedness, and this would be misleading. Code
14065 # later executed in the process will cause the tables to
14066 # be represented by a single file anyway, without making
14067 # it look in the pod like they are necessarily related.
14068 if ($pre_existing->parent == $pre_existing
14069 && ($pre_existing->property == $perl
14070 || $actual->property == $perl))
14072 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
14073 $pre_existing->set_equivalent_to($actual, Related => 1);
14075 elsif (main::DEBUG && $to_trace) {
14076 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
14077 trace $pre_existing->parent;
14082 # Here they didn't match identically, there is a real conflict
14083 # between our new name and a pre-existing property.
14084 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
14085 $pre_existing->add_conflicting($nominal->full_name,
14089 # Don't output a warning for aliases for the block
14090 # properties (unless they start with 'In_') as it is
14091 # expected that there will be conflicts and the block
14093 if ($verbosity >= $NORMAL_VERBOSITY
14094 && ($actual->property != $block || $prefix eq 'In_'))
14096 print simple_fold(join_lines(<<END
14097 There is already an alias named $proposed_name (from $pre_existing),
14098 so not creating this alias for $actual
14103 # Keep track for documentation purposes.
14104 $has_In_conflicts++ if $prefix eq 'In_';
14105 $has_Is_conflicts++ if $prefix eq 'Is_';
14110 # There are some properties which have No and Yes (and N and Y) as
14111 # property values, but aren't binary, and could possibly be confused with
14112 # binary ones. So create caveats for them. There are tables that are
14113 # named 'No', and tables that are named 'N', but confusion is not likely
14114 # unless they are the same table. For example, N meaning Number or
14115 # Neutral is not likely to cause confusion, so don't add caveats to things
14117 foreach my $property (grep { $_->type != $BINARY
14118 && $_->type != $FORCED_BINARY }
14121 my $yes = $property->table('Yes');
14122 if (defined $yes) {
14123 my $y = $property->table('Y');
14124 if (defined $y && $yes == $y) {
14125 foreach my $alias ($property->aliases) {
14126 $yes->add_conflicting($alias->name);
14130 my $no = $property->table('No');
14132 my $n = $property->table('N');
14133 if (defined $n && $no == $n) {
14134 foreach my $alias ($property->aliases) {
14135 $no->add_conflicting($alias->name, 'P');
14144 sub register_file_for_name($$$) {
14145 # Given info about a table and a datafile that it should be associated
14146 # with, register that association
14149 my $directory_ref = shift; # Array of the directory path for the file
14150 my $file = shift; # The file name in the final directory.
14151 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14153 trace "table=$table, file=$file, directory=@$directory_ref" if main::DEBUG && $to_trace;
14155 if ($table->isa('Property')) {
14156 $table->set_file_path(@$directory_ref, $file);
14157 push @map_properties, $table;
14159 # No swash means don't do the rest of this.
14160 return if $table->fate != $ORDINARY;
14162 # Get the path to the file
14163 my @path = $table->file_path;
14165 # Use just the file name if no subdirectory.
14166 shift @path if $path[0] eq File::Spec->curdir();
14168 my $file = join '/', @path;
14170 # Create a hash entry for utf8_heavy to get the file that stores this
14171 # property's map table
14172 foreach my $alias ($table->aliases) {
14173 my $name = $alias->name;
14174 $loose_property_to_file_of{standardize($name)} = $file;
14177 # And a way for utf8_heavy to find the proper key in the SwashInfo
14178 # hash for this property.
14179 $file_to_swash_name{$file} = "To" . $table->swash_name;
14183 # Do all of the work for all equivalent tables when called with the leader
14184 # table, so skip if isn't the leader.
14185 return if $table->leader != $table;
14187 # If this is a complement of another file, use that other file instead,
14188 # with a ! prepended to it.
14190 if (($complement = $table->complement) != 0) {
14191 my @directories = $complement->file_path;
14193 # This assumes that the 0th element is something like 'lib',
14194 # the 1th element the property name (in its own directory), like
14195 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
14196 # appended to it later.
14197 $directories[1] =~ s/^/!/;
14198 $file = pop @directories;
14199 $directory_ref =\@directories;
14202 # Join all the file path components together, using slashes.
14203 my $full_filename = join('/', @$directory_ref, $file);
14205 # All go in the same subdirectory of unicore
14206 if ($directory_ref->[0] ne $matches_directory) {
14207 Carp::my_carp("Unexpected directory in "
14208 . join('/', @{$directory_ref}, $file));
14211 # For this table and all its equivalents ...
14212 foreach my $table ($table, $table->equivalents) {
14214 # Associate it with its file internally. Don't include the
14215 # $matches_directory first component
14216 $table->set_file_path(@$directory_ref, $file);
14218 # No swash means don't do the rest of this.
14219 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
14221 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
14223 my $property = $table->property;
14224 my $property_name = ($property == $perl)
14225 ? "" # 'perl' is never explicitly stated
14226 : standardize($property->name) . '=';
14228 my $is_default = 0; # Is this table the default one for the property?
14230 # To calculate $is_default, we find if this table is the same as the
14231 # default one for the property. But this is complicated by the
14232 # possibility that there is a master table for this one, and the
14233 # information is stored there instead of here.
14234 my $parent = $table->parent;
14235 my $leader_prop = $parent->property;
14236 my $default_map = $leader_prop->default_map;
14237 if (defined $default_map) {
14238 my $default_table = $leader_prop->table($default_map);
14239 $is_default = 1 if defined $default_table && $parent == $default_table;
14242 # Calculate the loose name for this table. Mostly it's just its name,
14243 # standardized. But in the case of Perl tables that are single-form
14244 # equivalents to Unicode properties, it is the latter's name.
14245 my $loose_table_name =
14246 ($property != $perl || $leader_prop == $perl)
14247 ? standardize($table->name)
14248 : standardize($parent->name);
14250 my $deprecated = ($table->status eq $DEPRECATED)
14251 ? $table->status_info
14253 my $caseless_equivalent = $table->caseless_equivalent;
14255 # And for each of the table's aliases... This inner loop eventually
14256 # goes through all aliases in the UCD that we generate regex match
14258 foreach my $alias ($table->aliases) {
14259 my $standard = utf8_heavy_name($table, $alias);
14261 # Generate an entry in either the loose or strict hashes, which
14262 # will translate the property and alias names combination into the
14263 # file where the table for them is stored.
14264 if ($alias->loose_match) {
14265 if (exists $loose_to_file_of{$standard}) {
14266 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
14269 $loose_to_file_of{$standard} = $sub_filename;
14273 if (exists $stricter_to_file_of{$standard}) {
14274 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
14277 $stricter_to_file_of{$standard} = $sub_filename;
14279 # Tightly coupled with how utf8_heavy.pl works, for a
14280 # floating point number that is a whole number, get rid of
14281 # the trailing decimal point and 0's, so that utf8_heavy
14282 # will work. Also note that this assumes that such a
14283 # number is matched strictly; so if that were to change,
14284 # this would be wrong.
14285 if ((my $integer_name = $alias->name)
14286 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
14288 $stricter_to_file_of{$property_name . $integer_name}
14294 # For Unicode::UCD, create a mapping of the prop=value to the
14295 # canonical =value for that property.
14296 if ($standard =~ /=/) {
14298 # This could happen if a strict name mapped into an existing
14299 # loose name. In that event, the strict names would have to
14300 # be moved to a new hash.
14301 if (exists($loose_to_standard_value{$standard})) {
14302 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
14304 $loose_to_standard_value{$standard} = $loose_table_name;
14307 # Keep a list of the deprecated properties and their filenames
14308 if ($deprecated && $complement == 0) {
14309 $utf8::why_deprecated{$sub_filename} = $deprecated;
14312 # And a substitute table, if any, for case-insensitive matching
14313 if ($caseless_equivalent != 0) {
14314 $caseless_equivalent_to{$standard} = $caseless_equivalent;
14317 # Add to defaults list if the table this alias belongs to is the
14319 $loose_defaults{$standard} = 1 if $is_default;
14327 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
14329 my %full_dir_name_of; # Full length names of directories used.
14331 sub construct_filename($$$) {
14332 # Return a file name for a table, based on the table name, but perhaps
14333 # changed to get rid of non-portable characters in it, and to make
14334 # sure that it is unique on a file system that allows the names before
14335 # any period to be at most 8 characters (DOS). While we're at it
14336 # check and complain if there are any directory conflicts.
14338 my $name = shift; # The name to start with
14339 my $mutable = shift; # Boolean: can it be changed? If no, but
14340 # yet it must be to work properly, a warning
14342 my $directories_ref = shift; # A reference to an array containing the
14343 # path to the file, with each element one path
14344 # component. This is used because the same
14345 # name can be used in different directories.
14346 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14348 my $warn = ! defined wantarray; # If true, then if the name is
14349 # changed, a warning is issued as well.
14351 if (! defined $name) {
14352 Carp::my_carp("Undefined name in directory "
14353 . File::Spec->join(@$directories_ref)
14358 # Make sure that no directory names conflict with each other. Look at
14359 # each directory in the input file's path. If it is already in use,
14360 # assume it is correct, and is merely being re-used, but if we
14361 # truncate it to 8 characters, and find that there are two directories
14362 # that are the same for the first 8 characters, but differ after that,
14363 # then that is a problem.
14364 foreach my $directory (@$directories_ref) {
14365 my $short_dir = substr($directory, 0, 8);
14366 if (defined $full_dir_name_of{$short_dir}) {
14367 next if $full_dir_name_of{$short_dir} eq $directory;
14368 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
14371 $full_dir_name_of{$short_dir} = $directory;
14375 my $path = join '/', @$directories_ref;
14376 $path .= '/' if $path;
14378 # Remove interior underscores.
14379 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
14381 # Change any non-word character into an underscore, and truncate to 8.
14382 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
14383 substr($filename, 8) = "" if length($filename) > 8;
14385 # Make sure the basename doesn't conflict with something we
14386 # might have already written. If we have, say,
14393 while (my $num = $base_names{$path}{lc $filename}++) {
14394 $num++; # so basenames with numbers start with '2', which
14395 # just looks more natural.
14397 # Want to append $num, but if it'll make the basename longer
14398 # than 8 characters, pre-truncate $filename so that the result
14400 my $delta = length($filename) + length($num) - 8;
14402 substr($filename, -$delta) = $num;
14407 if ($warn && ! $warned) {
14409 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
14413 return $filename if $mutable;
14415 # If not changeable, must return the input name, but warn if needed to
14416 # change it beyond shortening it.
14417 if ($name ne $filename
14418 && substr($name, 0, length($filename)) ne $filename) {
14419 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
14425 # The pod file contains a very large table. Many of the lines in that table
14426 # would exceed a typical output window's size, and so need to be wrapped with
14427 # a hanging indent to make them look good. The pod language is really
14428 # insufficient here. There is no general construct to do that in pod, so it
14429 # is done here by beginning each such line with a space to cause the result to
14430 # be output without formatting, and doing all the formatting here. This leads
14431 # to the result that if the eventual display window is too narrow it won't
14432 # look good, and if the window is too wide, no advantage is taken of that
14433 # extra width. A further complication is that the output may be indented by
14434 # the formatter so that there is less space than expected. What I (khw) have
14435 # done is to assume that that indent is a particular number of spaces based on
14436 # what it is in my Linux system; people can always resize their windows if
14437 # necessary, but this is obviously less than desirable, but the best that can
14439 my $automatic_pod_indent = 8;
14441 # Try to format so that uses fewest lines, but few long left column entries
14442 # slide into the right column. An experiment on 5.1 data yielded the
14443 # following percentages that didn't cut into the other side along with the
14444 # associated first-column widths
14446 # 80% not too bad except for a few blocks
14447 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
14449 my $indent_info_column = 27; # 75% of lines didn't have overlap
14451 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
14452 # The 3 is because of:
14453 # 1 for the leading space to tell the pod formatter to
14456 # 1 for the space between the flag and the main data
14458 sub format_pod_line ($$$;$$) {
14459 # Take a pod line and return it, formatted properly
14461 my $first_column_width = shift;
14462 my $entry = shift; # Contents of left column
14463 my $info = shift; # Contents of right column
14465 my $status = shift || ""; # Any flag
14467 my $loose_match = shift; # Boolean.
14468 $loose_match = 1 unless defined $loose_match;
14470 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14473 $flags .= $STRICTER if ! $loose_match;
14475 $flags .= $status if $status;
14477 # There is a blank in the left column to cause the pod formatter to
14478 # output the line as-is.
14479 return sprintf " %-*s%-*s %s\n",
14480 # The first * in the format is replaced by this, the -1 is
14481 # to account for the leading blank. There isn't a
14482 # hard-coded blank after this to separate the flags from
14483 # the rest of the line, so that in the unlikely event that
14484 # multiple flags are shown on the same line, they both
14485 # will get displayed at the expense of that separation,
14486 # but since they are left justified, a blank will be
14487 # inserted in the normal case.
14491 # The other * in the format is replaced by this number to
14492 # cause the first main column to right fill with blanks.
14493 # The -1 is for the guaranteed blank following it.
14494 $first_column_width - $FILLER - 1,
14499 my @zero_match_tables; # List of tables that have no matches in this release
14501 sub make_re_pod_entries($) {
14502 # This generates the entries for the pod file for a given table.
14503 # Also done at this time are any children tables. The output looks like:
14504 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
14506 my $input_table = shift; # Table the entry is for
14507 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
14509 # Generate parent and all its children at the same time.
14510 return if $input_table->parent != $input_table;
14512 my $property = $input_table->property;
14513 my $type = $property->type;
14514 my $full_name = $property->full_name;
14516 my $count = $input_table->count;
14517 my $string_count = clarify_number($count);
14518 my $status = $input_table->status;
14519 my $status_info = $input_table->status_info;
14520 my $caseless_equivalent = $input_table->caseless_equivalent;
14522 # Don't mention a placeholder equivalent as it isn't to be listed in the
14524 $caseless_equivalent = 0 if $caseless_equivalent != 0
14525 && $caseless_equivalent->fate > $ORDINARY;
14527 my $entry_for_first_table; # The entry for the first table output.
14528 # Almost certainly, it is the parent.
14530 # For each related table (including itself), we will generate a pod entry
14531 # for each name each table goes by
14532 foreach my $table ($input_table, $input_table->children) {
14534 # utf8_heavy.pl cannot deal with null string property values, so skip
14535 # any tables that have no non-null names.
14536 next if ! grep { $_->name ne "" } $table->aliases;
14538 # First, gather all the info that applies to this table as a whole.
14540 push @zero_match_tables, $table if $count == 0
14541 # Don't mention special tables
14542 # as being zero length
14543 && $table->fate == $ORDINARY;
14545 my $table_property = $table->property;
14547 # The short name has all the underscores removed, while the full name
14548 # retains them. Later, we decide whether to output a short synonym
14549 # for the full one, we need to compare apples to apples, so we use the
14550 # short name's length including underscores.
14551 my $table_property_short_name_length;
14552 my $table_property_short_name
14553 = $table_property->short_name(\$table_property_short_name_length);
14554 my $table_property_full_name = $table_property->full_name;
14556 # Get how much savings there is in the short name over the full one
14557 # (delta will always be <= 0)
14558 my $table_property_short_delta = $table_property_short_name_length
14559 - length($table_property_full_name);
14560 my @table_description = $table->description;
14561 my @table_note = $table->note;
14563 # Generate an entry for each alias in this table.
14564 my $entry_for_first_alias; # saves the first one encountered.
14565 foreach my $alias ($table->aliases) {
14567 # Skip if not to go in pod.
14568 next unless $alias->make_re_pod_entry;
14570 # Start gathering all the components for the entry
14571 my $name = $alias->name;
14573 # Skip if name is empty, as can't be accessed by regexes.
14574 next if $name eq "";
14576 my $entry; # Holds the left column, may include extras
14577 my $entry_ref; # To refer to the left column's contents from
14578 # another entry; has no extras
14580 # First the left column of the pod entry. Tables for the $perl
14581 # property always use the single form.
14582 if ($table_property == $perl) {
14583 $entry = "\\p{$name}";
14584 $entry_ref = "\\p{$name}";
14586 else { # Compound form.
14588 # Only generate one entry for all the aliases that mean true
14589 # or false in binary properties. Append a '*' to indicate
14590 # some are missing. (The heading comment notes this.)
14592 if ($type == $BINARY) {
14593 next if $name ne 'N' && $name ne 'Y';
14596 elsif ($type != $FORCED_BINARY) {
14601 # Forced binary properties require special handling. It
14602 # has two sets of tables, one set is true/false; and the
14603 # other set is everything else. Entries are generated for
14604 # each set. Use the Bidi_Mirrored property (which appears
14605 # in all Unicode versions) to get a list of the aliases
14606 # for the true/false tables. Of these, only output the N
14607 # and Y ones, the same as, a regular binary property. And
14608 # output all the rest, same as a non-binary property.
14609 my $bm = property_ref("Bidi_Mirrored");
14610 if ($name eq 'N' || $name eq 'Y') {
14612 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
14613 $bm->table("N")->aliases)
14622 # Colon-space is used to give a little more space to be easier
14625 . $table_property_full_name
14628 # But for the reference to this entry, which will go in the
14629 # right column, where space is at a premium, use equals
14631 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
14634 # Then the right (info) column. This is stored as components of
14635 # an array for the moment, then joined into a string later. For
14636 # non-internal only properties, begin the info with the entry for
14637 # the first table we encountered (if any), as things are ordered
14638 # so that that one is the most descriptive. This leads to the
14639 # info column of an entry being a more descriptive version of the
14642 if ($name =~ /^_/) {
14644 '(For internal use by Perl, not necessarily stable)';
14646 elsif ($entry_for_first_alias) {
14647 push @info, $entry_for_first_alias;
14650 # If this entry is equivalent to another, add that to the info,
14651 # using the first such table we encountered
14652 if ($entry_for_first_table) {
14654 push @info, "(= $entry_for_first_table)";
14657 push @info, $entry_for_first_table;
14661 # If the name is a large integer, add an equivalent with an
14662 # exponent for better readability
14663 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
14664 push @info, sprintf "(= %.1e)", $name
14667 my $parenthesized = "";
14668 if (! $entry_for_first_alias) {
14670 # This is the first alias for the current table. The alias
14671 # array is ordered so that this is the fullest, most
14672 # descriptive alias, so it gets the fullest info. The other
14673 # aliases are mostly merely pointers to this one, using the
14674 # information already added above.
14676 # Display any status message, but only on the parent table
14677 if ($status && ! $entry_for_first_table) {
14678 push @info, $status_info;
14681 # Put out any descriptive info
14682 if (@table_description || @table_note) {
14683 push @info, join "; ", @table_description, @table_note;
14686 # Look to see if there is a shorter name we can point people
14688 my $standard_name = standardize($name);
14690 my $proposed_short = $table->short_name;
14691 if (defined $proposed_short) {
14692 my $standard_short = standardize($proposed_short);
14694 # If the short name is shorter than the standard one, or
14695 # even it it's not, but the combination of it and its
14696 # short property name (as in \p{prop=short} ($perl doesn't
14697 # have this form)) saves at least two characters, then,
14698 # cause it to be listed as a shorter synonym.
14699 if (length $standard_short < length $standard_name
14700 || ($table_property != $perl
14701 && (length($standard_short)
14702 - length($standard_name)
14703 + $table_property_short_delta) # (<= 0)
14706 $short_name = $proposed_short;
14707 if ($table_property != $perl) {
14708 $short_name = $table_property_short_name
14711 $short_name = "\\p{$short_name}";
14715 # And if this is a compound form name, see if there is a
14716 # single form equivalent
14718 if ($table_property != $perl) {
14720 # Special case the binary N tables, so that will print
14721 # \P{single}, but use the Y table values to populate
14722 # 'single', as we haven't likewise populated the N table.
14723 # For forced binary tables, we can't just look at the N
14724 # table, but must see if this table is equivalent to the N
14725 # one, as there are two equivalent beasts in these
14729 if ( ($type == $BINARY
14730 && $input_table == $property->table('No'))
14731 || ($type == $FORCED_BINARY
14732 && $property->table('No')->
14733 is_set_equivalent_to($input_table)))
14735 $test_table = $property->table('Yes');
14739 $test_table = $input_table;
14743 # Look for a single form amongst all the children.
14744 foreach my $table ($test_table->children) {
14745 next if $table->property != $perl;
14746 my $proposed_name = $table->short_name;
14747 next if ! defined $proposed_name;
14749 # Don't mention internal-only properties as a possible
14750 # single form synonym
14751 next if substr($proposed_name, 0, 1) eq '_';
14753 $proposed_name = "\\$p\{$proposed_name}";
14754 if (! defined $single_form
14755 || length($proposed_name) < length $single_form)
14757 $single_form = $proposed_name;
14759 # The goal here is to find a single form; not the
14760 # shortest possible one. We've already found a
14761 # short name. So, stop at the first single form
14762 # found, which is likely to be closer to the
14769 # Ouput both short and single in the same parenthesized
14770 # expression, but with only one of 'Single', 'Short' if there
14772 if ($short_name || $single_form || $table->conflicting) {
14773 $parenthesized .= "Short: $short_name" if $short_name;
14774 if ($short_name && $single_form) {
14775 $parenthesized .= ', ';
14777 elsif ($single_form) {
14778 $parenthesized .= 'Single: ';
14780 $parenthesized .= $single_form if $single_form;
14784 if ($caseless_equivalent != 0) {
14785 $parenthesized .= '; ' if $parenthesized ne "";
14786 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
14790 # Warn if this property isn't the same as one that a
14791 # semi-casual user might expect. The other components of this
14792 # parenthesized structure are calculated only for the first entry
14793 # for this table, but the conflicting is deemed important enough
14794 # to go on every entry.
14795 my $conflicting = join " NOR ", $table->conflicting;
14796 if ($conflicting) {
14797 $parenthesized .= '; ' if $parenthesized ne "";
14798 $parenthesized .= "NOT $conflicting";
14801 push @info, "($parenthesized)" if $parenthesized;
14803 if ($name =~ /_$/ && $alias->loose_match) {
14804 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
14807 if ($table_property != $perl && $table->perl_extension) {
14808 push @info, '(Perl extension)';
14810 push @info, "($string_count)";
14812 # Now, we have both the entry and info so add them to the
14813 # list of all the properties.
14814 push @match_properties,
14815 format_pod_line($indent_info_column,
14819 $alias->loose_match);
14821 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
14822 } # End of looping through the aliases for this table.
14824 if (! $entry_for_first_table) {
14825 $entry_for_first_table = $entry_for_first_alias;
14827 } # End of looping through all the related tables
14831 sub make_ucd_table_pod_entries {
14834 # Generate the entries for the UCD section of the pod for $table. This
14835 # also calculates if names are ambiguous, so has to be called even if the
14836 # pod is not being output
14838 my $short_name = $table->name;
14839 my $standard_short_name = standardize($short_name);
14840 my $full_name = $table->full_name;
14841 my $standard_full_name = standardize($full_name);
14843 my $full_info = ""; # Text of info column for full-name entries
14844 my $other_info = ""; # Text of info column for short-name entries
14845 my $short_info = ""; # Text of info column for other entries
14846 my $meaning = ""; # Synonym of this table
14848 my $property = ($table->isa('Property'))
14850 : $table->parent->property;
14852 my $perl_extension = $table->perl_extension;
14854 # Get the more official name for for perl extensions that aren't
14855 # stand-alone properties
14856 if ($perl_extension && $property != $table) {
14857 if ($property == $perl ||$property->type == $BINARY) {
14858 $meaning = $table->complete_name;
14861 $meaning = $property->full_name . "=$full_name";
14865 # There are three types of info column. One for the short name, one for
14866 # the full name, and one for everything else. They mostly are the same,
14867 # so initialize in the same loop.
14868 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
14869 if ($perl_extension && $property != $table) {
14871 # Add the synonymous name for the non-full name entries; and to
14872 # the full-name entry if it adds extra information
14873 if ($info_ref == \$other_info
14874 || ($info_ref == \$short_info
14875 && $standard_short_name ne $standard_full_name)
14876 || standardize($meaning) ne $standard_full_name
14878 $$info_ref .= "$meaning.";
14881 elsif ($info_ref != \$full_info) {
14883 # Otherwise, the non-full name columns include the full name
14884 $$info_ref .= $full_name;
14887 # And the full-name entry includes the short name, if different
14888 if ($info_ref == \$full_info
14889 && $standard_short_name ne $standard_full_name)
14891 $full_info =~ s/\.\Z//;
14892 $full_info .= " " if $full_info;
14893 $full_info .= "(Short: $short_name)";
14896 if ($table->perl_extension) {
14897 $$info_ref =~ s/\.\Z//;
14898 $$info_ref .= ". " if $$info_ref;
14899 $$info_ref .= "(Perl extension)";
14903 # Add any extra annotations to the full name entry
14904 foreach my $more_info ($table->description,
14906 $table->status_info)
14908 next unless $more_info;
14909 $full_info =~ s/\.\Z//;
14910 $full_info .= ". " if $full_info;
14911 $full_info .= $more_info;
14914 # These keep track if have created full and short name pod entries for the
14917 my $done_short = 0;
14919 # Every possible name is kept track of, even those that aren't going to be
14920 # output. This way we can be sure to find the ambiguities.
14921 foreach my $alias ($table->aliases) {
14922 my $name = $alias->name;
14923 my $standard = standardize($name);
14925 my $output_this = $alias->ucd;
14927 # If the full and short names are the same, we want to output the full
14928 # one's entry, so it has priority.
14929 if ($standard eq $standard_full_name) {
14930 next if $done_full;
14932 $info = $full_info;
14934 elsif ($standard eq $standard_short_name) {
14935 next if $done_short;
14937 next if $standard_short_name eq $standard_full_name;
14938 $info = $short_info;
14941 $info = $other_info;
14944 # Here, we have set up the two columns for this entry. But if an
14945 # entry already exists for this name, we have to decide which one
14946 # we're going to later output.
14947 if (exists $ucd_pod{$standard}) {
14949 # If the two entries refer to the same property, it's not going to
14950 # be ambiguous. (Likely it's because the names when standardized
14951 # are the same.) But that means if they are different properties,
14952 # there is ambiguity.
14953 if ($ucd_pod{$standard}->{'property'} != $property) {
14955 # Here, we have an ambiguity. This code assumes that one is
14956 # scheduled to be output and one not and that one is a perl
14957 # extension (which is not to be output) and the other isn't.
14958 # If those assumptions are wrong, things have to be rethought.
14959 if ($ucd_pod{$standard}{'output_this'} == $output_this
14960 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
14961 || $output_this == $perl_extension)
14963 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
14966 # We modifiy the info column of the one being output to
14967 # indicate the ambiguity. Set $which to point to that one's
14970 if ($ucd_pod{$standard}{'output_this'}) {
14971 $which = \$ucd_pod{$standard}->{'info'};
14975 $meaning = $ucd_pod{$standard}{'meaning'};
14979 $$which =~ s/\.\Z//;
14980 $$which .= "; NOT '$standard' meaning '$meaning'";
14982 $ambiguous_names{$standard} = 1;
14985 # Use the non-perl-extension variant
14986 next unless $ucd_pod{$standard}{'perl_extension'};
14989 # Store enough information about this entry that we can later look for
14990 # ambiguities, and output it properly.
14991 $ucd_pod{$standard} = { 'name' => $name,
14993 'meaning' => $meaning,
14994 'output_this' => $output_this,
14995 'perl_extension' => $perl_extension,
14996 'property' => $property,
14997 'status' => $alias->status,
14999 } # End of looping through all this table's aliases
15004 sub pod_alphanumeric_sort {
15005 # Sort pod entries alphanumerically.
15007 # The first few character columns are filler, plus the '\p{'; and get rid
15008 # of all the trailing stuff, starting with the trailing '}', so as to sort
15009 # on just 'Name=Value'
15010 (my $a = lc $a) =~ s/^ .*? { //x;
15012 (my $b = lc $b) =~ s/^ .*? { //x;
15015 # Determine if the two operands are both internal only or both not.
15016 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
15017 # should be the underscore that begins internal only
15018 my $a_is_internal = (substr($a, 0, 1) eq '_');
15019 my $b_is_internal = (substr($b, 0, 1) eq '_');
15021 # Sort so the internals come last in the table instead of first (which the
15022 # leading underscore would otherwise indicate).
15023 if ($a_is_internal != $b_is_internal) {
15024 return 1 if $a_is_internal;
15028 # Determine if the two operands are numeric property values or not.
15029 # A numeric property will look like xyz: 3. But the number
15030 # can begin with an optional minus sign, and may have a
15031 # fraction or rational component, like xyz: 3/2. If either
15032 # isn't numeric, use alphabetic sort.
15033 my ($a_initial, $a_number) =
15034 ($a =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15035 return $a cmp $b unless defined $a_number;
15036 my ($b_initial, $b_number) =
15037 ($b =~ /^ ( [^:=]+ [:=] \s* ) (-? \d+ (?: [.\/] \d+)? )/ix);
15038 return $a cmp $b unless defined $b_number;
15040 # Here they are both numeric, but use alphabetic sort if the
15041 # initial parts don't match
15042 return $a cmp $b if $a_initial ne $b_initial;
15044 # Convert rationals to floating for the comparison.
15045 $a_number = eval $a_number if $a_number =~ qr{/};
15046 $b_number = eval $b_number if $b_number =~ qr{/};
15048 return $a_number <=> $b_number;
15052 # Create the .pod file. This generates the various subsections and then
15053 # combines them in one big HERE document.
15055 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
15057 return unless defined $pod_directory;
15058 print "Making pod file\n" if $verbosity >= $PROGRESS;
15060 my $exception_message =
15061 '(Any exceptions are individually noted beginning with the word NOT.)';
15063 if (-e 'Blocks.txt') {
15065 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
15066 # if the global $has_In_conflicts indicates we have them.
15067 push @match_properties, format_pod_line($indent_info_column,
15070 . (($has_In_conflicts)
15071 ? " $exception_message"
15073 @block_warning = << "END";
15075 Matches in the Block property have shortcuts that begin with "In_". For
15076 example, C<\\p{Block=Latin1}> can be written as C<\\p{In_Latin1}>. For
15077 backward compatibility, if there is no conflict with another shortcut, these
15078 may also be written as C<\\p{Latin1}> or C<\\p{Is_Latin1}>. But, N.B., there
15079 are numerous such conflicting shortcuts. Use of these forms for Block is
15080 discouraged, and are flagged as such, not only because of the potential
15081 confusion as to what is meant, but also because a later release of Unicode may
15082 preempt the shortcut, and your program would no longer be correct. Use the
15083 "In_" form instead to avoid this, or even more clearly, use the compound form,
15084 e.g., C<\\p{blk:latin1}>. See L<perlunicode/"Blocks"> for more information
15088 my $text = $Is_flags_text;
15089 $text = "$exception_message $text" if $has_Is_conflicts;
15091 # And the 'Is_ line';
15092 push @match_properties, format_pod_line($indent_info_column,
15096 # Sort the properties array for output. It is sorted alphabetically
15097 # except numerically for numeric properties, and only output unique lines.
15098 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
15100 my $formatted_properties = simple_fold(\@match_properties,
15102 # indent succeeding lines by two extra
15103 # which looks better
15104 $indent_info_column + 2,
15106 # shorten the line length by how much
15107 # the formatter indents, so the folded
15108 # line will fit in the space
15109 # presumably available
15110 $automatic_pod_indent);
15111 # Add column headings, indented to be a little more centered, but not
15113 $formatted_properties = format_pod_line($indent_info_column,
15117 . $formatted_properties;
15119 # Generate pod documentation lines for the tables that match nothing
15120 my $zero_matches = "";
15121 if (@zero_match_tables) {
15122 @zero_match_tables = uniques(@zero_match_tables);
15123 $zero_matches = join "\n\n",
15124 map { $_ = '=item \p{' . $_->complete_name . "}" }
15125 sort { $a->complete_name cmp $b->complete_name }
15126 @zero_match_tables;
15128 $zero_matches = <<END;
15130 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
15132 Unicode has some property-value pairs that currently don't match anything.
15133 This happens generally either because they are obsolete, or they exist for
15134 symmetry with other forms, but no language has yet been encoded that uses
15135 them. In this version of Unicode, the following match zero code points:
15146 # Generate list of properties that we don't accept, grouped by the reasons
15147 # why. This is so only put out the 'why' once, and then list all the
15148 # properties that have that reason under it.
15150 my %why_list; # The keys are the reasons; the values are lists of
15151 # properties that have the key as their reason
15153 # For each property, add it to the list that are suppressed for its reason
15154 # The sort will cause the alphabetically first properties to be added to
15155 # each list first, so each list will be sorted.
15156 foreach my $property (sort keys %why_suppressed) {
15157 push @{$why_list{$why_suppressed{$property}}}, $property;
15160 # For each reason (sorted by the first property that has that reason)...
15161 my @bad_re_properties;
15162 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
15165 # Add to the output, all the properties that have that reason.
15166 my $has_item = 0; # Flag if actually output anything.
15167 foreach my $name (@{$why_list{$why}}) {
15169 # Split compound names into $property and $table components
15170 my $property = $name;
15172 if ($property =~ / (.*) = (.*) /x) {
15177 # This release of Unicode may not have a property that is
15178 # suppressed, so don't reference a non-existent one.
15179 $property = property_ref($property);
15180 next if ! defined $property;
15182 # And since this list is only for match tables, don't list the
15183 # ones that don't have match tables.
15184 next if ! $property->to_create_match_tables;
15186 # Find any abbreviation, and turn it into a compound name if this
15187 # is a property=value pair.
15188 my $short_name = $property->name;
15189 $short_name .= '=' . $property->table($table)->name if $table;
15191 # Start with an empty line.
15192 push @bad_re_properties, "\n\n" unless $has_item;
15194 # And add the property as an item for the reason.
15195 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
15199 # And add the reason under the list of properties, if such a list
15200 # actually got generated. Note that the header got added
15201 # unconditionally before. But pod ignores extra blank lines, so no
15203 push @bad_re_properties, "\n$why\n" if $has_item;
15205 } # End of looping through each reason.
15207 if (! @bad_re_properties) {
15208 push @bad_re_properties,
15209 "*** This installation accepts ALL non-Unihan properties ***";
15212 # Add =over only if non-empty to avoid an empty =over/=back section,
15213 # which is considered bad form.
15214 unshift @bad_re_properties, "\n=over 4\n";
15215 push @bad_re_properties, "\n=back\n";
15218 # Similiarly, generate a list of files that we don't use, grouped by the
15219 # reasons why. First, create a hash whose keys are the reasons, and whose
15220 # values are anonymous arrays of all the files that share that reason.
15221 my %grouped_by_reason;
15222 foreach my $file (keys %ignored_files) {
15223 push @{$grouped_by_reason{$ignored_files{$file}}}, $file;
15225 foreach my $file (keys %skipped_files) {
15226 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
15229 # Then, sort each group.
15230 foreach my $group (keys %grouped_by_reason) {
15231 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
15232 @{$grouped_by_reason{$group}} ;
15235 # Finally, create the output text. For each reason (sorted by the
15236 # alphabetically first file that has that reason)...
15238 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
15239 cmp lc $grouped_by_reason{$b}->[0]
15241 keys %grouped_by_reason)
15243 # Add all the files that have that reason to the output. Start
15244 # with an empty line.
15245 push @unused_files, "\n\n";
15246 push @unused_files, map { "\n=item F<$_> \n" }
15247 @{$grouped_by_reason{$reason}};
15248 # And add the reason under the list of files
15249 push @unused_files, "\n$reason\n";
15252 # Similarly, create the output text for the UCD section of the pod
15254 foreach my $key (keys %ucd_pod) {
15255 next unless $ucd_pod{$key}->{'output_this'};
15256 push @ucd_pod, format_pod_line($indent_info_column,
15257 $ucd_pod{$key}->{'name'},
15258 $ucd_pod{$key}->{'info'},
15259 $ucd_pod{$key}->{'status'},
15263 # Sort alphabetically, and fold for output
15264 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
15265 my $ucd_pod = simple_fold(\@ucd_pod,
15267 $indent_info_column,
15268 $automatic_pod_indent);
15269 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
15274 # Everything is ready to assemble.
15275 my @OUT = << "END";
15280 To change this file, edit $0 instead.
15286 $pod_file - Index of Unicode Version $string_version character properties in Perl
15290 This document provides information about the portion of the Unicode database
15291 that deals with character properties, that is the portion that is defined on
15292 single code points. (L</Other information in the Unicode data base>
15293 below briefly mentions other data that Unicode provides.)
15295 Perl can provide access to all non-provisional Unicode character properties,
15296 though not all are enabled by default. The omitted ones are the Unihan
15297 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
15298 deprecated or Unicode-internal properties. (An installation may choose to
15299 recompile Perl's tables to change this. See L<Unicode character
15300 properties that are NOT accepted by Perl>.)
15302 For most purposes, access to Unicode properties from the Perl core is through
15303 regular expression matches, as described in the next section.
15304 For some special purposes, and to access the properties that are not suitable
15305 for regular expression matching, all the Unicode character properties that
15306 Perl handles are accessible via the standard L<Unicode::UCD> module, as
15307 described in the section L</Properties accessible through Unicode::UCD>.
15309 Perl also provides some additional extensions and short-cut synonyms
15310 for Unicode properties.
15312 This document merely lists all available properties and does not attempt to
15313 explain what each property really means. There is a brief description of each
15314 Perl extension; see L<perlunicode/Other Properties> for more information on
15315 these. There is some detail about Blocks, Scripts, General_Category,
15316 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
15317 official Unicode properties, refer to the Unicode standard. A good starting
15318 place is L<$unicode_reference_url>.
15320 Note that you can define your own properties; see
15321 L<perlunicode/"User-Defined Character Properties">.
15323 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
15325 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
15326 most of the Unicode character properties. The table below shows all these
15327 constructs, both single and compound forms.
15329 B<Compound forms> consist of two components, separated by an equals sign or a
15330 colon. The first component is the property name, and the second component is
15331 the particular value of the property to match against, for example,
15332 C<\\p{Script: Greek}> and C<\\p{Script=Greek}> both mean to match characters
15333 whose Script property is Greek.
15335 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
15336 their equivalent compound forms. The table shows these equivalences. (In our
15337 example, C<\\p{Greek}> is a just a shortcut for C<\\p{Script=Greek}>.)
15338 There are also a few Perl-defined single forms that are not shortcuts for a
15339 compound form. One such is C<\\p{Word}>. These are also listed in the table.
15341 In parsing these constructs, Perl always ignores Upper/lower case differences
15342 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
15343 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
15344 the left brace completely changes the meaning of the construct, from "match"
15345 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
15346 for improved legibility.
15348 Also, white space, hyphens, and underscores are normally ignored
15349 everywhere between the {braces}, and hence can be freely added or removed
15350 even if the C</x> modifier hasn't been specified on the regular expression.
15351 But $a_bold_stricter at the beginning of an entry in the table below
15352 means that tighter (stricter) rules are used for that entry:
15356 =item Single form (C<\\p{name}>) tighter rules:
15358 White space, hyphens, and underscores ARE significant
15363 =item * white space adjacent to a non-word character
15365 =item * underscores separating digits in numbers
15369 That means, for example, that you can freely add or remove white space
15370 adjacent to (but within) the braces without affecting the meaning.
15372 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
15374 The tighter rules given above for the single form apply to everything to the
15375 right of the colon or equals; the looser rules still apply to everything to
15378 That means, for example, that you can freely add or remove white space
15379 adjacent to (but within) the braces and the colon or equal sign.
15383 Some properties are considered obsolete by Unicode, but still available.
15384 There are several varieties of obsolescence:
15390 A property may be stabilized. Such a determination does not indicate
15391 that the property should or should not be used; instead it is a declaration
15392 that the property will not be maintained nor extended for newly encoded
15393 characters. Such properties are marked with $a_bold_stabilized in the
15398 A property may be deprecated, perhaps because its original intent
15399 has been replaced by another property, or because its specification was
15400 somehow defective. This means that its use is strongly
15401 discouraged, so much so that a warning will be issued if used, unless the
15402 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
15403 statement. $A_bold_deprecated flags each such entry in the table, and
15404 the entry there for the longest, most descriptive version of the property will
15405 give the reason it is deprecated, and perhaps advice. Perl may issue such a
15406 warning, even for properties that aren't officially deprecated by Unicode,
15407 when there used to be characters or code points that were matched by them, but
15408 no longer. This is to warn you that your program may not work like it did on
15409 earlier Unicode releases.
15411 A deprecated property may be made unavailable in a future Perl version, so it
15412 is best to move away from them.
15414 A deprecated property may also be stabilized, but this fact is not shown.
15418 Properties marked with $a_bold_obsolete in the table are considered (plain)
15419 obsolete. Generally this designation is given to properties that Unicode once
15420 used for internal purposes (but not any longer).
15424 Some Perl extensions are present for backwards compatibility and are
15425 discouraged from being used, but are not obsolete. $A_bold_discouraged
15426 flags each such entry in the table. Future Unicode versions may force
15427 some of these extensions to be removed without warning, replaced by another
15428 property with the same name that means something different. Use the
15429 equivalent shown instead.
15433 The table below has two columns. The left column contains the C<\\p{}>
15434 constructs to look up, possibly preceded by the flags mentioned above; and
15435 the right column contains information about them, like a description, or
15436 synonyms. It shows both the single and compound forms for each property that
15437 has them. If the left column is a short name for a property, the right column
15438 will give its longer, more descriptive name; and if the left column is the
15439 longest name, the right column will show any equivalent shortest name, in both
15440 single and compound forms if applicable.
15442 The right column will also caution you if a property means something different
15443 than what might normally be expected.
15445 All single forms are Perl extensions; a few compound forms are as well, and
15448 Numbers in (parentheses) indicate the total number of code points matched by
15449 the property. For emphasis, those properties that match no code points at all
15450 are listed as well in a separate section following the table.
15452 Most properties match the same code points regardless of whether C<"/i">
15453 case-insensitive matching is specified or not. But a few properties are
15454 affected. These are shown with the notation
15456 (/i= other_property)
15458 in the second column. Under case-insensitive matching they match the
15459 same code pode points as the property "other_property".
15461 There is no description given for most non-Perl defined properties (See
15462 L<$unicode_reference_url> for that).
15464 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
15465 combinations. For example, entries like:
15467 \\p{Gc: *} \\p{General_Category: *}
15469 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
15470 for the latter is also valid for the former. Similarly,
15474 means that if and only if, for example, C<\\p{Foo}> exists, then
15475 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
15476 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
15477 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
15480 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
15481 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
15482 'N*' to indicate this, and doesn't have separate entries for the other
15483 possibilities. Note that not all properties which have values 'Yes' and 'No'
15484 are binary, and they have all their values spelled out without using this wild
15485 card, and a C<NOT> clause in their description that highlights their not being
15486 binary. These also require the compound form to match them, whereas true
15487 binary properties have both single and compound forms available.
15489 Note that all non-essential underscores are removed in the display of the
15498 B<*> is a wild-card
15502 B<(\\d+)> in the info column gives the number of code points matched by
15507 B<$DEPRECATED> means this is deprecated.
15511 B<$OBSOLETE> means this is obsolete.
15515 B<$STABILIZED> means this is stabilized.
15519 B<$STRICTER> means tighter (stricter) name matching applies.
15523 B<$DISCOURAGED> means use of this form is discouraged, and may not be
15528 $formatted_properties
15532 =head1 Properties accessible through Unicode::UCD
15534 All the Unicode character properties mentioned above (except for those marked
15535 as for internal use by Perl) are also accessible by
15536 L<Unicode::UCD/prop_invlist()>.
15538 Due to their nature, not all Unicode character properties are suitable for
15539 regular expression matches, nor C<prop_invlist()>. The remaining
15540 non-provisional, non-internal ones are accessible via
15541 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
15542 hasn't included; see L<below for which those are|/Unicode character properties
15543 that are NOT accepted by Perl>).
15545 For compatibility with other parts of Perl, all the single forms given in the
15546 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
15547 are recognized. BUT, there are some ambiguities between some Perl extensions
15548 and the Unicode properties, all of which are silently resolved in favor of the
15549 official Unicode property. To avoid surprises, you should only use
15550 C<prop_invmap()> for forms listed in the table below, which omits the
15551 non-recommended ones. The affected forms are the Perl single form equivalents
15552 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
15553 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
15554 whose short name is C<sc>. The table indicates the current ambiguities in the
15555 INFO column, beginning with the word C<"NOT">.
15557 The standard Unicode properties listed below are documented in
15558 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
15559 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
15560 L<perlunicode/Other Properties>;
15562 The first column in the table is a name for the property; the second column is
15563 an alternative name, if any, plus possibly some annotations. The alternative
15564 name is the property's full name, unless that would simply repeat the first
15565 column, in which case the second column indicates the property's short name
15566 (if different). The annotations are given only in the entry for the full
15567 name. If a property is obsolete, etc, the entry will be flagged with the same
15568 characters used in the table in the L<section above|/Properties accessible
15569 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
15573 =head1 Properties accessible through other means
15575 Certain properties are accessible also via core function calls. These are:
15577 Lowercase_Mapping lc() and lcfirst()
15578 Titlecase_Mapping ucfirst()
15579 Uppercase_Mapping uc()
15581 Also, Case_Folding is accessible through the C</i> modifier in regular
15582 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
15585 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
15586 interpolation in double-quoted strings and regular expressions; and functions
15587 C<charnames::viacode()>, C<charnames::vianame()>, and
15588 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
15591 Finally, most properties related to decomposition are accessible via
15592 L<Unicode::Normalize>.
15594 =head1 Unicode character properties that are NOT accepted by Perl
15596 Perl will generate an error for a few character properties in Unicode when
15597 used in a regular expression. The non-Unihan ones are listed below, with the
15598 reasons they are not accepted, perhaps with work-arounds. The short names for
15599 the properties are listed enclosed in (parentheses).
15600 As described after the list, an installation can change the defaults and choose
15601 to accept any of these. The list is machine generated based on the
15602 choices made for the installation that generated this document.
15606 An installation can choose to allow any of these to be matched by downloading
15607 the Unicode database from L<http://www.unicode.org/Public/> to
15608 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
15609 controlling lists contained in the program
15610 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
15611 (C<\%Config> is available from the Config module).
15613 =head1 Other information in the Unicode data base
15615 The Unicode data base is delivered in two different formats. The XML version
15616 is valid for more modern Unicode releases. The other version is a collection
15617 of files. The two are intended to give equivalent information. Perl uses the
15618 older form; this allows you to recompile Perl to use early Unicode releases.
15620 The only non-character property that Perl currently supports is Named
15621 Sequences, in which a sequence of code points
15622 is given a name and generally treated as a single entity. (Perl supports
15623 these via the C<\\N{...}> double-quotish construct,
15624 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
15626 Below is a list of the files in the Unicode data base that Perl doesn't
15627 currently use, along with very brief descriptions of their purposes.
15628 Some of the names of the files have been shortened from those that Unicode
15629 uses, in order to allow them to be distinguishable from similarly named files
15630 on file systems for which only the first 8 characters of a name are
15641 L<$unicode_reference_url>
15649 # And write it. The 0 means no utf8.
15650 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
15654 sub make_Heavy () {
15655 # Create and write Heavy.pl, which passes info about the tables to
15658 # Stringify structures for output
15659 my $loose_property_name_of
15660 = simple_dumper(\%loose_property_name_of, ' ' x 4);
15661 chomp $loose_property_name_of;
15663 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
15664 chomp $stricter_to_file_of;
15666 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
15667 chomp $loose_to_file_of;
15669 my $nv_floating_to_rational
15670 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
15671 chomp $nv_floating_to_rational;
15673 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
15674 chomp $why_deprecated;
15676 # We set the key to the file when we associated files with tables, but we
15677 # couldn't do the same for the value then, as we might not have the file
15678 # for the alternate table figured out at that time.
15679 foreach my $cased (keys %caseless_equivalent_to) {
15680 my @path = $caseless_equivalent_to{$cased}->file_path;
15681 my $path = join '/', @path[1, -1];
15682 $caseless_equivalent_to{$cased} = $path;
15684 my $caseless_equivalent_to
15685 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
15686 chomp $caseless_equivalent_to;
15688 my $loose_property_to_file_of
15689 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
15690 chomp $loose_property_to_file_of;
15692 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
15693 chomp $file_to_swash_name;
15697 $INTERNAL_ONLY_HEADER
15699 # This file is for the use of utf8_heavy.pl and Unicode::UCD
15701 # Maps Unicode (not Perl single-form extensions) property names in loose
15702 # standard form to their corresponding standard names
15703 \%utf8::loose_property_name_of = (
15704 $loose_property_name_of
15707 # Maps property, table to file for those using stricter matching
15708 \%utf8::stricter_to_file_of = (
15709 $stricter_to_file_of
15712 # Maps property, table to file for those using loose matching
15713 \%utf8::loose_to_file_of = (
15717 # Maps floating point to fractional form
15718 \%utf8::nv_floating_to_rational = (
15719 $nv_floating_to_rational
15722 # If a floating point number doesn't have enough digits in it to get this
15723 # close to a fraction, it isn't considered to be that fraction even if all the
15724 # digits it does have match.
15725 \$utf8::max_floating_slop = $MAX_FLOATING_SLOP;
15727 # Deprecated tables to generate a warning for. The key is the file containing
15728 # the table, so as to avoid duplication, as many property names can map to the
15729 # file, but we only need one entry for all of them.
15730 \%utf8::why_deprecated = (
15734 # A few properties have different behavior under /i matching. This maps
15735 # those to substitute files to use under /i.
15736 \%utf8::caseless_equivalent = (
15737 $caseless_equivalent_to
15740 # Property names to mapping files
15741 \%utf8::loose_property_to_file_of = (
15742 $loose_property_to_file_of
15745 # Files to the swash names within them.
15746 \%utf8::file_to_swash_name = (
15747 $file_to_swash_name
15753 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
15757 sub make_Name_pm () {
15758 # Create and write Name.pm, which contains subroutines and data to use in
15759 # conjunction with Name.pl
15761 # Maybe there's nothing to do.
15762 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
15766 $INTERNAL_ONLY_HEADER
15769 # Convert these structures to output format.
15770 my $code_points_ending_in_code_point =
15771 main::simple_dumper(\@code_points_ending_in_code_point,
15773 my $names = main::simple_dumper(\%names_ending_in_code_point,
15775 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
15778 # Do the same with the Hangul names,
15784 if ($has_hangul_syllables) {
15786 # Construct a regular expression of all the possible
15787 # combinations of the Hangul syllables.
15788 my @L_re; # Leading consonants
15789 for my $i ($LBase .. $LBase + $LCount - 1) {
15790 push @L_re, $Jamo{$i}
15792 my @V_re; # Middle vowels
15793 for my $i ($VBase .. $VBase + $VCount - 1) {
15794 push @V_re, $Jamo{$i}
15796 my @T_re; # Trailing consonants
15797 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
15798 push @T_re, $Jamo{$i}
15801 # The whole re is made up of the L V T combination.
15803 . join ('|', sort @L_re)
15805 . join ('|', sort @V_re)
15807 . join ('|', sort @T_re)
15810 # These hashes needed by the algorithm were generated
15811 # during reading of the Jamo.txt file
15812 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
15813 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
15814 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
15815 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
15822 # This module contains machine-generated tables and code for the
15823 # algorithmically-determinable Unicode character names. The following
15824 # routines can be used to translate between name and code point and vice versa
15828 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
15829 # two must be 10; if there are 5, the first must not be a 0. Written this
15830 # way to decrease backtracking. The first regex allows the code point to
15831 # be at the end of a word, but to work properly, the word shouldn't end
15832 # with a valid hex character. The second one won't match a code point at
15833 # the end of a word, and doesn't have the run-on issue
15834 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
15835 my \$code_point_re = qr/$code_point_re/;
15837 # In the following hash, the keys are the bases of names which include
15838 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
15839 # of each key is another hash which is used to get the low and high ends
15840 # for each range of code points that apply to the name.
15841 my %names_ending_in_code_point = (
15845 # The following hash is a copy of the previous one, except is for loose
15846 # matching, so each name has blanks and dashes squeezed out
15847 my %loose_names_ending_in_code_point = (
15851 # And the following array gives the inverse mapping from code points to
15852 # names. Lowest code points are first
15853 my \@code_points_ending_in_code_point = (
15854 $code_points_ending_in_code_point
15857 # Earlier releases didn't have Jamos. No sense outputting
15858 # them unless will be used.
15859 if ($has_hangul_syllables) {
15862 # Convert from code point to Jamo short name for use in composing Hangul
15868 # Leading consonant (can be null)
15878 # Optional trailing consonant
15883 # Computed re that splits up a Hangul name into LVT or LV syllables
15884 my \$syllable_re = qr/$jamo_re/;
15886 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
15887 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
15889 # These constants names and values were taken from the Unicode standard,
15890 # version 5.1, section 3.12. They are used in conjunction with Hangul
15892 my \$SBase = $SBase_string;
15893 my \$LBase = $LBase_string;
15894 my \$VBase = $VBase_string;
15895 my \$TBase = $TBase_string;
15896 my \$SCount = $SCount;
15897 my \$LCount = $LCount;
15898 my \$VCount = $VCount;
15899 my \$TCount = $TCount;
15900 my \$NCount = \$VCount * \$TCount;
15902 } # End of has Jamos
15904 push @name, << 'END';
15906 sub name_to_code_point_special {
15907 my ($name, $loose) = @_;
15909 # Returns undef if not one of the specially handled names; otherwise
15910 # returns the code point equivalent to the input name
15911 # $loose is non-zero if to use loose matching, 'name' in that case
15912 # must be input as upper case with all blanks and dashes squeezed out.
15914 if ($has_hangul_syllables) {
15915 push @name, << 'END';
15917 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
15918 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
15920 return if $name !~ qr/^$syllable_re$/;
15921 my $L = $Jamo_L{$1};
15922 my $V = $Jamo_V{$2};
15923 my $T = (defined $3) ? $Jamo_T{$3} : 0;
15924 return ($L * $VCount + $V) * $TCount + $T + $SBase;
15928 push @name, << 'END';
15930 # Name must end in 'code_point' for this to handle.
15931 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
15932 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
15935 my $code_point = CORE::hex $2;
15939 $names_ref = \%loose_names_ending_in_code_point;
15942 return if $base !~ s/-$//;
15943 $names_ref = \%names_ending_in_code_point;
15946 # Name must be one of the ones which has the code point in it.
15947 return if ! $names_ref->{$base};
15949 # Look through the list of ranges that apply to this name to see if
15950 # the code point is in one of them.
15951 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
15952 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
15953 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
15955 # Here, the code point is in the range.
15956 return $code_point;
15959 # Here, looked like the name had a code point number in it, but
15960 # did not match one of the valid ones.
15964 sub code_point_to_name_special {
15965 my $code_point = shift;
15967 # Returns the name of a code point if algorithmically determinable;
15970 if ($has_hangul_syllables) {
15971 push @name, << 'END';
15973 # If in the Hangul range, calculate the name based on Unicode's
15975 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
15977 my $SIndex = $code_point - $SBase;
15978 my $L = $LBase + $SIndex / $NCount;
15979 my $V = $VBase + ($SIndex % $NCount) / $TCount;
15980 my $T = $TBase + $SIndex % $TCount;
15981 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
15982 $name .= $Jamo{$T} if $T != $TBase;
15987 push @name, << 'END';
15989 # Look through list of these code points for one in range.
15990 foreach my $hash (@code_points_ending_in_code_point) {
15991 return if $code_point < $hash->{'low'};
15992 if ($code_point <= $hash->{'high'}) {
15993 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
15996 return; # None found
16003 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
16008 # Create and write UCD.pl, which passes info about the tables to
16011 # Create a mapping from each alias of Perl single-form extensions to all
16012 # its equivalent aliases, for quick look-up.
16013 my %perlprop_to_aliases;
16014 foreach my $table ($perl->tables) {
16016 # First create the list of the aliases of each extension
16017 my @aliases_list; # List of legal aliases for this extension
16019 my $table_name = $table->name;
16020 my $standard_table_name = standardize($table_name);
16021 my $table_full_name = $table->full_name;
16022 my $standard_table_full_name = standardize($table_full_name);
16024 # Make sure that the list has both the short and full names
16025 push @aliases_list, $table_name, $table_full_name;
16027 my $found_ucd = 0; # ? Did we actually get an alias that should be
16028 # output for this table
16030 # Go through all the aliases (including the two just added), and add
16031 # any new unique ones to the list
16032 foreach my $alias ($table->aliases) {
16034 # Skip non-legal names
16035 next unless $alias->ok_as_filename;
16036 next unless $alias->ucd;
16038 $found_ucd = 1; # have at least one legal name
16040 my $name = $alias->name;
16041 my $standard = standardize($name);
16043 # Don't repeat a name that is equivalent to one already on the
16045 next if $standard eq $standard_table_name;
16046 next if $standard eq $standard_table_full_name;
16048 push @aliases_list, $name;
16051 # If there were no legal names, don't output anything.
16052 next unless $found_ucd;
16054 # To conserve memory in the program reading these in, omit full names
16055 # that are identical to the short name, when those are the only two
16056 # aliases for the property.
16057 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
16061 # Here, @aliases_list is the list of all the aliases that this
16062 # extension legally has. Now can create a map to it from each legal
16063 # standardized alias
16064 foreach my $alias ($table->aliases) {
16065 next unless $alias->ucd;
16066 next unless $alias->ok_as_filename;
16067 push @{$perlprop_to_aliases{standardize($alias->name)}},
16072 # Make a list of all combinations of properties/values that are suppressed.
16074 if (! $debug_skip) { # This tends to fail in this debug mode
16075 foreach my $property_name (keys %why_suppressed) {
16078 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
16080 # The hash may contain properties not in this release of Unicode
16081 next unless defined (my $property = property_ref($property_name));
16083 # Find all combinations
16084 foreach my $prop_alias ($property->aliases) {
16085 my $prop_alias_name = standardize($prop_alias->name);
16087 # If no =value, there's just one combination possibe for this
16088 if (! $value_name) {
16090 # The property may be suppressed, but there may be a proxy
16091 # for it, so it shouldn't be listed as suppressed
16092 next if $prop_alias->ucd;
16093 push @suppressed, $prop_alias_name;
16096 foreach my $value_alias
16097 ($property->table($value_name)->aliases)
16099 next if $value_alias->ucd;
16101 push @suppressed, "$prop_alias_name="
16102 . standardize($value_alias->name);
16108 @suppressed = sort @suppressed; # So doesn't change between runs of this
16111 # Convert the structure below (designed for Name.pm) to a form that UCD
16112 # wants, so it doesn't have to modify it at all; i.e. so that it includes
16113 # an element for the Hangul syllables in the appropriate place, and
16114 # otherwise changes the name to include the "-<code point>" suffix.
16115 my @algorithm_names;
16116 my $done_hangul = 0;
16118 # Copy it linearly.
16119 for my $i (0 .. @code_points_ending_in_code_point - 1) {
16121 # Insert the hanguls in the correct place.
16123 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
16126 push @algorithm_names, { low => $SBase,
16127 high => $SBase + $SCount - 1,
16128 name => '<hangul syllable>',
16132 # Copy the current entry, modified.
16133 push @algorithm_names, {
16134 low => $code_points_ending_in_code_point[$i]->{'low'},
16135 high => $code_points_ending_in_code_point[$i]->{'high'},
16137 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
16141 # Serialize these structures for output.
16142 my $loose_to_standard_value
16143 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
16144 chomp $loose_to_standard_value;
16146 my $string_property_loose_to_name
16147 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
16148 chomp $string_property_loose_to_name;
16150 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
16151 chomp $perlprop_to_aliases;
16153 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
16154 chomp $prop_aliases;
16156 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
16157 chomp $prop_value_aliases;
16159 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
16162 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
16163 chomp $algorithm_names;
16165 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
16166 chomp $ambiguous_names;
16168 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
16169 chomp $loose_defaults;
16173 $INTERNAL_ONLY_HEADER
16175 # This file is for the use of Unicode::UCD
16177 # Highest legal Unicode code point
16178 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
16181 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
16182 \$Unicode::UCD::HANGUL_COUNT = $SCount;
16184 # Keys are all the possible "prop=value" combinations, in loose form; values
16185 # are the standard loose name for the 'value' part of the key
16186 \%Unicode::UCD::loose_to_standard_value = (
16187 $loose_to_standard_value
16190 # String property loose names to standard loose name
16191 \%Unicode::UCD::string_property_loose_to_name = (
16192 $string_property_loose_to_name
16195 # Keys are Perl extensions in loose form; values are each one's list of
16197 \%Unicode::UCD::loose_perlprop_to_name = (
16198 $perlprop_to_aliases
16201 # Keys are standard property name; values are each one's aliases
16202 \%Unicode::UCD::prop_aliases = (
16206 # Keys of top level are standard property name; values are keys to another
16207 # hash, Each one is one of the property's values, in standard form. The
16208 # values are that prop-val's aliases. If only one specified, the short and
16209 # long alias are identical.
16210 \%Unicode::UCD::prop_value_aliases = (
16211 $prop_value_aliases
16214 # Ordered (by code point ordinal) list of the ranges of code points whose
16215 # names are algorithmically determined. Each range entry is an anonymous hash
16216 # of the start and end points and a template for the names within it.
16217 \@Unicode::UCD::algorithmic_named_code_points = (
16221 # The properties that as-is have two meanings, and which must be disambiguated
16222 \%Unicode::UCD::ambiguous_names = (
16226 # Keys are the prop-val combinations which are the default values for the
16227 # given property, expressed in standard loose form
16228 \%Unicode::UCD::loose_defaults = (
16232 # All combinations of names that are suppressed.
16233 # This is actually for UCD.t, so it knows which properties shouldn't have
16234 # entries. If it got any bigger, would probably want to put it in its own
16235 # file to use memory only when it was needed, in testing.
16236 \@Unicode::UCD::suppressed_properties = (
16243 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
16247 sub write_all_tables() {
16248 # Write out all the tables generated by this program to files, as well as
16249 # the supporting data structures, pod file, and .t file.
16251 my @writables; # List of tables that actually get written
16252 my %match_tables_to_write; # Used to collapse identical match tables
16253 # into one file. Each key is a hash function
16254 # result to partition tables into buckets.
16255 # Each value is an array of the tables that
16256 # fit in the bucket.
16258 # For each property ...
16259 # (sort so that if there is an immutable file name, it has precedence, so
16260 # some other property can't come in and take over its file name. (We
16261 # don't care if both defined, as they had better be different anyway.)
16262 # The property named 'Perl' needs to be first (it doesn't have any
16263 # immutable file name) because empty properties are defined in terms of
16264 # it's table named 'Any'.) We also sort by the property's name. This is
16265 # just for repeatability of the outputs between runs of this program, but
16266 # does not affect correctness.
16268 foreach my $property ($perl,
16269 sort { return -1 if defined $a->file;
16270 return 1 if defined $b->file;
16271 return $a->name cmp $b->name;
16272 } grep { $_ != $perl } property_ref('*'))
16274 my $type = $property->type;
16276 # And for each table for that property, starting with the mapping
16279 foreach my $table($property,
16281 # and all the match tables for it (if any), sorted so
16282 # the ones with the shortest associated file name come
16283 # first. The length sorting prevents problems of a
16284 # longer file taking a name that might have to be used
16285 # by a shorter one. The alphabetic sorting prevents
16286 # differences between releases
16287 sort { my $ext_a = $a->external_name;
16288 return 1 if ! defined $ext_a;
16289 my $ext_b = $b->external_name;
16290 return -1 if ! defined $ext_b;
16292 # But return the non-complement table before
16293 # the complement one, as the latter is defined
16294 # in terms of the former, and needs to have
16295 # the information for the former available.
16296 return 1 if $a->complement != 0;
16297 return -1 if $b->complement != 0;
16299 # Similarly, return a subservient table after
16301 return 1 if $a->leader != $a;
16302 return -1 if $b->leader != $b;
16304 my $cmp = length $ext_a <=> length $ext_b;
16306 # Return result if lengths not equal
16307 return $cmp if $cmp;
16309 # Alphabetic if lengths equal
16310 return $ext_a cmp $ext_b
16311 } $property->tables
16315 # Here we have a table associated with a property. It could be
16316 # the map table (done first for each property), or one of the
16317 # other tables. Determine which type.
16318 my $is_property = $table->isa('Property');
16320 my $name = $table->name;
16321 my $complete_name = $table->complete_name;
16323 # See if should suppress the table if is empty, but warn if it
16324 # contains something.
16325 my $suppress_if_empty_warn_if_not
16326 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
16328 # Calculate if this table should have any code points associated
16330 my $expected_empty =
16332 # $perl should be empty, as well as properties that we just
16333 # don't do anything with
16335 && ($table == $perl
16336 || grep { $complete_name eq $_ }
16337 @unimplemented_properties
16341 # Match tables in properties we skipped populating should be
16343 || (! $is_property && ! $property->to_create_match_tables)
16345 # Tables and properties that are expected to have no code
16346 # points should be empty
16347 || $suppress_if_empty_warn_if_not
16350 # Set a boolean if this table is the complement of an empty binary
16352 my $is_complement_of_empty_binary =
16353 $type == $BINARY &&
16354 (($table == $property->table('Y')
16355 && $property->table('N')->is_empty)
16356 || ($table == $property->table('N')
16357 && $property->table('Y')->is_empty));
16359 if ($table->is_empty) {
16361 if ($suppress_if_empty_warn_if_not) {
16362 $table->set_fate($SUPPRESSED,
16363 $suppress_if_empty_warn_if_not);
16366 # Suppress (by skipping them) expected empty tables.
16367 next TABLE if $expected_empty;
16369 # And setup to later output a warning for those that aren't
16370 # known to be allowed to be empty. Don't do the warning if
16371 # this table is a child of another one to avoid duplicating
16372 # the warning that should come from the parent one.
16373 if (($table == $property || $table->parent == $table)
16374 && $table->fate != $SUPPRESSED
16375 && $table->fate != $MAP_PROXIED
16376 && ! grep { $complete_name =~ /^$_$/ }
16377 @tables_that_may_be_empty)
16379 push @unhandled_properties, "$table";
16382 # An empty table is just the complement of everything.
16383 $table->set_complement($Any) if $table != $property;
16385 elsif ($expected_empty) {
16387 if ($suppress_if_empty_warn_if_not) {
16388 $because = " because $suppress_if_empty_warn_if_not";
16391 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
16394 # Some tables should match everything
16395 my $expected_full =
16396 ($table->fate == $SUPPRESSED)
16399 ? # All these types of map tables will be full because
16400 # they will have been populated with defaults
16401 ($type == $ENUM || $type == $FORCED_BINARY)
16403 : # A match table should match everything if its method
16405 ($table->matches_all
16407 # The complement of an empty binary table will match
16409 || $is_complement_of_empty_binary
16413 my $count = $table->count;
16414 if ($expected_full) {
16415 if ($count != $MAX_UNICODE_CODEPOINTS) {
16416 Carp::my_carp("$table matches only "
16417 . clarify_number($count)
16418 . " Unicode code points but should match "
16419 . clarify_number($MAX_UNICODE_CODEPOINTS)
16421 . clarify_number(abs($MAX_UNICODE_CODEPOINTS - $count))
16422 . "). Proceeding anyway.");
16425 # Here is expected to be full. If it is because it is the
16426 # complement of an (empty) binary table that is to be
16427 # suppressed, then suppress this one as well.
16428 if ($is_complement_of_empty_binary) {
16429 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
16430 my $opposing = $property->table($opposing_name);
16431 my $opposing_status = $opposing->status;
16432 if ($opposing_status) {
16433 $table->set_status($opposing_status,
16434 $opposing->status_info);
16438 elsif ($count == $MAX_UNICODE_CODEPOINTS
16439 && ($table == $property || $table->leader == $table)
16440 && $table->property->status ne $PLACEHOLDER)
16442 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
16445 if ($table->fate >= $SUPPRESSED) {
16446 if (! $is_property) {
16447 my @children = $table->children;
16448 foreach my $child (@children) {
16449 if ($child->fate < $SUPPRESSED) {
16450 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
16458 if (! $is_property) {
16460 make_ucd_table_pod_entries($table) if $table->property == $perl;
16462 # Several things need to be done just once for each related
16463 # group of match tables. Do them on the parent.
16464 if ($table->parent == $table) {
16466 # Add an entry in the pod file for the table; it also does
16468 make_re_pod_entries($table) if defined $pod_directory;
16470 # See if the the table matches identical code points with
16471 # something that has already been output. In that case,
16472 # no need to have two files with the same code points in
16473 # them. We use the table's hash() method to store these
16474 # in buckets, so that it is quite likely that if two
16475 # tables are in the same bucket they will be identical, so
16476 # don't have to compare tables frequently. The tables
16477 # have to have the same status to share a file, so add
16478 # this to the bucket hash. (The reason for this latter is
16479 # that Heavy.pl associates a status with a file.)
16480 # We don't check tables that are inverses of others, as it
16481 # would lead to some coding complications, and checking
16482 # all the regular ones should find everything.
16483 if ($table->complement == 0) {
16484 my $hash = $table->hash . ';' . $table->status;
16486 # Look at each table that is in the same bucket as
16487 # this one would be.
16488 foreach my $comparison
16489 (@{$match_tables_to_write{$hash}})
16491 if ($table->matches_identically_to($comparison)) {
16492 $table->set_equivalent_to($comparison,
16498 # Here, not equivalent, add this table to the bucket.
16499 push @{$match_tables_to_write{$hash}}, $table;
16505 # Here is the property itself.
16506 # Don't write out or make references to the $perl property
16507 next if $table == $perl;
16509 make_ucd_table_pod_entries($table);
16511 # There is a mapping stored of the various synonyms to the
16512 # standardized name of the property for utf8_heavy.pl.
16513 # Also, the pod file contains entries of the form:
16514 # \p{alias: *} \p{full: *}
16515 # rather than show every possible combination of things.
16517 my @property_aliases = $property->aliases;
16519 my $full_property_name = $property->full_name;
16520 my $property_name = $property->name;
16521 my $standard_property_name = standardize($property_name);
16522 my $standard_property_full_name
16523 = standardize($full_property_name);
16525 # We also create for Unicode::UCD a list of aliases for
16526 # the property. The list starts with the property name;
16527 # then its full name.
16530 if ( $property->fate <= $MAP_PROXIED) {
16531 @property_list = ($property_name, $full_property_name);
16532 @standard_list = ($standard_property_name,
16533 $standard_property_full_name);
16536 # For each synonym ...
16537 for my $i (0 .. @property_aliases - 1) {
16538 my $alias = $property_aliases[$i];
16539 my $alias_name = $alias->name;
16540 my $alias_standard = standardize($alias_name);
16543 # Add other aliases to the list of property aliases
16544 if ($property->fate <= $MAP_PROXIED
16545 && ! grep { $alias_standard eq $_ } @standard_list)
16547 push @property_list, $alias_name;
16548 push @standard_list, $alias_standard;
16551 # For utf8_heavy, set the mapping of the alias to the
16553 if ($type == $STRING) {
16554 if ($property->fate <= $MAP_PROXIED) {
16555 $string_property_loose_to_name{$alias_standard}
16556 = $standard_property_name;
16560 if (exists ($loose_property_name_of{$alias_standard}))
16562 Carp::my_carp("There already is a property with the same standard name as $alias_name: $loose_property_name_of{$alias_standard}. Old name is retained");
16565 $loose_property_name_of{$alias_standard}
16566 = $standard_property_name;
16569 # Now for the re pod entry for this alias. Skip if not
16570 # outputting a pod; skip the first one, which is the
16571 # full name so won't have an entry like: '\p{full: *}
16572 # \p{full: *}', and skip if don't want an entry for
16575 || ! defined $pod_directory
16576 || ! $alias->make_re_pod_entry;
16578 my $rhs = "\\p{$full_property_name: *}";
16579 if ($property != $perl && $table->perl_extension) {
16580 $rhs .= ' (Perl extension)';
16582 push @match_properties,
16583 format_pod_line($indent_info_column,
16584 '\p{' . $alias->name . ': *}',
16590 # The list of all possible names is attached to each alias, so
16592 if (@property_list) {
16593 push @{$prop_aliases{$standard_list[0]}}, @property_list;
16596 if ($property->fate <= $MAP_PROXIED) {
16598 # Similarly, we create for Unicode::UCD a list of
16599 # property-value aliases.
16601 my $property_full_name = $property->full_name;
16603 # Look at each table in the property...
16604 foreach my $table ($property->tables) {
16606 my $table_full_name = $table->full_name;
16607 my $standard_table_full_name
16608 = standardize($table_full_name);
16609 my $table_name = $table->name;
16610 my $standard_table_name = standardize($table_name);
16612 # The list starts with the table name and its full
16614 push @values_list, $table_name, $table_full_name;
16616 # We add to the table each unique alias that isn't
16617 # discouraged from use.
16618 foreach my $alias ($table->aliases) {
16619 next if $alias->status
16620 && $alias->status eq $DISCOURAGED;
16621 my $name = $alias->name;
16622 my $standard = standardize($name);
16623 next if $standard eq $standard_table_name;
16624 next if $standard eq $standard_table_full_name;
16625 push @values_list, $name;
16628 # Here @values_list is a list of all the aliases for
16629 # the table. That is, all the property-values given
16630 # by this table. By agreement with Unicode::UCD,
16631 # if the name and full name are identical, and there
16632 # are no other names, drop the duplcate entry to save
16634 if (@values_list == 2
16635 && $values_list[0] eq $values_list[1])
16640 # To save memory, unlike the similar list for property
16641 # aliases above, only the standard forms hve the list.
16642 # This forces an extra step of converting from input
16643 # name to standard name, but the savings are
16644 # considerable. (There is only marginal savings if we
16645 # did this with the property aliases.)
16646 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
16650 # Don't write out a mapping file if not desired.
16651 next if ! $property->to_output_map;
16654 # Here, we know we want to write out the table, but don't do it
16655 # yet because there may be other tables that come along and will
16656 # want to share the file, and the file's comments will change to
16657 # mention them. So save for later.
16658 push @writables, $table;
16660 } # End of looping through the property and all its tables.
16661 } # End of looping through all properties.
16663 # Now have all the tables that will have files written for them. Do it.
16664 foreach my $table (@writables) {
16667 my $property = $table->property;
16668 my $is_property = ($table == $property);
16669 if (! $is_property) {
16671 # Match tables for the property go in lib/$subdirectory, which is
16672 # the property's name. Don't use the standard file name for this,
16673 # as may get an unfamiliar alias
16674 @directory = ($matches_directory, $property->external_name);
16678 @directory = $table->directory;
16679 $filename = $table->file;
16682 # Use specified filename if available, or default to property's
16683 # shortest name. We need an 8.3 safe filename (which means "an 8
16684 # safe" filename, since after the dot is only 'pl', which is < 3)
16685 # The 2nd parameter is if the filename shouldn't be changed, and
16686 # it shouldn't iff there is a hard-coded name for this table.
16687 $filename = construct_filename(
16688 $filename || $table->external_name,
16689 ! $filename, # mutable if no filename
16692 register_file_for_name($table, \@directory, $filename);
16694 # Only need to write one file when shared by more than one
16696 next if ! $is_property
16697 && ($table->leader != $table || $table->complement != 0);
16699 # Construct a nice comment to add to the file
16700 $table->set_final_comment;
16706 # Write out the pod file
16709 # And Heavy.pl, Name.pm, UCD.pl
16714 make_property_test_script() if $make_test_script;
16715 make_normalization_test_script() if $make_norm_test_script;
16719 my @white_space_separators = ( # This used only for making the test script.
16726 sub generate_separator($) {
16727 # This used only for making the test script. It generates the colon or
16728 # equal separator between the property and property value, with random
16729 # white space surrounding the separator
16733 return "" if $lhs eq ""; # No separator if there's only one (the r) side
16735 # Choose space before and after randomly
16736 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
16737 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
16739 # And return the whole complex, half the time using a colon, half the
16741 return $spaces_before
16742 . (rand() < 0.5) ? '=' : ':'
16746 sub generate_tests($$$$$) {
16747 # This used only for making the test script. It generates test cases that
16748 # are expected to compile successfully in perl. Note that the lhs and
16749 # rhs are assumed to already be as randomized as the caller wants.
16751 my $lhs = shift; # The property: what's to the left of the colon
16752 # or equals separator
16753 my $rhs = shift; # The property value; what's to the right
16754 my $valid_code = shift; # A code point that's known to be in the
16755 # table given by lhs=rhs; undef if table is
16757 my $invalid_code = shift; # A code point known to not be in the table;
16758 # undef if the table is all code points
16759 my $warning = shift;
16761 # Get the colon or equal
16762 my $separator = generate_separator($lhs);
16764 # The whole 'property=value'
16765 my $name = "$lhs$separator$rhs";
16768 # Create a complete set of tests, with complements.
16769 if (defined $valid_code) {
16770 push @output, <<"EOC"
16771 Expect(1, $valid_code, '\\p{$name}', $warning);
16772 Expect(0, $valid_code, '\\p{^$name}', $warning);
16773 Expect(0, $valid_code, '\\P{$name}', $warning);
16774 Expect(1, $valid_code, '\\P{^$name}', $warning);
16777 if (defined $invalid_code) {
16778 push @output, <<"EOC"
16779 Expect(0, $invalid_code, '\\p{$name}', $warning);
16780 Expect(1, $invalid_code, '\\p{^$name}', $warning);
16781 Expect(1, $invalid_code, '\\P{$name}', $warning);
16782 Expect(0, $invalid_code, '\\P{^$name}', $warning);
16788 sub generate_error($$$) {
16789 # This used only for making the test script. It generates test cases that
16790 # are expected to not only not match, but to be syntax or similar errors
16792 my $lhs = shift; # The property: what's to the left of the
16793 # colon or equals separator
16794 my $rhs = shift; # The property value; what's to the right
16795 my $already_in_error = shift; # Boolean; if true it's known that the
16796 # unmodified lhs and rhs will cause an error.
16797 # This routine should not force another one
16798 # Get the colon or equal
16799 my $separator = generate_separator($lhs);
16801 # Since this is an error only, don't bother to randomly decide whether to
16802 # put the error on the left or right side; and assume that the rhs is
16803 # loosely matched, again for convenience rather than rigor.
16804 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
16806 my $property = $lhs . $separator . $rhs;
16809 Error('\\p{$property}');
16810 Error('\\P{$property}');
16814 # These are used only for making the test script
16815 # XXX Maybe should also have a bad strict seps, which includes underscore.
16817 my @good_loose_seps = (
16824 my @bad_loose_seps = (
16829 sub randomize_stricter_name {
16830 # This used only for making the test script. Take the input name and
16831 # return a randomized, but valid version of it under the stricter matching
16835 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16837 # If the name looks like a number (integer, floating, or rational), do
16839 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
16842 my $separator = $3;
16844 # If there isn't a sign, part of the time add a plus
16845 # Note: Not testing having any denominator having a minus sign
16847 $sign = '+' if rand() <= .3;
16850 # And add 0 or more leading zeros.
16851 $name = $sign . ('0' x int rand(10)) . $number;
16853 if (defined $separator) {
16854 my $extra_zeros = '0' x int rand(10);
16856 if ($separator eq '.') {
16858 # Similarly, add 0 or more trailing zeros after a decimal
16860 $name .= $extra_zeros;
16864 # Or, leading zeros before the denominator
16865 $name =~ s,/,/$extra_zeros,;
16870 # For legibility of the test, only change the case of whole sections at a
16871 # time. To do this, first split into sections. The split returns the
16874 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
16875 trace $section if main::DEBUG && $to_trace;
16877 if (length $section > 1 && $section !~ /\D/) {
16879 # If the section is a sequence of digits, about half the time
16880 # randomly add underscores between some of them.
16883 # Figure out how many underscores to add. max is 1 less than
16884 # the number of digits. (But add 1 at the end to make sure
16885 # result isn't 0, and compensate earlier by subtracting 2
16887 my $num_underscores = int rand(length($section) - 2) + 1;
16889 # And add them evenly throughout, for convenience, not rigor
16891 my $spacing = (length($section) - 1)/ $num_underscores;
16892 my $temp = $section;
16894 for my $i (1 .. $num_underscores) {
16895 $section .= substr($temp, 0, $spacing, "") . '_';
16899 push @sections, $section;
16903 # Here not a sequence of digits. Change the case of the section
16905 my $switch = int rand(4);
16906 if ($switch == 0) {
16907 push @sections, uc $section;
16909 elsif ($switch == 1) {
16910 push @sections, lc $section;
16912 elsif ($switch == 2) {
16913 push @sections, ucfirst $section;
16916 push @sections, $section;
16920 trace "returning", join "", @sections if main::DEBUG && $to_trace;
16921 return join "", @sections;
16924 sub randomize_loose_name($;$) {
16925 # This used only for making the test script
16928 my $want_error = shift; # if true, make an error
16929 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16931 $name = randomize_stricter_name($name);
16934 push @parts, $good_loose_seps[rand(@good_loose_seps)];
16936 # Preserve trailing ones for the sake of not stripping the underscore from
16938 for my $part (split /[-\s_]+ (?= . )/, $name) {
16940 if ($want_error and rand() < 0.3) {
16941 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
16945 push @parts, $good_loose_seps[rand(@good_loose_seps)];
16948 push @parts, $part;
16950 my $new = join("", @parts);
16951 trace "$name => $new" if main::DEBUG && $to_trace;
16954 if (rand() >= 0.5) {
16955 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
16958 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
16964 # Used to make sure don't generate duplicate test cases.
16965 my %test_generated;
16967 sub make_property_test_script() {
16968 # This used only for making the test script
16969 # this written directly -- it's huge.
16971 print "Making test script\n" if $verbosity >= $PROGRESS;
16973 # This uses randomness to test different possibilities without testing all
16974 # possibilities. To ensure repeatability, set the seed to 0. But if
16975 # tests are added, it will perturb all later ones in the .t file
16978 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
16980 # Keep going down an order of magnitude
16981 # until find that adding this quantity to
16982 # 1 remains 1; but put an upper limit on
16983 # this so in case this algorithm doesn't
16984 # work properly on some platform, that we
16985 # won't loop forever.
16987 my $min_floating_slop = 1;
16988 while (1+ $min_floating_slop != 1
16991 my $next = $min_floating_slop / 10;
16992 last if $next == 0; # If underflows,
16994 $min_floating_slop = $next;
16997 # It doesn't matter whether the elements of this array contain single lines
16998 # or multiple lines. main::write doesn't count the lines.
17001 # Sort these so get results in same order on different runs of this
17003 foreach my $property (sort { $a->name cmp $b->name } property_ref('*')) {
17004 foreach my $table (sort { $a->name cmp $b->name } $property->tables) {
17006 # Find code points that match, and don't match this table.
17007 my $valid = $table->get_valid_code_point;
17008 my $invalid = $table->get_invalid_code_point;
17009 my $warning = ($table->status eq $DEPRECATED)
17013 # Test each possible combination of the property's aliases with
17014 # the table's. If this gets to be too many, could do what is done
17015 # in the set_final_comment() for Tables
17016 my @table_aliases = $table->aliases;
17017 my @property_aliases = $table->property->aliases;
17019 # Every property can be optionally be prefixed by 'Is_', so test
17020 # that those work, by creating such a new alias for each
17021 # pre-existing one.
17022 push @property_aliases, map { Alias->new("Is_" . $_->name,
17024 $_->make_re_pod_entry,
17025 $_->ok_as_filename,
17029 } @property_aliases;
17030 my $max = max(scalar @table_aliases, scalar @property_aliases);
17031 for my $j (0 .. $max - 1) {
17033 # The current alias for property is the next one on the list,
17034 # or if beyond the end, start over. Similarly for table
17036 = $property_aliases[$j % @property_aliases]->name;
17038 $property_name = "" if $table->property == $perl;
17039 my $table_alias = $table_aliases[$j % @table_aliases];
17040 my $table_name = $table_alias->name;
17041 my $loose_match = $table_alias->loose_match;
17043 # If the table doesn't have a file, any test for it is
17044 # already guaranteed to be in error
17045 my $already_error = ! $table->file_path;
17047 # Generate error cases for this alias.
17048 push @output, generate_error($property_name,
17052 # If the table is guaranteed to always generate an error,
17053 # quit now without generating success cases.
17054 next if $already_error;
17056 # Now for the success cases.
17058 if ($loose_match) {
17060 # For loose matching, create an extra test case for the
17062 my $standard = standardize($table_name);
17064 # $test_name should be a unique combination for each test
17065 # case; used just to avoid duplicate tests
17066 my $test_name = "$property_name=$standard";
17068 # Don't output duplicate test cases.
17069 if (! exists $test_generated{$test_name}) {
17070 $test_generated{$test_name} = 1;
17071 push @output, generate_tests($property_name,
17078 $random = randomize_loose_name($table_name)
17080 else { # Stricter match
17081 $random = randomize_stricter_name($table_name);
17084 # Now for the main test case for this alias.
17085 my $test_name = "$property_name=$random";
17086 if (! exists $test_generated{$test_name}) {
17087 $test_generated{$test_name} = 1;
17088 push @output, generate_tests($property_name,
17095 # If the name is a rational number, add tests for the
17096 # floating point equivalent.
17097 if ($table_name =~ qr{/}) {
17099 # Calculate the float, and find just the fraction.
17100 my $float = eval $table_name;
17101 my ($whole, $fraction)
17102 = $float =~ / (.*) \. (.*) /x;
17104 # Starting with one digit after the decimal point,
17105 # create a test for each possible precision (number of
17106 # digits past the decimal point) until well beyond the
17107 # native number found on this machine. (If we started
17108 # with 0 digits, it would be an integer, which could
17109 # well match an unrelated table)
17111 for my $i (1 .. $min_floating_slop + 3) {
17112 my $table_name = sprintf("%.*f", $i, $float);
17113 if ($i < $MIN_FRACTION_LENGTH) {
17115 # If the test case has fewer digits than the
17116 # minimum acceptable precision, it shouldn't
17117 # succeed, so we expect an error for it.
17118 # E.g., 2/3 = .7 at one decimal point, and we
17119 # shouldn't say it matches .7. We should make
17120 # it be .667 at least before agreeing that the
17121 # intent was to match 2/3. But at the
17122 # less-than- acceptable level of precision, it
17123 # might actually match an unrelated number.
17124 # So don't generate a test case if this
17125 # conflating is possible. In our example, we
17126 # don't want 2/3 matching 7/10, if there is
17127 # a 7/10 code point.
17129 (keys %nv_floating_to_rational)
17132 if abs($table_name - $existing)
17133 < $MAX_FLOATING_SLOP;
17135 push @output, generate_error($property_name,
17137 1 # 1 => already an error
17142 # Here the number of digits exceeds the
17143 # minimum we think is needed. So generate a
17144 # success test case for it.
17145 push @output, generate_tests($property_name,
17163 (map {"Test_X('$_');\n"} @backslash_X_tests),
17168 sub make_normalization_test_script() {
17169 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
17171 my $n_path = 'TestNorm.pl';
17173 unshift @normalization_tests, <<'END';
17177 sub ord_string { # Convert packed ords to printable string
17179 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
17180 unpack "U*", shift) . "'";
17181 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
17185 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
17186 my $display_source = ord_string($source);
17187 my $display_nfc = ord_string($nfc);
17188 my $display_nfd = ord_string($nfd);
17189 my $display_nfkc = ord_string($nfkc);
17190 my $display_nfkd = ord_string($nfkd);
17192 use Unicode::Normalize;
17194 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
17195 # nfkc == toNFC(nfkc) == toNFC(nfkd)
17198 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
17199 # nfkd == toNFD(nfkc) == toNFD(nfkd)
17202 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
17203 # toNFKC(nfkc) == toNFKC(nfkd)
17206 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
17207 # toNFKD(nfkc) == toNFKD(nfkd)
17209 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
17210 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
17211 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
17212 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
17213 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
17215 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
17216 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
17217 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
17218 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
17219 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
17221 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
17222 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
17223 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
17224 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
17225 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
17227 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
17228 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
17229 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
17230 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
17231 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
17238 @normalization_tests,
17244 # This is a list of the input files and how to handle them. The files are
17245 # processed in their order in this list. Some reordering is possible if
17246 # desired, but the v0 files should be first, and the extracted before the
17247 # others except DAge.txt (as data in an extracted file can be over-ridden by
17248 # the non-extracted. Some other files depend on data derived from an earlier
17249 # file, like UnicodeData requires data from Jamo, and the case changing and
17250 # folding requires data from Unicode. Mostly, it is safest to order by first
17251 # version releases in (except the Jamo). DAge.txt is read before the
17252 # extracted ones because of the rarely used feature $compare_versions. In the
17253 # unlikely event that there were ever an extracted file that contained the Age
17254 # property information, it would have to go in front of DAge.
17256 # The version strings allow the program to know whether to expect a file or
17257 # not, but if a file exists in the directory, it will be processed, even if it
17258 # is in a version earlier than expected, so you can copy files from a later
17259 # release into an earlier release's directory.
17260 my @input_file_objects = (
17261 Input_file->new('PropertyAliases.txt', v0,
17262 Handler => \&process_PropertyAliases,
17264 Input_file->new(undef, v0, # No file associated with this
17265 Progress_Message => 'Finishing property setup',
17266 Handler => \&finish_property_setup,
17268 Input_file->new('PropValueAliases.txt', v0,
17269 Handler => \&process_PropValueAliases,
17270 Has_Missings_Defaults => $NOT_IGNORED,
17272 Input_file->new('DAge.txt', v3.2.0,
17273 Has_Missings_Defaults => $NOT_IGNORED,
17276 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
17277 Property => 'General_Category',
17279 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
17280 Property => 'Canonical_Combining_Class',
17281 Has_Missings_Defaults => $NOT_IGNORED,
17283 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
17284 Property => 'Numeric_Type',
17285 Has_Missings_Defaults => $NOT_IGNORED,
17287 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
17288 Property => 'East_Asian_Width',
17289 Has_Missings_Defaults => $NOT_IGNORED,
17291 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
17292 Property => 'Line_Break',
17293 Has_Missings_Defaults => $NOT_IGNORED,
17295 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
17296 Property => 'Bidi_Class',
17297 Has_Missings_Defaults => $NOT_IGNORED,
17299 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
17300 Property => 'Decomposition_Type',
17301 Has_Missings_Defaults => $NOT_IGNORED,
17303 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
17304 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
17305 Property => 'Numeric_Value',
17306 Each_Line_Handler => \&filter_numeric_value_line,
17307 Has_Missings_Defaults => $NOT_IGNORED,
17309 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
17310 Property => 'Joining_Group',
17311 Has_Missings_Defaults => $NOT_IGNORED,
17314 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
17315 Property => 'Joining_Type',
17316 Has_Missings_Defaults => $NOT_IGNORED,
17318 Input_file->new('Jamo.txt', v2.0.0,
17319 Property => 'Jamo_Short_Name',
17320 Each_Line_Handler => \&filter_jamo_line,
17322 Input_file->new('UnicodeData.txt', v1.1.5,
17323 Pre_Handler => \&setup_UnicodeData,
17325 # We clean up this file for some early versions.
17326 Each_Line_Handler => [ (($v_version lt v2.0.0 )
17328 : ($v_version eq v2.1.5)
17329 ? \&filter_v2_1_5_ucd
17331 # And for 5.14 Perls with 6.0,
17332 # have to also make changes
17333 : ($v_version ge v6.0.0
17338 # Early versions did not have the
17339 # proper Unicode_1 names for the
17341 (($v_version lt v3.0.0)
17342 ? \&filter_early_U1_names
17345 # Early versions did not correctly
17346 # use the later method for giving
17347 # decimal digit values
17348 (($v_version le v3.2.0)
17349 ? \&filter_bad_Nd_ucd
17352 # And the main filter
17353 \&filter_UnicodeData_line,
17355 EOF_Handler => \&EOF_UnicodeData,
17357 Input_file->new('ArabicShaping.txt', v2.0.0,
17358 Each_Line_Handler =>
17359 [ ($v_version lt 4.1.0)
17360 ? \&filter_old_style_arabic_shaping
17362 \&filter_arabic_shaping_line,
17364 Has_Missings_Defaults => $NOT_IGNORED,
17366 Input_file->new('Blocks.txt', v2.0.0,
17367 Property => 'Block',
17368 Has_Missings_Defaults => $NOT_IGNORED,
17369 Each_Line_Handler => \&filter_blocks_lines
17371 Input_file->new('PropList.txt', v2.0.0,
17372 Each_Line_Handler => (($v_version lt v3.1.0)
17373 ? \&filter_old_style_proplist
17376 Input_file->new('Unihan.txt', v2.0.0,
17377 Pre_Handler => \&setup_unihan,
17379 Each_Line_Handler => \&filter_unihan_line,
17381 Input_file->new('SpecialCasing.txt', v2.1.8,
17382 Each_Line_Handler => ($v_version eq 2.1.8)
17383 ? \&filter_2_1_8_special_casing_line
17384 : \&filter_special_casing_line,
17385 Pre_Handler => \&setup_special_casing,
17386 Has_Missings_Defaults => $IGNORED,
17389 'LineBreak.txt', v3.0.0,
17390 Has_Missings_Defaults => $NOT_IGNORED,
17391 Property => 'Line_Break',
17392 # Early versions had problematic syntax
17393 Each_Line_Handler => (($v_version lt v3.1.0)
17394 ? \&filter_early_ea_lb
17397 Input_file->new('EastAsianWidth.txt', v3.0.0,
17398 Property => 'East_Asian_Width',
17399 Has_Missings_Defaults => $NOT_IGNORED,
17400 # Early versions had problematic syntax
17401 Each_Line_Handler => (($v_version lt v3.1.0)
17402 ? \&filter_early_ea_lb
17405 Input_file->new('CompositionExclusions.txt', v3.0.0,
17406 Property => 'Composition_Exclusion',
17408 Input_file->new('BidiMirroring.txt', v3.0.1,
17409 Property => 'Bidi_Mirroring_Glyph',
17410 Has_Missings_Defaults => ($v_version lt v6.2.0)
17412 # Is <none> which doesn't mean
17413 # anything to us, we will use the
17418 Input_file->new("NormTest.txt", v3.0.0,
17419 Handler => \&process_NormalizationsTest,
17420 Skip => ($make_norm_test_script) ? 0 : 'Validation Tests',
17422 Input_file->new('CaseFolding.txt', v3.0.1,
17423 Pre_Handler => \&setup_case_folding,
17424 Each_Line_Handler =>
17425 [ ($v_version lt v3.1.0)
17426 ? \&filter_old_style_case_folding
17428 \&filter_case_folding_line
17430 Has_Missings_Defaults => $IGNORED,
17432 Input_file->new('DCoreProperties.txt', v3.1.0,
17433 # 5.2 changed this file
17434 Has_Missings_Defaults => (($v_version ge v5.2.0)
17438 Input_file->new('Scripts.txt', v3.1.0,
17439 Property => 'Script',
17440 Has_Missings_Defaults => $NOT_IGNORED,
17442 Input_file->new('DNormalizationProps.txt', v3.1.0,
17443 Has_Missings_Defaults => $NOT_IGNORED,
17444 Each_Line_Handler => (($v_version lt v4.0.1)
17445 ? \&filter_old_style_normalization_lines
17448 Input_file->new('HangulSyllableType.txt', v0,
17449 Has_Missings_Defaults => $NOT_IGNORED,
17450 Property => 'Hangul_Syllable_Type',
17451 Pre_Handler => ($v_version lt v4.0.0)
17455 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
17456 Property => 'Word_Break',
17457 Has_Missings_Defaults => $NOT_IGNORED,
17459 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v0,
17460 Property => 'Grapheme_Cluster_Break',
17461 Has_Missings_Defaults => $NOT_IGNORED,
17462 Pre_Handler => ($v_version lt v4.1.0)
17466 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
17467 Handler => \&process_GCB_test,
17469 Input_file->new("$AUXILIARY/LBTest.txt", v4.1.0,
17470 Skip => 'Validation Tests',
17472 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
17473 Skip => 'Validation Tests',
17475 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
17476 Skip => 'Validation Tests',
17478 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
17479 Property => 'Sentence_Break',
17480 Has_Missings_Defaults => $NOT_IGNORED,
17482 Input_file->new('NamedSequences.txt', v4.1.0,
17483 Handler => \&process_NamedSequences
17485 Input_file->new('NameAliases.txt', v0,
17486 Property => 'Name_Alias',
17487 Pre_Handler => ($v_version le v6.0.0)
17488 ? \&setup_early_name_alias
17490 Each_Line_Handler => ($v_version le v6.0.0)
17491 ? \&filter_early_version_name_alias_line
17492 : \&filter_later_version_name_alias_line,
17494 Input_file->new("BidiTest.txt", v5.2.0,
17495 Skip => 'Validation Tests',
17497 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
17499 Each_Line_Handler => \&filter_unihan_line,
17501 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
17503 Each_Line_Handler => \&filter_unihan_line,
17505 Input_file->new('UnihanIRGSources.txt', v5.2.0,
17507 Pre_Handler => \&setup_unihan,
17508 Each_Line_Handler => \&filter_unihan_line,
17510 Input_file->new('UnihanNumericValues.txt', v5.2.0,
17512 Each_Line_Handler => \&filter_unihan_line,
17514 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
17516 Each_Line_Handler => \&filter_unihan_line,
17518 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
17520 Each_Line_Handler => \&filter_unihan_line,
17522 Input_file->new('UnihanReadings.txt', v5.2.0,
17524 Each_Line_Handler => \&filter_unihan_line,
17526 Input_file->new('UnihanVariants.txt', v5.2.0,
17528 Each_Line_Handler => \&filter_unihan_line,
17530 Input_file->new('ScriptExtensions.txt', v6.0.0,
17531 Property => 'Script_Extensions',
17532 Pre_Handler => \&setup_script_extensions,
17533 Each_Line_Handler => \&filter_script_extensions_line,
17534 Has_Missings_Defaults => (($v_version le v6.0.0)
17538 # The two Indic files are actually available starting in v6.0.0, but their
17539 # property values are missing from PropValueAliases.txt in that release,
17540 # so that further work would have to be done to get them to work properly
17541 # for that release.
17542 Input_file->new('IndicMatraCategory.txt', v6.1.0,
17543 Property => 'Indic_Matra_Category',
17544 Has_Missings_Defaults => $NOT_IGNORED,
17545 Skip => "Provisional; for the analysis and processing of Indic scripts",
17547 Input_file->new('IndicSyllabicCategory.txt', v6.1.0,
17548 Property => 'Indic_Syllabic_Category',
17549 Has_Missings_Defaults => $NOT_IGNORED,
17550 Skip => "Provisional; for the analysis and processing of Indic scripts",
17554 # End of all the preliminaries.
17557 if ($compare_versions) {
17558 Carp::my_carp(<<END
17559 Warning. \$compare_versions is set. Output is not suitable for production
17564 # Put into %potential_files a list of all the files in the directory structure
17565 # that could be inputs to this program, excluding those that we should ignore.
17566 # Use absolute file names because it makes it easier across machine types.
17567 my @ignored_files_full_names = map { File::Spec->rel2abs(
17568 internal_file_to_platform($_))
17569 } keys %ignored_files;
17572 return unless /\.txt$/i; # Some platforms change the name's case
17573 my $full = lc(File::Spec->rel2abs($_));
17574 $potential_files{$full} = 1
17575 if ! grep { $full eq lc($_) } @ignored_files_full_names;
17578 }, File::Spec->curdir());
17580 my @mktables_list_output_files;
17581 my $old_start_time = 0;
17583 if (! -e $file_list) {
17584 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
17585 $write_unchanged_files = 1;
17586 } elsif ($write_unchanged_files) {
17587 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
17590 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
17592 if (! open $file_handle, "<", $file_list) {
17593 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
17599 # Read and parse mktables.lst, placing the results from the first part
17600 # into @input, and the second part into @mktables_list_output_files
17601 for my $list ( \@input, \@mktables_list_output_files ) {
17602 while (<$file_handle>) {
17603 s/^ \s+ | \s+ $//xg;
17604 if (/^ \s* \# .* Autogenerated\ starting\ on\ (\d+)/x) {
17605 $old_start_time = $1;
17607 next if /^ \s* (?: \# .* )? $/x;
17609 my ( $file ) = split /\t/;
17610 push @$list, $file;
17612 @$list = uniques(@$list);
17616 # Look through all the input files
17617 foreach my $input (@input) {
17618 next if $input eq 'version'; # Already have checked this.
17620 # Ignore if doesn't exist. The checking about whether we care or
17621 # not is done via the Input_file object.
17622 next if ! file_exists($input);
17624 # The paths are stored with relative names, and with '/' as the
17625 # delimiter; convert to absolute on this machine
17626 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
17627 $potential_files{lc $full} = 1
17628 if ! grep { lc($full) eq lc($_) } @ignored_files_full_names;
17632 close $file_handle;
17637 # Here wants to process all .txt files in the directory structure.
17638 # Convert them to full path names. They are stored in the platform's
17641 foreach my $object (@input_file_objects) {
17642 my $file = $object->file;
17643 next unless defined $file;
17644 push @known_files, File::Spec->rel2abs($file);
17647 my @unknown_input_files;
17648 foreach my $file (keys %potential_files) { # The keys are stored in lc
17649 next if grep { $file eq lc($_) } @known_files;
17651 # Here, the file is unknown to us. Get relative path name
17652 $file = File::Spec->abs2rel($file);
17653 push @unknown_input_files, $file;
17655 # What will happen is we create a data structure for it, and add it to
17656 # the list of input files to process. First get the subdirectories
17658 my (undef, $directories, undef) = File::Spec->splitpath($file);
17659 $directories =~ s;/$;;; # Can have extraneous trailing '/'
17660 my @directories = File::Spec->splitdir($directories);
17662 # If the file isn't extracted (meaning none of the directories is the
17663 # extracted one), just add it to the end of the list of inputs.
17664 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
17665 push @input_file_objects, Input_file->new($file, v0);
17669 # Here, the file is extracted. It needs to go ahead of most other
17670 # processing. Search for the first input file that isn't a
17671 # special required property (that is, find one whose first_release
17672 # is non-0), and isn't extracted. Also, the Age property file is
17673 # processed before the extracted ones, just in case
17674 # $compare_versions is set.
17675 for (my $i = 0; $i < @input_file_objects; $i++) {
17676 if ($input_file_objects[$i]->first_released ne v0
17677 && lc($input_file_objects[$i]->file) ne 'dage.txt'
17678 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
17680 splice @input_file_objects, $i, 0,
17681 Input_file->new($file, v0);
17688 if (@unknown_input_files) {
17689 print STDERR simple_fold(join_lines(<<END
17691 The following files are unknown as to how to handle. Assuming they are
17692 typical property files. You'll know by later error messages if it worked or
17695 ) . " " . join(", ", @unknown_input_files) . "\n\n");
17697 } # End of looking through directory structure for more .txt files.
17699 # Create the list of input files from the objects we have defined, plus
17701 my @input_files = qw(version Makefile);
17702 foreach my $object (@input_file_objects) {
17703 my $file = $object->file;
17704 next if ! defined $file; # Not all objects have files
17705 next if $object->optional && ! -e $file;
17706 push @input_files, $file;
17709 if ( $verbosity >= $VERBOSE ) {
17710 print "Expecting ".scalar( @input_files )." input files. ",
17711 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
17714 # We set $most_recent to be the most recently changed input file, including
17715 # this program itself (done much earlier in this file)
17716 foreach my $in (@input_files) {
17717 next unless -e $in; # Keep going even if missing a file
17718 my $mod_time = (stat $in)[9];
17719 $most_recent = $mod_time if $mod_time > $most_recent;
17721 # See that the input files have distinct names, to warn someone if they
17722 # are adding a new one
17724 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
17725 $directories =~ s;/$;;; # Can have extraneous trailing '/'
17726 my @directories = File::Spec->splitdir($directories);
17727 my $base = $file =~ s/\.txt$//;
17728 construct_filename($file, 'mutable', \@directories);
17732 # We use 'Makefile' just to see if it has changed since the last time we
17733 # rebuilt. Now discard it.
17734 @input_files = grep { $_ ne 'Makefile' } @input_files;
17736 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
17737 || ! scalar @mktables_list_output_files # or if no outputs known
17738 || $old_start_time < $most_recent; # or out-of-date
17740 # Now we check to see if any output files are older than youngest, if
17741 # they are, we need to continue on, otherwise we can presumably bail.
17743 foreach my $out (@mktables_list_output_files) {
17744 if ( ! file_exists($out)) {
17745 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
17749 #local $to_trace = 1 if main::DEBUG;
17750 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
17751 if ( (stat $out)[9] <= $most_recent ) {
17752 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
17753 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
17760 print "Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
17763 print "Must rebuild tables.\n" if $verbosity >= $VERBOSE;
17765 # Ready to do the major processing. First create the perl pseudo-property.
17766 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
17768 # Process each input file
17769 foreach my $file (@input_file_objects) {
17773 # Finish the table generation.
17775 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
17778 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
17781 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
17782 add_perl_synonyms();
17784 print "Writing tables\n" if $verbosity >= $PROGRESS;
17785 write_all_tables();
17787 # Write mktables.lst
17788 if ( $file_list and $make_list ) {
17790 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
17791 foreach my $file (@input_files, @files_actually_output) {
17792 my (undef, $directories, $file) = File::Spec->splitpath($file);
17793 my @directories = File::Spec->splitdir($directories);
17794 $file = join '/', @directories, $file;
17798 if (! open $ofh,">",$file_list) {
17799 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
17803 my $localtime = localtime $start_time;
17804 print $ofh <<"END";
17806 # $file_list -- File list for $0.
17808 # Autogenerated starting on $start_time ($localtime)
17810 # - First section is input files
17811 # ($0 itself is not listed but is automatically considered an input)
17812 # - Section separator is /^=+\$/
17813 # - Second section is a list of output files.
17814 # - Lines matching /^\\s*#/ are treated as comments
17815 # which along with blank lines are ignored.
17821 print $ofh "$_\n" for sort(@input_files);
17822 print $ofh "\n=================================\n# Output files:\n\n";
17823 print $ofh "$_\n" for sort @files_actually_output;
17824 print $ofh "\n# ",scalar(@input_files)," input files\n",
17825 "# ",scalar(@files_actually_output)+1," output files\n\n",
17828 or Carp::my_carp("Failed to close $ofh: $!");
17830 print "Filelist has ",scalar(@input_files)," input files and ",
17831 scalar(@files_actually_output)+1," output files\n"
17832 if $verbosity >= $VERBOSE;
17836 # Output these warnings unless -q explicitly specified.
17837 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
17838 if (@unhandled_properties) {
17839 print "\nProperties and tables that unexpectedly have no code points\n";
17840 foreach my $property (sort @unhandled_properties) {
17841 print $property, "\n";
17845 if (%potential_files) {
17846 print "\nInput files that are not considered:\n";
17847 foreach my $file (sort keys %potential_files) {
17848 print File::Spec->abs2rel($file), "\n";
17851 print "\nAll done\n" if $verbosity >= $VERBOSE;
17855 # TRAILING CODE IS USED BY make_property_test_script()
17861 # If run outside the normal test suite on an ASCII platform, you can
17862 # just create a latin1_to_native() function that just returns its
17863 # inputs, because that's the only function used from test.pl
17866 # Test qr/\X/ and the \p{} regular expression constructs. This file is
17867 # constructed by mktables from the tables it generates, so if mktables is
17868 # buggy, this won't necessarily catch those bugs. Tests are generated for all
17869 # feasible properties; a few aren't currently feasible; see
17870 # is_code_point_usable() in mktables for details.
17872 # Standard test packages are not used because this manipulates SIG_WARN. It
17873 # exits 0 if every non-skipped test succeeded; -1 if any failed.
17879 my $expected = shift;
17882 my $warning_type = shift; # Type of warning message, like 'deprecated'
17884 my $line = (caller)[2];
17885 $ord = ord(latin1_to_native(chr($ord)));
17887 # Convert the code point to hex form
17888 my $string = sprintf "\"\\x{%04X}\"", $ord;
17892 # The first time through, use all warnings. If the input should generate
17893 # a warning, add another time through with them turned off
17894 push @tests, "no warnings '$warning_type';" if $warning_type;
17896 foreach my $no_warnings (@tests) {
17898 # Store any warning messages instead of outputting them
17899 local $SIG{__WARN__} = $SIG{__WARN__};
17900 my $warning_message;
17901 $SIG{__WARN__} = sub { $warning_message = $_[0] };
17905 # A string eval is needed because of the 'no warnings'.
17906 # Assumes no parens in the regular expression
17907 my $result = eval "$no_warnings
17908 my \$RegObj = qr($regex);
17909 $string =~ \$RegObj ? 1 : 0";
17910 if (not defined $result) {
17911 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
17914 elsif ($result ^ $expected) {
17915 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
17918 elsif ($warning_message) {
17919 if (! $warning_type || ($warning_type && $no_warnings)) {
17920 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
17924 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
17927 elsif ($warning_type && ! $no_warnings) {
17928 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
17932 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
17941 if (eval { 'x' =~ qr/$regex/; 1 }) {
17943 my $line = (caller)[2];
17944 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
17947 my $line = (caller)[2];
17948 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
17953 # GCBTest.txt character that separates grapheme clusters
17954 my $breakable_utf8 = my $breakable = chr(0xF7);
17955 utf8::upgrade($breakable_utf8);
17957 # GCBTest.txt character that indicates that the adjoining code points are part
17958 # of the same grapheme cluster
17959 my $nobreak_utf8 = my $nobreak = chr(0xD7);
17960 utf8::upgrade($nobreak_utf8);
17963 # Test qr/\X/ matches. The input is a line from auxiliary/GCBTest.txt
17964 # Each such line is a sequence of code points given by their hex numbers,
17965 # separated by the two characters defined just before this subroutine that
17966 # indicate that either there can or cannot be a break between the adjacent
17967 # code points. If there isn't a break, that means the sequence forms an
17968 # extended grapheme cluster, which means that \X should match the whole
17969 # thing. If there is a break, \X should stop there. This is all
17970 # converted by this routine into a match:
17971 # $string =~ /(\X)/,
17972 # Each \X should match the next cluster; and that is what is checked.
17974 my $template = shift;
17976 my $line = (caller)[2];
17978 # The line contains characters above the ASCII range, but in Latin1. It
17979 # may or may not be in utf8, and if it is, it may or may not know it. So,
17980 # convert these characters to 8 bits. If knows is in utf8, simply
17982 if (utf8::is_utf8($template)) {
17983 utf8::downgrade($template);
17986 # Otherwise, if it is in utf8, but doesn't know it, the next lines
17987 # convert the two problematic characters to their 8-bit equivalents.
17988 # If it isn't in utf8, they don't harm anything.
17990 $template =~ s/$nobreak_utf8/$nobreak/g;
17991 $template =~ s/$breakable_utf8/$breakable/g;
17994 # Get rid of the leading and trailing breakables
17995 $template =~ s/^ \s* $breakable \s* //x;
17996 $template =~ s/ \s* $breakable \s* $ //x;
17998 # And no-breaks become just a space.
17999 $template =~ s/ \s* $nobreak \s* / /xg;
18001 # Split the input into segments that are breakable between them.
18002 my @segments = split /\s*$breakable\s*/, $template;
18005 my $display_string = "";
18007 my @should_display;
18009 # Convert the code point sequence in each segment into a Perl string of
18011 foreach my $segment (@segments) {
18012 my @code_points = split /\s+/, $segment;
18013 my $this_string = "";
18014 my $this_display = "";
18015 foreach my $code_point (@code_points) {
18016 $this_string .= latin1_to_native(chr(hex $code_point));
18017 $this_display .= "\\x{$code_point}";
18020 # The next cluster should match the string in this segment.
18021 push @should_match, $this_string;
18022 push @should_display, $this_display;
18023 $string .= $this_string;
18024 $display_string .= $this_display;
18027 # If a string can be represented in both non-ut8 and utf8, test both cases
18029 for my $to_upgrade (0 .. 1) {
18033 # If already in utf8, would just be a repeat
18034 next UPGRADE if utf8::is_utf8($string);
18036 utf8::upgrade($string);
18039 # Finally, do the \X match.
18040 my @matches = $string =~ /(\X)/g;
18042 # Look through each matched cluster to verify that it matches what we
18044 my $min = (@matches < @should_match) ? @matches : @should_match;
18045 for my $i (0 .. $min - 1) {
18047 if ($matches[$i] eq $should_match[$i]) {
18048 print "ok $Tests - ";
18050 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
18052 print "And \\X #", $i + 1,
18054 print " correctly matched $should_display[$i]; line $line\n";
18056 $matches[$i] = join("", map { sprintf "\\x{%04X}", $_ }
18057 unpack("U*", $matches[$i]));
18058 print "not ok $Tests - In \"$display_string\" =~ /(\\X)/g, \\X #",
18060 " should have matched $should_display[$i]",
18061 " but instead matched $matches[$i]",
18062 ". Abandoning rest of line $line\n";
18067 # And the number of matches should equal the number of expected matches.
18069 if (@matches == @should_match) {
18070 print "ok $Tests - Nothing was left over; line $line\n";
18072 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line\n";
18080 print "1..$Tests\n";
18081 exit($Fails ? -1 : 0);
18084 Error('\p{Script=InGreek}'); # Bug #69018
18085 Test_X("1100 $nobreak 1161"); # Bug #70940
18086 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18087 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18088 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726