3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # There was an attempt when this was first rewritten to make it 5.8
8 # compatible, but that has now been abandoned, and newer constructs are used
11 # NOTE: this script can run quite slowly in older/slower systems.
12 # It can also consume a lot of memory (128 MB or more), you may need
13 # to raise your process resource limits (e.g. in bash, "ulimit -a"
14 # to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
17 BEGIN { # Get the time the script started running; do it at compilation to
18 # get it as close as possible
34 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
35 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
37 sub NON_ASCII_PLATFORM { ord("A") != 65 }
39 # When a new version of Unicode is published, unfortunately the algorithms for
40 # dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
41 # manually. The changes may or may not be backward compatible with older
42 # releases. The code is in regen/mk_invlist.pl and regexec.c. Make the
43 # changes, then come back here and set the variable below to what version the
44 # code is expecting. If a newer version of Unicode is being compiled than
45 # expected, a warning will be generated. If an older version is being
46 # compiled, any bounds tests that fail in the generated test file (-maketest
47 # option) will be marked as TODO.
48 my $version_of_mk_invlist_bounds = v10.0.0;
50 ##########################################################################
52 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
53 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
54 # a pod file and .t files, depending on option parameters.
56 # The structure of this file is:
57 # First these introductory comments; then
58 # code needed for everywhere, such as debugging stuff; then
59 # code to handle input parameters; then
60 # data structures likely to be of external interest (some of which depend on
61 # the input parameters, so follows them; then
62 # more data structures and subroutine and package (class) definitions; then
63 # the small actual loop to process the input files and finish up; then
64 # a __DATA__ section, for the .t tests
66 # This program works on all releases of Unicode so far. The outputs have been
67 # scrutinized most intently for release 5.1. The others have been checked for
68 # somewhat more than just sanity. It can handle all non-provisional Unicode
69 # character properties in those releases.
71 # This program is mostly about Unicode character (or code point) properties.
72 # A property describes some attribute or quality of a code point, like if it
73 # is lowercase or not, its name, what version of Unicode it was first defined
74 # in, or what its uppercase equivalent is. Unicode deals with these disparate
75 # possibilities by making all properties into mappings from each code point
76 # into some corresponding value. In the case of it being lowercase or not,
77 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
78 # property maps each Unicode code point to a single value, called a "property
79 # value". (Some more recently defined properties, map a code point to a set
82 # When using a property in a regular expression, what is desired isn't the
83 # mapping of the code point to its property's value, but the reverse (or the
84 # mathematical "inverse relation"): starting with the property value, "Does a
85 # code point map to it?" These are written in a "compound" form:
86 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
87 # files containing the lists of code points that map to each such regular
88 # expression property value, one file per list
90 # There is also a single form shortcut that Perl adds for many of the commonly
91 # used properties. This happens for all binary properties, plus script,
92 # general_category, and block properties.
94 # Thus the outputs of this program are files. There are map files, mostly in
95 # the 'To' directory; and there are list files for use in regular expression
96 # matching, all in subdirectories of the 'lib' directory, with each
97 # subdirectory being named for the property that the lists in it are for.
98 # Bookkeeping, test, and documentation files are also generated.
100 my $matches_directory = 'lib'; # Where match (\p{}) files go.
101 my $map_directory = 'To'; # Where map files go.
105 # The major data structures of this program are Property, of course, but also
106 # Table. There are two kinds of tables, very similar to each other.
107 # "Match_Table" is the data structure giving the list of code points that have
108 # a particular property value, mentioned above. There is also a "Map_Table"
109 # data structure which gives the property's mapping from code point to value.
110 # There are two structures because the match tables need to be combined in
111 # various ways, such as constructing unions, intersections, complements, etc.,
112 # and the map ones don't. And there would be problems, perhaps subtle, if
113 # a map table were inadvertently operated on in some of those ways.
114 # The use of separate classes with operations defined on one but not the other
115 # prevents accidentally confusing the two.
117 # At the heart of each table's data structure is a "Range_List", which is just
118 # an ordered list of "Ranges", plus ancillary information, and methods to
119 # operate on them. A Range is a compact way to store property information.
120 # Each range has a starting code point, an ending code point, and a value that
121 # is meant to apply to all the code points between the two end points,
122 # inclusive. For a map table, this value is the property value for those
123 # code points. Two such ranges could be written like this:
124 # 0x41 .. 0x5A, 'Upper',
125 # 0x61 .. 0x7A, 'Lower'
127 # Each range also has a type used as a convenience to classify the values.
128 # Most ranges in this program will be Type 0, or normal, but there are some
129 # ranges that have a non-zero type. These are used only in map tables, and
130 # are for mappings that don't fit into the normal scheme of things. Mappings
131 # that require a hash entry to communicate with utf8.c are one example;
132 # another example is mappings for charnames.pm to use which indicate a name
133 # that is algorithmically determinable from its code point (and the reverse).
134 # These are used to significantly compact these tables, instead of listing
135 # each one of the tens of thousands individually.
137 # In a match table, the value of a range is irrelevant (and hence the type as
138 # well, which will always be 0), and arbitrarily set to the empty string.
139 # Using the example above, there would be two match tables for those two
140 # entries, one named Upper would contain the 0x41..0x5A range, and the other
141 # named Lower would contain 0x61..0x7A.
143 # Actually, there are two types of range lists, "Range_Map" is the one
144 # associated with map tables, and "Range_List" with match tables.
145 # Again, this is so that methods can be defined on one and not the others so
146 # as to prevent operating on them in incorrect ways.
148 # Eventually, most tables are written out to files to be read by utf8_heavy.pl
149 # in the perl core. All tables could in theory be written, but some are
150 # suppressed because there is no current practical use for them. It is easy
151 # to change which get written by changing various lists that are near the top
152 # of the actual code in this file. The table data structures contain enough
153 # ancillary information to allow them to be treated as separate entities for
154 # writing, such as the path to each one's file. There is a heading in each
155 # map table that gives the format of its entries, and what the map is for all
156 # the code points missing from it. (This allows tables to be more compact.)
158 # The Property data structure contains one or more tables. All properties
159 # contain a map table (except the $perl property which is a
160 # pseudo-property containing only match tables), and any properties that
161 # are usable in regular expression matches also contain various matching
162 # tables, one for each value the property can have. A binary property can
163 # have two values, True and False (or Y and N, which are preferred by Unicode
164 # terminology). Thus each of these properties will have a map table that
165 # takes every code point and maps it to Y or N (but having ranges cuts the
166 # number of entries in that table way down), and two match tables, one
167 # which has a list of all the code points that map to Y, and one for all the
168 # code points that map to N. (For each binary property, a third table is also
169 # generated for the pseudo Perl property. It contains the identical code
170 # points as the Y table, but can be written in regular expressions, not in the
171 # compound form, but in a "single" form like \p{IsUppercase}.) Many
172 # properties are binary, but some properties have several possible values,
173 # some have many, and properties like Name have a different value for every
174 # named code point. Those will not, unless the controlling lists are changed,
175 # have their match tables written out. But all the ones which can be used in
176 # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
177 # a property would have either its map table or its match tables written but
178 # not both. Again, what gets written is controlled by lists which can easily
179 # be changed. Starting in 5.14, advantage was taken of this, and all the map
180 # tables needed to reconstruct the Unicode db are now written out, while
181 # suppressing the Unicode .txt files that contain the data. Our tables are
182 # much more compact than the .txt files, so a significant space savings was
183 # achieved. Also, tables are not written out that are trivially derivable
184 # from tables that do get written. So, there typically is no file containing
185 # the code points not matched by a binary property (the table for \P{} versus
186 # lowercase \p{}), since you just need to invert the True table to get the
189 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
190 # how many match tables there are and the content of the maps. This 'Type' is
191 # different than a range 'Type', so don't get confused by the two concepts
192 # having the same name.
194 # For information about the Unicode properties, see Unicode's UAX44 document:
196 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
198 # As stated earlier, this program will work on any release of Unicode so far.
199 # Most obvious problems in earlier data have NOT been corrected except when
200 # necessary to make Perl or this program work reasonably, and to keep out
201 # potential security issues. For example, no folding information was given in
202 # early releases, so this program substitutes lower case instead, just so that
203 # a regular expression with the /i option will do something that actually
204 # gives the right results in many cases. There are also a couple other
205 # corrections for version 1.1.5, commented at the point they are made. As an
206 # example of corrections that weren't made (but could be) is this statement
207 # from DerivedAge.txt: "The supplementary private use code points and the
208 # non-character code points were assigned in version 2.0, but not specifically
209 # listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
210 # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
211 # further down in these introductory comments.
213 # This program works on all non-provisional properties as of the current
214 # Unicode release, though the files for some are suppressed for various
215 # reasons. You can change which are output by changing lists in this program.
217 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
218 # loose matchings rules (from Unicode TR18):
220 # The recommended names for UCD properties and property values are in
221 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
222 # [PropValue]. There are both abbreviated names and longer, more
223 # descriptive names. It is strongly recommended that both names be
224 # recognized, and that loose matching of property names be used,
225 # whereby the case distinctions, whitespace, hyphens, and underbar
228 # The program still allows Fuzzy to override its determination of if loose
229 # matching should be used, but it isn't currently used, as it is no longer
230 # needed; the calculations it makes are good enough.
232 # SUMMARY OF HOW IT WORKS:
236 # A list is constructed containing each input file that is to be processed
238 # Each file on the list is processed in a loop, using the associated handler
240 # The PropertyAliases.txt and PropValueAliases.txt files are processed
241 # first. These files name the properties and property values.
242 # Objects are created of all the property and property value names
243 # that the rest of the input should expect, including all synonyms.
244 # The other input files give mappings from properties to property
245 # values. That is, they list code points and say what the mapping
246 # is under the given property. Some files give the mappings for
247 # just one property; and some for many. This program goes through
248 # each file and populates the properties and their map tables from
249 # them. Some properties are listed in more than one file, and
250 # Unicode has set up a precedence as to which has priority if there
251 # is a conflict. Thus the order of processing matters, and this
252 # program handles the conflict possibility by processing the
253 # overriding input files last, so that if necessary they replace
255 # After this is all done, the program creates the property mappings not
256 # furnished by Unicode, but derivable from what it does give.
257 # The tables of code points that match each property value in each
258 # property that is accessible by regular expressions are created.
259 # The Perl-defined properties are created and populated. Many of these
260 # require data determined from the earlier steps
261 # Any Perl-defined synonyms are created, and name clashes between Perl
262 # and Unicode are reconciled and warned about.
263 # All the properties are written to files
264 # Any other files are written, and final warnings issued.
266 # For clarity, a number of operators have been overloaded to work on tables:
267 # ~ means invert (take all characters not in the set). The more
268 # conventional '!' is not used because of the possibility of confusing
269 # it with the actual boolean operation.
271 # - means subtraction
272 # & means intersection
273 # The precedence of these is the order listed. Parentheses should be
274 # copiously used. These are not a general scheme. The operations aren't
275 # defined for a number of things, deliberately, to avoid getting into trouble.
276 # Operations are done on references and affect the underlying structures, so
277 # that the copy constructors for them have been overloaded to not return a new
278 # clone, but the input object itself.
280 # The bool operator is deliberately not overloaded to avoid confusion with
281 # "should it mean if the object merely exists, or also is non-empty?".
283 # WHY CERTAIN DESIGN DECISIONS WERE MADE
285 # This program needs to be able to run under miniperl. Therefore, it uses a
286 # minimum of other modules, and hence implements some things itself that could
287 # be gotten from CPAN
289 # This program uses inputs published by the Unicode Consortium. These can
290 # change incompatibly between releases without the Perl maintainers realizing
291 # it. Therefore this program is now designed to try to flag these. It looks
292 # at the directories where the inputs are, and flags any unrecognized files.
293 # It keeps track of all the properties in the files it handles, and flags any
294 # that it doesn't know how to handle. It also flags any input lines that
295 # don't match the expected syntax, among other checks.
297 # It is also designed so if a new input file matches one of the known
298 # templates, one hopefully just needs to add it to a list to have it
301 # As mentioned earlier, some properties are given in more than one file. In
302 # particular, the files in the extracted directory are supposedly just
303 # reformattings of the others. But they contain information not easily
304 # derivable from the other files, including results for Unihan (which isn't
305 # usually available to this program) and for unassigned code points. They
306 # also have historically had errors or been incomplete. In an attempt to
307 # create the best possible data, this program thus processes them first to
308 # glean information missing from the other files; then processes those other
309 # files to override any errors in the extracted ones. Much of the design was
310 # driven by this need to store things and then possibly override them.
312 # It tries to keep fatal errors to a minimum, to generate something usable for
313 # testing purposes. It always looks for files that could be inputs, and will
314 # warn about any that it doesn't know how to handle (the -q option suppresses
317 # Why is there more than one type of range?
318 # This simplified things. There are some very specialized code points that
319 # have to be handled specially for output, such as Hangul syllable names.
320 # By creating a range type (done late in the development process), it
321 # allowed this to be stored with the range, and overridden by other input.
322 # Originally these were stored in another data structure, and it became a
323 # mess trying to decide if a second file that was for the same property was
324 # overriding the earlier one or not.
326 # Why are there two kinds of tables, match and map?
327 # (And there is a base class shared by the two as well.) As stated above,
328 # they actually are for different things. Development proceeded much more
329 # smoothly when I (khw) realized the distinction. Map tables are used to
330 # give the property value for every code point (actually every code point
331 # that doesn't map to a default value). Match tables are used for regular
332 # expression matches, and are essentially the inverse mapping. Separating
333 # the two allows more specialized methods, and error checks so that one
334 # can't just take the intersection of two map tables, for example, as that
337 # What about 'fate' and 'status'. The concept of a table's fate was created
338 # late when it became clear that something more was needed. The difference
339 # between this and 'status' is unclean, and could be improved if someone
340 # wanted to spend the effort.
344 # This program is written so it will run under miniperl. Occasionally changes
345 # will cause an error where the backtrace doesn't work well under miniperl.
346 # To diagnose the problem, you can instead run it under regular perl, if you
349 # There is a good trace facility. To enable it, first sub DEBUG must be set
350 # to return true. Then a line like
352 # local $to_trace = 1 if main::DEBUG;
354 # can be added to enable tracing in its lexical scope (plus dynamic) or until
355 # you insert another line:
357 # local $to_trace = 0 if main::DEBUG;
359 # To actually trace, use a line like "trace $a, @b, %c, ...;
361 # Some of the more complex subroutines already have trace statements in them.
362 # Permanent trace statements should be like:
364 # trace ... if main::DEBUG && $to_trace;
366 # main::stack_trace() will display what its name implies
368 # If there is just one or a few files that you're debugging, you can easily
369 # cause most everything else to be skipped. Change the line
371 # my $debug_skip = 0;
373 # to 1, and every file whose object is in @input_file_objects and doesn't have
374 # a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
375 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
377 # To compare the output tables, it may be useful to specify the -annotate
378 # flag. (As of this writing, this can't be done on a clean workspace, due to
379 # requirements in Text::Tabs used in this option; so first run mktables
380 # without this option.) This option adds comment lines to each table, one for
381 # each non-algorithmically named character giving, currently its code point,
382 # name, and graphic representation if printable (and you have a font that
383 # knows about it). This makes it easier to see what the particular code
384 # points are in each output table. Non-named code points are annotated with a
385 # description of their status, and contiguous ones with the same description
386 # will be output as a range rather than individually. Algorithmically named
387 # characters are also output as ranges, except when there are just a few
392 # The program would break if Unicode were to change its names so that
393 # interior white space, underscores, or dashes differences were significant
394 # within property and property value names.
396 # It might be easier to use the xml versions of the UCD if this program ever
397 # would need heavy revision, and the ability to handle old versions was not
400 # There is the potential for name collisions, in that Perl has chosen names
401 # that Unicode could decide it also likes. There have been such collisions in
402 # the past, with mostly Perl deciding to adopt the Unicode definition of the
403 # name. However in the 5.2 Unicode beta testing, there were a number of such
404 # collisions, which were withdrawn before the final release, because of Perl's
405 # and other's protests. These all involved new properties which began with
406 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
407 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
408 # Unicode document, so they are unlikely to be used by Unicode for another
409 # purpose. However, they might try something beginning with 'In', or use any
410 # of the other Perl-defined properties. This program will warn you of name
411 # collisions, and refuse to generate tables with them, but manual intervention
412 # will be required in this event. One scheme that could be implemented, if
413 # necessary, would be to have this program generate another file, or add a
414 # field to mktables.lst that gives the date of first definition of a property.
415 # Each new release of Unicode would use that file as a basis for the next
416 # iteration. And the Perl synonym addition code could sort based on the age
417 # of the property, so older properties get priority, and newer ones that clash
418 # would be refused; hence existing code would not be impacted, and some other
419 # synonym would have to be used for the new property. This is ugly, and
420 # manual intervention would certainly be easier to do in the short run; lets
421 # hope it never comes to this.
425 # This program can generate tables from the Unihan database. But that DB
426 # isn't normally available, so it is marked as optional. Prior to version
427 # 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
428 # was split into 8 different files, all beginning with the letters 'Unihan'.
429 # If you plunk those files down into the directory mktables ($0) is in, this
430 # program will read them and automatically create tables for the properties
431 # from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
432 # plus any you add to the @cjk_properties array and the @cjk_property_values
433 # array, being sure to add necessary '# @missings' lines to the latter. For
434 # Unicode versions earlier than 5.2, most of the Unihan properties are not
435 # listed at all in PropertyAliases nor PropValueAliases. This program assumes
436 # for these early releases that you want the properties that are specified in
439 # You may need to adjust the entries to suit your purposes. setup_unihan(),
440 # and filter_unihan_line() are the functions where this is done. This program
441 # already does some adjusting to make the lines look more like the rest of the
442 # Unicode DB; You can see what that is in filter_unihan_line()
444 # There is a bug in the 3.2 data file in which some values for the
445 # kPrimaryNumeric property have commas and an unexpected comment. A filter
446 # could be added to correct these; or for a particular installation, the
447 # Unihan.txt file could be edited to fix them.
449 # HOW TO ADD A FILE TO BE PROCESSED
451 # A new file from Unicode needs to have an object constructed for it in
452 # @input_file_objects, probably at the end or at the end of the extracted
453 # ones. The program should warn you if its name will clash with others on
454 # restrictive file systems, like DOS. If so, figure out a better name, and
455 # add lines to the README.perl file giving that. If the file is a character
456 # property, it should be in the format that Unicode has implicitly
457 # standardized for such files for the more recently introduced ones.
458 # If so, the Input_file constructor for @input_file_objects can just be the
459 # file name and release it first appeared in. If not, then it should be
460 # possible to construct an each_line_handler() to massage the line into the
463 # For non-character properties, more code will be needed. You can look at
464 # the existing entries for clues.
466 # UNICODE VERSIONS NOTES
468 # The Unicode UCD has had a number of errors in it over the versions. And
469 # these remain, by policy, in the standard for that version. Therefore it is
470 # risky to correct them, because code may be expecting the error. So this
471 # program doesn't generally make changes, unless the error breaks the Perl
472 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
473 # for U+1105, which causes real problems for the algorithms for Jamo
474 # calculations, so it is changed here.
476 # But it isn't so clear cut as to what to do about concepts that are
477 # introduced in a later release; should they extend back to earlier releases
478 # where the concept just didn't exist? It was easier to do this than to not,
479 # so that's what was done. For example, the default value for code points not
480 # in the files for various properties was probably undefined until changed by
481 # some version. No_Block for blocks is such an example. This program will
482 # assign No_Block even in Unicode versions that didn't have it. This has the
483 # benefit that code being written doesn't have to special case earlier
484 # versions; and the detriment that it doesn't match the Standard precisely for
485 # the affected versions.
487 # Here are some observations about some of the issues in early versions:
489 # Prior to version 3.0, there were 3 character decompositions. These are not
490 # handled by Unicode::Normalize, nor will it compile when presented a version
491 # that has them. However, you can trivially get it to compile by simply
492 # ignoring those decompositions, by changing the croak to a carp. At the time
493 # of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
494 # dist/Unicode-Normalize/mkheader) reads
496 # croak("Weird Canonical Decomposition of U+$h");
498 # Simply comment it out. It will compile, but will not know about any three
499 # character decompositions.
501 # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
502 # that the reason is that the CJK block starting at 4E00 was removed from
503 # PropList, and was not put back in until 3.1.0. The Perl extension (the
504 # single property name \p{alpha}) has the correct values. But the compound
505 # form is simply not generated until 3.1, as it can be argued that prior to
506 # this release, this was not an official property. The comments for
507 # filter_old_style_proplist() give more details.
509 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
510 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
511 # reason is that 3.2 introduced U+205F=medium math space, which was not
512 # classed as white space, but Perl figured out that it should have been. 4.0
513 # reclassified it correctly.
515 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
516 # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
517 # became 202, and ATBL was left with no code points, as all the ones that
518 # mapped to 202 stayed mapped to 202. Thus if your program used the numeric
519 # name for the class, it would not have been affected, but if it used the
520 # mnemonic, it would have been.
522 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
523 # points which eventually came to have this script property value, instead
524 # mapped to "Unknown". But in the next release all these code points were
525 # moved to \p{sc=common} instead.
527 # The tests furnished by Unicode for testing WordBreak and SentenceBreak
528 # generate errors in 5.0 and earlier.
530 # The default for missing code points for BidiClass is complicated. Starting
531 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
532 # tries to do the best it can for earlier releases. It is done in
533 # process_PropertyAliases()
535 # In version 2.1.2, the entry in UnicodeData.txt:
536 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
538 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
539 # Without this change, there are casing problems for this character.
541 # Search for $string_compare_versions to see how to compare changes to
542 # properties between Unicode versions
544 ##############################################################################
546 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
548 my $MAX_LINE_WIDTH = 78;
550 # Debugging aid to skip most files so as to not be distracted by them when
551 # concentrating on the ones being debugged. Add
553 # to the constructor for those files you want processed when you set this.
554 # Files with a first version number of 0 are special: they are always
555 # processed regardless of the state of this flag. Generally, Jamo.txt and
556 # UnicodeData.txt must not be skipped if you want this program to not die
557 # before normal completion.
561 # Normally these are suppressed.
562 my $write_Unicode_deprecated_tables = 0;
564 # Set to 1 to enable tracing.
567 { # Closure for trace: debugging aid
568 my $print_caller = 1; # ? Include calling subroutine name
569 my $main_with_colon = 'main::';
570 my $main_colon_length = length($main_with_colon);
573 return unless $to_trace; # Do nothing if global flag not set
577 local $DB::trace = 0;
578 $DB::trace = 0; # Quiet 'used only once' message
582 # Loop looking up the stack to get the first non-trace caller
587 $line_number = $caller_line;
588 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
589 $caller = $main_with_colon unless defined $caller;
591 $caller_name = $caller;
594 $caller_name =~ s/.*:://;
595 if (substr($caller_name, 0, $main_colon_length)
598 $caller_name = substr($caller_name, $main_colon_length);
601 } until ($caller_name ne 'trace');
603 # If the stack was empty, we were called from the top level
604 $caller_name = 'main' if ($caller_name eq ""
605 || $caller_name eq 'trace');
608 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
609 foreach my $string (@input) {
610 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
611 $output .= simple_dumper($string);
614 $string = "$string" if ref $string;
615 $string = $UNDEF unless defined $string;
617 $string = '""' if $string eq "";
618 $output .= " " if $output ne ""
620 && substr($output, -1, 1) ne " "
621 && substr($string, 0, 1) ne " ";
626 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
627 print STDERR "$caller_name: " if $print_caller;
628 print STDERR $output, "\n";
634 local $to_trace = 1 if main::DEBUG;
635 my $line = (caller(0))[2];
638 # Accumulate the stack trace
640 my ($pkg, $file, $caller_line, $caller) = caller $i++;
642 last unless defined $caller;
644 trace "called from $caller() at line $line";
645 $line = $caller_line;
649 # This is for a rarely used development feature that allows you to compare two
650 # versions of the Unicode standard without having to deal with changes caused
651 # by the code points introduced in the later version. You probably also want
652 # to use the -annotate option when using this. Run this program on a unicore
653 # containing the starting release you want to compare. Save that output
654 # structure. Then, switching to a unicore with the ending release, change the
655 # 0 in the $string_compare_versions definition just below to a string
656 # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
657 # to the starting release. This program will then compile, but throw away all
658 # code points introduced after the starting release. Finally use a diff tool
659 # to compare the two directory structures. They include only the code points
660 # common to both releases, and you can see the changes caused just by the
661 # underlying release semantic changes. For versions earlier than 3.2, you
662 # must copy a version of DAge.txt into the directory.
663 my $string_compare_versions = DEBUG && ""; # e.g., "2.1";
664 my $compare_versions = DEBUG
665 && $string_compare_versions
666 && pack "C*", split /\./, $string_compare_versions;
669 # Returns non-duplicated input values. From "Perl Best Practices:
670 # Encapsulated Cleverness". p. 455 in first edition.
673 # Arguably this breaks encapsulation, if the goal is to permit multiple
674 # distinct objects to stringify to the same value, and be interchangeable.
675 # However, for this program, no two objects stringify identically, and all
676 # lists passed to this function are either objects or strings. So this
677 # doesn't affect correctness, but it does give a couple of percent speedup.
679 return grep { ! $seen{$_}++ } @_;
682 $0 = File::Spec->canonpath($0);
684 my $make_test_script = 0; # ? Should we output a test script
685 my $make_norm_test_script = 0; # ? Should we output a normalization test script
686 my $write_unchanged_files = 0; # ? Should we update the output files even if
687 # we don't think they have changed
688 my $use_directory = ""; # ? Should we chdir somewhere.
689 my $pod_directory; # input directory to store the pod file.
690 my $pod_file = 'perluniprops';
691 my $t_path; # Path to the .t test file
692 my $file_list = 'mktables.lst'; # File to store input and output file names.
693 # This is used to speed up the build, by not
694 # executing the main body of the program if
695 # nothing on the list has changed since the
697 my $make_list = 1; # ? Should we write $file_list. Set to always
698 # make a list so that when the pumpking is
699 # preparing a release, s/he won't have to do
701 my $glob_list = 0; # ? Should we try to include unknown .txt files
703 my $output_range_counts = $debugging_build; # ? Should we include the number
704 # of code points in ranges in
706 my $annotate = 0; # ? Should character names be in the output
708 # Verbosity levels; 0 is quiet
709 my $NORMAL_VERBOSITY = 1;
713 my $verbosity = $NORMAL_VERBOSITY;
715 # Stored in mktables.lst so that if this program is called with different
716 # options, will regenerate even if the files otherwise look like they're
718 my $command_line_arguments = join " ", @ARGV;
722 my $arg = shift @ARGV;
724 $verbosity = $VERBOSE;
726 elsif ($arg eq '-p') {
727 $verbosity = $PROGRESS;
728 $| = 1; # Flush buffers as we go.
730 elsif ($arg eq '-q') {
733 elsif ($arg eq '-w') {
734 # update the files even if they haven't changed
735 $write_unchanged_files = 1;
737 elsif ($arg eq '-check') {
738 my $this = shift @ARGV;
739 my $ok = shift @ARGV;
741 print "Skipping as check params are not the same.\n";
745 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
746 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
748 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
750 $make_test_script = 1;
752 elsif ($arg eq '-makenormtest')
754 $make_norm_test_script = 1;
756 elsif ($arg eq '-makelist') {
759 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
760 -d $use_directory or croak "Unknown directory '$use_directory'";
762 elsif ($arg eq '-L') {
764 # Existence not tested until have chdir'd
767 elsif ($arg eq '-globlist') {
770 elsif ($arg eq '-c') {
771 $output_range_counts = ! $output_range_counts
773 elsif ($arg eq '-annotate') {
775 $debugging_build = 1;
776 $output_range_counts = 1;
780 $with_c .= 'out' if $output_range_counts; # Complements the state
782 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
783 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
785 -c : Output comments $with_c number of code points in ranges
786 -q : Quiet Mode: Only output serious warnings.
787 -p : Set verbosity level to normal plus show progress.
788 -v : Set Verbosity level high: Show progress and non-serious
790 -w : Write files regardless
791 -C dir : Change to this directory before proceeding. All relative paths
792 except those specified by the -P and -T options will be done
793 with respect to this directory.
794 -P dir : Output $pod_file file to directory 'dir'.
795 -T path : Create a test script as 'path'; overrides -maketest
796 -L filelist : Use alternate 'filelist' instead of standard one
797 -globlist : Take as input all non-Test *.txt files in current and sub
799 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
801 -makelist : Rewrite the file list $file_list based on current setup
802 -annotate : Output an annotation for each character in the table files;
803 useful for debugging mktables, looking at diffs; but is slow
805 -check A B : Executes $0 only if A and B are the same
810 # Stores the most-recently changed file. If none have changed, can skip the
812 my $most_recent = (stat $0)[9]; # Do this before the chdir!
814 # Change directories now, because need to read 'version' early.
815 if ($use_directory) {
816 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
817 $pod_directory = File::Spec->rel2abs($pod_directory);
819 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
820 $t_path = File::Spec->rel2abs($t_path);
822 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
823 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
824 $pod_directory = File::Spec->abs2rel($pod_directory);
826 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
827 $t_path = File::Spec->abs2rel($t_path);
831 # Get Unicode version into regular and v-string. This is done now because
832 # various tables below get populated based on it. These tables are populated
833 # here to be near the top of the file, and so easily seeable by those needing
835 open my $VERSION, "<", "version"
836 or croak "$0: can't open required file 'version': $!\n";
837 my $string_version = <$VERSION>;
839 chomp $string_version;
840 my $v_version = pack "C*", split /\./, $string_version; # v string
842 my $unicode_version = ($compare_versions)
843 ? ( "$string_compare_versions (using "
844 . "$string_version rules)")
847 # The following are the complete names of properties with property values that
848 # are known to not match any code points in some versions of Unicode, but that
849 # may change in the future so they should be matchable, hence an empty file is
850 # generated for them.
851 my @tables_that_may_be_empty;
852 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
853 if $v_version lt v6.3.0;
854 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
855 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
856 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
857 if $v_version ge v4.1.0;
858 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
859 if $v_version ge v6.0.0;
860 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
861 if $v_version ge v6.1.0;
862 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
863 if $v_version ge v6.2.0;
865 # The lists below are hashes, so the key is the item in the list, and the
866 # value is the reason why it is in the list. This makes generation of
867 # documentation easier.
869 my %why_suppressed; # No file generated for these.
871 # Files aren't generated for empty extraneous properties. This is arguable.
872 # Extraneous properties generally come about because a property is no longer
873 # used in a newer version of Unicode. If we generated a file without code
874 # points, programs that used to work on that property will still execute
875 # without errors. It just won't ever match (or will always match, with \P{}).
876 # This means that the logic is now likely wrong. I (khw) think its better to
877 # find this out by getting an error message. Just move them to the table
878 # above to change this behavior
879 my %why_suppress_if_empty_warn_if_not = (
881 # It is the only property that has ever officially been removed from the
882 # Standard. The database never contained any code points for it.
883 'Special_Case_Condition' => 'Obsolete',
885 # Apparently never official, but there were code points in some versions of
886 # old-style PropList.txt
887 'Non_Break' => 'Obsolete',
890 # These would normally go in the warn table just above, but they were changed
891 # a long time before this program was written, so warnings about them are
893 if ($v_version gt v3.2.0) {
894 push @tables_that_may_be_empty,
895 'Canonical_Combining_Class=Attached_Below_Left'
898 # Enum values for to_output_map() method in the Map_Table package. (0 is don't
900 my $EXTERNAL_MAP = 1;
901 my $INTERNAL_MAP = 2;
902 my $OUTPUT_ADJUSTED = 3;
904 # To override computed values for writing the map tables for these properties.
905 # The default for enum map tables is to write them out, so that the Unicode
906 # .txt files can be removed, but all the data to compute any property value
907 # for any code point is available in a more compact form.
908 my %global_to_output_map = (
909 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
910 # get stuck supporting it if things change. Since it is a STRING
911 # property, it normally would be listed in the pod, but INTERNAL_MAP
913 Unicode_1_Name => $INTERNAL_MAP,
915 Present_In => 0, # Suppress, as easily computed from Age
916 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is
917 # retained, but needed for
920 # Suppress, as mapping can be found instead from the
921 # Perl_Decomposition_Mapping file
922 Decomposition_Type => 0,
925 # There are several types of obsolete properties defined by Unicode. These
926 # must be hand-edited for every new Unicode release.
927 my %why_deprecated; # Generates a deprecated warning message if used.
928 my %why_stabilized; # Documentation only
929 my %why_obsolete; # Documentation only
932 my $simple = 'Perl uses the more complete version';
933 my $unihan = 'Unihan properties are by default not enabled in the Perl core. Instead use CPAN: Unicode::Unihan';
935 my $other_properties = 'other properties';
936 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
937 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
940 'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
941 'Jamo_Short_Name' => $contributory,
942 'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
943 'Other_Alphabetic' => $contributory,
944 'Other_Default_Ignorable_Code_Point' => $contributory,
945 'Other_Grapheme_Extend' => $contributory,
946 'Other_ID_Continue' => $contributory,
947 'Other_ID_Start' => $contributory,
948 'Other_Lowercase' => $contributory,
949 'Other_Math' => $contributory,
950 'Other_Uppercase' => $contributory,
951 'Expands_On_NFC' => $why_no_expand,
952 'Expands_On_NFD' => $why_no_expand,
953 'Expands_On_NFKC' => $why_no_expand,
954 'Expands_On_NFKD' => $why_no_expand,
958 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
959 # contains the same information, but without the algorithmically
960 # determinable Hangul syllables'. This file is not published, so it's
961 # existence is not noted in the comment.
962 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
964 # Don't suppress ISO_Comment, as otherwise special handling is needed
965 # to differentiate between it and gc=c, which can be written as 'isc',
966 # which is the same characters as ISO_Comment's short name.
968 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
970 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
971 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
972 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
973 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
975 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
978 foreach my $property (
980 # The following are suppressed because they were made contributory
981 # or deprecated by Unicode before Perl ever thought about
990 # The following are suppressed because they have been marked
991 # as deprecated for a sufficient amount of time
993 'Other_Default_Ignorable_Code_Point',
994 'Other_Grapheme_Extend',
1001 $why_suppressed{$property} = $why_deprecated{$property};
1004 # Customize the message for all the 'Other_' properties
1005 foreach my $property (keys %why_deprecated) {
1006 next if (my $main_property = $property) !~ s/^Other_//;
1007 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1011 if ($write_Unicode_deprecated_tables) {
1012 foreach my $property (keys %why_suppressed) {
1013 delete $why_suppressed{$property} if $property =~
1014 / ^ Other | Grapheme /x;
1018 if ($v_version ge 4.0.0) {
1019 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1020 if ($v_version ge 6.0.0) {
1021 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1024 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1025 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1026 if ($v_version ge 6.0.0) {
1027 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1031 # Probably obsolete forever
1032 if ($v_version ge v4.1.0) {
1033 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1035 if ($v_version ge v6.0.0) {
1036 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1037 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1040 # This program can create files for enumerated-like properties, such as
1041 # 'Numeric_Type'. This file would be the same format as for a string
1042 # property, with a mapping from code point to its value, so you could look up,
1043 # for example, the script a code point is in. But no one so far wants this
1044 # mapping, or they have found another way to get it since this is a new
1045 # feature. So no file is generated except if it is in this list.
1046 my @output_mapped_properties = split "\n", <<END;
1049 # If you want more Unihan properties than the default, you need to add them to
1050 # these arrays. Depending on the property type, @missing lines might have to
1051 # be added to the second array. A sample entry would be (including the '#'):
1052 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1053 my @cjk_properties = split "\n", <<'END';
1055 my @cjk_property_values = split "\n", <<'END';
1058 # The input files don't list every code point. Those not listed are to be
1059 # defaulted to some value. Below are hard-coded what those values are for
1060 # non-binary properties as of 5.1. Starting in 5.0, there are
1061 # machine-parsable comment lines in the files that give the defaults; so this
1062 # list shouldn't have to be extended. The claim is that all missing entries
1063 # for binary properties will default to 'N'. Unicode tried to change that in
1064 # 5.2, but the beta period produced enough protest that they backed off.
1066 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1067 # be in the form that it expects. The others may be synonyms.
1068 my $CODE_POINT = '<code point>';
1069 my %default_mapping = (
1070 Age => "Unassigned",
1071 # Bidi_Class => Complicated; set in code
1072 Bidi_Mirroring_Glyph => "",
1073 Block => 'No_Block',
1074 Canonical_Combining_Class => 0,
1075 Case_Folding => $CODE_POINT,
1076 Decomposition_Mapping => $CODE_POINT,
1077 Decomposition_Type => 'None',
1078 East_Asian_Width => "Neutral",
1079 FC_NFKC_Closure => $CODE_POINT,
1080 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1081 Grapheme_Cluster_Break => 'Other',
1082 Hangul_Syllable_Type => 'NA',
1084 Jamo_Short_Name => "",
1085 Joining_Group => "No_Joining_Group",
1086 # Joining_Type => Complicated; set in code
1087 kIICore => 'N', # Is converted to binary
1088 #Line_Break => Complicated; set in code
1089 Lowercase_Mapping => $CODE_POINT,
1096 Numeric_Type => 'None',
1097 Numeric_Value => 'NaN',
1098 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1099 Sentence_Break => 'Other',
1100 Simple_Case_Folding => $CODE_POINT,
1101 Simple_Lowercase_Mapping => $CODE_POINT,
1102 Simple_Titlecase_Mapping => $CODE_POINT,
1103 Simple_Uppercase_Mapping => $CODE_POINT,
1104 Titlecase_Mapping => $CODE_POINT,
1105 Unicode_1_Name => "",
1106 Unicode_Radical_Stroke => "",
1107 Uppercase_Mapping => $CODE_POINT,
1108 Word_Break => 'Other',
1111 ### End of externally interesting definitions, except for @input_file_objects
1114 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1115 # This file is machine-generated by $0 from the Unicode
1116 # database, Version $unicode_version. Any changes made here will be lost!
1119 my $INTERNAL_ONLY_HEADER = <<"EOF";
1121 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1122 # This file is for internal use by core Perl only. The format and even the
1123 # name or existence of this file are subject to change without notice. Don't
1124 # use it directly. Use Unicode::UCD to access the Unicode character data
1128 my $DEVELOPMENT_ONLY=<<"EOF";
1129 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1130 # This file contains information artificially constrained to code points
1131 # present in Unicode release $string_compare_versions.
1132 # IT CANNOT BE RELIED ON. It is for use during development only and should
1133 # not be used for production.
1137 my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1140 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1141 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1143 # We work with above-Unicode code points, up to IV_MAX, but we may want to use
1144 # sentinels above that number. Therefore for internal use, we use a much
1145 # smaller number, translating it to IV_MAX only for output. The exact number
1146 # is immaterial (all above-Unicode code points are treated exactly the same),
1147 # but the algorithm requires it to be at least
1148 # 2 * $MAX_UNICODE_CODEPOINTS + 1
1149 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1150 my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1151 my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1153 my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1155 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1156 # two must be 10; if there are 5, the first must not be a 0. Written this way
1157 # to decrease backtracking. The first regex allows the code point to be at
1158 # the end of a word, but to work properly, the word shouldn't end with a valid
1159 # hex character. The second one won't match a code point at the end of a
1160 # word, and doesn't have the run-on issue
1161 my $run_on_code_point_re =
1162 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1163 my $code_point_re = qr/\b$run_on_code_point_re/;
1165 # This matches the beginning of the line in the Unicode DB files that give the
1166 # defaults for code points not listed (i.e., missing) in the file. The code
1167 # depends on this ending with a semi-colon, so it can assume it is a valid
1168 # field when the line is split() by semi-colons
1169 my $missing_defaults_prefix = qr/^#\s+\@missing:\s+0000\.\.10FFFF\s*;/;
1171 # Property types. Unicode has more types, but these are sufficient for our
1173 my $UNKNOWN = -1; # initialized to illegal value
1174 my $NON_STRING = 1; # Either binary or enum
1176 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1177 # tables, additional true and false tables are
1178 # generated so that false is anything matching the
1179 # default value, and true is everything else.
1180 my $ENUM = 4; # Include catalog
1181 my $STRING = 5; # Anything else: string or misc
1183 # Some input files have lines that give default values for code points not
1184 # contained in the file. Sometimes these should be ignored.
1185 my $NO_DEFAULTS = 0; # Must evaluate to false
1186 my $NOT_IGNORED = 1;
1189 # Range types. Each range has a type. Most ranges are type 0, for normal,
1190 # and will appear in the main body of the tables in the output files, but
1191 # there are other types of ranges as well, listed below, that are specially
1192 # handled. There are pseudo-types as well that will never be stored as a
1193 # type, but will affect the calculation of the type.
1195 # 0 is for normal, non-specials
1196 my $MULTI_CP = 1; # Sequence of more than code point
1197 my $HANGUL_SYLLABLE = 2;
1198 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1199 my $NULL = 4; # The map is to the null string; utf8.c can't
1200 # handle these, nor is there an accepted syntax
1201 # for them in \p{} constructs
1202 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1203 # otherwise be $MULTI_CP type are instead type 0
1205 # process_generic_property_file() can accept certain overrides in its input.
1206 # Each of these must begin AND end with $CMD_DELIM.
1207 my $CMD_DELIM = "\a";
1208 my $REPLACE_CMD = 'replace'; # Override the Replace
1209 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1214 # Values for the Replace argument to add_range.
1215 # $NO # Don't replace; add only the code points not
1217 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1218 # the comments at the subroutine definition.
1219 my $UNCONDITIONALLY = 2; # Replace without conditions.
1220 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1222 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1224 my $CROAK = 6; # Die with an error if is already there
1226 # Flags to give property statuses. The phrases are to remind maintainers that
1227 # if the flag is changed, the indefinite article referring to it in the
1228 # documentation may need to be as well.
1230 my $DEPRECATED = 'D';
1231 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1232 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1233 my $DISCOURAGED = 'X';
1234 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1235 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1237 my $a_bold_stricter = "a 'B<$STRICTER>'";
1238 my $A_bold_stricter = "A 'B<$STRICTER>'";
1239 my $STABILIZED = 'S';
1240 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1241 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1243 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1244 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1246 # Aliases can also have an extra status:
1247 my $INTERNAL_ALIAS = 'P';
1249 my %status_past_participles = (
1250 $DISCOURAGED => 'discouraged',
1251 $STABILIZED => 'stabilized',
1252 $OBSOLETE => 'obsolete',
1253 $DEPRECATED => 'deprecated',
1254 $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1257 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1258 # externally documented.
1259 my $ORDINARY = 0; # The normal fate.
1260 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1261 # but there is a file written that can be used to
1262 # reconstruct this table
1263 my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
1264 # for Perl's internal use only
1265 my $LEGACY_ONLY = 3; # Like $INTERNAL_ONLY, but not actually used by Perl.
1266 # Is for backwards compatibility for applications that
1267 # read the file directly, so it's format is
1269 my $SUPPRESSED = 4; # The file for this table is not written out, and as a
1270 # result, we don't bother to do many computations on
1272 my $PLACEHOLDER = 5; # Like $SUPPRESSED, but we go through all the
1273 # computations anyway, as the values are needed for
1274 # things to work. This happens when we have Perl
1275 # extensions that depend on Unicode tables that
1276 # wouldn't normally be in a given Unicode version.
1278 # The format of the values of the tables:
1279 my $EMPTY_FORMAT = "";
1280 my $BINARY_FORMAT = 'b';
1281 my $DECIMAL_FORMAT = 'd';
1282 my $FLOAT_FORMAT = 'f';
1283 my $INTEGER_FORMAT = 'i';
1284 my $HEX_FORMAT = 'x';
1285 my $RATIONAL_FORMAT = 'r';
1286 my $STRING_FORMAT = 's';
1287 my $ADJUST_FORMAT = 'a';
1288 my $HEX_ADJUST_FORMAT = 'ax';
1289 my $DECOMP_STRING_FORMAT = 'c';
1290 my $STRING_WHITE_SPACE_LIST = 'sw';
1292 my %map_table_formats = (
1293 $BINARY_FORMAT => 'binary',
1294 $DECIMAL_FORMAT => 'single decimal digit',
1295 $FLOAT_FORMAT => 'floating point number',
1296 $INTEGER_FORMAT => 'integer',
1297 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1298 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1299 $STRING_FORMAT => 'string',
1300 $ADJUST_FORMAT => 'some entries need adjustment',
1301 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1302 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1303 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1306 # Unicode didn't put such derived files in a separate directory at first.
1307 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1308 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1309 my $AUXILIARY = 'auxiliary';
1311 # Hashes and arrays that will eventually go into Heavy.pl for the use of
1312 # utf8_heavy.pl and into UCD.pl for the use of UCD.pm
1313 my %loose_to_file_of; # loosely maps table names to their respective
1315 my %stricter_to_file_of; # same; but for stricter mapping.
1316 my %loose_property_to_file_of; # Maps a loose property name to its map file
1317 my %strict_property_to_file_of; # Same, but strict
1318 my @inline_definitions = "V0"; # Each element gives a definition of a unique
1319 # inversion list. When a definition is inlined,
1320 # its value in the hash it's in (one of the two
1321 # defined just above) will include an index into
1322 # this array. The 0th element is initialized to
1323 # the definition for a zero length inversion list
1324 my %file_to_swash_name; # Maps the file name to its corresponding key name
1325 # in the hash %utf8::SwashInfo
1326 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1327 # their rational equivalent
1328 my %loose_property_name_of; # Loosely maps (non_string) property names to
1330 my %strict_property_name_of; # Strictly maps (non_string) property names to
1332 my %string_property_loose_to_name; # Same, for string properties.
1333 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1334 # the property name in standard loose form, and
1335 # 'value' is the default value for that property,
1336 # also in standard loose form.
1337 my %loose_to_standard_value; # loosely maps table names to the canonical
1339 my %ambiguous_names; # keys are alias names (in standard form) that
1340 # have more than one possible meaning.
1341 my %combination_property; # keys are alias names (in standard form) that
1342 # have both a map table, and a binary one that
1343 # yields true for all non-null maps.
1344 my %prop_aliases; # Keys are standard property name; values are each
1346 my %prop_value_aliases; # Keys of top level are standard property name;
1347 # values are keys to another hash, Each one is
1348 # one of the property's values, in standard form.
1349 # The values are that prop-val's aliases.
1350 my %skipped_files; # List of files that we skip
1351 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1353 # Most properties are immune to caseless matching, otherwise you would get
1354 # nonsensical results, as properties are a function of a code point, not
1355 # everything that is caselessly equivalent to that code point. For example,
1356 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1357 # be true because 's' and 'S' are equivalent caselessly. However,
1358 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1359 # extend that concept to those very few properties that are like this. Each
1360 # such property will match the full range caselessly. They are hard-coded in
1361 # the program; it's not worth trying to make it general as it's extremely
1362 # unlikely that they will ever change.
1363 my %caseless_equivalent_to;
1365 # This is the range of characters that were in Release 1 of Unicode, and
1366 # removed in Release 2 (replaced with the current Hangul syllables starting at
1367 # U+AC00). The range was reused starting in Release 3 for other purposes.
1368 my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1369 my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1371 # These constants names and values were taken from the Unicode standard,
1372 # version 5.1, section 3.12. They are used in conjunction with Hangul
1373 # syllables. The '_string' versions are so generated tables can retain the
1374 # hex format, which is the more familiar value
1375 my $SBase_string = "0xAC00";
1376 my $SBase = CORE::hex $SBase_string;
1377 my $LBase_string = "0x1100";
1378 my $LBase = CORE::hex $LBase_string;
1379 my $VBase_string = "0x1161";
1380 my $VBase = CORE::hex $VBase_string;
1381 my $TBase_string = "0x11A7";
1382 my $TBase = CORE::hex $TBase_string;
1387 my $NCount = $VCount * $TCount;
1389 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1390 # with the above published constants.
1392 my %Jamo_L; # Leading consonants
1393 my %Jamo_V; # Vowels
1394 my %Jamo_T; # Trailing consonants
1396 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1397 # The key is the base name of the code point, and the value is an
1398 # array giving all the ranges that use this base name. Each range
1399 # is actually a hash giving the 'low' and 'high' values of it.
1400 my %names_ending_in_code_point;
1401 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1402 # removed from the names
1403 # Inverse mapping. The list of ranges that have these kinds of
1404 # names. Each element contains the low, high, and base names in an
1406 my @code_points_ending_in_code_point;
1408 # To hold Unicode's normalization test suite
1409 my @normalization_tests;
1411 # Boolean: does this Unicode version have the hangul syllables, and are we
1412 # writing out a table for them?
1413 my $has_hangul_syllables = 0;
1415 # Does this Unicode version have code points whose names end in their
1416 # respective code points, and are we writing out a table for them? 0 for no;
1417 # otherwise points to first property that a table is needed for them, so that
1418 # if multiple tables are needed, we don't create duplicates
1419 my $needing_code_points_ending_in_code_point = 0;
1421 my @backslash_X_tests; # List of tests read in for testing \X
1422 my @LB_tests; # List of tests read in for testing \b{lb}
1423 my @SB_tests; # List of tests read in for testing \b{sb}
1424 my @WB_tests; # List of tests read in for testing \b{wb}
1425 my @unhandled_properties; # Will contain a list of properties found in
1426 # the input that we didn't process.
1427 my @match_properties; # Properties that have match tables, to be
1429 my @map_properties; # Properties that get map files written
1430 my @named_sequences; # NamedSequences.txt contents.
1431 my %potential_files; # Generated list of all .txt files in the directory
1432 # structure so we can warn if something is being
1434 my @missing_early_files; # Generated list of absent files that we need to
1435 # proceed in compiling this early Unicode version
1436 my @files_actually_output; # List of files we generated.
1437 my @more_Names; # Some code point names are compound; this is used
1438 # to store the extra components of them.
1439 my $E_FLOAT_PRECISION = 2; # The minimum number of digits after the decimal
1440 # point of a normalized floating point number
1441 # needed to match before we consider it equivalent
1442 # to a candidate rational
1444 # These store references to certain commonly used property objects
1453 my $Assigned; # All assigned characters in this Unicode release
1454 my $DI; # Default_Ignorable_Code_Point property
1455 my $NChar; # Noncharacter_Code_Point property
1457 my $scx; # Script_Extensions property
1459 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1460 my $has_In_conflicts = 0;
1461 my $has_Is_conflicts = 0;
1463 sub internal_file_to_platform ($) {
1464 # Convert our file paths which have '/' separators to those of the
1468 return undef unless defined $file;
1470 return File::Spec->join(split '/', $file);
1473 sub file_exists ($) { # platform independent '-e'. This program internally
1474 # uses slash as a path separator.
1476 return 0 if ! defined $file;
1477 return -e internal_file_to_platform($file);
1481 # Returns the address of the blessed input object.
1482 # It doesn't check for blessedness because that would do a string eval
1483 # every call, and the program is structured so that this is never called
1484 # for a non-blessed object.
1486 no overloading; # If overloaded, numifying below won't work.
1488 # Numifying a ref gives its address.
1489 return pack 'J', $_[0];
1492 # These are used only if $annotate is true.
1493 # The entire range of Unicode characters is examined to populate these
1494 # after all the input has been processed. But most can be skipped, as they
1495 # have the same descriptive phrases, such as being unassigned
1496 my @viacode; # Contains the 1 million character names
1497 my @age; # And their ages ("" if none)
1498 my @printable; # boolean: And are those characters printable?
1499 my @annotate_char_type; # Contains a type of those characters, specifically
1500 # for the purposes of annotation.
1501 my $annotate_ranges; # A map of ranges of code points that have the same
1502 # name for the purposes of annotation. They map to the
1503 # upper edge of the range, so that the end point can
1504 # be immediately found. This is used to skip ahead to
1505 # the end of a range, and avoid processing each
1506 # individual code point in it.
1507 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1508 # characters, but excluding those which are
1509 # also noncharacter code points
1511 # The annotation types are an extension of the regular range types, though
1512 # some of the latter are folded into one. Make the new types negative to
1513 # avoid conflicting with the regular types
1514 my $SURROGATE_TYPE = -1;
1515 my $UNASSIGNED_TYPE = -2;
1516 my $PRIVATE_USE_TYPE = -3;
1517 my $NONCHARACTER_TYPE = -4;
1518 my $CONTROL_TYPE = -5;
1519 my $ABOVE_UNICODE_TYPE = -6;
1520 my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program
1522 sub populate_char_info ($) {
1523 # Used only with the $annotate option. Populates the arrays with the
1524 # input code point's info that are needed for outputting more detailed
1525 # comments. If calling context wants a return, it is the end point of
1526 # any contiguous range of characters that share essentially the same info
1529 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
1531 $viacode[$i] = $perl_charname->value_of($i) || "";
1532 $age[$i] = (defined $age)
1533 ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1534 ? $age->value_of($i)
1538 # A character is generally printable if Unicode says it is,
1539 # but below we make sure that most Unicode general category 'C' types
1541 $printable[$i] = $print->contains($i);
1543 # But the characters in this range were removed in v2.0 and replaced by
1544 # different ones later. Modern fonts will be for the replacement
1545 # characters, so suppress printing them.
1546 if (($v_version lt v2.0
1547 || ($compare_versions && $compare_versions lt v2.0))
1548 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1549 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1554 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1556 # Only these two regular types are treated specially for annotations
1558 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1559 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1561 # Give a generic name to all code points that don't have a real name.
1562 # We output ranges, if applicable, for these. Also calculate the end
1563 # point of the range.
1565 if (! $viacode[$i]) {
1566 if ($i > $MAX_UNICODE_CODEPOINT) {
1567 $viacode[$i] = 'Above-Unicode';
1568 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1570 $end = $MAX_WORKING_CODEPOINT;
1572 elsif ($gc-> table('Private_use')->contains($i)) {
1573 $viacode[$i] = 'Private Use';
1574 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1576 $end = $gc->table('Private_Use')->containing_range($i)->end;
1578 elsif ($NChar->contains($i)) {
1579 $viacode[$i] = 'Noncharacter';
1580 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1582 $end = $NChar->containing_range($i)->end;
1584 elsif ($gc-> table('Control')->contains($i)) {
1585 my $name_ref = property_ref('Name_Alias');
1586 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1587 $viacode[$i] = (defined $name_ref)
1588 ? $name_ref->value_of($i)
1590 $annotate_char_type[$i] = $CONTROL_TYPE;
1593 elsif ($gc-> table('Unassigned')->contains($i)) {
1594 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1596 $viacode[$i] = 'Unassigned';
1598 if (defined $block) { # No blocks in earliest releases
1599 $viacode[$i] .= ', block=' . $block-> value_of($i);
1600 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1602 # Because we name the unassigned by the blocks they are in, it
1603 # can't go past the end of that block, and it also can't go
1604 # past the unassigned range it is in. The special table makes
1605 # sure that the non-characters, which are unassigned, are
1607 $end = min($block->containing_range($i)->end,
1608 $unassigned_sans_noncharacters->
1609 containing_range($i)->end);
1613 while ($unassigned_sans_noncharacters->contains($end)) {
1619 elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1620 $viacode[$i] = 'Surrogate';
1621 $annotate_char_type[$i] = $SURROGATE_TYPE;
1623 $end = $gc->table('Surrogate')->containing_range($i)->end;
1626 Carp::my_carp_bug("Can't figure out how to annotate "
1627 . sprintf("U+%04X", $i)
1628 . ". Proceeding anyway.");
1629 $viacode[$i] = 'UNKNOWN';
1630 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1635 # Here, has a name, but if it's one in which the code point number is
1636 # appended to the name, do that.
1637 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1638 $viacode[$i] .= sprintf("-%04X", $i);
1640 my $limit = $perl_charname->containing_range($i)->end;
1642 # Do all these as groups of the same age, instead of individually,
1643 # because their names are so meaningless, and there are typically
1644 # large quantities of them.
1646 while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1656 # And here, has a name, but if it's a hangul syllable one, replace it with
1657 # the correct name from the Unicode algorithm
1658 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1660 my $SIndex = $i - $SBase;
1661 my $L = $LBase + $SIndex / $NCount;
1662 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1663 my $T = $TBase + $SIndex % $TCount;
1664 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1665 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1666 $end = $perl_charname->containing_range($i)->end;
1669 return if ! defined wantarray;
1670 return $i if ! defined $end; # If not a range, return the input
1672 # Save this whole range so can find the end point quickly
1673 $annotate_ranges->add_map($i, $end, $end);
1678 # Commented code below should work on Perl 5.8.
1679 ## This 'require' doesn't necessarily work in miniperl, and even if it does,
1680 ## the native perl version of it (which is what would operate under miniperl)
1681 ## is extremely slow, as it does a string eval every call.
1682 #my $has_fast_scalar_util = $^X !~ /miniperl/
1683 # && defined eval "require Scalar::Util";
1686 # # Returns the address of the blessed input object. Uses the XS version if
1687 # # available. It doesn't check for blessedness because that would do a
1688 # # string eval every call, and the program is structured so that this is
1689 # # never called for a non-blessed object.
1691 # return Scalar::Util::refaddr($_[0]) if $has_fast_scalar_util;
1693 # # Check at least that is a ref.
1694 # my $pkg = ref($_[0]) or return undef;
1696 # # Change to a fake package to defeat any overloaded stringify
1697 # bless $_[0], 'main::Fake';
1699 # # Numifying a ref gives its address.
1700 # my $addr = pack 'J', $_[0];
1702 # # Return to original class
1703 # bless $_[0], $pkg;
1710 return $a if $a >= $b;
1717 return $a if $a <= $b;
1721 sub clarify_number ($) {
1722 # This returns the input number with underscores inserted every 3 digits
1723 # in large (5 digits or more) numbers. Input must be entirely digits, not
1727 my $pos = length($number) - 3;
1728 return $number if $pos <= 1;
1730 substr($number, $pos, 0) = '_';
1736 sub clarify_code_point_count ($) {
1737 # This is like clarify_number(), but the input is assumed to be a count of
1738 # code points, rather than a generic number.
1743 if ($number > $MAX_UNICODE_CODEPOINTS) {
1744 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1745 return "All above-Unicode code points" if $number == 0;
1746 $append = " + all above-Unicode code points";
1748 return clarify_number($number) . $append;
1753 # These routines give a uniform treatment of messages in this program. They
1754 # are placed in the Carp package to cause the stack trace to not include them,
1755 # although an alternative would be to use another package and set @CARP_NOT
1758 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1760 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1761 # and overload trying to load Scalar:Util under miniperl. See
1762 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1763 undef $overload::VERSION;
1766 my $message = shift || "";
1767 my $nofold = shift || 0;
1770 $message = main::join_lines($message);
1771 $message =~ s/^$0: *//; # Remove initial program name
1772 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1773 $message = "\n$0: $message;";
1775 # Fold the message with program name, semi-colon end punctuation
1776 # (which looks good with the message that carp appends to it), and a
1777 # hanging indent for continuation lines.
1778 $message = main::simple_fold($message, "", 4) unless $nofold;
1779 $message =~ s/\n$//; # Remove the trailing nl so what carp
1780 # appends is to the same line
1783 return $message if defined wantarray; # If a caller just wants the msg
1790 # This is called when it is clear that the problem is caused by a bug in
1793 my $message = shift;
1794 $message =~ s/^$0: *//;
1795 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1800 sub carp_too_few_args {
1802 my_carp_bug("Wrong number of arguments: to 'carp_too_few_arguments'. No action taken.");
1806 my $args_ref = shift;
1809 my_carp_bug("Need at least $count arguments to "
1811 . ". Instead got: '"
1812 . join ', ', @$args_ref
1813 . "'. No action taken.");
1817 sub carp_extra_args {
1818 my $args_ref = shift;
1819 my_carp_bug("Too many arguments to 'carp_extra_args': (" . join(', ', @_) . "); Extras ignored.") if @_;
1821 unless (ref $args_ref) {
1822 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1825 my ($package, $file, $line) = caller;
1826 my $subroutine = (caller 1)[3];
1829 if (ref $args_ref eq 'HASH') {
1830 foreach my $key (keys %$args_ref) {
1831 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1833 $list = join ', ', each %{$args_ref};
1835 elsif (ref $args_ref eq 'ARRAY') {
1836 foreach my $arg (@$args_ref) {
1837 $arg = $UNDEF unless defined $arg;
1839 $list = join ', ', @$args_ref;
1842 my_carp_bug("Can't cope with ref "
1844 . " . argument to 'carp_extra_args'. Not checking arguments.");
1848 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1856 # This program uses the inside-out method for objects, as recommended in
1857 # "Perl Best Practices". (This is the best solution still, since this has
1858 # to run under miniperl.) This closure aids in generating those. There
1859 # are two routines. setup_package() is called once per package to set
1860 # things up, and then set_access() is called for each hash representing a
1861 # field in the object. These routines arrange for the object to be
1862 # properly destroyed when no longer used, and for standard accessor
1863 # functions to be generated. If you need more complex accessors, just
1864 # write your own and leave those accesses out of the call to set_access().
1865 # More details below.
1867 my %constructor_fields; # fields that are to be used in constructors; see
1870 # The values of this hash will be the package names as keys to other
1871 # hashes containing the name of each field in the package as keys, and
1872 # references to their respective hashes as values.
1876 # Sets up the package, creating standard DESTROY and dump methods
1877 # (unless already defined). The dump method is used in debugging by
1879 # The optional parameters are:
1880 # a) a reference to a hash, that gets populated by later
1881 # set_access() calls with one of the accesses being
1882 # 'constructor'. The caller can then refer to this, but it is
1883 # not otherwise used by these two routines.
1884 # b) a reference to a callback routine to call during destruction
1885 # of the object, before any fields are actually destroyed
1888 my $constructor_ref = delete $args{'Constructor_Fields'};
1889 my $destroy_callback = delete $args{'Destroy_Callback'};
1890 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1893 my $package = (caller)[0];
1895 $package_fields{$package} = \%fields;
1896 $constructor_fields{$package} = $constructor_ref;
1898 unless ($package->can('DESTROY')) {
1899 my $destroy_name = "${package}::DESTROY";
1902 # Use typeglob to give the anonymous subroutine the name we want
1903 *$destroy_name = sub {
1905 my $addr = do { no overloading; pack 'J', $self; };
1907 $self->$destroy_callback if $destroy_callback;
1908 foreach my $field (keys %{$package_fields{$package}}) {
1909 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1910 delete $package_fields{$package}{$field}{$addr};
1916 unless ($package->can('dump')) {
1917 my $dump_name = "${package}::dump";
1921 return dump_inside_out($self, $package_fields{$package}, @_);
1928 # Arrange for the input field to be garbage collected when no longer
1929 # needed. Also, creates standard accessor functions for the field
1930 # based on the optional parameters-- none if none of these parameters:
1931 # 'addable' creates an 'add_NAME()' accessor function.
1932 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1934 # 'settable' creates a 'set_NAME()' accessor function.
1935 # 'constructor' doesn't create an accessor function, but adds the
1936 # field to the hash that was previously passed to
1938 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1939 # 'add' etc. all mean 'addable'.
1940 # The read accessor function will work on both array and scalar
1941 # values. If another accessor in the parameter list is 'a', the read
1942 # access assumes an array. You can also force it to be array access
1943 # by specifying 'readable_array' instead of 'readable'
1945 # A sort-of 'protected' access can be set-up by preceding the addable,
1946 # readable or settable with some initial portion of 'protected_' (but,
1947 # the underscore is required), like 'p_a', 'pro_set', etc. The
1948 # "protection" is only by convention. All that happens is that the
1949 # accessor functions' names begin with an underscore. So instead of
1950 # calling set_foo, the call is _set_foo. (Real protection could be
1951 # accomplished by having a new subroutine, end_package, called at the
1952 # end of each package, and then storing the __LINE__ ranges and
1953 # checking them on every accessor. But that is way overkill.)
1955 # We create anonymous subroutines as the accessors and then use
1956 # typeglobs to assign them to the proper package and name
1958 my $name = shift; # Name of the field
1959 my $field = shift; # Reference to the inside-out hash containing the
1962 my $package = (caller)[0];
1964 if (! exists $package_fields{$package}) {
1965 croak "$0: Must call 'setup_package' before 'set_access'";
1968 # Stash the field so DESTROY can get it.
1969 $package_fields{$package}{$name} = $field;
1971 # Remaining arguments are the accessors. For each...
1972 foreach my $access (@_) {
1973 my $access = lc $access;
1977 # Match the input as far as it goes.
1978 if ($access =~ /^(p[^_]*)_/) {
1980 if (substr('protected_', 0, length $protected)
1984 # Add 1 for the underscore not included in $protected
1985 $access = substr($access, length($protected) + 1);
1993 if (substr('addable', 0, length $access) eq $access) {
1994 my $subname = "${package}::${protected}add_$name";
1997 # add_ accessor. Don't add if already there, which we
1998 # determine using 'eq' for scalars and '==' otherwise.
2001 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2004 my $addr = do { no overloading; pack 'J', $self; };
2005 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2007 return if grep { $value == $_ } @{$field->{$addr}};
2010 return if grep { $value eq $_ } @{$field->{$addr}};
2012 push @{$field->{$addr}}, $value;
2016 elsif (substr('constructor', 0, length $access) eq $access) {
2018 Carp::my_carp_bug("Can't set-up 'protected' constructors")
2021 $constructor_fields{$package}{$name} = $field;
2024 elsif (substr('readable_array', 0, length $access) eq $access) {
2026 # Here has read access. If one of the other parameters for
2027 # access is array, or this one specifies array (by being more
2028 # than just 'readable_'), then create a subroutine that
2029 # assumes the data is an array. Otherwise just a scalar
2030 my $subname = "${package}::${protected}$name";
2031 if (grep { /^a/i } @_
2032 or length($access) > length('readable_'))
2037 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2038 my $addr = do { no overloading; pack 'J', $_[0]; };
2039 if (ref $field->{$addr} ne 'ARRAY') {
2040 my $type = ref $field->{$addr};
2041 $type = 'scalar' unless $type;
2042 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
2045 return scalar @{$field->{$addr}} unless wantarray;
2047 # Make a copy; had problems with caller modifying the
2048 # original otherwise
2049 my @return = @{$field->{$addr}};
2055 # Here not an array value, a simpler function.
2059 Carp::carp_extra_args(\@_) if main::DEBUG && @_ > 1;
2061 return $field->{pack 'J', $_[0]};
2065 elsif (substr('settable', 0, length $access) eq $access) {
2066 my $subname = "${package}::${protected}set_$name";
2071 return Carp::carp_too_few_args(\@_, 2) if @_ < 2;
2072 Carp::carp_extra_args(\@_) if @_ > 2;
2074 # $self is $_[0]; $value is $_[1]
2076 $field->{pack 'J', $_[0]} = $_[1];
2081 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2090 # All input files use this object, which stores various attributes about them,
2091 # and provides for convenient, uniform handling. The run method wraps the
2092 # processing. It handles all the bookkeeping of opening, reading, and closing
2093 # the file, returning only significant input lines.
2095 # Each object gets a handler which processes the body of the file, and is
2096 # called by run(). All character property files must use the generic,
2097 # default handler, which has code scrubbed to handle things you might not
2098 # expect, including automatic EBCDIC handling. For files that don't deal with
2099 # mapping code points to a property value, such as test files,
2100 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2101 # handler to be a custom one. Such a handler should basically be a
2102 # while(next_line()) {...} loop.
2104 # You can also set up handlers to
2105 # 0) call during object construction time, after everything else is done
2106 # 1) call before the first line is read, for pre processing
2107 # 2) call to adjust each line of the input before the main handler gets
2108 # them. This can be automatically generated, if appropriately simple
2109 # enough, by specifying a Properties parameter in the constructor.
2110 # 3) call upon EOF before the main handler exits its loop
2111 # 4) call at the end, for post processing
2113 # $_ is used to store the input line, and is to be filtered by the
2114 # each_line_handler()s. So, if the format of the line is not in the desired
2115 # format for the main handler, these are used to do that adjusting. They can
2116 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2117 # so the $_ output of one is used as the input to the next. The EOF handler
2118 # is also stackable, but none of the others are, but could easily be changed
2121 # Some properties are used by the Perl core but aren't defined until later
2122 # Unicode releases. The perl interpreter would have problems working when
2123 # compiled with an earlier Unicode version that doesn't have them, so we need
2124 # to define them somehow for those releases. The 'Early' constructor
2125 # parameter can be used to automatically handle this. It is essentially
2126 # ignored if the Unicode version being compiled has a data file for this
2127 # property. Either code to execute or a file to read can be specified.
2128 # Details are at the %early definition.
2130 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2131 # which insert the parameters as lines to be processed before the next input
2132 # file line is read. This allows the EOF handler(s) to flush buffers, for
2133 # example. The difference between the two routines is that the lines inserted
2134 # by insert_lines() are subjected to the each_line_handler()s. (So if you
2135 # called it from such a handler, you would get infinite recursion without some
2136 # mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go
2137 # directly to the main handler without any adjustments. If the
2138 # post-processing handler calls any of these, there will be no effect. Some
2139 # error checking for these conditions could be added, but it hasn't been done.
2141 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2142 # to prevent further processing of the line. This routine will output the
2143 # message as a warning once, and then keep a count of the lines that have the
2144 # same message, and output that count at the end of the file's processing.
2145 # This keeps the number of messages down to a manageable amount.
2147 # get_missings() should be called to retrieve any @missing input lines.
2148 # Messages will be raised if this isn't done if the options aren't to ignore
2151 sub trace { return main::trace(@_); }
2154 # Keep track of fields that are to be put into the constructor.
2155 my %constructor_fields;
2157 main::setup_package(Constructor_Fields => \%constructor_fields);
2159 my %file; # Input file name, required
2160 main::set_access('file', \%file, qw{ c r });
2162 my %first_released; # Unicode version file was first released in, required
2163 main::set_access('first_released', \%first_released, qw{ c r });
2165 my %handler; # Subroutine to process the input file, defaults to
2166 # 'process_generic_property_file'
2167 main::set_access('handler', \%handler, qw{ c });
2170 # name of property this file is for. defaults to none, meaning not
2171 # applicable, or is otherwise determinable, for example, from each line.
2172 main::set_access('property', \%property, qw{ c r });
2175 # This is either an unsigned number, or a list of property names. In the
2176 # former case, if it is non-zero, it means the file is optional, so if the
2177 # file is absent, no warning about that is output. In the latter case, it
2178 # is a list of properties that the file (exclusively) defines. If the
2179 # file is present, tables for those properties will be produced; if
2180 # absent, none will, even if they are listed elsewhere (namely
2181 # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2182 # and no warnings will be raised about them not being available. (And no
2183 # warning about the file itself will be raised.)
2184 main::set_access('optional', \%optional, qw{ c readable_array } );
2187 # This is used for debugging, to skip processing of all but a few input
2188 # files. Add 'non_skip => 1' to the constructor for those files you want
2189 # processed when you set the $debug_skip global.
2190 main::set_access('non_skip', \%non_skip, 'c');
2193 # This is used to skip processing of this input file (semi-) permanently.
2194 # The value should be the reason the file is being skipped. It is used
2195 # for files that we aren't planning to process anytime soon, but want to
2196 # allow to be in the directory and be checked for their names not
2197 # conflicting with any other files on a DOS 8.3 name filesystem, but to
2198 # not otherwise be processed, and to not raise a warning about not being
2199 # handled. In the constructor call, any value that evaluates to a numeric
2200 # 0 or undef means don't skip. Any other value is a string giving the
2201 # reason it is being skipped, and this will appear in generated pod.
2202 # However, an empty string reason will suppress the pod entry.
2203 # Internally, calls that evaluate to numeric 0 are changed into undef to
2204 # distinguish them from an empty string call.
2205 main::set_access('skip', \%skip, 'c', 'r');
2207 my %each_line_handler;
2208 # list of subroutines to look at and filter each non-comment line in the
2209 # file. defaults to none. The subroutines are called in order, each is
2210 # to adjust $_ for the next one, and the final one adjusts it for
2212 main::set_access('each_line_handler', \%each_line_handler, 'c');
2214 my %retain_trailing_comments;
2215 # This is used to not discard the comments that end data lines. This
2216 # would be used only for files with non-typical syntax, and most code here
2217 # assumes that comments have been stripped, so special handlers would have
2218 # to be written. It is assumed that the code will use these in
2219 # single-quoted contexts, and so any "'" marks in the comment will be
2220 # prefixed by a backslash.
2221 main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2223 my %properties; # Optional ordered list of the properties that occur in each
2224 # meaningful line of the input file. If present, an appropriate
2225 # each_line_handler() is automatically generated and pushed onto the stack
2226 # of such handlers. This is useful when a file contains multiple
2227 # properties per line, but no other special considerations are necessary.
2228 # The special value "<ignored>" means to discard the corresponding input
2230 # Any @missing lines in the file should also match this syntax; no such
2231 # files exist as of 6.3. But if it happens in a future release, the code
2232 # could be expanded to properly parse them.
2233 main::set_access('properties', \%properties, qw{ c r });
2235 my %has_missings_defaults;
2236 # ? Are there lines in the file giving default values for code points
2237 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2238 # the norm, but IGNORED means it has such lines, but the handler doesn't
2239 # use them. Having these three states allows us to catch changes to the
2240 # UCD that this program should track. XXX This could be expanded to
2241 # specify the syntax for such lines, like %properties above.
2242 main::set_access('has_missings_defaults',
2243 \%has_missings_defaults, qw{ c r });
2245 my %construction_time_handler;
2246 # Subroutine to call at the end of the new method. If undef, no such
2247 # handler is called.
2248 main::set_access('construction_time_handler',
2249 \%construction_time_handler, qw{ c });
2252 # Subroutine to call before doing anything else in the file. If undef, no
2253 # such handler is called.
2254 main::set_access('pre_handler', \%pre_handler, qw{ c });
2257 # Subroutines to call upon getting an EOF on the input file, but before
2258 # that is returned to the main handler. This is to allow buffers to be
2259 # flushed. The handler is expected to call insert_lines() or
2260 # insert_adjusted() with the buffered material
2261 main::set_access('eof_handler', \%eof_handler, qw{ c });
2264 # Subroutine to call after all the lines of the file are read in and
2265 # processed. If undef, no such handler is called. Note that this cannot
2266 # add lines to be processed; instead use eof_handler
2267 main::set_access('post_handler', \%post_handler, qw{ c });
2269 my %progress_message;
2270 # Message to print to display progress in lieu of the standard one
2271 main::set_access('progress_message', \%progress_message, qw{ c });
2274 # cache open file handle, internal. Is undef if file hasn't been
2275 # processed at all, empty if has;
2276 main::set_access('handle', \%handle);
2279 # cache of lines added virtually to the file, internal
2280 main::set_access('added_lines', \%added_lines);
2283 # cache of lines added virtually to the file, internal
2284 main::set_access('remapped_lines', \%remapped_lines);
2287 # cache of errors found, internal
2288 main::set_access('errors', \%errors);
2291 # storage of '@missing' defaults lines
2292 main::set_access('missings', \%missings);
2295 # Used for properties that must be defined (for Perl's purposes) on
2296 # versions of Unicode earlier than Unicode itself defines them. The
2297 # parameter is an array (it would be better to be a hash, but not worth
2298 # bothering about due to its rare use).
2300 # The first element is either a code reference to call when in a release
2301 # earlier than the Unicode file is available in, or it is an alternate
2302 # file to use instead of the non-existent one. This file must have been
2303 # plunked down in the same directory as mktables. Should you be compiling
2304 # on a release that needs such a file, mktables will abort the
2305 # compilation, and tell you where to get the necessary file(s), and what
2306 # name(s) to use to store them as.
2307 # In the case of specifying an alternate file, the array must contain two
2310 # [1] is the name of the property that will be generated by this file.
2311 # The class automatically takes the input file and excludes any code
2312 # points in it that were not assigned in the Unicode version being
2313 # compiled. It then uses this result to define the property in the given
2314 # version. Since the property doesn't actually exist in the Unicode
2315 # version being compiled, this should be a name accessible only by core
2316 # perl. If it is the same name as the regular property, the constructor
2317 # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2318 # get output, and so will be unusable by non-core code. Otherwise it gets
2319 # marked as $INTERNAL_ONLY.
2321 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2322 # the Hangul syllables in that release (which were ripped out in version
2323 # 2) for the given property . (Hence it is ignored except when compiling
2324 # version 1. You only get one value that applies to all of them, which
2325 # may not be the actual reality, but probably nobody cares anyway for
2326 # these obsolete characters.)
2328 # [3] if present is the default value for the property to assign for code
2329 # points not given in the input. If not present, the default from the
2330 # normal property is used
2332 # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2333 # it means to not add the name in [1] as an alias to the property name
2334 # used for these. Normally, when compiling Unicode versions that don't
2335 # invoke the early handling, the name is added as a synonym.
2337 # Not all files can be handled in the above way, and so the code ref
2338 # alternative is available. It can do whatever it needs to. The other
2339 # array elements are optional in this case, and the code is free to use or
2340 # ignore them if they are present.
2342 # Internally, the constructor unshifts a 0 or 1 onto this array to
2343 # indicate if an early alternative is actually being used or not. This
2344 # makes for easier testing later on.
2345 main::set_access('early', \%early, 'c');
2348 main::set_access('only_early', \%only_early, 'c');
2350 my %required_even_in_debug_skip;
2351 # debug_skip is used to speed up compilation during debugging by skipping
2352 # processing files that are not needed for the task at hand. However,
2353 # some files pretty much can never be skipped, and this is used to specify
2354 # that this is one of them. In order to skip this file, the call to the
2355 # constructor must be edited to comment out this parameter.
2356 main::set_access('required_even_in_debug_skip',
2357 \%required_even_in_debug_skip, 'c');
2360 # Some files get removed from the Unicode DB. This is a version object
2361 # giving the first release without this file.
2362 main::set_access('withdrawn', \%withdrawn, 'c');
2364 my %in_this_release;
2365 # Calculated value from %first_released and %withdrawn. Are we compiling
2366 # a Unicode release which includes this file?
2367 main::set_access('in_this_release', \%in_this_release);
2370 sub _next_line_with_remapped_range;
2375 my $self = bless \do{ my $anonymous_scalar }, $class;
2376 my $addr = do { no overloading; pack 'J', $self; };
2379 $handler{$addr} = \&main::process_generic_property_file;
2380 $retain_trailing_comments{$addr} = 0;
2381 $non_skip{$addr} = 0;
2382 $skip{$addr} = undef;
2383 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2384 $handle{$addr} = undef;
2385 $added_lines{$addr} = [ ];
2386 $remapped_lines{$addr} = [ ];
2387 $each_line_handler{$addr} = [ ];
2388 $eof_handler{$addr} = [ ];
2389 $errors{$addr} = { };
2390 $missings{$addr} = [ ];
2391 $early{$addr} = [ ];
2392 $optional{$addr} = [ ];
2394 # Two positional parameters.
2395 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2396 $file{$addr} = main::internal_file_to_platform(shift);
2397 $first_released{$addr} = shift;
2399 # The rest of the arguments are key => value pairs
2400 # %constructor_fields has been set up earlier to list all possible
2401 # ones. Either set or push, depending on how the default has been set
2404 foreach my $key (keys %args) {
2405 my $argument = $args{$key};
2407 # Note that the fields are the lower case of the constructor keys
2408 my $hash = $constructor_fields{lc $key};
2409 if (! defined $hash) {
2410 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2413 if (ref $hash->{$addr} eq 'ARRAY') {
2414 if (ref $argument eq 'ARRAY') {
2415 foreach my $argument (@{$argument}) {
2416 next if ! defined $argument;
2417 push @{$hash->{$addr}}, $argument;
2421 push @{$hash->{$addr}}, $argument if defined $argument;
2425 $hash->{$addr} = $argument;
2430 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2432 # Convert 0 (meaning don't skip) to undef
2433 undef $skip{$addr} unless $skip{$addr};
2435 # Handle the case where this file is optional
2436 my $pod_message_for_non_existent_optional = "";
2437 if ($optional{$addr}->@*) {
2439 # First element is the pod message
2440 $pod_message_for_non_existent_optional
2441 = shift $optional{$addr}->@*;
2442 # Convert a 0 'Optional' argument to an empty list to make later
2443 # code more concise.
2444 if ( $optional{$addr}->@*
2445 && $optional{$addr}->@* == 1
2446 && $optional{$addr}[0] ne ""
2447 && $optional{$addr}[0] !~ /\D/
2448 && $optional{$addr}[0] == 0)
2450 $optional{$addr} = [ ];
2452 else { # But if the only element doesn't evaluate to 0, make sure
2453 # that this file is indeed considered optional below.
2454 unshift $optional{$addr}->@*, 1;
2459 my $function_instead_of_file = 0;
2461 if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2462 $only_early{$addr} = 1;
2463 pop $early{$addr}->@*;
2466 # If we are compiling a Unicode release earlier than the file became
2467 # available, the constructor may have supplied a substitute
2468 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2470 # Yes, we have a substitute, that we will use; mark it so
2471 unshift $early{$addr}->@*, 1;
2473 # See the definition of %early for what the array elements mean.
2474 # Note that we have just unshifted onto the array, so the numbers
2475 # below are +1 of those in the %early description.
2476 # If we have a property this defines, create a table and default
2477 # map for it now (at essentially compile time), so that it will be
2478 # available for the whole of run time. (We will want to add this
2479 # name as an alias when we are using the official property name;
2480 # but this must be deferred until run(), because at construction
2481 # time the official names have yet to be defined.)
2482 if ($early{$addr}[2]) {
2483 my $fate = ($property{$addr}
2484 && $property{$addr} eq $early{$addr}[2])
2487 my $prop_object = Property->new($early{$addr}[2],
2489 Perl_Extension => 1,
2492 # If not specified by the constructor, use the default mapping
2493 # for the regular property for this substitute one.
2494 if ($early{$addr}[4]) {
2495 $prop_object->set_default_map($early{$addr}[4]);
2497 elsif ( defined $property{$addr}
2498 && defined $default_mapping{$property{$addr}})
2501 ->set_default_map($default_mapping{$property{$addr}});
2505 if (ref $early{$addr}[1] eq 'CODE') {
2506 $function_instead_of_file = 1;
2508 # If the first element of the array is a code ref, the others
2510 $handler{$addr} = $early{$addr}[1];
2511 $property{$addr} = $early{$addr}[2]
2512 if defined $early{$addr}[2];
2513 $progress = "substitute $file{$addr}";
2517 else { # Specifying a substitute file
2519 if (! main::file_exists($early{$addr}[1])) {
2521 # If we don't see the substitute file, generate an error
2522 # message giving the needed things, and add it to the list
2523 # of such to output before actual processing happens
2524 # (hence the user finds out all of them in one run).
2525 # Instead of creating a general method for NameAliases,
2526 # hard-code it here, as there is unlikely to ever be a
2527 # second one which needs special handling.
2528 my $string_version = ($file{$addr} eq "NameAliases.txt")
2529 ? 'at least 6.1 (the later, the better)'
2530 : sprintf "%vd", $first_released{$addr};
2531 push @missing_early_files, <<END;
2532 '$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2537 $progress = $early{$addr}[1];
2538 $progress .= ", substituting for $file{$addr}" if $file{$addr};
2539 $file{$addr} = $early{$addr}[1];
2540 $property{$addr} = $early{$addr}[2];
2542 # Ignore code points not in the version being compiled
2543 push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2545 if ( $v_version lt v2.0 # Hanguls in this release ...
2546 && defined $early{$addr}[3]) # ... need special treatment
2548 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2552 # And this substitute is valid for all releases.
2553 $first_released{$addr} = v0;
2555 else { # Normal behavior
2556 $progress = $file{$addr};
2557 unshift $early{$addr}->@*, 0; # No substitute
2560 my $file = $file{$addr};
2561 $progress_message{$addr} = "Processing $progress"
2562 unless $progress_message{$addr};
2564 # A file should be there if it is within the window of versions for
2565 # which Unicode supplies it
2566 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2567 $in_this_release{$addr} = 0;
2571 $in_this_release{$addr} = $first_released{$addr} le $v_version;
2573 # Check that the file for this object (possibly using a substitute
2574 # for early releases) exists or we have a function alternative
2575 if ( ! $function_instead_of_file
2576 && ! main::file_exists($file))
2578 # Here there is nothing available for this release. This is
2579 # fine if we aren't expecting anything in this release.
2580 if (! $in_this_release{$addr}) {
2581 $skip{$addr} = ""; # Don't remark since we expected
2582 # nothing and got nothing
2584 elsif ($optional{$addr}->@*) {
2586 # Here the file is optional in this release; Use the
2587 # passed in text to document this case in the pod.
2588 $skip{$addr} = $pod_message_for_non_existent_optional;
2590 elsif ( $in_this_release{$addr}
2591 && ! defined $skip{$addr}
2593 { # Doesn't exist but should.
2594 $skip{$addr} = "'$file' not found. Possibly Big problems";
2595 Carp::my_carp($skip{$addr});
2598 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2601 # The file exists; if not skipped for another reason, and we are
2602 # skipping most everything during debugging builds, use that as
2604 $skip{$addr} = '$debug_skip is on'
2610 && ! $required_even_in_debug_skip{$addr}
2613 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2616 # Here, we have figured out if we will be skipping this file or not.
2617 # If so, we add any single property it defines to any passed in
2618 # optional property list. These will be dealt with at run time.
2619 if (defined $skip{$addr}) {
2620 if ($property{$addr}) {
2621 push $optional{$addr}->@*, $property{$addr};
2623 } # Otherwise, are going to process the file.
2624 elsif ($property{$addr}) {
2626 # If the file has a property defined in the constructor for it, it
2627 # means that the property is not listed in the file's entries. So
2628 # add a handler (to the list of line handlers) to insert the
2629 # property name into the lines, to provide a uniform interface to
2630 # the final processing subroutine.
2631 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2633 elsif ($properties{$addr}) {
2635 # Similarly, there may be more than one property represented on
2636 # each line, with no clue but the constructor input what those
2637 # might be. Add a handler for each line in the input so that it
2638 # creates a separate input line for each property in those input
2639 # lines, thus making them suitable to handle generically.
2641 push @{$each_line_handler{$addr}},
2644 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2646 my @fields = split /\s*;\s*/, $_, -1;
2648 if (@fields - 1 > @{$properties{$addr}}) {
2649 $file->carp_bad_line('Extra fields');
2653 my $range = shift @fields; # 0th element is always the
2656 # The next fields in the input line correspond
2657 # respectively to the stored properties.
2658 for my $i (0 .. @{$properties{$addr}} - 1) {
2659 my $property_name = $properties{$addr}[$i];
2660 next if $property_name eq '<ignored>';
2661 $file->insert_adjusted_lines(
2662 "$range; $property_name; $fields[$i]");
2670 { # On non-ascii platforms, we use a special pre-handler
2673 *next_line = (main::NON_ASCII_PLATFORM)
2674 ? *_next_line_with_remapped_range
2678 &{$construction_time_handler{$addr}}($self)
2679 if $construction_time_handler{$addr};
2687 qw("") => "_operator_stringify",
2688 "." => \&main::_operator_dot,
2689 ".=" => \&main::_operator_dot_equal,
2692 sub _operator_stringify {
2695 return __PACKAGE__ . " object for " . $self->file;
2699 # Process the input object $self. This opens and closes the file and
2700 # calls all the handlers for it. Currently, this can only be called
2701 # once per file, as it destroy's the EOF handlers
2703 # flag to make sure extracted files are processed early
2704 state $seen_non_extracted = 0;
2707 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2709 my $addr = do { no overloading; pack 'J', $self; };
2711 my $file = $file{$addr};
2714 $handle{$addr} = 'pretend_is_open';
2717 if ($seen_non_extracted) {
2718 if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2719 # case of the file's name
2721 Carp::my_carp_bug(main::join_lines(<<END
2722 $file should be processed just after the 'Prop...Alias' files, and before
2723 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2724 have subtle problems
2729 elsif ($EXTRACTED_DIR
2731 # We only do this check for generic property files
2732 && $handler{$addr} == \&main::process_generic_property_file
2734 && $file !~ /$EXTRACTED/i)
2736 # We don't set this (by the 'if' above) if we have no
2737 # extracted directory, so if running on an early version,
2738 # this test won't work. Not worth worrying about.
2739 $seen_non_extracted = 1;
2742 # Mark the file as having being processed, and warn if it
2743 # isn't a file we are expecting. As we process the files,
2744 # they are deleted from the hash, so any that remain at the
2745 # end of the program are files that we didn't process.
2746 my $fkey = File::Spec->rel2abs($file);
2747 my $exists = delete $potential_files{lc($fkey)};
2749 Carp::my_carp("Was not expecting '$file'.")
2750 if $exists && ! $in_this_release{$addr};
2752 # If there is special handling for compiling Unicode releases
2753 # earlier than the first one in which Unicode defines this
2755 if ($early{$addr}->@* > 1) {
2757 # Mark as processed any substitute file that would be used in
2759 $fkey = File::Spec->rel2abs($early{$addr}[1]);
2760 delete $potential_files{lc($fkey)};
2762 # As commented in the constructor code, when using the
2763 # official property, we still have to allow the publicly
2764 # inaccessible early name so that the core code which uses it
2765 # will work regardless.
2766 if ( ! $only_early{$addr}
2767 && ! $early{$addr}[0]
2768 && $early{$addr}->@* > 2)
2770 my $early_property_name = $early{$addr}[2];
2771 if ($property{$addr} ne $early_property_name) {
2772 main::property_ref($property{$addr})
2773 ->add_alias($early_property_name);
2778 # We may be skipping this file ...
2779 if (defined $skip{$addr}) {
2781 # If the file isn't supposed to be in this release, there is
2783 if ($in_this_release{$addr}) {
2785 # But otherwise, we may print a message
2787 print STDERR "Skipping input file '$file'",
2788 " because '$skip{$addr}'\n";
2791 # And add it to the list of skipped files, which is later
2792 # used to make the pod
2793 $skipped_files{$file} = $skip{$addr};
2795 # The 'optional' list contains properties that are also to
2796 # be skipped along with the file. (There may also be
2797 # digits which are just placeholders to make sure it isn't
2799 foreach my $property ($optional{$addr}->@*) {
2800 next unless $property =~ /\D/;
2801 my $prop_object = main::property_ref($property);
2802 next unless defined $prop_object;
2803 $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2810 # Here, we are going to process the file. Open it, converting the
2811 # slashes used in this program into the proper form for the OS
2813 if (not open $file_handle, "<", $file) {
2814 Carp::my_carp("Can't open $file. Skipping: $!");
2817 $handle{$addr} = $file_handle; # Cache the open file handle
2819 # If possible, make sure that the file is the correct version.
2820 # (This data isn't available on early Unicode releases or in
2821 # UnicodeData.txt.) We don't do this check if we are using a
2822 # substitute file instead of the official one (though the code
2823 # could be extended to do so).
2824 if ($in_this_release{$addr}
2825 && ! $early{$addr}[0]
2826 && lc($file) ne 'unicodedata.txt')
2828 if ($file !~ /^Unihan/i) {
2830 # The non-Unihan files started getting version numbers in
2831 # 3.2, but some files in 4.0 are unchanged from 3.2, and
2832 # marked as 3.2. 4.0.1 is the first version where there
2833 # are no files marked as being from less than 4.0, though
2834 # some are marked as 4.0. In versions after that, the
2835 # numbers are correct.
2836 if ($v_version ge v4.0.1) {
2837 $_ = <$file_handle>; # The version number is in the
2839 if ($_ !~ / - $string_version \. /x) {
2843 # 4.0.1 had some valid files that weren't updated.
2844 if (! ($v_version eq v4.0.1 && $_ =~ /4\.0\.0/)) {
2845 die Carp::my_carp("File '$file' is version "
2846 . "'$_'. It should be "
2847 . "version $string_version");
2852 elsif ($v_version ge v6.0.0) { # Unihan
2854 # Unihan files didn't get accurate version numbers until
2855 # 6.0. The version is somewhere in the first comment
2857 while (<$file_handle>) {
2859 Carp::my_carp_bug("Could not find the expected "
2860 . "version info in file '$file'");
2865 next if $_ !~ / version: /x;
2866 last if $_ =~ /$string_version/;
2867 die Carp::my_carp("File '$file' is version "
2868 . "'$_'. It should be "
2869 . "version $string_version");
2875 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2877 # Call any special handler for before the file.
2878 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2880 # Then the main handler
2881 &{$handler{$addr}}($self);
2883 # Then any special post-file handler.
2884 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2886 # If any errors have been accumulated, output the counts (as the first
2887 # error message in each class was output when it was encountered).
2888 if ($errors{$addr}) {
2891 foreach my $error (keys %{$errors{$addr}}) {
2892 $total += $errors{$addr}->{$error};
2893 delete $errors{$addr}->{$error};
2898 = "A total of $total lines had errors in $file. ";
2900 $message .= ($types == 1)
2901 ? '(Only the first one was displayed.)'
2902 : '(Only the first of each type was displayed.)';
2903 Carp::my_carp($message);
2907 if (@{$missings{$addr}}) {
2908 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2911 # If a real file handle, close it.
2912 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2914 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2915 # the file, as opposed to undef
2920 # Sets $_ to be the next logical input line, if any. Returns non-zero
2921 # if such a line exists. 'logical' means that any lines that have
2922 # been added via insert_lines() will be returned in $_ before the file
2926 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2928 my $addr = do { no overloading; pack 'J', $self; };
2930 # Here the file is open (or if the handle is not a ref, is an open
2931 # 'virtual' file). Get the next line; any inserted lines get priority
2932 # over the file itself.
2936 while (1) { # Loop until find non-comment, non-empty line
2937 #local $to_trace = 1 if main::DEBUG;
2938 my $inserted_ref = shift @{$added_lines{$addr}};
2939 if (defined $inserted_ref) {
2940 ($adjusted, $_) = @{$inserted_ref};
2941 trace $adjusted, $_ if main::DEBUG && $to_trace;
2942 return 1 if $adjusted;
2945 last if ! ref $handle{$addr}; # Don't read unless is real file
2946 last if ! defined ($_ = readline $handle{$addr});
2949 trace $_ if main::DEBUG && $to_trace;
2951 # See if this line is the comment line that defines what property
2952 # value that code points that are not listed in the file should
2953 # have. The format or existence of these lines is not guaranteed
2954 # by Unicode since they are comments, but the documentation says
2955 # that this was added for machine-readability, so probably won't
2956 # change. This works starting in Unicode Version 5.0. They look
2959 # @missing: 0000..10FFFF; Not_Reordered
2960 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2961 # @missing: 0000..10FFFF; ; NaN
2963 # Save the line for a later get_missings() call.
2964 if (/$missing_defaults_prefix/) {
2965 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2966 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2968 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2969 my @defaults = split /\s* ; \s*/x, $_;
2971 # The first field is the @missing, which ends in a
2972 # semi-colon, so can safely shift.
2975 # Some of these lines may have empty field placeholders
2976 # which get in the way. An example is:
2977 # @missing: 0000..10FFFF; ; NaN
2978 # Remove them. Process starting from the top so the
2979 # splice doesn't affect things still to be looked at.
2980 for (my $i = @defaults - 1; $i >= 0; $i--) {
2981 next if $defaults[$i] ne "";
2982 splice @defaults, $i, 1;
2985 # What's left should be just the property (maybe) and the
2986 # default. Having only one element means it doesn't have
2990 if (@defaults >= 1) {
2991 if (@defaults == 1) {
2992 $default = $defaults[0];
2995 $property = $defaults[0];
2996 $default = $defaults[1];
3002 || ($default =~ /^</
3003 && $default !~ /^<code *point>$/i
3004 && $default !~ /^<none>$/i
3005 && $default !~ /^<script>$/i))
3007 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
3011 # If the property is missing from the line, it should
3012 # be the one for the whole file
3013 $property = $property{$addr} if ! defined $property;
3015 # Change <none> to the null string, which is what it
3016 # really means. If the default is the code point
3017 # itself, set it to <code point>, which is what
3018 # Unicode uses (but sometimes they've forgotten the
3020 if ($default =~ /^<none>$/i) {
3023 elsif ($default =~ /^<code *point>$/i) {
3024 $default = $CODE_POINT;
3026 elsif ($default =~ /^<script>$/i) {
3028 # Special case this one. Currently is from
3029 # ScriptExtensions.txt, and means for all unlisted
3030 # code points, use their Script property values.
3031 # For the code points not listed in that file, the
3032 # default value is 'Unknown'.
3033 $default = "Unknown";
3036 # Store them as a sub-arrays with both components.
3037 push @{$missings{$addr}}, [ $default, $property ];
3041 # There is nothing for the caller to process on this comment
3046 # Unless to keep, remove comments. If to keep, ignore
3047 # comment-only lines
3048 if ($retain_trailing_comments{$addr}) {
3049 next if / ^ \s* \# /x;
3051 # But escape any single quotes (done in both the comment and
3052 # non-comment portion; this could be a bug someday, but not
3060 # Remove trailing space, and skip this line if the result is empty
3064 # Call any handlers for this line, and skip further processing of
3065 # the line if the handler sets the line to null.
3066 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3071 # Here the line is ok. return success.
3073 } # End of looping through lines.
3075 # If there are EOF handlers, call each (only once) and if it generates
3076 # more lines to process go back in the loop to handle them.
3077 while ($eof_handler{$addr}->@*) {
3078 &{$eof_handler{$addr}[0]}($self);
3079 shift $eof_handler{$addr}->@*; # Currently only get one shot at it.
3080 goto LINE if $added_lines{$addr};
3083 # Return failure -- no more lines.
3088 sub _next_line_with_remapped_range {
3090 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3092 # like _next_line(), but for use on non-ASCII platforms. It sets $_
3093 # to be the next logical input line, if any. Returns non-zero if such
3094 # a line exists. 'logical' means that any lines that have been added
3095 # via insert_lines() will be returned in $_ before the file is read
3098 # The difference from _next_line() is that this remaps the Unicode
3099 # code points in the input to those of the native platform. Each
3100 # input line contains a single code point, or a single contiguous
3101 # range of them This routine splits each range into its individual
3102 # code points and caches them. It returns the cached values,
3103 # translated into their native equivalents, one at a time, for each
3104 # call, before reading the next line. Since native values can only be
3105 # a single byte wide, no translation is needed for code points above
3106 # 0xFF, and ranges that are entirely above that number are not split.
3107 # If an input line contains the range 254-1000, it would be split into
3108 # three elements: 254, 255, and 256-1000. (The downstream table
3109 # insertion code will sort and coalesce the individual code points
3110 # into appropriate ranges.)
3112 my $addr = do { no overloading; pack 'J', $self; };
3116 # Look in cache before reading the next line. Return any cached
3118 my $inserted = shift @{$remapped_lines{$addr}};
3119 if (defined $inserted) {
3120 trace $inserted if main::DEBUG && $to_trace;
3121 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3122 trace $_ if main::DEBUG && $to_trace;
3126 # Get the next line.
3127 return 0 unless _next_line($self);
3129 # If there is a special handler for it, return the line,
3130 # untranslated. This should happen only for files that are
3131 # special, not being code-point related, such as property names.
3132 return 1 if $handler{$addr}
3133 != \&main::process_generic_property_file;
3135 my ($range, $property_name, $map, @remainder)
3136 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3139 || ! defined $property_name
3140 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3142 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored");
3146 my $high = (defined $2) ? hex $2 : $low;
3148 # If the input maps the range to another code point, remap the
3149 # target if it is between 0 and 255.
3152 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3153 $tail = "$property_name; $map";
3154 $_ = "$range; $tail";
3157 $tail = $property_name;
3160 # If entire range is above 255, just return it, unchanged (except
3161 # any mapped-to code point, already changed above)
3162 return 1 if $low > 255;
3164 # Cache an entry for every code point < 255. For those in the
3165 # range above 255, return a dummy entry for just that portion of
3166 # the range. Note that this will be out-of-order, but that is not
3168 foreach my $code_point ($low .. $high) {
3169 if ($code_point > 255) {
3170 $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3173 push @{$remapped_lines{$addr}}, "$code_point; $tail";
3175 } # End of looping through lines.
3180 # Not currently used, not fully tested.
3182 # # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3183 # # record. Not callable from an each_line_handler(), nor does it call
3184 # # an each_line_handler() on the line.
3187 # my $addr = do { no overloading; pack 'J', $self; };
3189 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
3190 # my ($adjusted, $line) = @{$inserted_ref};
3191 # next if $adjusted;
3193 # # Remove comments and trailing space, and return a non-empty
3196 # $line =~ s/\s+$//;
3197 # return $line if $line ne "";
3200 # return if ! ref $handle{$addr}; # Don't read unless is real file
3201 # while (1) { # Loop until find non-comment, non-empty line
3202 # local $to_trace = 1 if main::DEBUG;
3203 # trace $_ if main::DEBUG && $to_trace;
3204 # return if ! defined (my $line = readline $handle{$addr});
3206 # push @{$added_lines{$addr}}, [ 0, $line ];
3209 # $line =~ s/\s+$//;
3210 # return $line if $line ne "";
3218 # Lines can be inserted so that it looks like they were in the input
3219 # file at the place it was when this routine is called. See also
3220 # insert_adjusted_lines(). Lines inserted via this routine go through
3221 # any each_line_handler()
3225 # Each inserted line is an array, with the first element being 0 to
3226 # indicate that this line hasn't been adjusted, and needs to be
3229 push @{$added_lines{pack 'J', $self}}, map { [ 0, $_ ] } @_;
3233 sub insert_adjusted_lines {
3234 # Lines can be inserted so that it looks like they were in the input
3235 # file at the place it was when this routine is called. See also
3236 # insert_lines(). Lines inserted via this routine are already fully
3237 # adjusted, ready to be processed; each_line_handler()s handlers will
3238 # not be called. This means this is not a completely general
3239 # facility, as only the last each_line_handler on the stack should
3240 # call this. It could be made more general, by passing to each of the
3241 # line_handlers their position on the stack, which they would pass on
3242 # to this routine, and that would replace the boolean first element in
3243 # the anonymous array pushed here, so that the next_line routine could
3244 # use that to call only those handlers whose index is after it on the
3245 # stack. But this is overkill for what is needed now.
3248 trace $_[0] if main::DEBUG && $to_trace;
3250 # Each inserted line is an array, with the first element being 1 to
3251 # indicate that this line has been adjusted
3253 push @{$added_lines{pack 'J', $self}}, map { [ 1, $_ ] } @_;
3258 # Returns the stored up @missings lines' values, and clears the list.
3259 # The values are in an array, consisting of the default in the first
3260 # element, and the property in the 2nd. However, since these lines
3261 # can be stacked up, the return is an array of all these arrays.
3264 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3266 my $addr = do { no overloading; pack 'J', $self; };
3268 # If not accepting a list return, just return the first one.
3269 return shift @{$missings{$addr}} unless wantarray;
3271 my @return = @{$missings{$addr}};
3272 undef @{$missings{$addr}};
3276 sub _exclude_unassigned {
3278 # Takes the range in $_ and excludes code points that aren't assigned
3281 state $skip_inserted_count = 0;
3283 # Ignore recursive calls.
3284 if ($skip_inserted_count) {
3285 $skip_inserted_count--;
3289 # Find what code points are assigned in this release
3290 main::calculate_Assigned() if ! defined $Assigned;
3293 my $addr = do { no overloading; pack 'J', $self; };
3294 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3296 my ($range, @remainder)
3297 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3299 # Examine the range.
3300 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3303 my $high = (defined $2) ? hex $2 : $low;
3305 # Split the range into subranges of just those code points in it
3306 # that are assigned.
3307 my @ranges = (Range_List->new(Initialize
3308 => Range->new($low, $high)) & $Assigned)->ranges;
3310 # Do nothing if nothing in the original range is assigned in this
3311 # release; handle normally if everything is in this release.
3315 elsif (@ranges != 1) {
3317 # Here, some code points in the original range aren't in this
3318 # release; @ranges gives the ones that are. Create fake input
3319 # lines for each of the ranges, and set things up so that when
3320 # this routine is called on that fake input, it will do
3322 $skip_inserted_count = @ranges;
3323 my $remainder = join ";", @remainder;
3324 for my $range (@ranges) {
3325 $self->insert_lines(sprintf("%04X..%04X;%s",
3326 $range->start, $range->end, $remainder));
3328 $_ = ""; # The original range is now defunct.
3335 sub _fixup_obsolete_hanguls {
3337 # This is called only when compiling Unicode version 1. All Unicode
3338 # data for subsequent releases assumes that the code points that were
3339 # Hangul syllables in this release only are something else, so if
3340 # using such data, we have to override it
3343 my $addr = do { no overloading; pack 'J', $self; };
3344 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3346 my $object = main::property_ref($property{$addr});
3347 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3348 $FINAL_REMOVED_HANGUL_SYLLABLE,
3349 $early{$addr}[3], # Passed-in value for these
3350 Replace => $UNCONDITIONALLY);
3353 sub _insert_property_into_line {
3354 # Add a property field to $_, if this file requires it.
3357 my $addr = do { no overloading; pack 'J', $self; };
3358 my $property = $property{$addr};
3359 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3361 $_ =~ s/(;|$)/; $property$1/;
3366 # Output consistent error messages, using either a generic one, or the
3367 # one given by the optional parameter. To avoid gazillions of the
3368 # same message in case the syntax of a file is way off, this routine
3369 # only outputs the first instance of each message, incrementing a
3370 # count so the totals can be output at the end of the file.
3373 my $message = shift;
3374 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3376 my $addr = do { no overloading; pack 'J', $self; };
3378 $message = 'Unexpected line' unless $message;
3380 # No trailing punctuation so as to fit with our addenda.
3381 $message =~ s/[.:;,]$//;
3383 # If haven't seen this exact message before, output it now. Otherwise
3384 # increment the count of how many times it has occurred
3385 unless ($errors{$addr}->{$message}) {
3386 Carp::my_carp("$message in '$_' in "
3388 . " at line $.. Skipping this line;");
3389 $errors{$addr}->{$message} = 1;
3392 $errors{$addr}->{$message}++;
3395 # Clear the line to prevent any further (meaningful) processing of it.
3402 package Multi_Default;
3404 # Certain properties in early versions of Unicode had more than one possible
3405 # default for code points missing from the files. In these cases, one
3406 # default applies to everything left over after all the others are applied,
3407 # and for each of the others, there is a description of which class of code
3408 # points applies to it. This object helps implement this by storing the
3409 # defaults, and for all but that final default, an eval string that generates
3410 # the class that it applies to.
3415 main::setup_package();
3418 # The defaults structure for the classes
3419 main::set_access('class_defaults', \%class_defaults);
3422 # The default that applies to everything left over.
3423 main::set_access('other_default', \%other_default, 'r');
3427 # The constructor is called with default => eval pairs, terminated by
3428 # the left-over default. e.g.
3429 # Multi_Default->new(
3430 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3432 # 'R' => 'some other expression that evaluates to code points',
3437 # It is best to leave the final value be the one that matches the
3438 # above-Unicode code points.
3442 my $self = bless \do{my $anonymous_scalar}, $class;
3443 my $addr = do { no overloading; pack 'J', $self; };
3446 my $default = shift;
3448 $class_defaults{$addr}->{$default} = $eval;
3451 $other_default{$addr} = shift;
3456 sub get_next_defaults {
3457 # Iterates and returns the next class of defaults.
3459 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3461 my $addr = do { no overloading; pack 'J', $self; };
3463 return each %{$class_defaults{$addr}};
3469 # An alias is one of the names that a table goes by. This class defines them
3470 # including some attributes. Everything is currently setup in the
3476 main::setup_package();
3479 main::set_access('name', \%name, 'r');
3482 # Should this name match loosely or not.
3483 main::set_access('loose_match', \%loose_match, 'r');
3485 my %make_re_pod_entry;
3486 # Some aliases should not get their own entries in the re section of the
3487 # pod, because they are covered by a wild-card, and some we want to
3488 # discourage use of. Binary
3489 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3492 # Is this documented to be accessible via Unicode::UCD
3493 main::set_access('ucd', \%ucd, 'r', 's');
3496 # Aliases have a status, like deprecated, or even suppressed (which means
3497 # they don't appear in documentation). Enum
3498 main::set_access('status', \%status, 'r');
3501 # Similarly, some aliases should not be considered as usable ones for
3502 # external use, such as file names, or we don't want documentation to
3503 # recommend them. Boolean
3504 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3509 my $self = bless \do { my $anonymous_scalar }, $class;
3510 my $addr = do { no overloading; pack 'J', $self; };
3512 $name{$addr} = shift;
3513 $loose_match{$addr} = shift;
3514 $make_re_pod_entry{$addr} = shift;
3515 $ok_as_filename{$addr} = shift;
3516 $status{$addr} = shift;
3517 $ucd{$addr} = shift;
3519 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3521 # Null names are never ok externally
3522 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3530 # A range is the basic unit for storing code points, and is described in the
3531 # comments at the beginning of the program. Each range has a starting code
3532 # point; an ending code point (not less than the starting one); a value
3533 # that applies to every code point in between the two end-points, inclusive;
3534 # and an enum type that applies to the value. The type is for the user's
3535 # convenience, and has no meaning here, except that a non-zero type is
3536 # considered to not obey the normal Unicode rules for having standard forms.
3538 # The same structure is used for both map and match tables, even though in the
3539 # latter, the value (and hence type) is irrelevant and could be used as a
3540 # comment. In map tables, the value is what all the code points in the range
3541 # map to. Type 0 values have the standardized version of the value stored as
3542 # well, so as to not have to recalculate it a lot.
3544 sub trace { return main::trace(@_); }
3548 main::setup_package();
3551 main::set_access('start', \%start, 'r', 's');
3554 main::set_access('end', \%end, 'r', 's');
3557 main::set_access('value', \%value, 'r', 's');
3560 main::set_access('type', \%type, 'r');
3563 # The value in internal standard form. Defined only if the type is 0.
3564 main::set_access('standard_form', \%standard_form);
3566 # Note that if these fields change, the dump() method should as well
3569 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
3572 my $self = bless \do { my $anonymous_scalar }, $class;
3573 my $addr = do { no overloading; pack 'J', $self; };
3575 $start{$addr} = shift;
3576 $end{$addr} = shift;
3580 my $value = delete $args{'Value'}; # Can be 0
3581 $value = "" unless defined $value;
3582 $value{$addr} = $value;
3584 $type{$addr} = delete $args{'Type'} || 0;
3586 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3593 qw("") => "_operator_stringify",
3594 "." => \&main::_operator_dot,
3595 ".=" => \&main::_operator_dot_equal,
3598 sub _operator_stringify {
3600 my $addr = do { no overloading; pack 'J', $self; };
3602 # Output it like '0041..0065 (value)'
3603 my $return = sprintf("%04X", $start{$addr})
3605 . sprintf("%04X", $end{$addr});
3606 my $value = $value{$addr};
3607 my $type = $type{$addr};
3609 $return .= "$value";
3610 $return .= ", Type=$type" if $type != 0;
3617 # Calculate the standard form only if needed, and cache the result.
3618 # The standard form is the value itself if the type is special.
3619 # This represents a considerable CPU and memory saving - at the time
3620 # of writing there are 368676 non-special objects, but the standard
3621 # form is only requested for 22047 of them - ie about 6%.
3624 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3626 my $addr = do { no overloading; pack 'J', $self; };
3628 return $standard_form{$addr} if defined $standard_form{$addr};
3630 my $value = $value{$addr};
3631 return $value if $type{$addr};
3632 return $standard_form{$addr} = main::standardize($value);
3636 # Human, not machine readable. For machine readable, comment out this
3637 # entire routine and let the standard one take effect.
3640 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3642 my $addr = do { no overloading; pack 'J', $self; };
3644 my $return = $indent
3645 . sprintf("%04X", $start{$addr})
3647 . sprintf("%04X", $end{$addr})
3648 . " '$value{$addr}';";
3649 if (! defined $standard_form{$addr}) {
3650 $return .= "(type=$type{$addr})";
3652 elsif ($standard_form{$addr} ne $value{$addr}) {
3653 $return .= "(standard '$standard_form{$addr}')";
3659 package _Range_List_Base;
3661 # Base class for range lists. A range list is simply an ordered list of
3662 # ranges, so that the ranges with the lowest starting numbers are first in it.
3664 # When a new range is added that is adjacent to an existing range that has the
3665 # same value and type, it merges with it to form a larger range.
3667 # Ranges generally do not overlap, except that there can be multiple entries
3668 # of single code point ranges. This is because of NameAliases.txt.
3670 # In this program, there is a standard value such that if two different
3671 # values, have the same standard value, they are considered equivalent. This
3672 # value was chosen so that it gives correct results on Unicode data
3674 # There are a number of methods to manipulate range lists, and some operators
3675 # are overloaded to handle them.
3677 sub trace { return main::trace(@_); }
3683 # Max is initialized to a negative value that isn't adjacent to 0, for
3687 main::setup_package();
3690 # The list of ranges
3691 main::set_access('ranges', \%ranges, 'readable_array');
3694 # The highest code point in the list. This was originally a method, but
3695 # actual measurements said it was used a lot.
3696 main::set_access('max', \%max, 'r');
3698 my %each_range_iterator;
3699 # Iterator position for each_range()
3700 main::set_access('each_range_iterator', \%each_range_iterator);
3703 # Name of parent this is attached to, if any. Solely for better error
3705 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3707 my %_search_ranges_cache;
3708 # A cache of the previous result from _search_ranges(), for better
3710 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3716 # Optional initialization data for the range list.
3717 my $initialize = delete $args{'Initialize'};
3721 # Use _union() to initialize. _union() returns an object of this
3722 # class, which means that it will call this constructor recursively.
3723 # But it won't have this $initialize parameter so that it won't
3724 # infinitely loop on this.
3725 return _union($class, $initialize, %args) if defined $initialize;
3727 $self = bless \do { my $anonymous_scalar }, $class;
3728 my $addr = do { no overloading; pack 'J', $self; };
3730 # Optional parent object, only for debug info.
3731 $owner_name_of{$addr} = delete $args{'Owner'};
3732 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3734 # Stringify, in case it is an object.
3735 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3737 # This is used only for error messages, and so a colon is added
3738 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3740 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3742 $max{$addr} = $max_init;
3744 $_search_ranges_cache{$addr} = 0;
3745 $ranges{$addr} = [];
3752 qw("") => "_operator_stringify",
3753 "." => \&main::_operator_dot,
3754 ".=" => \&main::_operator_dot_equal,
3757 sub _operator_stringify {
3759 my $addr = do { no overloading; pack 'J', $self; };
3761 return "Range_List attached to '$owner_name_of{$addr}'"
3762 if $owner_name_of{$addr};
3763 return "anonymous Range_List " . \$self;
3767 # Returns the union of the input code points. It can be called as
3768 # either a constructor or a method. If called as a method, the result
3769 # will be a new() instance of the calling object, containing the union
3770 # of that object with the other parameter's code points; if called as
3771 # a constructor, the first parameter gives the class that the new object
3772 # should be, and the second parameter gives the code points to go into
3774 # In either case, there are two parameters looked at by this routine;
3775 # any additional parameters are passed to the new() constructor.
3777 # The code points can come in the form of some object that contains
3778 # ranges, and has a conventionally named method to access them; or
3779 # they can be an array of individual code points (as integers); or
3780 # just a single code point.
3782 # If they are ranges, this routine doesn't make any effort to preserve
3783 # the range values and types of one input over the other. Therefore
3784 # this base class should not allow _union to be called from other than
3785 # initialization code, so as to prevent two tables from being added
3786 # together where the range values matter. The general form of this
3787 # routine therefore belongs in a derived class, but it was moved here
3788 # to avoid duplication of code. The failure to overload this in this
3789 # class keeps it safe.
3791 # It does make the effort during initialization to accept tables with
3792 # multiple values for the same code point, and to preserve the order
3793 # of these. If there is only one input range or range set, it doesn't
3794 # sort (as it should already be sorted to the desired order), and will
3795 # accept multiple values per code point. Otherwise it will merge
3796 # multiple values into a single one.
3799 my @args; # Arguments to pass to the constructor
3803 # If a method call, will start the union with the object itself, and
3804 # the class of the new object will be the same as self.
3811 # Add the other required parameter.
3813 # Rest of parameters are passed on to the constructor
3815 # Accumulate all records from both lists.
3817 my $input_count = 0;
3818 for my $arg (@args) {
3819 #local $to_trace = 0 if main::DEBUG;
3820 trace "argument = $arg" if main::DEBUG && $to_trace;
3821 if (! defined $arg) {
3823 if (defined $self) {
3825 $message .= $owner_name_of{pack 'J', $self};
3827 Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
3831 $arg = [ $arg ] if ! ref $arg;
3832 my $type = ref $arg;
3833 if ($type eq 'ARRAY') {
3834 foreach my $element (@$arg) {
3835 push @records, Range->new($element, $element);
3839 elsif ($arg->isa('Range')) {
3840 push @records, $arg;
3843 elsif ($arg->can('ranges')) {
3844 push @records, $arg->ranges;
3849 if (defined $self) {
3851 $message .= $owner_name_of{pack 'J', $self};
3853 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3858 # Sort with the range containing the lowest ordinal first, but if
3859 # two ranges start at the same code point, sort with the bigger range
3860 # of the two first, because it takes fewer cycles.
3861 if ($input_count > 1) {
3862 @records = sort { ($a->start <=> $b->start)
3864 # if b is shorter than a, b->end will be
3865 # less than a->end, and we want to select
3866 # a, so want to return -1
3867 ($b->end <=> $a->end)
3871 my $new = $class->new(@_);
3873 # Fold in records so long as they add new information.
3874 for my $set (@records) {
3875 my $start = $set->start;
3876 my $end = $set->end;
3877 my $value = $set->value;
3878 my $type = $set->type;
3879 if ($start > $new->max) {
3880 $new->_add_delete('+', $start, $end, $value, Type => $type);
3882 elsif ($end > $new->max) {
3883 $new->_add_delete('+', $new->max +1, $end, $value,
3886 elsif ($input_count == 1) {
3887 # Here, overlaps existing range, but is from a single input,
3888 # so preserve the multiple values from that input.
3889 $new->_add_delete('+', $start, $end, $value, Type => $type,
3890 Replace => $MULTIPLE_AFTER);
3897 sub range_count { # Return the number of ranges in the range list
3899 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3902 return scalar @{$ranges{pack 'J', $self}};
3906 # Returns the minimum code point currently in the range list, or if
3907 # the range list is empty, 2 beyond the max possible. This is a
3908 # method because used so rarely, that not worth saving between calls,
3909 # and having to worry about changing it as ranges are added and
3913 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3915 my $addr = do { no overloading; pack 'J', $self; };
3917 # If the range list is empty, return a large value that isn't adjacent
3918 # to any that could be in the range list, for simpler tests
3919 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3920 return $ranges{$addr}->[0]->start;
3924 # Boolean: Is argument in the range list? If so returns $i such that:
3925 # range[$i]->end < $codepoint <= range[$i+1]->end
3926 # which is one beyond what you want; this is so that the 0th range
3927 # doesn't return false
3929 my $codepoint = shift;
3930 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3932 my $i = $self->_search_ranges($codepoint);
3933 return 0 unless defined $i;
3935 # The search returns $i, such that
3936 # range[$i-1]->end < $codepoint <= range[$i]->end
3937 # So is in the table if and only iff it is at least the start position
3940 return 0 if $ranges{pack 'J', $self}->[$i]->start > $codepoint;
3944 sub containing_range {
3945 # Returns the range object that contains the code point, undef if none
3948 my $codepoint = shift;
3949 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3951 my $i = $self->contains($codepoint);
3954 # contains() returns 1 beyond where we should look
3956 return $ranges{pack 'J', $self}->[$i-1];
3960 # Returns the value associated with the code point, undef if none
3963 my $codepoint = shift;
3964 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3966 my $range = $self->containing_range($codepoint);
3967 return unless defined $range;
3969 return $range->value;
3973 # Returns the type of the range containing the code point, undef if
3974 # the code point is not in the table
3977 my $codepoint = shift;
3978 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3980 my $range = $self->containing_range($codepoint);
3981 return unless defined $range;
3983 return $range->type;
3986 sub _search_ranges {
3987 # Find the range in the list which contains a code point, or where it
3988 # should go if were to add it. That is, it returns $i, such that:
3989 # range[$i-1]->end < $codepoint <= range[$i]->end
3990 # Returns undef if no such $i is possible (e.g. at end of table), or
3991 # if there is an error.
3994 my $code_point = shift;
3995 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3997 my $addr = do { no overloading; pack 'J', $self; };
3999 return if $code_point > $max{$addr};
4000 my $r = $ranges{$addr}; # The current list of ranges
4001 my $range_list_size = scalar @$r;
4004 use integer; # want integer division
4006 # Use the cached result as the starting guess for this one, because,
4007 # an experiment on 5.1 showed that 90% of the time the cache was the
4008 # same as the result on the next call (and 7% it was one less).
4009 $i = $_search_ranges_cache{$addr};
4010 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
4011 # from an intervening deletion
4012 #local $to_trace = 1 if main::DEBUG;
4013 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
4014 return $i if $code_point <= $r->[$i]->end
4015 && ($i == 0 || $r->[$i-1]->end < $code_point);
4017 # Here the cache doesn't yield the correct $i. Try adding 1.
4018 if ($i < $range_list_size - 1
4019 && $r->[$i]->end < $code_point &&
4020 $code_point <= $r->[$i+1]->end)
4023 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
4024 $_search_ranges_cache{$addr} = $i;
4028 # Here, adding 1 also didn't work. We do a binary search to
4029 # find the correct position, starting with current $i
4031 my $upper = $range_list_size - 1;
4033 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
4035 if ($code_point <= $r->[$i]->end) {
4037 # Here we have met the upper constraint. We can quit if we
4038 # also meet the lower one.
4039 last if $i == 0 || $r->[$i-1]->end < $code_point;
4041 $upper = $i; # Still too high.
4046 # Here, $r[$i]->end < $code_point, so look higher up.
4050 # Split search domain in half to try again.
4051 my $temp = ($upper + $lower) / 2;
4053 # No point in continuing unless $i changes for next time
4057 # We can't reach the highest element because of the averaging.
4058 # So if one below the upper edge, force it there and try one
4060 if ($i == $range_list_size - 2) {
4062 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4063 $i = $range_list_size - 1;
4065 # Change $lower as well so if fails next time through,
4066 # taking the average will yield the same $i, and we will
4067 # quit with the error message just below.
4071 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
4075 } # End of while loop
4077 if (main::DEBUG && $to_trace) {
4078 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4079 trace "i= [ $i ]", $r->[$i];
4080 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4083 # Here we have found the offset. Cache it as a starting point for the
4085 $_search_ranges_cache{$addr} = $i;
4090 # Add, replace or delete ranges to or from a list. The $type
4091 # parameter gives which:
4092 # '+' => insert or replace a range, returning a list of any changed
4094 # '-' => delete a range, returning a list of any deleted ranges.
4096 # The next three parameters give respectively the start, end, and
4097 # value associated with the range. 'value' should be null unless the
4100 # The range list is kept sorted so that the range with the lowest
4101 # starting position is first in the list, and generally, adjacent
4102 # ranges with the same values are merged into a single larger one (see
4103 # exceptions below).
4105 # There are more parameters; all are key => value pairs:
4106 # Type gives the type of the value. It is only valid for '+'.
4107 # All ranges have types; if this parameter is omitted, 0 is
4108 # assumed. Ranges with type 0 are assumed to obey the
4109 # Unicode rules for casing, etc; ranges with other types are
4110 # not. Otherwise, the type is arbitrary, for the caller's
4111 # convenience, and looked at only by this routine to keep
4112 # adjacent ranges of different types from being merged into
4113 # a single larger range, and when Replace =>
4114 # $IF_NOT_EQUIVALENT is specified (see just below).
4115 # Replace determines what to do if the range list already contains
4116 # ranges which coincide with all or portions of the input
4117 # range. It is only valid for '+':
4118 # => $NO means that the new value is not to replace
4119 # any existing ones, but any empty gaps of the
4120 # range list coinciding with the input range
4121 # will be filled in with the new value.
4122 # => $UNCONDITIONALLY means to replace the existing values with
4123 # this one unconditionally. However, if the
4124 # new and old values are identical, the
4125 # replacement is skipped to save cycles
4126 # => $IF_NOT_EQUIVALENT means to replace the existing values
4127 # (the default) with this one if they are not equivalent.
4128 # Ranges are equivalent if their types are the
4129 # same, and they are the same string; or if
4130 # both are type 0 ranges, if their Unicode
4131 # standard forms are identical. In this last
4132 # case, the routine chooses the more "modern"
4133 # one to use. This is because some of the
4134 # older files are formatted with values that
4135 # are, for example, ALL CAPs, whereas the
4136 # derived files have a more modern style,
4137 # which looks better. By looking for this
4138 # style when the pre-existing and replacement
4139 # standard forms are the same, we can move to
4141 # => $MULTIPLE_BEFORE means that if this range duplicates an
4142 # existing one, but has a different value,
4143 # don't replace the existing one, but insert
4144 # this one so that the same range can occur
4145 # multiple times. They are stored LIFO, so
4146 # that the final one inserted is the first one
4147 # returned in an ordered search of the table.
4148 # If this is an exact duplicate, including the
4149 # value, the original will be moved to be
4150 # first, before any other duplicate ranges
4151 # with different values.
4152 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4153 # FIFO, so that this one is inserted after all
4154 # others that currently exist. If this is an
4155 # exact duplicate, including value, of an
4156 # existing range, this one is discarded
4157 # (leaving the existing one in its original,
4158 # higher priority position
4159 # => $CROAK Die with an error if is already there
4160 # => anything else is the same as => $IF_NOT_EQUIVALENT
4162 # "same value" means identical for non-type-0 ranges, and it means
4163 # having the same standard forms for type-0 ranges.
4165 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4168 my $operation = shift; # '+' for add/replace; '-' for delete;
4175 $value = "" if not defined $value; # warning: $value can be "0"
4177 my $replace = delete $args{'Replace'};
4178 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4180 my $type = delete $args{'Type'};
4181 $type = 0 unless defined $type;
4183 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4185 my $addr = do { no overloading; pack 'J', $self; };
4187 if ($operation ne '+' && $operation ne '-') {
4188 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
4191 unless (defined $start && defined $end) {
4192 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
4195 unless ($end >= $start) {
4196 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
4199 #local $to_trace = 1 if main::DEBUG;
4201 if ($operation eq '-') {
4202 if ($replace != $IF_NOT_EQUIVALENT) {
4203 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
4204 $replace = $IF_NOT_EQUIVALENT;
4207 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
4211 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
4216 my $r = $ranges{$addr}; # The current list of ranges
4217 my $range_list_size = scalar @$r; # And its size
4218 my $max = $max{$addr}; # The current high code point in
4219 # the list of ranges
4221 # Do a special case requiring fewer machine cycles when the new range
4222 # starts after the current highest point. The Unicode input data is
4223 # structured so this is common.
4224 if ($start > $max) {
4226 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4227 return if $operation eq '-'; # Deleting a non-existing range is a
4230 # If the new range doesn't logically extend the current final one
4231 # in the range list, create a new range at the end of the range
4232 # list. (max cleverly is initialized to a negative number not
4233 # adjacent to 0 if the range list is empty, so even adding a range
4234 # to an empty range list starting at 0 will have this 'if'
4236 if ($start > $max + 1 # non-adjacent means can't extend.
4237 || @{$r}[-1]->value ne $value # values differ, can't extend.
4238 || @{$r}[-1]->type != $type # types differ, can't extend.
4240 push @$r, Range->new($start, $end,
4246 # Here, the new range starts just after the current highest in
4247 # the range list, and they have the same type and value.
4248 # Extend the existing range to incorporate the new one.
4249 @{$r}[-1]->set_end($end);
4252 # This becomes the new maximum.
4257 #local $to_trace = 0 if main::DEBUG;
4259 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4261 # Here, the input range isn't after the whole rest of the range list.
4262 # Most likely 'splice' will be needed. The rest of the routine finds
4263 # the needed splice parameters, and if necessary, does the splice.
4264 # First, find the offset parameter needed by the splice function for
4265 # the input range. Note that the input range may span multiple
4266 # existing ones, but we'll worry about that later. For now, just find
4267 # the beginning. If the input range is to be inserted starting in a
4268 # position not currently in the range list, it must (obviously) come
4269 # just after the range below it, and just before the range above it.
4270 # Slightly less obviously, it will occupy the position currently
4271 # occupied by the range that is to come after it. More formally, we
4272 # are looking for the position, $i, in the array of ranges, such that:
4274 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4276 # (The ordered relationships within existing ranges are also shown in
4277 # the equation above). However, if the start of the input range is
4278 # within an existing range, the splice offset should point to that
4279 # existing range's position in the list; that is $i satisfies a
4280 # somewhat different equation, namely:
4282 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4284 # More briefly, $start can come before or after r[$i]->start, and at
4285 # this point, we don't know which it will be. However, these
4286 # two equations share these constraints:
4288 # r[$i-1]->end < $start <= r[$i]->end
4290 # And that is good enough to find $i.
4292 my $i = $self->_search_ranges($start);
4294 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
4298 # The search function returns $i such that:
4300 # r[$i-1]->end < $start <= r[$i]->end
4302 # That means that $i points to the first range in the range list
4303 # that could possibly be affected by this operation. We still don't
4304 # know if the start of the input range is within r[$i], or if it
4305 # points to empty space between r[$i-1] and r[$i].
4306 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4308 # Special case the insertion of data that is not to replace any
4310 if ($replace == $NO) { # If $NO, has to be operation '+'
4311 #local $to_trace = 1 if main::DEBUG;
4312 trace "Doesn't replace" if main::DEBUG && $to_trace;
4314 # Here, the new range is to take effect only on those code points
4315 # that aren't already in an existing range. This can be done by
4316 # looking through the existing range list and finding the gaps in
4317 # the ranges that this new range affects, and then calling this
4318 # function recursively on each of those gaps, leaving untouched
4319 # anything already in the list. Gather up a list of the changed
4320 # gaps first so that changes to the internal state as new ranges
4321 # are added won't be a problem.
4324 # First, if the starting point of the input range is outside an
4325 # existing one, there is a gap from there to the beginning of the
4326 # existing range -- add a span to fill the part that this new
4328 if ($start < $r->[$i]->start) {
4329 push @gap_list, Range->new($start,
4331 $r->[$i]->start - 1),
4333 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4336 # Then look through the range list for other gaps until we reach
4337 # the highest range affected by the input one.
4339 for ($j = $i+1; $j < $range_list_size; $j++) {
4340 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4341 last if $end < $r->[$j]->start;
4343 # If there is a gap between when this range starts and the
4344 # previous one ends, add a span to fill it. Note that just
4345 # because there are two ranges doesn't mean there is a
4346 # non-zero gap between them. It could be that they have
4347 # different values or types
4348 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4350 Range->new($r->[$j-1]->end + 1,
4351 $r->[$j]->start - 1,
4353 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4357 # Here, we have either found an existing range in the range list,
4358 # beyond the area affected by the input one, or we fell off the
4359 # end of the loop because the input range affects the whole rest
4360 # of the range list. In either case, $j is 1 higher than the
4361 # highest affected range. If $j == $i, it means that there are no
4362 # affected ranges, that the entire insertion is in the gap between
4363 # r[$i-1], and r[$i], which we already have taken care of before
4365 # On the other hand, if there are affected ranges, it might be
4366 # that there is a gap that needs filling after the final such
4367 # range to the end of the input range
4368 if ($r->[$j-1]->end < $end) {
4369 push @gap_list, Range->new(main::max($start,
4370 $r->[$j-1]->end + 1),
4373 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4376 # Call recursively to fill in all the gaps.
4377 foreach my $gap (@gap_list) {
4378 $self->_add_delete($operation,
4388 # Here, we have taken care of the case where $replace is $NO.
4389 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4390 # If inserting a multiple record, this is where it goes, before the
4391 # first (if any) existing one if inserting LIFO. (If this is to go
4392 # afterwards, FIFO, we below move the pointer to there.) These imply
4393 # an insertion, and no change to any existing ranges. Note that $i
4394 # can be -1 if this new range doesn't actually duplicate any existing,
4395 # and comes at the beginning of the list.
4396 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4398 if ($start != $end) {
4399 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
4403 # If the new code point is within a current range ...
4404 if ($end >= $r->[$i]->start) {
4406 # Don't add an exact duplicate, as it isn't really a multiple
4407 my $existing_value = $r->[$i]->value;
4408 my $existing_type = $r->[$i]->type;
4409 return if $value eq $existing_value && $type eq $existing_type;
4411 # If the multiple value is part of an existing range, we want
4412 # to split up that range, so that only the single code point
4413 # is affected. To do this, we first call ourselves
4414 # recursively to delete that code point from the table, having
4415 # preserved its current data above. Then we call ourselves
4416 # recursively again to add the new multiple, which we know by
4417 # the test just above is different than the current code
4418 # point's value, so it will become a range containing a single
4419 # code point: just itself. Finally, we add back in the
4420 # pre-existing code point, which will again be a single code
4421 # point range. Because 'i' likely will have changed as a
4422 # result of these operations, we can't just continue on, but
4423 # do this operation recursively as well. If we are inserting
4424 # LIFO, the pre-existing code point needs to go after the new
4425 # one, so use MULTIPLE_AFTER; and vice versa.
4426 if ($r->[$i]->start != $r->[$i]->end) {
4427 $self->_add_delete('-', $start, $end, "");
4428 $self->_add_delete('+', $start, $end, $value, Type => $type);
4429 return $self->_add_delete('+',
4432 Type => $existing_type,
4433 Replace => ($replace == $MULTIPLE_BEFORE)
4435 : $MULTIPLE_BEFORE);
4439 # If to place this new record after, move to beyond all existing
4440 # ones; but don't add this one if identical to any of them, as it
4441 # isn't really a multiple. This leaves the original order, so
4442 # that the current request is ignored. The reasoning is that the
4443 # previous request that wanted this record to have high priority
4444 # should have precedence.
4445 if ($replace == $MULTIPLE_AFTER) {
4446 while ($i < @$r && $r->[$i]->start == $start) {
4447 return if $value eq $r->[$i]->value
4448 && $type eq $r->[$i]->type;
4453 # If instead we are to place this new record before any
4454 # existing ones, remove any identical ones that come after it.
4455 # This changes the existing order so that the new one is
4456 # first, as is being requested.
4457 for (my $j = $i + 1;
4458 $j < @$r && $r->[$j]->start == $start;
4461 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4463 last; # There should only be one instance, so no
4464 # need to keep looking
4469 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4470 my @return = splice @$r,
4477 if (main::DEBUG && $to_trace) {
4478 trace "After splice:";
4479 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4480 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4481 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
4482 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4483 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4484 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4489 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
4490 # leaves delete, insert, and replace either unconditionally or if not
4491 # equivalent. $i still points to the first potential affected range.
4492 # Now find the highest range affected, which will determine the length
4493 # parameter to splice. (The input range can span multiple existing
4494 # ones.) If this isn't a deletion, while we are looking through the
4495 # range list, see also if this is a replacement rather than a clean
4496 # insertion; that is if it will change the values of at least one
4497 # existing range. Start off assuming it is an insert, until find it
4499 my $clean_insert = $operation eq '+';
4500 my $j; # This will point to the highest affected range
4502 # For non-zero types, the standard form is the value itself;
4503 my $standard_form = ($type) ? $value : main::standardize($value);
4505 for ($j = $i; $j < $range_list_size; $j++) {
4506 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4508 # If find a range that it doesn't overlap into, we can stop
4510 last if $end < $r->[$j]->start;
4512 # Here, overlaps the range at $j. If the values don't match,
4513 # and so far we think this is a clean insertion, it becomes a
4514 # non-clean insertion, i.e., a 'change' or 'replace' instead.
4515 if ($clean_insert) {
4516 if ($r->[$j]->standard_form ne $standard_form) {
4518 if ($replace == $CROAK) {
4519 main::croak("The range to add "
4520 . sprintf("%04X", $start)
4522 . sprintf("%04X", $end)
4523 . " with value '$value' overlaps an existing range $r->[$j]");
4528 # Here, the two values are essentially the same. If the
4529 # two are actually identical, replacing wouldn't change
4530 # anything so skip it.
4531 my $pre_existing = $r->[$j]->value;
4532 if ($pre_existing ne $value) {
4534 # Here the new and old standardized values are the
4535 # same, but the non-standardized values aren't. If
4536 # replacing unconditionally, then replace
4537 if( $replace == $UNCONDITIONALLY) {
4542 # Here, are replacing conditionally. Decide to
4543 # replace or not based on which appears to look
4544 # the "nicest". If one is mixed case and the
4545 # other isn't, choose the mixed case one.
4546 my $new_mixed = $value =~ /[A-Z]/
4547 && $value =~ /[a-z]/;
4548 my $old_mixed = $pre_existing =~ /[A-Z]/
4549 && $pre_existing =~ /[a-z]/;
4551 if ($old_mixed != $new_mixed) {
4552 $clean_insert = 0 if $new_mixed;
4553 if (main::DEBUG && $to_trace) {
4554 if ($clean_insert) {
4555 trace "Retaining $pre_existing over $value";
4558 trace "Replacing $pre_existing with $value";
4564 # Here casing wasn't different between the two.
4565 # If one has hyphens or underscores and the
4566 # other doesn't, choose the one with the
4568 my $new_punct = $value =~ /[-_]/;
4569 my $old_punct = $pre_existing =~ /[-_]/;
4571 if ($old_punct != $new_punct) {
4572 $clean_insert = 0 if $new_punct;
4573 if (main::DEBUG && $to_trace) {
4574 if ($clean_insert) {
4575 trace "Retaining $pre_existing over $value";
4578 trace "Replacing $pre_existing with $value";
4581 } # else existing one is just as "good";
4582 # retain it to save cycles.
4588 } # End of loop looking for highest affected range.
4590 # Here, $j points to one beyond the highest range that this insertion
4591 # affects (hence to beyond the range list if that range is the final
4592 # one in the range list).
4594 # The splice length is all the affected ranges. Get it before
4595 # subtracting, for efficiency, so we don't have to later add 1.
4596 my $length = $j - $i;
4598 $j--; # $j now points to the highest affected range.
4599 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4601 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4602 # $j points to the highest affected range. But it can be < $i or even
4603 # -1. These happen only if the insertion is entirely in the gap
4604 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
4605 # above exited first time through with $end < $r->[$i]->start. (And
4606 # then we subtracted one from j) This implies also that $start <
4607 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4608 # $start, so the entire input range is in the gap.
4611 # Here the entire input range is in the gap before $i.
4613 if (main::DEBUG && $to_trace) {
4615 trace "Entire range is between $r->[$i-1] and $r->[$i]";
4618 trace "Entire range is before $r->[$i]";
4621 return if $operation ne '+'; # Deletion of a non-existent range is
4626 # Here part of the input range is not in the gap before $i. Thus,
4627 # there is at least one affected one, and $j points to the highest
4630 # At this point, here is the situation:
4631 # This is not an insertion of a multiple, nor of tentative ($NO)
4633 # $i points to the first element in the current range list that
4634 # may be affected by this operation. In fact, we know
4635 # that the range at $i is affected because we are in
4636 # the else branch of this 'if'
4637 # $j points to the highest affected range.
4639 # r[$i-1]->end < $start <= r[$i]->end
4641 # r[$i-1]->end < $start <= $end < r[$j+1]->start
4644 # $clean_insert is a boolean which is set true if and only if
4645 # this is a "clean insertion", i.e., not a change nor a
4646 # deletion (multiple was handled above).
4648 # We now have enough information to decide if this call is a no-op
4649 # or not. It is a no-op if this is an insertion of already
4650 # existing data. To be so, it must be contained entirely in one
4653 if (main::DEBUG && $to_trace && $clean_insert
4654 && $start >= $r->[$i]->start
4655 && $end <= $r->[$i]->end)
4659 return if $clean_insert
4660 && $start >= $r->[$i]->start
4661 && $end <= $r->[$i]->end;
4664 # Here, we know that some action will have to be taken. We have
4665 # calculated the offset and length (though adjustments may be needed)
4666 # for the splice. Now start constructing the replacement list.
4668 my $splice_start = $i;
4673 # See if should extend any adjacent ranges.
4674 if ($operation eq '-') { # Don't extend deletions
4675 $extends_below = $extends_above = 0;
4677 else { # Here, should extend any adjacent ranges. See if there are
4679 $extends_below = ($i > 0
4680 # can't extend unless adjacent
4681 && $r->[$i-1]->end == $start -1
4682 # can't extend unless are same standard value
4683 && $r->[$i-1]->standard_form eq $standard_form
4684 # can't extend unless share type
4685 && $r->[$i-1]->type == $type);
4686 $extends_above = ($j+1 < $range_list_size
4687 && $r->[$j+1]->start == $end +1
4688 && $r->[$j+1]->standard_form eq $standard_form
4689 && $r->[$j+1]->type == $type);
4691 if ($extends_below && $extends_above) { # Adds to both
4692 $splice_start--; # start replace at element below
4693 $length += 2; # will replace on both sides
4694 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4696 # The result will fill in any gap, replacing both sides, and
4697 # create one large range.
4698 @replacement = Range->new($r->[$i-1]->start,
4705 # Here we know that the result won't just be the conglomeration of
4706 # a new range with both its adjacent neighbors. But it could
4707 # extend one of them.
4709 if ($extends_below) {
4711 # Here the new element adds to the one below, but not to the
4712 # one above. If inserting, and only to that one range, can
4713 # just change its ending to include the new one.
4714 if ($length == 0 && $clean_insert) {
4715 $r->[$i-1]->set_end($end);
4716 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4720 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4721 $splice_start--; # start replace at element below
4722 $length++; # will replace the element below
4723 $start = $r->[$i-1]->start;
4726 elsif ($extends_above) {
4728 # Here the new element adds to the one above, but not below.
4729 # Mirror the code above
4730 if ($length == 0 && $clean_insert) {
4731 $r->[$j+1]->set_start($start);
4732 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4736 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4737 $length++; # will replace the element above
4738 $end = $r->[$j+1]->end;
4742 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4744 # Finally, here we know there will have to be a splice.
4745 # If the change or delete affects only the highest portion of the
4746 # first affected range, the range will have to be split. The
4747 # splice will remove the whole range, but will replace it by a new
4748 # range containing just the unaffected part. So, in this case,
4749 # add to the replacement list just this unaffected portion.
4750 if (! $extends_below
4751 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4754 Range->new($r->[$i]->start,
4756 Value => $r->[$i]->value,
4757 Type => $r->[$i]->type);
4760 # In the case of an insert or change, but not a delete, we have to
4761 # put in the new stuff; this comes next.
4762 if ($operation eq '+') {
4763 push @replacement, Range->new($start,
4769 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4770 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4772 # And finally, if we're changing or deleting only a portion of the
4773 # highest affected range, it must be split, as the lowest one was.
4774 if (! $extends_above
4775 && $j >= 0 # Remember that j can be -1 if before first
4777 && $end >= $r->[$j]->start
4778 && $end < $r->[$j]->end)
4781 Range->new($end + 1,
4783 Value => $r->[$j]->value,
4784 Type => $r->[$j]->type);
4788 # And do the splice, as calculated above
4789 if (main::DEBUG && $to_trace) {
4790 trace "replacing $length element(s) at $i with ";
4791 foreach my $replacement (@replacement) {
4792 trace " $replacement";
4794 trace "Before splice:";
4795 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4796 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4797 trace "i =[", $i, "]", $r->[$i];
4798 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4799 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4802 my @return = splice @$r, $splice_start, $length, @replacement;
4804 if (main::DEBUG && $to_trace) {
4805 trace "After splice:";
4806 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4807 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4808 trace "i =[", $i, "]", $r->[$i];
4809 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4810 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4811 trace "removed ", @return if @return;
4814 # An actual deletion could have changed the maximum in the list.
4815 # There was no deletion if the splice didn't return something, but
4816 # otherwise recalculate it. This is done too rarely to worry about
4818 if ($operation eq '-' && @return) {
4820 $max{$addr} = $r->[-1]->end;
4823 $max{$addr} = $max_init;
4829 sub reset_each_range { # reset the iterator for each_range();
4831 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4834 undef $each_range_iterator{pack 'J', $self};
4839 # Iterate over each range in a range list. Results are undefined if
4840 # the range list is changed during the iteration.
4843 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4845 my $addr = do { no overloading; pack 'J', $self; };
4847 return if $self->is_empty;
4849 $each_range_iterator{$addr} = -1
4850 if ! defined $each_range_iterator{$addr};
4851 $each_range_iterator{$addr}++;
4852 return $ranges{$addr}->[$each_range_iterator{$addr}]
4853 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4854 undef $each_range_iterator{$addr};
4858 sub count { # Returns count of code points in range list
4860 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4862 my $addr = do { no overloading; pack 'J', $self; };
4865 foreach my $range (@{$ranges{$addr}}) {
4866 $count += $range->end - $range->start + 1;
4871 sub delete_range { # Delete a range
4876 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4878 return $self->_add_delete('-', $start, $end, "");
4881 sub is_empty { # Returns boolean as to if a range list is empty
4883 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4886 return scalar @{$ranges{pack 'J', $self}} == 0;
4890 # Quickly returns a scalar suitable for separating tables into
4891 # buckets, i.e. it is a hash function of the contents of a table, so
4892 # there are relatively few conflicts.
4895 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
4897 my $addr = do { no overloading; pack 'J', $self; };
4899 # These are quickly computable. Return looks like 'min..max;count'
4900 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4902 } # End closure for _Range_List_Base
4905 use parent '-norequire', '_Range_List_Base';
4907 # A Range_List is a range list for match tables; i.e. the range values are
4908 # not significant. Thus a number of operations can be safely added to it,
4909 # such as inversion, intersection. Note that union is also an unsafe
4910 # operation when range values are cared about, and that method is in the base
4911 # class, not here. But things are set up so that that method is callable only
4912 # during initialization. Only in this derived class, is there an operation
4913 # that combines two tables. A Range_Map can thus be used to initialize a
4914 # Range_List, and its mappings will be in the list, but are not significant to
4917 sub trace { return main::trace(@_); }
4923 '+' => sub { my $self = shift;
4926 return $self->_union($other)
4928 '+=' => sub { my $self = shift;
4930 my $reversed = shift;
4933 Carp::my_carp_bug("Bad news. Can't cope with '"
4937 . "'. undef returned.");
4941 return $self->_union($other)
4943 '&' => sub { my $self = shift;
4946 return $self->_intersect($other, 0);
4948 '&=' => sub { my $self = shift;
4950 my $reversed = shift;
4953 Carp::my_carp_bug("Bad news. Can't cope with '"
4957 . "'. undef returned.");
4961 return $self->_intersect($other, 0);
4968 # Returns a new Range_List that gives all code points not in $self.
4972 my $new = Range_List->new;
4974 # Go through each range in the table, finding the gaps between them
4975 my $max = -1; # Set so no gap before range beginning at 0
4976 for my $range ($self->ranges) {
4977 my $start = $range->start;
4978 my $end = $range->end;
4980 # If there is a gap before this range, the inverse will contain
4982 if ($start > $max + 1) {
4983 $new->add_range($max + 1, $start - 1);
4988 # And finally, add the gap from the end of the table to the max
4989 # possible code point
4990 if ($max < $MAX_WORKING_CODEPOINT) {
4991 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4997 # Returns a new Range_List with the argument deleted from it. The
4998 # argument can be a single code point, a range, or something that has
4999 # a range, with the _range_list() method on it returning them
5003 my $reversed = shift;
5004 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5007 Carp::my_carp_bug("Bad news. Can't cope with '"
5011 . "'. undef returned.");
5015 my $new = Range_List->new(Initialize => $self);
5017 if (! ref $other) { # Single code point
5018 $new->delete_range($other, $other);
5020 elsif ($other->isa('Range')) {
5021 $new->delete_range($other->start, $other->end);
5023 elsif ($other->can('_range_list')) {
5024 foreach my $range ($other->_range_list->ranges) {
5025 $new->delete_range($range->start, $range->end);
5029 Carp::my_carp_bug("Can't cope with a "
5031 . " argument to '-'. Subtraction ignored."
5040 # Returns either a boolean giving whether the two inputs' range lists
5041 # intersect (overlap), or a new Range_List containing the intersection
5042 # of the two lists. The optional final parameter being true indicates
5043 # to do the check instead of the intersection.
5045 my $a_object = shift;
5046 my $b_object = shift;
5047 my $check_if_overlapping = shift;
5048 $check_if_overlapping = 0 unless defined $check_if_overlapping;
5049 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5051 if (! defined $b_object) {
5053 $message .= $a_object->_owner_name_of if defined $a_object;
5054 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
5058 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
5059 # Thus the intersection could be much more simply be written:
5060 # return ~(~$a_object + ~$b_object);
5061 # But, this is slower, and when taking the inverse of a large
5062 # range_size_1 table, back when such tables were always stored that
5063 # way, it became prohibitively slow, hence the code was changed to the
5066 if ($b_object->isa('Range')) {
5067 $b_object = Range_List->new(Initialize => $b_object,
5068 Owner => $a_object->_owner_name_of);
5070 $b_object = $b_object->_range_list if $b_object->can('_range_list');
5072 my @a_ranges = $a_object->ranges;
5073 my @b_ranges = $b_object->ranges;
5075 #local $to_trace = 1 if main::DEBUG;
5076 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
5078 # Start with the first range in each list
5080 my $range_a = $a_ranges[$a_i];
5082 my $range_b = $b_ranges[$b_i];
5084 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
5085 if ! $check_if_overlapping;
5087 # If either list is empty, there is no intersection and no overlap
5088 if (! defined $range_a || ! defined $range_b) {
5089 return $check_if_overlapping ? 0 : $new;
5091 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5093 # Otherwise, must calculate the intersection/overlap. Start with the
5094 # very first code point in each list
5095 my $a = $range_a->start;
5096 my $b = $range_b->start;
5098 # Loop through all the ranges of each list; in each iteration, $a and
5099 # $b are the current code points in their respective lists
5102 # If $a and $b are the same code point, ...
5105 # it means the lists overlap. If just checking for overlap
5106 # know the answer now,
5107 return 1 if $check_if_overlapping;
5109 # The intersection includes this code point plus anything else
5110 # common to both current ranges.
5112 my $end = main::min($range_a->end, $range_b->end);
5113 if (! $check_if_overlapping) {
5114 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5115 $new->add_range($start, $end);
5118 # Skip ahead to the end of the current intersect
5121 # If the current intersect ends at the end of either range (as
5122 # it must for at least one of them), the next possible one
5123 # will be the beginning code point in it's list's next range.
5124 if ($a == $range_a->end) {
5125 $range_a = $a_ranges[++$a_i];
5126 last unless defined $range_a;
5127 $a = $range_a->start;
5129 if ($b == $range_b->end) {
5130 $range_b = $b_ranges[++$b_i];
5131 last unless defined $range_b;
5132 $b = $range_b->start;
5135 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5139 # Not equal, but if the range containing $a encompasses $b,
5140 # change $a to be the middle of the range where it does equal
5141 # $b, so the next iteration will get the intersection
5142 if ($range_a->end >= $b) {
5147 # Here, the current range containing $a is entirely below
5148 # $b. Go try to find a range that could contain $b.
5149 $a_i = $a_object->_search_ranges($b);
5151 # If no range found, quit.
5152 last unless defined $a_i;
5154 # The search returns $a_i, such that
5155 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5156 # Set $a to the beginning of this new range, and repeat.
5157 $range_a = $a_ranges[$a_i];
5158 $a = $range_a->start;
5161 else { # Here, $b < $a.
5163 # Mirror image code to the leg just above
5164 if ($range_b->end >= $a) {
5168 $b_i = $b_object->_search_ranges($a);
5169 last unless defined $b_i;
5170 $range_b = $b_ranges[$b_i];
5171 $b = $range_b->start;
5174 } # End of looping through ranges.
5176 # Intersection fully computed, or now know that there is no overlap
5177 return $check_if_overlapping ? 0 : $new;
5181 # Returns boolean giving whether the two arguments overlap somewhere
5185 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5187 return $self->_intersect($other, 1);
5191 # Add a range to the list.
5196 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5198 return $self->_add_delete('+', $start, $end, "");
5201 sub matches_identically_to {
5202 # Return a boolean as to whether or not two Range_Lists match identical
5203 # sets of code points.
5207 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5209 # These are ordered in increasing real time to figure out (at least
5210 # until a patch changes that and doesn't change this)
5211 return 0 if $self->max != $other->max;
5212 return 0 if $self->min != $other->min;
5213 return 0 if $self->range_count != $other->range_count;
5214 return 0 if $self->count != $other->count;
5216 # Here they could be identical because all the tests above passed.
5217 # The loop below is somewhat simpler since we know they have the same
5218 # number of elements. Compare range by range, until reach the end or
5219 # find something that differs.
5220 my @a_ranges = $self->ranges;
5221 my @b_ranges = $other->ranges;
5222 for my $i (0 .. @a_ranges - 1) {
5223 my $a = $a_ranges[$i];
5224 my $b = $b_ranges[$i];
5225 trace "self $a; other $b" if main::DEBUG && $to_trace;
5226 return 0 if ! defined $b
5227 || $a->start != $b->start
5228 || $a->end != $b->end;
5233 sub is_code_point_usable {
5234 # This used only for making the test script. See if the input
5235 # proposed trial code point is one that Perl will handle. If second
5236 # parameter is 0, it won't select some code points for various
5237 # reasons, noted below.
5240 my $try_hard = shift;
5241 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5243 return 0 if $code < 0; # Never use a negative
5245 # shun null. I'm (khw) not sure why this was done, but NULL would be
5246 # the character very frequently used.
5247 return $try_hard if $code == 0x0000;
5249 # shun non-character code points.
5250 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5251 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5253 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
5254 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5259 sub get_valid_code_point {
5260 # Return a code point that's part of the range list. Returns nothing
5261 # if the table is empty or we can't find a suitable code point. This
5262 # used only for making the test script.
5265 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5267 my $addr = do { no overloading; pack 'J', $self; };
5269 # On first pass, don't choose less desirable code points; if no good
5270 # one is found, repeat, allowing a less desirable one to be selected.
5271 for my $try_hard (0, 1) {
5273 # Look through all the ranges for a usable code point.
5274 for my $set (reverse $self->ranges) {
5276 # Try the edge cases first, starting with the end point of the
5278 my $end = $set->end;
5279 return $end if is_code_point_usable($end, $try_hard);
5280 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5282 # End point didn't, work. Start at the beginning and try
5283 # every one until find one that does work.
5284 for my $trial ($set->start .. $end - 1) {
5285 return $trial if is_code_point_usable($trial, $try_hard);
5289 return (); # If none found, give up.
5292 sub get_invalid_code_point {
5293 # Return a code point that's not part of the table. Returns nothing
5294 # if the table covers all code points or a suitable code point can't
5295 # be found. This used only for making the test script.
5298 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5300 # Just find a valid code point of the inverse, if any.
5301 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5303 } # end closure for Range_List
5306 use parent '-norequire', '_Range_List_Base';
5308 # A Range_Map is a range list in which the range values (called maps) are
5309 # significant, and hence shouldn't be manipulated by our other code, which
5310 # could be ambiguous or lose things. For example, in taking the union of two
5311 # lists, which share code points, but which have differing values, which one
5312 # has precedence in the union?
5313 # It turns out that these operations aren't really necessary for map tables,
5314 # and so this class was created to make sure they aren't accidentally
5320 # Add a range containing a mapping value to the list
5323 # Rest of parameters passed on
5325 return $self->_add_delete('+', @_);
5333 return $self->_add_delete('+', @_, Replace => $UNCONDITIONALLY);
5337 # Adds entry to a range list which can duplicate an existing entry
5340 my $code_point = shift;
5343 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5344 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5346 return $self->add_map($code_point, $code_point,
5347 $value, Replace => $replace);
5349 } # End of closure for package Range_Map
5351 package _Base_Table;
5353 # A table is the basic data structure that gets written out into a file for
5354 # use by the Perl core. This is the abstract base class implementing the
5355 # common elements from the derived ones. A list of the methods to be
5356 # furnished by an implementing class is just after the constructor.
5358 sub standardize { return main::standardize($_[0]); }
5359 sub trace { return main::trace(@_); }
5363 main::setup_package();
5366 # Object containing the ranges of the table.
5367 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5370 # The full table name.
5371 main::set_access('full_name', \%full_name, 'r');
5374 # The table name, almost always shorter
5375 main::set_access('name', \%name, 'r');
5378 # The shortest of all the aliases for this table, with underscores removed
5379 main::set_access('short_name', \%short_name);
5381 my %nominal_short_name_length;
5382 # The length of short_name before removing underscores
5383 main::set_access('nominal_short_name_length',
5384 \%nominal_short_name_length);
5387 # The complete name, including property.
5388 main::set_access('complete_name', \%complete_name, 'r');
5391 # Parent property this table is attached to.
5392 main::set_access('property', \%property, 'r');
5395 # Ordered list of alias objects of the table's name. The first ones in
5396 # the list are output first in comments
5397 main::set_access('aliases', \%aliases, 'readable_array');
5400 # A comment associated with the table for human readers of the files
5401 main::set_access('comment', \%comment, 's');
5404 # A comment giving a short description of the table's meaning for human
5405 # readers of the files.
5406 main::set_access('description', \%description, 'readable_array');
5409 # A comment giving a short note about the table for human readers of the
5411 main::set_access('note', \%note, 'readable_array');
5414 # Enum; there are a number of possibilities for what happens to this
5415 # table: it could be normal, or suppressed, or not for external use. See
5416 # values at definition for $SUPPRESSED.
5417 main::set_access('fate', \%fate, 'r');
5419 my %find_table_from_alias;
5420 # The parent property passes this pointer to a hash which this class adds
5421 # all its aliases to, so that the parent can quickly take an alias and
5423 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5426 # After this table is made equivalent to another one; we shouldn't go
5427 # changing the contents because that could mean it's no longer equivalent
5428 main::set_access('locked', \%locked, 'r');
5431 # This gives the final path to the file containing the table. Each
5432 # directory in the path is an element in the array
5433 main::set_access('file_path', \%file_path, 'readable_array');
5436 # What is the table's status, normal, $OBSOLETE, etc. Enum
5437 main::set_access('status', \%status, 'r');
5440 # A comment about its being obsolete, or whatever non normal status it has
5441 main::set_access('status_info', \%status_info, 'r');
5443 my %caseless_equivalent;
5444 # The table this is equivalent to under /i matching, if any.
5445 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5448 # Is the table to be output with each range only a single code point?
5449 # This is done to avoid breaking existing code that may have come to rely
5450 # on this behavior in previous versions of this program.)
5451 main::set_access('range_size_1', \%range_size_1, 'r', 's');
5454 # A boolean set iff this table is a Perl extension to the Unicode
5456 main::set_access('perl_extension', \%perl_extension, 'r');
5458 my %output_range_counts;
5459 # A boolean set iff this table is to have comments written in the
5460 # output file that contain the number of code points in the range.
5461 # The constructor can override the global flag of the same name.
5462 main::set_access('output_range_counts', \%output_range_counts, 'r');
5464 my %write_as_invlist;
5465 # A boolean set iff the output file for this table is to be in the form of
5466 # an inversion list/map.
5467 main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5470 # The format of the entries of the table. This is calculated from the
5471 # data in the table (or passed in the constructor). This is an enum e.g.,
5472 # $STRING_FORMAT. It is marked protected as it should not be generally
5473 # used to override calculations.
5474 main::set_access('format', \%format, 'r', 'p_s');
5477 # A boolean that gives whether some other table in this property is
5478 # defined as the complement of this table. This is a crude, but currently
5479 # sufficient, mechanism to make this table not get destroyed before what
5480 # is dependent on it is. Other dependencies could be added, so the name
5481 # was chosen to reflect a more general situation than actually is
5482 # currently the case.
5483 main::set_access('has_dependency', \%has_dependency, 'r', 's');
5486 # All arguments are key => value pairs, which you can see below, most
5487 # of which match fields documented above. Otherwise: Re_Pod_Entry,
5488 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5489 # documented in the Alias package
5491 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5495 my $self = bless \do { my $anonymous_scalar }, $class;
5496 my $addr = do { no overloading; pack 'J', $self; };
5500 $name{$addr} = delete $args{'Name'};
5501 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5502 $full_name{$addr} = delete $args{'Full_Name'};
5503 my $complete_name = $complete_name{$addr}
5504 = delete $args{'Complete_Name'};
5505 $format{$addr} = delete $args{'Format'};
5506 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5507 $property{$addr} = delete $args{'_Property'};
5508 $range_list{$addr} = delete $args{'_Range_List'};
5509 $status{$addr} = delete $args{'Status'} || $NORMAL;
5510 $status_info{$addr} = delete $args{'_Status_Info'} || "";
5511 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5512 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5513 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5514 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5515 my $ucd = delete $args{'UCD'};
5517 my $description = delete $args{'Description'};
5518 my $ok_as_filename = delete $args{'OK_as_Filename'};
5519 my $loose_match = delete $args{'Fuzzy'};
5520 my $note = delete $args{'Note'};
5521 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5522 my $perl_extension = delete $args{'Perl_Extension'};
5523 my $suppression_reason = delete $args{'Suppression_Reason'};
5525 # Shouldn't have any left over
5526 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5528 # Can't use || above because conceivably the name could be 0, and
5529 # can't use // operator in case this program gets used in Perl 5.8
5530 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5531 $output_range_counts{$addr} = $output_range_counts if
5532 ! defined $output_range_counts{$addr};
5534 $aliases{$addr} = [ ];
5535 $comment{$addr} = [ ];
5536 $description{$addr} = [ ];
5538 $file_path{$addr} = [ ];
5539 $locked{$addr} = "";
5540 $has_dependency{$addr} = 0;
5542 push @{$description{$addr}}, $description if $description;
5543 push @{$note{$addr}}, $note if $note;
5545 if ($fate{$addr} == $PLACEHOLDER) {
5547 # A placeholder table doesn't get documented, is a perl extension,
5548 # and quite likely will be empty
5549 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5550 $perl_extension = 1 if ! defined $perl_extension;
5551 $ucd = 0 if ! defined $ucd;
5552 push @tables_that_may_be_empty, $complete_name{$addr};
5553 $self->add_comment(<<END);
5554 This is a placeholder because it is not in Version $string_version of Unicode,
5555 but is needed by the Perl core to work gracefully. Because it is not in this
5556 version of Unicode, it will not be listed in $pod_file.pod
5559 elsif (exists $why_suppressed{$complete_name}
5560 # Don't suppress if overridden
5561 && ! grep { $_ eq $complete_name{$addr} }
5562 @output_mapped_properties)
5564 $fate{$addr} = $SUPPRESSED;
5566 elsif ($fate{$addr} == $SUPPRESSED) {
5567 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5568 # Though currently unused
5570 elsif ($suppression_reason) {
5571 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5574 # If hasn't set its status already, see if it is on one of the
5575 # lists of properties or tables that have particular statuses; if
5576 # not, is normal. The lists are prioritized so the most serious
5577 # ones are checked first
5578 if (! $status{$addr}) {
5579 if (exists $why_deprecated{$complete_name}) {
5580 $status{$addr} = $DEPRECATED;
5582 elsif (exists $why_stabilized{$complete_name}) {
5583 $status{$addr} = $STABILIZED;
5585 elsif (exists $why_obsolete{$complete_name}) {
5586 $status{$addr} = $OBSOLETE;
5589 # Existence above doesn't necessarily mean there is a message
5590 # associated with it. Use the most serious message.
5591 if ($status{$addr}) {
5592 if ($why_deprecated{$complete_name}) {
5594 = $why_deprecated{$complete_name};
5596 elsif ($why_stabilized{$complete_name}) {
5598 = $why_stabilized{$complete_name};
5600 elsif ($why_obsolete{$complete_name}) {
5602 = $why_obsolete{$complete_name};
5607 $perl_extension{$addr} = $perl_extension || 0;
5609 # Don't list a property by default that is internal only
5610 if ($fate{$addr} > $MAP_PROXIED) {
5611 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5612 $ucd = 0 if ! defined $ucd;
5615 $ucd = 1 if ! defined $ucd;
5618 # By convention what typically gets printed only or first is what's
5619 # first in the list, so put the full name there for good output
5620 # clarity. Other routines rely on the full name being first on the
5622 $self->add_alias($full_name{$addr},
5623 OK_as_Filename => $ok_as_filename,
5624 Fuzzy => $loose_match,
5625 Re_Pod_Entry => $make_re_pod_entry,
5626 Status => $status{$addr},
5630 # Then comes the other name, if meaningfully different.
5631 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5632 $self->add_alias($name{$addr},
5633 OK_as_Filename => $ok_as_filename,
5634 Fuzzy => $loose_match,
5635 Re_Pod_Entry => $make_re_pod_entry,
5636 Status => $status{$addr},
5644 # Here are the methods that are required to be defined by any derived
5647 handle_special_range
5651 # write() knows how to write out normal ranges, but it calls
5652 # handle_special_range() when it encounters a non-normal one.
5653 # append_to_body() is called by it after it has handled all
5654 # ranges to add anything after the main portion of the table.
5655 # And finally, pre_body() is called after all this to build up
5656 # anything that should appear before the main portion of the
5657 # table. Doing it this way allows things in the middle to
5658 # affect what should appear before the main portion of the
5663 Carp::my_carp_bug( __LINE__
5664 . ": Must create method '$sub()' for "
5672 "." => \&main::_operator_dot,
5673 ".=" => \&main::_operator_dot_equal,
5674 '!=' => \&main::_operator_not_equal,
5675 '==' => \&main::_operator_equal,
5679 # Returns the array of ranges associated with this table.
5682 return $range_list{pack 'J', shift}->ranges;
5686 # Add a synonym for this table.
5688 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5691 my $name = shift; # The name to add.
5692 my $pointer = shift; # What the alias hash should point to. For
5693 # map tables, this is the parent property;
5694 # for match tables, it is the table itself.
5697 my $loose_match = delete $args{'Fuzzy'};
5699 my $ok_as_filename = delete $args{'OK_as_Filename'};
5700 $ok_as_filename = 1 unless defined $ok_as_filename;
5702 # An internal name does not get documented, unless overridden by the
5703 # input; same for making tests for it.
5704 my $status = delete $args{'Status'} || (($name =~ /^_/)
5707 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5708 // (($status ne $INTERNAL_ALIAS)
5709 ? (($name =~ /^_/) ? $NO : $YES)
5711 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5713 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5715 # Capitalize the first letter of the alias unless it is one of the CJK
5716 # ones which specifically begins with a lower 'k'. Do this because
5717 # Unicode has varied whether they capitalize first letters or not, and
5718 # have later changed their minds and capitalized them, but not the
5719 # other way around. So do it always and avoid changes from release to
5721 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5723 my $addr = do { no overloading; pack 'J', $self; };
5725 # Figure out if should be loosely matched if not already specified.
5726 if (! defined $loose_match) {
5728 # Is a loose_match if isn't null, and doesn't begin with an
5729 # underscore and isn't just a number
5731 && substr($name, 0, 1) ne '_'
5732 && $name !~ qr{^[0-9_.+-/]+$})
5741 # If this alias has already been defined, do nothing.
5742 return if defined $find_table_from_alias{$addr}->{$name};
5744 # That includes if it is standardly equivalent to an existing alias,
5745 # in which case, add this name to the list, so won't have to search
5747 my $standard_name = main::standardize($name);
5748 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5749 $find_table_from_alias{$addr}->{$name}
5750 = $find_table_from_alias{$addr}->{$standard_name};
5754 # Set the index hash for this alias for future quick reference.
5755 $find_table_from_alias{$addr}->{$name} = $pointer;
5756 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5757 local $to_trace = 0 if main::DEBUG;
5758 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5759 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5762 # Put the new alias at the end of the list of aliases unless the final
5763 # element begins with an underscore (meaning it is for internal perl
5764 # use) or is all numeric, in which case, put the new one before that
5765 # one. This floats any all-numeric or underscore-beginning aliases to
5766 # the end. This is done so that they are listed last in output lists,
5767 # to encourage the user to use a better name (either more descriptive
5768 # or not an internal-only one) instead. This ordering is relied on
5769 # implicitly elsewhere in this program, like in short_name()
5770 my $list = $aliases{$addr};
5771 my $insert_position = (@$list == 0
5772 || (substr($list->[-1]->name, 0, 1) ne '_'
5773 && $list->[-1]->name =~ /\D/))
5779 Alias->new($name, $loose_match, $make_re_pod_entry,
5780 $ok_as_filename, $status, $ucd);
5782 # This name may be shorter than any existing ones, so clear the cache
5783 # of the shortest, so will have to be recalculated.
5785 undef $short_name{pack 'J', $self};
5790 # Returns a name suitable for use as the base part of a file name.
5791 # That is, shorter wins. It can return undef if there is no suitable
5792 # name. The name has all non-essential underscores removed.
5794 # The optional second parameter is a reference to a scalar in which
5795 # this routine will store the length the returned name had before the
5796 # underscores were removed, or undef if the return is undef.
5798 # The shortest name can change if new aliases are added. So using
5799 # this should be deferred until after all these are added. The code
5800 # that does that should clear this one's cache.
5801 # Any name with alphabetics is preferred over an all numeric one, even
5805 my $nominal_length_ptr = shift;
5806 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5808 my $addr = do { no overloading; pack 'J', $self; };
5810 # For efficiency, don't recalculate, but this means that adding new
5811 # aliases could change what the shortest is, so the code that does
5812 # that needs to undef this.
5813 if (defined $short_name{$addr}) {
5814 if ($nominal_length_ptr) {
5815 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5817 return $short_name{$addr};
5820 # Look at each alias
5821 my $is_last_resort = 0;
5822 my $deprecated_or_discouraged
5823 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5824 foreach my $alias ($self->aliases()) {
5826 # Don't use an alias that isn't ok to use for an external name.
5827 next if ! $alias->ok_as_filename;
5829 my $name = main::Standardize($alias->name);
5830 trace $self, $name if main::DEBUG && $to_trace;
5832 # Take the first one, or any non-deprecated non-discouraged one
5833 # over one that is, or a shorter one that isn't numeric. This
5834 # relies on numeric aliases always being last in the array
5835 # returned by aliases(). Any alpha one will have precedence.
5836 if ( ! defined $short_name{$addr}
5837 || ( $is_last_resort
5838 && $alias->status !~ $deprecated_or_discouraged)
5840 && length($name) < length($short_name{$addr})))
5842 # Remove interior underscores.
5843 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5845 $nominal_short_name_length{$addr} = length $name;
5846 $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5850 # If the short name isn't a nice one, perhaps an equivalent table has
5852 if ( $self->can('children')
5853 && ( ! defined $short_name{$addr}
5854 || $short_name{$addr} eq ""
5855 || $short_name{$addr} eq "_"))
5858 foreach my $follower ($self->children) { # All equivalents
5859 my $follower_name = $follower->short_name;
5860 next unless defined $follower_name;
5862 # Anything (except undefined) is better than underscore or
5864 if (! defined $return || $return eq "_") {
5865 $return = $follower_name;
5869 # If the new follower name isn't "_" and is shorter than the
5870 # current best one, prefer the new one.
5871 next if $follower_name eq "_";
5872 next if length $follower_name > length $return;
5873 $return = $follower_name;
5875 $short_name{$addr} = $return if defined $return;
5878 # If no suitable external name return undef
5879 if (! defined $short_name{$addr}) {
5880 $$nominal_length_ptr = undef if $nominal_length_ptr;
5884 # Don't allow a null short name.
5885 if ($short_name{$addr} eq "") {
5886 $short_name{$addr} = '_';
5887 $nominal_short_name_length{$addr} = 1;
5890 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5892 if ($nominal_length_ptr) {
5893 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5895 return $short_name{$addr};
5899 # Returns the external name that this table should be known by. This
5900 # is usually the short_name, but not if the short_name is undefined,
5901 # in which case the external_name is arbitrarily set to the
5905 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5907 my $short = $self->short_name;
5908 return $short if defined $short;
5913 sub add_description { # Adds the parameter as a short description.
5916 my $description = shift;
5918 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5921 push @{$description{pack 'J', $self}}, $description;
5926 sub add_note { # Adds the parameter as a short note.
5931 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5934 push @{$note{pack 'J', $self}}, $note;
5939 sub add_comment { # Adds the parameter as a comment.
5941 return unless $debugging_build;
5944 my $comment = shift;
5945 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5950 push @{$comment{pack 'J', $self}}, $comment;
5956 # Return the current comment for this table. If called in list
5957 # context, returns the array of comments. In scalar, returns a string
5958 # of each element joined together with a period ending each.
5961 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5963 my $addr = do { no overloading; pack 'J', $self; };
5964 my @list = @{$comment{$addr}};
5965 return @list if wantarray;
5967 foreach my $sentence (@list) {
5968 $return .= '. ' if $return;
5969 $return .= $sentence;
5972 $return .= '.' if $return;
5977 # Initialize the table with the argument which is any valid
5978 # initialization for range lists.
5981 my $addr = do { no overloading; pack 'J', $self; };
5982 my $initialization = shift;
5983 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
5985 # Replace the current range list with a new one of the same exact
5987 my $class = ref $range_list{$addr};
5988 $range_list{$addr} = $class->new(Owner => $self,
5989 Initialize => $initialization);
5995 # The header that is output for the table in the file it is written
5999 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6002 $return .= $DEVELOPMENT_ONLY if $compare_versions;
6007 sub merge_single_annotation_line ($$$) {
6008 my ($output, $annotation, $annotation_column) = @_;
6010 # This appends an annotation comment, $annotation, to $output,
6011 # starting in or after column $annotation_column, removing any
6012 # pre-existing comment from $output.
6014 $annotation =~ s/^ \s* \# \ //x;
6015 $output =~ s/ \s* ( \# \N* )? \n //x;
6016 $output = Text::Tabs::expand($output);
6018 my $spaces = $annotation_column - length $output;
6019 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment
6021 $output = sprintf "%s%*s# %s",
6026 return Text::Tabs::unexpand $output;
6030 # Write a representation of the table to its file. It calls several
6031 # functions furnished by sub-classes of this abstract base class to
6032 # handle non-normal ranges, to add stuff before the table, and at its
6033 # end. If the table is to be written so that adjustments are
6034 # required, this does that conversion.
6037 my $use_adjustments = shift; # ? output in adjusted format or not
6038 my $suppress_value = shift; # Optional, if the value associated with
6039 # a range equals this one, don't write
6041 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6043 my $addr = do { no overloading; pack 'J', $self; };
6044 my $write_as_invlist = $write_as_invlist{$addr};
6046 # Start with the header
6047 my @HEADER = $self->header;
6050 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
6053 # Things discovered processing the main body of the document may
6054 # affect what gets output before it, therefore pre_body() isn't called
6055 # until after all other processing of the table is done.
6057 # The main body looks like a 'here' document. If there are comments,
6058 # get rid of them when processing it.
6060 if ($annotate || $output_range_counts) {
6061 # Use the line below in Perls that don't have /r
6062 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
6063 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
6065 push @OUT, "return <<'END';\n";
6068 if ($range_list{$addr}->is_empty) {
6070 # This is a kludge for empty tables to silence a warning in
6071 # utf8.c, which can't really deal with empty tables, but it can
6072 # deal with a table that matches nothing, as the inverse of 'All'
6074 push @OUT, "!utf8::All\n";
6076 elsif ($self->name eq 'N'
6078 # To save disk space and table cache space, avoid putting out
6079 # binary N tables, but instead create a file which just inverts
6080 # the Y table. Since the file will still exist and occupy a
6081 # certain number of blocks, might as well output the whole
6082 # thing if it all will fit in one block. The number of
6083 # ranges below is an approximate number for that.
6084 && ($self->property->type == $BINARY
6085 || $self->property->type == $FORCED_BINARY)
6086 # && $self->property->tables == 2 Can't do this because the
6087 # non-binary properties, like NFDQC aren't specifiable
6089 && $range_list{$addr}->ranges > 15
6090 && ! $annotate) # Under --annotate, want to see everything
6092 push @OUT, "!utf8::" . $self->property->name . "\n";
6095 my $range_size_1 = $range_size_1{$addr};
6097 # To make it more readable, use a minimum indentation
6100 # These are used only in $annotate option
6101 my $format; # e.g. $HEX_ADJUST_FORMAT
6102 my $include_name; # ? Include the character's name in the
6104 my $include_cp; # ? Include its code point
6107 $comment_indent = ($self->isa('Map_Table'))
6109 : ($write_as_invlist)
6114 $format = $self->format;
6116 # The name of the character is output only for tables that
6117 # don't already include the name in the output.
6118 my $property = $self->property;
6120 ! ($property == $perl_charname
6121 || $property == main::property_ref('Unicode_1_Name')
6122 || $property == main::property_ref('Name')
6123 || $property == main::property_ref('Name_Alias')
6126 # Don't include the code point in the annotation where all
6127 # lines are a single code point, so it can be easily found in
6129 $include_cp = ! $range_size_1;
6131 if (! $self->isa('Map_Table')) {
6132 $comment_indent = ($write_as_invlist) ? 8 : 16;
6135 $comment_indent = 16;
6137 # There are just a few short ranges in this table, so no
6138 # need to include the code point in the annotation.
6139 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
6141 # We're trying to get this to look good, as the whole
6142 # point is to make human-readable tables. It is easier to
6143 # read if almost all the annotation comments begin in the
6144 # same column. Map tables have varying width maps, so can
6145 # create a jagged comment appearance. This code does a
6146 # preliminary pass through these tables looking for the
6147 # maximum width map in each, and causing the comments to
6148 # begin just to the right of that. However, if the
6149 # comments begin too far to the right of most lines, it's
6150 # hard to line them up horizontally with their real data.
6151 # Therefore we ignore the longest outliers
6152 my $ignore_longest_X_percent = 2; # Discard longest X%
6154 # Each key in this hash is a width of at least one of the
6155 # maps in the table. Its value is how many lines have
6159 # We won't space things further left than one tab stop
6160 # after the rest of the line; initializing it to that
6161 # number saves some work.
6162 my $max_map_width = 8;
6164 # Fill in the %widths hash
6166 for my $set ($range_list{$addr}->ranges) {
6167 my $value = $set->value;
6169 # These range types don't appear in the main table
6170 next if $set->type == 0
6171 && defined $suppress_value
6172 && $value eq $suppress_value;
6173 next if $set->type == $MULTI_CP
6174 || $set->type == $NULL;
6176 # Include 2 spaces before the beginning of the
6178 my $this_width = length($value) + 2;
6180 # Ranges of the remaining non-zero types usually
6181 # occupy just one line (maybe occasionally two, but
6182 # this doesn't have to be dead accurate). This is
6183 # because these ranges are like "unassigned code
6185 my $count = ($set->type != 0)
6187 : $set->end - $set->start + 1;
6188 $widths{$this_width} += $count;
6190 $max_map_width = $this_width
6191 if $max_map_width < $this_width;
6194 # If the widest map gives us less than two tab stops
6195 # worth, just take it as-is.
6196 if ($max_map_width > 16) {
6198 # Otherwise go through %widths until we have included
6199 # the desired percentage of lines in the whole table.
6200 my $running_total = 0;
6201 foreach my $width (sort { $a <=> $b } keys %widths)
6203 $running_total += $widths{$width};
6205 if ($running_total * 100 / $total
6206 >= 100 - $ignore_longest_X_percent)
6208 $max_map_width = $width;
6213 $comment_indent += $max_map_width;
6217 # Values for previous time through the loop. Initialize to
6218 # something that won't be adjacent to the first iteration;
6219 # only $previous_end matters for that.
6221 my $previous_end = -2;
6224 # Values for next time through the portion of the loop that splits
6225 # the range. 0 in $next_start means there is no remaining portion
6231 my $invlist_count = 0;
6233 my $output_value_in_hex = $self->isa('Map_Table')
6234 && ($self->format eq $HEX_ADJUST_FORMAT
6235 || $self->to_output_map == $EXTERNAL_MAP);
6236 # Use leading zeroes just for files whose format should not be
6237 # changed from what it has been. Otherwise, they just take up
6238 # space and time to process.
6239 my $hex_format = ($self->isa('Map_Table')
6240 && $self->to_output_map == $EXTERNAL_MAP)
6244 # The values for some of these tables are stored in mktables as
6245 # hex strings. Normally, these are just output as strings without
6246 # change, but when we are doing adjustments, we have to operate on
6247 # these numerically, so we convert those to decimal to do that,
6248 # and back to hex for output
6249 my $convert_map_to_from_hex = 0;
6250 my $output_map_in_hex = 0;
6251 if ($self->isa('Map_Table')) {
6252 $convert_map_to_from_hex
6253 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6254 || ($annotate && $self->format eq $HEX_FORMAT);
6255 $output_map_in_hex = $convert_map_to_from_hex
6256 || $self->format eq $HEX_FORMAT;
6259 # To store any annotations about the characters.
6262 # Output each range as part of the here document.
6264 for my $set ($range_list{$addr}->ranges) {
6265 if ($set->type != 0) {
6266 $self->handle_special_range($set);
6269 my $start = $set->start;
6270 my $end = $set->end;
6271 my $value = $set->value;
6273 # Don't output ranges whose value is the one to suppress
6274 next RANGE if defined $suppress_value
6275 && $value eq $suppress_value;
6277 $value = CORE::hex $value if $convert_map_to_from_hex;
6280 { # This bare block encloses the scope where we may need to
6281 # 'redo' to. Consider a table that is to be written out
6282 # using single item ranges. This is given in the
6283 # $range_size_1 boolean. To accomplish this, we split the
6284 # range each time through the loop into two portions, the
6285 # first item, and the rest. We handle that first item
6286 # this time in the loop, and 'redo' to repeat the process
6287 # for the rest of the range.
6289 # We may also have to do it, with other special handling,
6290 # if the table has adjustments. Consider the table that
6291 # contains the lowercasing maps. mktables stores the
6292 # ASCII range ones as 26 ranges:
6293 # ord('A') => ord('a'), .. ord('Z') => ord('z')
6294 # For compactness, the table that gets written has this as
6296 # ( ord('A') .. ord('Z') ) => ord('a')
6297 # and the software that reads the tables is smart enough
6298 # to "connect the dots". This change is accomplished in
6299 # this loop by looking to see if the current iteration
6300 # fits the paradigm of the previous iteration, and if so,
6301 # we merge them by replacing the final output item with
6302 # the merged data. Repeated 25 times, this gets A-Z. But
6303 # we also have to make sure we don't screw up cases where
6304 # we have internally stored
6305 # ( 0x1C4 .. 0x1C6 ) => 0x1C5
6306 # This single internal range has to be output as 3 ranges,
6307 # which is done by splitting, like we do for $range_size_1
6308 # tables. (There are very few of such ranges that need to
6309 # be split, so the gain of doing the combining of other
6310 # ranges far outweighs the splitting of these.) The
6311 # values to use for the redo at the end of this block are
6312 # set up just below in the scalars whose names begin with
6315 if (($use_adjustments || $range_size_1) && $end != $start)
6317 $next_start = $start + 1;
6319 $next_value = $value;
6323 if ($use_adjustments && ! $range_size_1) {
6325 # If this range is adjacent to the previous one, and
6326 # the values in each are integers that are also
6327 # adjacent (differ by 1), then this range really
6328 # extends the previous one that is already in element
6329 # $OUT[-1]. So we pop that element, and pretend that
6330 # the range starts with whatever it started with.
6331 # $offset is incremented by 1 each time so that it
6332 # gives the current offset from the first element in
6333 # the accumulating range, and we keep in $value the
6334 # value of that first element.
6335 if ($start == $previous_end + 1
6336 && $value =~ /^ -? \d+ $/xa
6337 && $previous_value =~ /^ -? \d+ $/xa
6338 && ($value == ($previous_value + ++$offset)))
6341 $start = $previous_start;
6342 $value = $previous_value;
6346 if (@annotation == 1) {
6347 $OUT[-1] = merge_single_annotation_line(
6348 $OUT[-1], $annotation[0], $comment_indent);
6351 push @OUT, @annotation;
6356 # Save the current values for the next time through
6358 $previous_start = $start;
6359 $previous_end = $end;
6360 $previous_value = $value;
6363 if ($write_as_invlist) {
6364 if ( $previous_end > 0
6365 && $output_range_counts{$addr})
6367 my $complement_count = $start - $previous_end - 1;
6368 if ($complement_count > 1) {
6369 $OUT[-1] = merge_single_annotation_line(
6374 . main::clarify_code_point_count(
6376 . "] in complement\n",
6381 # Inversion list format has a single number per line,
6382 # the starting code point of a range that matches the
6384 push @OUT, $start, "\n";
6387 # Add a comment with the size of the range, if
6389 if ($output_range_counts{$addr}) {
6390 $OUT[-1] = merge_single_annotation_line(
6393 . main::clarify_code_point_count($end - $start + 1)
6398 elsif ($start != $end) { # If there is a range
6399 if ($end == $MAX_WORKING_CODEPOINT) {
6400 push @OUT, sprintf "$hex_format\t$hex_format",
6402 $MAX_PLATFORM_CODEPOINT;
6405 push @OUT, sprintf "$hex_format\t$hex_format",
6408 if (length $value) {
6409 if ($convert_map_to_from_hex) {
6410 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6413 $OUT[-1] .= "\t$value\n";
6417 # Add a comment with the size of the range, if
6419 if ($output_range_counts{$addr}) {
6420 $OUT[-1] = merge_single_annotation_line(
6423 . main::clarify_code_point_count($end - $start + 1)
6428 else { # Here to output a single code point per line.
6430 # Use any passed in subroutine to output.
6431 if (ref $range_size_1 eq 'CODE') {
6432 for my $i ($start .. $end) {
6433 push @OUT, &{$range_size_1}($i, $value);
6438 # Here, caller is ok with default output.
6439 for (my $i = $start; $i <= $end; $i++) {
6440 if ($convert_map_to_from_hex) {
6442 sprintf "$hex_format\t\t$hex_format\n",
6446 push @OUT, sprintf $hex_format, $i;
6447 $OUT[-1] .= "\t\t$value" if $value ne "";
6455 for (my $i = $start; $i <= $end; $i++) {
6456 my $annotation = "";
6458 # Get character information if don't have it already
6459 main::populate_char_info($i)
6460 if ! defined $viacode[$i];
6461 my $type = $annotate_char_type[$i];
6463 # Figure out if should output the next code points
6464 # as part of a range or not. If this is not in an
6465 # annotation range, then won't output as a range,
6466 # so returns $i. Otherwise use the end of the
6467 # annotation range, but no further than the
6468 # maximum possible end point of the loop.
6473 $annotate_ranges->value_of($i) || $i,
6476 # Use a range if it is a range, and either is one
6477 # of the special annotation ranges, or the range
6478 # is at most 3 long. This last case causes the
6479 # algorithmically named code points to be output
6480 # individually in spans of at most 3, as they are
6481 # the ones whose $type is > 0.
6482 if ($range_end != $i
6483 && ( $type < 0 || $range_end - $i > 2))
6485 # Here is to output a range. We don't allow a
6486 # caller-specified output format--just use the
6488 my $range_name = $viacode[$i];
6490 # For the code points which end in their hex
6491 # value, we eliminate that from the output
6492 # annotation, and capitalize only the first
6493 # letter of each word.
6494 if ($type == $CP_IN_NAME) {
6495 my $hex = sprintf $hex_format, $i;
6496 $range_name =~ s/-$hex$//;
6497 my @words = split " ", $range_name;
6498 for my $word (@words) {
6500 ucfirst(lc($word)) if $word ne 'CJK';
6502 $range_name = join " ", @words;
6504 elsif ($type == $HANGUL_SYLLABLE) {
6505 $range_name = "Hangul Syllable";
6508 # If the annotation would just repeat what's
6509 # already being output as the range, skip it.
6510 # (When an inversion list is being written, it
6511 # isn't a repeat, as that always is in
6513 if ( $write_as_invlist
6515 || $range_end < $end)
6517 if ($range_end < $MAX_WORKING_CODEPOINT)
6519 $annotation = sprintf "%04X..%04X",
6523 $annotation = sprintf "%04X..INFINITY",
6527 else { # Indent if not displaying code points
6528 $annotation = " " x 4;
6532 $annotation .= " $age[$i]" if $age[$i];
6533 $annotation .= " $range_name";
6536 # Include the number of code points in the
6539 main::clarify_code_point_count($range_end - $i + 1);
6540 $annotation .= " [$count]\n";
6542 # Skip to the end of the range
6545 else { # Not in a range.
6548 # When outputting the names of each character,
6549 # use the character itself if printable
6550 $comment .= "'" . main::display_chr($i) . "' "
6553 my $output_value = $value;
6555 # Determine the annotation
6556 if ($format eq $DECOMP_STRING_FORMAT) {
6558 # This is very specialized, with the type
6559 # of decomposition beginning the line
6560 # enclosed in <...>, and the code points
6561 # that the code point decomposes to
6562 # separated by blanks. Create two
6563 # strings, one of the printable
6564 # characters, and one of their official
6566 (my $map = $output_value)
6567 =~ s/ \ * < .*? > \ +//x;
6571 foreach my $to (split " ", $map) {
6572 $to = CORE::hex $to;
6573 $to_name .= " + " if $to_name;
6574 $to_chr .= main::display_chr($to);
6575 main::populate_char_info($to)
6576 if ! defined $viacode[$to];
6577 $to_name .= $viacode[$to];
6581 "=> '$to_chr'; $viacode[$i] => $to_name";
6584 $output_value += $i - $start
6586 # Don't try to adjust a
6588 && $output_value !~ /[-\D]/;
6590 if ($output_map_in_hex) {
6591 main::populate_char_info($output_value)
6592 if ! defined $viacode[$output_value];
6594 . main::display_chr($output_value)
6595 . "'; " if $printable[$output_value];
6597 if ($include_name && $viacode[$i]) {
6598 $comment .= " " if $comment;
6599 $comment .= $viacode[$i];
6601 if ($output_map_in_hex) {
6603 " => $viacode[$output_value]"
6604 if $viacode[$output_value];
6605 $output_value = sprintf($hex_format,
6611 $annotation = sprintf "%04X %s", $i, $age[$i];
6612 if ($use_adjustments) {
6613 $annotation .= " => $output_value";
6617 if ($comment ne "") {
6618 $annotation .= " " if $annotation ne "";
6619 $annotation .= $comment;
6621 $annotation .= "\n" if $annotation ne "";
6624 if ($annotation ne "") {
6625 push @annotation, (" " x $comment_indent)
6630 # If not adjusting, we don't have to go through the
6631 # loop again to know that the annotation comes next
6633 if (! $use_adjustments) {
6634 if (@annotation == 1) {
6635 $OUT[-1] = merge_single_annotation_line(
6636 $OUT[-1], $annotation[0], $comment_indent);
6639 push @OUT, map { Text::Tabs::unexpand $_ }
6646 # Add the beginning of the range that doesn't match the
6647 # property, except if the just added match range extends
6648 # to infinity. We do this after any annotations for the
6650 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6651 push @OUT, $end + 1, "\n";
6655 # If we split the range, set up so the next time through
6656 # we get the remainder, and redo.
6658 $start = $next_start;
6660 $value = $next_value;
6665 } # End of loop through all the table's ranges
6667 push @OUT, @annotation; # Add orphaned annotation, if any
6669 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6672 # Add anything that goes after the main body, but within the here
6674 my $append_to_body = $self->append_to_body;
6675 push @OUT, $append_to_body if $append_to_body;
6677 # And finish the here document.
6680 # Done with the main portion of the body. Can now figure out what
6681 # should appear before it in the file.
6682 my $pre_body = $self->pre_body;
6683 push @HEADER, $pre_body, "\n" if $pre_body;
6685 # All these files should have a .pl suffix added to them.
6686 my @file_with_pl = @{$file_path{$addr}};
6687 $file_with_pl[-1] .= '.pl';
6689 main::write(\@file_with_pl,
6690 $annotate, # utf8 iff annotating
6696 sub set_status { # Set the table's status
6698 my $status = shift; # The status enum value
6699 my $info = shift; # Any message associated with it.
6700 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6702 my $addr = do { no overloading; pack 'J', $self; };
6704 $status{$addr} = $status;
6705 $status_info{$addr} = $info;
6709 sub set_fate { # Set the fate of a table
6713 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6715 my $addr = do { no overloading; pack 'J', $self; };
6717 return if $fate{$addr} == $fate; # If no-op
6719 # Can only change the ordinary fate, except if going to $MAP_PROXIED
6720 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6722 $fate{$addr} = $fate;
6724 # Don't document anything to do with a non-normal fated table
6725 if ($fate != $ORDINARY) {
6726 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6727 foreach my $alias ($self->aliases) {
6728 $alias->set_ucd($put_in_pod);
6730 # MAP_PROXIED doesn't affect the match tables
6731 next if $fate == $MAP_PROXIED;
6732 $alias->set_make_re_pod_entry($put_in_pod);
6736 # Save the reason for suppression for output
6737 if ($fate >= $SUPPRESSED) {
6738 $reason = "" unless defined $reason;
6739 $why_suppressed{$complete_name{$addr}} = $reason;
6746 # Don't allow changes to the table from now on. This stores a stack
6747 # trace of where it was called, so that later attempts to modify it
6748 # can immediately show where it got locked.
6751 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6753 my $addr = do { no overloading; pack 'J', $self; };
6755 $locked{$addr} = "";
6757 my $line = (caller(0))[2];
6760 # Accumulate the stack trace
6762 my ($pkg, $file, $caller_line, $caller) = caller $i++;
6764 last unless defined $caller;
6766 $locked{$addr} .= " called from $caller() at line $line\n";
6767 $line = $caller_line;
6769 $locked{$addr} .= " called from main at line $line\n";
6774 sub carp_if_locked {
6775 # Return whether a table is locked or not, and, by the way, complain
6779 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
6781 my $addr = do { no overloading; pack 'J', $self; };
6783 return 0 if ! $locked{$addr};
6784 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6788 sub set_file_path { # Set the final directory path for this table
6790 # Rest of parameters passed on
6793 @{$file_path{pack 'J', $self}} = @_;
6797 # Accessors for the range list stored in this table. First for
6806 matches_identically_to
6819 return $self->_range_list->$sub(@_);
6823 # Then for ones that should fail if locked
6833 return if $self->carp_if_locked;
6835 return $self->_range_list->$sub(@_);
6842 use parent '-norequire', '_Base_Table';
6844 # A Map Table is a table that contains the mappings from code points to
6845 # values. There are two weird cases:
6846 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6847 # are written in the table's file at the end of the table nonetheless. It
6848 # requires specially constructed code to handle these; utf8.c can not read
6849 # these in, so they should not go in $map_directory. As of this writing,
6850 # the only case that these happen is for named sequences used in
6851 # charnames.pm. But this code doesn't enforce any syntax on these, so
6852 # something else could come along that uses it.
6853 # 2) Specials are anything that doesn't fit syntactically into the body of the
6854 # table. The ranges for these have a map type of non-zero. The code below
6855 # knows about and handles each possible type. In most cases, these are
6856 # written as part of the header.
6858 # A map table deliberately can't be manipulated at will unlike match tables.
6859 # This is because of the ambiguities having to do with what to do with
6860 # overlapping code points. And there just isn't a need for those things;
6861 # what one wants to do is just query, add, replace, or delete mappings, plus
6862 # write the final result.
6863 # However, there is a method to get the list of possible ranges that aren't in
6864 # this table to use for defaulting missing code point mappings. And,
6865 # map_add_or_replace_non_nulls() does allow one to add another table to this
6866 # one, but it is clearly very specialized, and defined that the other's
6867 # non-null values replace this one's if there is any overlap.
6869 sub trace { return main::trace(@_); }
6873 main::setup_package();
6876 # Many input files omit some entries; this gives what the mapping for the
6877 # missing entries should be
6878 main::set_access('default_map', \%default_map, 'r');
6880 my %anomalous_entries;
6881 # Things that go in the body of the table which don't fit the normal
6882 # scheme of things, like having a range. Not much can be done with these
6883 # once there except to output them. This was created to handle named
6885 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6886 main::set_access('anomalous_entries', # Append singular, read plural
6887 \%anomalous_entries,
6890 my %replacement_property;
6891 # Certain files are unused by Perl itself, and are kept only for backwards
6892 # compatibility for programs that used them before Unicode::UCD existed.
6893 # These are termed legacy properties. At some point they may be removed,
6894 # but for now mark them as legacy. If non empty, this is the name of the
6895 # property to use instead (i.e., the modern equivalent).
6896 main::set_access('replacement_property', \%replacement_property, 'r');
6899 # Enum as to whether or not to write out this map table, and how:
6901 # $EXTERNAL_MAP means its existence is noted in the documentation, and
6902 # it should not be removed nor its format changed. This
6903 # is done for those files that have traditionally been
6904 # output. Maps of legacy-only properties default to
6906 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
6908 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6909 # outputting the actual mappings as-is, we adjust things
6910 # to create a much more compact table. Only those few
6911 # tables where the mapping is convertible at least to an
6912 # integer and compacting makes a big difference should
6913 # have this. Hence, the default is to not do this
6914 # unless the table's default mapping is to $CODE_POINT,
6915 # and the range size is not 1.
6916 main::set_access('to_output_map', \%to_output_map, 's');
6924 # Optional initialization data for the table.
6925 my $initialize = delete $args{'Initialize'};
6927 my $default_map = delete $args{'Default_Map'};
6928 my $property = delete $args{'_Property'};
6929 my $full_name = delete $args{'Full_Name'};
6930 my $replacement_property = delete $args{'Replacement_Property'} // "";
6931 my $to_output_map = delete $args{'To_Output_Map'};
6933 # Rest of parameters passed on; legacy properties have several common
6935 if ($replacement_property) {
6936 $args{"Fate"} = $LEGACY_ONLY;
6937 $args{"Range_Size_1"} = 1;
6938 $args{"Perl_Extension"} = 1;
6942 my $range_list = Range_Map->new(Owner => $property);
6944 my $self = $class->SUPER::new(
6946 Complete_Name => $full_name,
6947 Full_Name => $full_name,
6948 _Property => $property,
6949 _Range_List => $range_list,
6950 Write_As_Invlist => 0,
6953 my $addr = do { no overloading; pack 'J', $self; };
6955 $anomalous_entries{$addr} = [];
6956 $default_map{$addr} = $default_map;
6957 $replacement_property{$addr} = $replacement_property;
6958 $to_output_map = $EXTERNAL_MAP if ! defined $to_output_map
6959 && $replacement_property;
6960 $to_output_map{$addr} = $to_output_map;
6962 $self->initialize($initialize) if defined $initialize;
6969 qw("") => "_operator_stringify",
6972 sub _operator_stringify {
6975 my $name = $self->property->full_name;
6976 $name = '""' if $name eq "";
6977 return "Map table for Property '$name'";
6981 # Add a synonym for this table (which means the property itself)
6984 # Rest of parameters passed on.
6986 $self->SUPER::add_alias($name, $self->property, @_);
6991 # Add a range of code points to the list of specially-handled code
6992 # points. $MULTI_CP is assumed if the type of special is not passed
7001 my $type = delete $args{'Type'} || 0;
7002 # Rest of parameters passed on
7004 # Can't change the table if locked.
7005 return if $self->carp_if_locked;
7007 my $addr = do { no overloading; pack 'J', $self; };
7009 $self->_range_list->add_map($lower, $upper,
7016 sub append_to_body {
7017 # Adds to the written HERE document of the table's body any anomalous
7018 # entries in the table..
7021 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7023 my $addr = do { no overloading; pack 'J', $self; };
7025 return "" unless @{$anomalous_entries{$addr}};
7026 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
7029 sub map_add_or_replace_non_nulls {
7030 # This adds the mappings in the table $other to $self. Non-null
7031 # mappings from $other override those in $self. It essentially merges
7032 # the two tables, with the second having priority except for null
7037 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7039 return if $self->carp_if_locked;
7041 if (! $other->isa(__PACKAGE__)) {
7042 Carp::my_carp_bug("$other should be a "
7050 my $addr = do { no overloading; pack 'J', $self; };
7051 my $other_addr = do { no overloading; pack 'J', $other; };
7053 local $to_trace = 0 if main::DEBUG;
7055 my $self_range_list = $self->_range_list;
7056 my $other_range_list = $other->_range_list;
7057 foreach my $range ($other_range_list->ranges) {
7058 my $value = $range->value;
7059 next if $value eq "";
7060 $self_range_list->_add_delete('+',
7064 Type => $range->type,
7065 Replace => $UNCONDITIONALLY);
7071 sub set_default_map {
7072 # Define what code points that are missing from the input files should
7073 # map to. The optional second parameter 'full_name' indicates to
7074 # force using the full name of the map instead of its standard name.
7078 my $use_full_name = shift // 0;
7079 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7081 if ($use_full_name && $use_full_name ne 'full_name') {
7082 Carp::my_carp_bug("Second parameter to set_default_map() if"
7083 . " present, must be 'full_name'");
7086 my $addr = do { no overloading; pack 'J', $self; };
7088 # Convert the input to the standard equivalent, if any (won't have any
7089 # for $STRING properties)
7090 my $standard = $self->property->table($map);
7091 if (defined $standard) {
7092 $map = ($use_full_name)
7093 ? $standard->full_name
7097 # Warn if there already is a non-equivalent default map for this
7098 # property. Note that a default map can be a ref, which means that
7099 # what it actually means is delayed until later in the program, and it
7100 # IS permissible to override it here without a message.
7101 my $default_map = $default_map{$addr};
7102 if (defined $default_map
7103 && ! ref($default_map)
7104 && $default_map ne $map
7105 && main::Standardize($map) ne $default_map)
7107 my $property = $self->property;
7108 my $map_table = $property->table($map);
7109 my $default_table = $property->table($default_map);
7110 if (defined $map_table
7111 && defined $default_table
7112 && $map_table != $default_table)
7114 Carp::my_carp("Changing the default mapping for "
7116 . " from $default_map to $map'");
7120 $default_map{$addr} = $map;
7122 # Don't also create any missing table for this map at this point,
7123 # because if we did, it could get done before the main table add is
7124 # done for PropValueAliases.txt; instead the caller will have to make
7125 # sure it exists, if desired.
7130 # Returns boolean: should we write this map table?
7133 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7135 my $addr = do { no overloading; pack 'J', $self; };
7137 # If overridden, use that
7138 return $to_output_map{$addr} if defined $to_output_map{$addr};
7140 my $full_name = $self->full_name;
7141 return $global_to_output_map{$full_name}
7142 if defined $global_to_output_map{$full_name};
7144 # If table says to output, do so; if says to suppress it, do so.
7145 my $fate = $self->fate;
7146 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
7147 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
7148 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
7150 my $type = $self->property->type;
7152 # Don't want to output binary map tables even for debugging.
7153 return 0 if $type == $BINARY;
7155 # But do want to output string ones. All the ones that remain to
7156 # be dealt with (i.e. which haven't explicitly been set to external)
7157 # are for internal Perl use only. The default for those that map to
7158 # $CODE_POINT and haven't been restricted to a single element range
7159 # is to use the adjusted form.
7160 if ($type == $STRING) {
7161 return $INTERNAL_MAP if $self->range_size_1
7162 || $default_map{$addr} ne $CODE_POINT;
7163 return $OUTPUT_ADJUSTED;
7166 # Otherwise is an $ENUM, do output it, for Perl's purposes
7167 return $INTERNAL_MAP;
7171 # Returns a Range_List that is gaps of the current table. That is,
7175 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7177 my $current = Range_List->new(Initialize => $self->_range_list,
7178 Owner => $self->property);
7184 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7186 my $return = $self->SUPER::header();
7188 if ($self->to_output_map >= $INTERNAL_MAP) {
7189 $return .= $INTERNAL_ONLY_HEADER;
7192 my $property_name = $self->property->replacement_property;
7194 # The legacy-only properties were gotten above; but there are some
7195 # other properties whose files are in current use that have fixed
7197 $property_name = $self->property->full_name unless $property_name;
7201 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
7203 # This file is for internal use by core Perl only. It is retained for
7204 # backwards compatibility with applications that may have come to rely on it,
7205 # but its format and even its name or existence are subject to change without
7206 # notice in a future Perl version. Don't use it directly. Instead, its
7207 # contents are now retrievable through a stable API in the Unicode::UCD
7208 # module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
7209 # code points can be retrieved via Unicode::UCD::charprop());
7215 sub set_final_comment {
7216 # Just before output, create the comment that heads the file
7217 # containing this table.
7219 return unless $debugging_build;
7222 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7224 # No sense generating a comment if aren't going to write it out.
7225 return if ! $self->to_output_map;
7227 my $addr = do { no overloading; pack 'J', $self; };
7229 my $property = $self->property;
7231 # Get all the possible names for this property. Don't use any that
7232 # aren't ok for use in a file name, etc. This is perhaps causing that
7233 # flag to do double duty, and may have to be changed in the future to
7234 # have our own flag for just this purpose; but it works now to exclude
7235 # Perl generated synonyms from the lists for properties, where the
7236 # name is always the proper Unicode one.
7237 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
7239 my $count = $self->count;
7240 my $default_map = $default_map{$addr};
7242 # The ranges that map to the default aren't output, so subtract that
7243 # to get those actually output. A property with matching tables
7244 # already has the information calculated.
7245 if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
7246 $count -= $property->table($default_map)->count;
7248 elsif (defined $default_map) {
7250 # But for $STRING properties, must calculate now. Subtract the
7251 # count from each range that maps to the default.
7252 foreach my $range ($self->_range_list->ranges) {
7253 if ($range->value eq $default_map) {
7254 $count -= $range->end +1 - $range->start;
7260 # Get a string version of $count with underscores in large numbers,
7262 my $string_count = main::clarify_code_point_count($count);
7264 my $code_points = ($count == 1)
7265 ? 'single code point'
7266 : "$string_count code points";
7271 if (@property_aliases <= 1) {
7272 $mapping = 'mapping';
7273 $these_mappings = 'this mapping';
7277 $mapping = 'synonymous mappings';
7278 $these_mappings = 'these mappings';
7282 if ($count >= $MAX_UNICODE_CODEPOINTS) {
7283 $cp = "any code point in Unicode Version $string_version";
7287 if ($default_map eq "") {
7288 $map_to = 'the null string';
7290 elsif ($default_map eq $CODE_POINT) {
7294 $map_to = "'$default_map'";
7297 $cp = "the single code point";
7300 $cp = "one of the $code_points";
7302 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7307 my $status = $self->status;
7308 if ($status ne $NORMAL) {
7309 my $warn = uc $status_past_participles{$status};
7312 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
7313 All property or property=value combinations contained in this file are $warn.
7314 See $unicode_reference_url for what this means.
7318 $comment .= "This file returns the $mapping:\n";
7320 my $ucd_accessible_name = "";
7321 my $has_underscore_name = 0;
7322 my $full_name = $self->property->full_name;
7323 for my $i (0 .. @property_aliases - 1) {
7324 my $name = $property_aliases[$i]->name;
7325 $has_underscore_name = 1 if $name =~ /^_/;
7326 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7327 if ($property_aliases[$i]->ucd) {
7328 if ($name eq $full_name) {
7329 $ucd_accessible_name = $full_name;
7331 elsif (! $ucd_accessible_name) {
7332 $ucd_accessible_name = $name;
7336 $comment .= "\nwhere 'cp' is $cp.";
7337 if ($ucd_accessible_name) {
7338 $comment .= " Note that $these_mappings";
7339 if ($has_underscore_name) {
7340 $comment .= " (except for the one(s) that begin with an underscore)";
7342 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7346 # And append any commentary already set from the actual property.
7347 $comment .= "\n\n" . $self->comment if $self->comment;
7348 if ($self->description) {
7349 $comment .= "\n\n" . join " ", $self->description;
7352 $comment .= "\n\n" . join " ", $self->note;
7356 if (! $self->perl_extension) {
7359 For information about what this property really means, see:
7360 $unicode_reference_url
7364 if ($count) { # Format differs for empty table
7365 $comment.= "\nThe format of the ";
7366 if ($self->range_size_1) {
7368 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7369 is in hex; MAPPING is what CODE_POINT maps to.
7374 # There are tables which end up only having one element per
7375 # range, but it is not worth keeping track of for making just
7376 # this comment a little better.
7378 non-comment portions of the main body of lines of this file is:
7379 START\\tSTOP\\tMAPPING where START is the starting code point of the
7380 range, in hex; STOP is the ending point, or if omitted, the range has just one
7381 code point; MAPPING is what each code point between START and STOP maps to.
7383 if ($self->output_range_counts) {
7385 Numbers in comments in [brackets] indicate how many code points are in the
7386 range (omitted when the range is a single code point or if the mapping is to
7392 $self->set_comment(main::join_lines($comment));
7396 my %swash_keys; # Makes sure don't duplicate swash names.
7398 # The remaining variables are temporaries used while writing each table,
7399 # to output special ranges.
7400 my @multi_code_point_maps; # Map is to more than one code point.
7402 sub handle_special_range {
7403 # Called in the middle of write when it finds a range it doesn't know
7408 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7410 my $addr = do { no overloading; pack 'J', $self; };
7412 my $type = $range->type;
7414 my $low = $range->start;
7415 my $high = $range->end;
7416 my $map = $range->value;
7418 # No need to output the range if it maps to the default.
7419 return if $map eq $default_map{$addr};
7421 my $property = $self->property;
7423 # Switch based on the map type...
7424 if ($type == $HANGUL_SYLLABLE) {
7426 # These are entirely algorithmically determinable based on
7427 # some constants furnished by Unicode; for now, just set a
7428 # flag to indicate that have them. After everything is figured
7429 # out, we will output the code that does the algorithm. (Don't
7430 # output them if not needed because we are suppressing this
7432 $has_hangul_syllables = 1 if $property->to_output_map;
7434 elsif ($type == $CP_IN_NAME) {
7436 # Code points whose name ends in their code point are also
7437 # algorithmically determinable, but need information about the map
7438 # to do so. Both the map and its inverse are stored in data
7439 # structures output in the file. They are stored in the mean time
7440 # in global lists The lists will be written out later into Name.pm,
7441 # which is created only if needed. In order to prevent duplicates
7442 # in the list, only add to them for one property, should multiple
7444 if ($needing_code_points_ending_in_code_point == 0) {
7445 $needing_code_points_ending_in_code_point = $property;
7447 if ($property == $needing_code_points_ending_in_code_point) {
7448 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7449 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7451 my $squeezed = $map =~ s/[-\s]+//gr;
7452 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7454 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7457 push @code_points_ending_in_code_point, { low => $low,
7463 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7465 # Multi-code point maps and null string maps have an entry
7466 # for each code point in the range. They use the same
7468 for my $code_point ($low .. $high) {
7470 # The pack() below can't cope with surrogates. XXX This may
7472 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7473 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
7477 # Generate the hash entries for these in the form that
7478 # utf8.c understands.
7482 foreach my $to (split " ", $map) {
7483 if ($to !~ /^$code_point_re$/) {
7484 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
7487 $tostr .= sprintf "\\x{%s}", $to;
7488 $to = CORE::hex $to;
7490 $to_name .= " + " if $to_name;
7491 $to_chr .= main::display_chr($to);
7492 main::populate_char_info($to)
7493 if ! defined $viacode[$to];
7494 $to_name .= $viacode[$to];
7498 # The unpack yields a list of the bytes that comprise the
7499 # UTF-8 of $code_point, which are each placed in \xZZ format
7500 # and output in the %s to map to $tostr, so the result looks
7502 # "\xC4\xB0" => "\x{0069}\x{0307}",
7503 my $utf8 = sprintf(qq["%s" => "$tostr",],
7504 join("", map { sprintf "\\x%02X", $_ }
7505 unpack("U0C*", chr $code_point)));
7507 # Add a comment so that a human reader can more easily
7508 # see what's going on.
7509 push @multi_code_point_maps,
7510 sprintf("%-45s # U+%04X", $utf8, $code_point);
7512 $multi_code_point_maps[-1] .= " => $map";
7515 main::populate_char_info($code_point)
7516 if ! defined $viacode[$code_point];
7517 $multi_code_point_maps[-1] .= " '"
7518 . main::display_chr($code_point)
7519 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7524 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
7531 # Returns the string that should be output in the file before the main
7532 # body of this table. It isn't called until the main body is
7533 # calculated, saving a pass. The string includes some hash entries
7534 # identifying the format of the body, and what the single value should
7535 # be for all ranges missing from it. It also includes any code points
7536 # which have map_types that don't go in the main table.
7539 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7541 my $addr = do { no overloading; pack 'J', $self; };
7543 my $name = $self->property->swash_name;
7545 # Currently there is nothing in the pre_body unless a swash is being
7547 return unless defined $name;
7549 if (defined $swash_keys{$name}) {
7550 Carp::my_carp(main::join_lines(<<END
7551 Already created a swash name '$name' for $swash_keys{$name}. This means that
7552 the same name desired for $self shouldn't be used. Bad News. This must be
7553 fixed before production use, but proceeding anyway
7557 $swash_keys{$name} = "$self";
7561 # Here we assume we were called after have gone through the whole
7562 # file. If we actually generated anything for each map type, add its
7563 # respective header and trailer
7564 my $specials_name = "";
7565 if (@multi_code_point_maps) {
7566 $specials_name = "utf8::ToSpec$name";
7569 # Some code points require special handling because their mappings are each to
7570 # multiple code points. These do not appear in the main body, but are defined
7571 # in the hash below.
7573 # Each key is the string of N bytes that together make up the UTF-8 encoding
7574 # for the code point. (i.e. the same as looking at the code point's UTF-8
7575 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
7576 \%$specials_name = (
7578 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7581 my $format = $self->format;
7585 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7586 if ($output_adjusted) {
7587 if ($specials_name) {
7589 # The mappings in the non-hash portion of this file must be modified to get the
7590 # correct values by adding the code point ordinal number to each one that is
7596 # The mappings must be modified to get the correct values by adding the code
7597 # point ordinal number to each one that is numeric.
7604 # The name this swash is to be known by, with the format of the mappings in
7605 # the main body of the table, and what all code points missing from this file
7607 \$utf8::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7609 if ($specials_name) {
7611 \$utf8::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7614 my $default_map = $default_map{$addr};
7616 # For $CODE_POINT default maps and using adjustments, instead the default
7618 $return .= "\$utf8::SwashInfo{'To$name'}{'missing'} = '"
7619 . (($output_adjusted && $default_map eq $CODE_POINT)
7624 if ($default_map eq $CODE_POINT) {
7625 $return .= ' # code point maps to itself';
7627 elsif ($default_map eq "") {
7628 $return .= ' # code point maps to the null string';
7632 $return .= $pre_body;
7638 # Write the table to the file.
7641 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7643 my $addr = do { no overloading; pack 'J', $self; };
7645 # Clear the temporaries
7646 undef @multi_code_point_maps;
7648 # Calculate the format of the table if not already done.
7649 my $format = $self->format;
7650 my $type = $self->property->type;
7651 my $default_map = $self->default_map;
7652 if (! defined $format) {
7653 if ($type == $BINARY) {
7655 # Don't bother checking the values, because we elsewhere
7656 # verify that a binary table has only 2 values.
7657 $format = $BINARY_FORMAT;
7660 my @ranges = $self->_range_list->ranges;
7662 # default an empty table based on its type and default map
7665 # But it turns out that the only one we can say is a
7666 # non-string (besides binary, handled above) is when the
7667 # table is a string and the default map is to a code point
7668 if ($type == $STRING && $default_map eq $CODE_POINT) {
7669 $format = $HEX_FORMAT;
7672 $format = $STRING_FORMAT;
7677 # Start with the most restrictive format, and as we find
7678 # something that doesn't fit with that, change to the next
7679 # most restrictive, and so on.
7680 $format = $DECIMAL_FORMAT;
7681 foreach my $range (@ranges) {
7682 next if $range->type != 0; # Non-normal ranges don't
7683 # affect the main body
7684 my $map = $range->value;
7685 if ($map ne $default_map) {
7686 last if $format eq $STRING_FORMAT; # already at
7689 $format = $INTEGER_FORMAT
7690 if $format eq $DECIMAL_FORMAT
7691 && $map !~ / ^ [0-9] $ /x;
7692 $format = $FLOAT_FORMAT
7693 if $format eq $INTEGER_FORMAT
7694 && $map !~ / ^ -? [0-9]+ $ /x;
7695 $format = $RATIONAL_FORMAT
7696 if $format eq $FLOAT_FORMAT
7697 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7698 $format = $HEX_FORMAT
7699 if ($format eq $RATIONAL_FORMAT
7701 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7702 # Assume a leading zero means hex,
7703 # even if all digits are 0-9
7704 || ($format eq $INTEGER_FORMAT
7705 && $map =~ /^0[0-9A-F]/);
7706 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7707 && $map =~ /[^0-9A-F]/;
7712 } # end of calculating format
7714 if ($default_map eq $CODE_POINT
7715 && $format ne $HEX_FORMAT
7716 && ! defined $self->format) # manual settings are always
7719 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7722 # If the output is to be adjusted, the format of the table that gets
7723 # output is actually 'a' or 'ax' instead of whatever it is stored
7725 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7726 if ($output_adjusted) {
7727 if ($default_map eq $CODE_POINT) {
7728 $format = $HEX_ADJUST_FORMAT;
7731 $format = $ADJUST_FORMAT;
7735 $self->_set_format($format);
7737 return $self->SUPER::write(
7739 $default_map); # don't write defaulteds
7742 # Accessors for the underlying list that should fail if locked.
7753 return if $self->carp_if_locked;
7754 return $self->_range_list->$sub(@_);
7757 } # End closure for Map_Table
7759 package Match_Table;
7760 use parent '-norequire', '_Base_Table';
7762 # A Match table is one which is a list of all the code points that have
7763 # the same property and property value, for use in \p{property=value}
7764 # constructs in regular expressions. It adds very little data to the base
7765 # structure, but many methods, as these lists can be combined in many ways to
7767 # There are only a few concepts added:
7768 # 1) Equivalents and Relatedness.
7769 # Two tables can match the identical code points, but have different names.
7770 # This always happens when there is a perl single form extension
7771 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
7772 # tables are set to be related, with the Perl extension being a child, and
7773 # the Unicode property being the parent.
7775 # It may be that two tables match the identical code points and we don't
7776 # know if they are related or not. This happens most frequently when the
7777 # Block and Script properties have the exact range. But note that a
7778 # revision to Unicode could add new code points to the script, which would
7779 # now have to be in a different block (as the block was filled, or there
7780 # would have been 'Unknown' script code points in it and they wouldn't have
7781 # been identical). So we can't rely on any two properties from Unicode
7782 # always matching the same code points from release to release, and thus
7783 # these tables are considered coincidentally equivalent--not related. When
7784 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
7785 # 'leader', and the others are 'equivalents'. This concept is useful
7786 # to minimize the number of tables written out. Only one file is used for
7787 # any identical set of code points, with entries in Heavy.pl mapping all
7788 # the involved tables to it.
7790 # Related tables will always be identical; we set them up to be so. Thus
7791 # if the Unicode one is deprecated, the Perl one will be too. Not so for
7792 # unrelated tables. Relatedness makes generating the documentation easier.
7795 # Like equivalents, two tables may be the inverses of each other, the
7796 # intersection between them is null, and the union is every Unicode code
7797 # point. The two tables that occupy a binary property are necessarily like
7798 # this. By specifying one table as the complement of another, we can avoid
7799 # storing it on disk (using the other table and performing a fast
7800 # transform), and some memory and calculations.
7802 # 3) Conflicting. It may be that there will eventually be name clashes, with
7803 # the same name meaning different things. For a while, there actually were
7804 # conflicts, but they have so far been resolved by changing Perl's or
7805 # Unicode's definitions to match the other, but when this code was written,
7806 # it wasn't clear that that was what was going to happen. (Unicode changed
7807 # because of protests during their beta period.) Name clashes are warned
7808 # about during compilation, and the documentation. The generated tables
7809 # are sane, free of name clashes, because the code suppresses the Perl
7810 # version. But manual intervention to decide what the actual behavior
7811 # should be may be required should this happen. The introductory comments
7812 # have more to say about this.
7814 # 4) Definition. This is a string for human consumption that specifies the
7815 # code points that this table matches. This is used only for the generated
7816 # pod file. It may be specified explicitly, or automatically computed.
7817 # Only the first portion of complicated definitions is computed and
7820 sub standardize { return main::standardize($_[0]); }
7821 sub trace { return main::trace(@_); }
7826 main::setup_package();
7829 # The leader table of this one; initially $self.
7830 main::set_access('leader', \%leader, 'r');
7833 # An array of any tables that have this one as their leader
7834 main::set_access('equivalents', \%equivalents, 'readable_array');
7837 # The parent table to this one, initially $self. This allows us to
7838 # distinguish between equivalent tables that are related (for which this
7839 # is set to), and those which may not be, but share the same output file
7840 # because they match the exact same set of code points in the current
7842 main::set_access('parent', \%parent, 'r');
7845 # An array of any tables that have this one as their parent
7846 main::set_access('children', \%children, 'readable_array');
7849 # Array of any tables that would have the same name as this one with
7850 # a different meaning. This is used for the generated documentation.
7851 main::set_access('conflicting', \%conflicting, 'readable_array');
7854 # Set in the constructor for tables that are expected to match all code
7856 main::set_access('matches_all', \%matches_all, 'r');
7859 # Points to the complement that this table is expressed in terms of; 0 if
7861 main::set_access('complement', \%complement, 'r');
7864 # Human readable string of the first few ranges of code points matched by
7866 main::set_access('definition', \%definition, 'r', 's');
7873 # The property for which this table is a listing of property values.
7874 my $property = delete $args{'_Property'};
7876 my $name = delete $args{'Name'};
7877 my $full_name = delete $args{'Full_Name'};
7878 $full_name = $name if ! defined $full_name;
7881 my $initialize = delete $args{'Initialize'};
7882 my $matches_all = delete $args{'Matches_All'} || 0;
7883 my $format = delete $args{'Format'};
7884 my $definition = delete $args{'Definition'} // "";
7885 # Rest of parameters passed on.
7887 my $range_list = Range_List->new(Initialize => $initialize,
7888 Owner => $property);
7890 my $complete = $full_name;
7891 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
7892 # but this helps debug if it
7894 # The complete name for a match table includes it's property in a
7895 # compound form 'property=table', except if the property is the
7896 # pseudo-property, perl, in which case it is just the single form,
7897 # 'table' (If you change the '=' must also change the ':' in lots of
7898 # places in this program that assume an equal sign)
7899 $complete = $property->full_name . "=$complete" if $property != $perl;
7901 my $self = $class->SUPER::new(%args,
7903 Complete_Name => $complete,
7904 Full_Name => $full_name,
7905 _Property => $property,
7906 _Range_List => $range_list,
7907 Format => $EMPTY_FORMAT,
7908 Write_As_Invlist => 1,
7910 my $addr = do { no overloading; pack 'J', $self; };
7912 $conflicting{$addr} = [ ];
7913 $equivalents{$addr} = [ ];
7914 $children{$addr} = [ ];
7915 $matches_all{$addr} = $matches_all;
7916 $leader{$addr} = $self;
7917 $parent{$addr} = $self;
7918 $complement{$addr} = 0;
7919 $definition{$addr} = $definition;
7921 if (defined $format && $format ne $EMPTY_FORMAT) {
7922 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
7928 # See this program's beginning comment block about overloading these.
7931 qw("") => "_operator_stringify",
7935 return if $self->carp_if_locked;
7943 return $self->_range_list + $other;
7949 return $self->_range_list & $other;
7954 my $reversed = shift;
7957 Carp::my_carp_bug("Bad news. Can't cope with '"
7961 . "'. undef returned.");
7965 return if $self->carp_if_locked;
7967 my $addr = do { no overloading; pack 'J', $self; };
7971 # Change the range list of this table to be the
7973 $self->_set_range_list($self->_range_list
7976 else { # $other is just a simple value
7977 $self->add_range($other, $other);
7984 my $reversed = shift;
7987 Carp::my_carp_bug("Bad news. Can't cope with '"
7991 . "'. undef returned.");
7995 return if $self->carp_if_locked;
7996 $self->_set_range_list($self->_range_list & $other);
7999 '-' => sub { my $self = shift;
8001 my $reversed = shift;
8003 Carp::my_carp_bug("Bad news. Can't cope with '"
8007 . "'. undef returned.");
8011 return $self->_range_list - $other;
8013 '~' => sub { my $self = shift;
8014 return ~ $self->_range_list;
8018 sub _operator_stringify {
8021 my $name = $self->complete_name;
8022 return "Table '$name'";
8026 # Returns the range list associated with this table, which will be the
8027 # complement's if it has one.
8030 my $complement = $self->complement;
8032 # In order to avoid re-complementing on each access, only do the
8033 # complement the first time, and store the result in this table's
8034 # range list to use henceforth. However, this wouldn't work if the
8035 # controlling (complement) table changed after we do this, so lock it.
8036 # Currently, the value of the complement isn't needed until after it
8037 # is fully constructed, so this works. If this were to change, the
8038 # each_range iteration functionality would no longer work on this
8040 if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
8041 $self->_set_range_list($self->SUPER::_range_list
8042 + ~ $complement->_range_list);
8046 return $self->SUPER::_range_list;
8050 # Add a synonym for this table. See the comments in the base class
8054 # Rest of parameters passed on.
8056 $self->SUPER::add_alias($name, $self, @_);
8060 sub add_conflicting {
8061 # Add the name of some other object to the list of ones that name
8062 # clash with this match table.
8065 my $conflicting_name = shift; # The name of the conflicting object
8066 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
8067 my $conflicting_object = shift; # Optional, the conflicting object
8068 # itself. This is used to
8069 # disambiguate the text if the input
8070 # name is identical to any of the
8071 # aliases $self is known by.
8072 # Sometimes the conflicting object is
8073 # merely hypothetical, so this has to
8074 # be an optional parameter.
8075 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8077 my $addr = do { no overloading; pack 'J', $self; };
8079 # Check if the conflicting name is exactly the same as any existing
8080 # alias in this table (as long as there is a real object there to
8081 # disambiguate with).
8082 if (defined $conflicting_object) {
8083 foreach my $alias ($self->aliases) {
8084 if (standardize($alias->name) eq standardize($conflicting_name)) {
8086 # Here, there is an exact match. This results in
8087 # ambiguous comments, so disambiguate by changing the
8088 # conflicting name to its object's complete equivalent.
8089 $conflicting_name = $conflicting_object->complete_name;
8095 # Convert to the \p{...} final name
8096 $conflicting_name = "\\$p" . "{$conflicting_name}";
8099 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
8101 push @{$conflicting{$addr}}, $conflicting_name;
8106 sub is_set_equivalent_to {
8107 # Return boolean of whether or not the other object is a table of this
8108 # type and has been marked equivalent to this one.
8112 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8114 return 0 if ! defined $other; # Can happen for incomplete early
8116 unless ($other->isa(__PACKAGE__)) {
8117 my $ref_other = ref $other;
8118 my $ref_self = ref $self;
8119 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
8123 # Two tables are equivalent if they have the same leader.
8125 return $leader{pack 'J', $self} == $leader{pack 'J', $other};
8129 sub set_equivalent_to {
8130 # Set $self equivalent to the parameter table.
8131 # The required Related => 'x' parameter is a boolean indicating
8132 # whether these tables are related or not. If related, $other becomes
8133 # the 'parent' of $self; if unrelated it becomes the 'leader'
8135 # Related tables share all characteristics except names; equivalents
8136 # not quite so many.
8137 # If they are related, one must be a perl extension. This is because
8138 # we can't guarantee that Unicode won't change one or the other in a
8139 # later release even if they are identical now.
8145 my $related = delete $args{'Related'};
8147 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8149 return if ! defined $other; # Keep on going; happens in some early
8152 if (! defined $related) {
8153 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
8157 # If already are equivalent, no need to re-do it; if subroutine
8158 # returns null, it found an error, also do nothing
8159 my $are_equivalent = $self->is_set_equivalent_to($other);
8160 return if ! defined $are_equivalent || $are_equivalent;
8162 my $addr = do { no overloading; pack 'J', $self; };
8163 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
8166 if ($current_leader->perl_extension) {
8167 if ($other->perl_extension) {
8168 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
8171 } elsif ($self->property != $other->property # Depending on
8177 && ! $other->perl_extension
8179 # We allow the sc and scx properties to be marked as
8180 # related. They are in fact related, and this allows
8181 # the pod to show that better. This test isn't valid
8182 # if this is an early Unicode release without the scx
8183 # property (having that also implies the sc property
8184 # exists, so don't have to test for no 'sc')
8186 && ! ( ( $self->property == $script
8187 || $self->property == $scx)
8188 && ( $self->property == $script
8189 || $self->property == $scx))))
8191 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
8196 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
8197 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
8201 my $leader = do { no overloading; pack 'J', $current_leader; };
8202 my $other_addr = do { no overloading; pack 'J', $other; };
8204 # Any tables that are equivalent to or children of this table must now
8205 # instead be equivalent to or (children) to the new leader (parent),
8206 # still equivalent. The equivalency includes their matches_all info,
8207 # and for related tables, their fate and status.
8208 # All related tables are of necessity equivalent, but the converse
8209 # isn't necessarily true
8210 my $status = $other->status;
8211 my $status_info = $other->status_info;
8212 my $fate = $other->fate;
8213 my $matches_all = $matches_all{other_addr};
8214 my $caseless_equivalent = $other->caseless_equivalent;
8215 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
8216 next if $table == $other;
8217 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
8219 my $table_addr = do { no overloading; pack 'J', $table; };
8220 $leader{$table_addr} = $other;
8221 $matches_all{$table_addr} = $matches_all;
8222 $self->_set_range_list($other->_range_list);
8223 push @{$equivalents{$other_addr}}, $table;
8225 $parent{$table_addr} = $other;
8226 push @{$children{$other_addr}}, $table;
8227 $table->set_status($status, $status_info);
8229 # This reason currently doesn't get exposed outside; otherwise
8230 # would have to look up the parent's reason and use it instead.
8231 $table->set_fate($fate, "Parent's fate");
8233 $self->set_caseless_equivalent($caseless_equivalent);
8237 # Now that we've declared these to be equivalent, any changes to one
8238 # of the tables would invalidate that equivalency.
8244 sub set_complement {
8245 # Set $self to be the complement of the parameter table. $self is
8246 # locked, as what it contains should all come from the other table.
8252 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
8254 if ($other->complement != 0) {
8255 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
8258 my $addr = do { no overloading; pack 'J', $self; };
8259 $complement{$addr} = $other;
8261 # Be sure the other property knows we are depending on them; or the
8262 # other table if it is one in the current property.
8263 if ($self->property != $other->property) {
8264 $other->property->set_has_dependency(1);
8267 $other->set_has_dependency(1);
8273 sub add_range { # Add a range to the list for this table.
8275 # Rest of parameters passed on
8277 return if $self->carp_if_locked;
8278 return $self->_range_list->add_range(@_);
8283 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8285 # All match tables are to be used only by the Perl core.
8286 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8289 sub pre_body { # Does nothing for match tables.
8293 sub append_to_body { # Does nothing for match tables.
8301 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8303 $self->SUPER::set_fate($fate, $reason);
8305 # All children share this fate
8306 foreach my $child ($self->children) {
8307 $child->set_fate($fate, $reason);
8312 sub calculate_table_definition
8314 # Returns a human-readable string showing some or all of the code
8315 # points matched by this table. The string will include a
8316 # bracketed-character class for all characters matched in the 00-FF
8317 # range, and the first few ranges matched beyond that.
8321 my $definition = $self->definition || "";
8323 # Skip this if already have a definition.
8324 return $definition if $definition;
8326 my $lows_string = ""; # The string representation of the 0-FF
8328 my $string_range = ""; # The string rep. of the above FF ranges
8329 my $range_count = 0; # How many ranges in $string_rage
8331 my @lows_invlist; # The inversion list of the 0-FF code points
8332 my $first_non_control = ord(" "); # Everything below this is a
8333 # control, on ASCII or EBCDIC
8334 my $max_table_code_point = $self->max;
8336 # On ASCII platforms, the range 80-FF contains no printables.
8337 my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8340 # Look through the first few ranges matched by this table.
8341 $self->reset_each_range; # Defensive programming
8342 while (defined (my $range = $self->each_range())) {
8343 my $start = $range->start;
8344 my $end = $range->end;
8346 # Accumulate an inversion list of the 00-FF code points
8347 if ($start < 256 && ($start > 0 || $end < 256)) {
8348 push @lows_invlist, $start;
8349 push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8351 # Get next range if there are more ranges below 256
8352 next if $end < 256 && $end < $max_table_code_point;
8354 # If the range straddles the 255/256 boundary, we split it
8355 # there. We already added above the low portion to the
8357 $start = 256 if $end > 256;
8360 # Here, @lows_invlist contains the code points below 256, and
8361 # there is no other range, or the current one starts at or above
8362 # 256. Generate the [char class] for the 0-255 ones.
8363 while (@lows_invlist) {
8365 # If this range (necessarily the first one, by the way) starts
8367 if ($lows_invlist[0] == 0) {
8369 # If it ends within the block of controls, that means that
8370 # some controls are in it and some aren't. Since Unicode
8371 # properties pretty much only know about a few of the
8372 # controls, like \n, \t, this means that its one of them
8373 # that isn't in the range. Complement the inversion list
8374 # which will likely cause these to be output using their
8375 # mnemonics, hence being clearer.
8376 if ($lows_invlist[1] < $first_non_control) {
8377 $lows_string .= '^';
8378 shift @lows_invlist;
8379 push @lows_invlist, 256;
8381 elsif ($lows_invlist[1] <= $highest_printable) {
8383 # Here, it extends into the printables block. Split
8384 # into two ranges so that the controls are separate.
8385 $lows_string .= sprintf "\\x00-\\x%02x",
8386 $first_non_control - 1;
8387 $lows_invlist[0] = $first_non_control;
8391 # If the range completely contains the printables, don't
8392 # individually spell out the printables.
8393 if ( $lows_invlist[0] <= $first_non_control
8394 && $lows_invlist[1] > $highest_printable)
8396 $lows_string .= sprintf "\\x%02x-\\x%02x",
8397 $lows_invlist[0], $lows_invlist[1] - 1;
8398 shift @lows_invlist;
8399 shift @lows_invlist;
8403 # Here, the range may include some but not all printables.
8404 # Look at each one individually
8405 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8406 my $char = chr $ord;
8408 # If there is already something in the list, an
8409 # alphanumeric char could be the next in sequence. If so,
8410 # we start or extend a range. That is, we could have so
8411 # far something like 'a-c', and the next char is a 'd', so
8412 # we change it to 'a-d'. We use native_to_unicode()
8413 # because a-z on EBCDIC means 26 chars, and excludes the
8415 if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8416 my $prev = substr($lows_string, -1);
8417 if ( $prev !~ /[[:alnum:]]/
8418 || utf8::native_to_unicode(ord $prev) + 1
8419 != utf8::native_to_unicode(ord $char))
8421 # Not extending the range
8422 $lows_string .= $char;
8424 elsif ( length $lows_string > 1
8425 && substr($lows_string, -2, 1) eq '-')
8427 # We had a sequence like '-c' and the current
8428 # character is 'd'. Extend the range.
8429 substr($lows_string, -1, 1) = $char;
8432 # We had something like 'd' and this is 'e'.
8434 $lows_string .= "-$char";
8437 elsif ($char =~ /[[:graph:]]/) {
8439 # We output a graphic char as-is, preceded by a
8440 # backslash if it is a metacharacter
8441 $lows_string .= '\\'
8442 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8443 $lows_string .= $char;
8444 } # Otherwise use mnemonic for any that have them
8445 elsif ($char =~ /[\a]/) {
8446 $lows_string .= '\a';
8448 elsif ($char =~ /[\b]/) {
8449 $lows_string .= '\b';
8451 elsif ($char eq "\e") {
8452 $lows_string .= '\e';
8454 elsif ($char eq "\f") {
8455 $lows_string .= '\f';
8457 elsif ($char eq "\cK") {
8458 $lows_string .= '\cK';
8460 elsif ($char eq "\n") {
8461 $lows_string .= '\n';
8463 elsif ($char eq "\r") {
8464 $lows_string .= '\r';
8466 elsif ($char eq "\t") {
8467 $lows_string .= '\t';
8471 # Here is a non-graphic without a mnemonic. We use \x
8472 # notation. But if the ordinal of this is one above
8473 # the previous, create or extend the range
8474 my $hex_representation = sprintf("%02x", ord $char);
8475 if ( length $lows_string >= 4
8476 && substr($lows_string, -4, 2) eq '\\x'
8477 && hex(substr($lows_string, -2)) + 1 == ord $char)
8479 if ( length $lows_string >= 5
8480 && substr($lows_string, -5, 1) eq '-'
8481 && ( length $lows_string == 5
8482 || substr($lows_string, -6, 1) ne '\\'))
8484 substr($lows_string, -2) = $hex_representation;
8487 $lows_string .= '-\\x' . $hex_representation;
8491 $lows_string .= '\\x' . $hex_representation;
8497 # Done with assembling the string of all lows. If there are only
8498 # lows in the property, are completely done.
8499 if ($max_table_code_point < 256) {
8500 $self->reset_each_range;
8504 # Otherwise, quit if reached max number of non-lows ranges. If
8505 # there are lows, count them as one unit towards the maximum.
8507 if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8508 $string_range .= " ...";
8509 $self->reset_each_range;
8513 # Otherwise add this range.
8514 $string_range .= ", " if $string_range ne "";
8515 if ($start == $end) {
8516 $string_range .= sprintf("U+%04X", $start);
8518 elsif ($end >= $MAX_WORKING_CODEPOINT) {
8519 $string_range .= sprintf("U+%04X..infinity", $start);
8522 $string_range .= sprintf("U+%04X..%04X",
8527 # Done with all the ranges we're going to look at. Assemble the
8528 # definition from the lows + non-lows.
8530 if ($lows_string ne "" || $string_range ne "") {
8531 if ($lows_string ne "") {
8532 $definition .= "[$lows_string]";
8533 $definition .= ", " if $string_range;
8535 $definition .= $string_range;
8543 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8545 return $self->SUPER::write(0); # No adjustments
8548 sub set_final_comment {
8549 # This creates a comment for the file that is to hold the match table
8550 # $self. It is somewhat convoluted to make the English read nicely,
8551 # but, heh, it's just a comment.
8552 # This should be called only with the leader match table of all the
8553 # ones that share the same file. It lists all such tables, ordered so
8554 # that related ones are together.
8556 return unless $debugging_build;
8558 my $leader = shift; # Should only be called on the leader table of
8559 # an equivalent group
8560 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
8562 my $addr = do { no overloading; pack 'J', $leader; };
8564 if ($leader{$addr} != $leader) {
8565 Carp::my_carp_bug(<<END
8566 set_final_comment() must be called on a leader table, which $leader is not.
8567 It is equivalent to $leader{$addr}. No comment created
8573 # Get the number of code points matched by each of the tables in this
8574 # file, and add underscores for clarity.
8575 my $count = $leader->count;
8577 my $non_unicode_string;
8578 if ($count > $MAX_UNICODE_CODEPOINTS) {
8579 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8580 - $MAX_UNICODE_CODEPOINT);
8581 $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8584 $unicode_count = $count;
8585 $non_unicode_string = "";
8587 my $string_count = main::clarify_code_point_count($unicode_count);
8589 my $loose_count = 0; # how many aliases loosely matched
8590 my $compound_name = ""; # ? Are any names compound?, and if so, an
8592 my $properties_with_compound_names = 0; # count of these
8595 my %flags; # The status flags used in the file
8596 my $total_entries = 0; # number of entries written in the comment
8597 my $matches_comment = ""; # The portion of the comment about the
8599 my @global_comments; # List of all the tables' comments that are
8600 # there before this routine was called.
8601 my $has_ucd_alias = 0; # If there is an alias that is accessible via
8602 # Unicode::UCD. If not, then don't say it is
8605 # Get list of all the parent tables that are equivalent to this one
8606 # (including itself).
8607 my @parents = grep { $parent{main::objaddr $_} == $_ }
8608 main::uniques($leader, @{$equivalents{$addr}});
8609 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
8611 for my $parent (@parents) {
8613 my $property = $parent->property;
8615 # Special case 'N' tables in properties with two match tables when
8616 # the other is a 'Y' one. These are likely to be binary tables,
8617 # but not necessarily. In either case, \P{} will match the
8618 # complement of \p{}, and so if something is a synonym of \p, the
8619 # complement of that something will be the synonym of \P. This
8620 # would be true of any property with just two match tables, not
8621 # just those whose values are Y and N; but that would require a
8622 # little extra work, and there are none such so far in Unicode.
8623 my $perl_p = 'p'; # which is it? \p{} or \P{}
8624 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
8626 if (scalar $property->tables == 2
8627 && $parent == $property->table('N')
8628 && defined (my $yes = $property->table('Y')))
8630 my $yes_addr = do { no overloading; pack 'J', $yes; };
8632 = grep { $_->property == $perl }
8635 $parent{$yes_addr}->children);
8637 # But these synonyms are \P{} ,not \p{}
8641 my @description; # Will hold the table description
8642 my @note; # Will hold the table notes.
8643 my @conflicting; # Will hold the table conflicts.
8645 # Look at the parent, any yes synonyms, and all the children
8646 my $parent_addr = do { no overloading; pack 'J', $parent; };
8647 for my $table ($parent,
8649 @{$children{$parent_addr}})
8651 my $table_addr = do { no overloading; pack 'J', $table; };
8652 my $table_property = $table->property;
8654 # Tables are separated by a blank line to create a grouping.
8655 $matches_comment .= "\n" if $matches_comment;
8657 # The table is named based on the property and value
8658 # combination it is for, like script=greek. But there may be
8659 # a number of synonyms for each side, like 'sc' for 'script',
8660 # and 'grek' for 'greek'. Any combination of these is a valid
8661 # name for this table. In this case, there are three more,
8662 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
8663 # listing all possible combinations in the comment, we make
8664 # sure that each synonym occurs at least once, and add
8665 # commentary that the other combinations are possible.
8666 # Because regular expressions don't recognize things like
8667 # \p{jsn=}, only look at non-null right-hand-sides
8668 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8669 my @table_aliases = grep { $_->name ne "" } $table->aliases;
8671 # The alias lists above are already ordered in the order we
8672 # want to output them. To ensure that each synonym is listed,
8673 # we must use the max of the two numbers. But if there are no
8674 # legal synonyms (nothing in @table_aliases), then we don't
8676 my $listed_combos = (@table_aliases)
8677 ? main::max(scalar @table_aliases,
8678 scalar @property_aliases)
8680 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8682 my $property_had_compound_name = 0;
8684 for my $i (0 .. $listed_combos - 1) {
8687 # The current alias for the property is the next one on
8688 # the list, or if beyond the end, start over. Similarly
8689 # for the table (\p{prop=table})
8690 my $property_alias = $property_aliases
8691 [$i % @property_aliases]->name;
8692 my $table_alias_object = $table_aliases
8693 [$i % @table_aliases];
8694 my $table_alias = $table_alias_object->name;
8695 my $loose_match = $table_alias_object->loose_match;
8696 $has_ucd_alias |= $table_alias_object->ucd;
8698 if ($table_alias !~ /\D/) { # Clarify large numbers.
8699 $table_alias = main::clarify_number($table_alias)
8702 # Add a comment for this alias combination
8703 my $current_match_comment;
8704 if ($table_property == $perl) {
8705 $current_match_comment = "\\$perl_p"
8709 $current_match_comment
8710 = "\\p{$property_alias=$table_alias}";
8711 $property_had_compound_name = 1;
8714 # Flag any abnormal status for this table.
8715 my $flag = $property->status
8717 || $table_alias_object->status;
8718 if ($flag && $flag ne $PLACEHOLDER) {
8719 $flags{$flag} = $status_past_participles{$flag};
8724 # Pretty up the comment. Note the \b; it says don't make
8725 # this line a continuation.
8726 $matches_comment .= sprintf("\b%-1s%-s%s\n",
8729 $current_match_comment);
8730 } # End of generating the entries for this table.
8732 # Save these for output after this group of related tables.
8733 push @description, $table->description;
8734 push @note, $table->note;
8735 push @conflicting, $table->conflicting;
8737 # And this for output after all the tables.
8738 push @global_comments, $table->comment;
8740 # Compute an alternate compound name using the final property
8741 # synonym and the first table synonym with a colon instead of
8742 # the equal sign used elsewhere.
8743 if ($property_had_compound_name) {
8744 $properties_with_compound_names ++;
8745 if (! $compound_name || @property_aliases > 1) {
8746 $compound_name = $property_aliases[-1]->name
8748 . $table_aliases[0]->name;
8751 } # End of looping through all children of this table
8753 # Here have assembled in $matches_comment all the related tables
8754 # to the current parent (preceded by the same info for all the
8755 # previous parents). Put out information that applies to all of
8756 # the current family.
8759 # But output the conflicting information now, as it applies to
8761 my $conflicting = join ", ", @conflicting;
8763 $matches_comment .= <<END;
8765 Note that contrary to what you might expect, the above is NOT the same as
8767 $matches_comment .= "any of: " if @conflicting > 1;
8768 $matches_comment .= "$conflicting\n";
8772 $matches_comment .= "\n Meaning: "
8773 . join('; ', @description)
8777 $matches_comment .= "\n Note: "
8778 . join("\n ", @note)
8781 } # End of looping through all tables
8783 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8789 if ($unicode_count == 1) {
8791 $code_points = 'single code point';
8795 $code_points = "$string_count code points";
8800 if ($total_entries == 1) {
8803 $any_of_these = 'this'
8806 $synonyms = " any of the following regular expression constructs";
8807 $entries = 'entries';
8808 $any_of_these = 'any of these'
8812 if ($has_ucd_alias) {
8813 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8815 if ($has_unrelated) {
8817 This file is for tables that are not necessarily related: To conserve
8818 resources, every table that matches the identical set of code points in this
8819 version of Unicode uses this file. Each one is listed in a separate group
8820 below. It could be that the tables will match the same set of code points in
8821 other Unicode releases, or it could be purely coincidence that they happen to
8822 be the same in Unicode $unicode_version, and hence may not in other versions.
8828 foreach my $flag (sort keys %flags) {
8830 '$flag' below means that this form is $flags{$flag}.
8832 if ($flag eq $INTERNAL_ALIAS) {
8833 $comment .= "DO NOT USE!!!";
8836 $comment .= "Consult $pod_file.pod";
8843 if ($total_entries == 0) {
8844 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
8846 This file returns the $code_points in Unicode Version
8847 $unicode_version for
8848 $leader, but it is inaccessible through Perl regular expressions, as
8849 "\\p{prop=}" is not recognized.
8854 This file returns the $code_points in Unicode Version
8855 $unicode_version that
8859 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
8860 including if adding or subtracting white space, underscore, and hyphen
8861 characters matters or doesn't matter, and other permissible syntactic
8862 variants. Upper/lower case distinctions never matter.
8866 if ($compound_name) {
8869 A colon can be substituted for the equals sign, and
8871 if ($properties_with_compound_names > 1) {
8873 within each group above,
8876 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8878 # Note the \b below, it says don't make that line a continuation.
8880 anything to the left of the equals (or colon) can be combined with anything to
8881 the right. Thus, for example,
8887 # And append any comment(s) from the actual tables. They are all
8888 # gathered here, so may not read all that well.
8889 if (@global_comments) {
8890 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8893 if ($count) { # The format differs if no code points, and needs no
8894 # explanation in that case
8895 if ($leader->write_as_invlist) {
8898 The first data line of this file begins with the letter V to indicate it is in
8899 inversion list format. The number following the V gives the number of lines
8900 remaining. Each of those remaining lines is a single number representing the
8901 starting code point of a range which goes up to but not including the number
8902 on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8903 the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8904 the property. The final line's range extends to the platform's infinity.
8909 The format of the lines of this file is:
8910 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8911 STOP is the ending point, or if omitted, the range has just one code point.
8914 if ($leader->output_range_counts) {
8916 Numbers in comments in [brackets] indicate how many code points are in the
8922 $leader->set_comment(main::join_lines($comment));
8926 # Accessors for the underlying list
8928 get_valid_code_point
8929 get_invalid_code_point
8937 return $self->_range_list->$sub(@_);
8940 } # End closure for Match_Table
8944 # The Property class represents a Unicode property, or the $perl
8945 # pseudo-property. It contains a map table initialized empty at construction
8946 # time, and for properties accessible through regular expressions, various
8947 # match tables, created through the add_match_table() method, and referenced
8948 # by the table('NAME') or tables() methods, the latter returning a list of all
8949 # of the match tables. Otherwise table operations implicitly are for the map
8952 # Most of the data in the property is actually about its map table, so it
8953 # mostly just uses that table's accessors for most methods. The two could
8954 # have been combined into one object, but for clarity because of their
8955 # differing semantics, they have been kept separate. It could be argued that
8956 # the 'file' and 'directory' fields should be kept with the map table.
8958 # Each property has a type. This can be set in the constructor, or in the
8959 # set_type accessor, but mostly it is figured out by the data. Every property
8960 # starts with unknown type, overridden by a parameter to the constructor, or
8961 # as match tables are added, or ranges added to the map table, the data is
8962 # inspected, and the type changed. After the table is mostly or entirely
8963 # filled, compute_type() should be called to finalize they analysis.
8965 # There are very few operations defined. One can safely remove a range from
8966 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
8967 # table to this one, replacing any in the intersection of the two.
8969 sub standardize { return main::standardize($_[0]); }
8970 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8974 # This hash will contain as keys, all the aliases of all properties, and
8975 # as values, pointers to their respective property objects. This allows
8976 # quick look-up of a property from any of its names.
8977 my %alias_to_property_of;
8979 sub dump_alias_to_property_of {
8982 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8987 # This is a package subroutine, not called as a method.
8988 # If the single parameter is a literal '*' it returns a list of all
8989 # defined properties.
8990 # Otherwise, the single parameter is a name, and it returns a pointer
8991 # to the corresponding property object, or undef if none.
8993 # Properties can have several different names. The 'standard' form of
8994 # each of them is stored in %alias_to_property_of as they are defined.
8995 # But it's possible that this subroutine will be called with some
8996 # variant, so if the initial lookup fails, it is repeated with the
8997 # standardized form of the input name. If found, besides returning the
8998 # result, the input name is added to the list so future calls won't
8999 # have to do the conversion again.
9003 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9005 if (! defined $name) {
9006 Carp::my_carp_bug("Undefined input property. No action taken.");
9010 return main::uniques(values %alias_to_property_of) if $name eq '*';
9012 # Return cached result if have it.
9013 my $result = $alias_to_property_of{$name};
9014 return $result if defined $result;
9016 # Convert the input to standard form.
9017 my $standard_name = standardize($name);
9019 $result = $alias_to_property_of{$standard_name};
9020 return unless defined $result; # Don't cache undefs
9022 # Cache the result before returning it.
9023 $alias_to_property_of{$name} = $result;
9028 main::setup_package();
9031 # A pointer to the map table object for this property
9032 main::set_access('map', \%map);
9035 # The property's full name. This is a duplicate of the copy kept in the
9036 # map table, but is needed because stringify needs it during
9037 # construction of the map table, and then would have a chicken before egg
9039 main::set_access('full_name', \%full_name, 'r');
9042 # This hash will contain as keys, all the aliases of any match tables
9043 # attached to this property, and as values, the pointers to their
9044 # respective tables. This allows quick look-up of a table from any of its
9046 main::set_access('table_ref', \%table_ref);
9049 # The type of the property, $ENUM, $BINARY, etc
9050 main::set_access('type', \%type, 'r');
9053 # The filename where the map table will go (if actually written).
9054 # Normally defaulted, but can be overridden.
9055 main::set_access('file', \%file, 'r', 's');
9058 # The directory where the map table will go (if actually written).
9059 # Normally defaulted, but can be overridden.
9060 main::set_access('directory', \%directory, 's');
9062 my %pseudo_map_type;
9063 # This is used to affect the calculation of the map types for all the
9064 # ranges in the table. It should be set to one of the values that signify
9065 # to alter the calculation.
9066 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
9068 my %has_only_code_point_maps;
9069 # A boolean used to help in computing the type of data in the map table.
9070 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
9073 # A list of the first few distinct mappings this property has. This is
9074 # used to disambiguate between binary and enum property types, so don't
9075 # have to keep more than three.
9076 main::set_access('unique_maps', \%unique_maps);
9078 my %pre_declared_maps;
9079 # A boolean that gives whether the input data should declare all the
9080 # tables used, or not. If the former, unknown ones raise a warning.
9081 main::set_access('pre_declared_maps',
9082 \%pre_declared_maps, 'r', 's');
9085 # A boolean that gives whether some table somewhere is defined as the
9086 # complement of a table in this property. This is a crude, but currently
9087 # sufficient, mechanism to make this property not get destroyed before
9088 # what is dependent on it is. Other dependencies could be added, so the
9089 # name was chosen to reflect a more general situation than actually is
9090 # currently the case.
9091 main::set_access('has_dependency', \%has_dependency, 'r', 's');
9094 # The only required parameter is the positionally first, name. All
9095 # other parameters are key => value pairs. See the documentation just
9096 # above for the meanings of the ones not passed directly on to the map
9097 # table constructor.
9100 my $name = shift || "";
9102 my $self = property_ref($name);
9103 if (defined $self) {
9104 my $options_string = join ", ", @_;
9105 $options_string = ". Ignoring options $options_string" if $options_string;
9106 Carp::my_carp("$self is already in use. Using existing one$options_string;");
9112 $self = bless \do { my $anonymous_scalar }, $class;
9113 my $addr = do { no overloading; pack 'J', $self; };
9115 $directory{$addr} = delete $args{'Directory'};
9116 $file{$addr} = delete $args{'File'};
9117 $full_name{$addr} = delete $args{'Full_Name'} || $name;
9118 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
9119 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
9120 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
9121 # Starting in this release, property
9122 # values should be defined for all
9123 # properties, except those overriding this
9124 // $v_version ge v5.1.0;
9126 # Rest of parameters passed on.
9128 $has_only_code_point_maps{$addr} = 1;
9129 $table_ref{$addr} = { };
9130 $unique_maps{$addr} = { };
9131 $has_dependency{$addr} = 0;
9133 $map{$addr} = Map_Table->new($name,
9134 Full_Name => $full_name{$addr},
9135 _Alias_Hash => \%alias_to_property_of,
9141 # See this program's beginning comment block about overloading the copy
9142 # constructor. Few operations are defined on properties, but a couple are
9143 # useful. It is safe to take the inverse of a property, and to remove a
9144 # single code point from it.
9147 qw("") => "_operator_stringify",
9148 "." => \&main::_operator_dot,
9149 ".=" => \&main::_operator_dot_equal,
9150 '==' => \&main::_operator_equal,
9151 '!=' => \&main::_operator_not_equal,
9152 '=' => sub { return shift },
9153 '-=' => "_minus_and_equal",
9156 sub _operator_stringify {
9157 return "Property '" . shift->full_name . "'";
9160 sub _minus_and_equal {
9161 # Remove a single code point from the map table of a property.
9165 my $reversed = shift;
9166 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9169 Carp::my_carp_bug("Bad news. Can't cope with a "
9171 . " argument to '-='. Subtraction ignored.");
9174 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
9175 Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
9177 . " from a non-object. undef returned.");
9182 $map{pack 'J', $self}->delete_range($other, $other);
9187 sub add_match_table {
9188 # Add a new match table for this property, with name given by the
9189 # parameter. It returns a pointer to the table.
9195 my $addr = do { no overloading; pack 'J', $self; };
9197 my $table = $table_ref{$addr}{$name};
9198 my $standard_name = main::standardize($name);
9200 || (defined ($table = $table_ref{$addr}{$standard_name})))
9202 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
9203 $table_ref{$addr}{$name} = $table;
9208 # See if this is a perl extension, if not passed in.
9209 my $perl_extension = delete $args{'Perl_Extension'};
9211 = $self->perl_extension if ! defined $perl_extension;
9214 my $suppression_reason = "";
9215 if ($self->name =~ /^_/) {
9216 $fate = $SUPPRESSED;
9217 $suppression_reason = "Parent property is internal only";
9219 elsif ($self->fate >= $SUPPRESSED) {
9220 $fate = $self->fate;
9221 $suppression_reason = $why_suppressed{$self->complete_name};
9224 elsif ($name =~ /^_/) {
9225 $fate = $INTERNAL_ONLY;
9227 $table = Match_Table->new(
9229 Perl_Extension => $perl_extension,
9230 _Alias_Hash => $table_ref{$addr},
9233 Suppression_Reason => $suppression_reason,
9234 Status => $self->status,
9235 _Status_Info => $self->status_info,
9237 return unless defined $table;
9240 # Save the names for quick look up
9241 $table_ref{$addr}{$standard_name} = $table;
9242 $table_ref{$addr}{$name} = $table;
9244 # Perhaps we can figure out the type of this property based on the
9245 # fact of adding this match table. First, string properties don't
9246 # have match tables; second, a binary property can't have 3 match
9248 if ($type{$addr} == $UNKNOWN) {
9249 $type{$addr} = $NON_STRING;
9251 elsif ($type{$addr} == $STRING) {
9252 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
9253 $type{$addr} = $NON_STRING;
9255 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
9256 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
9257 if ($type{$addr} == $BINARY) {
9258 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
9260 $type{$addr} = $ENUM;
9267 sub delete_match_table {
9268 # Delete the table referred to by $2 from the property $1.
9271 my $table_to_remove = shift;
9272 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9274 my $addr = do { no overloading; pack 'J', $self; };
9276 # Remove all names that refer to it.
9277 foreach my $key (keys %{$table_ref{$addr}}) {
9278 delete $table_ref{$addr}{$key}
9279 if $table_ref{$addr}{$key} == $table_to_remove;
9282 $table_to_remove->DESTROY;
9287 # Return a pointer to the match table (with name given by the
9288 # parameter) associated with this property; undef if none.
9292 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9294 my $addr = do { no overloading; pack 'J', $self; };
9296 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9298 # If quick look-up failed, try again using the standard form of the
9299 # input name. If that succeeds, cache the result before returning so
9300 # won't have to standardize this input name again.
9301 my $standard_name = main::standardize($name);
9302 return unless defined $table_ref{$addr}{$standard_name};
9304 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9305 return $table_ref{$addr}{$name};
9309 # Return a list of pointers to all the match tables attached to this
9313 return main::uniques(values %{$table_ref{pack 'J', shift}});
9317 # Returns the directory the map table for this property should be
9318 # output in. If a specific directory has been specified, that has
9319 # priority; 'undef' is returned if the type isn't defined;
9320 # or $map_directory for everything else.
9322 my $addr = do { no overloading; pack 'J', shift; };
9324 return $directory{$addr} if defined $directory{$addr};
9325 return undef if $type{$addr} == $UNKNOWN;
9326 return $map_directory;
9330 # Return the name that is used to both:
9331 # 1) Name the file that the map table is written to.
9332 # 2) The name of swash related stuff inside that file.
9333 # The reason for this is that the Perl core historically has used
9334 # certain names that aren't the same as the Unicode property names.
9335 # To continue using these, $file is hard-coded in this file for those,
9336 # but otherwise the standard name is used. This is different from the
9337 # external_name, so that the rest of the files, like in lib can use
9338 # the standard name always, without regard to historical precedent.
9341 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9343 my $addr = do { no overloading; pack 'J', $self; };
9345 # Swash names are used only on either
9346 # 1) legacy-only properties, because the formats for these are
9347 # unchangeable, and they have had these lines in them; or
9348 # 2) regular or internal-only map tables
9349 # 3) otherwise there should be no access to the
9350 # property map table from other parts of Perl.
9351 return if $map{$addr}->fate != $ORDINARY
9352 && $map{$addr}->fate != $LEGACY_ONLY
9353 && ! ($map{$addr}->name =~ /^_/
9354 && $map{$addr}->fate == $INTERNAL_ONLY);
9356 return $file{$addr} if defined $file{$addr};
9357 return $map{$addr}->external_name;
9360 sub to_create_match_tables {
9361 # Returns a boolean as to whether or not match tables should be
9362 # created for this property.
9365 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9367 # The whole point of this pseudo property is match tables.
9368 return 1 if $self == $perl;
9370 my $addr = do { no overloading; pack 'J', $self; };
9372 # Don't generate tables of code points that match the property values
9373 # of a string property. Such a list would most likely have many
9374 # property values, each with just one or very few code points mapping
9376 return 0 if $type{$addr} == $STRING;
9382 sub property_add_or_replace_non_nulls {
9383 # This adds the mappings in the property $other to $self. Non-null
9384 # mappings from $other override those in $self. It essentially merges
9385 # the two properties, with the second having priority except for null
9390 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9392 if (! $other->isa(__PACKAGE__)) {
9393 Carp::my_carp_bug("$other should be a "
9402 return $map{pack 'J', $self}->map_add_or_replace_non_nulls($map{pack 'J', $other});
9406 # Certain tables are not generally written out to files, but
9407 # Unicode::UCD has the intelligence to know that the file for $self
9408 # can be used to reconstruct those tables. This routine just changes
9409 # things so that UCD pod entries for those suppressed tables are
9410 # generated, so the fact that a proxy is used is invisible to the
9415 foreach my $property_name (@_) {
9416 my $ref = property_ref($property_name);
9417 next if $ref->to_output_map;
9418 $ref->set_fate($MAP_PROXIED);
9423 # Set the type of the property. Mostly this is figured out by the
9424 # data in the table. But this is used to set it explicitly. The
9425 # reason it is not a standard accessor is that when setting a binary
9426 # property, we need to make sure that all the true/false aliases are
9427 # present, as they were omitted in early Unicode releases.
9431 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9435 && $type != $FORCED_BINARY
9436 && $type != $STRING)
9438 Carp::my_carp("Unrecognized type '$type'. Type not set");
9442 { no overloading; $type{pack 'J', $self} = $type; }
9443 return if $type != $BINARY && $type != $FORCED_BINARY;
9445 my $yes = $self->table('Y');
9446 $yes = $self->table('Yes') if ! defined $yes;
9447 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9450 # Add aliases in order wanted, duplicates will be ignored. We use a
9451 # binary property present in all releases for its ordered lists of
9452 # true/false aliases. Note, that could run into problems in
9453 # outputting things in that we don't distinguish between the name and
9454 # full name of these. Hopefully, if the table was already created
9455 # before this code is executed, it was done with these set properly.
9456 my $bm = property_ref("Bidi_Mirrored");
9457 foreach my $alias ($bm->table("Y")->aliases) {
9458 $yes->add_alias($alias->name);
9460 my $no = $self->table('N');
9461 $no = $self->table('No') if ! defined $no;
9462 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9463 foreach my $alias ($bm->table("N")->aliases) {
9464 $no->add_alias($alias->name);
9471 # Add a map to the property's map table. This also keeps
9472 # track of the maps so that the property type can be determined from
9476 my $start = shift; # First code point in range
9477 my $end = shift; # Final code point in range
9478 my $map = shift; # What the range maps to.
9479 # Rest of parameters passed on.
9481 my $addr = do { no overloading; pack 'J', $self; };
9483 # If haven't the type of the property, gather information to figure it
9485 if ($type{$addr} == $UNKNOWN) {
9487 # If the map contains an interior blank or dash, or most other
9488 # nonword characters, it will be a string property. This
9489 # heuristic may actually miss some string properties. If so, they
9490 # may need to have explicit set_types called for them. This
9491 # happens in the Unihan properties.
9492 if ($map =~ / (?<= . ) [ -] (?= . ) /x
9493 || $map =~ / [^\w.\/\ -] /x)
9495 $self->set_type($STRING);
9497 # $unique_maps is used for disambiguating between ENUM and
9498 # BINARY later; since we know the property is not going to be
9499 # one of those, no point in keeping the data around
9500 undef $unique_maps{$addr};
9504 # Not necessarily a string. The final decision has to be
9505 # deferred until all the data are in. We keep track of if all
9506 # the values are code points for that eventual decision.
9507 $has_only_code_point_maps{$addr} &=
9508 $map =~ / ^ $code_point_re $/x;
9510 # For the purposes of disambiguating between binary and other
9511 # enumerations at the end, we keep track of the first three
9512 # distinct property values. Once we get to three, we know
9513 # it's not going to be binary, so no need to track more.
9514 if (scalar keys %{$unique_maps{$addr}} < 3) {
9515 $unique_maps{$addr}{main::standardize($map)} = 1;
9520 # Add the mapping by calling our map table's method
9521 return $map{$addr}->add_map($start, $end, $map, @_);
9525 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
9526 # should be called after the property is mostly filled with its maps.
9527 # We have been keeping track of what the property values have been,
9528 # and now have the necessary information to figure out the type.
9531 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9533 my $addr = do { no overloading; pack 'J', $self; };
9535 my $type = $type{$addr};
9537 # If already have figured these out, no need to do so again, but we do
9538 # a double check on ENUMS to make sure that a string property hasn't
9539 # improperly been classified as an ENUM, so continue on with those.
9540 return if $type == $STRING
9542 || $type == $FORCED_BINARY;
9544 # If every map is to a code point, is a string property.
9545 if ($type == $UNKNOWN
9546 && ($has_only_code_point_maps{$addr}
9547 || (defined $map{$addr}->default_map
9548 && $map{$addr}->default_map eq "")))
9550 $self->set_type($STRING);
9554 # Otherwise, it is to some sort of enumeration. (The case where
9555 # it is a Unicode miscellaneous property, and treated like a
9556 # string in this program is handled in add_map()). Distinguish
9557 # between binary and some other enumeration type. Of course, if
9558 # there are more than two values, it's not binary. But more
9559 # subtle is the test that the default mapping is defined means it
9560 # isn't binary. This in fact may change in the future if Unicode
9561 # changes the way its data is structured. But so far, no binary
9562 # properties ever have @missing lines for them, so the default map
9563 # isn't defined for them. The few properties that are two-valued
9564 # and aren't considered binary have the default map defined
9565 # starting in Unicode 5.0, when the @missing lines appeared; and
9566 # this program has special code to put in a default map for them
9567 # for earlier than 5.0 releases.
9569 || scalar keys %{$unique_maps{$addr}} > 2
9570 || defined $self->default_map)
9572 my $tables = $self->tables;
9573 my $count = $self->count;
9574 if ($verbosity && $tables > 500 && $tables/$count > .1) {
9575 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9577 $self->set_type($ENUM);
9580 $self->set_type($BINARY);
9583 undef $unique_maps{$addr}; # Garbage collect
9590 my $reason = shift; # Ignored unless suppressing
9591 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
9593 my $addr = do { no overloading; pack 'J', $self; };
9594 if ($fate >= $SUPPRESSED) {
9595 $why_suppressed{$self->complete_name} = $reason;
9598 # Each table shares the property's fate, except that MAP_PROXIED
9599 # doesn't affect match tables
9600 $map{$addr}->set_fate($fate, $reason);
9601 if ($fate != $MAP_PROXIED) {
9602 foreach my $table ($map{$addr}, $self->tables) {
9603 $table->set_fate($fate, $reason);
9610 # Most of the accessors for a property actually apply to its map table.
9611 # Setup up accessor functions for those, referring to %map
9637 replacement_property
9663 # 'property' above is for symmetry, so that one can take
9664 # the property of a property and get itself, and so don't
9665 # have to distinguish between properties and tables in
9673 return $map{pack 'J', $self}->$sub(@_);
9683 # Converts an ordinal printable character value to a displayable string,
9684 # using a dotted circle to hold combining characters.
9688 return $chr if $ccc->table(0)->contains($ord);
9689 return "\x{25CC}$chr";
9693 # Returns lines of the input joined together, so that they can be folded
9695 # This causes continuation lines to be joined together into one long line
9696 # for folding. A continuation line is any line that doesn't begin with a
9697 # space or "\b" (the latter is stripped from the output). This is so
9698 # lines can be be in a HERE document so as to fit nicely in the terminal
9699 # width, but be joined together in one long line, and then folded with
9700 # indents, '#' prefixes, etc, properly handled.
9701 # A blank separates the joined lines except if there is a break; an extra
9702 # blank is inserted after a period ending a line.
9704 # Initialize the return with the first line.
9705 my ($return, @lines) = split "\n", shift;
9707 # If the first line is null, it was an empty line, add the \n back in
9708 $return = "\n" if $return eq "";
9710 # Now join the remainder of the physical lines.
9711 for my $line (@lines) {
9713 # An empty line means wanted a blank line, so add two \n's to get that
9714 # effect, and go to the next line.
9715 if (length $line == 0) {
9720 # Look at the last character of what we have so far.
9721 my $previous_char = substr($return, -1, 1);
9723 # And at the next char to be output.
9724 my $next_char = substr($line, 0, 1);
9726 if ($previous_char ne "\n") {
9728 # Here didn't end wth a nl. If the next char a blank or \b, it
9729 # means that here there is a break anyway. So add a nl to the
9731 if ($next_char eq " " || $next_char eq "\b") {
9732 $previous_char = "\n";
9733 $return .= $previous_char;
9736 # Add an extra space after periods.
9737 $return .= " " if $previous_char eq '.';
9740 # Here $previous_char is still the latest character to be output. If
9741 # it isn't a nl, it means that the next line is to be a continuation
9742 # line, with a blank inserted between them.
9743 $return .= " " if $previous_char ne "\n";
9746 substr($line, 0, 1) = "" if $next_char eq "\b";
9748 # And append this next line.
9755 sub simple_fold($;$$$) {
9756 # Returns a string of the input (string or an array of strings) folded
9757 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9759 # This is tailored for the kind of text written by this program,
9760 # especially the pod file, which can have very long names with
9761 # underscores in the middle, or words like AbcDefgHij.... We allow
9762 # breaking in the middle of such constructs if the line won't fit
9763 # otherwise. The break in such cases will come either just after an
9764 # underscore, or just before one of the Capital letters.
9766 local $to_trace = 0 if main::DEBUG;
9769 my $prefix = shift; # Optional string to prepend to each output
9771 $prefix = "" unless defined $prefix;
9773 my $hanging_indent = shift; # Optional number of spaces to indent
9774 # continuation lines
9775 $hanging_indent = 0 unless $hanging_indent;
9777 my $right_margin = shift; # Optional number of spaces to narrow the
9779 $right_margin = 0 unless defined $right_margin;
9781 # Call carp with the 'nofold' option to avoid it from trying to call us
9783 Carp::carp_extra_args(\@_, 'nofold') if main::DEBUG && @_;
9785 # The space available doesn't include what's automatically prepended
9786 # to each line, or what's reserved on the right.
9787 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9788 # XXX Instead of using the 'nofold' perhaps better to look up the stack
9790 if (DEBUG && $hanging_indent >= $max) {
9791 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
9792 $hanging_indent = 0;
9795 # First, split into the current physical lines.
9797 if (ref $line) { # Better be an array, because not bothering to
9799 foreach my $line (@{$line}) {
9800 push @line, split /\n/, $line;
9804 @line = split /\n/, $line;
9807 #local $to_trace = 1 if main::DEBUG;
9808 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9810 # Look at each current physical line.
9811 for (my $i = 0; $i < @line; $i++) {
9812 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9813 #local $to_trace = 1 if main::DEBUG;
9814 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9816 # Remove prefix, because will be added back anyway, don't want
9818 $line[$i] =~ s/^$prefix//;
9820 # Remove trailing space
9821 $line[$i] =~ s/\s+\Z//;
9823 # If the line is too long, fold it.
9824 if (length $line[$i] > $max) {
9827 # Here needs to fold. Save the leading space in the line for
9829 $line[$i] =~ /^ ( \s* )/x;
9830 my $leading_space = $1;
9831 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9833 # If character at final permissible position is white space,
9834 # fold there, which will delete that white space
9835 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9836 $remainder = substr($line[$i], $max);
9837 $line[$i] = substr($line[$i], 0, $max - 1);
9841 # Otherwise fold at an acceptable break char closest to
9842 # the max length. Look at just the maximal initial
9843 # segment of the line
9844 my $segment = substr($line[$i], 0, $max - 1);
9846 /^ ( .{$hanging_indent} # Don't look before the
9848 \ * # Don't look in leading
9849 # blanks past the indent
9850 [^ ] .* # Find the right-most
9851 (?: # acceptable break:
9852 [ \s = ] # space or equal
9853 | - (?! [.0-9] ) # or non-unary minus.
9854 ) # $1 includes the character
9857 # Split into the initial part that fits, and remaining
9859 $remainder = substr($line[$i], length $1);
9861 trace $line[$i] if DEBUG && $to_trace;
9862 trace $remainder if DEBUG && $to_trace;
9865 # If didn't find a good breaking spot, see if there is a
9866 # not-so-good breaking spot. These are just after
9867 # underscores or where the case changes from lower to
9868 # upper. Use \a as a soft hyphen, but give up
9869 # and don't break the line if there is actually a \a
9870 # already in the input. We use an ascii character for the
9871 # soft-hyphen to avoid any attempt by miniperl to try to
9872 # access the files that this program is creating.
9873 elsif ($segment !~ /\a/
9874 && ($segment =~ s/_/_\a/g
9875 || $segment =~ s/ ( [a-z] ) (?= [A-Z] )/$1\a/xg))
9877 # Here were able to find at least one place to insert
9878 # our substitute soft hyphen. Find the right-most one
9879 # and replace it by a real hyphen.
9880 trace $segment if DEBUG && $to_trace;
9882 rindex($segment, "\a"),
9885 # Then remove the soft hyphen substitutes.
9886 $segment =~ s/\a//g;
9887 trace $segment if DEBUG && $to_trace;
9889 # And split into the initial part that fits, and
9890 # remainder of the line
9891 my $pos = rindex($segment, '-');
9892 $remainder = substr($line[$i], $pos);
9893 trace $remainder if DEBUG && $to_trace;
9894 $line[$i] = substr($segment, 0, $pos + 1);
9898 # Here we know if we can fold or not. If we can, $remainder
9899 # is what remains to be processed in the next iteration.
9900 if (defined $remainder) {
9901 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9903 # Insert the folded remainder of the line as a new element
9904 # of the array. (It may still be too long, but we will
9905 # deal with that next time through the loop.) Omit any
9906 # leading space in the remainder.
9907 $remainder =~ s/^\s+//;
9908 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9910 # But then indent by whichever is larger of:
9911 # 1) the leading space on the input line;
9912 # 2) the hanging indent.
9913 # This preserves indentation in the original line.
9914 my $lead = ($leading_space)
9915 ? length $leading_space
9917 $lead = max($lead, $hanging_indent);
9918 splice @line, $i+1, 0, (" " x $lead) . $remainder;
9922 # Ready to output the line. Get rid of any trailing space
9923 # And prefix by the required $prefix passed in.
9924 $line[$i] =~ s/\s+$//;
9925 $line[$i] = "$prefix$line[$i]\n";
9926 } # End of looping through all the lines.
9928 return join "", @line;
9931 sub property_ref { # Returns a reference to a property object.
9932 return Property::property_ref(@_);
9935 sub force_unlink ($) {
9936 my $filename = shift;
9937 return unless file_exists($filename);
9938 return if CORE::unlink($filename);
9940 # We might need write permission
9941 chmod 0777, $filename;
9942 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
9947 # Given a filename and references to arrays of lines, write the lines of
9948 # each array to the file
9949 # Filename can be given as an arrayref of directory names
9951 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
9954 my $use_utf8 = shift;
9956 # Get into a single string if an array, and get rid of, in Unix terms, any
9958 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9959 $file = File::Spec->canonpath($file);
9961 # If has directories, make sure that they all exist
9962 (undef, my $directories, undef) = File::Spec->splitpath($file);
9963 File::Path::mkpath($directories) if $directories && ! -d $directories;
9965 push @files_actually_output, $file;
9967 force_unlink ($file);
9970 if (not open $OUT, ">", $file) {
9971 Carp::my_carp("can't open $file for output. Skipping this file: $!");
9975 binmode $OUT, ":utf8" if $use_utf8;
9977 while (defined (my $lines_ref = shift)) {
9978 unless (@$lines_ref) {
9979 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9982 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9984 close $OUT or die Carp::my_carp("close '$file' failed: $!");
9986 print "$file written.\n" if $verbosity >= $VERBOSE;
9992 sub Standardize($) {
9993 # This converts the input name string into a standardized equivalent to
9997 unless (defined $name) {
9998 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
10002 # Remove any leading or trailing white space
10003 $name =~ s/^\s+//g;
10004 $name =~ s/\s+$//g;
10006 # Convert interior white space and hyphens into underscores.
10007 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
10009 # Capitalize the letter following an underscore, and convert a sequence of
10010 # multiple underscores to a single one
10011 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
10013 # And capitalize the first letter, but not for the special cjk ones.
10014 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
10018 sub standardize ($) {
10019 # Returns a lower-cased standardized name, without underscores. This form
10020 # is chosen so that it can distinguish between any real versus superficial
10021 # Unicode name differences. It relies on the fact that Unicode doesn't
10022 # have interior underscores, white space, nor dashes in any
10023 # stricter-matched name. It should not be used on Unicode code point
10024 # names (the Name property), as they mostly, but not always follow these
10027 my $name = Standardize(shift);
10028 return if !defined $name;
10030 $name =~ s/ (?<= .) _ (?= . ) //xg;
10034 sub utf8_heavy_name ($$) {
10035 # Returns the name that utf8_heavy.pl will use to find a table. XXX
10036 # perhaps this function should be placed somewhere, like Heavy.pl so that
10037 # utf8_heavy can use it directly without duplicating code that can get
10042 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10044 my $property = $table->property;
10045 $property = ($property == $perl)
10046 ? "" # 'perl' is never explicitly stated
10047 : standardize($property->name) . '=';
10048 if ($alias->loose_match) {
10049 return $property . standardize($alias->name);
10052 return lc ($property . $alias->name);
10060 my $indent_increment = " " x (($debugging_build) ? 2 : 0);
10061 %main::already_output = ();
10063 $main::simple_dumper_nesting = 0;
10065 sub simple_dumper {
10066 # Like Simple Data::Dumper. Good enough for our needs. We can't use
10067 # the real thing as we have to run under miniperl.
10069 # It is designed so that on input it is at the beginning of a line,
10070 # and the final thing output in any call is a trailing ",\n".
10073 my $indent = shift;
10074 $indent = "" if ! $debugging_build || ! defined $indent;
10076 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10078 # nesting level is localized, so that as the call stack pops, it goes
10079 # back to the prior value.
10080 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
10081 local %main::already_output = %main::already_output;
10082 $main::simple_dumper_nesting++;
10083 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
10085 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10087 # Determine the indent for recursive calls.
10088 my $next_indent = $indent . $indent_increment;
10093 # Dump of scalar: just output it in quotes if not a number. To do
10094 # so we must escape certain characters, and therefore need to
10095 # operate on a copy to avoid changing the original
10097 $copy = $UNDEF unless defined $copy;
10099 # Quote non-integers (integers also have optional leading '-')
10100 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
10102 # Escape apostrophe and backslash
10103 $copy =~ s/ ( ['\\] ) /\\$1/xg;
10106 $output = "$indent$copy,\n";
10110 # Keep track of cycles in the input, and refuse to infinitely loop
10111 my $addr = do { no overloading; pack 'J', $item; };
10112 if (defined $main::already_output{$addr}) {
10113 return "${indent}ALREADY OUTPUT: $item\n";
10115 $main::already_output{$addr} = $item;
10117 if (ref $item eq 'ARRAY') {
10118 my $using_brackets;
10120 if ($main::simple_dumper_nesting > 1) {
10122 $using_brackets = 1;
10125 $using_brackets = 0;
10128 # If the array is empty, put the closing bracket on the same
10129 # line. Otherwise, recursively add each array element
10135 for (my $i = 0; $i < @$item; $i++) {
10137 # Indent array elements one level
10138 $output .= &simple_dumper($item->[$i], $next_indent);
10139 next if ! $debugging_build;
10140 $output =~ s/\n$//; # Remove any trailing nl so
10141 $output .= " # [$i]\n"; # as to add a comment giving
10144 $output .= $indent; # Indent closing ']' to orig level
10146 $output .= ']' if $using_brackets;
10149 elsif (ref $item eq 'HASH') {
10154 # No surrounding braces at top level
10155 $output .= $indent;
10156 if ($main::simple_dumper_nesting > 1) {
10158 $is_first_line = 0;
10159 $body_indent = $next_indent;
10160 $next_indent .= $indent_increment;
10164 $is_first_line = 1;
10165 $body_indent = $indent;
10169 # Output hashes sorted alphabetically instead of apparently
10170 # random. Use caseless alphabetic sort
10171 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
10173 if ($is_first_line) {
10174 $is_first_line = 0;
10177 $output .= "$body_indent";
10180 # The key must be a scalar, but this recursive call quotes
10182 $output .= &simple_dumper($key);
10184 # And change the trailing comma and nl to the hash fat
10185 # comma for clarity, and so the value can be on the same
10187 $output =~ s/,\n$/ => /;
10189 # Recursively call to get the value's dump.
10190 my $next = &simple_dumper($item->{$key}, $next_indent);
10192 # If the value is all on one line, remove its indent, so
10193 # will follow the => immediately. If it takes more than
10194 # one line, start it on a new line.
10195 if ($next !~ /\n.*\n/) {
10204 $output .= "$indent},\n" if $using_braces;
10206 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
10207 $output = $indent . ref($item) . "\n";
10208 # XXX see if blessed
10210 elsif ($item->can('dump')) {
10212 # By convention in this program, objects furnish a 'dump'
10213 # method. Since not doing any output at this level, just pass
10214 # on the input indent
10215 $output = $item->dump($indent);
10218 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
10225 sub dump_inside_out {
10226 # Dump inside-out hashes in an object's state by converting them to a
10227 # regular hash and then calling simple_dumper on that.
10229 my $object = shift;
10230 my $fields_ref = shift;
10232 my $addr = do { no overloading; pack 'J', $object; };
10235 foreach my $key (keys %$fields_ref) {
10236 $hash{$key} = $fields_ref->{$key}{$addr};
10239 return simple_dumper(\%hash, @_);
10242 sub _operator_dot {
10243 # Overloaded '.' method that is common to all packages. It uses the
10244 # package's stringify method.
10248 my $reversed = shift;
10249 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10251 $other = "" unless defined $other;
10253 foreach my $which (\$self, \$other) {
10254 next unless ref $$which;
10255 if ($$which->can('_operator_stringify')) {
10256 $$which = $$which->_operator_stringify;
10259 my $ref = ref $$which;
10260 my $addr = do { no overloading; pack 'J', $$which; };
10261 $$which = "$ref ($addr)";
10269 sub _operator_dot_equal {
10270 # Overloaded '.=' method that is common to all packages.
10274 my $reversed = shift;
10275 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10277 $other = "" unless defined $other;
10280 return $other .= "$self";
10283 return "$self" . "$other";
10287 sub _operator_equal {
10288 # Generic overloaded '==' routine. To be equal, they must be the exact
10294 return 0 unless defined $other;
10295 return 0 unless ref $other;
10297 return $self == $other;
10300 sub _operator_not_equal {
10304 return ! _operator_equal($self, $other);
10307 sub substitute_PropertyAliases($) {
10308 # Deal with early releases that don't have the crucial PropertyAliases.txt
10311 my $file_object = shift;
10312 $file_object->insert_lines(get_old_property_aliases());
10314 process_PropertyAliases($file_object);
10318 sub process_PropertyAliases($) {
10319 # This reads in the PropertyAliases.txt file, which contains almost all
10320 # the character properties in Unicode and their equivalent aliases:
10321 # scf ; Simple_Case_Folding ; sfc
10323 # Field 0 is the preferred short name for the property.
10324 # Field 1 is the full name.
10325 # Any succeeding ones are other accepted names.
10328 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10330 # Add any cjk properties that may have been defined.
10331 $file->insert_lines(@cjk_properties);
10333 while ($file->next_line) {
10335 my @data = split /\s*;\s*/;
10337 my $full = $data[1];
10339 # This line is defective in early Perls. The property in Unihan.txt
10341 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
10342 push @data, qw(cjkRSUnicode kRSUnicode);
10345 my $this = Property->new($data[0], Full_Name => $full);
10347 $this->set_fate($SUPPRESSED, $why_suppressed{$full})
10348 if $why_suppressed{$full};
10350 # Start looking for more aliases after these two.
10351 for my $i (2 .. @data - 1) {
10352 $this->add_alias($data[$i]);
10357 my $scf = property_ref("Simple_Case_Folding");
10358 $scf->add_alias("scf");
10359 $scf->add_alias("sfc");
10364 sub finish_property_setup {
10365 # Finishes setting up after PropertyAliases.
10368 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10370 # This entry was missing from this file in earlier Unicode versions
10371 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
10372 Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10375 # These are used so much, that we set globals for them.
10376 $gc = property_ref('General_Category');
10377 $block = property_ref('Block');
10378 $script = property_ref('Script');
10379 $age = property_ref('Age');
10381 # Perl adds this alias.
10382 $gc->add_alias('Category');
10384 # Unicode::Normalize expects this file with this name and directory.
10385 $ccc = property_ref('Canonical_Combining_Class');
10386 if (defined $ccc) {
10387 $ccc->set_file('CombiningClass');
10388 $ccc->set_directory(File::Spec->curdir());
10391 # These two properties aren't actually used in the core, but unfortunately
10392 # the names just above that are in the core interfere with these, so
10393 # choose different names. These aren't a problem unless the map tables
10394 # for these files get written out.
10395 my $lowercase = property_ref('Lowercase');
10396 $lowercase->set_file('IsLower') if defined $lowercase;
10397 my $uppercase = property_ref('Uppercase');
10398 $uppercase->set_file('IsUpper') if defined $uppercase;
10400 # Set up the hard-coded default mappings, but only on properties defined
10402 foreach my $property (keys %default_mapping) {
10403 my $property_object = property_ref($property);
10404 next if ! defined $property_object;
10405 my $default_map = $default_mapping{$property};
10406 $property_object->set_default_map($default_map);
10408 # A map of <code point> implies the property is string.
10409 if ($property_object->type == $UNKNOWN
10410 && $default_map eq $CODE_POINT)
10412 $property_object->set_type($STRING);
10416 # The following use the Multi_Default class to create objects for
10419 # Bidi class has a complicated default, but the derived file takes care of
10420 # the complications, leaving just 'L'.
10421 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10422 property_ref('Bidi_Class')->set_default_map('L');
10427 # The derived file was introduced in 3.1.1. The values below are
10428 # taken from table 3-8, TUS 3.0
10430 'my $default = Range_List->new;
10431 $default->add_range(0x0590, 0x05FF);
10432 $default->add_range(0xFB1D, 0xFB4F);'
10435 # The defaults apply only to unassigned characters
10436 $default_R .= '$gc->table("Unassigned") & $default;';
10438 if ($v_version lt v3.0.0) {
10439 $default = Multi_Default->new(R => $default_R, 'L');
10443 # AL apparently not introduced until 3.0: TUS 2.x references are
10444 # not on-line to check it out
10446 'my $default = Range_List->new;
10447 $default->add_range(0x0600, 0x07BF);
10448 $default->add_range(0xFB50, 0xFDFF);
10449 $default->add_range(0xFE70, 0xFEFF);'
10452 # Non-character code points introduced in this release; aren't AL
10453 if ($v_version ge 3.1.0) {
10454 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10456 $default_AL .= '$gc->table("Unassigned") & $default';
10457 $default = Multi_Default->new(AL => $default_AL,
10461 property_ref('Bidi_Class')->set_default_map($default);
10464 # Joining type has a complicated default, but the derived file takes care
10465 # of the complications, leaving just 'U' (or Non_Joining), except the file
10467 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10468 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10469 property_ref('Joining_Type')->set_default_map('Non_Joining');
10473 # Otherwise, there are not one, but two possibilities for the
10474 # missing defaults: T and U.
10475 # The missing defaults that evaluate to T are given by:
10476 # T = Mn + Cf - ZWNJ - ZWJ
10477 # where Mn and Cf are the general category values. In other words,
10478 # any non-spacing mark or any format control character, except
10479 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10480 # WIDTH JOINER (joining type C).
10481 my $default = Multi_Default->new(
10482 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10484 property_ref('Joining_Type')->set_default_map($default);
10488 # Line break has a complicated default in early releases. It is 'Unknown'
10489 # for non-assigned code points; 'AL' for assigned.
10490 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10491 my $lb = property_ref('Line_Break');
10492 if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10493 $lb->set_default_map('Unknown');
10496 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10499 $lb->set_default_map($default);
10503 # For backwards compatibility with applications that may read the mapping
10504 # file directly (it was documented in 5.12 and 5.14 as being thusly
10505 # usable), keep it from being adjusted. (range_size_1 is
10506 # used to force the traditional format.)
10507 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10508 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10509 $nfkc_cf->set_range_size_1(1);
10511 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10512 $bmg->set_to_output_map($EXTERNAL_MAP);
10513 $bmg->set_range_size_1(1);
10516 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10521 sub get_old_property_aliases() {
10522 # Returns what would be in PropertyAliases.txt if it existed in very old
10523 # versions of Unicode. It was derived from the one in 3.2, and pared
10524 # down based on the data that was actually in the older releases.
10525 # An attempt was made to use the existence of files to mean inclusion or
10526 # not of various aliases, but if this was not sufficient, using version
10527 # numbers was resorted to.
10531 # These are to be used in all versions (though some are constructed by
10532 # this program if missing)
10533 push @return, split /\n/, <<'END';
10535 Bidi_M ; Bidi_Mirrored
10537 ccc ; Canonical_Combining_Class
10538 dm ; Decomposition_Mapping
10539 dt ; Decomposition_Type
10540 gc ; General_Category
10542 lc ; Lowercase_Mapping
10544 na1 ; Unicode_1_Name
10547 scf ; Simple_Case_Folding
10548 slc ; Simple_Lowercase_Mapping
10549 stc ; Simple_Titlecase_Mapping
10550 suc ; Simple_Uppercase_Mapping
10551 tc ; Titlecase_Mapping
10552 uc ; Uppercase_Mapping
10555 if (-e 'Blocks.txt') {
10556 push @return, "blk ; Block\n";
10558 if (-e 'ArabicShaping.txt') {
10559 push @return, split /\n/, <<'END';
10564 if (-e 'PropList.txt') {
10566 # This first set is in the original old-style proplist.
10567 push @return, split /\n/, <<'END';
10568 Bidi_C ; Bidi_Control
10576 Join_C ; Join_Control
10578 QMark ; Quotation_Mark
10579 Term ; Terminal_Punctuation
10580 WSpace ; White_Space
10582 # The next sets were added later
10583 if ($v_version ge v3.0.0) {
10584 push @return, split /\n/, <<'END';
10589 if ($v_version ge v3.0.1) {
10590 push @return, split /\n/, <<'END';
10591 NChar ; Noncharacter_Code_Point
10594 # The next sets were added in the new-style
10595 if ($v_version ge v3.1.0) {
10596 push @return, split /\n/, <<'END';
10597 OAlpha ; Other_Alphabetic
10598 OLower ; Other_Lowercase
10600 OUpper ; Other_Uppercase
10603 if ($v_version ge v3.1.1) {
10604 push @return, "AHex ; ASCII_Hex_Digit\n";
10607 if (-e 'EastAsianWidth.txt') {
10608 push @return, "ea ; East_Asian_Width\n";
10610 if (-e 'CompositionExclusions.txt') {
10611 push @return, "CE ; Composition_Exclusion\n";
10613 if (-e 'LineBreak.txt') {
10614 push @return, "lb ; Line_Break\n";
10616 if (-e 'BidiMirroring.txt') {
10617 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
10619 if (-e 'Scripts.txt') {
10620 push @return, "sc ; Script\n";
10622 if (-e 'DNormalizationProps.txt') {
10623 push @return, split /\n/, <<'END';
10624 Comp_Ex ; Full_Composition_Exclusion
10625 FC_NFKC ; FC_NFKC_Closure
10626 NFC_QC ; NFC_Quick_Check
10627 NFD_QC ; NFD_Quick_Check
10628 NFKC_QC ; NFKC_Quick_Check
10629 NFKD_QC ; NFKD_Quick_Check
10630 XO_NFC ; Expands_On_NFC
10631 XO_NFD ; Expands_On_NFD
10632 XO_NFKC ; Expands_On_NFKC
10633 XO_NFKD ; Expands_On_NFKD
10636 if (-e 'DCoreProperties.txt') {
10637 push @return, split /\n/, <<'END';
10640 XIDC ; XID_Continue
10643 # These can also appear in some versions of PropList.txt
10644 push @return, "Lower ; Lowercase\n"
10645 unless grep { $_ =~ /^Lower\b/} @return;
10646 push @return, "Upper ; Uppercase\n"
10647 unless grep { $_ =~ /^Upper\b/} @return;
10650 # This flag requires the DAge.txt file to be copied into the directory.
10651 if (DEBUG && $compare_versions) {
10652 push @return, 'age ; Age';
10658 sub substitute_PropValueAliases($) {
10659 # Deal with early releases that don't have the crucial
10660 # PropValueAliases.txt file.
10662 my $file_object = shift;
10663 $file_object->insert_lines(get_old_property_value_aliases());
10665 process_PropValueAliases($file_object);
10668 sub process_PropValueAliases {
10669 # This file contains values that properties look like:
10670 # bc ; AL ; Arabic_Letter
10671 # blk; n/a ; Greek_And_Coptic ; Greek
10673 # Field 0 is the property.
10674 # Field 1 is the short name of a property value or 'n/a' if no
10675 # short name exists;
10676 # Field 2 is the full property value name;
10677 # Any other fields are more synonyms for the property value.
10678 # Purely numeric property values are omitted from the file; as are some
10679 # others, fewer and fewer in later releases
10681 # Entries for the ccc property have an extra field before the
10683 # ccc; 0; NR ; Not_Reordered
10684 # It is the numeric value that the names are synonyms for.
10686 # There are comment entries for values missing from this file:
10687 # # @missing: 0000..10FFFF; ISO_Comment; <none>
10688 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10691 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
10693 if ($v_version lt 4.0.0) {
10694 $file->insert_lines(split /\n/, <<'END'
10695 Hangul_Syllable_Type; L ; Leading_Jamo
10696 Hangul_Syllable_Type; LV ; LV_Syllable
10697 Hangul_Syllable_Type; LVT ; LVT_Syllable
10698 Hangul_Syllable_Type; NA ; Not_Applicable
10699 Hangul_Syllable_Type; T ; Trailing_Jamo
10700 Hangul_Syllable_Type; V ; Vowel_Jamo
10704 if ($v_version lt 4.1.0) {
10705 $file->insert_lines(split /\n/, <<'END'
10706 _Perl_GCB; CN ; Control
10708 _Perl_GCB; EX ; Extend
10712 _Perl_GCB; LVT ; LVT
10715 _Perl_GCB; XX ; Other
10721 # Add any explicit cjk values
10722 $file->insert_lines(@cjk_property_values);
10724 # This line is used only for testing the code that checks for name
10725 # conflicts. There is a script Inherited, and when this line is executed
10726 # it causes there to be a name conflict with the 'Inherited' that this
10727 # program generates for this block property value
10728 #$file->insert_lines('blk; n/a; Herited');
10730 # Process each line of the file ...
10731 while ($file->next_line) {
10733 # Fix typo in input file
10734 s/CCC133/CCC132/g if $v_version eq v6.1.0;
10736 my ($property, @data) = split /\s*;\s*/;
10738 # The ccc property has an extra field at the beginning, which is the
10739 # numeric value. Move it to be after the other two, mnemonic, fields,
10740 # so that those will be used as the property value's names, and the
10741 # number will be an extra alias. (Rightmost splice removes field 1-2,
10742 # returning them in a slice; left splice inserts that before anything,
10743 # thus shifting the former field 0 to after them.)
10744 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10746 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10747 my $new_style = $data[1] =~ s/-/_/gr;
10748 splice @data, 1, 0, $new_style;
10751 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
10752 # there is no short name, use the full one in element 1
10753 if ($data[0] eq "n/a") {
10754 $data[0] = $data[1];
10756 elsif ($data[0] ne $data[1]
10757 && standardize($data[0]) eq standardize($data[1])
10758 && $data[1] !~ /[[:upper:]]/)
10760 # Also, there is a bug in the file in which "n/a" is omitted, and
10761 # the two fields are identical except for case, and the full name
10762 # is all lower case. Copy the "short" name unto the full one to
10763 # give it some upper case.
10765 $data[1] = $data[0];
10768 # Earlier releases had the pseudo property 'qc' that should expand to
10769 # the ones that replace it below.
10770 if ($property eq 'qc') {
10771 if (lc $data[0] eq 'y') {
10772 $file->insert_lines('NFC_QC; Y ; Yes',
10774 'NFKC_QC; Y ; Yes',
10775 'NFKD_QC; Y ; Yes',
10778 elsif (lc $data[0] eq 'n') {
10779 $file->insert_lines('NFC_QC; N ; No',
10785 elsif (lc $data[0] eq 'm') {
10786 $file->insert_lines('NFC_QC; M ; Maybe',
10787 'NFKC_QC; M ; Maybe',
10791 $file->carp_bad_line("qc followed by unexpected '$data[0]");
10796 # The first field is the short name, 2nd is the full one.
10797 my $property_object = property_ref($property);
10798 my $table = $property_object->add_match_table($data[0],
10799 Full_Name => $data[1]);
10801 # Start looking for more aliases after these two.
10802 for my $i (2 .. @data - 1) {
10803 $table->add_alias($data[$i]);
10805 } # End of looping through the file
10807 # As noted in the comments early in the program, it generates tables for
10808 # the default values for all releases, even those for which the concept
10809 # didn't exist at the time. Here we add those if missing.
10810 if (defined $age && ! defined $age->table('Unassigned')) {
10811 $age->add_match_table('Unassigned');
10813 $block->add_match_table('No_Block') if -e 'Blocks.txt'
10814 && ! defined $block->table('No_Block');
10817 # Now set the default mappings of the properties from the file. This is
10818 # done after the loop because a number of properties have only @missings
10819 # entries in the file, and may not show up until the end.
10820 my @defaults = $file->get_missings;
10821 foreach my $default_ref (@defaults) {
10822 my $default = $default_ref->[0];
10823 my $property = property_ref($default_ref->[1]);
10824 $property->set_default_map($default);
10829 sub get_old_property_value_aliases () {
10830 # Returns what would be in PropValueAliases.txt if it existed in very old
10831 # versions of Unicode. It was derived from the one in 3.2, and pared
10832 # down. An attempt was made to use the existence of files to mean
10833 # inclusion or not of various aliases, but if this was not sufficient,
10834 # using version numbers was resorted to.
10836 my @return = split /\n/, <<'END';
10837 bc ; AN ; Arabic_Number
10838 bc ; B ; Paragraph_Separator
10839 bc ; CS ; Common_Separator
10840 bc ; EN ; European_Number
10841 bc ; ES ; European_Separator
10842 bc ; ET ; European_Terminator
10843 bc ; L ; Left_To_Right
10844 bc ; ON ; Other_Neutral
10845 bc ; R ; Right_To_Left
10846 bc ; WS ; White_Space
10848 Bidi_M; N; No; F; False
10849 Bidi_M; Y; Yes; T; True
10851 # The standard combining classes are very much different in v1, so only use
10852 # ones that look right (not checked thoroughly)
10853 ccc; 0; NR ; Not_Reordered
10854 ccc; 1; OV ; Overlay
10856 ccc; 8; KV ; Kana_Voicing
10857 ccc; 9; VR ; Virama
10858 ccc; 202; ATBL ; Attached_Below_Left
10859 ccc; 216; ATAR ; Attached_Above_Right
10860 ccc; 218; BL ; Below_Left
10861 ccc; 220; B ; Below
10862 ccc; 222; BR ; Below_Right
10864 ccc; 228; AL ; Above_Left
10865 ccc; 230; A ; Above
10866 ccc; 232; AR ; Above_Right
10867 ccc; 234; DA ; Double_Above
10869 dt ; can ; canonical
10873 dt ; fra ; fraction
10874 dt ; init ; initial
10875 dt ; iso ; isolated
10883 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
10885 gc ; Cn ; Unassigned
10886 gc ; Co ; Private_Use
10887 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
10888 gc ; LC ; Cased_Letter # Ll | Lt | Lu
10889 gc ; Ll ; Lowercase_Letter
10890 gc ; Lm ; Modifier_Letter
10891 gc ; Lo ; Other_Letter
10892 gc ; Lu ; Uppercase_Letter
10893 gc ; M ; Mark # Mc | Me | Mn
10894 gc ; Mc ; Spacing_Mark
10895 gc ; Mn ; Nonspacing_Mark
10896 gc ; N ; Number # Nd | Nl | No
10897 gc ; Nd ; Decimal_Number
10898 gc ; No ; Other_Number
10899 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
10900 gc ; Pd ; Dash_Punctuation
10901 gc ; Pe ; Close_Punctuation
10902 gc ; Po ; Other_Punctuation
10903 gc ; Ps ; Open_Punctuation
10904 gc ; S ; Symbol # Sc | Sk | Sm | So
10905 gc ; Sc ; Currency_Symbol
10906 gc ; Sm ; Math_Symbol
10907 gc ; So ; Other_Symbol
10908 gc ; Z ; Separator # Zl | Zp | Zs
10909 gc ; Zl ; Line_Separator
10910 gc ; Zp ; Paragraph_Separator
10911 gc ; Zs ; Space_Separator
10919 if (-e 'ArabicShaping.txt') {
10920 push @return, split /\n/, <<'END';
10927 jg ; n/a ; NO_JOINING_GROUP
10935 jt ; C ; Join_Causing
10936 jt ; D ; Dual_Joining
10937 jt ; L ; Left_Joining
10938 jt ; R ; Right_Joining
10939 jt ; U ; Non_Joining
10940 jt ; T ; Transparent
10942 if ($v_version ge v3.0.0) {
10943 push @return, split /\n/, <<'END';
10947 jg ; n/a ; DALATH_RISH
10950 jg ; n/a ; FINAL_SEMKATH
10953 jg ; n/a ; HAMZA_ON_HEH_GOAL
10956 jg ; n/a ; HEH_GOAL
10960 jg ; n/a ; KNOTTED_HEH
10967 jg ; n/a ; REVERSED_PE
10971 jg ; n/a ; SWASH_KAF
10973 jg ; n/a ; TEH_MARBUTA
10976 jg ; n/a ; YEH_BARREE
10977 jg ; n/a ; YEH_WITH_TAIL
10986 if (-e 'EastAsianWidth.txt') {
10987 push @return, split /\n/, <<'END';
10997 if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10998 my @lb = split /\n/, <<'END';
10999 lb ; AI ; Ambiguous
11000 lb ; AL ; Alphabetic
11001 lb ; B2 ; Break_Both
11002 lb ; BA ; Break_After
11003 lb ; BB ; Break_Before
11004 lb ; BK ; Mandatory_Break
11005 lb ; CB ; Contingent_Break
11006 lb ; CL ; Close_Punctuation
11007 lb ; CM ; Combining_Mark
11008 lb ; CR ; Carriage_Return
11009 lb ; EX ; Exclamation
11012 lb ; ID ; Ideographic
11013 lb ; IN ; Inseperable
11014 lb ; IS ; Infix_Numeric
11015 lb ; LF ; Line_Feed
11016 lb ; NS ; Nonstarter
11018 lb ; OP ; Open_Punctuation
11019 lb ; PO ; Postfix_Numeric
11020 lb ; PR ; Prefix_Numeric
11021 lb ; QU ; Quotation
11022 lb ; SA ; Complex_Context
11023 lb ; SG ; Surrogate
11025 lb ; SY ; Break_Symbols
11029 # If this Unicode version predates the lb property, we use our
11031 if (-e 'LBsubst.txt') {
11032 $_ = s/^lb/_Perl_LB/r for @lb;
11037 if (-e 'DNormalizationProps.txt') {
11038 push @return, split /\n/, <<'END';
11045 if (-e 'Scripts.txt') {
11046 push @return, split /\n/, <<'END';
11048 sc ; Armn ; Armenian
11049 sc ; Beng ; Bengali
11050 sc ; Bopo ; Bopomofo
11051 sc ; Cans ; Canadian_Aboriginal
11052 sc ; Cher ; Cherokee
11053 sc ; Cyrl ; Cyrillic
11054 sc ; Deva ; Devanagari
11055 sc ; Dsrt ; Deseret
11056 sc ; Ethi ; Ethiopic
11057 sc ; Geor ; Georgian
11060 sc ; Gujr ; Gujarati
11061 sc ; Guru ; Gurmukhi
11065 sc ; Hira ; Hiragana
11066 sc ; Ital ; Old_Italic
11067 sc ; Kana ; Katakana
11069 sc ; Knda ; Kannada
11072 sc ; Mlym ; Malayalam
11073 sc ; Mong ; Mongolian
11074 sc ; Mymr ; Myanmar
11077 sc ; Qaai ; Inherited
11079 sc ; Sinh ; Sinhala
11085 sc ; Tibt ; Tibetan
11091 if ($v_version ge v2.0.0) {
11092 push @return, split /\n/, <<'END';
11096 dt ; vert ; vertical
11100 gc ; Cs ; Surrogate
11101 gc ; Lt ; Titlecase_Letter
11102 gc ; Me ; Enclosing_Mark
11103 gc ; Nl ; Letter_Number
11104 gc ; Pc ; Connector_Punctuation
11105 gc ; Sk ; Modifier_Symbol
11108 if ($v_version ge v2.1.2) {
11109 push @return, "bc ; S ; Segment_Separator\n";
11111 if ($v_version ge v2.1.5) {
11112 push @return, split /\n/, <<'END';
11113 gc ; Pf ; Final_Punctuation
11114 gc ; Pi ; Initial_Punctuation
11117 if ($v_version ge v2.1.8) {
11118 push @return, "ccc; 240; IS ; Iota_Subscript\n";
11121 if ($v_version ge v3.0.0) {
11122 push @return, split /\n/, <<'END';
11123 bc ; AL ; Arabic_Letter
11124 bc ; BN ; Boundary_Neutral
11125 bc ; LRE ; Left_To_Right_Embedding
11126 bc ; LRO ; Left_To_Right_Override
11127 bc ; NSM ; Nonspacing_Mark
11128 bc ; PDF ; Pop_Directional_Format
11129 bc ; RLE ; Right_To_Left_Embedding
11130 bc ; RLO ; Right_To_Left_Override
11132 ccc; 233; DB ; Double_Below
11136 if ($v_version ge v3.1.0) {
11137 push @return, "ccc; 226; R ; Right\n";
11143 sub process_NormalizationsTest {
11145 # Each line looks like:
11146 # source code point; NFC; NFD; NFKC; NFKD
11148 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
11151 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11153 # Process each line of the file ...
11154 while ($file->next_line) {
11158 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
11160 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
11161 $$var = pack "U0U*", map { hex } split " ", $$var;
11162 $$var =~ s/(\\)/$1$1/g;
11165 push @normalization_tests,
11166 "Test_N(q
\a$c1
\a, q
\a$c2
\a, q
\a$c3
\a, q
\a$c4
\a, q
\a$c5
\a);\n";
11167 } # End of looping through the file
11170 sub output_perl_charnames_line ($$) {
11172 # Output the entries in Perl_charnames specially, using 5 digits instead
11173 # of four. This makes the entries a constant length, and simplifies
11174 # charnames.pm which this table is for. Unicode can have 6 digit
11175 # ordinals, but they are all private use or noncharacters which do not
11176 # have names, so won't be in this table.
11178 return sprintf "%05X\t%s\n", $_[0], $_[1];
11183 # These are constants to the $property_info hash in this subroutine, to
11184 # avoid using a quoted-string which might have a typo.
11186 my $DEFAULT_MAP = 'default_map';
11187 my $DEFAULT_TABLE = 'default_table';
11188 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
11189 my $MISSINGS = 'missings';
11191 sub process_generic_property_file {
11192 # This processes a file containing property mappings and puts them
11193 # into internal map tables. It should be used to handle any property
11194 # files that have mappings from a code point or range thereof to
11195 # something else. This means almost all the UCD .txt files.
11196 # each_line_handlers() should be set to adjust the lines of these
11197 # files, if necessary, to what this routine understands:
11200 # 003C..003E ; Math
11202 # the fields are: "codepoint-range ; property; map"
11204 # meaning the codepoints in the range all have the value 'map' under
11206 # Beginning and trailing white space in each field are not significant.
11207 # Note there is not a trailing semi-colon in the above. A trailing
11208 # semi-colon means the map is a null-string. An omitted map, as
11209 # opposed to a null-string, is assumed to be 'Y', based on Unicode
11210 # table syntax. (This could have been hidden from this routine by
11211 # doing it in the $file object, but that would require parsing of the
11212 # line there, so would have to parse it twice, or change the interface
11213 # to pass this an array. So not done.)
11215 # The map field may begin with a sequence of commands that apply to
11216 # this range. Each such command begins and ends with $CMD_DELIM.
11217 # These are used to indicate, for example, that the mapping for a
11218 # range has a non-default type.
11220 # This loops through the file, calling its next_line() method, and
11221 # then taking the map and adding it to the property's table.
11222 # Complications arise because any number of properties can be in the
11223 # file, in any order, interspersed in any way. The first time a
11224 # property is seen, it gets information about that property and
11225 # caches it for quick retrieval later. It also normalizes the maps
11226 # so that only one of many synonyms is stored. The Unicode input
11227 # files do use some multiple synonyms.
11230 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11232 my %property_info; # To keep track of what properties
11233 # have already had entries in the
11234 # current file, and info about each,
11235 # so don't have to recompute.
11236 my $property_name; # property currently being worked on
11237 my $property_type; # and its type
11238 my $previous_property_name = ""; # name from last time through loop
11239 my $property_object; # pointer to the current property's
11241 my $property_addr; # the address of that object
11242 my $default_map; # the string that code points missing
11243 # from the file map to
11244 my $default_table; # For non-string properties, a
11245 # reference to the match table that
11246 # will contain the list of code
11247 # points that map to $default_map.
11249 # Get the next real non-comment line
11251 while ($file->next_line) {
11253 # Default replacement type; means that if parts of the range have
11254 # already been stored in our tables, the new map overrides them if
11255 # they differ more than cosmetically
11256 my $replace = $IF_NOT_EQUIVALENT;
11257 my $map_type; # Default type for the map of this range
11259 #local $to_trace = 1 if main::DEBUG;
11260 trace $_ if main::DEBUG && $to_trace;
11262 # Split the line into components
11263 my ($range, $property_name, $map, @remainder)
11264 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
11266 # If more or less on the line than we are expecting, warn and skip
11269 $file->carp_bad_line('Extra fields');
11272 elsif ( ! defined $property_name) {
11273 $file->carp_bad_line('Missing property');
11277 # Examine the range.
11278 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
11280 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
11284 my $high = (defined $2) ? hex $2 : $low;
11286 # If changing to a new property, get the things constant per
11288 if ($previous_property_name ne $property_name) {
11290 $property_object = property_ref($property_name);
11291 if (! defined $property_object) {
11292 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
11295 { no overloading; $property_addr = pack 'J', $property_object; }
11297 # Defer changing names until have a line that is acceptable
11298 # (the 'next' statement above means is unacceptable)
11299 $previous_property_name = $property_name;
11301 # If not the first time for this property, retrieve info about
11302 # it from the cache
11303 if (defined ($property_info{$property_addr}{$TYPE})) {
11304 $property_type = $property_info{$property_addr}{$TYPE};
11305 $default_map = $property_info{$property_addr}{$DEFAULT_MAP};
11307 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE};
11309 = $property_info{$property_addr}{$DEFAULT_TABLE};
11313 # Here, is the first time for this property. Set up the
11315 $property_type = $property_info{$property_addr}{$TYPE}
11316 = $property_object->type;
11318 = $property_info{$property_addr}{$PSEUDO_MAP_TYPE}
11319 = $property_object->pseudo_map_type;
11321 # The Unicode files are set up so that if the map is not
11322 # defined, it is a binary property
11323 if (! defined $map && $property_type != $BINARY) {
11324 if ($property_type != $UNKNOWN
11325 && $property_type != $NON_STRING)
11327 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
11330 $property_object->set_type($BINARY);
11332 = $property_info{$property_addr}{$TYPE}
11337 # Get any @missings default for this property. This
11338 # should precede the first entry for the property in the
11339 # input file, and is located in a comment that has been
11340 # stored by the Input_file class until we access it here.
11341 # It's possible that there is more than one such line
11342 # waiting for us; collect them all, and parse
11343 my @missings_list = $file->get_missings
11344 if $file->has_missings_defaults;
11345 foreach my $default_ref (@missings_list) {
11346 my $default = $default_ref->[0];
11347 my $addr = do { no overloading; pack 'J', property_ref($default_ref->[1]); };
11349 # For string properties, the default is just what the
11350 # file says, but non-string properties should already
11351 # have set up a table for the default property value;
11352 # use the table for these, so can resolve synonyms
11353 # later to a single standard one.
11354 if ($property_type == $STRING
11355 || $property_type == $UNKNOWN)
11357 $property_info{$addr}{$MISSINGS} = $default;
11360 $property_info{$addr}{$MISSINGS}
11361 = $property_object->table($default);
11365 # Finished storing all the @missings defaults in the input
11366 # file so far. Get the one for the current property.
11367 my $missings = $property_info{$property_addr}{$MISSINGS};
11369 # But we likely have separately stored what the default
11370 # should be. (This is to accommodate versions of the
11371 # standard where the @missings lines are absent or
11372 # incomplete.) Hopefully the two will match. But check
11374 $default_map = $property_object->default_map;
11376 # If the map is a ref, it means that the default won't be
11377 # processed until later, so undef it, so next few lines
11378 # will redefine it to something that nothing will match
11379 undef $default_map if ref $default_map;
11381 # Create a $default_map if don't have one; maybe a dummy
11382 # that won't match anything.
11383 if (! defined $default_map) {
11385 # Use any @missings line in the file.
11386 if (defined $missings) {
11387 if (ref $missings) {
11388 $default_map = $missings->full_name;
11389 $default_table = $missings;
11392 $default_map = $missings;
11395 # And store it with the property for outside use.
11396 $property_object->set_default_map($default_map);
11400 # Neither an @missings nor a default map. Create
11401 # a dummy one, so won't have to test definedness
11402 # in the main loop.
11403 $default_map = '_Perl This will never be in a file
11408 # Here, we have $default_map defined, possibly in terms of
11409 # $missings, but maybe not, and possibly is a dummy one.
11410 if (defined $missings) {
11412 # Make sure there is no conflict between the two.
11413 # $missings has priority.
11414 if (ref $missings) {
11416 = $property_object->table($default_map);
11417 if (! defined $default_table
11418 || $default_table != $missings)
11420 if (! defined $default_table) {
11421 $default_table = $UNDEF;
11423 $file->carp_bad_line(<<END
11424 The \@missings line for $property_name in $file says that missings default to
11425 $missings, but we expect it to be $default_table. $missings used.
11428 $default_table = $missings;
11429 $default_map = $missings->full_name;
11431 $property_info{$property_addr}{$DEFAULT_TABLE}
11434 elsif ($default_map ne $missings) {
11435 $file->carp_bad_line(<<END
11436 The \@missings line for $property_name in $file says that missings default to
11437 $missings, but we expect it to be $default_map. $missings used.
11440 $default_map = $missings;
11444 $property_info{$property_addr}{$DEFAULT_MAP}
11447 # If haven't done so already, find the table corresponding
11448 # to this map for non-string properties.
11449 if (! defined $default_table
11450 && $property_type != $STRING
11451 && $property_type != $UNKNOWN)
11453 $default_table = $property_info{$property_addr}
11455 = $property_object->table($default_map);
11457 } # End of is first time for this property
11458 } # End of switching properties.
11460 # Ready to process the line.
11461 # The Unicode files are set up so that if the map is not defined,
11462 # it is a binary property with value 'Y'
11463 if (! defined $map) {
11468 # If the map begins with a special command to us (enclosed in
11469 # delimiters), extract the command(s).
11470 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11472 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
11475 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
11479 $file->carp_bad_line("Unknown command line: '$1'");
11485 if ($default_map eq $CODE_POINT && $map =~ / ^ $code_point_re $/x)
11488 # Here, we have a map to a particular code point, and the
11489 # default map is to a code point itself. If the range
11490 # includes the particular code point, change that portion of
11491 # the range to the default. This makes sure that in the final
11492 # table only the non-defaults are listed.
11493 my $decimal_map = hex $map;
11494 if ($low <= $decimal_map && $decimal_map <= $high) {
11496 # If the range includes stuff before or after the map
11497 # we're changing, split it and process the split-off parts
11499 if ($low < $decimal_map) {
11500 $file->insert_adjusted_lines(
11501 sprintf("%04X..%04X; %s; %s",
11507 if ($high > $decimal_map) {
11508 $file->insert_adjusted_lines(
11509 sprintf("%04X..%04X; %s; %s",
11515 $low = $high = $decimal_map;
11516 $map = $CODE_POINT;
11520 # If we can tell that this is a synonym for the default map, use
11521 # the default one instead.
11522 if ($property_type != $STRING
11523 && $property_type != $UNKNOWN)
11525 my $table = $property_object->table($map);
11526 if (defined $table && $table == $default_table) {
11527 $map = $default_map;
11531 # And figure out the map type if not known.
11532 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11533 if ($map eq "") { # Nulls are always $NULL map type
11535 } # Otherwise, non-strings, and those that don't allow
11536 # $MULTI_CP, and those that aren't multiple code points are
11539 (($property_type != $STRING && $property_type != $UNKNOWN)
11540 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11541 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
11546 $map_type = $MULTI_CP;
11550 $property_object->add_map($low, $high,
11553 Replace => $replace);
11554 } # End of loop through file's lines
11560 { # Closure for UnicodeData.txt handling
11562 # This file was the first one in the UCD; its design leads to some
11563 # awkwardness in processing. Here is a sample line:
11564 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11565 # The fields in order are:
11566 my $i = 0; # The code point is in field 0, and is shifted off.
11567 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
11568 my $CATEGORY = $i++; # category (e.g. "Lu")
11569 my $CCC = $i++; # Canonical combining class (e.g. "230")
11570 my $BIDI = $i++; # directional class (e.g. "L")
11571 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
11572 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
11573 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11574 # Dual-use in this program; see below
11575 my $NUMERIC = $i++; # numeric value
11576 my $MIRRORED = $i++; # ? mirrored
11577 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11578 my $COMMENT = $i++; # iso comment
11579 my $UPPER = $i++; # simple uppercase mapping
11580 my $LOWER = $i++; # simple lowercase mapping
11581 my $TITLE = $i++; # simple titlecase mapping
11582 my $input_field_count = $i;
11584 # This routine in addition outputs these extra fields:
11586 my $DECOMP_TYPE = $i++; # Decomposition type
11588 # These fields are modifications of ones above, and are usually
11589 # suppressed; they must come last, as for speed, the loop upper bound is
11590 # normally set to ignore them
11591 my $NAME = $i++; # This is the strict name field, not the one that
11593 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
11594 # by Unicode::Normalize
11595 my $last_field = $i - 1;
11597 # All these are read into an array for each line, with the indices defined
11598 # above. The empty fields in the example line above indicate that the
11599 # value is defaulted. The handler called for each line of the input
11600 # changes these to their defaults.
11602 # Here are the official names of the properties, in a parallel array:
11604 $field_names[$BIDI] = 'Bidi_Class';
11605 $field_names[$CATEGORY] = 'General_Category';
11606 $field_names[$CCC] = 'Canonical_Combining_Class';
11607 $field_names[$CHARNAME] = 'Perl_Charnames';
11608 $field_names[$COMMENT] = 'ISO_Comment';
11609 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11610 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11611 $field_names[$LOWER] = 'Lowercase_Mapping';
11612 $field_names[$MIRRORED] = 'Bidi_Mirrored';
11613 $field_names[$NAME] = 'Name';
11614 $field_names[$NUMERIC] = 'Numeric_Value';
11615 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11616 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11617 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11618 $field_names[$TITLE] = 'Titlecase_Mapping';
11619 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11620 $field_names[$UPPER] = 'Uppercase_Mapping';
11622 # Some of these need a little more explanation:
11623 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11624 # property, but is used in calculating the Numeric_Type. Perl however,
11625 # creates a file from this field, so a Perl property is created from it.
11626 # Similarly, the Other_Digit field is used only for calculating the
11627 # Numeric_Type, and so it can be safely re-used as the place to store
11628 # the value for Numeric_Type; hence it is referred to as
11629 # $NUMERIC_TYPE_OTHER_DIGIT.
11630 # The input field named $PERL_DECOMPOSITION is a combination of both the
11631 # decomposition mapping and its type. Perl creates a file containing
11632 # exactly this field, so it is used for that. The two properties are
11633 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11634 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
11635 # output it), as Perl doesn't use it directly.
11636 # The input field named here $CHARNAME is used to construct the
11637 # Perl_Charnames property, which is a combination of the Name property
11638 # (which the input field contains), and the Unicode_1_Name property, and
11639 # others from other files. Since, the strict Name property is not used
11640 # by Perl, this field is used for the table that Perl does use. The
11641 # strict Name property table is usually suppressed (unless the lists are
11642 # changed to output it), so it is accumulated in a separate field,
11643 # $NAME, which to save time is discarded unless the table is actually to
11646 # This file is processed like most in this program. Control is passed to
11647 # process_generic_property_file() which calls filter_UnicodeData_line()
11648 # for each input line. This filter converts the input into line(s) that
11649 # process_generic_property_file() understands. There is also a setup
11650 # routine called before any of the file is processed, and a handler for
11651 # EOF processing, all in this closure.
11653 # A huge speed-up occurred at the cost of some added complexity when these
11654 # routines were altered to buffer the outputs into ranges. Almost all the
11655 # lines of the input file apply to just one code point, and for most
11656 # properties, the map for the next code point up is the same as the
11657 # current one. So instead of creating a line for each property for each
11658 # input line, filter_UnicodeData_line() remembers what the previous map
11659 # of a property was, and doesn't generate a line to pass on until it has
11660 # to, as when the map changes; and that passed-on line encompasses the
11661 # whole contiguous range of code points that have the same map for that
11662 # property. This means a slight amount of extra setup, and having to
11663 # flush these buffers on EOF, testing if the maps have changed, plus
11664 # remembering state information in the closure. But it means a lot less
11665 # real time in not having to change the data base for each property on
11668 # Another complication is that there are already a few ranges designated
11669 # in the input. There are two lines for each, with the same maps except
11670 # the code point and name on each line. This was actually the hardest
11671 # thing to design around. The code points in those ranges may actually
11672 # have real maps not given by these two lines. These maps will either
11673 # be algorithmically determinable, or be in the extracted files furnished
11674 # with the UCD. In the event of conflicts between these extracted files,
11675 # and this one, Unicode says that this one prevails. But it shouldn't
11676 # prevail for conflicts that occur in these ranges. The data from the
11677 # extracted files prevails in those cases. So, this program is structured
11678 # so that those files are processed first, storing maps. Then the other
11679 # files are processed, generally overwriting what the extracted files
11680 # stored. But just the range lines in this input file are processed
11681 # without overwriting. This is accomplished by adding a special string to
11682 # the lines output to tell process_generic_property_file() to turn off the
11683 # overwriting for just this one line.
11684 # A similar mechanism is used to tell it that the map is of a non-default
11687 sub setup_UnicodeData { # Called before any lines of the input are read
11689 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11691 # Create a new property specially located that is a combination of
11692 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11693 # _Perl_Name_Alias properties. (The final one duplicates elements of the
11694 # first, and starting in v6.1, is the same as the 'Name_Alias
11695 # property.) A comment for the new property will later be constructed
11696 # based on the actual properties present and used
11697 $perl_charname = Property->new('Perl_Charnames',
11699 Directory => File::Spec->curdir(),
11701 Fate => $INTERNAL_ONLY,
11702 Perl_Extension => 1,
11703 Range_Size_1 => \&output_perl_charnames_line,
11706 $perl_charname->set_proxy_for('Name');
11708 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11709 Directory => File::Spec->curdir(),
11710 File => 'Decomposition',
11711 Format => $DECOMP_STRING_FORMAT,
11712 Fate => $INTERNAL_ONLY,
11713 Perl_Extension => 1,
11714 Default_Map => $CODE_POINT,
11716 # normalize.pm can't cope with these
11717 Output_Range_Counts => 0,
11719 # This is a specially formatted table
11720 # explicitly for normalize.pm, which
11721 # is expecting a particular format,
11722 # which means that mappings containing
11723 # multiple code points are in the main
11724 # body of the table
11725 Map_Type => $COMPUTE_NO_MULTI_CP,
11727 To_Output_Map => $INTERNAL_MAP,
11729 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11730 $Perl_decomp->add_comment(join_lines(<<END
11731 This mapping is a combination of the Unicode 'Decomposition_Type' and
11732 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
11733 identical to the official Unicode 'Decomposition_Mapping' property except for
11735 1) It omits the algorithmically determinable Hangul syllable decompositions,
11736 which normalize.pm handles algorithmically.
11737 2) It contains the decomposition type as well. Non-canonical decompositions
11738 begin with a word in angle brackets, like <super>, which denotes the
11739 compatible decomposition type. If the map does not begin with the <angle
11740 brackets>, the decomposition is canonical.
11744 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11746 Perl_Extension => 1,
11747 Directory => $map_directory,
11749 To_Output_Map => $OUTPUT_ADJUSTED,
11751 $Decimal_Digit->add_comment(join_lines(<<END
11752 This file gives the mapping of all code points which represent a single
11753 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11754 points, and the mapping of each non-initial element of each range is actually
11755 not to "0", but to the offset that element has from its corresponding DIGIT 0.
11756 These code points are those that have Numeric_Type=Decimal; not special
11757 things, like subscripts nor Roman numerals.
11761 # These properties are not used for generating anything else, and are
11762 # usually not output. By making them last in the list, we can just
11763 # change the high end of the loop downwards to avoid the work of
11764 # generating a table(s) that is/are just going to get thrown away.
11765 if (! property_ref('Decomposition_Mapping')->to_output_map
11766 && ! property_ref('Name')->to_output_map)
11768 $last_field = min($NAME, $DECOMP_MAP) - 1;
11769 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11770 $last_field = $DECOMP_MAP;
11771 } elsif (property_ref('Name')->to_output_map) {
11772 $last_field = $NAME;
11777 my $first_time = 1; # ? Is this the first line of the file
11778 my $in_range = 0; # ? Are we in one of the file's ranges
11779 my $previous_cp; # hex code point of previous line
11780 my $decimal_previous_cp = -1; # And its decimal equivalent
11781 my @start; # For each field, the current starting
11782 # code point in hex for the range
11783 # being accumulated.
11784 my @fields; # The input fields;
11785 my @previous_fields; # And those from the previous call
11787 sub filter_UnicodeData_line {
11788 # Handle a single input line from UnicodeData.txt; see comments above
11789 # Conceptually this takes a single line from the file containing N
11790 # properties, and converts it into N lines with one property per line,
11791 # which is what the final handler expects. But there are
11792 # complications due to the quirkiness of the input file, and to save
11793 # time, it accumulates ranges where the property values don't change
11794 # and only emits lines when necessary. This is about an order of
11795 # magnitude fewer lines emitted.
11798 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
11800 # $_ contains the input line.
11801 # -1 in split means retain trailing null fields
11802 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11804 #local $to_trace = 1 if main::DEBUG;
11805 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11806 if (@fields > $input_field_count) {
11807 $file->carp_bad_line('Extra fields');
11812 my $decimal_cp = hex $cp;
11814 # We have to output all the buffered ranges when the next code point
11815 # is not exactly one after the previous one, which means there is a
11816 # gap in the ranges.
11817 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11819 # The decomposition mapping field requires special handling. It looks
11822 # <compat> 0032 0020
11825 # The decomposition type is enclosed in <brackets>; if missing, it
11826 # means the type is canonical. There are two decomposition mapping
11827 # tables: the one for use by Perl's normalize.pm has a special format
11828 # which is this field intact; the other, for general use is of
11829 # standard format. In either case we have to find the decomposition
11830 # type. Empty fields have None as their type, and map to the code
11832 if ($fields[$PERL_DECOMPOSITION] eq "") {
11833 $fields[$DECOMP_TYPE] = 'None';
11834 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11837 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11838 =~ / < ( .+? ) > \s* ( .+ ) /x;
11839 if (! defined $fields[$DECOMP_TYPE]) {
11840 $fields[$DECOMP_TYPE] = 'Canonical';
11841 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11844 $fields[$DECOMP_MAP] = $map;
11848 # The 3 numeric fields also require special handling. The 2 digit
11849 # fields must be either empty or match the number field. This means
11850 # that if it is empty, they must be as well, and the numeric type is
11851 # None, and the numeric value is 'Nan'.
11852 # The decimal digit field must be empty or match the other digit
11853 # field. If the decimal digit field is non-empty, the code point is
11854 # a decimal digit, and the other two fields will have the same value.
11855 # If it is empty, but the other digit field is non-empty, the code
11856 # point is an 'other digit', and the number field will have the same
11857 # value as the other digit field. If the other digit field is empty,
11858 # but the number field is non-empty, the code point is a generic
11860 if ($fields[$NUMERIC] eq "") {
11861 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11862 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11864 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
11866 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11867 $fields[$NUMERIC] = 'NaN';
11870 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11871 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11872 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11873 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11874 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11876 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11877 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11878 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11881 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11883 # Rationals require extra effort.
11884 if ($fields[$NUMERIC] =~ qr{/}) {
11885 reduce_fraction(\$fields[$NUMERIC]);
11886 register_fraction($fields[$NUMERIC])
11891 # For the properties that have empty fields in the file, and which
11892 # mean something different from empty, change them to that default.
11893 # Certain fields just haven't been empty so far in any Unicode
11894 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11895 # $CATEGORY. This leaves just the two fields, and so we hard-code in
11896 # the defaults; which are very unlikely to ever change.
11897 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11898 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11900 # UAX44 says that if title is empty, it is the same as whatever upper
11902 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11904 # There are a few pairs of lines like:
11905 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11906 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11907 # that define ranges. These should be processed after the fields are
11908 # adjusted above, as they may override some of them; but mostly what
11909 # is left is to possibly adjust the $CHARNAME field. The names of all the
11910 # paired lines start with a '<', but this is also true of '<control>,
11911 # which isn't one of these special ones.
11912 if ($fields[$CHARNAME] eq '<control>') {
11914 # Some code points in this file have the pseudo-name
11915 # '<control>', but the official name for such ones is the null
11917 $fields[$NAME] = $fields[$CHARNAME] = "";
11919 # We had better not be in between range lines.
11921 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11925 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11927 # Here is a non-range line. We had better not be in between range
11930 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11933 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11935 # These are code points whose names end in their code points,
11936 # which means the names are algorithmically derivable from the
11937 # code points. To shorten the output Name file, the algorithm
11938 # for deriving these is placed in the file instead of each
11939 # code point, so they have map type $CP_IN_NAME
11940 $fields[$CHARNAME] = $CMD_DELIM
11945 . $fields[$CHARNAME];
11947 $fields[$NAME] = $fields[$CHARNAME];
11949 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11950 $fields[$CHARNAME] = $fields[$NAME] = $1;
11952 # Here we are at the beginning of a range pair.
11954 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
11958 # Because the properties in the range do not overwrite any already
11959 # in the db, we must flush the buffers of what's already there, so
11960 # they get handled in the normal scheme.
11964 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11965 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
11969 else { # Here, we are at the last line of a range pair.
11972 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
11978 $fields[$NAME] = $fields[$CHARNAME];
11980 # Check that the input is valid: that the closing of the range is
11981 # the same as the beginning.
11982 foreach my $i (0 .. $last_field) {
11983 next if $fields[$i] eq $previous_fields[$i];
11984 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
11987 # The processing differs depending on the type of range,
11988 # determined by its $CHARNAME
11989 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11991 # Check that the data looks right.
11992 if ($decimal_previous_cp != $SBase) {
11993 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
11995 if ($decimal_cp != $SBase + $SCount - 1) {
11996 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
11999 # The Hangul syllable range has a somewhat complicated name
12000 # generation algorithm. Each code point in it has a canonical
12001 # decomposition also computable by an algorithm. The
12002 # perl decomposition map table built from these is used only
12003 # by normalize.pm, which has the algorithm built in it, so the
12004 # decomposition maps are not needed, and are large, so are
12005 # omitted from it. If the full decomposition map table is to
12006 # be output, the decompositions are generated for it, in the
12007 # EOF handling code for this input file.
12009 $previous_fields[$DECOMP_TYPE] = 'Canonical';
12011 # This range is stored in our internal structure with its
12012 # own map type, different from all others.
12013 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12019 . $fields[$CHARNAME];
12021 elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter
12023 # All the CJK ranges like this have the name given as a
12024 # special case in the next code line. And for the others, we
12025 # hope that Unicode continues to use the correct name in
12026 # future releases, so we don't have to make further special
12028 my $name = ($fields[$CHARNAME] =~ /^CJK/)
12029 ? 'CJK UNIFIED IDEOGRAPH'
12030 : uc $fields[$CHARNAME];
12032 # The name for these contains the code point itself, and all
12033 # are defined to have the same base name, regardless of what
12034 # is in the file. They are stored in our internal structure
12035 # with a map type of $CP_IN_NAME
12036 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
12045 elsif ($fields[$CATEGORY] eq 'Co'
12046 || $fields[$CATEGORY] eq 'Cs')
12048 # The names of all the code points in these ranges are set to
12049 # null, as there are no names for the private use and
12050 # surrogate code points.
12052 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
12055 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
12058 # The first line of the range caused everything else to be output,
12059 # and then its values were stored as the beginning values for the
12060 # next set of ranges, which this one ends. Now, for each value,
12061 # add a command to tell the handler that these values should not
12062 # replace any existing ones in our database.
12063 foreach my $i (0 .. $last_field) {
12064 $previous_fields[$i] = $CMD_DELIM
12069 . $previous_fields[$i];
12072 # And change things so it looks like the entire range has been
12073 # gone through with this being the final part of it. Adding the
12074 # command above to each field will cause this range to be flushed
12075 # during the next iteration, as it guaranteed that the stored
12076 # field won't match whatever value the next one has.
12077 $previous_cp = $cp;
12078 $decimal_previous_cp = $decimal_cp;
12080 # We are now set up for the next iteration; so skip the remaining
12081 # code in this subroutine that does the same thing, but doesn't
12082 # know about these ranges.
12088 # On the very first line, we fake it so the code below thinks there is
12089 # nothing to output, and initialize so that when it does get output it
12090 # uses the first line's values for the lowest part of the range.
12091 # (One could avoid this by using peek(), but then one would need to
12092 # know the adjustments done above and do the same ones in the setup
12093 # routine; not worth it)
12096 @previous_fields = @fields;
12097 @start = ($cp) x scalar @fields;
12098 $decimal_previous_cp = $decimal_cp - 1;
12101 # For each field, output the stored up ranges that this code point
12102 # doesn't fit in. Earlier we figured out if all ranges should be
12103 # terminated because of changing the replace or map type styles, or if
12104 # there is a gap between this new code point and the previous one, and
12105 # that is stored in $force_output. But even if those aren't true, we
12106 # need to output the range if this new code point's value for the
12107 # given property doesn't match the stored range's.
12108 #local $to_trace = 1 if main::DEBUG;
12109 foreach my $i (0 .. $last_field) {
12110 my $field = $fields[$i];
12111 if ($force_output || $field ne $previous_fields[$i]) {
12113 # Flush the buffer of stored values.
12114 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12116 # Start a new range with this code point and its value
12118 $previous_fields[$i] = $field;
12122 # Set the values for the next time.
12123 $previous_cp = $cp;
12124 $decimal_previous_cp = $decimal_cp;
12126 # The input line has generated whatever adjusted lines are needed, and
12127 # should not be looked at further.
12132 sub EOF_UnicodeData {
12133 # Called upon EOF to flush the buffers, and create the Hangul
12134 # decomposition mappings if needed.
12137 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12139 # Flush the buffers.
12140 foreach my $i (0 .. $last_field) {
12141 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
12144 if (-e 'Jamo.txt') {
12146 # The algorithm is published by Unicode, based on values in
12147 # Jamo.txt, (which should have been processed before this
12148 # subroutine), and the results left in %Jamo
12150 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
12154 # If the full decomposition map table is being output, insert
12155 # into it the Hangul syllable mappings. This is to avoid having
12156 # to publish a subroutine in it to compute them. (which would
12157 # essentially be this code.) This uses the algorithm published by
12158 # Unicode. (No hangul syllables in version 1)
12159 if ($v_version ge v2.0.0
12160 && property_ref('Decomposition_Mapping')->to_output_map) {
12161 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
12163 my $SIndex = $S - $SBase;
12164 my $L = $LBase + $SIndex / $NCount;
12165 my $V = $VBase + ($SIndex % $NCount) / $TCount;
12166 my $T = $TBase + $SIndex % $TCount;
12168 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
12169 my $decomposition = sprintf("%04X %04X", $L, $V);
12170 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
12171 $file->insert_adjusted_lines(
12172 sprintf("%04X; Decomposition_Mapping; %s",
12182 sub filter_v1_ucd {
12183 # Fix UCD lines in version 1. This is probably overkill, but this
12184 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
12185 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
12186 # removed. This program retains them
12187 # 2) didn't include ranges, which it should have, and which are now
12188 # added in @corrected_lines below. It was hand populated by
12189 # taking the data from Version 2, verified by analyzing
12191 # 3) There is a syntax error in the entry for U+09F8 which could
12192 # cause problems for utf8_heavy, and so is changed. It's
12193 # numeric value was simply a minus sign, without any number.
12194 # (Eventually Unicode changed the code point to non-numeric.)
12195 # 4) The decomposition types often don't match later versions
12196 # exactly, and the whole syntax of that field is different; so
12197 # the syntax is changed as well as the types to their later
12198 # terminology. Otherwise normalize.pm would be very unhappy
12199 # 5) Many ccc classes are different. These are left intact.
12200 # 6) U+FF10..U+FF19 are missing their numeric values in all three
12201 # fields. These are unchanged because it doesn't really cause
12202 # problems for Perl.
12203 # 7) A number of code points, such as controls, don't have their
12204 # Unicode Version 1 Names in this file. These are added.
12205 # 8) A number of Symbols were marked as Lm. This changes those in
12206 # the Latin1 range, so that regexes work.
12207 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
12208 # referred to by their lc equivalents. Not fixed.
12210 my @corrected_lines = split /\n/, <<'END';
12211 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
12212 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12213 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
12214 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
12215 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
12216 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
12220 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12222 #local $to_trace = 1 if main::DEBUG;
12223 trace $_ if main::DEBUG && $to_trace;
12225 # -1 => retain trailing null fields
12226 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12228 # At the first place that is wrong in the input, insert all the
12229 # corrections, replacing the wrong line.
12230 if ($code_point eq '4E00') {
12231 my @copy = @corrected_lines;
12233 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12235 $file->insert_lines(@copy);
12237 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
12239 # There are no Lm characters in Latin1; these should be 'Sk', but
12240 # there isn't that in V1.
12241 $fields[$CATEGORY] = 'So';
12244 if ($fields[$NUMERIC] eq '-') {
12245 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
12248 if ($fields[$PERL_DECOMPOSITION] ne "") {
12250 # Several entries have this change to superscript 2 or 3 in the
12251 # middle. Convert these to the modern version, which is to use
12252 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
12253 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
12254 # 'HHHH HHHH 00B3 HHHH'.
12255 # It turns out that all of these that don't have another
12256 # decomposition defined at the beginning of the line have the
12257 # <square> decomposition in later releases.
12258 if ($code_point ne '00B2' && $code_point ne '00B3') {
12259 if ($fields[$PERL_DECOMPOSITION]
12260 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
12262 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
12263 $fields[$PERL_DECOMPOSITION] = '<square> '
12264 . $fields[$PERL_DECOMPOSITION];
12269 # If is like '<+circled> 0052 <-circled>', convert to
12271 $fields[$PERL_DECOMPOSITION] =~
12272 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
12274 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
12275 $fields[$PERL_DECOMPOSITION] =~
12276 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
12277 or $fields[$PERL_DECOMPOSITION] =~
12278 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
12279 or $fields[$PERL_DECOMPOSITION] =~
12280 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
12281 or $fields[$PERL_DECOMPOSITION] =~
12282 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
12284 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
12285 $fields[$PERL_DECOMPOSITION] =~
12286 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
12288 # Change names to modern form.
12289 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
12290 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
12291 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
12292 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
12294 # One entry has weird braces
12295 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
12297 # One entry at U+2116 has an extra <sup>
12298 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
12301 $_ = join ';', $code_point, @fields;
12302 trace $_ if main::DEBUG && $to_trace;
12306 sub filter_bad_Nd_ucd {
12307 # Early versions specified a value in the decimal digit field even
12308 # though the code point wasn't a decimal digit. Clear the field in
12309 # that situation, so that the main code doesn't think it is a decimal
12312 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12313 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
12314 $fields[$PERL_DECIMAL_DIGIT] = "";
12315 $_ = join ';', $code_point, @fields;
12320 my @U1_control_names = split /\n/, <<'END';
12325 END OF TRANSMISSION
12330 HORIZONTAL TABULATION
12332 VERTICAL TABULATION
12340 DEVICE CONTROL THREE
12341 DEVICE CONTROL FOUR
12342 NEGATIVE ACKNOWLEDGE
12344 END OF TRANSMISSION BLOCK
12354 BREAK PERMITTED HERE
12358 START OF SELECTED AREA
12359 END OF SELECTED AREA
12360 CHARACTER TABULATION SET
12361 CHARACTER TABULATION WITH JUSTIFICATION
12362 LINE TABULATION SET
12368 DEVICE CONTROL STRING
12374 START OF GUARDED AREA
12375 END OF GUARDED AREA
12377 SINGLE CHARACTER INTRODUCER
12378 CONTROL SEQUENCE INTRODUCER
12380 OPERATING SYSTEM COMMAND
12382 APPLICATION PROGRAM COMMAND
12385 sub filter_early_U1_names {
12386 # Very early versions did not have the Unicode_1_name field specified.
12387 # They differed in which ones were present; make sure a U1 name
12388 # exists, so that Unicode::UCD::charinfo will work
12390 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12393 # @U1_control names above are entirely positional, so we pull them out
12394 # in the exact order required, with gaps for the ones that don't have
12396 if ($code_point =~ /^00[01]/
12397 || $code_point eq '007F'
12398 || $code_point =~ /^008[2-9A-F]/
12399 || $code_point =~ /^009[0-8A-F]/)
12401 my $u1_name = shift @U1_control_names;
12402 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12403 $_ = join ';', $code_point, @fields;
12408 sub filter_v2_1_5_ucd {
12409 # A dozen entries in this 2.1.5 file had the mirrored and numeric
12410 # columns swapped; These all had mirrored be 'N'. So if the numeric
12411 # column appears to be N, swap it back.
12413 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12414 if ($fields[$NUMERIC] eq 'N') {
12415 $fields[$NUMERIC] = $fields[$MIRRORED];
12416 $fields[$MIRRORED] = 'N';
12417 $_ = join ';', $code_point, @fields;
12422 sub filter_v6_ucd {
12424 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12425 # it wasn't accepted, to allow for some deprecation cycles. This
12426 # function is not called after 5.16
12428 return if $_ !~ /^(?:0007|1F514|070F);/;
12430 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12431 if ($code_point eq '0007') {
12432 $fields[$CHARNAME] = "";
12434 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12435 # http://www.unicode.org/versions/corrigendum8.html
12436 $fields[$BIDI] = "AL";
12438 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12439 $fields[$CHARNAME] = "";
12442 $_ = join ';', $code_point, @fields;
12446 } # End closure for UnicodeData
12448 sub process_GCB_test {
12451 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12453 while ($file->next_line) {
12454 push @backslash_X_tests, $_;
12460 sub process_LB_test {
12463 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12465 while ($file->next_line) {
12466 push @LB_tests, $_;
12472 sub process_SB_test {
12475 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12477 while ($file->next_line) {
12478 push @SB_tests, $_;
12484 sub process_WB_test {
12487 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12489 while ($file->next_line) {
12490 push @WB_tests, $_;
12496 sub process_NamedSequences {
12497 # NamedSequences.txt entries are just added to an array. Because these
12498 # don't look like the other tables, they have their own handler.
12500 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12502 # This just adds the sequence to an array for later handling
12505 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12507 while ($file->next_line) {
12508 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12510 $file->carp_bad_line(
12511 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12515 # Note single \t in keeping with special output format of
12516 # Perl_charnames. But it turns out that the code points don't have to
12517 # be 5 digits long, like the rest, based on the internal workings of
12518 # charnames.pm. This could be easily changed for consistency.
12519 push @named_sequences, "$sequence\t$name";
12528 sub filter_early_ea_lb {
12529 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
12530 # third field be the name of the code point, which can be ignored in
12531 # most cases. But it can be meaningful if it marks a range:
12532 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12533 # 3400;W;<CJK Ideograph Extension A, First>
12535 # We need to see the First in the example above to know it's a range.
12536 # They did not use the later range syntaxes. This routine changes it
12537 # to use the modern syntax.
12538 # $1 is the Input_file object.
12540 my @fields = split /\s*;\s*/;
12541 if ($fields[2] =~ /^<.*, First>/) {
12542 $first_range = $fields[0];
12545 elsif ($fields[2] =~ /^<.*, Last>/) {
12546 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12549 undef $first_range;
12550 $_ = "$fields[0]; $fields[1]";
12557 sub filter_substitute_lb {
12558 # Used on Unicodes that predate the LB property, where there is a
12559 # substitute file. This just does the regular ea_lb handling for such
12560 # files, and then substitutes the long property value name for the short
12561 # one that comes with the file. (The other break files have the long
12562 # names in them, so this is the odd one out.) The reason for doing this
12563 # kludge is that regen/mk_invlists.pl is expecting the long name. This
12564 # also fixes the typo 'Inseperable' that leads to problems.
12566 filter_early_ea_lb;
12569 my @fields = split /\s*;\s*/;
12570 $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12571 $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12572 $_ = join '; ', @fields;
12575 sub filter_old_style_arabic_shaping {
12576 # Early versions used a different term for the later one.
12578 my @fields = split /\s*;\s*/;
12579 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12580 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
12581 $_ = join ';', @fields;
12586 my $lc; # Table for lowercase mapping
12589 my %special_casing_code_points;
12591 sub setup_special_casing {
12592 # SpecialCasing.txt contains the non-simple case change mappings. The
12593 # simple ones are in UnicodeData.txt, which should already have been
12594 # read in to the full property data structures, so as to initialize
12595 # these with the simple ones. Then the SpecialCasing.txt entries
12596 # add or overwrite the ones which have different full mappings.
12598 # This routine sees if the simple mappings are to be output, and if
12599 # so, copies what has already been put into the full mapping tables,
12600 # while they still contain only the simple mappings.
12602 # The reason it is done this way is that the simple mappings are
12603 # probably not going to be output, so it saves work to initialize the
12604 # full tables with the simple mappings, and then overwrite those
12605 # relatively few entries in them that have different full mappings,
12606 # and thus skip the simple mapping tables altogether.
12609 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12611 $lc = property_ref('lc');
12612 $tc = property_ref('tc');
12613 $uc = property_ref('uc');
12615 # For each of the case change mappings...
12616 foreach my $full_casing_table ($lc, $tc, $uc) {
12617 my $full_casing_name = $full_casing_table->name;
12618 my $full_casing_full_name = $full_casing_table->full_name;
12619 unless (defined $full_casing_table
12620 && ! $full_casing_table->is_empty)
12622 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
12625 # Create a table in the old-style format and with the original
12626 # file name for backwards compatibility with applications that
12627 # read it directly. The new tables contain both the simple and
12628 # full maps, and the old are missing simple maps when there is a
12629 # conflicting full one. Probably it would have been ok to add
12630 # those to the legacy version, as was already done in 5.14 to the
12631 # case folding one, but this was not done, out of an abundance of
12632 # caution. The tables are set up here before we deal with the
12633 # full maps so that as we handle those, we can override the simple
12634 # maps for them in the legacy table, and merely add them in the
12636 my $legacy = Property->new("Legacy_" . $full_casing_full_name,
12637 File => $full_casing_full_name
12638 =~ s/case_Mapping//r,
12639 Format => $HEX_FORMAT,
12640 Default_Map => $CODE_POINT,
12641 Initialize => $full_casing_table,
12642 Replacement_Property => $full_casing_full_name,
12645 $full_casing_table->add_comment(join_lines( <<END
12646 This file includes both the simple and full case changing maps. The simple
12647 ones are in the main body of the table below, and the full ones adding to or
12648 overriding them are in the hash.
12652 # The simple version's name in each mapping merely has an 's' in
12653 # front of the full one's
12654 my $simple_name = 's' . $full_casing_name;
12655 my $simple = property_ref($simple_name);
12656 $simple->initialize($full_casing_table) if $simple->to_output_map();
12662 sub filter_2_1_8_special_casing_line {
12664 # This version had duplicate entries in this file. Delete all but the
12666 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12668 if (exists $special_casing_code_points{$fields[0]}) {
12673 $special_casing_code_points{$fields[0]} = 1;
12674 filter_special_casing_line(@_);
12677 sub filter_special_casing_line {
12678 # Change the format of $_ from SpecialCasing.txt into something that
12679 # the generic handler understands. Each input line contains three
12680 # case mappings. This will generate three lines to pass to the
12681 # generic handler for each of those.
12683 # The input syntax (after stripping comments and trailing white space
12684 # is like one of the following (with the final two being entries that
12686 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12687 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12688 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12689 # Note the trailing semi-colon, unlike many of the input files. That
12690 # means that there will be an extra null field generated by the split
12693 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12695 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12698 # field #4 is when this mapping is conditional. If any of these get
12699 # implemented, it would be by hard-coding in the casing functions in
12700 # the Perl core, not through tables. But if there is a new condition
12701 # we don't know about, output a warning. We know about all the
12702 # conditions through 6.0
12703 if ($fields[4] ne "") {
12704 my @conditions = split ' ', $fields[4];
12705 if ($conditions[0] ne 'tr' # We know that these languages have
12706 # conditions, and some are multiple
12707 && $conditions[0] ne 'az'
12708 && $conditions[0] ne 'lt'
12710 # And, we know about a single condition Final_Sigma, but
12712 && ($v_version gt v5.2.0
12713 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12715 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12717 elsif ($conditions[0] ne 'Final_Sigma') {
12719 # Don't print out a message for Final_Sigma, because we
12720 # have hard-coded handling for it. (But the standard
12721 # could change what the rule should be, but it wouldn't
12722 # show up here anyway.
12724 print "# SKIPPING Special Casing: $_\n"
12725 if $verbosity >= $VERBOSE;
12730 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12731 $file->carp_bad_line('Extra fields');
12736 my $decimal_code_point = hex $fields[0];
12738 # Loop to handle each of the three mappings in the input line, in
12739 # order, with $i indicating the current field number.
12741 for my $object ($lc, $tc, $uc) {
12742 $i++; # First time through, $i = 0 ... 3rd time = 3
12744 my $value = $object->value_of($decimal_code_point);
12745 $value = ($value eq $CODE_POINT)
12746 ? $decimal_code_point
12749 # If this isn't a multi-character mapping, it should already have
12751 if ($fields[$i] !~ / /) {
12752 if ($value != hex $fields[$i]) {
12753 Carp::my_carp("Bad news. UnicodeData.txt thinks "
12755 . "(0x$fields[0]) is $value"
12756 . " and SpecialCasing.txt thinks it is "
12758 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
12763 # The mapping goes into both the legacy table, in which it
12764 # replaces the simple one...
12765 $file->insert_adjusted_lines("$fields[0]; Legacy_"
12766 . $object->full_name
12767 . "; $fields[$i]");
12769 # ... and the regular table, in which it is additional,
12770 # beyond the simple mapping.
12771 $file->insert_adjusted_lines("$fields[0]; "
12775 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12781 # Everything has been handled by the insert_adjusted_lines()
12788 sub filter_old_style_case_folding {
12789 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12790 # and later style. Different letters were used in the earlier.
12793 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12795 my @fields = split /\s*;\s*/;
12797 if ($fields[1] eq 'L') {
12798 $fields[1] = 'C'; # L => C always
12800 elsif ($fields[1] eq 'E') {
12801 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
12809 $file->carp_bad_line("Expecting L or E in second field");
12813 $_ = join("; ", @fields) . ';';
12817 { # Closure for case folding
12819 # Create the map for simple only if are going to output it, for otherwise
12820 # it takes no part in anything we do.
12821 my $to_output_simple;
12823 sub setup_case_folding($) {
12824 # Read in the case foldings in CaseFolding.txt. This handles both
12825 # simple and full case folding.
12828 = property_ref('Simple_Case_Folding')->to_output_map;
12830 if (! $to_output_simple) {
12831 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12834 # If we ever wanted to show that these tables were combined, a new
12835 # property method could be created, like set_combined_props()
12836 property_ref('Case_Folding')->add_comment(join_lines( <<END
12837 This file includes both the simple and full case folding maps. The simple
12838 ones are in the main body of the table below, and the full ones adding to or
12839 overriding them are in the hash.
12845 sub filter_case_folding_line {
12846 # Called for each line in CaseFolding.txt
12847 # Input lines look like:
12848 # 0041; C; 0061; # LATIN CAPITAL LETTER A
12849 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12850 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12852 # 'C' means that folding is the same for both simple and full
12853 # 'F' that it is only for full folding
12854 # 'S' that it is only for simple folding
12855 # 'T' is locale-dependent, and ignored
12856 # 'I' is a type of 'F' used in some early releases.
12857 # Note the trailing semi-colon, unlike many of the input files. That
12858 # means that there will be an extra null field generated by the split
12859 # below, which we ignore and hence is not an error.
12862 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
12864 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12865 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12866 $file->carp_bad_line('Extra fields');
12871 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
12876 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12877 # I are all full foldings; S is single-char. For S, there is always
12878 # an F entry, so we must allow multiple values for the same code
12879 # point. Fortunately this table doesn't need further manipulation
12880 # which would preclude using multiple-values. The S is now included
12881 # so that _swash_inversion_hash() is able to construct closures
12882 # without having to worry about F mappings.
12883 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12884 $_ = "$range; Case_Folding; "
12885 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12889 $file->carp_bad_line('Expecting C F I S or T in second field');
12892 # C and S are simple foldings, but simple case folding is not needed
12893 # unless we explicitly want its map table output.
12894 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12895 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12901 } # End case fold closure
12903 sub filter_jamo_line {
12904 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
12905 # from this file that is used in generating the Name property for Jamo
12906 # code points. But, it also is used to convert early versions' syntax
12907 # into the modern form. Here are two examples:
12908 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
12909 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
12911 # The input is $_, the output is $_ filtered.
12913 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12915 # Let the caller handle unexpected input. In earlier versions, there was
12916 # a third field which is supposed to be a comment, but did not have a '#'
12918 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12920 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
12923 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
12924 $fields[1] = 'R' if $fields[0] eq '1105';
12926 # Add to structure so can generate Names from it.
12927 my $cp = hex $fields[0];
12928 my $short_name = $fields[1];
12929 $Jamo{$cp} = $short_name;
12930 if ($cp <= $LBase + $LCount) {
12931 $Jamo_L{$short_name} = $cp - $LBase;
12933 elsif ($cp <= $VBase + $VCount) {
12934 $Jamo_V{$short_name} = $cp - $VBase;
12936 elsif ($cp <= $TBase + $TCount) {
12937 $Jamo_T{$short_name} = $cp - $TBase;
12940 Carp::my_carp_bug("Unexpected Jamo code point in $_");
12944 # Reassemble using just the first two fields to look like a typical
12945 # property file line
12946 $_ = "$fields[0]; $fields[1]";
12951 sub register_fraction($) {
12952 # This registers the input rational number so that it can be passed on to
12953 # utf8_heavy.pl, both in rational and floating forms.
12955 my $rational = shift;
12957 my $float = eval $rational;
12958 $float = sprintf "%.*e", $E_FLOAT_PRECISION, $float;
12959 if ( defined $nv_floating_to_rational{$float}
12960 && $nv_floating_to_rational{$float} ne $rational)
12962 die Carp::my_carp_bug("Both '$rational' and"
12963 . " '$nv_floating_to_rational{$float}' evaluate to"
12964 . " the same floating point number."
12965 . " \$E_FLOAT_PRECISION must be increased");
12967 $nv_floating_to_rational{$float} = $rational;
12971 sub gcd($$) { # Greatest-common-divisor; from
12972 # http://en.wikipedia.org/wiki/Euclidean_algorithm
12985 sub reduce_fraction($) {
12986 my $fraction_ref = shift;
12988 # Reduce a fraction to lowest terms. The Unicode data may be reducible,
12989 # hence this is needed. The argument is a reference to the
12990 # string denoting the fraction, which must be of the form:
12991 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12992 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged");
12997 my $numerator = $2;
12998 my $denominator = $3;
13002 # Find greatest common divisor
13003 my $gcd = gcd($numerator, $denominator);
13005 # And reduce using the gcd.
13007 $numerator /= $gcd;
13008 $denominator /= $gcd;
13009 $$fraction_ref = "$sign$numerator/$denominator";
13015 sub filter_numeric_value_line {
13016 # DNumValues contains lines of a different syntax than the typical
13018 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
13020 # This routine transforms $_ containing the anomalous syntax to the
13021 # typical, by filtering out the extra columns, and convert early version
13022 # decimal numbers to strings that look like rational numbers.
13025 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13027 # Starting in 5.1, there is a rational field. Just use that, omitting the
13028 # extra columns. Otherwise convert the decimal number in the second field
13029 # to a rational, and omit extraneous columns.
13030 my @fields = split /\s*;\s*/, $_, -1;
13033 if ($v_version ge v5.1.0) {
13034 if (@fields != 4) {
13035 $file->carp_bad_line('Not 4 semi-colon separated fields');
13039 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
13040 $rational = $fields[3];
13042 $_ = join '; ', @fields[ 0, 3 ];
13046 # Here, is an older Unicode file, which has decimal numbers instead of
13047 # rationals in it. Use the fraction to calculate the denominator and
13048 # convert to rational.
13050 if (@fields != 2 && @fields != 3) {
13051 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
13056 my $codepoints = $fields[0];
13057 my $decimal = $fields[1];
13058 if ($decimal =~ s/\.0+$//) {
13060 # Anything ending with a decimal followed by nothing but 0's is an
13062 $_ = "$codepoints; $decimal";
13063 $rational = $decimal;
13068 if ($decimal =~ /\.50*$/) {
13072 # Here have the hardcoded repeating decimals in the fraction, and
13073 # the denominator they imply. There were only a few denominators
13074 # in the older Unicode versions of this file which this code
13075 # handles, so it is easy to convert them.
13077 # The 4 is because of a round-off error in the Unicode 3.2 files
13078 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
13081 elsif ($decimal =~ /\.[27]50*$/) {
13084 elsif ($decimal =~ /\.[2468]0*$/) {
13087 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
13090 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
13093 if ($denominator) {
13094 my $sign = ($decimal < 0) ? "-" : "";
13095 my $numerator = int((abs($decimal) * $denominator) + .5);
13096 $rational = "$sign$numerator/$denominator";
13097 $_ = "$codepoints; $rational";
13100 $file->carp_bad_line("Can't cope with number '$decimal'.");
13107 register_fraction($rational) if $rational =~ qr{/};
13112 my %unihan_properties;
13114 sub construct_unihan {
13116 my $file_object = shift;
13118 return unless file_exists($file_object->file);
13120 if ($v_version lt v4.0.0) {
13121 push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
13122 push @cjk_property_values, split "\n", <<'END';
13123 # @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
13127 if ($v_version ge v3.0.0) {
13128 push @cjk_properties, split "\n", <<'END';
13129 cjkIRG_GSource; kIRG_GSource
13130 cjkIRG_JSource; kIRG_JSource
13131 cjkIRG_KSource; kIRG_KSource
13132 cjkIRG_TSource; kIRG_TSource
13133 cjkIRG_VSource; kIRG_VSource
13135 push @cjk_property_values, split "\n", <<'END';
13136 # @missing: 0000..10FFFF; cjkIRG_GSource; <none>
13137 # @missing: 0000..10FFFF; cjkIRG_JSource; <none>
13138 # @missing: 0000..10FFFF; cjkIRG_KSource; <none>
13139 # @missing: 0000..10FFFF; cjkIRG_TSource; <none>
13140 # @missing: 0000..10FFFF; cjkIRG_VSource; <none>
13143 if ($v_version ge v3.1.0) {
13144 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
13145 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
13147 if ($v_version ge v3.1.1) {
13148 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
13149 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
13151 if ($v_version ge v3.2.0) {
13152 push @cjk_properties, split "\n", <<'END';
13153 cjkAccountingNumeric; kAccountingNumeric
13154 cjkCompatibilityVariant; kCompatibilityVariant
13155 cjkOtherNumeric; kOtherNumeric
13156 cjkPrimaryNumeric; kPrimaryNumeric
13158 push @cjk_property_values, split "\n", <<'END';
13159 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
13160 # @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
13161 # @missing: 0000..10FFFF; cjkOtherNumeric; NaN
13162 # @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
13165 if ($v_version gt v4.0.0) {
13166 push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
13167 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
13170 if ($v_version ge v4.1.0) {
13171 push @cjk_properties, 'cjkIICore ; kIICore';
13172 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
13177 # Do any special setup for Unihan properties.
13179 # This property gives the wrong computed type, so override.
13180 my $usource = property_ref('kIRG_USource');
13181 $usource->set_type($STRING) if defined $usource;
13183 # This property is to be considered binary (it says so in
13184 # http://www.unicode.org/reports/tr38/)
13185 my $iicore = property_ref('kIICore');
13186 if (defined $iicore) {
13187 $iicore->set_type($FORCED_BINARY);
13188 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
13190 # Unicode doesn't include the maps for this property, so don't
13191 # warn that they are missing.
13192 $iicore->set_pre_declared_maps(0);
13193 $iicore->add_comment(join_lines( <<END
13194 This property contains string values, but any non-empty ones are considered to
13195 be 'core', so Perl creates tables for both: 1) its string values, plus 2)
13196 tables so that \\p{kIICore} matches any code point which has a non-empty
13197 value for this property.
13205 sub filter_unihan_line {
13206 # Change unihan db lines to look like the others in the db. Here is
13208 # U+341C kCangjie IEKN
13210 # Tabs are used instead of semi-colons to separate fields; therefore
13211 # they may have semi-colons embedded in them. Change these to periods
13212 # so won't screw up the rest of the code.
13215 # Remove lines that don't look like ones we accept.
13216 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
13221 # Extract the property, and save a reference to its object.
13223 if (! exists $unihan_properties{$property}) {
13224 $unihan_properties{$property} = property_ref($property);
13227 # Don't do anything unless the property is one we're handling, which
13228 # we determine by seeing if there is an object defined for it or not
13229 if (! defined $unihan_properties{$property}) {
13234 # Convert the tab separators to our standard semi-colons, and convert
13235 # the U+HHHH notation to the rest of the standard's HHHH
13237 s/\b U \+ (?= $code_point_re )//xg;
13239 #local $to_trace = 1 if main::DEBUG;
13240 trace $_ if main::DEBUG && $to_trace;
13246 sub filter_blocks_lines {
13247 # In the Blocks.txt file, the names of the blocks don't quite match the
13248 # names given in PropertyValueAliases.txt, so this changes them so they
13249 # do match: Blanks and hyphens are changed into underscores. Also makes
13250 # early release versions look like later ones
13252 # $_ is transformed to the correct value.
13255 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13257 if ($v_version lt v3.2.0) {
13258 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
13263 # Old versions used a different syntax to mark the range.
13264 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
13267 my @fields = split /\s*;\s*/, $_, -1;
13268 if (@fields != 2) {
13269 $file->carp_bad_line("Expecting exactly two fields");
13274 # Change hyphens and blanks in the block name field only
13275 $fields[1] =~ s/[ -]/_/g;
13276 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
13278 $_ = join("; ", @fields);
13283 my $current_property;
13285 sub filter_old_style_proplist {
13286 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
13287 # was in a completely different syntax. Ken Whistler of Unicode says
13288 # that it was something he used as an aid for his own purposes, but
13289 # was never an official part of the standard. Many of the properties
13290 # in it were incorporated into the later PropList.txt, but some were
13291 # not. This program uses this early file to generate property tables
13292 # that are otherwise not accessible in the early UCD's. It does this
13293 # for the ones that eventually became official, and don't appear to be
13294 # too different in their contents from the later official version, and
13295 # throws away the rest. It could be argued that the ones it generates
13296 # were probably not really official at that time, so should be
13297 # ignored. You can easily modify things to skip all of them by
13298 # changing this function to just set $_ to "", and return; and to skip
13299 # certain of them by by simply removing their declarations from
13300 # get_old_property_aliases().
13302 # Here is a list of all the ones that are thrown away:
13303 # Alphabetic The definitions for this are very
13304 # defective, so better to not mislead
13305 # people into thinking it works.
13306 # Instead the Perl extension of the
13307 # same name is constructed from first
13309 # Bidi=* duplicates UnicodeData.txt
13310 # Combining never made into official property;
13312 # Composite never made into official property.
13313 # Currency Symbol duplicates UnicodeData.txt: gc=sc
13314 # Decimal Digit duplicates UnicodeData.txt: gc=nd
13315 # Delimiter never made into official property;
13317 # Format Control never made into official property;
13319 # High Surrogate duplicates Blocks.txt
13320 # Ignorable Control never made into official property;
13322 # ISO Control duplicates UnicodeData.txt: gc=cc
13323 # Left of Pair never made into official property;
13324 # Line Separator duplicates UnicodeData.txt: gc=zl
13325 # Low Surrogate duplicates Blocks.txt
13326 # Non-break was actually listed as a property
13327 # in 3.2, but without any code
13328 # points. Unicode denies that this
13329 # was ever an official property
13330 # Non-spacing duplicate UnicodeData.txt: gc=mn
13331 # Numeric duplicates UnicodeData.txt: gc=cc
13332 # Paired Punctuation never made into official property;
13333 # appears to be gc=ps + gc=pe
13334 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
13335 # Private Use duplicates UnicodeData.txt: gc=co
13336 # Private Use High Surrogate duplicates Blocks.txt
13337 # Punctuation duplicates UnicodeData.txt: gc=p
13338 # Space different definition than eventual
13340 # Titlecase duplicates UnicodeData.txt: gc=lt
13341 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
13342 # Zero-width never made into official property;
13344 # Most of the properties have the same names in this file as in later
13345 # versions, but a couple do not.
13347 # This subroutine filters $_, converting it from the old style into
13348 # the new style. Here's a sample of the old-style
13350 # *******************************************
13352 # Property dump for: 0x100000A0 (Join Control)
13354 # 200C..200D (2 chars)
13356 # In the example, the property is "Join Control". It is kept in this
13357 # closure between calls to the subroutine. The numbers beginning with
13358 # 0x were internal to Ken's program that generated this file.
13360 # If this line contains the property name, extract it.
13361 if (/^Property dump for: [^(]*\((.*)\)/) {
13364 # Convert white space to underscores.
13367 # Convert the few properties that don't have the same name as
13368 # their modern counterparts
13369 s/Identifier_Part/ID_Continue/
13370 or s/Not_a_Character/NChar/;
13372 # If the name matches an existing property, use it.
13373 if (defined property_ref($_)) {
13374 trace "new property=", $_ if main::DEBUG && $to_trace;
13375 $current_property = $_;
13377 else { # Otherwise discard it
13378 trace "rejected property=", $_ if main::DEBUG && $to_trace;
13379 undef $current_property;
13381 $_ = ""; # The property is saved for the next lines of the
13382 # file, but this defining line is of no further use,
13383 # so clear it so that the caller won't process it
13386 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13388 # Here, the input line isn't a header defining a property for the
13389 # following section, and either we aren't in such a section, or
13390 # the line doesn't look like one that defines the code points in
13391 # such a section. Ignore this line.
13396 # Here, we have a line defining the code points for the current
13397 # stashed property. Anything starting with the first blank is
13398 # extraneous. Otherwise, it should look like a normal range to
13399 # the caller. Append the property name so that it looks just like
13400 # a modern PropList entry.
13403 $_ .= "; $current_property";
13405 trace $_ if main::DEBUG && $to_trace;
13408 } # End closure for old style proplist
13410 sub filter_old_style_normalization_lines {
13411 # For early releases of Unicode, the lines were like:
13412 # 74..2A76 ; NFKD_NO
13413 # For later releases this became:
13414 # 74..2A76 ; NFKD_QC; N
13415 # Filter $_ to look like those in later releases.
13416 # Similarly for MAYBEs
13418 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13420 # Also, the property FC_NFKC was abbreviated to FNC
13425 sub setup_script_extensions {
13426 # The Script_Extensions property starts out with a clone of the Script
13429 $scx = property_ref("Script_Extensions");
13430 return unless defined $scx;
13432 $scx->_set_format($STRING_WHITE_SPACE_LIST);
13433 $scx->initialize($script);
13434 $scx->set_default_map($script->default_map);
13435 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
13436 $scx->add_comment(join_lines( <<END
13437 The values for code points that appear in one script are just the same as for
13438 the 'Script' property. Likewise the values for those that appear in many
13439 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
13440 values of code points that appear in a few scripts are a space separated list
13445 # Initialize scx's tables and the aliases for them to be the same as sc's
13446 foreach my $table ($script->tables) {
13447 my $scx_table = $scx->add_match_table($table->name,
13448 Full_Name => $table->full_name);
13449 foreach my $alias ($table->aliases) {
13450 $scx_table->add_alias($alias->name);
13455 sub filter_script_extensions_line {
13456 # The Scripts file comes with the full name for the scripts; the
13457 # ScriptExtensions, with the short name. The final mapping file is a
13458 # combination of these, and without adjustment, would have inconsistent
13459 # entries. This filters the latter file to convert to full names.
13460 # Entries look like this:
13461 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13463 my @fields = split /\s*;\s*/;
13465 # This script was erroneously omitted in this Unicode version.
13466 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13469 foreach my $short_name (split " ", $fields[1]) {
13470 push @full_names, $script->table($short_name)->full_name;
13472 $fields[1] = join " ", @full_names;
13473 $_ = join "; ", @fields;
13480 # Populates the Hangul Syllable Type property from first principles
13483 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13485 # These few ranges are hard-coded in.
13486 $file->insert_lines(split /\n/, <<'END'
13494 # The Hangul syllables in version 1 are at different code points than
13495 # those that came along starting in version 2, and have different names;
13496 # they comprise about 60% of the code points of the later version.
13497 # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13498 # initial set is a subset of the later version, with different English
13499 # transliterations. I did not see an easy mapping between them. The
13500 # later set includes essentially all possibilities, even ones that aren't
13501 # in modern use (if they ever were), and over 96% of the new ones are type
13502 # LVT. Mathematically, the early set must also contain a preponderance of
13503 # LVT values. In lieu of doing nothing, we just set them all to LVT, and
13504 # expect that this will be right most of the time, which is better than
13505 # not being right at all.
13506 if ($v_version lt v2.0.0) {
13507 my $property = property_ref($file->property);
13508 $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13509 $FIRST_REMOVED_HANGUL_SYLLABLE,
13510 $FINAL_REMOVED_HANGUL_SYLLABLE));
13511 push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13515 # The algorithmically derived syllables are almost all LVT ones, so
13516 # initialize the whole range with that.
13517 $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13518 $SBase, $SBase + $SCount -1);
13520 # Those ones that aren't LVT are LV, and they occur at intervals of
13521 # $TCount code points, starting with the first code point, at $SBase.
13522 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13523 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13531 # Populates the Grapheme Cluster Break property from first principles
13534 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13536 # All these definitions are from
13537 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13538 # from http://www.unicode.org/reports/tr29/tr29-4.html
13540 foreach my $range ($gc->ranges) {
13542 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13544 if ($range->value =~ / ^ M [en] $ /x) {
13545 $file->insert_lines(sprintf "%04X..%04X; Extend",
13546 $range->start, $range->end);
13548 elsif ($range->value =~ / ^ C [cf] $ /x) {
13549 $file->insert_lines(sprintf "%04X..%04X; Control",
13550 $range->start, $range->end);
13553 $file->insert_lines("2028; Control"); # Line Separator
13554 $file->insert_lines("2029; Control"); # Paragraph Separator
13556 $file->insert_lines("000D; CR");
13557 $file->insert_lines("000A; LF");
13559 # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13560 foreach my $code_point ( qw{
13561 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13562 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13565 my $category = $gc->value_of(hex $code_point);
13566 next if ! defined $category || $category eq 'Cn'; # But not if
13567 # unassigned in this
13569 $file->insert_lines("$code_point; Extend");
13572 my $hst = property_ref('Hangul_Syllable_Type');
13573 if ($hst->count > 0) {
13574 foreach my $range ($hst->ranges) {
13575 $file->insert_lines(sprintf "%04X..%04X; %s",
13576 $range->start, $range->end, $range->value);
13580 generate_hst($file);
13583 main::process_generic_property_file($file);
13587 sub fixup_early_perl_name_alias {
13589 # Different versions of Unicode have varying support for the name synonyms
13590 # below. Just include everything. As of 6.1, all these are correct in
13591 # the Unicode-supplied file.
13594 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
13597 # ALERT did not come along until 6.0, at which point it became preferred
13598 # over BELL. By inserting it last in early releases, BELL is preferred
13599 # over it; and vice-vers in 6.0
13600 my $type_for_bell = ($v_version lt v6.0.0)
13603 $file->insert_lines(split /\n/, <<END
13604 0007;BELL; $type_for_bell
13605 000A;LINE FEED (LF);alternate
13606 000C;FORM FEED (FF);alternate
13607 000D;CARRIAGE RETURN (CR);alternate
13608 0085;NEXT LINE (NEL);alternate
13613 # One might think that the the 'Unicode_1_Name' field, could work for most
13614 # of the above names, but sadly that field varies depending on the
13615 # release. Version 1.1.5 had no names for any of the controls; Version
13616 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13617 # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13618 # changed to parenthesized versions like "NEXT LINE" to
13619 # "NEXT LINE (NEL)";
13620 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13621 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13622 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13624 # All these are present in the 6.1 NameAliases.txt
13629 sub filter_later_version_name_alias_line {
13631 # This file has an extra entry per line for the alias type. This is
13632 # handled by creating a compound entry: "$alias: $type"; First, split
13633 # the line into components.
13634 my ($range, $alias, $type, @remainder)
13635 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13637 # This file contains multiple entries for some components, so tell the
13638 # downstream code to allow this in our internal tables; the
13639 # $MULTIPLE_AFTER preserves the input ordering.
13640 $_ = join ";", $range, $CMD_DELIM
13650 sub filter_early_version_name_alias_line {
13652 # Early versions did not have the trailing alias type field; implicitly it
13653 # was 'correction'.
13654 $_ .= "; correction";
13656 filter_later_version_name_alias_line;
13660 sub filter_all_caps_script_names {
13662 # Some early Unicode releases had the script names in all CAPS. This
13663 # converts them to just the first letter of each word being capital.
13665 my ($range, $script, @remainder)
13666 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13667 my @words = split /[_-]/, $script;
13668 for my $word (@words) {
13670 ucfirst(lc($word)) if $word ne 'CJK';
13672 $script = join "_", @words;
13673 $_ = join ";", $range, $script, @remainder;
13676 sub finish_Unicode() {
13677 # This routine should be called after all the Unicode files have been read
13679 # 1) Creates properties that are missing from the version of Unicode being
13680 # compiled, and which, for whatever reason, are needed for the Perl
13681 # core to function properly. These are minimally populated as
13683 # 2) Adds the mappings for code points missing from the files which have
13684 # defaults specified for them.
13685 # 3) At this this point all mappings are known, so it computes the type of
13686 # each property whose type hasn't been determined yet.
13687 # 4) Calculates all the regular expression match tables based on the
13689 # 5) Calculates and adds the tables which are defined by Unicode, but
13690 # which aren't derived by them, and certain derived tables that Perl
13693 # Folding information was introduced later into Unicode data. To get
13694 # Perl's case ignore (/i) to work at all in releases that don't have
13695 # folding, use the best available alternative, which is lower casing.
13696 my $fold = property_ref('Case_Folding');
13697 if ($fold->is_empty) {
13698 $fold->initialize(property_ref('Lowercase_Mapping'));
13699 $fold->add_note(join_lines(<<END
13700 WARNING: This table uses lower case as a substitute for missing fold
13706 # Multiple-character mapping was introduced later into Unicode data, so it
13707 # is by default the simple version. If to output the simple versions and
13708 # not present, just use the regular (which in these Unicode versions is
13709 # the simple as well).
13710 foreach my $map (qw { Uppercase_Mapping
13716 my $comment = <<END;
13718 Note that although the Perl core uses this file, it has the standard values
13719 for code points from U+0000 to U+00FF compiled in, so changing this table will
13720 not change the core's behavior with respect to these code points. Use
13721 Unicode::Casing to override this table.
13723 if ($map eq 'Case_Folding') {
13725 (/i regex matching is not overridable except by using a custom regex engine)
13728 property_ref($map)->add_comment(join_lines($comment));
13729 my $simple = property_ref("Simple_$map");
13730 next if ! $simple->is_empty;
13731 if ($simple->to_output_map) {
13732 $simple->initialize(property_ref($map));
13735 property_ref($map)->set_proxy_for($simple->name);
13739 # For each property, fill in any missing mappings, and calculate the re
13740 # match tables. If a property has more than one missing mapping, the
13741 # default is a reference to a data structure, and may require data from
13742 # other properties to resolve. The sort is used to cause these to be
13743 # processed last, after all the other properties have been calculated.
13744 # (Fortunately, the missing properties so far don't depend on each other.)
13745 foreach my $property
13746 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13749 # $perl has been defined, but isn't one of the Unicode properties that
13750 # need to be finished up.
13751 next if $property == $perl;
13753 # Nor do we need to do anything with properties that aren't going to
13755 next if $property->fate == $SUPPRESSED;
13757 # Handle the properties that have more than one possible default
13758 if (ref $property->default_map) {
13759 my $default_map = $property->default_map;
13761 # These properties have stored in the default_map:
13763 # 1) A default map which applies to all code points in a
13765 # 2) an expression which will evaluate to the list of code
13766 # points in that class
13768 # 3) the default map which applies to every other missing code
13771 # Go through each list.
13772 while (my ($default, $eval) = $default_map->get_next_defaults) {
13774 # Get the class list, and intersect it with all the so-far
13775 # unspecified code points yielding all the code points
13776 # in the class that haven't been specified.
13777 my $list = eval $eval;
13779 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13783 # Narrow down the list to just those code points we don't have
13785 $list = $list & $property->inverse_list;
13787 # Add mappings to the property for each code point in the list
13788 foreach my $range ($list->ranges) {
13789 $property->add_map($range->start, $range->end, $default,
13790 Replace => $CROAK);
13794 # All remaining code points have the other mapping. Set that up
13795 # so the normal single-default mapping code will work on them
13796 $property->set_default_map($default_map->other_default);
13798 # And fall through to do that
13801 # We should have enough data now to compute the type of the property.
13802 my $property_name = $property->name;
13803 $property->compute_type;
13804 my $property_type = $property->type;
13806 next if ! $property->to_create_match_tables;
13808 # Here want to create match tables for this property
13810 # The Unicode db always (so far, and they claim into the future) have
13811 # the default for missing entries in binary properties be 'N' (unless
13812 # there is a '@missing' line that specifies otherwise)
13813 if (! defined $property->default_map) {
13814 if ($property_type == $BINARY) {
13815 $property->set_default_map('N');
13817 elsif ($property_type == $ENUM) {
13818 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one");
13819 $property->set_default_map('XXX This makes sure there is a default map');
13823 # Add any remaining code points to the mapping, using the default for
13824 # missing code points.
13826 my $default_map = $property->default_map;
13827 if ($property_type == $FORCED_BINARY) {
13829 # A forced binary property creates a 'Y' table that matches all
13830 # non-default values. The actual string values are also written out
13831 # as a map table. (The default value will almost certainly be the
13832 # empty string, so the pod glosses over the distinction, and just
13833 # talks about empty vs non-empty.)
13834 my $yes = $property->table("Y");
13835 foreach my $range ($property->ranges) {
13836 next if $range->value eq $default_map;
13837 $yes->add_range($range->start, $range->end);
13839 $property->table("N")->set_complement($yes);
13842 if (defined $default_map) {
13844 # Make sure there is a match table for the default
13845 if (! defined ($default_table = $property->table($default_map)))
13847 $default_table = $property->add_match_table($default_map);
13850 # And, if the property is binary, the default table will just
13851 # be the complement of the other table.
13852 if ($property_type == $BINARY) {
13853 my $non_default_table;
13855 # Find the non-default table.
13856 for my $table ($property->tables) {
13857 if ($table == $default_table) {
13858 if ($v_version le v5.0.0) {
13859 $table->add_alias($_) for qw(N No F False);
13862 } elsif ($v_version le v5.0.0) {
13863 $table->add_alias($_) for qw(Y Yes T True);
13865 $non_default_table = $table;
13867 $default_table->set_complement($non_default_table);
13871 # This fills in any missing values with the default. It's
13872 # not necessary to do this with binary properties, as the
13873 # default is defined completely in terms of the Y table.
13874 $property->add_map(0, $MAX_WORKING_CODEPOINT,
13875 $default_map, Replace => $NO);
13879 # Have all we need to populate the match tables.
13880 my $maps_should_be_defined = $property->pre_declared_maps;
13881 foreach my $range ($property->ranges) {
13882 my $map = $range->value;
13883 my $table = $property->table($map);
13884 if (! defined $table) {
13886 # Integral and rational property values are not
13887 # necessarily defined in PropValueAliases, but whether all
13888 # the other ones should be depends on the property.
13889 if ($maps_should_be_defined
13890 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13892 Carp::my_carp("Table '$property_name=$map' should "
13893 . "have been defined. Defining it now.")
13895 $table = $property->add_match_table($map);
13898 next if $table->complement != 0; # Don't need to populate these
13899 $table->add_range($range->start, $range->end);
13903 # For Perl 5.6 compatibility, all properties matchable in regexes can
13904 # have an optional 'Is_' prefix. This is now done in utf8_heavy.pl.
13905 # But warn if this creates a conflict with a (new) Unicode property
13906 # name, although it appears that Unicode has made a decision never to
13907 # begin a property name with 'Is_', so this shouldn't happen.
13908 foreach my $alias ($property->aliases) {
13909 my $Is_name = 'Is_' . $alias->name;
13910 if (defined (my $pre_existing = property_ref($Is_name))) {
13911 Carp::my_carp(<<END
13912 There is already an alias named $Is_name (from " . $pre_existing . "), so
13913 creating one for $property won't work. This is bad news. If it is not too
13914 late, get Unicode to back off. Otherwise go back to the old scheme (findable
13915 from the git blame log for this area of the code that suppressed individual
13916 aliases that conflict with the new Unicode names. Proceeding anyway.
13920 } # End of loop through aliases for this property
13921 } # End of loop through all Unicode properties.
13923 # Fill in the mappings that Unicode doesn't completely furnish. First the
13924 # single letter major general categories. If Unicode were to start
13925 # delivering the values, this would be redundant, but better that than to
13926 # try to figure out if should skip and not get it right. Ths could happen
13927 # if a new major category were to be introduced, and the hard-coded test
13928 # wouldn't know about it.
13929 # This routine depends on the standard names for the general categories
13930 # being what it thinks they are, like 'Cn'. The major categories are the
13931 # union of all the general category tables which have the same first
13932 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13933 foreach my $minor_table ($gc->tables) {
13934 my $minor_name = $minor_table->name;
13935 next if length $minor_name == 1;
13936 if (length $minor_name != 2) {
13937 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
13941 my $major_name = uc(substr($minor_name, 0, 1));
13942 my $major_table = $gc->table($major_name);
13943 $major_table += $minor_table;
13946 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
13947 # defines it as LC)
13948 my $LC = $gc->table('LC');
13949 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
13950 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
13953 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13954 # deliver the correct values in it
13955 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13957 # Lt not in release 1.
13958 if (defined $gc->table('Lt')) {
13959 $LC += $gc->table('Lt');
13960 $gc->table('Lt')->set_caseless_equivalent($LC);
13963 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13965 $gc->table('Ll')->set_caseless_equivalent($LC);
13966 $gc->table('Lu')->set_caseless_equivalent($LC);
13968 # Create digit and case fold tables with the original file names for
13969 # backwards compatibility with applications that read them directly.
13970 my $Digit = Property->new("Legacy_Perl_Decimal_Digit",
13972 File => 'Digit', # Trad. location
13973 Directory => $map_directory,
13975 Replacement_Property => "Perl_Decimal_Digit",
13976 Initialize => property_ref('Perl_Decimal_Digit'),
13978 $Digit->add_comment(join_lines(<<END
13979 This file gives the mapping of all code points which represent a single
13980 decimal digit [0-9] to their respective digits. For example, the code point
13981 U+0031 (an ASCII '1') is mapped to a numeric 1. These code points are those
13982 that have Numeric_Type=Decimal; not special things, like subscripts nor Roman
13987 # Make sure this assumption in perl core code is valid in this Unicode
13988 # release, with known exceptions
13989 foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13990 next if $range->end - $range->start == 9;
13991 next if $range->start == 0x1D7CE; # This whole range was added in 3.1
13992 next if $range->end == 0x19DA && $v_version eq v5.2.0;
13993 next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13994 Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13995 . " decimal digits. Code in regcomp.c assumes it does,"
13996 . " and will have to be fixed. Proceeding anyway.");
13999 Property->new('Legacy_Case_Folding',
14001 Directory => $map_directory,
14002 Default_Map => $CODE_POINT,
14004 Replacement_Property => "Case_Folding",
14005 Format => $HEX_FORMAT,
14006 Initialize => property_ref('cf'),
14009 # The Script_Extensions property started out as a clone of the Script
14010 # property. But processing its data file caused some elements to be
14011 # replaced with different data. (These elements were for the Common and
14012 # Inherited properties.) This data is a qw() list of all the scripts that
14013 # the code points in the given range are in. An example line is:
14014 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
14016 # The code above has created a new match table named "Arab Syrc Thaa"
14017 # which contains 060C. (The cloned table started out with this code point
14018 # mapping to "Common".) Now we add 060C to each of the Arab, Syrc, and
14019 # Thaa match tables. Then we delete the now spurious "Arab Syrc Thaa"
14020 # match table. This is repeated for all these tables and ranges. The map
14021 # data is retained in the map table for reference, but the spurious match
14022 # tables are deleted.
14024 if (defined $scx) {
14025 foreach my $table ($scx->tables) {
14026 next unless $table->name =~ /\s/; # All the new and only the new
14027 # tables have a space in their
14029 my @scripts = split /\s+/, $table->name;
14030 foreach my $script (@scripts) {
14031 my $script_table = $scx->table($script);
14032 $script_table += $table;
14034 $scx->delete_match_table($table);
14037 # Mark the scx table as the parent of the corresponding sc table for
14038 # those which are identical. This causes the pod for the script table
14039 # to refer to the corresponding scx one.
14041 # This has to be in a separate loop from above, so as to wait until
14042 # the tables are stabilized before checking for equivalency.
14043 if (defined $pod_directory) {
14044 foreach my $table ($scx->tables) {
14045 my $plain_sc_equiv = $script->table($table->name);
14046 if ($table->matches_identically_to($plain_sc_equiv)) {
14047 $plain_sc_equiv->set_equivalent_to($table, Related => 1);
14056 sub pre_3_dot_1_Nl () {
14058 # Return a range list for gc=nl for Unicode versions prior to 3.1, which
14059 # is when Unicode's became fully usable. These code points were
14060 # determined by inspection and experimentation. gc=nl is important for
14061 # certain Perl-extension properties that should be available in all
14064 my $Nl = Range_List->new();
14065 if (defined (my $official = $gc->table('Nl'))) {
14069 $Nl->add_range(0x2160, 0x2182);
14070 $Nl->add_range(0x3007, 0x3007);
14071 $Nl->add_range(0x3021, 0x3029);
14073 $Nl->add_range(0xFE20, 0xFE23);
14074 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
14079 sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be
14080 # called before the Cn's are completely filled.
14081 # Works on Unicodes earlier than ones that
14082 # explicitly specify Cn.
14083 return if defined $Assigned;
14085 if (! defined $gc || $gc->is_empty()) {
14086 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
14089 $Assigned = $perl->add_match_table('Assigned',
14090 Description => "All assigned code points",
14092 while (defined (my $range = $gc->each_range())) {
14093 my $standard_value = standardize($range->value);
14094 next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
14095 $Assigned->add_range($range->start, $range->end);
14099 sub calculate_DI() { # Set $DI to a Range_List equivalent to the
14100 # Default_Ignorable_Code_Point property. Works on
14101 # Unicodes earlier than ones that explicitly specify
14103 return if defined $DI;
14105 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
14106 $DI = $di->table('Y');
14109 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
14114 if ($v_version ge v2.0) {
14115 $DI += $gc->table('Cf')
14116 + $gc->table('Cs');
14118 # These are above the Unicode version 1 max
14119 $DI->add_range(0xE0000, 0xE0FFF);
14121 $DI += $gc->table('Cc')
14123 - utf8::unicode_to_native(0x0A) # LINE FEED
14124 - utf8::unicode_to_native(0x0B) # VERTICAL TAB
14126 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
14127 - utf8::unicode_to_native(0x85); # NEL
14131 sub calculate_NChar() { # Create a Perl extension match table which is the
14132 # same as the Noncharacter_Code_Point property, and
14133 # set $NChar to point to it. Works on Unicodes
14134 # earlier than ones that explicitly specify NChar
14135 return if defined $NChar;
14137 $NChar = $perl->add_match_table('_Perl_Nchar',
14138 Perl_Extension => 1,
14139 Fate => $INTERNAL_ONLY);
14140 if (defined (my $off_nchar = property_ref('NChar'))) {
14141 $NChar->initialize($off_nchar->table('Y'));
14144 $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
14145 if ($v_version ge v2.0) { # First release with these nchars
14146 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
14147 $NChar += [ $i .. $i+1 ];
14153 sub handle_compare_versions () {
14154 # This fixes things up for the $compare_versions capability, where we
14155 # compare Unicode version X with version Y (with Y > X), and we are
14156 # running it on the Unicode Data for version Y.
14158 # It works by calculating the code points whose meaning has been specified
14159 # after release X, by using the Age property. The complement of this set
14160 # is the set of code points whose meaning is unchanged between the
14161 # releases. This is the set the program restricts itself to. It includes
14162 # everything whose meaning has been specified by the time version X came
14163 # along, plus those still unassigned by the time of version Y. (We will
14164 # continue to use the word 'assigned' to mean 'meaning has been
14165 # specified', as it's shorter and is accurate in all cases except the
14166 # Noncharacter code points.)
14168 # This function is run after all the properties specified by Unicode have
14169 # been calculated for release Y. This makes sure we get all the nuances
14170 # of Y's rules. (It is done before the Perl extensions are calculated, as
14171 # those are based entirely on the Unicode ones.) But doing it after the
14172 # Unicode table calculations means we have to fix up the Unicode tables.
14173 # We do this by subtracting the code points that have been assigned since
14174 # X (which is actually done by ANDing each table of assigned code points
14175 # with the set of unchanged code points). Most Unicode properties are of
14176 # the form such that all unassigned code points have a default, grab-bag,
14177 # property value which is changed when the code point gets assigned. For
14178 # these, we just remove the changed code points from the table for the
14179 # latter property value, and add them back in to the grab-bag one. A few
14180 # other properties are not entirely of this form and have values for some
14181 # or all unassigned code points that are not the grab-bag one. These have
14182 # to be handled specially, and are hard-coded in to this routine based on
14183 # manual inspection of the Unicode character database. A list of the
14184 # outlier code points is made for each of these properties, and those
14185 # outliers are excluded from adding and removing from tables.
14187 # Note that there are glitches when comparing against Unicode 1.1, as some
14188 # Hangul syllables in it were later ripped out and eventually replaced
14189 # with other things.
14191 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
14193 my $after_first_version = "All matching code points were added after "
14194 . "Unicode $string_compare_versions";
14196 # Calculate the delta as those code points that have been newly assigned
14197 # since the first compare version.
14198 my $delta = Range_List->new();
14199 foreach my $table ($age->tables) {
14201 next if $table == $age->table('Unassigned');
14202 next if version->parse($table->name)
14203 le version->parse($string_compare_versions);
14206 if ($delta->is_empty) {
14207 die ("No changes; perhaps you need a 'DAge.txt' file?");
14210 my $unchanged = ~ $delta;
14212 calculate_Assigned() if ! defined $Assigned;
14213 $Assigned &= $unchanged;
14215 # $Assigned now contains the code points that were assigned as of Unicode
14218 # A block is all or nothing. If nothing is assigned in it, it all goes
14219 # back to the No_Block pool; but if even one code point is assigned, the
14220 # block is retained.
14221 my $no_block = $block->table('No_Block');
14222 foreach my $this_block ($block->tables) {
14223 next if $this_block == $no_block
14224 || ! ($this_block & $Assigned)->is_empty;
14225 $this_block->set_fate($SUPPRESSED, $after_first_version);
14226 foreach my $range ($this_block->ranges) {
14227 $block->replace_map($range->start, $range->end, 'No_Block')
14229 $no_block += $this_block;
14232 my @special_delta_properties; # List of properties that have to be
14233 # handled specially.
14234 my %restricted_delta; # Keys are the entries in
14235 # @special_delta_properties; values
14236 # are the range list of the code points
14237 # that behave normally when they get
14240 # In the next three properties, the Default Ignorable code points are
14245 push @special_delta_properties, property_ref('_Perl_GCB');
14246 $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14248 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
14250 push @special_delta_properties, $cwnfkcc;
14251 $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
14254 calculate_NChar(); # Non-character code points
14255 $NChar &= $unchanged;
14257 # This may have to be updated from time-to-time to get the most accurate
14259 my $default_BC_non_LtoR = Range_List->new(Initialize =>
14260 # These came from the comments in v8.0 DBidiClass.txt
14267 0x1EE00 .. 0x1EEFF,
14272 0x10800 .. 0x10FFF,
14273 0x1E800 .. 0x1EDFF,
14274 0x1EF00 .. 0x1EFFF,
14279 $default_BC_non_LtoR += $DI + $NChar;
14280 push @special_delta_properties, property_ref('BidiClass');
14281 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
14283 if (defined (my $eaw = property_ref('East_Asian_Width'))) {
14285 my $default_EA_width_W = Range_List->new(Initialize =>
14286 # From comments in v8.0 EastAsianWidth.txt
14291 0x20000 .. 0x2A6DF,
14292 0x2A700 .. 0x2B73F,
14293 0x2B740 .. 0x2B81F,
14294 0x2B820 .. 0x2CEAF,
14295 0x2F800 .. 0x2FA1F,
14296 0x20000 .. 0x2FFFD,
14297 0x30000 .. 0x3FFFD,
14300 push @special_delta_properties, $eaw;
14301 $restricted_delta{$special_delta_properties[-1]}
14302 = ~ $default_EA_width_W;
14304 # Line break came along in the same release as East_Asian_Width, and
14305 # the non-grab-bag default set is a superset of the EAW one.
14306 if (defined (my $lb = property_ref('Line_Break'))) {
14307 my $default_LB_non_XX = Range_List->new(Initialize =>
14308 # From comments in v8.0 LineBreak.txt
14309 [ 0x20A0 .. 0x20CF ]);
14310 $default_LB_non_XX += $default_EA_width_W;
14311 push @special_delta_properties, $lb;
14312 $restricted_delta{$special_delta_properties[-1]}
14313 = ~ $default_LB_non_XX;
14317 # Go through every property, skipping those we've already worked on, those
14318 # that are immutable, and the perl ones that will be calculated after this
14319 # routine has done its fixup.
14320 foreach my $property (property_ref('*')) {
14321 next if $property == $perl # Done later in the program
14322 || $property == $block # Done just above
14323 || $property == $DI # Done just above
14324 || $property == $NChar # Done just above
14326 # The next two are invariant across Unicode versions
14327 || $property == property_ref('Pattern_Syntax')
14328 || $property == property_ref('Pattern_White_Space');
14330 # Find the grab-bag value.
14331 my $default_map = $property->default_map;
14333 if (! $property->to_create_match_tables) {
14335 # Here there aren't any match tables. So far, all such properties
14336 # have a default map, and don't require special handling. Just
14337 # change each newly assigned code point back to the default map,
14338 # as if they were unassigned.
14339 foreach my $range ($delta->ranges) {
14340 $property->add_map($range->start,
14343 Replace => $UNCONDITIONALLY);
14346 else { # Here there are match tables. Find the one (if any) for the
14347 # grab-bag value that unassigned code points go to.
14349 if (defined $default_map) {
14350 $default_table = $property->table($default_map);
14353 # If some code points don't go back to the the grab-bag when they
14354 # are considered unassigned, exclude them from the list that does
14356 my $this_delta = $delta;
14357 my $this_unchanged = $unchanged;
14358 if (grep { $_ == $property } @special_delta_properties) {
14359 $this_delta = $delta & $restricted_delta{$property};
14360 $this_unchanged = ~ $this_delta;
14363 # Fix up each match table for this property.
14364 foreach my $table ($property->tables) {
14365 if (defined $default_table && $table == $default_table) {
14367 # The code points assigned after release X (the ones we
14368 # are excluding in this routine) go back on to the default
14369 # (grab-bag) table. However, some of these tables don't
14370 # actually exist, but are specified solely by the other
14371 # tables. (In a binary property, we don't need to
14372 # actually have an 'N' table, as it's just the complement
14373 # of the 'Y' table.) Such tables will be locked, so just
14375 $table += $this_delta unless $table->locked;
14379 # Here the table is not for the default value. We need to
14380 # subtract the code points we are ignoring for this
14381 # comparison (the deltas) from it. But if the table
14382 # started out with nothing, no need to exclude anything,
14383 # and want to skip it here anyway, so it gets listed
14384 # properly in the pod.
14385 next if $table->is_empty;
14387 # Save the deltas for later, before we do the subtraction
14388 my $deltas = $table & $this_delta;
14390 $table &= $this_unchanged;
14392 # Suppress the table if the subtraction left it with
14394 if ($table->is_empty) {
14395 if ($property->type == $BINARY) {
14396 push @tables_that_may_be_empty, $table->complete_name;
14399 $table->set_fate($SUPPRESSED, $after_first_version);
14403 # Now we add the removed code points to the property's
14404 # map, as they should now map to the grab-bag default
14405 # property (which they did in the first comparison
14406 # version). But we don't have to do this if the map is
14407 # only for internal use.
14408 if (defined $default_map && $property->to_output_map) {
14410 # The gc property has pseudo property values whose names
14411 # have length 1. These are the union of all the
14412 # property values whose name is longer than 1 and
14413 # whose first letter is all the same. The replacement
14414 # is done once for the longer-named tables.
14415 next if $property == $gc && length $table->name == 1;
14417 foreach my $range ($deltas->ranges) {
14418 $property->add_map($range->start,
14421 Replace => $UNCONDITIONALLY);
14429 # The above code doesn't work on 'gc=C', as it is a superset of the default
14430 # ('Cn') table. It's easiest to just special case it here.
14431 my $C = $gc->table('C');
14432 $C += $gc->table('Cn');
14437 sub compile_perl() {
14438 # Create perl-defined tables. Almost all are part of the pseudo-property
14439 # named 'perl' internally to this program. Many of these are recommended
14440 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14441 # on those found there.
14442 # Almost all of these are equivalent to some Unicode property.
14443 # A number of these properties have equivalents restricted to the ASCII
14444 # range, with their names prefaced by 'Posix', to signify that these match
14445 # what the Posix standard says they should match. A couple are
14446 # effectively this, but the name doesn't have 'Posix' in it because there
14447 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
14448 # to the full Unicode range, by our guesses as to what is appropriate.
14450 # 'All' is all code points. As an error check, instead of just setting it
14451 # to be that, construct it to be the union of all the major categories
14452 $All = $perl->add_match_table('All',
14454 => "All code points, including those above Unicode. Same as qr/./s",
14457 foreach my $major_table ($gc->tables) {
14459 # Major categories are the ones with single letter names.
14460 next if length($major_table->name) != 1;
14462 $All += $major_table;
14465 if ($All->max != $MAX_WORKING_CODEPOINT) {
14466 Carp::my_carp_bug("Generated highest code point ("
14467 . sprintf("%X", $All->max)
14468 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14470 if ($All->range_count != 1 || $All->min != 0) {
14471 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14474 my $Any = $perl->add_match_table('Any',
14475 Description => "All Unicode code points");
14476 $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14477 $Any->add_alias('Unicode');
14479 calculate_Assigned();
14481 my $ASCII = $perl->add_match_table('ASCII');
14482 if (defined $block) { # This is equivalent to the block if have it.
14483 my $Unicode_ASCII = $block->table('Basic_Latin');
14484 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14485 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14489 # Very early releases didn't have blocks, so initialize ASCII ourselves if
14491 if ($ASCII->is_empty) {
14492 if (! NON_ASCII_PLATFORM) {
14493 $ASCII->add_range(0, 127);
14496 for my $i (0 .. 127) {
14497 $ASCII->add_range(utf8::unicode_to_native($i),
14498 utf8::unicode_to_native($i));
14503 # Get the best available case definitions. Early Unicode versions didn't
14504 # have Uppercase and Lowercase defined, so use the general category
14505 # instead for them, modified by hard-coding in the code points each is
14507 my $Lower = $perl->add_match_table('XPosixLower');
14508 my $Unicode_Lower = property_ref('Lowercase');
14509 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14510 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14514 $Lower += $gc->table('Lowercase_Letter');
14516 # There are quite a few code points in Lower, that aren't in gc=lc,
14517 # and not all are in all releases.
14518 my $temp = Range_List->new(Initialize => [
14519 utf8::unicode_to_native(0xAA),
14520 utf8::unicode_to_native(0xBA),
14538 $Lower += $temp & $Assigned;
14540 my $Posix_Lower = $perl->add_match_table("PosixLower",
14541 Initialize => $Lower & $ASCII,
14544 my $Upper = $perl->add_match_table("XPosixUpper");
14545 my $Unicode_Upper = property_ref('Uppercase');
14546 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14547 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14551 # Unlike Lower, there are only two ranges in Upper that aren't in
14552 # gc=Lu, and all code points were assigned in all releases.
14553 $Upper += $gc->table('Uppercase_Letter');
14554 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
14555 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
14557 my $Posix_Upper = $perl->add_match_table("PosixUpper",
14558 Initialize => $Upper & $ASCII,
14561 # Earliest releases didn't have title case. Initialize it to empty if not
14562 # otherwise present
14563 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14564 Description => '(= \p{Gc=Lt})');
14565 my $lt = $gc->table('Lt');
14567 # Earlier versions of mktables had this related to $lt since they have
14568 # identical code points, but their caseless equivalents are not the same,
14569 # one being 'Cased' and the other being 'LC', and so now must be kept as
14570 # separate entities.
14575 push @tables_that_may_be_empty, $Title->complete_name;
14578 my $Unicode_Cased = property_ref('Cased');
14579 if (defined $Unicode_Cased) {
14580 my $yes = $Unicode_Cased->table('Y');
14581 my $no = $Unicode_Cased->table('N');
14582 $Title->set_caseless_equivalent($yes);
14583 if (defined $Unicode_Upper) {
14584 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14585 $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14587 $Upper->set_caseless_equivalent($yes);
14588 if (defined $Unicode_Lower) {
14589 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14590 $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14592 $Lower->set_caseless_equivalent($yes);
14595 # If this Unicode version doesn't have Cased, set up the Perl
14596 # extension from first principles. From Unicode 5.1: Definition D120:
14597 # A character C is defined to be cased if and only if C has the
14598 # Lowercase or Uppercase property or has a General_Category value of
14599 # Titlecase_Letter.
14600 my $cased = $perl->add_match_table('Cased',
14601 Initialize => $Lower + $Upper + $Title,
14602 Description => 'Uppercase or Lowercase or Titlecase',
14604 # $notcased is purely for the caseless equivalents below
14605 my $notcased = $perl->add_match_table('_Not_Cased',
14606 Initialize => ~ $cased,
14607 Fate => $INTERNAL_ONLY,
14608 Description => 'All not-cased code points');
14609 $Title->set_caseless_equivalent($cased);
14610 if (defined $Unicode_Upper) {
14611 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14612 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14614 $Upper->set_caseless_equivalent($cased);
14615 if (defined $Unicode_Lower) {
14616 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14617 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14619 $Lower->set_caseless_equivalent($cased);
14622 # The remaining perl defined tables are mostly based on Unicode TR 18,
14623 # "Annex C: Compatibility Properties". All of these have two versions,
14624 # one whose name generally begins with Posix that is posix-compliant, and
14625 # one that matches Unicode characters beyond the Posix, ASCII range
14627 my $Alpha = $perl->add_match_table('XPosixAlpha');
14629 # Alphabetic was not present in early releases
14630 my $Alphabetic = property_ref('Alphabetic');
14631 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14632 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14636 # The Alphabetic property doesn't exist for early releases, so
14637 # generate it. The actual definition, in 5.2 terms is:
14639 # gc=L + gc=Nl + Other_Alphabetic
14641 # Other_Alphabetic is also not defined in these early releases, but it
14642 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14643 # those last two as well, then subtract the relatively few of them that
14644 # shouldn't have been added. (The gc=So range is the circled capital
14645 # Latin characters. Early releases mistakenly didn't also include the
14646 # lower-case versions of these characters, and so we don't either, to
14647 # maintain consistency with those releases that first had this
14649 $Alpha->initialize($gc->table('Letter')
14654 $Alpha->add_range(0x24D0, 0x24E9); # gc=So
14655 foreach my $range ( [ 0x0300, 0x0344 ],
14656 [ 0x0346, 0x034E ],
14657 [ 0x0360, 0x0362 ],
14658 [ 0x0483, 0x0486 ],
14659 [ 0x0591, 0x05AF ],
14660 [ 0x06DF, 0x06E0 ],
14661 [ 0x06EA, 0x06EC ],
14662 [ 0x0740, 0x074A ],
14665 [ 0x0951, 0x0954 ],
14679 [ 0x0E47, 0x0E4C ],
14681 [ 0x0EC8, 0x0ECC ],
14682 [ 0x0F18, 0x0F19 ],
14686 [ 0x0F3E, 0x0F3F ],
14687 [ 0x0F82, 0x0F84 ],
14688 [ 0x0F86, 0x0F87 ],
14692 [ 0x17C9, 0x17D3 ],
14693 [ 0x20D0, 0x20DC ],
14695 [ 0x302A, 0x302F ],
14696 [ 0x3099, 0x309A ],
14697 [ 0xFE20, 0xFE23 ],
14698 [ 0x1D165, 0x1D169 ],
14699 [ 0x1D16D, 0x1D172 ],
14700 [ 0x1D17B, 0x1D182 ],
14701 [ 0x1D185, 0x1D18B ],
14702 [ 0x1D1AA, 0x1D1AD ],
14705 $Alpha->delete_range($range->[0], $range->[1]);
14708 $Alpha->delete_range($range, $range);
14711 $Alpha->add_description('Alphabetic');
14712 $Alpha->add_alias('Alphabetic');
14714 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14715 Initialize => $Alpha & $ASCII,
14717 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14718 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14720 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14721 Description => 'Alphabetic and (decimal) Numeric',
14722 Initialize => $Alpha + $gc->table('Decimal_Number'),
14724 $perl->add_match_table("PosixAlnum",
14725 Initialize => $Alnum & $ASCII,
14728 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14729 Description => '\w, including beyond ASCII;'
14730 . ' = \p{Alnum} + \pM + \p{Pc}'
14731 . ' + \p{Join_Control}',
14732 Initialize => $Alnum + $gc->table('Mark'),
14734 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14739 $Word += ord('_'); # Make sure this is a $Word
14741 my $JC = property_ref('Join_Control'); # Wasn't in release 1
14743 $Word += $JC->table('Y');
14746 $Word += 0x200C + 0x200D;
14749 # This is a Perl extension, so the name doesn't begin with Posix.
14750 my $PerlWord = $perl->add_match_table('PosixWord',
14751 Description => '\w, restricted to ASCII',
14752 Initialize => $Word & $ASCII,
14754 $PerlWord->add_alias('PerlWord');
14756 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14757 Description => '\h, Horizontal white space',
14759 # 200B is Zero Width Space which is for line
14760 # break control, and was listed as
14761 # Space_Separator in early releases
14762 Initialize => $gc->table('Space_Separator')
14766 $Blank->add_alias('HorizSpace'); # Another name for it.
14767 $perl->add_match_table("PosixBlank",
14768 Initialize => $Blank & $ASCII,
14771 my $VertSpace = $perl->add_match_table('VertSpace',
14772 Description => '\v',
14774 $gc->table('Line_Separator')
14775 + $gc->table('Paragraph_Separator')
14776 + utf8::unicode_to_native(0x0A) # LINE FEED
14777 + utf8::unicode_to_native(0x0B) # VERTICAL TAB
14779 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
14780 + utf8::unicode_to_native(0x85) # NEL
14782 # No Posix equivalent for vertical space
14784 my $Space = $perl->add_match_table('XPosixSpace',
14785 Description => '\s including beyond ASCII and vertical tab',
14786 Initialize => $Blank + $VertSpace,
14788 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms
14789 $Space->add_alias('SpacePerl');
14790 $Space->add_alias('Space') if $v_version lt v4.1.0;
14792 my $Posix_space = $perl->add_match_table("PosixSpace",
14793 Initialize => $Space & $ASCII,
14795 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14797 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14798 Description => 'Control characters');
14799 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14800 $perl->add_match_table("PosixCntrl",
14801 Description => "ASCII control characters",
14802 Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14803 . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14804 . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14805 . " HT, LF, NAK, NUL, RS, SI, SO,"
14806 . " SOH, STX, SUB, SYN, US, VT",
14807 Initialize => $Cntrl & $ASCII,
14810 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14811 my $Cs = $gc->table('Cs');
14812 if (defined $Cs && ! $Cs->is_empty) {
14813 $perl_surrogate += $Cs;
14816 push @tables_that_may_be_empty, '_Perl_Surrogate';
14819 # $controls is a temporary used to construct Graph.
14820 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14821 + $gc->table('Control')
14822 + $perl_surrogate);
14824 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
14825 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14826 Description => 'Characters that are graphical',
14827 Initialize => ~ ($Space + $controls),
14829 $perl->add_match_table("PosixGraph",
14830 Initialize => $Graph & $ASCII,
14833 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14834 Description => 'Characters that are graphical plus space characters (but no controls)',
14835 Initialize => $Blank + $Graph - $gc->table('Control'),
14837 $perl->add_match_table("PosixPrint",
14838 Initialize => $print & $ASCII,
14841 my $Punct = $perl->add_match_table('Punct');
14842 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14844 # \p{punct} doesn't include the symbols, which posix does
14845 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14846 Description => '\p{Punct} + ASCII-range \p{Symbol}',
14847 Initialize => $gc->table('Punctuation')
14848 + ($ASCII & $gc->table('Symbol')),
14849 Perl_Extension => 1
14851 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14852 Initialize => $ASCII & $XPosixPunct,
14855 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14856 Description => '[0-9] + all other decimal digits');
14857 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14858 my $PosixDigit = $perl->add_match_table("PosixDigit",
14859 Initialize => $Digit & $ASCII,
14862 # Hex_Digit was not present in first release
14863 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14864 my $Hex = property_ref('Hex_Digit');
14865 if (defined $Hex && ! $Hex->is_empty) {
14866 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14869 $Xdigit->initialize([ ord('0') .. ord('9'),
14870 ord('A') .. ord('F'),
14871 ord('a') .. ord('f'),
14872 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14875 # AHex was not present in early releases
14876 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14877 my $AHex = property_ref('ASCII_Hex_Digit');
14878 if (defined $AHex && ! $AHex->is_empty) {
14879 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14882 $PosixXDigit->initialize($Xdigit & $ASCII);
14883 $PosixXDigit->add_alias('AHex');
14884 $PosixXDigit->add_alias('Ascii_Hex_Digit');
14887 my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14888 Description => "Code points that particpate in some fold",
14890 my $loc_problem_folds = $perl->add_match_table(
14891 "_Perl_Problematic_Locale_Folds",
14893 "Code points that are in some way problematic under locale",
14896 # This allows regexec.c to skip some work when appropriate. Some of the
14897 # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14898 my $loc_problem_folds_start = $perl->add_match_table(
14899 "_Perl_Problematic_Locale_Foldeds_Start",
14901 "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14904 my $cf = property_ref('Case_Folding');
14906 # Every character 0-255 is problematic because what each folds to depends
14907 # on the current locale
14908 $loc_problem_folds->add_range(0, 255);
14909 $loc_problem_folds_start += $loc_problem_folds;
14911 # Also problematic are anything these fold to outside the range. Likely
14912 # forever the only thing folded to by these outside the 0-255 range is the
14913 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14914 # completely general, which should catch any unexpected changes or errors.
14915 # We look at each code point 0-255, and add its fold (including each part
14916 # of a multi-char fold) to the list. See commit message
14917 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14919 foreach my $range ($loc_problem_folds->ranges) {
14920 foreach my $code_point ($range->start .. $range->end) {
14921 my $fold_range = $cf->containing_range($code_point);
14922 next unless defined $fold_range;
14924 # Skip if folds to itself
14925 next if $fold_range->value eq $CODE_POINT;
14927 my @hex_folds = split " ", $fold_range->value;
14928 my $start_cp = $hex_folds[0];
14929 next if $start_cp eq $CODE_POINT;
14930 $start_cp = hex $start_cp;
14931 foreach my $i (0 .. @hex_folds - 1) {
14932 my $cp = $hex_folds[$i];
14933 next if $cp eq $CODE_POINT;
14935 next unless $cp > 255; # Already have the < 256 ones
14937 $loc_problem_folds->add_range($cp, $cp);
14938 $loc_problem_folds_start->add_range($start_cp, $start_cp);
14943 my $folds_to_multi_char = $perl->add_match_table(
14944 "_Perl_Folds_To_Multi_Char",
14946 "Code points whose fold is a string of more than one character",
14948 if ($v_version lt v3.0.1) {
14949 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char';
14952 # Look through all the known folds to populate these tables.
14953 foreach my $range ($cf->ranges) {
14954 next if $range->value eq $CODE_POINT;
14955 my $start = $range->start;
14956 my $end = $range->end;
14957 $any_folds->add_range($start, $end);
14959 my @hex_folds = split " ", $range->value;
14960 if (@hex_folds > 1) { # Is multi-char fold
14961 $folds_to_multi_char->add_range($start, $end);
14964 my $found_locale_problematic = 0;
14966 # Look at each of the folded-to characters...
14967 foreach my $i (0 .. @hex_folds - 1) {
14968 my $cp = hex $hex_folds[$i];
14969 $any_folds->add_range($cp, $cp);
14971 # The fold is problematic if any of the folded-to characters is
14972 # already considered problematic.
14973 if ($loc_problem_folds->contains($cp)) {
14974 $loc_problem_folds->add_range($start, $end);
14975 $found_locale_problematic = 1;
14979 # If this is a problematic fold, add to the start chars the
14980 # folding-from characters and first folded-to character.
14981 if ($found_locale_problematic) {
14982 $loc_problem_folds_start->add_range($start, $end);
14983 my $cp = hex $hex_folds[0];
14984 $loc_problem_folds_start->add_range($cp, $cp);
14988 my $dt = property_ref('Decomposition_Type');
14989 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14990 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14991 Perl_Extension => 1,
14992 Note => 'Union of all non-canonical decompositions',
14995 # For backward compatibility, Perl has its own definition for IDStart.
14996 # It is regular XID_Start plus the underscore, but all characters must be
14997 # Word characters as well
14998 my $XID_Start = property_ref('XID_Start');
14999 my $perl_xids = $perl->add_match_table('_Perl_IDStart',
15000 Perl_Extension => 1,
15001 Fate => $INTERNAL_ONLY,
15002 Initialize => ord('_')
15004 if (defined $XID_Start
15005 || defined ($XID_Start = property_ref('ID_Start')))
15007 $perl_xids += $XID_Start->table('Y');
15010 # For Unicode versions that don't have the property, construct our own
15011 # from first principles. The actual definition is:
15013 # + letter numbers (Nl)
15015 # - Pattern_White_Space
15016 # + stability extensions
15017 # - NKFC modifications
15019 # What we do in the code below is to include the identical code points
15020 # that are in the first release that had Unicode's version of this
15021 # property, essentially extrapolating backwards. There were no
15022 # stability extensions until v4.1, so none are included; likewise in
15023 # no Unicode version so far do subtracting PatSyn and PatWS make any
15024 # difference, so those also are ignored.
15025 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
15027 # We do subtract the NFKC modifications that are in the first version
15028 # that had this property. We don't bother to test if they are in the
15029 # version in question, because if they aren't, the operation is a
15030 # no-op. The NKFC modifications are discussed in
15031 # http://www.unicode.org/reports/tr31/#NFKC_Modifications
15032 foreach my $range ( 0x037A,
15035 [ 0xFC5E, 0xFC63 ],
15036 [ 0xFDFA, 0xFE70 ],
15037 [ 0xFE72, 0xFE76 ],
15042 [ 0xFF9E, 0xFF9F ],
15045 $perl_xids->delete_range($range->[0], $range->[1]);
15048 $perl_xids->delete_range($range, $range);
15053 $perl_xids &= $Word;
15055 my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
15056 Perl_Extension => 1,
15057 Fate => $INTERNAL_ONLY);
15058 my $XIDC = property_ref('XID_Continue');
15060 || defined ($XIDC = property_ref('ID_Continue')))
15062 $perl_xidc += $XIDC->table('Y');
15065 # Similarly, we construct our own XIDC if necessary for early Unicode
15066 # versions. The definition is:
15067 # everything in XIDS
15073 # - Pattern_White_Space
15074 # + stability extensions
15075 # - NFKC modifications
15077 # The same thing applies to this as with XIDS for the PatSyn, PatWS,
15078 # and stability extensions. There is a somewhat different set of NFKC
15079 # mods to remove (and add in this case). The ones below make this
15080 # have identical code points as in the first release that defined it.
15081 $perl_xidc += $perl_xids
15086 + utf8::unicode_to_native(0xB7)
15088 if (defined (my $pc = $gc->table('Pc'))) {
15091 else { # 1.1.5 didn't have Pc, but these should have been in it
15092 $perl_xidc += 0xFF3F;
15093 $perl_xidc->add_range(0x203F, 0x2040);
15094 $perl_xidc->add_range(0xFE33, 0xFE34);
15095 $perl_xidc->add_range(0xFE4D, 0xFE4F);
15098 # Subtract the NFKC mods
15099 foreach my $range ( 0x037A,
15100 [ 0xFC5E, 0xFC63 ],
15101 [ 0xFDFA, 0xFE1F ],
15103 [ 0xFE72, 0xFE76 ],
15110 $perl_xidc->delete_range($range->[0], $range->[1]);
15113 $perl_xidc->delete_range($range, $range);
15118 $perl_xidc &= $Word;
15120 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
15121 Perl_Extension => 1,
15122 Fate => $INTERNAL_ONLY,
15123 Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
15126 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
15127 Perl_Extension => 1,
15128 Fate => $INTERNAL_ONLY,
15129 Initialize => $perl_xidc
15136 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
15138 if (@named_sequences) {
15139 push @composition, 'Named_Sequence';
15140 foreach my $sequence (@named_sequences) {
15141 $perl_charname->add_anomalous_entry($sequence);
15145 my $alias_sentence = "";
15147 my $alias = property_ref('_Perl_Name_Alias');
15148 $perl_charname->set_proxy_for('_Perl_Name_Alias');
15150 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go
15151 # with respect to any existing entry depends on the entry type.
15152 # Corrections go before said entry, as they should be returned in
15153 # preference over the existing entry. (A correction to a correction
15154 # should be later in the _Perl_Name_Alias table, so it will correctly
15155 # precede the erroneous correction in Perl_Charnames.)
15157 # Abbreviations go after everything else, so they are saved temporarily in
15158 # a hash for later.
15160 # Everything else is added added afterwards, which preserves the input
15163 foreach my $range ($alias->ranges) {
15164 next if $range->value eq "";
15165 my $code_point = $range->start;
15166 if ($code_point != $range->end) {
15167 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
15169 my ($value, $type) = split ': ', $range->value;
15171 if ($type eq 'correction') {
15172 $replace_type = $MULTIPLE_BEFORE;
15174 elsif ($type eq 'abbreviation') {
15177 $abbreviations{$value} = $code_point;
15181 $replace_type = $MULTIPLE_AFTER;
15184 # Actually add; before or after current entry(ies) as determined
15187 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
15189 $alias_sentence = <<END;
15190 The _Perl_Name_Alias property adds duplicate code point entries that are
15191 alternatives to the original name. If an addition is a corrected
15192 name, it will be physically first in the table. The original (less correct,
15193 but still valid) name will be next; then any alternatives, in no particular
15194 order; and finally any abbreviations, again in no particular order.
15197 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
15198 # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
15199 # so should be first in the file; the other names have precedence starting
15201 my $before_or_after = ($v_version lt v6.1.0)
15205 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
15206 my $code_point = $range->start;
15207 my $unicode_1_value = $range->value;
15208 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
15210 if ($code_point != $range->end) {
15211 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
15214 # To handle EBCDIC, we don't hard code in the code points of the
15215 # controls; instead realizing that all of them are below 256.
15216 last if $code_point > 255;
15218 # We only add in the controls.
15219 next if $gc->value_of($code_point) ne 'Cc';
15221 # We reject this Unicode1 name for later Perls, as it is used for
15222 # another code point
15223 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
15225 # This won't add an exact duplicate.
15226 $perl_charname->add_duplicate($code_point, $unicode_1_value,
15227 Replace => $before_or_after);
15230 # Now that have everything added, add in abbreviations after
15231 # everything else. Sort so results don't change between runs of this
15233 foreach my $value (sort keys %abbreviations) {
15234 $perl_charname->add_duplicate($abbreviations{$value}, $value,
15235 Replace => $MULTIPLE_AFTER);
15239 if (@composition <= 2) { # Always at least 2
15240 $comment = join " and ", @composition;
15243 $comment = join ", ", @composition[0 .. scalar @composition - 2];
15244 $comment .= ", and $composition[-1]";
15247 $perl_charname->add_comment(join_lines( <<END
15248 This file is for charnames.pm. It is the union of the $comment properties.
15249 Unicode_1_Name entries are used only for nameless code points in the Name
15252 This file doesn't include the algorithmically determinable names. For those,
15253 use 'unicore/Name.pm'
15256 property_ref('Name')->add_comment(join_lines( <<END
15257 This file doesn't include the algorithmically determinable names. For those,
15258 use 'unicore/Name.pm'
15262 # Construct the Present_In property from the Age property.
15263 if (-e 'DAge.txt' && defined $age) {
15264 my $default_map = $age->default_map;
15265 my $in = Property->new('In',
15266 Default_Map => $default_map,
15267 Full_Name => "Present_In",
15268 Perl_Extension => 1,
15270 Initialize => $age,
15272 $in->add_comment(join_lines(<<END
15273 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
15274 same as for $age, and not for what $in really means. This is because anything
15275 defined in a given release should have multiple values: that release and all
15276 higher ones. But only one value per code point can be represented in a table
15281 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
15282 # lowest numbered (earliest) come first, with the non-numeric one
15284 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15286 : ($b->name !~ /^[\d.]*$/)
15288 : $a->name <=> $b->name
15291 # The Present_In property is the cumulative age properties. The first
15292 # one hence is identical to the first age one.
15293 my $previous_in = $in->add_match_table($first_age->name);
15294 $previous_in->set_equivalent_to($first_age, Related => 1);
15296 my $description_start = "Code point's usage introduced in version ";
15297 $first_age->add_description($description_start . $first_age->name);
15299 # To construct the accumulated values, for each of the age tables
15300 # starting with the 2nd earliest, merge the earliest with it, to get
15301 # all those code points existing in the 2nd earliest. Repeat merging
15302 # the new 2nd earliest with the 3rd earliest to get all those existing
15303 # in the 3rd earliest, and so on.
15304 foreach my $current_age (@rest_ages) {
15305 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
15307 my $current_in = $in->add_match_table(
15308 $current_age->name,
15309 Initialize => $current_age + $previous_in,
15310 Description => $description_start
15311 . $current_age->name
15314 foreach my $alias ($current_age->aliases) {
15315 $current_in->add_alias($alias->name);
15317 $previous_in = $current_in;
15319 # Add clarifying material for the corresponding age file. This is
15320 # in part because of the confusing and contradictory information
15321 # given in the Standard's documentation itself, as of 5.2.
15322 $current_age->add_description(
15323 "Code point's usage was introduced in version "
15324 . $current_age->name);
15325 $current_age->add_note("See also $in");
15329 # And finally the code points whose usages have yet to be decided are
15330 # the same in both properties. Note that permanently unassigned code
15331 # points actually have their usage assigned (as being permanently
15332 # unassigned), so that these tables are not the same as gc=cn.
15333 my $unassigned = $in->add_match_table($default_map);
15334 my $age_default = $age->table($default_map);
15335 $age_default->add_description(<<END
15336 Code point's usage has not been assigned in any Unicode release thus far.
15339 $unassigned->set_equivalent_to($age_default, Related => 1);
15342 my $patws = $perl->add_match_table('_Perl_PatWS',
15343 Perl_Extension => 1,
15344 Fate => $INTERNAL_ONLY);
15345 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15346 $patws->initialize($off_patws->table('Y'));
15349 $patws->initialize([ ord("\t"),
15351 utf8::unicode_to_native(0x0B), # VT
15355 utf8::unicode_to_native(0x85), # NEL
15356 0x200E..0x200F, # Left, Right marks
15357 0x2028..0x2029 # Line, Paragraph seps
15361 # See L<perlfunc/quotemeta>
15362 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15363 Perl_Extension => 1,
15364 Fate => $INTERNAL_ONLY,
15366 # Initialize to what's common in
15367 # all Unicode releases.
15369 $gc->table('Control')
15372 + ((~ $Word) & $ASCII)
15375 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15376 $quotemeta += $patsyn->table('Y');
15379 $quotemeta += ((~ $Word) & Range->new(0, 255))
15380 - utf8::unicode_to_native(0xA8)
15381 - utf8::unicode_to_native(0xAF)
15382 - utf8::unicode_to_native(0xB2)
15383 - utf8::unicode_to_native(0xB3)
15384 - utf8::unicode_to_native(0xB4)
15385 - utf8::unicode_to_native(0xB7)
15386 - utf8::unicode_to_native(0xB8)
15387 - utf8::unicode_to_native(0xB9)
15388 - utf8::unicode_to_native(0xBC)
15389 - utf8::unicode_to_native(0xBD)
15390 - utf8::unicode_to_native(0xBE);
15391 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15392 # same in all releases
15409 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15410 $quotemeta += $di->table('Y')
15413 if ($v_version ge v2.0) {
15414 $quotemeta += $gc->table('Cf')
15415 + $gc->table('Cs');
15417 # These are above the Unicode version 1 max
15418 $quotemeta->add_range(0xE0000, 0xE0FFF);
15420 $quotemeta += $gc->table('Cc')
15422 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15427 $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15428 $quotemeta += $temp;
15435 # Finished creating all the perl properties. All non-internal non-string
15436 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
15437 # an underscore.) These do not get a separate entry in the pod file
15438 foreach my $table ($perl->tables) {
15439 foreach my $alias ($table->aliases) {
15440 next if $alias->name =~ /^_/;
15441 $table->add_alias('Is_' . $alias->name,
15444 Status => $alias->status,
15445 OK_as_Filename => 0);
15449 # Perl tailors the WordBreak property so that \b{wb} doesn't split
15450 # adjacent spaces into separate words. First create a copy of the regular
15451 # WB property as '_Perl_WB'. (On Unicode releases earlier than when WB
15452 # was defined for, this will already have been done by the substitute file
15453 # portion for 'Input_file' code for WB.)
15454 my $perl_wb = property_ref('_Perl_WB');
15455 if (! defined $perl_wb) {
15456 $perl_wb = Property->new('_Perl_WB',
15457 Fate => $INTERNAL_ONLY,
15458 Perl_Extension => 1,
15459 Directory => $map_directory,
15461 my $wb = property_ref('Word_Break');
15462 $perl_wb->initialize($wb);
15463 $perl_wb->set_default_map($wb->default_map);
15466 # And simply replace the mappings of horizontal space characters that
15467 # otherwise would map to the default to instead map to our tailoring.
15468 my $default = $perl_wb->default_map;
15469 for my $range ($Blank->ranges) {
15470 for my $i ($range->start .. $range->end) {
15471 next unless $perl_wb->value_of($i) eq $default;
15472 $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15473 Replace => $UNCONDITIONALLY);
15477 # Create a version of the LineBreak property with the mappings that are
15478 # omitted in the default algorithm remapped to what
15479 # http://www.unicode.org/reports/tr14 says they should be.
15481 # Original Resolved General_Category
15482 # AI, SG, XX AL Any
15483 # SA CM Only Mn or Mc
15484 # SA AL Any except Mn and Mc
15487 # All property values are also written out in their long form, as
15488 # regen/mk_invlist.pl expects that. This also fixes occurrences of the
15489 # typo in early Unicode versions: 'inseperable'.
15490 my $perl_lb = property_ref('_Perl_LB');
15491 if (! defined $perl_lb) {
15492 $perl_lb = Property->new('_Perl_LB',
15493 Fate => $INTERNAL_ONLY,
15494 Perl_Extension => 1,
15495 Directory => $map_directory,
15497 my $lb = property_ref('Line_Break');
15499 # Populate from $lb, but use full name and fix typo.
15500 foreach my $range ($lb->ranges) {
15501 my $full_name = $lb->table($range->value)->full_name;
15502 $full_name = 'Inseparable'
15503 if standardize($full_name) eq 'inseperable';
15504 $perl_lb->add_map($range->start, $range->end, $full_name);
15508 $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL
15510 for my $range ($perl_lb->ranges) {
15511 my $value = standardize($range->value);
15512 if ( $value eq standardize('Unknown')
15513 || $value eq standardize('Ambiguous')
15514 || $value eq standardize('Surrogate'))
15516 $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15517 Replace => $UNCONDITIONALLY);
15519 elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15520 $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15521 Replace => $UNCONDITIONALLY);
15523 elsif ($value eq standardize('Complex_Context')) {
15524 for my $i ($range->start .. $range->end) {
15525 my $gc_val = $gc->value_of($i);
15526 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15527 $perl_lb->add_map($i, $i, 'Combining_Mark',
15528 Replace => $UNCONDITIONALLY);
15531 $perl_lb->add_map($i, $i, 'Alphabetic',
15532 Replace => $UNCONDITIONALLY);
15538 # This property is a modification of the scx property
15539 my $perl_scx = Property->new('_Perl_SCX',
15540 Fate => $INTERNAL_ONLY,
15541 Perl_Extension => 1,
15542 Directory => $map_directory,
15546 # Use scx if available; otherwise sc; if neither is there (a very old
15547 # Unicode version, just say that everything is 'Common'
15548 if (defined $scx) {
15550 $perl_scx->set_default_map('Unknown');
15552 elsif (defined $script) {
15555 # Early versions of 'sc', had everything be 'Common'
15556 if (defined $script->table('Unknown')) {
15557 $perl_scx->set_default_map('Unknown');
15560 $perl_scx->set_default_map('Common');
15563 $perl_scx->add_match_table('Common');
15564 $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15566 $perl_scx->add_match_table('Unknown');
15567 $perl_scx->set_default_map('Unknown');
15570 $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15571 $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15573 if (defined $source) {
15574 $perl_scx->initialize($source);
15576 # UTS 39 says that the scx property should be modified for these
15577 # countries where certain mixed scripts are commonly used.
15578 for my $range ($perl_scx->ranges) {
15579 my $value = $range->value;
15580 my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15581 $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15582 $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15583 $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15584 {$1 Katakana Hiragana Jpan}xi;
15585 $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15586 $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15589 $value = join " ", uniques split " ", $value;
15590 $range->set_value($value)
15594 foreach my $table ($source->tables) {
15595 my $scx_table = $perl_scx->add_match_table($table->name,
15596 Full_Name => $table->full_name);
15597 foreach my $alias ($table->aliases) {
15598 $scx_table->add_alias($alias->name);
15603 # Here done with all the basic stuff. Ready to populate the information
15604 # about each character if annotating them.
15607 # See comments at its declaration
15608 $annotate_ranges = Range_Map->new;
15610 # This separates out the non-characters from the other unassigneds, so
15611 # can give different annotations for each.
15612 $unassigned_sans_noncharacters = Range_List->new(
15613 Initialize => $gc->table('Unassigned'));
15614 $unassigned_sans_noncharacters &= (~ $NChar);
15616 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15617 $i = populate_char_info($i); # Note sets $i so may cause skips
15625 sub add_perl_synonyms() {
15626 # A number of Unicode tables have Perl synonyms that are expressed in
15627 # the single-form, \p{name}. These are:
15628 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15629 # \p{Is_Name} as synonyms
15630 # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15631 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15632 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15633 # conflict, \p{Value} and \p{Is_Value} as well
15635 # This routine generates these synonyms, warning of any unexpected
15638 # Construct the list of tables to get synonyms for. Start with all the
15639 # binary and the General_Category ones.
15640 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15642 push @tables, $gc->tables;
15644 # If the version of Unicode includes the Script Extensions (preferably),
15645 # or Script property, add its tables
15646 if (defined $scx) {
15647 push @tables, $scx->tables;
15650 push @tables, $script->tables if defined $script;
15653 # The Block tables are kept separate because they are treated differently.
15654 # And the earliest versions of Unicode didn't include them, so add only if
15657 push @blocks, $block->tables if defined $block;
15659 # Here, have the lists of tables constructed. Process blocks last so that
15660 # if there are name collisions with them, blocks have lowest priority.
15661 # Should there ever be other collisions, manual intervention would be
15662 # required. See the comments at the beginning of the program for a
15663 # possible way to handle those semi-automatically.
15664 foreach my $table (@tables, @blocks) {
15666 # For non-binary properties, the synonym is just the name of the
15667 # table, like Greek, but for binary properties the synonym is the name
15668 # of the property, and means the code points in its 'Y' table.
15669 my $nominal = $table;
15670 my $nominal_property = $nominal->property;
15672 if (! $nominal->isa('Property')) {
15677 # Here is a binary property. Use the 'Y' table. Verify that is
15679 my $yes = $nominal->table('Y');
15680 unless (defined $yes) { # Must be defined, but is permissible to
15682 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
15688 foreach my $alias ($nominal->aliases) {
15690 # Attempt to create a table in the perl directory for the
15691 # candidate table, using whatever aliases in it that don't
15692 # conflict. Also add non-conflicting aliases for all these
15693 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15695 foreach my $prefix ("", 'Is_', 'In_') {
15697 # Only Block properties can have added 'In_' aliases.
15698 next if $prefix eq 'In_' and $nominal_property != $block;
15700 my $proposed_name = $prefix . $alias->name;
15702 # No Is_Is, In_In, nor combinations thereof
15703 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15704 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15706 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15708 # Get a reference to any existing table in the perl
15709 # directory with the desired name.
15710 my $pre_existing = $perl->table($proposed_name);
15712 if (! defined $pre_existing) {
15714 # No name collision, so OK to add the perl synonym.
15716 my $make_re_pod_entry;
15717 my $ok_as_filename;
15718 my $status = $alias->status;
15719 if ($nominal_property == $block) {
15721 # For block properties, only the compound form is
15722 # preferred for external use; the others are
15723 # discouraged. The pod file contains wild cards for
15724 # the 'In' and 'Is' forms so no entries for those; and
15725 # we don't want people using the name without any
15726 # prefix, so discourage that.
15727 if ($prefix eq "") {
15728 $make_re_pod_entry = 1;
15729 $status = $status || $DISCOURAGED;
15730 $ok_as_filename = 0;
15732 elsif ($prefix eq 'In_') {
15733 $make_re_pod_entry = 0;
15734 $status = $status || $DISCOURAGED;
15735 $ok_as_filename = 1;
15738 $make_re_pod_entry = 0;
15739 $status = $status || $DISCOURAGED;
15740 $ok_as_filename = 0;
15743 elsif ($prefix ne "") {
15745 # The 'Is' prefix is handled in the pod by a wild
15746 # card, and we won't use it for an external name
15747 $make_re_pod_entry = 0;
15748 $status = $status || $NORMAL;
15749 $ok_as_filename = 0;
15753 # Here, is an empty prefix, non block. This gets its
15754 # own pod entry and can be used for an external name.
15755 $make_re_pod_entry = 1;
15756 $status = $status || $NORMAL;
15757 $ok_as_filename = 1;
15760 # Here, there isn't a perl pre-existing table with the
15761 # name. Look through the list of equivalents of this
15762 # table to see if one is a perl table.
15763 foreach my $equivalent ($actual->leader->equivalents) {
15764 next if $equivalent->property != $perl;
15766 # Here, have found a table for $perl. Add this alias
15767 # to it, and are done with this prefix.
15768 $equivalent->add_alias($proposed_name,
15769 Re_Pod_Entry => $make_re_pod_entry,
15771 # Currently don't output these in the
15772 # ucd pod, as are strongly discouraged
15777 OK_as_Filename => $ok_as_filename);
15778 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15782 # Here, $perl doesn't already have a table that is a
15783 # synonym for this property, add one.
15784 my $added_table = $perl->add_match_table($proposed_name,
15785 Re_Pod_Entry => $make_re_pod_entry,
15787 # See UCD comment just above
15791 OK_as_Filename => $ok_as_filename);
15792 # And it will be related to the actual table, since it is
15794 $added_table->set_equivalent_to($actual, Related => 1);
15795 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15797 } # End of no pre-existing.
15799 # Here, there is a pre-existing table that has the proposed
15800 # name. We could be in trouble, but not if this is just a
15801 # synonym for another table that we have already made a child
15802 # of the pre-existing one.
15803 if ($pre_existing->is_set_equivalent_to($actual)) {
15804 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15805 $pre_existing->add_alias($proposed_name);
15809 # Here, there is a name collision, but it still could be OK if
15810 # the tables match the identical set of code points, in which
15811 # case, we can combine the names. Compare each table's code
15812 # point list to see if they are identical.
15813 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15814 if ($pre_existing->matches_identically_to($actual)) {
15816 # Here, they do match identically. Not a real conflict.
15817 # Make the perl version a child of the Unicode one, except
15818 # in the non-obvious case of where the perl name is
15819 # already a synonym of another Unicode property. (This is
15820 # excluded by the test for it being its own parent.) The
15821 # reason for this exclusion is that then the two Unicode
15822 # properties become related; and we don't really know if
15823 # they are or not. We generate documentation based on
15824 # relatedness, and this would be misleading. Code
15825 # later executed in the process will cause the tables to
15826 # be represented by a single file anyway, without making
15827 # it look in the pod like they are necessarily related.
15828 if ($pre_existing->parent == $pre_existing
15829 && ($pre_existing->property == $perl
15830 || $actual->property == $perl))
15832 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15833 $pre_existing->set_equivalent_to($actual, Related => 1);
15835 elsif (main::DEBUG && $to_trace) {
15836 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15837 trace $pre_existing->parent;
15842 # Here they didn't match identically, there is a real conflict
15843 # between our new name and a pre-existing property.
15844 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15845 $pre_existing->add_conflicting($nominal->full_name,
15849 # Don't output a warning for aliases for the block
15850 # properties (unless they start with 'In_') as it is
15851 # expected that there will be conflicts and the block
15853 if ($verbosity >= $NORMAL_VERBOSITY
15854 && ($actual->property != $block || $prefix eq 'In_'))
15856 print simple_fold(join_lines(<<END
15857 There is already an alias named $proposed_name (from $pre_existing),
15858 so not creating this alias for $actual
15863 # Keep track for documentation purposes.
15864 $has_In_conflicts++ if $prefix eq 'In_';
15865 $has_Is_conflicts++ if $prefix eq 'Is_';
15870 # There are some properties which have No and Yes (and N and Y) as
15871 # property values, but aren't binary, and could possibly be confused with
15872 # binary ones. So create caveats for them. There are tables that are
15873 # named 'No', and tables that are named 'N', but confusion is not likely
15874 # unless they are the same table. For example, N meaning Number or
15875 # Neutral is not likely to cause confusion, so don't add caveats to things
15877 foreach my $property (grep { $_->type != $BINARY
15878 && $_->type != $FORCED_BINARY }
15881 my $yes = $property->table('Yes');
15882 if (defined $yes) {
15883 my $y = $property->table('Y');
15884 if (defined $y && $yes == $y) {
15885 foreach my $alias ($property->aliases) {
15886 $yes->add_conflicting($alias->name);
15890 my $no = $property->table('No');
15892 my $n = $property->table('N');
15893 if (defined $n && $no == $n) {
15894 foreach my $alias ($property->aliases) {
15895 $no->add_conflicting($alias->name, 'P');
15904 sub register_file_for_name($$$) {
15905 # Given info about a table and a datafile that it should be associated
15906 # with, register that association
15909 my $directory_ref = shift; # Array of the directory path for the file
15910 my $file = shift; # The file name in the final directory.
15911 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
15913 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15915 if ($table->isa('Property')) {
15916 $table->set_file_path(@$directory_ref, $file);
15917 push @map_properties, $table;
15919 # No swash means don't do the rest of this.
15920 return if $table->fate != $ORDINARY
15921 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15923 # Get the path to the file
15924 my @path = $table->file_path;
15926 # Use just the file name if no subdirectory.
15927 shift @path if $path[0] eq File::Spec->curdir();
15929 my $file = join '/', @path;
15931 # Create a hash entry for utf8_heavy to get the file that stores this
15932 # property's map table
15933 foreach my $alias ($table->aliases) {
15934 my $name = $alias->name;
15935 if ($name =~ /^_/) {
15936 $strict_property_to_file_of{lc $name} = $file;
15939 $loose_property_to_file_of{standardize($name)} = $file;
15943 # And a way for utf8_heavy to find the proper key in the SwashInfo
15944 # hash for this property.
15945 $file_to_swash_name{$file} = "To" . $table->swash_name;
15949 # Do all of the work for all equivalent tables when called with the leader
15950 # table, so skip if isn't the leader.
15951 return if $table->leader != $table;
15953 # If this is a complement of another file, use that other file instead,
15954 # with a ! prepended to it.
15956 if (($complement = $table->complement) != 0) {
15957 my @directories = $complement->file_path;
15959 # This assumes that the 0th element is something like 'lib',
15960 # the 1th element the property name (in its own directory), like
15961 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15962 # appended to it later.
15963 $directories[1] =~ s/^/!/;
15964 $file = pop @directories;
15965 $directory_ref =\@directories;
15968 # Join all the file path components together, using slashes.
15969 my $full_filename = join('/', @$directory_ref, $file);
15971 # All go in the same subdirectory of unicore, or the special
15972 # pseudo-directory '#'
15973 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15974 Carp::my_carp("Unexpected directory in "
15975 . join('/', @{$directory_ref}, $file));
15978 # For this table and all its equivalents ...
15979 foreach my $table ($table, $table->equivalents) {
15981 # Associate it with its file internally. Don't include the
15982 # $matches_directory first component
15983 $table->set_file_path(@$directory_ref, $file);
15985 # No swash means don't do the rest of this.
15986 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15988 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15990 my $property = $table->property;
15991 my $property_name = ($property == $perl)
15992 ? "" # 'perl' is never explicitly stated
15993 : standardize($property->name) . '=';
15995 my $is_default = 0; # Is this table the default one for the property?
15997 # To calculate $is_default, we find if this table is the same as the
15998 # default one for the property. But this is complicated by the
15999 # possibility that there is a master table for this one, and the
16000 # information is stored there instead of here.
16001 my $parent = $table->parent;
16002 my $leader_prop = $parent->property;
16003 my $default_map = $leader_prop->default_map;
16004 if (defined $default_map) {
16005 my $default_table = $leader_prop->table($default_map);
16006 $is_default = 1 if defined $default_table && $parent == $default_table;
16009 # Calculate the loose name for this table. Mostly it's just its name,
16010 # standardized. But in the case of Perl tables that are single-form
16011 # equivalents to Unicode properties, it is the latter's name.
16012 my $loose_table_name =
16013 ($property != $perl || $leader_prop == $perl)
16014 ? standardize($table->name)
16015 : standardize($parent->name);
16017 my $deprecated = ($table->status eq $DEPRECATED)
16018 ? $table->status_info
16020 my $caseless_equivalent = $table->caseless_equivalent;
16022 # And for each of the table's aliases... This inner loop eventually
16023 # goes through all aliases in the UCD that we generate regex match
16025 foreach my $alias ($table->aliases) {
16026 my $standard = utf8_heavy_name($table, $alias);
16028 # Generate an entry in either the loose or strict hashes, which
16029 # will translate the property and alias names combination into the
16030 # file where the table for them is stored.
16031 if ($alias->loose_match) {
16032 if (exists $loose_to_file_of{$standard}) {
16033 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
16036 $loose_to_file_of{$standard} = $sub_filename;
16040 if (exists $stricter_to_file_of{$standard}) {
16041 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
16044 $stricter_to_file_of{$standard} = $sub_filename;
16046 # Tightly coupled with how utf8_heavy.pl works, for a
16047 # floating point number that is a whole number, get rid of
16048 # the trailing decimal point and 0's, so that utf8_heavy
16049 # will work. Also note that this assumes that such a
16050 # number is matched strictly; so if that were to change,
16051 # this would be wrong.
16052 if ((my $integer_name = $alias->name)
16053 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
16055 $stricter_to_file_of{$property_name . $integer_name}
16061 # For Unicode::UCD, create a mapping of the prop=value to the
16062 # canonical =value for that property.
16063 if ($standard =~ /=/) {
16065 # This could happen if a strict name mapped into an existing
16066 # loose name. In that event, the strict names would have to
16067 # be moved to a new hash.
16068 if (exists($loose_to_standard_value{$standard})) {
16069 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
16071 $loose_to_standard_value{$standard} = $loose_table_name;
16074 # Keep a list of the deprecated properties and their filenames
16075 if ($deprecated && $complement == 0) {
16076 $utf8::why_deprecated{$sub_filename} = $deprecated;
16079 # And a substitute table, if any, for case-insensitive matching
16080 if ($caseless_equivalent != 0) {
16081 $caseless_equivalent_to{$standard} = $caseless_equivalent;
16084 # Add to defaults list if the table this alias belongs to is the
16086 $loose_defaults{$standard} = 1 if $is_default;
16094 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
16096 my %full_dir_name_of; # Full length names of directories used.
16098 sub construct_filename($$$) {
16099 # Return a file name for a table, based on the table name, but perhaps
16100 # changed to get rid of non-portable characters in it, and to make
16101 # sure that it is unique on a file system that allows the names before
16102 # any period to be at most 8 characters (DOS). While we're at it
16103 # check and complain if there are any directory conflicts.
16105 my $name = shift; # The name to start with
16106 my $mutable = shift; # Boolean: can it be changed? If no, but
16107 # yet it must be to work properly, a warning
16109 my $directories_ref = shift; # A reference to an array containing the
16110 # path to the file, with each element one path
16111 # component. This is used because the same
16112 # name can be used in different directories.
16113 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16115 my $warn = ! defined wantarray; # If true, then if the name is
16116 # changed, a warning is issued as well.
16118 if (! defined $name) {
16119 Carp::my_carp("Undefined name in directory "
16120 . File::Spec->join(@$directories_ref)
16125 # Make sure that no directory names conflict with each other. Look at
16126 # each directory in the input file's path. If it is already in use,
16127 # assume it is correct, and is merely being re-used, but if we
16128 # truncate it to 8 characters, and find that there are two directories
16129 # that are the same for the first 8 characters, but differ after that,
16130 # then that is a problem.
16131 foreach my $directory (@$directories_ref) {
16132 my $short_dir = substr($directory, 0, 8);
16133 if (defined $full_dir_name_of{$short_dir}) {
16134 next if $full_dir_name_of{$short_dir} eq $directory;
16135 Carp::my_carp("$directory conflicts with $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
16138 $full_dir_name_of{$short_dir} = $directory;
16142 my $path = join '/', @$directories_ref;
16143 $path .= '/' if $path;
16145 # Remove interior underscores.
16146 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
16148 # Convert the dot in floating point numbers to an underscore
16149 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
16153 # Extract any suffix, delete any non-word character, and truncate to 3
16155 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
16158 $suffix =~ s/\W+//g;
16159 substr($suffix, 4) = "" if length($suffix) > 4;
16162 # Change any non-word character outside the suffix into an underscore,
16163 # and truncate to 8.
16164 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
16165 substr($filename, 8) = "" if length($filename) > 8;
16167 # Make sure the basename doesn't conflict with something we
16168 # might have already written. If we have, say,
16175 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
16176 $num++; # so basenames with numbers start with '2', which
16177 # just looks more natural.
16179 # Want to append $num, but if it'll make the basename longer
16180 # than 8 characters, pre-truncate $filename so that the result
16182 my $delta = length($filename) + length($num) - 8;
16184 substr($filename, -$delta) = $num;
16189 if ($warn && ! $warned) {
16191 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
16195 return $filename if $mutable;
16197 # If not changeable, must return the input name, but warn if needed to
16198 # change it beyond shortening it.
16199 if ($name ne $filename
16200 && substr($name, 0, length($filename)) ne $filename) {
16201 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
16207 # The pod file contains a very large table. Many of the lines in that table
16208 # would exceed a typical output window's size, and so need to be wrapped with
16209 # a hanging indent to make them look good. The pod language is really
16210 # insufficient here. There is no general construct to do that in pod, so it
16211 # is done here by beginning each such line with a space to cause the result to
16212 # be output without formatting, and doing all the formatting here. This leads
16213 # to the result that if the eventual display window is too narrow it won't
16214 # look good, and if the window is too wide, no advantage is taken of that
16215 # extra width. A further complication is that the output may be indented by
16216 # the formatter so that there is less space than expected. What I (khw) have
16217 # done is to assume that that indent is a particular number of spaces based on
16218 # what it is in my Linux system; people can always resize their windows if
16219 # necessary, but this is obviously less than desirable, but the best that can
16221 my $automatic_pod_indent = 8;
16223 # Try to format so that uses fewest lines, but few long left column entries
16224 # slide into the right column. An experiment on 5.1 data yielded the
16225 # following percentages that didn't cut into the other side along with the
16226 # associated first-column widths
16228 # 80% not too bad except for a few blocks
16229 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
16231 my $indent_info_column = 27; # 75% of lines didn't have overlap
16233 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
16234 # The 3 is because of:
16235 # 1 for the leading space to tell the pod formatter to
16238 # 1 for the space between the flag and the main data
16240 sub format_pod_line ($$$;$$) {
16241 # Take a pod line and return it, formatted properly
16243 my $first_column_width = shift;
16244 my $entry = shift; # Contents of left column
16245 my $info = shift; # Contents of right column
16247 my $status = shift || ""; # Any flag
16249 my $loose_match = shift; # Boolean.
16250 $loose_match = 1 unless defined $loose_match;
16252 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16255 $flags .= $STRICTER if ! $loose_match;
16257 $flags .= $status if $status;
16259 # There is a blank in the left column to cause the pod formatter to
16260 # output the line as-is.
16261 return sprintf " %-*s%-*s %s\n",
16262 # The first * in the format is replaced by this, the -1 is
16263 # to account for the leading blank. There isn't a
16264 # hard-coded blank after this to separate the flags from
16265 # the rest of the line, so that in the unlikely event that
16266 # multiple flags are shown on the same line, they both
16267 # will get displayed at the expense of that separation,
16268 # but since they are left justified, a blank will be
16269 # inserted in the normal case.
16273 # The other * in the format is replaced by this number to
16274 # cause the first main column to right fill with blanks.
16275 # The -1 is for the guaranteed blank following it.
16276 $first_column_width - $FILLER - 1,
16281 my @zero_match_tables; # List of tables that have no matches in this release
16283 sub make_re_pod_entries($) {
16284 # This generates the entries for the pod file for a given table.
16285 # Also done at this time are any children tables. The output looks like:
16286 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
16288 my $input_table = shift; # Table the entry is for
16289 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
16291 # Generate parent and all its children at the same time.
16292 return if $input_table->parent != $input_table;
16294 my $property = $input_table->property;
16295 my $type = $property->type;
16296 my $full_name = $property->full_name;
16298 my $count = $input_table->count;
16300 my $non_unicode_string;
16301 if ($count > $MAX_UNICODE_CODEPOINTS) {
16302 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16303 - $MAX_UNICODE_CODEPOINT);
16304 $non_unicode_string = " plus all above-Unicode code points";
16307 $unicode_count = $count;
16308 $non_unicode_string = "";
16311 my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16313 my $definition = $input_table->calculate_table_definition;
16316 # Save the definition for later use.
16317 $input_table->set_definition($definition);
16319 $definition = ": $definition";
16322 my $status = $input_table->status;
16323 my $status_info = $input_table->status_info;
16324 my $caseless_equivalent = $input_table->caseless_equivalent;
16326 # Don't mention a placeholder equivalent as it isn't to be listed in the
16328 $caseless_equivalent = 0 if $caseless_equivalent != 0
16329 && $caseless_equivalent->fate > $ORDINARY;
16331 my $entry_for_first_table; # The entry for the first table output.
16332 # Almost certainly, it is the parent.
16334 # For each related table (including itself), we will generate a pod entry
16335 # for each name each table goes by
16336 foreach my $table ($input_table, $input_table->children) {
16338 # utf8_heavy.pl cannot deal with null string property values, so skip
16339 # any tables that have no non-null names.
16340 next if ! grep { $_->name ne "" } $table->aliases;
16342 # First, gather all the info that applies to this table as a whole.
16344 push @zero_match_tables, $table if $count == 0
16345 # Don't mention special tables
16346 # as being zero length
16347 && $table->fate == $ORDINARY;
16349 my $table_property = $table->property;
16351 # The short name has all the underscores removed, while the full name
16352 # retains them. Later, we decide whether to output a short synonym
16353 # for the full one, we need to compare apples to apples, so we use the
16354 # short name's length including underscores.
16355 my $table_property_short_name_length;
16356 my $table_property_short_name
16357 = $table_property->short_name(\$table_property_short_name_length);
16358 my $table_property_full_name = $table_property->full_name;
16360 # Get how much savings there is in the short name over the full one
16361 # (delta will always be <= 0)
16362 my $table_property_short_delta = $table_property_short_name_length
16363 - length($table_property_full_name);
16364 my @table_description = $table->description;
16365 my @table_note = $table->note;
16367 # Generate an entry for each alias in this table.
16368 my $entry_for_first_alias; # saves the first one encountered.
16369 foreach my $alias ($table->aliases) {
16371 # Skip if not to go in pod.
16372 next unless $alias->make_re_pod_entry;
16374 # Start gathering all the components for the entry
16375 my $name = $alias->name;
16377 # Skip if name is empty, as can't be accessed by regexes.
16378 next if $name eq "";
16380 my $entry; # Holds the left column, may include extras
16381 my $entry_ref; # To refer to the left column's contents from
16382 # another entry; has no extras
16384 # First the left column of the pod entry. Tables for the $perl
16385 # property always use the single form.
16386 if ($table_property == $perl) {
16387 $entry = "\\p{$name}";
16388 $entry .= " \\p$name" if length $name == 1; # Show non-braced
16390 $entry_ref = "\\p{$name}";
16392 else { # Compound form.
16394 # Only generate one entry for all the aliases that mean true
16395 # or false in binary properties. Append a '*' to indicate
16396 # some are missing. (The heading comment notes this.)
16398 if ($type == $BINARY) {
16399 next if $name ne 'N' && $name ne 'Y';
16402 elsif ($type != $FORCED_BINARY) {
16407 # Forced binary properties require special handling. It
16408 # has two sets of tables, one set is true/false; and the
16409 # other set is everything else. Entries are generated for
16410 # each set. Use the Bidi_Mirrored property (which appears
16411 # in all Unicode versions) to get a list of the aliases
16412 # for the true/false tables. Of these, only output the N
16413 # and Y ones, the same as, a regular binary property. And
16414 # output all the rest, same as a non-binary property.
16415 my $bm = property_ref("Bidi_Mirrored");
16416 if ($name eq 'N' || $name eq 'Y') {
16418 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16419 $bm->table("N")->aliases)
16428 # Colon-space is used to give a little more space to be easier
16431 . $table_property_full_name
16434 # But for the reference to this entry, which will go in the
16435 # right column, where space is at a premium, use equals
16437 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16440 # Then the right (info) column. This is stored as components of
16441 # an array for the moment, then joined into a string later. For
16442 # non-internal only properties, begin the info with the entry for
16443 # the first table we encountered (if any), as things are ordered
16444 # so that that one is the most descriptive. This leads to the
16445 # info column of an entry being a more descriptive version of the
16448 if ($name =~ /^_/) {
16450 '(For internal use by Perl, not necessarily stable)';
16452 elsif ($entry_for_first_alias) {
16453 push @info, $entry_for_first_alias;
16456 # If this entry is equivalent to another, add that to the info,
16457 # using the first such table we encountered
16458 if ($entry_for_first_table) {
16460 push @info, "(= $entry_for_first_table)";
16463 push @info, $entry_for_first_table;
16467 # If the name is a large integer, add an equivalent with an
16468 # exponent for better readability
16469 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16470 push @info, sprintf "(= %.1e)", $name
16473 my $parenthesized = "";
16474 if (! $entry_for_first_alias) {
16476 # This is the first alias for the current table. The alias
16477 # array is ordered so that this is the fullest, most
16478 # descriptive alias, so it gets the fullest info. The other
16479 # aliases are mostly merely pointers to this one, using the
16480 # information already added above.
16482 # Display any status message, but only on the parent table
16483 if ($status && ! $entry_for_first_table) {
16484 push @info, $status_info;
16487 # Put out any descriptive info
16488 if (@table_description || @table_note) {
16489 push @info, join "; ", @table_description, @table_note;
16492 # Look to see if there is a shorter name we can point people
16494 my $standard_name = standardize($name);
16496 my $proposed_short = $table->short_name;
16497 if (defined $proposed_short) {
16498 my $standard_short = standardize($proposed_short);
16500 # If the short name is shorter than the standard one, or
16501 # even it it's not, but the combination of it and its
16502 # short property name (as in \p{prop=short} ($perl doesn't
16503 # have this form)) saves at least two characters, then,
16504 # cause it to be listed as a shorter synonym.
16505 if (length $standard_short < length $standard_name
16506 || ($table_property != $perl
16507 && (length($standard_short)
16508 - length($standard_name)
16509 + $table_property_short_delta) # (<= 0)
16512 $short_name = $proposed_short;
16513 if ($table_property != $perl) {
16514 $short_name = $table_property_short_name
16517 $short_name = "\\p{$short_name}";
16521 # And if this is a compound form name, see if there is a
16522 # single form equivalent
16524 if ($table_property != $perl && $table_property != $block) {
16526 # Special case the binary N tables, so that will print
16527 # \P{single}, but use the Y table values to populate
16528 # 'single', as we haven't likewise populated the N table.
16529 # For forced binary tables, we can't just look at the N
16530 # table, but must see if this table is equivalent to the N
16531 # one, as there are two equivalent beasts in these
16535 if ( ($type == $BINARY
16536 && $input_table == $property->table('No'))
16537 || ($type == $FORCED_BINARY
16538 && $property->table('No')->
16539 is_set_equivalent_to($input_table)))
16541 $test_table = $property->table('Yes');
16545 $test_table = $input_table;
16549 # Look for a single form amongst all the children.
16550 foreach my $table ($test_table->children) {
16551 next if $table->property != $perl;
16552 my $proposed_name = $table->short_name;
16553 next if ! defined $proposed_name;
16555 # Don't mention internal-only properties as a possible
16556 # single form synonym
16557 next if substr($proposed_name, 0, 1) eq '_';
16559 $proposed_name = "\\$p\{$proposed_name}";
16560 if (! defined $single_form
16561 || length($proposed_name) < length $single_form)
16563 $single_form = $proposed_name;
16565 # The goal here is to find a single form; not the
16566 # shortest possible one. We've already found a
16567 # short name. So, stop at the first single form
16568 # found, which is likely to be closer to the
16575 # Output both short and single in the same parenthesized
16576 # expression, but with only one of 'Single', 'Short' if there
16578 if ($short_name || $single_form || $table->conflicting) {
16579 $parenthesized .= "Short: $short_name" if $short_name;
16580 if ($short_name && $single_form) {
16581 $parenthesized .= ', ';
16583 elsif ($single_form) {
16584 $parenthesized .= 'Single: ';
16586 $parenthesized .= $single_form if $single_form;
16590 if ($caseless_equivalent != 0) {
16591 $parenthesized .= '; ' if $parenthesized ne "";
16592 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16596 # Warn if this property isn't the same as one that a
16597 # semi-casual user might expect. The other components of this
16598 # parenthesized structure are calculated only for the first entry
16599 # for this table, but the conflicting is deemed important enough
16600 # to go on every entry.
16601 my $conflicting = join " NOR ", $table->conflicting;
16602 if ($conflicting) {
16603 $parenthesized .= '; ' if $parenthesized ne "";
16604 $parenthesized .= "NOT $conflicting";
16607 push @info, "($parenthesized)" if $parenthesized;
16609 if ($name =~ /_$/ && $alias->loose_match) {
16610 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16613 if ($table_property != $perl && $table->perl_extension) {
16614 push @info, '(Perl extension)';
16616 my $definition = $table->definition // "";
16617 $definition = "" if $entry_for_first_alias;
16618 $definition = ": $definition" if $definition;
16619 push @info, "($string_count$definition)";
16621 # Now, we have both the entry and info so add them to the
16622 # list of all the properties.
16623 push @match_properties,
16624 format_pod_line($indent_info_column,
16628 $alias->loose_match);
16630 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16631 } # End of looping through the aliases for this table.
16633 if (! $entry_for_first_table) {
16634 $entry_for_first_table = $entry_for_first_alias;
16636 } # End of looping through all the related tables
16640 sub make_ucd_table_pod_entries {
16643 # Generate the entries for the UCD section of the pod for $table. This
16644 # also calculates if names are ambiguous, so has to be called even if the
16645 # pod is not being output
16647 my $short_name = $table->name;
16648 my $standard_short_name = standardize($short_name);
16649 my $full_name = $table->full_name;
16650 my $standard_full_name = standardize($full_name);
16652 my $full_info = ""; # Text of info column for full-name entries
16653 my $other_info = ""; # Text of info column for short-name entries
16654 my $short_info = ""; # Text of info column for other entries
16655 my $meaning = ""; # Synonym of this table
16657 my $property = ($table->isa('Property'))
16659 : $table->parent->property;
16661 my $perl_extension = $table->perl_extension;
16662 my $is_perl_extension_match_table_but_not_dollar_perl
16663 = $property != $perl
16665 && $property != $table;
16667 # Get the more official name for for perl extensions that aren't
16668 # stand-alone properties
16669 if ($is_perl_extension_match_table_but_not_dollar_perl) {
16670 if ($property->type == $BINARY) {
16671 $meaning = $property->full_name;
16674 $meaning = $table->parent->complete_name;
16678 # There are three types of info column. One for the short name, one for
16679 # the full name, and one for everything else. They mostly are the same,
16680 # so initialize in the same loop.
16682 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16683 if ($info_ref != \$full_info) {
16685 # The non-full name columns include the full name
16686 $$info_ref .= $full_name;
16690 if ($is_perl_extension_match_table_but_not_dollar_perl) {
16692 # Add the synonymous name for the non-full name entries; and to
16693 # the full-name entry if it adds extra information
16694 if ( standardize($meaning) ne $standard_full_name
16695 || $info_ref == \$other_info
16696 || $info_ref == \$short_info)
16698 my $parenthesized = $info_ref != \$full_info;
16699 $$info_ref .= " " if $$info_ref && $parenthesized;
16700 $$info_ref .= "(=" if $parenthesized;
16701 $$info_ref .= "$meaning";
16702 $$info_ref .= ")" if $parenthesized;
16707 # And the full-name entry includes the short name, if shorter
16708 if ($info_ref == \$full_info
16709 && length $standard_short_name < length $standard_full_name)
16711 $full_info =~ s/\.\Z//;
16712 $full_info .= " " if $full_info;
16713 $full_info .= "(Short: $short_name)";
16716 if ($table->perl_extension) {
16717 $$info_ref =~ s/\.\Z//;
16718 $$info_ref .= ". " if $$info_ref;
16719 $$info_ref .= "(Perl extension)";
16724 my $definition_table;
16725 my $type = $table->property->type;
16726 if ($type == $BINARY || $type == $FORCED_BINARY) {
16727 $definition_table = $table->property->table('Y');
16729 elsif ($table->isa('Match_Table')) {
16730 $definition_table = $table;
16733 $definition = $definition_table->calculate_table_definition
16734 if defined $definition_table
16735 && $definition_table != 0;
16737 # Add any extra annotations to the full name entry
16738 foreach my $more_info ($table->description,
16741 $table->status_info)
16743 next unless $more_info;
16744 $full_info =~ s/\.\Z//;
16745 $full_info .= ". " if $full_info;
16746 $full_info .= $more_info;
16748 if ($table->property->type == $FORCED_BINARY) {
16750 $full_info =~ s/\.\Z//;
16751 $full_info .= ". ";
16753 $full_info .= "This is a combination property which has both:"
16754 . " 1) a map to various string values; and"
16755 . " 2) a map to boolean Y/N, where 'Y' means the"
16756 . " string value is non-empty. Add the prefix 'is'"
16757 . " to the prop_invmap() call to get the latter";
16760 # These keep track if have created full and short name pod entries for the
16763 my $done_short = 0;
16765 # Every possible name is kept track of, even those that aren't going to be
16766 # output. This way we can be sure to find the ambiguities.
16767 foreach my $alias ($table->aliases) {
16768 my $name = $alias->name;
16769 my $standard = standardize($name);
16771 my $output_this = $alias->ucd;
16773 # If the full and short names are the same, we want to output the full
16774 # one's entry, so it has priority.
16775 if ($standard eq $standard_full_name) {
16776 next if $done_full;
16778 $info = $full_info;
16780 elsif ($standard eq $standard_short_name) {
16781 next if $done_short;
16783 next if $standard_short_name eq $standard_full_name;
16784 $info = $short_info;
16787 $info = $other_info;
16790 $combination_property{$standard} = 1
16791 if $table->property->type == $FORCED_BINARY;
16793 # Here, we have set up the two columns for this entry. But if an
16794 # entry already exists for this name, we have to decide which one
16795 # we're going to later output.
16796 if (exists $ucd_pod{$standard}) {
16798 # If the two entries refer to the same property, it's not going to
16799 # be ambiguous. (Likely it's because the names when standardized
16800 # are the same.) But that means if they are different properties,
16801 # there is ambiguity.
16802 if ($ucd_pod{$standard}->{'property'} != $property) {
16804 # Here, we have an ambiguity. This code assumes that one is
16805 # scheduled to be output and one not and that one is a perl
16806 # extension (which is not to be output) and the other isn't.
16807 # If those assumptions are wrong, things have to be rethought.
16808 if ($ucd_pod{$standard}{'output_this'} == $output_this
16809 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16810 || $output_this == $perl_extension)
16812 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
16815 # We modify the info column of the one being output to
16816 # indicate the ambiguity. Set $which to point to that one's
16819 if ($ucd_pod{$standard}{'output_this'}) {
16820 $which = \$ucd_pod{$standard}->{'info'};
16824 $meaning = $ucd_pod{$standard}{'meaning'};
16828 $$which =~ s/\.\Z//;
16829 $$which .= "; NOT '$standard' meaning '$meaning'";
16831 $ambiguous_names{$standard} = 1;
16834 # Use the non-perl-extension variant
16835 next unless $ucd_pod{$standard}{'perl_extension'};
16838 # Store enough information about this entry that we can later look for
16839 # ambiguities, and output it properly.
16840 $ucd_pod{$standard} = { 'name' => $name,
16842 'meaning' => $meaning,
16843 'output_this' => $output_this,
16844 'perl_extension' => $perl_extension,
16845 'property' => $property,
16846 'status' => $alias->status,
16848 } # End of looping through all this table's aliases
16853 sub pod_alphanumeric_sort {
16854 # Sort pod entries alphanumerically.
16856 # The first few character columns are filler, plus the '\p{'; and get rid
16857 # of all the trailing stuff, starting with the trailing '}', so as to sort
16858 # on just 'Name=Value'
16859 (my $a = lc $a) =~ s/^ .*? \{ //x;
16861 (my $b = lc $b) =~ s/^ .*? \{ //x;
16864 # Determine if the two operands are both internal only or both not.
16865 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16866 # should be the underscore that begins internal only
16867 my $a_is_internal = (substr($a, 0, 1) eq '_');
16868 my $b_is_internal = (substr($b, 0, 1) eq '_');
16870 # Sort so the internals come last in the table instead of first (which the
16871 # leading underscore would otherwise indicate).
16872 if ($a_is_internal != $b_is_internal) {
16873 return 1 if $a_is_internal;
16877 # Determine if the two operands are compound or not, and if so if are
16878 # "numeric" property values or not, like \p{Age: 3.0}. But there are also
16879 # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16880 # all of which this considers numeric, and for sorting, looks just at the
16881 # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}.
16883 ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16885 [:=] \s* # The syntax for the compound form
16886 (?: # followed by ...
16887 ( # $2 gets defined if what follows is a "numeric"
16888 # expression, which is ...
16889 ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational
16890 # number, optionally signed
16891 | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either
16892 # of these go into $3
16893 | ( V \d+ _ \d+ ) # or a Unicode's Age property version
16896 | .* $ # If not "numeric", accept anything so that $1 gets
16897 # defined if it is any compound form
16899 my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16900 my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16902 # Sort alphabeticlly on the whole property name if either operand isn't
16903 # compound, or they differ.
16904 return $a cmp $b if ! defined $a_initial
16905 || ! defined $b_initial
16906 || $a_initial ne $b_initial;
16908 if (! defined $a_numeric) {
16910 # If neither is numeric, use alpha sort
16911 return $a cmp $b if ! defined $b_numeric;
16912 return 1; # Sort numeric ahead of alpha
16915 # Here $a is numeric
16916 return -1 if ! defined $b_numeric; # Numeric sorts before alpha
16918 # Here they are both numeric in the same property.
16919 # Convert version numbers into regular numbers
16920 if (defined $a_version) {
16921 ($a_number = $a_version) =~ s/^V//i;
16922 $a_number =~ s/_/./;
16924 else { # Otherwise get rid of the, e.g., CCC in CCC9 */
16925 $a_number =~ s/ ^ [[:alpha:]]+ //x;
16927 if (defined $b_version) {
16928 ($b_number = $b_version) =~ s/^V//i;
16929 $b_number =~ s/_/./;
16932 $b_number =~ s/ ^ [[:alpha:]]+ //x;
16935 # Convert rationals to floating for the comparison.
16936 $a_number = eval $a_number if $a_number =~ qr{/};
16937 $b_number = eval $b_number if $b_number =~ qr{/};
16939 return $a_number <=> $b_number || $a cmp $b;
16943 # Create the .pod file. This generates the various subsections and then
16944 # combines them in one big HERE document.
16946 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16948 return unless defined $pod_directory;
16949 print "Making pod file\n" if $verbosity >= $PROGRESS;
16951 my $exception_message =
16952 '(Any exceptions are individually noted beginning with the word NOT.)';
16954 if (-e 'Blocks.txt') {
16956 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
16957 # if the global $has_In_conflicts indicates we have them.
16958 push @match_properties, format_pod_line($indent_info_column,
16961 . (($has_In_conflicts)
16962 ? " $exception_message"
16965 @block_warning = << "END";
16967 In particular, matches in the Block property have single forms
16968 defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
16969 all, Like all B<DISCOURAGED> forms, these are not stable. For example,
16970 C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
16971 C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may
16972 come along that would force Perl to change the meaning of one or more of
16973 these, and your program would no longer be correct. Currently there are no
16974 such conflicts with the form that begins C<"In_">, but there are many with the
16975 other two shortcuts, and Unicode continues to define new properties that begin
16976 with C<"In">, so it's quite possible that a conflict will occur in the future.
16977 The compound form is guaranteed to not become obsolete, and its meaning is
16978 clearer anyway. See L<perlunicode/"Blocks"> for more information about this.
16981 my $text = $Is_flags_text;
16982 $text = "$exception_message $text" if $has_Is_conflicts;
16984 # And the 'Is_ line';
16985 push @match_properties, format_pod_line($indent_info_column,
16989 # Sort the properties array for output. It is sorted alphabetically
16990 # except numerically for numeric properties, and only output unique lines.
16991 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16993 my $formatted_properties = simple_fold(\@match_properties,
16995 # indent succeeding lines by two extra
16996 # which looks better
16997 $indent_info_column + 2,
16999 # shorten the line length by how much
17000 # the formatter indents, so the folded
17001 # line will fit in the space
17002 # presumably available
17003 $automatic_pod_indent);
17004 # Add column headings, indented to be a little more centered, but not
17006 $formatted_properties = format_pod_line($indent_info_column,
17010 . $formatted_properties;
17012 # Generate pod documentation lines for the tables that match nothing
17013 my $zero_matches = "";
17014 if (@zero_match_tables) {
17015 @zero_match_tables = uniques(@zero_match_tables);
17016 $zero_matches = join "\n\n",
17017 map { $_ = '=item \p{' . $_->complete_name . "}" }
17018 sort { $a->complete_name cmp $b->complete_name }
17019 @zero_match_tables;
17021 $zero_matches = <<END;
17023 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
17025 Unicode has some property-value pairs that currently don't match anything.
17026 This happens generally either because they are obsolete, or they exist for
17027 symmetry with other forms, but no language has yet been encoded that uses
17028 them. In this version of Unicode, the following match zero code points:
17039 # Generate list of properties that we don't accept, grouped by the reasons
17040 # why. This is so only put out the 'why' once, and then list all the
17041 # properties that have that reason under it.
17043 my %why_list; # The keys are the reasons; the values are lists of
17044 # properties that have the key as their reason
17046 # For each property, add it to the list that are suppressed for its reason
17047 # The sort will cause the alphabetically first properties to be added to
17048 # each list first, so each list will be sorted.
17049 foreach my $property (sort keys %why_suppressed) {
17050 next unless $why_suppressed{$property};
17051 push @{$why_list{$why_suppressed{$property}}}, $property;
17054 # For each reason (sorted by the first property that has that reason)...
17055 my @bad_re_properties;
17056 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
17059 # Add to the output, all the properties that have that reason.
17060 my $has_item = 0; # Flag if actually output anything.
17061 foreach my $name (@{$why_list{$why}}) {
17063 # Split compound names into $property and $table components
17064 my $property = $name;
17066 if ($property =~ / (.*) = (.*) /x) {
17071 # This release of Unicode may not have a property that is
17072 # suppressed, so don't reference a non-existent one.
17073 $property = property_ref($property);
17074 next if ! defined $property;
17076 # And since this list is only for match tables, don't list the
17077 # ones that don't have match tables.
17078 next if ! $property->to_create_match_tables;
17080 # Find any abbreviation, and turn it into a compound name if this
17081 # is a property=value pair.
17082 my $short_name = $property->name;
17083 $short_name .= '=' . $property->table($table)->name if $table;
17085 # Start with an empty line.
17086 push @bad_re_properties, "\n\n" unless $has_item;
17088 # And add the property as an item for the reason.
17089 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
17093 # And add the reason under the list of properties, if such a list
17094 # actually got generated. Note that the header got added
17095 # unconditionally before. But pod ignores extra blank lines, so no
17097 push @bad_re_properties, "\n$why\n" if $has_item;
17099 } # End of looping through each reason.
17101 if (! @bad_re_properties) {
17102 push @bad_re_properties,
17103 "*** This installation accepts ALL non-Unihan properties ***";
17106 # Add =over only if non-empty to avoid an empty =over/=back section,
17107 # which is considered bad form.
17108 unshift @bad_re_properties, "\n=over 4\n";
17109 push @bad_re_properties, "\n=back\n";
17112 # Similarly, generate a list of files that we don't use, grouped by the
17113 # reasons why (Don't output if the reason is empty). First, create a hash
17114 # whose keys are the reasons, and whose values are anonymous arrays of all
17115 # the files that share that reason.
17116 my %grouped_by_reason;
17117 foreach my $file (keys %skipped_files) {
17118 next unless $skipped_files{$file};
17119 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
17122 # Then, sort each group.
17123 foreach my $group (keys %grouped_by_reason) {
17124 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
17125 @{$grouped_by_reason{$group}} ;
17128 # Finally, create the output text. For each reason (sorted by the
17129 # alphabetically first file that has that reason)...
17131 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
17132 cmp lc $grouped_by_reason{$b}->[0]
17134 keys %grouped_by_reason)
17136 # Add all the files that have that reason to the output. Start
17137 # with an empty line.
17138 push @unused_files, "\n\n";
17139 push @unused_files, map { "\n=item F<$_> \n" }
17140 @{$grouped_by_reason{$reason}};
17141 # And add the reason under the list of files
17142 push @unused_files, "\n$reason\n";
17145 # Similarly, create the output text for the UCD section of the pod
17147 foreach my $key (keys %ucd_pod) {
17148 next unless $ucd_pod{$key}->{'output_this'};
17149 push @ucd_pod, format_pod_line($indent_info_column,
17150 $ucd_pod{$key}->{'name'},
17151 $ucd_pod{$key}->{'info'},
17152 $ucd_pod{$key}->{'status'},
17156 # Sort alphabetically, and fold for output
17157 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
17158 my $ucd_pod = simple_fold(\@ucd_pod,
17160 $indent_info_column,
17161 $automatic_pod_indent);
17162 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
17165 my $space_hex = sprintf("%02x", ord " ");
17168 # Everything is ready to assemble.
17169 my @OUT = << "END";
17174 To change this file, edit $0 instead.
17180 $pod_file - Index of Unicode Version $unicode_version character properties in Perl
17184 This document provides information about the portion of the Unicode database
17185 that deals with character properties, that is the portion that is defined on
17186 single code points. (L</Other information in the Unicode data base>
17187 below briefly mentions other data that Unicode provides.)
17189 Perl can provide access to all non-provisional Unicode character properties,
17190 though not all are enabled by default. The omitted ones are the Unihan
17191 properties (accessible via the CPAN module L<Unicode::Unihan>) and certain
17192 deprecated or Unicode-internal properties. (An installation may choose to
17193 recompile Perl's tables to change this. See L<Unicode character
17194 properties that are NOT accepted by Perl>.)
17196 For most purposes, access to Unicode properties from the Perl core is through
17197 regular expression matches, as described in the next section.
17198 For some special purposes, and to access the properties that are not suitable
17199 for regular expression matching, all the Unicode character properties that
17200 Perl handles are accessible via the standard L<Unicode::UCD> module, as
17201 described in the section L</Properties accessible through Unicode::UCD>.
17203 Perl also provides some additional extensions and short-cut synonyms
17204 for Unicode properties.
17206 This document merely lists all available properties and does not attempt to
17207 explain what each property really means. There is a brief description of each
17208 Perl extension; see L<perlunicode/Other Properties> for more information on
17209 these. There is some detail about Blocks, Scripts, General_Category,
17210 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
17211 official Unicode properties, refer to the Unicode standard. A good starting
17212 place is L<$unicode_reference_url>.
17214 Note that you can define your own properties; see
17215 L<perlunicode/"User-Defined Character Properties">.
17217 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
17219 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
17220 most of the Unicode character properties. The table below shows all these
17221 constructs, both single and compound forms.
17223 B<Compound forms> consist of two components, separated by an equals sign or a
17224 colon. The first component is the property name, and the second component is
17225 the particular value of the property to match against, for example,
17226 C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
17227 to match characters whose Script_Extensions property value is Greek.
17228 (C<Script_Extensions> is an improved version of the C<Script> property.)
17230 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
17231 their equivalent compound forms. The table shows these equivalences. (In our
17232 example, C<\\p{Greek}> is a just a shortcut for
17233 C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single
17234 forms that are not shortcuts for a compound form. One such is C<\\p{Word}>.
17235 These are also listed in the table.
17237 In parsing these constructs, Perl always ignores Upper/lower case differences
17238 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
17239 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
17240 the left brace completely changes the meaning of the construct, from "match"
17241 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
17242 for improved legibility.
17244 Also, white space, hyphens, and underscores are normally ignored
17245 everywhere between the {braces}, and hence can be freely added or removed
17246 even if the C</x> modifier hasn't been specified on the regular expression.
17247 But in the table below $a_bold_stricter at the beginning of an entry
17248 means that tighter (stricter) rules are used for that entry:
17254 =item Single form (C<\\p{name}>) tighter rules:
17256 White space, hyphens, and underscores ARE significant
17261 =item * white space adjacent to a non-word character
17263 =item * underscores separating digits in numbers
17267 That means, for example, that you can freely add or remove white space
17268 adjacent to (but within) the braces without affecting the meaning.
17270 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17272 The tighter rules given above for the single form apply to everything to the
17273 right of the colon or equals; the looser rules still apply to everything to
17276 That means, for example, that you can freely add or remove white space
17277 adjacent to (but within) the braces and the colon or equal sign.
17283 Some properties are considered obsolete by Unicode, but still available.
17284 There are several varieties of obsolescence:
17292 A property may be stabilized. Such a determination does not indicate
17293 that the property should or should not be used; instead it is a declaration
17294 that the property will not be maintained nor extended for newly encoded
17295 characters. Such properties are marked with $a_bold_stabilized in the
17300 A property may be deprecated, perhaps because its original intent
17301 has been replaced by another property, or because its specification was
17302 somehow defective. This means that its use is strongly
17303 discouraged, so much so that a warning will be issued if used, unless the
17304 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17305 statement. $A_bold_deprecated flags each such entry in the table, and
17306 the entry there for the longest, most descriptive version of the property will
17307 give the reason it is deprecated, and perhaps advice. Perl may issue such a
17308 warning, even for properties that aren't officially deprecated by Unicode,
17309 when there used to be characters or code points that were matched by them, but
17310 no longer. This is to warn you that your program may not work like it did on
17311 earlier Unicode releases.
17313 A deprecated property may be made unavailable in a future Perl version, so it
17314 is best to move away from them.
17316 A deprecated property may also be stabilized, but this fact is not shown.
17320 Properties marked with $a_bold_obsolete in the table are considered (plain)
17321 obsolete. Generally this designation is given to properties that Unicode once
17322 used for internal purposes (but not any longer).
17326 This is not actually a Unicode-specified obsolescence, but applies to certain
17327 Perl extensions that are present for backwards compatibility, but are
17328 discouraged from being used. These are not obsolete, but their meanings are
17329 not stable. Future Unicode versions could force any of these extensions to be
17330 removed without warning, replaced by another property with the same name that
17331 means something different. $A_bold_discouraged flags each such entry in the
17332 table. Use the equivalent shown instead.
17340 The table below has two columns. The left column contains the C<\\p{}>
17341 constructs to look up, possibly preceded by the flags mentioned above; and
17342 the right column contains information about them, like a description, or
17343 synonyms. The table shows both the single and compound forms for each
17344 property that has them. If the left column is a short name for a property,
17345 the right column will give its longer, more descriptive name; and if the left
17346 column is the longest name, the right column will show any equivalent shortest
17347 name, in both single and compound forms if applicable.
17349 If braces are not needed to specify a property (e.g., C<\\pL>), the left
17350 column contains both forms, with and without braces.
17352 The right column will also caution you if a property means something different
17353 than what might normally be expected.
17355 All single forms are Perl extensions; a few compound forms are as well, and
17358 Numbers in (parentheses) indicate the total number of Unicode code points
17359 matched by the property. For the entries that give the longest, most
17360 descriptive version of the property, the count is followed by a list of some
17361 of the code points matched by it. The list includes all the matched
17362 characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17363 a regular expression bracketed character class. Following that, the next few
17364 higher matching ranges are also given. To avoid visual ambiguity, the SPACE
17365 character is represented as C<\\x$space_hex>.
17367 For emphasis, those properties that match no code points at all are listed as
17368 well in a separate section following the table.
17370 Most properties match the same code points regardless of whether C<"/i">
17371 case-insensitive matching is specified or not. But a few properties are
17372 affected. These are shown with the notation S<C<(/i= I<other_property>)>>
17373 in the second column. Under case-insensitive matching they match the
17374 same code pode points as the property I<other_property>.
17376 There is no description given for most non-Perl defined properties (See
17377 L<$unicode_reference_url> for that).
17379 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17380 combinations. For example, entries like:
17382 \\p{Gc: *} \\p{General_Category: *}
17384 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17385 for the latter is also valid for the former. Similarly,
17389 means that if and only if, for example, C<\\p{Foo}> exists, then
17390 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17391 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17392 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
17395 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17396 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
17397 'N*' to indicate this, and doesn't have separate entries for the other
17398 possibilities. Note that not all properties which have values 'Yes' and 'No'
17399 are binary, and they have all their values spelled out without using this wild
17400 card, and a C<NOT> clause in their description that highlights their not being
17401 binary. These also require the compound form to match them, whereas true
17402 binary properties have both single and compound forms available.
17404 Note that all non-essential underscores are removed in the display of the
17411 =item Z<>B<*> is a wild-card
17413 =item B<(\\d+)> in the info column gives the number of Unicode code points matched
17416 =item B<$DEPRECATED> means this is deprecated.
17418 =item B<$OBSOLETE> means this is obsolete.
17420 =item B<$STABILIZED> means this is stabilized.
17422 =item B<$STRICTER> means tighter (stricter) name matching applies.
17424 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17429 $formatted_properties
17433 =head1 Properties accessible through Unicode::UCD
17435 The value of any Unicode (not including Perl extensions) character
17436 property mentioned above for any single code point is available through
17437 L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the
17438 values of all the Unicode properties for a given code point.
17440 Besides these, all the Unicode character properties mentioned above
17441 (except for those marked as for internal use by Perl) are also
17442 accessible by L<Unicode::UCD/prop_invlist()>.
17444 Due to their nature, not all Unicode character properties are suitable for
17445 regular expression matches, nor C<prop_invlist()>. The remaining
17446 non-provisional, non-internal ones are accessible via
17447 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17448 hasn't included; see L<below for which those are|/Unicode character properties
17449 that are NOT accepted by Perl>).
17451 For compatibility with other parts of Perl, all the single forms given in the
17452 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17453 are recognized. BUT, there are some ambiguities between some Perl extensions
17454 and the Unicode properties, all of which are silently resolved in favor of the
17455 official Unicode property. To avoid surprises, you should only use
17456 C<prop_invmap()> for forms listed in the table below, which omits the
17457 non-recommended ones. The affected forms are the Perl single form equivalents
17458 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17459 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17460 whose short name is C<sc>. The table indicates the current ambiguities in the
17461 INFO column, beginning with the word C<"NOT">.
17463 The standard Unicode properties listed below are documented in
17464 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17465 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
17466 L<perlunicode/Other Properties>;
17468 The first column in the table is a name for the property; the second column is
17469 an alternative name, if any, plus possibly some annotations. The alternative
17470 name is the property's full name, unless that would simply repeat the first
17471 column, in which case the second column indicates the property's short name
17472 (if different). The annotations are given only in the entry for the full
17473 name. The annotations for binary properties include a list of the first few
17474 ranges that the property matches. To avoid any ambiguity, the SPACE character
17475 is represented as C<\\x$space_hex>.
17477 If a property is obsolete, etc, the entry will be flagged with the same
17478 characters used in the table in the L<section above|/Properties accessible
17479 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17483 =head1 Properties accessible through other means
17485 Certain properties are accessible also via core function calls. These are:
17487 Lowercase_Mapping lc() and lcfirst()
17488 Titlecase_Mapping ucfirst()
17489 Uppercase_Mapping uc()
17491 Also, Case_Folding is accessible through the C</i> modifier in regular
17492 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17495 And, the Name and Name_Aliases properties are accessible through the C<\\N{}>
17496 interpolation in double-quoted strings and regular expressions; and functions
17497 C<charnames::viacode()>, C<charnames::vianame()>, and
17498 C<charnames::string_vianame()> (which require a C<use charnames ();> to be
17501 Finally, most properties related to decomposition are accessible via
17502 L<Unicode::Normalize>.
17504 =head1 Unicode character properties that are NOT accepted by Perl
17506 Perl will generate an error for a few character properties in Unicode when
17507 used in a regular expression. The non-Unihan ones are listed below, with the
17508 reasons they are not accepted, perhaps with work-arounds. The short names for
17509 the properties are listed enclosed in (parentheses).
17510 As described after the list, an installation can change the defaults and choose
17511 to accept any of these. The list is machine generated based on the
17512 choices made for the installation that generated this document.
17516 An installation can choose to allow any of these to be matched by downloading
17517 the Unicode database from L<http://www.unicode.org/Public/> to
17518 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17519 controlling lists contained in the program
17520 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17521 (C<\%Config> is available from the Config module).
17523 Also, perl can be recompiled to operate on an earlier version of the Unicode
17524 standard. Further information is at
17525 C<\$Config{privlib}>/F<unicore/README.perl>.
17527 =head1 Other information in the Unicode data base
17529 The Unicode data base is delivered in two different formats. The XML version
17530 is valid for more modern Unicode releases. The other version is a collection
17531 of files. The two are intended to give equivalent information. Perl uses the
17532 older form; this allows you to recompile Perl to use early Unicode releases.
17534 The only non-character property that Perl currently supports is Named
17535 Sequences, in which a sequence of code points
17536 is given a name and generally treated as a single entity. (Perl supports
17537 these via the C<\\N{...}> double-quotish construct,
17538 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17540 Below is a list of the files in the Unicode data base that Perl doesn't
17541 currently use, along with very brief descriptions of their purposes.
17542 Some of the names of the files have been shortened from those that Unicode
17543 uses, in order to allow them to be distinguishable from similarly named files
17544 on file systems for which only the first 8 characters of a name are
17555 L<$unicode_reference_url>
17563 # And write it. The 0 means no utf8.
17564 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17568 sub make_Heavy () {
17569 # Create and write Heavy.pl, which passes info about the tables to
17572 # Stringify structures for output
17573 my $loose_property_name_of
17574 = simple_dumper(\%loose_property_name_of, ' ' x 4);
17575 chomp $loose_property_name_of;
17577 my $strict_property_name_of
17578 = simple_dumper(\%strict_property_name_of, ' ' x 4);
17579 chomp $strict_property_name_of;
17581 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17582 chomp $stricter_to_file_of;
17584 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17585 chomp $inline_definitions;
17587 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17588 chomp $loose_to_file_of;
17590 my $nv_floating_to_rational
17591 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17592 chomp $nv_floating_to_rational;
17594 my $why_deprecated = simple_dumper(\%utf8::why_deprecated, ' ' x 4);
17595 chomp $why_deprecated;
17597 # We set the key to the file when we associated files with tables, but we
17598 # couldn't do the same for the value then, as we might not have the file
17599 # for the alternate table figured out at that time.
17600 foreach my $cased (keys %caseless_equivalent_to) {
17601 my @path = $caseless_equivalent_to{$cased}->file_path;
17603 if ($path[0] eq "#") { # Pseudo-directory '#'
17604 $path = join '/', @path;
17606 else { # Gets rid of lib/
17607 $path = join '/', @path[1, -1];
17609 $caseless_equivalent_to{$cased} = $path;
17611 my $caseless_equivalent_to
17612 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17613 chomp $caseless_equivalent_to;
17615 my $loose_property_to_file_of
17616 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17617 chomp $loose_property_to_file_of;
17619 my $strict_property_to_file_of
17620 = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17621 chomp $strict_property_to_file_of;
17623 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17624 chomp $file_to_swash_name;
17628 $INTERNAL_ONLY_HEADER
17630 # This file is for the use of utf8_heavy.pl and Unicode::UCD
17632 # Maps Unicode (not Perl single-form extensions) property names in loose
17633 # standard form to their corresponding standard names
17634 \%utf8::loose_property_name_of = (
17635 $loose_property_name_of
17638 # Same, but strict names
17639 \%utf8::strict_property_name_of = (
17640 $strict_property_name_of
17643 # Gives the definitions (in the form of inversion lists) for those properties
17644 # whose definitions aren't kept in files
17645 \@utf8::inline_definitions = (
17646 $inline_definitions
17649 # Maps property, table to file for those using stricter matching. For paths
17650 # whose directory is '#', the file is in the form of a numeric index into
17651 # \@inline_definitions
17652 \%utf8::stricter_to_file_of = (
17653 $stricter_to_file_of
17656 # Maps property, table to file for those using loose matching. For paths
17657 # whose directory is '#', the file is in the form of a numeric index into
17658 # \@inline_definitions
17659 \%utf8::loose_to_file_of = (
17663 # Maps floating point to fractional form
17664 \%utf8::nv_floating_to_rational = (
17665 $nv_floating_to_rational
17668 # If a %e floating point number doesn't have this number of digits in it after
17669 # the decimal point to get this close to a fraction, it isn't considered to be
17670 # that fraction even if all the digits it does have match.
17671 \$utf8::e_precision = $E_FLOAT_PRECISION;
17673 # Deprecated tables to generate a warning for. The key is the file containing
17674 # the table, so as to avoid duplication, as many property names can map to the
17675 # file, but we only need one entry for all of them.
17676 \%utf8::why_deprecated = (
17680 # A few properties have different behavior under /i matching. This maps
17681 # those to substitute files to use under /i.
17682 \%utf8::caseless_equivalent = (
17683 $caseless_equivalent_to
17686 # Property names to mapping files
17687 \%utf8::loose_property_to_file_of = (
17688 $loose_property_to_file_of
17691 # Property names to mapping files
17692 \%utf8::strict_property_to_file_of = (
17693 $strict_property_to_file_of
17696 # Files to the swash names within them.
17697 \%utf8::file_to_swash_name = (
17698 $file_to_swash_name
17704 main::write("Heavy.pl", 0, \@heavy); # The 0 means no utf8.
17708 sub make_Name_pm () {
17709 # Create and write Name.pm, which contains subroutines and data to use in
17710 # conjunction with Name.pl
17712 # Maybe there's nothing to do.
17713 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17717 $INTERNAL_ONLY_HEADER
17720 # Convert these structures to output format.
17721 my $code_points_ending_in_code_point =
17722 main::simple_dumper(\@code_points_ending_in_code_point,
17724 my $names = main::simple_dumper(\%names_ending_in_code_point,
17726 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17729 # Do the same with the Hangul names,
17735 if ($has_hangul_syllables) {
17737 # Construct a regular expression of all the possible
17738 # combinations of the Hangul syllables.
17739 my @L_re; # Leading consonants
17740 for my $i ($LBase .. $LBase + $LCount - 1) {
17741 push @L_re, $Jamo{$i}
17743 my @V_re; # Middle vowels
17744 for my $i ($VBase .. $VBase + $VCount - 1) {
17745 push @V_re, $Jamo{$i}
17747 my @T_re; # Trailing consonants
17748 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17749 push @T_re, $Jamo{$i}
17752 # The whole re is made up of the L V T combination.
17754 . join ('|', sort @L_re)
17756 . join ('|', sort @V_re)
17758 . join ('|', sort @T_re)
17761 # These hashes needed by the algorithm were generated
17762 # during reading of the Jamo.txt file
17763 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17764 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17765 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17766 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17773 # This module contains machine-generated tables and code for the
17774 # algorithmically-determinable Unicode character names. The following
17775 # routines can be used to translate between name and code point and vice versa
17779 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
17780 # two must be 10; if there are 5, the first must not be a 0. Written this
17781 # way to decrease backtracking. The first regex allows the code point to
17782 # be at the end of a word, but to work properly, the word shouldn't end
17783 # with a valid hex character. The second one won't match a code point at
17784 # the end of a word, and doesn't have the run-on issue
17785 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17786 my \$code_point_re = qr/$code_point_re/;
17788 # In the following hash, the keys are the bases of names which include
17789 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
17790 # of each key is another hash which is used to get the low and high ends
17791 # for each range of code points that apply to the name.
17792 my %names_ending_in_code_point = (
17796 # The following hash is a copy of the previous one, except is for loose
17797 # matching, so each name has blanks and dashes squeezed out
17798 my %loose_names_ending_in_code_point = (
17802 # And the following array gives the inverse mapping from code points to
17803 # names. Lowest code points are first
17804 my \@code_points_ending_in_code_point = (
17805 $code_points_ending_in_code_point
17808 # Earlier releases didn't have Jamos. No sense outputting
17809 # them unless will be used.
17810 if ($has_hangul_syllables) {
17813 # Convert from code point to Jamo short name for use in composing Hangul
17819 # Leading consonant (can be null)
17829 # Optional trailing consonant
17834 # Computed re that splits up a Hangul name into LVT or LV syllables
17835 my \$syllable_re = qr/$jamo_re/;
17837 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17838 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17840 # These constants names and values were taken from the Unicode standard,
17841 # version 5.1, section 3.12. They are used in conjunction with Hangul
17843 my \$SBase = $SBase_string;
17844 my \$LBase = $LBase_string;
17845 my \$VBase = $VBase_string;
17846 my \$TBase = $TBase_string;
17847 my \$SCount = $SCount;
17848 my \$LCount = $LCount;
17849 my \$VCount = $VCount;
17850 my \$TCount = $TCount;
17851 my \$NCount = \$VCount * \$TCount;
17853 } # End of has Jamos
17855 push @name, << 'END';
17857 sub name_to_code_point_special {
17858 my ($name, $loose) = @_;
17860 # Returns undef if not one of the specially handled names; otherwise
17861 # returns the code point equivalent to the input name
17862 # $loose is non-zero if to use loose matching, 'name' in that case
17863 # must be input as upper case with all blanks and dashes squeezed out.
17865 if ($has_hangul_syllables) {
17866 push @name, << 'END';
17868 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17869 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17871 return if $name !~ qr/^$syllable_re$/;
17872 my $L = $Jamo_L{$1};
17873 my $V = $Jamo_V{$2};
17874 my $T = (defined $3) ? $Jamo_T{$3} : 0;
17875 return ($L * $VCount + $V) * $TCount + $T + $SBase;
17879 push @name, << 'END';
17881 # Name must end in 'code_point' for this to handle.
17882 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17883 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17886 my $code_point = CORE::hex $2;
17890 $names_ref = \%loose_names_ending_in_code_point;
17893 return if $base !~ s/-$//;
17894 $names_ref = \%names_ending_in_code_point;
17897 # Name must be one of the ones which has the code point in it.
17898 return if ! $names_ref->{$base};
17900 # Look through the list of ranges that apply to this name to see if
17901 # the code point is in one of them.
17902 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17903 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17904 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17906 # Here, the code point is in the range.
17907 return $code_point;
17910 # Here, looked like the name had a code point number in it, but
17911 # did not match one of the valid ones.
17915 sub code_point_to_name_special {
17916 my $code_point = shift;
17918 # Returns the name of a code point if algorithmically determinable;
17921 if ($has_hangul_syllables) {
17922 push @name, << 'END';
17924 # If in the Hangul range, calculate the name based on Unicode's
17926 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17928 my $SIndex = $code_point - $SBase;
17929 my $L = $LBase + $SIndex / $NCount;
17930 my $V = $VBase + ($SIndex % $NCount) / $TCount;
17931 my $T = $TBase + $SIndex % $TCount;
17932 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17933 $name .= $Jamo{$T} if $T != $TBase;
17938 push @name, << 'END';
17940 # Look through list of these code points for one in range.
17941 foreach my $hash (@code_points_ending_in_code_point) {
17942 return if $code_point < $hash->{'low'};
17943 if ($code_point <= $hash->{'high'}) {
17944 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17947 return; # None found
17954 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
17959 # Create and write UCD.pl, which passes info about the tables to
17962 # Create a mapping from each alias of Perl single-form extensions to all
17963 # its equivalent aliases, for quick look-up.
17964 my %perlprop_to_aliases;
17965 foreach my $table ($perl->tables) {
17967 # First create the list of the aliases of each extension
17968 my @aliases_list; # List of legal aliases for this extension
17970 my $table_name = $table->name;
17971 my $standard_table_name = standardize($table_name);
17972 my $table_full_name = $table->full_name;
17973 my $standard_table_full_name = standardize($table_full_name);
17975 # Make sure that the list has both the short and full names
17976 push @aliases_list, $table_name, $table_full_name;
17978 my $found_ucd = 0; # ? Did we actually get an alias that should be
17979 # output for this table
17981 # Go through all the aliases (including the two just added), and add
17982 # any new unique ones to the list
17983 foreach my $alias ($table->aliases) {
17985 # Skip non-legal names
17986 next unless $alias->ok_as_filename;
17987 next unless $alias->ucd;
17989 $found_ucd = 1; # have at least one legal name
17991 my $name = $alias->name;
17992 my $standard = standardize($name);
17994 # Don't repeat a name that is equivalent to one already on the
17996 next if $standard eq $standard_table_name;
17997 next if $standard eq $standard_table_full_name;
17999 push @aliases_list, $name;
18002 # If there were no legal names, don't output anything.
18003 next unless $found_ucd;
18005 # To conserve memory in the program reading these in, omit full names
18006 # that are identical to the short name, when those are the only two
18007 # aliases for the property.
18008 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
18012 # Here, @aliases_list is the list of all the aliases that this
18013 # extension legally has. Now can create a map to it from each legal
18014 # standardized alias
18015 foreach my $alias ($table->aliases) {
18016 next unless $alias->ucd;
18017 next unless $alias->ok_as_filename;
18018 push @{$perlprop_to_aliases{standardize($alias->name)}},
18023 # Make a list of all combinations of properties/values that are suppressed.
18025 if (! $debug_skip) { # This tends to fail in this debug mode
18026 foreach my $property_name (keys %why_suppressed) {
18029 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
18031 # The hash may contain properties not in this release of Unicode
18032 next unless defined (my $property = property_ref($property_name));
18034 # Find all combinations
18035 foreach my $prop_alias ($property->aliases) {
18036 my $prop_alias_name = standardize($prop_alias->name);
18038 # If no =value, there's just one combination possible for this
18039 if (! $value_name) {
18041 # The property may be suppressed, but there may be a proxy
18042 # for it, so it shouldn't be listed as suppressed
18043 next if $prop_alias->ucd;
18044 push @suppressed, $prop_alias_name;
18047 foreach my $value_alias
18048 ($property->table($value_name)->aliases)
18050 next if $value_alias->ucd;
18052 push @suppressed, "$prop_alias_name="
18053 . standardize($value_alias->name);
18059 @suppressed = sort @suppressed; # So doesn't change between runs of this
18062 # Convert the structure below (designed for Name.pm) to a form that UCD
18063 # wants, so it doesn't have to modify it at all; i.e. so that it includes
18064 # an element for the Hangul syllables in the appropriate place, and
18065 # otherwise changes the name to include the "-<code point>" suffix.
18066 my @algorithm_names;
18067 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came
18068 # along in this version
18069 # Copy it linearly.
18070 for my $i (0 .. @code_points_ending_in_code_point - 1) {
18072 # Insert the hanguls in the correct place.
18074 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
18077 push @algorithm_names, { low => $SBase,
18078 high => $SBase + $SCount - 1,
18079 name => '<hangul syllable>',
18083 # Copy the current entry, modified.
18084 push @algorithm_names, {
18085 low => $code_points_ending_in_code_point[$i]->{'low'},
18086 high => $code_points_ending_in_code_point[$i]->{'high'},
18088 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
18092 # Serialize these structures for output.
18093 my $loose_to_standard_value
18094 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
18095 chomp $loose_to_standard_value;
18097 my $string_property_loose_to_name
18098 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
18099 chomp $string_property_loose_to_name;
18101 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
18102 chomp $perlprop_to_aliases;
18104 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
18105 chomp $prop_aliases;
18107 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
18108 chomp $prop_value_aliases;
18110 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
18113 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
18114 chomp $algorithm_names;
18116 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
18117 chomp $ambiguous_names;
18119 my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
18120 chomp $combination_property;
18122 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
18123 chomp $loose_defaults;
18127 $INTERNAL_ONLY_HEADER
18129 # This file is for the use of Unicode::UCD
18131 # Highest legal Unicode code point
18132 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
18135 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
18136 \$Unicode::UCD::HANGUL_COUNT = $SCount;
18138 # Keys are all the possible "prop=value" combinations, in loose form; values
18139 # are the standard loose name for the 'value' part of the key
18140 \%Unicode::UCD::loose_to_standard_value = (
18141 $loose_to_standard_value
18144 # String property loose names to standard loose name
18145 \%Unicode::UCD::string_property_loose_to_name = (
18146 $string_property_loose_to_name
18149 # Keys are Perl extensions in loose form; values are each one's list of
18151 \%Unicode::UCD::loose_perlprop_to_name = (
18152 $perlprop_to_aliases
18155 # Keys are standard property name; values are each one's aliases
18156 \%Unicode::UCD::prop_aliases = (
18160 # Keys of top level are standard property name; values are keys to another
18161 # hash, Each one is one of the property's values, in standard form. The
18162 # values are that prop-val's aliases. If only one specified, the short and
18163 # long alias are identical.
18164 \%Unicode::UCD::prop_value_aliases = (
18165 $prop_value_aliases
18168 # Ordered (by code point ordinal) list of the ranges of code points whose
18169 # names are algorithmically determined. Each range entry is an anonymous hash
18170 # of the start and end points and a template for the names within it.
18171 \@Unicode::UCD::algorithmic_named_code_points = (
18175 # The properties that as-is have two meanings, and which must be disambiguated
18176 \%Unicode::UCD::ambiguous_names = (
18180 # Keys are the prop-val combinations which are the default values for the
18181 # given property, expressed in standard loose form
18182 \%Unicode::UCD::loose_defaults = (
18186 # The properties that are combinations, in that they have both a map table and
18187 # a match table. This is actually for UCD.t, so it knows how to test for
18189 \%Unicode::UCD::combination_property = (
18190 $combination_property
18193 # All combinations of names that are suppressed.
18194 # This is actually for UCD.t, so it knows which properties shouldn't have
18195 # entries. If it got any bigger, would probably want to put it in its own
18196 # file to use memory only when it was needed, in testing.
18197 \@Unicode::UCD::suppressed_properties = (
18204 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
18208 sub write_all_tables() {
18209 # Write out all the tables generated by this program to files, as well as
18210 # the supporting data structures, pod file, and .t file.
18212 my @writables; # List of tables that actually get written
18213 my %match_tables_to_write; # Used to collapse identical match tables
18214 # into one file. Each key is a hash function
18215 # result to partition tables into buckets.
18216 # Each value is an array of the tables that
18217 # fit in the bucket.
18219 # For each property ...
18220 # (sort so that if there is an immutable file name, it has precedence, so
18221 # some other property can't come in and take over its file name. (We
18222 # don't care if both defined, as they had better be different anyway.)
18223 # The property named 'Perl' needs to be first (it doesn't have any
18224 # immutable file name) because empty properties are defined in terms of
18225 # its table named 'All' under the -annotate option.) We also sort by
18226 # the property's name. This is just for repeatability of the outputs
18227 # between runs of this program, but does not affect correctness.
18229 foreach my $property ($perl,
18230 sort { return -1 if defined $a->file;
18231 return 1 if defined $b->file;
18232 return $a->name cmp $b->name;
18233 } grep { $_ != $perl } property_ref('*'))
18235 my $type = $property->type;
18237 # And for each table for that property, starting with the mapping
18240 foreach my $table($property,
18242 # and all the match tables for it (if any), sorted so
18243 # the ones with the shortest associated file name come
18244 # first. The length sorting prevents problems of a
18245 # longer file taking a name that might have to be used
18246 # by a shorter one. The alphabetic sorting prevents
18247 # differences between releases
18248 sort { my $ext_a = $a->external_name;
18249 return 1 if ! defined $ext_a;
18250 my $ext_b = $b->external_name;
18251 return -1 if ! defined $ext_b;
18253 # But return the non-complement table before
18254 # the complement one, as the latter is defined
18255 # in terms of the former, and needs to have
18256 # the information for the former available.
18257 return 1 if $a->complement != 0;
18258 return -1 if $b->complement != 0;
18260 # Similarly, return a subservient table after
18262 return 1 if $a->leader != $a;
18263 return -1 if $b->leader != $b;
18265 my $cmp = length $ext_a <=> length $ext_b;
18267 # Return result if lengths not equal
18268 return $cmp if $cmp;
18270 # Alphabetic if lengths equal
18271 return $ext_a cmp $ext_b
18272 } $property->tables
18276 # Here we have a table associated with a property. It could be
18277 # the map table (done first for each property), or one of the
18278 # other tables. Determine which type.
18279 my $is_property = $table->isa('Property');
18281 my $name = $table->name;
18282 my $complete_name = $table->complete_name;
18284 # See if should suppress the table if is empty, but warn if it
18285 # contains something.
18286 my $suppress_if_empty_warn_if_not
18287 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18289 # Calculate if this table should have any code points associated
18291 my $expected_empty =
18293 # $perl should be empty
18294 ($is_property && ($table == $perl))
18296 # Match tables in properties we skipped populating should be
18298 || (! $is_property && ! $property->to_create_match_tables)
18300 # Tables and properties that are expected to have no code
18301 # points should be empty
18302 || $suppress_if_empty_warn_if_not
18305 # Set a boolean if this table is the complement of an empty binary
18307 my $is_complement_of_empty_binary =
18308 $type == $BINARY &&
18309 (($table == $property->table('Y')
18310 && $property->table('N')->is_empty)
18311 || ($table == $property->table('N')
18312 && $property->table('Y')->is_empty));
18314 if ($table->is_empty) {
18316 if ($suppress_if_empty_warn_if_not) {
18317 $table->set_fate($SUPPRESSED,
18318 $suppress_if_empty_warn_if_not);
18321 # Suppress (by skipping them) expected empty tables.
18322 next TABLE if $expected_empty;
18324 # And setup to later output a warning for those that aren't
18325 # known to be allowed to be empty. Don't do the warning if
18326 # this table is a child of another one to avoid duplicating
18327 # the warning that should come from the parent one.
18328 if (($table == $property || $table->parent == $table)
18329 && $table->fate != $SUPPRESSED
18330 && $table->fate != $MAP_PROXIED
18331 && ! grep { $complete_name =~ /^$_$/ }
18332 @tables_that_may_be_empty)
18334 push @unhandled_properties, "$table";
18337 # The old way of expressing an empty match list was to
18338 # complement the list that matches everything. The new way is
18339 # to create an empty inversion list, but this doesn't work for
18340 # annotating, so use the old way then.
18341 $table->set_complement($All) if $annotate
18342 && $table != $property;
18344 elsif ($expected_empty) {
18346 if ($suppress_if_empty_warn_if_not) {
18347 $because = " because $suppress_if_empty_warn_if_not";
18350 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
18353 # Some tables should match everything
18354 my $expected_full =
18355 ($table->fate == $SUPPRESSED)
18358 ? # All these types of map tables will be full because
18359 # they will have been populated with defaults
18362 : # A match table should match everything if its method
18364 ($table->matches_all
18366 # The complement of an empty binary table will match
18368 || $is_complement_of_empty_binary
18372 my $count = $table->count;
18373 if ($expected_full) {
18374 if ($count != $MAX_WORKING_CODEPOINTS) {
18375 Carp::my_carp("$table matches only "
18376 . clarify_number($count)
18377 . " Unicode code points but should match "
18378 . clarify_number($MAX_WORKING_CODEPOINTS)
18380 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18381 . "). Proceeding anyway.");
18384 # Here is expected to be full. If it is because it is the
18385 # complement of an (empty) binary table that is to be
18386 # suppressed, then suppress this one as well.
18387 if ($is_complement_of_empty_binary) {
18388 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18389 my $opposing = $property->table($opposing_name);
18390 my $opposing_status = $opposing->status;
18391 if ($opposing_status) {
18392 $table->set_status($opposing_status,
18393 $opposing->status_info);
18397 elsif ($count == $MAX_UNICODE_CODEPOINTS
18399 && ($table == $property || $table->leader == $table)
18400 && $table->property->status ne $NORMAL)
18402 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
18405 if ($table->fate >= $SUPPRESSED) {
18406 if (! $is_property) {
18407 my @children = $table->children;
18408 foreach my $child (@children) {
18409 if ($child->fate < $SUPPRESSED) {
18410 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18418 if (! $is_property) {
18420 make_ucd_table_pod_entries($table) if $table->property == $perl;
18422 # Several things need to be done just once for each related
18423 # group of match tables. Do them on the parent.
18424 if ($table->parent == $table) {
18426 # Add an entry in the pod file for the table; it also does
18428 make_re_pod_entries($table) if defined $pod_directory;
18430 # See if the table matches identical code points with
18431 # something that has already been processed and is ready
18432 # for output. In that case, no need to have two files
18433 # with the same code points in them. We use the table's
18434 # hash() method to store these in buckets, so that it is
18435 # quite likely that if two tables are in the same bucket
18436 # they will be identical, so don't have to compare tables
18437 # frequently. The tables have to have the same status to
18438 # share a file, so add this to the bucket hash. (The
18439 # reason for this latter is that Heavy.pl associates a
18440 # status with a file.) We don't check tables that are
18441 # inverses of others, as it would lead to some coding
18442 # complications, and checking all the regular ones should
18444 if ($table->complement == 0) {
18445 my $hash = $table->hash . ';' . $table->status;
18447 # Look at each table that is in the same bucket as
18448 # this one would be.
18449 foreach my $comparison
18450 (@{$match_tables_to_write{$hash}})
18452 # If the table doesn't point back to this one, we
18453 # see if it matches identically
18454 if ( $comparison->leader != $table
18455 && $table->matches_identically_to($comparison))
18457 $table->set_equivalent_to($comparison,
18463 # Here, not equivalent, add this table to the bucket.
18464 push @{$match_tables_to_write{$hash}}, $table;
18470 # Here is the property itself.
18471 # Don't write out or make references to the $perl property
18472 next if $table == $perl;
18474 make_ucd_table_pod_entries($table);
18476 # There is a mapping stored of the various synonyms to the
18477 # standardized name of the property for utf8_heavy.pl.
18478 # Also, the pod file contains entries of the form:
18479 # \p{alias: *} \p{full: *}
18480 # rather than show every possible combination of things.
18482 my @property_aliases = $property->aliases;
18484 my $full_property_name = $property->full_name;
18485 my $property_name = $property->name;
18486 my $standard_property_name = standardize($property_name);
18487 my $standard_property_full_name
18488 = standardize($full_property_name);
18490 # We also create for Unicode::UCD a list of aliases for
18491 # the property. The list starts with the property name;
18492 # then its full name. Legacy properties are not listed in
18496 if ( $property->fate <= $MAP_PROXIED) {
18497 @property_list = ($property_name, $full_property_name);
18498 @standard_list = ($standard_property_name,
18499 $standard_property_full_name);
18502 # For each synonym ...
18503 for my $i (0 .. @property_aliases - 1) {
18504 my $alias = $property_aliases[$i];
18505 my $alias_name = $alias->name;
18506 my $alias_standard = standardize($alias_name);
18509 # Add other aliases to the list of property aliases
18510 if ($property->fate <= $MAP_PROXIED
18511 && ! grep { $alias_standard eq $_ } @standard_list)
18513 push @property_list, $alias_name;
18514 push @standard_list, $alias_standard;
18517 # For utf8_heavy, set the mapping of the alias to the
18519 if ($type == $STRING) {
18520 if ($property->fate <= $MAP_PROXIED) {
18521 $string_property_loose_to_name{$alias_standard}
18522 = $standard_property_name;
18526 my $hash_ref = ($alias_standard =~ /^_/)
18527 ? \%strict_property_name_of
18528 : \%loose_property_name_of;
18529 if (exists $hash_ref->{$alias_standard}) {
18530 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained");
18533 $hash_ref->{$alias_standard}
18534 = $standard_property_name;
18537 # Now for the re pod entry for this alias. Skip if not
18538 # outputting a pod; skip the first one, which is the
18539 # full name so won't have an entry like: '\p{full: *}
18540 # \p{full: *}', and skip if don't want an entry for
18543 || ! defined $pod_directory
18544 || ! $alias->make_re_pod_entry;
18546 my $rhs = "\\p{$full_property_name: *}";
18547 if ($property != $perl && $table->perl_extension) {
18548 $rhs .= ' (Perl extension)';
18550 push @match_properties,
18551 format_pod_line($indent_info_column,
18552 '\p{' . $alias->name . ': *}',
18558 # The list of all possible names is attached to each alias, so
18560 if (@property_list) {
18561 push @{$prop_aliases{$standard_list[0]}}, @property_list;
18564 if ($property->fate <= $MAP_PROXIED) {
18566 # Similarly, we create for Unicode::UCD a list of
18567 # property-value aliases.
18569 # Look at each table in the property...
18570 foreach my $table ($property->tables) {
18572 my $table_full_name = $table->full_name;
18573 my $standard_table_full_name
18574 = standardize($table_full_name);
18575 my $table_name = $table->name;
18576 my $standard_table_name = standardize($table_name);
18578 # The list starts with the table name and its full
18580 push @values_list, $table_name, $table_full_name;
18582 # We add to the table each unique alias that isn't
18583 # discouraged from use.
18584 foreach my $alias ($table->aliases) {
18585 next if $alias->status
18586 && $alias->status eq $DISCOURAGED;
18587 my $name = $alias->name;
18588 my $standard = standardize($name);
18589 next if $standard eq $standard_table_name;
18590 next if $standard eq $standard_table_full_name;
18591 push @values_list, $name;
18594 # Here @values_list is a list of all the aliases for
18595 # the table. That is, all the property-values given
18596 # by this table. By agreement with Unicode::UCD,
18597 # if the name and full name are identical, and there
18598 # are no other names, drop the duplicate entry to save
18600 if (@values_list == 2
18601 && $values_list[0] eq $values_list[1])
18606 # To save memory, unlike the similar list for property
18607 # aliases above, only the standard forms have the list.
18608 # This forces an extra step of converting from input
18609 # name to standard name, but the savings are
18610 # considerable. (There is only marginal savings if we
18611 # did this with the property aliases.)
18612 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18616 # Don't write out a mapping file if not desired.
18617 next if ! $property->to_output_map;
18620 # Here, we know we want to write out the table, but don't do it
18621 # yet because there may be other tables that come along and will
18622 # want to share the file, and the file's comments will change to
18623 # mention them. So save for later.
18624 push @writables, $table;
18626 } # End of looping through the property and all its tables.
18627 } # End of looping through all properties.
18629 # Now have all the tables that will have files written for them. Do it.
18630 foreach my $table (@writables) {
18633 my $property = $table->property;
18634 my $is_property = ($table == $property);
18636 # For very short tables, instead of writing them out to actual files,
18637 # we in-line their inversion list definitions into Heavy.pl. The
18638 # definition replaces the file name, and the special pseudo-directory
18639 # '#' is used to signal this. This significantly cuts down the number
18640 # of files written at little extra cost to the hashes in Heavy.pl.
18641 # And it means, no run-time files to read to get the definitions.
18643 && ! $annotate # For annotation, we want to explicitly show
18644 # everything, so keep in files
18645 && $table->ranges <= 3)
18647 my @ranges = $table->ranges;
18648 my $count = @ranges;
18649 if ($count == 0) { # 0th index reserved for 0-length lists
18652 elsif ($table->leader != $table) {
18654 # Here, is a table that is equivalent to another; code
18655 # in register_file_for_name() causes its leader's definition
18660 else { # No equivalent table so far.
18662 # Build up its definition range-by-range.
18663 my $definition = "";
18664 while (defined (my $range = shift @ranges)) {
18665 my $end = $range->end;
18666 if ($end < $MAX_WORKING_CODEPOINT) {
18668 $end = "\n" . ($end + 1);
18670 else { # Extends to infinity, hence no 'end'
18673 $definition .= "\n" . $range->start . $end;
18675 $definition = "V$count" . $definition;
18676 $filename = @inline_definitions;
18677 push @inline_definitions, $definition;
18680 register_file_for_name($table, \@directory, $filename);
18684 if (! $is_property) {
18685 # Match tables for the property go in lib/$subdirectory, which is
18686 # the property's name. Don't use the standard file name for this,
18687 # as may get an unfamiliar alias
18688 @directory = ($matches_directory, $property->external_name);
18692 @directory = $table->directory;
18693 $filename = $table->file;
18696 # Use specified filename if available, or default to property's
18697 # shortest name. We need an 8.3 safe filename (which means "an 8
18698 # safe" filename, since after the dot is only 'pl', which is < 3)
18699 # The 2nd parameter is if the filename shouldn't be changed, and
18700 # it shouldn't iff there is a hard-coded name for this table.
18701 $filename = construct_filename(
18702 $filename || $table->external_name,
18703 ! $filename, # mutable if no filename
18706 register_file_for_name($table, \@directory, $filename);
18708 # Only need to write one file when shared by more than one
18710 next if ! $is_property
18711 && ($table->leader != $table || $table->complement != 0);
18713 # Construct a nice comment to add to the file
18714 $table->set_final_comment;
18720 # Write out the pod file
18723 # And Heavy.pl, Name.pm, UCD.pl
18728 make_property_test_script() if $make_test_script;
18729 make_normalization_test_script() if $make_norm_test_script;
18733 my @white_space_separators = ( # This used only for making the test script.
18740 sub generate_separator($) {
18741 # This used only for making the test script. It generates the colon or
18742 # equal separator between the property and property value, with random
18743 # white space surrounding the separator
18747 return "" if $lhs eq ""; # No separator if there's only one (the r) side
18749 # Choose space before and after randomly
18750 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18751 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18753 # And return the whole complex, half the time using a colon, half the
18755 return $spaces_before
18756 . (rand() < 0.5) ? '=' : ':'
18760 sub generate_tests($$$$$) {
18761 # This used only for making the test script. It generates test cases that
18762 # are expected to compile successfully in perl. Note that the LHS and
18763 # RHS are assumed to already be as randomized as the caller wants.
18765 my $lhs = shift; # The property: what's to the left of the colon
18766 # or equals separator
18767 my $rhs = shift; # The property value; what's to the right
18768 my $valid_code = shift; # A code point that's known to be in the
18769 # table given by LHS=RHS; undef if table is
18771 my $invalid_code = shift; # A code point known to not be in the table;
18772 # undef if the table is all code points
18773 my $warning = shift;
18775 # Get the colon or equal
18776 my $separator = generate_separator($lhs);
18778 # The whole 'property=value'
18779 my $name = "$lhs$separator$rhs";
18782 # Create a complete set of tests, with complements.
18783 if (defined $valid_code) {
18784 push @output, <<"EOC"
18785 Expect(1, $valid_code, '\\p{$name}', $warning);
18786 Expect(0, $valid_code, '\\p{^$name}', $warning);
18787 Expect(0, $valid_code, '\\P{$name}', $warning);
18788 Expect(1, $valid_code, '\\P{^$name}', $warning);
18791 if (defined $invalid_code) {
18792 push @output, <<"EOC"
18793 Expect(0, $invalid_code, '\\p{$name}', $warning);
18794 Expect(1, $invalid_code, '\\p{^$name}', $warning);
18795 Expect(1, $invalid_code, '\\P{$name}', $warning);
18796 Expect(0, $invalid_code, '\\P{^$name}', $warning);
18802 sub generate_error($$$) {
18803 # This used only for making the test script. It generates test cases that
18804 # are expected to not only not match, but to be syntax or similar errors
18806 my $lhs = shift; # The property: what's to the left of the
18807 # colon or equals separator
18808 my $rhs = shift; # The property value; what's to the right
18809 my $already_in_error = shift; # Boolean; if true it's known that the
18810 # unmodified LHS and RHS will cause an error.
18811 # This routine should not force another one
18812 # Get the colon or equal
18813 my $separator = generate_separator($lhs);
18815 # Since this is an error only, don't bother to randomly decide whether to
18816 # put the error on the left or right side; and assume that the RHS is
18817 # loosely matched, again for convenience rather than rigor.
18818 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18820 my $property = $lhs . $separator . $rhs;
18823 Error('\\p{$property}');
18824 Error('\\P{$property}');
18828 # These are used only for making the test script
18829 # XXX Maybe should also have a bad strict seps, which includes underscore.
18831 my @good_loose_seps = (
18838 my @bad_loose_seps = (
18843 sub randomize_stricter_name {
18844 # This used only for making the test script. Take the input name and
18845 # return a randomized, but valid version of it under the stricter matching
18849 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18851 # If the name looks like a number (integer, floating, or rational), do
18853 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18856 my $separator = $3;
18858 # If there isn't a sign, part of the time add a plus
18859 # Note: Not testing having any denominator having a minus sign
18861 $sign = '+' if rand() <= .3;
18864 # And add 0 or more leading zeros.
18865 $name = $sign . ('0' x int rand(10)) . $number;
18867 if (defined $separator) {
18868 my $extra_zeros = '0' x int rand(10);
18870 if ($separator eq '.') {
18872 # Similarly, add 0 or more trailing zeros after a decimal
18874 $name .= $extra_zeros;
18878 # Or, leading zeros before the denominator
18879 $name =~ s,/,/$extra_zeros,;
18884 # For legibility of the test, only change the case of whole sections at a
18885 # time. To do this, first split into sections. The split returns the
18888 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18889 trace $section if main::DEBUG && $to_trace;
18891 if (length $section > 1 && $section !~ /\D/) {
18893 # If the section is a sequence of digits, about half the time
18894 # randomly add underscores between some of them.
18897 # Figure out how many underscores to add. max is 1 less than
18898 # the number of digits. (But add 1 at the end to make sure
18899 # result isn't 0, and compensate earlier by subtracting 2
18901 my $num_underscores = int rand(length($section) - 2) + 1;
18903 # And add them evenly throughout, for convenience, not rigor
18905 my $spacing = (length($section) - 1)/ $num_underscores;
18906 my $temp = $section;
18908 for my $i (1 .. $num_underscores) {
18909 $section .= substr($temp, 0, $spacing, "") . '_';
18913 push @sections, $section;
18917 # Here not a sequence of digits. Change the case of the section
18919 my $switch = int rand(4);
18920 if ($switch == 0) {
18921 push @sections, uc $section;
18923 elsif ($switch == 1) {
18924 push @sections, lc $section;
18926 elsif ($switch == 2) {
18927 push @sections, ucfirst $section;
18930 push @sections, $section;
18934 trace "returning", join "", @sections if main::DEBUG && $to_trace;
18935 return join "", @sections;
18938 sub randomize_loose_name($;$) {
18939 # This used only for making the test script
18942 my $want_error = shift; # if true, make an error
18943 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
18945 $name = randomize_stricter_name($name);
18948 push @parts, $good_loose_seps[rand(@good_loose_seps)];
18950 # Preserve trailing ones for the sake of not stripping the underscore from
18952 for my $part (split /[-\s_]+ (?= . )/, $name) {
18954 if ($want_error and rand() < 0.3) {
18955 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18959 push @parts, $good_loose_seps[rand(@good_loose_seps)];
18962 push @parts, $part;
18964 my $new = join("", @parts);
18965 trace "$name => $new" if main::DEBUG && $to_trace;
18968 if (rand() >= 0.5) {
18969 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18972 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18978 # Used to make sure don't generate duplicate test cases.
18979 my %test_generated;
18981 sub make_property_test_script() {
18982 # This used only for making the test script
18983 # this written directly -- it's huge.
18985 print "Making test script\n" if $verbosity >= $PROGRESS;
18987 # This uses randomness to test different possibilities without testing all
18988 # possibilities. To ensure repeatability, set the seed to 0. But if
18989 # tests are added, it will perturb all later ones in the .t file
18992 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18994 # Create a list of what the %f representation is for each rational number.
18995 # This will be used below.
18996 my @valid_base_floats = '0.0';
18997 foreach my $e_representation (keys %nv_floating_to_rational) {
18998 push @valid_base_floats,
18999 eval $nv_floating_to_rational{$e_representation};
19002 # It doesn't matter whether the elements of this array contain single lines
19003 # or multiple lines. main::write doesn't count the lines.
19006 push @output, <<'EOF_CODE';
19007 Error('\p{Script=InGreek}'); # Bug #69018
19008 Test_GCB("1100 $nobreak 1161"); # Bug #70940
19009 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
19010 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
19011 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
19013 # Make sure this gets tested; it was not part of the official test suite at
19014 # the time this was added. Note that this is as it would appear in the
19015 # official suite, and gets modified to check for the perl tailoring by
19017 Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
19018 Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
19021 # Sort these so get results in same order on different runs of this
19023 foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
19025 lc $a->name cmp lc $b->name
19026 } property_ref('*'))
19028 # Non-binary properties should not match \p{}; Test all for that.
19029 if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
19030 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
19031 $property->aliases;
19032 foreach my $property_alias ($property->aliases) {
19033 my $name = standardize($property_alias->name);
19035 # But some names are ambiguous, meaning a binary property with
19036 # the same name when used in \p{}, and a different
19037 # (non-binary) property in other contexts.
19038 next if grep { $name eq $_ } keys %ambiguous_names;
19040 push @output, <<"EOF_CODE";
19041 Error('\\p{$name}');
19042 Error('\\P{$name}');
19046 foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
19048 lc $a->name cmp lc $b->name
19049 } $property->tables)
19052 # Find code points that match, and don't match this table.
19053 my $valid = $table->get_valid_code_point;
19054 my $invalid = $table->get_invalid_code_point;
19055 my $warning = ($table->status eq $DEPRECATED)
19059 # Test each possible combination of the property's aliases with
19060 # the table's. If this gets to be too many, could do what is done
19061 # in the set_final_comment() for Tables
19062 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
19063 next unless @table_aliases;
19064 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
19065 next unless @property_aliases;
19067 # Every property can be optionally be prefixed by 'Is_', so test
19068 # that those work, by creating such a new alias for each
19069 # pre-existing one.
19070 push @property_aliases, map { Alias->new("Is_" . $_->name,
19072 $_->make_re_pod_entry,
19073 $_->ok_as_filename,
19077 } @property_aliases;
19078 my $max = max(scalar @table_aliases, scalar @property_aliases);
19079 for my $j (0 .. $max - 1) {
19081 # The current alias for property is the next one on the list,
19082 # or if beyond the end, start over. Similarly for table
19084 = $property_aliases[$j % @property_aliases]->name;
19086 $property_name = "" if $table->property == $perl;
19087 my $table_alias = $table_aliases[$j % @table_aliases];
19088 my $table_name = $table_alias->name;
19089 my $loose_match = $table_alias->loose_match;
19091 # If the table doesn't have a file, any test for it is
19092 # already guaranteed to be in error
19093 my $already_error = ! $table->file_path;
19095 # Generate error cases for this alias.
19096 push @output, generate_error($property_name,
19100 # If the table is guaranteed to always generate an error,
19101 # quit now without generating success cases.
19102 next if $already_error;
19104 # Now for the success cases.
19106 if ($loose_match) {
19108 # For loose matching, create an extra test case for the
19110 my $standard = standardize($table_name);
19112 # $test_name should be a unique combination for each test
19113 # case; used just to avoid duplicate tests
19114 my $test_name = "$property_name=$standard";
19116 # Don't output duplicate test cases.
19117 if (! exists $test_generated{$test_name}) {
19118 $test_generated{$test_name} = 1;
19119 push @output, generate_tests($property_name,
19126 $random = randomize_loose_name($table_name)
19128 else { # Stricter match
19129 $random = randomize_stricter_name($table_name);
19132 # Now for the main test case for this alias.
19133 my $test_name = "$property_name=$random";
19134 if (! exists $test_generated{$test_name}) {
19135 $test_generated{$test_name} = 1;
19136 push @output, generate_tests($property_name,
19143 if ($property->name eq 'nv') {
19144 if ($table_name !~ qr{/}) {
19145 push @output, generate_tests($property_name,
19146 sprintf("%.15e", $table_name),
19153 # If the name is a rational number, add tests for a
19154 # non-reduced form, and for a floating point equivalent.
19156 # 60 is a number divisible by a bunch of things
19157 my ($numerator, $denominator) = $table_name
19158 =~ m! (.+) / (.+) !x;
19160 $denominator *= 60;
19161 push @output, generate_tests($property_name,
19162 "$numerator/$denominator",
19168 # Calculate the float, and the %e representation
19169 my $float = eval $table_name;
19170 my $e_representation = sprintf("%.*e",
19171 $E_FLOAT_PRECISION, $float);
19173 my ($non_zeros, $zeros, $exponent_sign, $exponent)
19174 = $e_representation
19175 =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
19176 my $min_e_precision;
19177 my $min_f_precision;
19179 if ($exponent_sign eq '+' && $exponent != 0) {
19180 Carp::my_carp_bug("Not yet equipped to handle"
19181 . " positive exponents");
19185 # We're trying to find the minimum precision that
19186 # is needed to indicate this particular rational
19187 # for the given $E_FLOAT_PRECISION. For %e, any
19188 # trailing zeros, like 1.500e-02 aren't needed, so
19189 # the correct value is how many non-trailing zeros
19190 # there are after the decimal point.
19191 $min_e_precision = length $non_zeros;
19193 # For %f, like .01500, we want at least
19194 # $E_FLOAT_PRECISION digits, but any trailing
19195 # zeros aren't needed, so we can subtract the
19196 # length of those. But we also need to include
19197 # the zeros after the decimal point, but before
19198 # the first significant digit.
19199 $min_f_precision = $E_FLOAT_PRECISION
19204 # Make tests for each possible precision from 1 to
19205 # just past the worst case.
19206 my $upper_limit = ($min_e_precision > $min_f_precision)
19208 : $min_f_precision;
19210 for my $i (1 .. $upper_limit + 1) {
19211 for my $format ("e", "f") {
19213 = sprintf("%.*$format", $i, $float);
19215 # If we don't have enough precision digits,
19216 # make a fail test; otherwise a pass test.
19217 my $pass = ($format eq "e")
19218 ? $i >= $min_e_precision
19219 : $i >= $min_f_precision;
19221 push @output, generate_tests($property_name,
19228 elsif ( $format eq "e"
19230 # Here we would fail, but in the %f
19231 # case, the representation at this
19232 # precision could actually be a
19233 # valid one for some other rational
19234 || ! grep { $_ eq $this_table }
19235 @valid_base_floats)
19238 generate_error($property_name,
19240 1 # 1 => already an
19252 $property->DESTROY();
19255 # Make any test of the boundary (break) properties TODO if the code
19256 # doesn't match the version being compiled
19257 my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19258 ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19259 : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19267 # Cause there to be 'if' statements to only execute a portion of this
19268 # long-running test each time, so that we can have a bunch of .t's running
19270 my $chunks = 10 # Number of test files
19273 - 4; # LB split into this many files
19274 my @output_chunked;
19276 my $chunk_size= int(@output / $chunks) + 1;
19279 my @chunk= splice @output, 0, $chunk_size;
19280 push @output_chunked,
19281 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19287 push @output_chunked,
19288 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19289 (map {" Test_GCB('$_');\n"} @backslash_X_tests),
19290 (map {" Test_SB('$_');\n"} @SB_tests),
19294 $chunk_size= int(@LB_tests / 4) + 1;
19295 @LB_tests = map {" Test_LB('$_');\n"} @LB_tests;
19296 while (@LB_tests) {
19298 my @chunk= splice @LB_tests, 0, $chunk_size;
19299 push @output_chunked,
19300 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19306 push @output_chunked,
19307 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19308 (map {" Test_WB('$_');\n"} @WB_tests),
19314 $TODO_FAILING_BREAKS,
19323 sub make_normalization_test_script() {
19324 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19326 my $n_path = 'TestNorm.pl';
19328 unshift @normalization_tests, <<'END';
19332 sub ord_string { # Convert packed ords to printable string
19334 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19335 unpack "U*", shift) . "'";
19336 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
19340 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19341 my $display_source = ord_string($source);
19342 my $display_nfc = ord_string($nfc);
19343 my $display_nfd = ord_string($nfd);
19344 my $display_nfkc = ord_string($nfkc);
19345 my $display_nfkd = ord_string($nfkd);
19347 use Unicode::Normalize;
19349 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
19350 # nfkc == toNFC(nfkc) == toNFC(nfkd)
19353 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
19354 # nfkd == toNFD(nfkc) == toNFD(nfkd)
19357 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19358 # toNFKC(nfkc) == toNFKC(nfkd)
19361 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19362 # toNFKD(nfkc) == toNFKD(nfkd)
19364 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19365 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19366 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19367 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19368 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19370 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19371 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19372 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19373 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19374 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19376 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19377 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19378 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19379 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19380 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19382 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19383 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19384 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19385 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19386 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19393 @normalization_tests,
19399 # Skip reasons, so will be exact same text and hence the files with each
19400 # reason will get grouped together in perluniprops.
19401 my $Documentation = "Documentation";
19403 = "Provisional; for the analysis and processing of Indic scripts";
19404 my $Validation = "Validation Tests";
19405 my $Validation_Documentation = "Documentation of validation Tests";
19407 # This is a list of the input files and how to handle them. The files are
19408 # processed in their order in this list. Some reordering is possible if
19409 # desired, but the PropertyAliases and PropValueAliases files should be first,
19410 # and the extracted before the others (as data in an extracted file can be
19411 # over-ridden by the non-extracted. Some other files depend on data derived
19412 # from an earlier file, like UnicodeData requires data from Jamo, and the case
19413 # changing and folding requires data from Unicode. Mostly, it is safest to
19414 # order by first version releases in (except the Jamo).
19416 # The version strings allow the program to know whether to expect a file or
19417 # not, but if a file exists in the directory, it will be processed, even if it
19418 # is in a version earlier than expected, so you can copy files from a later
19419 # release into an earlier release's directory.
19420 my @input_file_objects = (
19421 Input_file->new('PropertyAliases.txt', v3.2,
19422 Handler => \&process_PropertyAliases,
19423 Early => [ \&substitute_PropertyAliases ],
19424 Required_Even_in_Debug_Skip => 1,
19426 Input_file->new(undef, v0, # No file associated with this
19427 Progress_Message => 'Finishing property setup',
19428 Handler => \&finish_property_setup,
19430 Input_file->new('PropValueAliases.txt', v3.2,
19431 Handler => \&process_PropValueAliases,
19432 Early => [ \&substitute_PropValueAliases ],
19433 Has_Missings_Defaults => $NOT_IGNORED,
19434 Required_Even_in_Debug_Skip => 1,
19436 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19437 Property => 'General_Category',
19439 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19440 Property => 'Canonical_Combining_Class',
19441 Has_Missings_Defaults => $NOT_IGNORED,
19443 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19444 Property => 'Numeric_Type',
19445 Has_Missings_Defaults => $NOT_IGNORED,
19447 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19448 Property => 'East_Asian_Width',
19449 Has_Missings_Defaults => $NOT_IGNORED,
19451 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19452 Property => 'Line_Break',
19453 Has_Missings_Defaults => $NOT_IGNORED,
19455 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19456 Property => 'Bidi_Class',
19457 Has_Missings_Defaults => $NOT_IGNORED,
19459 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19460 Property => 'Decomposition_Type',
19461 Has_Missings_Defaults => $NOT_IGNORED,
19463 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19464 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19465 Property => 'Numeric_Value',
19466 Each_Line_Handler => \&filter_numeric_value_line,
19467 Has_Missings_Defaults => $NOT_IGNORED,
19469 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19470 Property => 'Joining_Group',
19471 Has_Missings_Defaults => $NOT_IGNORED,
19474 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19475 Property => 'Joining_Type',
19476 Has_Missings_Defaults => $NOT_IGNORED,
19478 Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19479 Skip => 'This file adds no new information not already'
19480 . ' present in other files',
19481 # And it's unnecessary programmer work to handle this new
19482 # format. Previous Derived files actually had bug fixes
19483 # in them that were useful, but that should not be the
19486 Input_file->new('Jamo.txt', v2.0.0,
19487 Property => 'Jamo_Short_Name',
19488 Each_Line_Handler => \&filter_jamo_line,
19490 Input_file->new('UnicodeData.txt', v1.1.5,
19491 Pre_Handler => \&setup_UnicodeData,
19493 # We clean up this file for some early versions.
19494 Each_Line_Handler => [ (($v_version lt v2.0.0 )
19496 : ($v_version eq v2.1.5)
19497 ? \&filter_v2_1_5_ucd
19499 # And for 5.14 Perls with 6.0,
19500 # have to also make changes
19501 : ($v_version ge v6.0.0
19506 # Early versions did not have the
19507 # proper Unicode_1 names for the
19509 (($v_version lt v3.0.0)
19510 ? \&filter_early_U1_names
19513 # Early versions did not correctly
19514 # use the later method for giving
19515 # decimal digit values
19516 (($v_version le v3.2.0)
19517 ? \&filter_bad_Nd_ucd
19520 # And the main filter
19521 \&filter_UnicodeData_line,
19523 EOF_Handler => \&EOF_UnicodeData,
19525 Input_file->new('CJKXREF.TXT', v1.1.5,
19526 Withdrawn => v2.0.0,
19527 Skip => 'Gives the mapping of CJK code points '
19528 . 'between Unicode and various other standards',
19530 Input_file->new('ArabicShaping.txt', v2.0.0,
19531 Each_Line_Handler =>
19532 ($v_version lt 4.1.0)
19533 ? \&filter_old_style_arabic_shaping
19535 # The first field after the range is a "schematic name"
19537 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19538 Has_Missings_Defaults => $NOT_IGNORED,
19540 Input_file->new('Blocks.txt', v2.0.0,
19541 Property => 'Block',
19542 Has_Missings_Defaults => $NOT_IGNORED,
19543 Each_Line_Handler => \&filter_blocks_lines
19545 Input_file->new('Index.txt', v2.0.0,
19546 Skip => 'Alphabetical index of Unicode characters',
19548 Input_file->new('NamesList.txt', v2.0.0,
19549 Skip => 'Annotated list of characters',
19551 Input_file->new('PropList.txt', v2.0.0,
19552 Each_Line_Handler => (($v_version lt v3.1.0)
19553 ? \&filter_old_style_proplist
19556 Input_file->new('Props.txt', v2.0.0,
19557 Withdrawn => v3.0.0,
19558 Skip => 'A subset of F<PropList.txt> (which is used instead)',
19560 Input_file->new('ReadMe.txt', v2.0.0,
19561 Skip => $Documentation,
19563 Input_file->new('Unihan.txt', v2.0.0,
19564 Withdrawn => v5.2.0,
19565 Construction_Time_Handler => \&construct_unihan,
19566 Pre_Handler => \&setup_unihan,
19568 'Unicode_Radical_Stroke'
19570 Each_Line_Handler => \&filter_unihan_line,
19572 Input_file->new('SpecialCasing.txt', v2.1.8,
19573 Each_Line_Handler => ($v_version eq 2.1.8)
19574 ? \&filter_2_1_8_special_casing_line
19575 : \&filter_special_casing_line,
19576 Pre_Handler => \&setup_special_casing,
19577 Has_Missings_Defaults => $IGNORED,
19580 'LineBreak.txt', v3.0.0,
19581 Has_Missings_Defaults => $NOT_IGNORED,
19582 Property => 'Line_Break',
19583 # Early versions had problematic syntax
19584 Each_Line_Handler => ($v_version ge v3.1.0)
19586 : ($v_version lt v3.0.0)
19587 ? \&filter_substitute_lb
19588 : \&filter_early_ea_lb,
19589 # Must use long names for property values see comments at
19590 # sub filter_substitute_lb
19591 Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19592 'Alphabetic', # default to this because XX ->
19595 # Don't use _Perl_LB as a synonym for
19596 # Line_Break in later perls, as it is tailored
19597 # and isn't the same as Line_Break
19600 Input_file->new('EastAsianWidth.txt', v3.0.0,
19601 Property => 'East_Asian_Width',
19602 Has_Missings_Defaults => $NOT_IGNORED,
19603 # Early versions had problematic syntax
19604 Each_Line_Handler => (($v_version lt v3.1.0)
19605 ? \&filter_early_ea_lb
19608 Input_file->new('CompositionExclusions.txt', v3.0.0,
19609 Property => 'Composition_Exclusion',
19611 Input_file->new('UnicodeData.html', v3.0.0,
19612 Withdrawn => v4.0.1,
19613 Skip => $Documentation,
19615 Input_file->new('BidiMirroring.txt', v3.0.1,
19616 Property => 'Bidi_Mirroring_Glyph',
19617 Has_Missings_Defaults => ($v_version lt v6.2.0)
19619 # Is <none> which doesn't mean
19620 # anything to us, we will use the
19624 Input_file->new('NamesList.html', v3.0.0,
19625 Skip => 'Describes the format and contents of '
19626 . 'F<NamesList.txt>',
19628 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19630 Skip => $Documentation,
19632 Input_file->new('CaseFolding.txt', v3.0.1,
19633 Pre_Handler => \&setup_case_folding,
19634 Each_Line_Handler =>
19635 [ ($v_version lt v3.1.0)
19636 ? \&filter_old_style_case_folding
19638 \&filter_case_folding_line
19640 Has_Missings_Defaults => $IGNORED,
19642 Input_file->new("NormTest.txt", v3.0.1,
19643 Handler => \&process_NormalizationsTest,
19644 Skip => ($make_norm_test_script) ? 0 : $Validation,
19646 Input_file->new('DCoreProperties.txt', v3.1.0,
19647 # 5.2 changed this file
19648 Has_Missings_Defaults => (($v_version ge v5.2.0)
19652 Input_file->new('DProperties.html', v3.1.0,
19653 Withdrawn => v3.2.0,
19654 Skip => $Documentation,
19656 Input_file->new('PropList.html', v3.1.0,
19658 Skip => $Documentation,
19660 Input_file->new('Scripts.txt', v3.1.0,
19661 Property => 'Script',
19662 Each_Line_Handler => (($v_version le v4.0.0)
19663 ? \&filter_all_caps_script_names
19665 Has_Missings_Defaults => $NOT_IGNORED,
19667 Input_file->new('DNormalizationProps.txt', v3.1.0,
19668 Has_Missings_Defaults => $NOT_IGNORED,
19669 Each_Line_Handler => (($v_version lt v4.0.1)
19670 ? \&filter_old_style_normalization_lines
19673 Input_file->new('DerivedProperties.html', v3.1.1,
19675 Skip => $Documentation,
19677 Input_file->new('DAge.txt', v3.2.0,
19678 Has_Missings_Defaults => $NOT_IGNORED,
19681 Input_file->new('HangulSyllableType.txt', v4.0,
19682 Has_Missings_Defaults => $NOT_IGNORED,
19683 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19684 Property => 'Hangul_Syllable_Type'
19686 Input_file->new('NormalizationCorrections.txt', v3.2.0,
19687 # This documents the cumulative fixes to erroneous
19688 # normalizations in earlier Unicode versions. Its main
19689 # purpose is so that someone running on an earlier
19690 # version can use this file to override what got
19691 # published in that earlier release. It would be easy
19692 # for mktables to handle this file. But all the
19693 # corrections in it should already be in the other files
19694 # for the release it is. To get it to actually mean
19695 # something useful, someone would have to be using an
19696 # earlier Unicode release, and copy it into the directory
19697 # for that release and recompile. So far there has been
19698 # no demand to do that, so this hasn't been implemented.
19699 Skip => 'Documentation of corrections already '
19700 . 'incorporated into the Unicode data base',
19702 Input_file->new('StandardizedVariants.html', v3.2.0,
19703 Skip => 'Obsoleted as of Unicode 9.0, but previously '
19704 . 'provided a visual display of the standard '
19705 . 'variant sequences derived from '
19706 . 'F<StandardizedVariants.txt>.',
19707 # I don't know why the html came earlier than the
19708 # .txt, but both are skipped anyway, so it doesn't
19711 Input_file->new('StandardizedVariants.txt', v4.0.0,
19712 Skip => 'Certain glyph variations for character display '
19713 . 'are standardized. This lists the non-Unihan '
19714 . 'ones; the Unihan ones are also not used by '
19715 . 'Perl, and are in a separate Unicode data base '
19716 . 'L<http://www.unicode.org/ivd>',
19718 Input_file->new('UCD.html', v4.0.0,
19720 Skip => $Documentation,
19722 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19723 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter',
19725 # Don't use _Perl_WB as a synonym for
19726 # Word_Break in later perls, as it is tailored
19727 # and isn't the same as Word_Break
19729 Property => 'Word_Break',
19730 Has_Missings_Defaults => $NOT_IGNORED,
19732 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19733 Early => [ \&generate_GCB, '_Perl_GCB' ],
19734 Property => 'Grapheme_Cluster_Break',
19735 Has_Missings_Defaults => $NOT_IGNORED,
19737 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19738 Handler => \&process_GCB_test,
19739 retain_trailing_comments => 1,
19741 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19742 Skip => $Validation_Documentation,
19744 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19745 Handler => \&process_SB_test,
19746 retain_trailing_comments => 1,
19748 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19749 Skip => $Validation_Documentation,
19751 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19752 Handler => \&process_WB_test,
19753 retain_trailing_comments => 1,
19755 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19756 Skip => $Validation_Documentation,
19758 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19759 Property => 'Sentence_Break',
19760 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19761 Has_Missings_Defaults => $NOT_IGNORED,
19763 Input_file->new('NamedSequences.txt', v4.1.0,
19764 Handler => \&process_NamedSequences
19766 Input_file->new('Unihan.html', v4.1.0,
19768 Skip => $Documentation,
19770 Input_file->new('NameAliases.txt', v5.0,
19771 Property => 'Name_Alias',
19772 Each_Line_Handler => ($v_version le v6.0.0)
19773 ? \&filter_early_version_name_alias_line
19774 : \&filter_later_version_name_alias_line,
19776 # NameAliases.txt came along in v5.0. The above constructor handles
19777 # this. But until 6.1, it was lacking some information needed by core
19778 # perl. The constructor below handles that. It is either a kludge or
19779 # clever, depending on your point of view. The 'Withdrawn' parameter
19780 # indicates not to use it at all starting in 6.1 (so the above
19781 # constructor applies), and the 'v6.1' parameter indicates to use the
19782 # Early parameter before 6.1. Therefore 'Early" is always used,
19783 # yielding the internal-only property '_Perl_Name_Alias', which it
19784 # gets from a NameAliases.txt from 6.1 or later stored in
19785 # N_Asubst.txt. In combination with the above constructor,
19786 # 'Name_Alias' is publicly accessible starting with v5.0, and the
19787 # better 6.1 version is accessible to perl core in all releases.
19788 Input_file->new("NameAliases.txt", v6.1,
19790 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19791 Property => 'Name_Alias',
19792 EOF_Handler => \&fixup_early_perl_name_alias,
19793 Each_Line_Handler =>
19794 \&filter_later_version_name_alias_line,
19796 Input_file->new('NamedSqProv.txt', v5.0.0,
19797 Skip => 'Named sequences proposed for inclusion in a '
19798 . 'later version of the Unicode Standard; if you '
19799 . 'need them now, you can append this file to '
19800 . 'F<NamedSequences.txt> and recompile perl',
19802 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19803 Handler => \&process_LB_test,
19804 retain_trailing_comments => 1,
19806 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19807 Skip => $Validation_Documentation,
19809 Input_file->new("BidiTest.txt", v5.2.0,
19810 Skip => $Validation,
19812 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19814 Each_Line_Handler => \&filter_unihan_line,
19816 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19818 Each_Line_Handler => \&filter_unihan_line,
19820 Input_file->new('UnihanIRGSources.txt', v5.2.0,
19822 'kCompatibilityVariant',
19834 Pre_Handler => \&setup_unihan,
19835 Each_Line_Handler => \&filter_unihan_line,
19837 Input_file->new('UnihanNumericValues.txt', v5.2.0,
19839 'kAccountingNumeric',
19843 Each_Line_Handler => \&filter_unihan_line,
19845 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19847 Each_Line_Handler => \&filter_unihan_line,
19849 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19851 'Unicode_Radical_Stroke'
19853 Each_Line_Handler => \&filter_unihan_line,
19855 Input_file->new('UnihanReadings.txt', v5.2.0,
19857 Each_Line_Handler => \&filter_unihan_line,
19859 Input_file->new('UnihanVariants.txt', v5.2.0,
19861 Each_Line_Handler => \&filter_unihan_line,
19863 Input_file->new('CJKRadicals.txt', v5.2.0,
19864 Skip => 'Maps the kRSUnicode property values to '
19865 . 'corresponding code points',
19867 Input_file->new('EmojiSources.txt', v6.0.0,
19868 Skip => 'Maps certain Unicode code points to their '
19869 . 'legacy Japanese cell-phone values',
19871 Input_file->new('ScriptExtensions.txt', v6.0.0,
19872 Property => 'Script_Extensions',
19873 Early => [ sub {} ], # Doesn't do anything but ensures
19874 # that this isn't skipped for early
19876 Pre_Handler => \&setup_script_extensions,
19877 Each_Line_Handler => \&filter_script_extensions_line,
19878 Has_Missings_Defaults => (($v_version le v6.0.0)
19882 # These two Indic files are actually not usable as-is until 6.1.0,
19883 # because their property values are missing from PropValueAliases.txt
19884 # until that release, so that further work would have to be done to get
19885 # them to work properly, which isn't worth it because of them being
19887 Input_file->new('IndicMatraCategory.txt', v6.0.0,
19888 Withdrawn => v8.0.0,
19889 Property => 'Indic_Matra_Category',
19890 Has_Missings_Defaults => $NOT_IGNORED,
19891 Skip => $Indic_Skip,
19893 Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19894 Property => 'Indic_Syllabic_Category',
19895 Has_Missings_Defaults => $NOT_IGNORED,
19896 Skip => (($v_version lt v8.0.0)
19900 Input_file->new('USourceData.txt', v6.2.0,
19901 Skip => 'Documentation of status and cross reference of '
19902 . 'proposals for encoding by Unicode of Unihan '
19905 Input_file->new('USourceGlyphs.pdf', v6.2.0,
19906 Skip => 'Pictures of the characters in F<USourceData.txt>',
19908 Input_file->new('BidiBrackets.txt', v6.3.0,
19909 Properties => [ 'Bidi_Paired_Bracket',
19910 'Bidi_Paired_Bracket_Type'
19912 Has_Missings_Defaults => $NO_DEFAULTS,
19914 Input_file->new("BidiCharacterTest.txt", v6.3.0,
19915 Skip => $Validation,
19917 Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19918 Property => 'Indic_Positional_Category',
19919 Has_Missings_Defaults => $NOT_IGNORED,
19921 Input_file->new('TangutSources.txt', v9.0.0,
19922 Skip => 'Specifies source mappings for Tangut ideographs'
19923 . ' and components. This data file also includes'
19924 . ' informative radical-stroke values that are used'
19925 . ' internally by Unicode',
19927 Input_file->new('VerticalOrientation.txt', v10.0.0,
19928 Property => 'Vertical_Orientation',
19929 Has_Missings_Defaults => $NOT_IGNORED,
19931 Input_file->new('NushuSources.txt', v10.0.0,
19932 Skip => 'Specifies source material for Nushu characters',
19936 # End of all the preliminaries.
19939 if (@missing_early_files) {
19940 print simple_fold(join_lines(<<END
19942 The compilation cannot be completed because one or more required input files,
19943 listed below, are missing. This is because you are compiling Unicode version
19944 $unicode_version, which predates the existence of these file(s). To fully
19945 function, perl needs the data that these files would have contained if they
19946 had been in this release. To work around this, create copies of later
19947 versions of the missing files in the directory containing '$0'. (Perl will
19948 make the necessary adjustments to the data to compensate for it not being the
19949 same version as is being compiled.) The files are available from unicode.org,
19950 via either ftp or http. If using http, they will be under
19951 www.unicode.org/versions/. Below are listed the source file name of each
19952 missing file, the Unicode version to copy it from, and the name to store it
19953 as. (Note that the listed source file name may not be exactly the one that
19954 Unicode calls it. If you don't find it, you can look it up in 'README.perl'
19955 to get the correct name.)
19958 print simple_fold(join_lines("\n$_")) for @missing_early_files;
19962 if ($compare_versions) {
19963 Carp::my_carp(<<END
19964 Warning. \$compare_versions is set. Output is not suitable for production
19969 # Put into %potential_files a list of all the files in the directory structure
19970 # that could be inputs to this program
19973 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the
19975 my $full = lc(File::Spec->rel2abs($_));
19976 $potential_files{$full} = 1;
19979 }, File::Spec->curdir());
19981 my @mktables_list_output_files;
19982 my $old_start_time = 0;
19983 my $old_options = "";
19985 if (! -e $file_list) {
19986 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19987 $write_unchanged_files = 1;
19988 } elsif ($write_unchanged_files) {
19989 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19992 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19994 if (! open $file_handle, "<", $file_list) {
19995 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
20001 # Read and parse mktables.lst, placing the results from the first part
20002 # into @input, and the second part into @mktables_list_output_files
20003 for my $list ( \@input, \@mktables_list_output_files ) {
20004 while (<$file_handle>) {
20005 s/^ \s+ | \s+ $//xg;
20006 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
20007 $old_start_time = $1;
20010 if (/^ \s* \# \s* From\ options\ (.+) /x) {
20014 next if /^ \s* (?: \# .* )? $/x;
20016 my ( $file ) = split /\t/;
20017 push @$list, $file;
20019 @$list = uniques(@$list);
20023 # Look through all the input files
20024 foreach my $input (@input) {
20025 next if $input eq 'version'; # Already have checked this.
20027 # Ignore if doesn't exist. The checking about whether we care or
20028 # not is done via the Input_file object.
20029 next if ! file_exists($input);
20031 # The paths are stored with relative names, and with '/' as the
20032 # delimiter; convert to absolute on this machine
20033 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
20034 $potential_files{lc $full} = 1;
20038 close $file_handle;
20043 # Here wants to process all .txt files in the directory structure.
20044 # Convert them to full path names. They are stored in the platform's
20047 foreach my $object (@input_file_objects) {
20048 my $file = $object->file;
20049 next unless defined $file;
20050 push @known_files, File::Spec->rel2abs($file);
20053 my @unknown_input_files;
20054 foreach my $file (keys %potential_files) { # The keys are stored in lc
20055 next if grep { $file eq lc($_) } @known_files;
20057 # Here, the file is unknown to us. Get relative path name
20058 $file = File::Spec->abs2rel($file);
20059 push @unknown_input_files, $file;
20061 # What will happen is we create a data structure for it, and add it to
20062 # the list of input files to process. First get the subdirectories
20064 my (undef, $directories, undef) = File::Spec->splitpath($file);
20065 $directories =~ s;/$;;; # Can have extraneous trailing '/'
20066 my @directories = File::Spec->splitdir($directories);
20068 # If the file isn't extracted (meaning none of the directories is the
20069 # extracted one), just add it to the end of the list of inputs.
20070 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
20071 push @input_file_objects, Input_file->new($file, v0);
20075 # Here, the file is extracted. It needs to go ahead of most other
20076 # processing. Search for the first input file that isn't a
20077 # special required property (that is, find one whose first_release
20078 # is non-0), and isn't extracted. Also, the Age property file is
20079 # processed before the extracted ones, just in case
20080 # $compare_versions is set.
20081 for (my $i = 0; $i < @input_file_objects; $i++) {
20082 if ($input_file_objects[$i]->first_released ne v0
20083 && lc($input_file_objects[$i]->file) ne 'dage.txt'
20084 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
20086 splice @input_file_objects, $i, 0,
20087 Input_file->new($file, v0);
20094 if (@unknown_input_files) {
20095 print STDERR simple_fold(join_lines(<<END
20097 The following files are unknown as to how to handle. Assuming they are
20098 typical property files. You'll know by later error messages if it worked or
20101 ) . " " . join(", ", @unknown_input_files) . "\n\n");
20103 } # End of looking through directory structure for more .txt files.
20105 # Create the list of input files from the objects we have defined, plus
20107 my @input_files = qw(version Makefile);
20108 foreach my $object (@input_file_objects) {
20109 my $file = $object->file;
20110 next if ! defined $file; # Not all objects have files
20111 next if defined $object->skip;;
20112 push @input_files, $file;
20115 if ( $verbosity >= $VERBOSE ) {
20116 print "Expecting ".scalar( @input_files )." input files. ",
20117 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
20120 # We set $most_recent to be the most recently changed input file, including
20121 # this program itself (done much earlier in this file)
20122 foreach my $in (@input_files) {
20123 next unless -e $in; # Keep going even if missing a file
20124 my $mod_time = (stat $in)[9];
20125 $most_recent = $mod_time if $mod_time > $most_recent;
20127 # See that the input files have distinct names, to warn someone if they
20128 # are adding a new one
20130 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
20131 $directories =~ s;/$;;; # Can have extraneous trailing '/'
20132 my @directories = File::Spec->splitdir($directories);
20133 construct_filename($file, 'mutable', \@directories);
20137 # We use 'Makefile' just to see if it has changed since the last time we
20138 # rebuilt. Now discard it.
20139 @input_files = grep { $_ ne 'Makefile' } @input_files;
20141 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
20142 || ! scalar @mktables_list_output_files # or if no outputs known
20143 || $old_start_time < $most_recent # or out-of-date
20144 || $old_options ne $command_line_arguments; # or with different
20147 # Now we check to see if any output files are older than youngest, if
20148 # they are, we need to continue on, otherwise we can presumably bail.
20150 foreach my $out (@mktables_list_output_files) {
20151 if ( ! file_exists($out)) {
20152 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
20156 #local $to_trace = 1 if main::DEBUG;
20157 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
20158 if ( (stat $out)[9] <= $most_recent ) {
20159 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
20160 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
20167 print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
20170 print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20172 # Ready to do the major processing. First create the perl pseudo-property.
20173 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20175 # Process each input file
20176 foreach my $file (@input_file_objects) {
20180 # Finish the table generation.
20182 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20185 # For the very specialized case of comparing two Unicode versions...
20186 if (DEBUG && $compare_versions) {
20187 handle_compare_versions();
20190 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20193 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20194 add_perl_synonyms();
20196 print "Writing tables\n" if $verbosity >= $PROGRESS;
20197 write_all_tables();
20199 # Write mktables.lst
20200 if ( $file_list and $make_list ) {
20202 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20203 foreach my $file (@input_files, @files_actually_output) {
20204 my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20205 my @directories = grep length, File::Spec->splitdir($directories);
20206 $file = join '/', @directories, $basefile;
20210 if (! open $ofh,">",$file_list) {
20211 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
20215 my $localtime = localtime $start_time;
20216 print $ofh <<"END";
20218 # $file_list -- File list for $0.
20220 # Autogenerated starting on $start_time ($localtime)
20221 # From options $command_line_arguments
20223 # - First section is input files
20224 # ($0 itself is not listed but is automatically considered an input)
20225 # - Section separator is /^=+\$/
20226 # - Second section is a list of output files.
20227 # - Lines matching /^\\s*#/ are treated as comments
20228 # which along with blank lines are ignored.
20234 print $ofh "$_\n" for sort(@input_files);
20235 print $ofh "\n=================================\n# Output files:\n\n";
20236 print $ofh "$_\n" for sort @files_actually_output;
20237 print $ofh "\n# ",scalar(@input_files)," input files\n",
20238 "# ",scalar(@files_actually_output)+1," output files\n\n",
20241 or Carp::my_carp("Failed to close $ofh: $!");
20243 print "Filelist has ",scalar(@input_files)," input files and ",
20244 scalar(@files_actually_output)+1," output files\n"
20245 if $verbosity >= $VERBOSE;
20249 # Output these warnings unless -q explicitly specified.
20250 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20251 if (@unhandled_properties) {
20252 print "\nProperties and tables that unexpectedly have no code points\n";
20253 foreach my $property (sort @unhandled_properties) {
20254 print $property, "\n";
20258 if (%potential_files) {
20259 print "\nInput files that are not considered:\n";
20260 foreach my $file (sort keys %potential_files) {
20261 print File::Spec->abs2rel($file), "\n";
20264 print "\nAll done\n" if $verbosity >= $VERBOSE;
20267 if ($version_of_mk_invlist_bounds lt $v_version) {
20268 Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20269 . " to be checked and possibly updated to Unicode"
20270 . " $string_version");
20275 # TRAILING CODE IS USED BY make_property_test_script()
20281 # Test qr/\X/ and the \p{} regular expression constructs. This file is
20282 # constructed by mktables from the tables it generates, so if mktables is
20283 # buggy, this won't necessarily catch those bugs. Tests are generated for all
20284 # feasible properties; a few aren't currently feasible; see
20285 # is_code_point_usable() in mktables for details.
20287 # Standard test packages are not used because this manipulates SIG_WARN. It
20288 # exits 0 if every non-skipped test succeeded; -1 if any failed.
20293 # loc_tools.pl requires this function to be defined
20295 my ($pass, @msg) = @_;
20296 print "not " unless $pass;
20299 print " - ", join "", @msg if @msg;
20304 my $expected = shift;
20307 my $warning_type = shift; # Type of warning message, like 'deprecated'
20309 my $line = (caller)[2];
20311 # Convert the code point to hex form
20312 my $string = sprintf "\"\\x{%04X}\"", $ord;
20316 # The first time through, use all warnings. If the input should generate
20317 # a warning, add another time through with them turned off
20318 push @tests, "no warnings '$warning_type';" if $warning_type;
20320 foreach my $no_warnings (@tests) {
20322 # Store any warning messages instead of outputting them
20323 local $SIG{__WARN__} = $SIG{__WARN__};
20324 my $warning_message;
20325 $SIG{__WARN__} = sub { $warning_message = $_[0] };
20329 # A string eval is needed because of the 'no warnings'.
20330 # Assumes no parentheses in the regular expression
20331 my $result = eval "$no_warnings
20332 my \$RegObj = qr($regex);
20333 $string =~ \$RegObj ? 1 : 0";
20334 if (not defined $result) {
20335 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20338 elsif ($result ^ $expected) {
20339 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20342 elsif ($warning_message) {
20343 if (! $warning_type || ($warning_type && $no_warnings)) {
20344 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20348 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20351 elsif ($warning_type && ! $no_warnings) {
20352 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20356 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20365 if (eval { 'x' =~ qr/$regex/; 1 }) {
20367 my $line = (caller)[2];
20368 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20371 my $line = (caller)[2];
20372 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20377 # Break test files (e.g. GCBTest.txt) character that break allowed here
20378 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20379 utf8::upgrade($breakable_utf8);
20381 # Break test files (e.g. GCBTest.txt) character that indicates can't break
20383 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20384 utf8::upgrade($nobreak_utf8);
20386 my $are_ctype_locales_available;
20388 chdir 't' if -d 't';
20389 eval { require "./loc_tools.pl" };
20390 if (defined &locales_enabled) {
20391 $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20392 if ($are_ctype_locales_available) {
20393 $utf8_locale = &find_utf8_ctype_locale;
20397 # Eval'd so can run on versions earlier than the property is available in
20398 my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20399 if (! defined $WB_Extend_or_Format_re) {
20400 $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20403 sub _test_break($$) {
20404 # Test various break property matches. The 2nd parameter gives the
20405 # property name. The input is a line from auxiliary/*Test.txt for the
20406 # given property. Each such line is a sequence of Unicode (not native)
20407 # code points given by their hex numbers, separated by the two characters
20408 # defined just before this subroutine that indicate that either there can
20409 # or cannot be a break between the adjacent code points. All these are
20412 # For the gcb property extra tests are made. if there isn't a break, that
20413 # means the sequence forms an extended grapheme cluster, which means that
20414 # \X should match the whole thing. If there is a break, \X should stop
20415 # there. This is all converted by this routine into a match: $string =~
20416 # /(\X)/, Each \X should match the next cluster; and that is what is
20419 my $template = shift;
20420 my $break_type = shift;
20422 my $line = (caller 1)[2]; # Line number
20425 if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20429 # Replace leading spaces with a single one.
20430 $comment =~ s/ ^ \s* / # /x;
20433 # The line contains characters above the ASCII range, but in Latin1. It
20434 # may or may not be in utf8, and if it is, it may or may not know it. So,
20435 # convert these characters to 8 bits. If knows is in utf8, simply
20437 if (utf8::is_utf8($template)) {
20438 utf8::downgrade($template);
20441 # Otherwise, if it is in utf8, but doesn't know it, the next lines
20442 # convert the two problematic characters to their 8-bit equivalents.
20443 # If it isn't in utf8, they don't harm anything.
20445 $template =~ s/$nobreak_utf8/$nobreak/g;
20446 $template =~ s/$breakable_utf8/$breakable/g;
20449 # Perl customizes wb. So change the official tests accordingly
20450 if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20452 # Split into elements that alternate between code point and
20454 my @line = split / +/, $template;
20456 # Look at each code point and its following one
20457 for (my $i = 1; $i < @line - 1 - 1; $i+=2) {
20459 # The customization only involves changing some breaks to
20461 next if $line[$i+1] =~ /$nobreak/;
20463 my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20464 my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20466 # And it only affects adjacent space characters.
20467 next if $lhs !~ /\s/u;
20469 # But, we want to make sure to test spaces followed by a Extend
20471 next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20473 # To test the customization, add some white-space before this to
20474 # create a span. The $lhs white space may or may not be bound to
20475 # that span, and also with the $rhs. If the $rhs is a binding
20476 # character, the $lhs is bound to it and not to the span, unless
20477 # $lhs is vertical space. In all other cases, the $lhs is bound
20478 # to the span. If the $rhs is white space, it is bound to the
20482 if ($rhs =~ /$WB_Extend_or_Format_re/) {
20483 if ($lhs =~ /\v/) {
20484 $bound = $breakable;
20489 $span = $breakable;
20497 splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20499 $line[$i+1] = $bound;
20501 $template = join " ", @line;
20504 # The input is just the break/no-break symbols and sequences of Unicode
20505 # code points as hex digits separated by spaces for legibility. e.g.:
20506 # ÷ 0020 × 0308 ÷ 0020 ÷
20507 # Convert to native \x format
20508 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20509 $template =~ s/ \s* //gx; # Probably the line above removed all spaces;
20512 # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20514 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20515 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20517 my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20518 my $string = eval "\"$display_string\"";
20520 # The remaining massaging of the input is for the \X tests. Get rid of
20521 # the leading and trailing breakables
20522 $template =~ s/^ \s* $breakable \s* //x;
20523 $template =~ s/ \s* $breakable \s* $ //x;
20526 $template =~ s/ \s* $nobreak \s* //xg;
20528 # Split the input into segments that are breakable between them.
20529 my @should_display = split /\s*$breakable\s*/, $template;
20530 my @should_match = map { eval "\"$_\"" } @should_display;
20532 # If a string can be represented in both non-ut8 and utf8, test both cases
20533 my $display_upgrade = "";
20535 for my $to_upgrade (0 .. 1) {
20539 # If already in utf8, would just be a repeat
20540 next UPGRADE if utf8::is_utf8($string);
20542 utf8::upgrade($string);
20543 $display_upgrade = " (utf8-upgraded)";
20546 my @modifiers = qw(a aa d u i);
20547 if ($are_ctype_locales_available) {
20548 push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20550 # The /l modifier has C after it to indicate the locale to try
20551 push @modifiers, "lC";
20554 # Test for each of the regex modifiers.
20555 for my $modifier (@modifiers) {
20556 my $display_locale = "";
20558 # For /l, set the locale to what it says to.
20559 if ($modifier =~ / ^ l (.*) /x) {
20561 $display_locale = "(locale = $locale)";
20562 POSIX::setlocale(&POSIX::LC_CTYPE, $locale);
20566 no warnings qw(locale regexp surrogate);
20567 my $pattern = "(?$modifier:$break_pattern)";
20569 # Actually do the test
20571 my $matched = $string =~ qr/$pattern/;
20573 $matched_text = "matched";
20576 $matched_text = "failed to match";
20579 if (TODO_FAILING_BREAKS) {
20580 $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20581 $comment =~ s/#/# TODO/;
20584 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20586 # Only print the comment on the first use of this line
20589 # Repeat with the first \B{} in the pattern. This makes sure the
20590 # code in regexec.c:find_byclass() for \B gets executed
20591 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20592 my $B_pattern = "$1$2";
20593 $matched = $string =~ qr/$B_pattern/;
20594 print "not " unless $matched;
20595 $matched_text = ($matched) ? "matched" : "failed to match";
20596 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20597 print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20602 next if $break_type ne 'gcb';
20604 # Finally, do the \X match.
20605 my @matches = $string =~ /(\X)/g;
20607 # Look through each matched cluster to verify that it matches what we
20609 my $min = (@matches < @should_match) ? @matches : @should_match;
20610 for my $i (0 .. $min - 1) {
20612 if ($matches[$i] eq $should_match[$i]) {
20613 print "ok $Tests - ";
20615 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20617 print "And \\X #", $i + 1,
20619 print " correctly matched $should_display[$i]; line $line\n";
20621 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20622 split "", $matches[$i]);
20623 print "not ok $Tests -";
20624 print " # TODO" if TODO_FAILING_BREAKS;
20625 print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20627 " should have matched $should_display[$i]",
20628 " but instead matched $matches[$i]",
20629 ". Abandoning rest of line $line\n";
20634 # And the number of matches should equal the number of expected matches.
20636 if (@matches == @should_match) {
20637 print "ok $Tests - Nothing was left over; line $line\n";
20639 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20640 print " # TODO" if TODO_FAILING_BREAKS;
20649 _test_break(shift, 'gcb');
20653 _test_break(shift, 'lb');
20657 _test_break(shift, 'sb');
20661 _test_break(shift, 'wb');
20665 print "1..$Tests\n";
20666 exit($Fails ? -1 : 0);