3 # !!!!!!!!!!!!!! IF YOU MODIFY THIS FILE !!!!!!!!!!!!!!!!!!!!!!!!!
4 # Any files created or read by this program should be listed in 'mktables.lst'
5 # Use -makelist to regenerate it.
7 # There was an attempt when this was first rewritten to make it 5.8
8 # compatible, but that has now been abandoned, and newer constructs are used
11 # NOTE: this script can run quite slowly in older/slower systems.
12 # It can also consume a lot of memory (128 MB or more), you may need
13 # to raise your process resource limits (e.g. in bash, "ulimit -a"
14 # to inspect, and "ulimit -d ..." or "ulimit -m ..." to set)
17 BEGIN { # Get the time the script started running; do it at compilation to
18 # get it as close as possible
25 use builtin qw(refaddr);
35 use feature 'signatures';
36 no warnings qw( experimental::builtin );
38 sub DEBUG () { 0 } # Set to 0 for production; 1 for development
39 my $debugging_build = $Config{"ccflags"} =~ /-DDEBUGGING/;
41 sub NON_ASCII_PLATFORM { ord("A") != 65 }
43 # When a new version of Unicode is published, unfortunately the algorithms for
44 # dealing with various bounds, like \b{gcb}, \b{lb} may have to be updated
45 # manually. The changes may or may not be backward compatible with older
46 # releases. The code is in regen/mk_invlist.pl and regexec.c. Make the
47 # changes, then come back here and set the variable below to what version the
48 # code is expecting. If a newer version of Unicode is being compiled than
49 # expected, a warning will be generated. If an older version is being
50 # compiled, any bounds tests that fail in the generated test file (-maketest
51 # option) will be marked as TODO.
52 my $version_of_mk_invlist_bounds = v15.0.0;
54 ##########################################################################
56 # mktables -- create the runtime Perl Unicode files (lib/unicore/.../*.pl),
57 # from the Unicode database files (lib/unicore/.../*.txt), It also generates
58 # a pod file and .t files, depending on option parameters.
60 # The structure of this file is:
61 # First these introductory comments; then
62 # code needed for everywhere, such as debugging stuff; then
63 # code to handle input parameters; then
64 # data structures likely to be of external interest (some of which depend on
65 # the input parameters, so follows them; then
66 # more data structures and subroutine and package (class) definitions; then
67 # the small actual loop to process the input files and finish up; then
68 # a __DATA__ section, for the .t tests
70 # This program works on all releases of Unicode so far. The outputs have been
71 # scrutinized most intently for release 5.1. The others have been checked for
72 # somewhat more than just sanity. It can handle all non-provisional Unicode
73 # character properties in those releases.
75 # This program is mostly about Unicode character (or code point) properties.
76 # A property describes some attribute or quality of a code point, like if it
77 # is lowercase or not, its name, what version of Unicode it was first defined
78 # in, or what its uppercase equivalent is. Unicode deals with these disparate
79 # possibilities by making all properties into mappings from each code point
80 # into some corresponding value. In the case of it being lowercase or not,
81 # the mapping is either to 'Y' or 'N' (or various synonyms thereof). Each
82 # property maps each Unicode code point to a single value, called a "property
83 # value". (Some more recently defined properties, map a code point to a set
86 # When using a property in a regular expression, what is desired isn't the
87 # mapping of the code point to its property's value, but the reverse (or the
88 # mathematical "inverse relation"): starting with the property value, "Does a
89 # code point map to it?" These are written in a "compound" form:
90 # \p{property=value}, e.g., \p{category=punctuation}. This program generates
91 # files containing the lists of code points that map to each such regular
92 # expression property value, one file per list
94 # There is also a single form shortcut that Perl adds for many of the commonly
95 # used properties. This happens for all binary properties, plus script,
96 # general_category, and block properties.
98 # Thus the outputs of this program are files. There are map files, mostly in
99 # the 'To' directory; and there are list files for use in regular expression
100 # matching, all in subdirectories of the 'lib' directory, with each
101 # subdirectory being named for the property that the lists in it are for.
102 # Bookkeeping, test, and documentation files are also generated.
104 my $matches_directory = 'lib'; # Where match (\p{}) files go.
105 my $map_directory = 'To'; # Where map files go.
109 # The major data structures of this program are Property, of course, but also
110 # Table. There are two kinds of tables, very similar to each other.
111 # "Match_Table" is the data structure giving the list of code points that have
112 # a particular property value, mentioned above. There is also a "Map_Table"
113 # data structure which gives the property's mapping from code point to value.
114 # There are two structures because the match tables need to be combined in
115 # various ways, such as constructing unions, intersections, complements, etc.,
116 # and the map ones don't. And there would be problems, perhaps subtle, if
117 # a map table were inadvertently operated on in some of those ways.
118 # The use of separate classes with operations defined on one but not the other
119 # prevents accidentally confusing the two.
121 # At the heart of each table's data structure is a "Range_List", which is just
122 # an ordered list of "Ranges", plus ancillary information, and methods to
123 # operate on them. A Range is a compact way to store property information.
124 # Each range has a starting code point, an ending code point, and a value that
125 # is meant to apply to all the code points between the two end points,
126 # inclusive. For a map table, this value is the property value for those
127 # code points. Two such ranges could be written like this:
128 # 0x41 .. 0x5A, 'Upper',
129 # 0x61 .. 0x7A, 'Lower'
131 # Each range also has a type used as a convenience to classify the values.
132 # Most ranges in this program will be Type 0, or normal, but there are some
133 # ranges that have a non-zero type. These are used only in map tables, and
134 # are for mappings that don't fit into the normal scheme of things. Mappings
135 # that require a hash entry to communicate with utf8.c are one example;
136 # another example is mappings for charnames.pm to use which indicate a name
137 # that is algorithmically determinable from its code point (and the reverse).
138 # These are used to significantly compact these tables, instead of listing
139 # each one of the tens of thousands individually.
141 # In a match table, the value of a range is irrelevant (and hence the type as
142 # well, which will always be 0), and arbitrarily set to the empty string.
143 # Using the example above, there would be two match tables for those two
144 # entries, one named Upper would contain the 0x41..0x5A range, and the other
145 # named Lower would contain 0x61..0x7A.
147 # Actually, there are two types of range lists, "Range_Map" is the one
148 # associated with map tables, and "Range_List" with match tables.
149 # Again, this is so that methods can be defined on one and not the others so
150 # as to prevent operating on them in incorrect ways.
152 # Eventually, most tables are written out to files to be read by Unicode::UCD.
153 # All tables could in theory be written, but some are suppressed because there
154 # is no current practical use for them. It is easy to change which get
155 # written by changing various lists that are near the top of the actual code
156 # in this file. The table data structures contain enough ancillary
157 # information to allow them to be treated as separate entities for writing,
158 # such as the path to each one's file. There is a heading in each map table
159 # that gives the format of its entries, and what the map is for all the code
160 # points missing from it. (This allows tables to be more compact.)
162 # The Property data structure contains one or more tables. All properties
163 # contain a map table (except the $perl property which is a
164 # pseudo-property containing only match tables), and any properties that
165 # are usable in regular expression matches also contain various matching
166 # tables, one for each value the property can have. A binary property can
167 # have two values, True and False (or Y and N, which are preferred by Unicode
168 # terminology). Thus each of these properties will have a map table that
169 # takes every code point and maps it to Y or N (but having ranges cuts the
170 # number of entries in that table way down), and two match tables, one
171 # which has a list of all the code points that map to Y, and one for all the
172 # code points that map to N. (For each binary property, a third table is also
173 # generated for the pseudo Perl property. It contains the identical code
174 # points as the Y table, but can be written in regular expressions, not in the
175 # compound form, but in a "single" form like \p{IsUppercase}.) Many
176 # properties are binary, but some properties have several possible values,
177 # some have many, and properties like Name have a different value for every
178 # named code point. Those will not, unless the controlling lists are changed,
179 # have their match tables written out. But all the ones which can be used in
180 # regular expression \p{} and \P{} constructs will. Prior to 5.14, generally
181 # a property would have either its map table or its match tables written but
182 # not both. Again, what gets written is controlled by lists which can easily
183 # be changed. Starting in 5.14, advantage was taken of this, and all the map
184 # tables needed to reconstruct the Unicode db are now written out, while
185 # suppressing the Unicode .txt files that contain the data. Our tables are
186 # much more compact than the .txt files, so a significant space savings was
187 # achieved. Also, tables are not written out that are trivially derivable
188 # from tables that do get written. So, there typically is no file containing
189 # the code points not matched by a binary property (the table for \P{} versus
190 # lowercase \p{}), since you just need to invert the True table to get the
193 # Properties have a 'Type', like 'binary', or 'string', or 'enum' depending on
194 # how many match tables there are and the content of the maps. This 'Type' is
195 # different than a range 'Type', so don't get confused by the two concepts
196 # having the same name.
198 # For information about the Unicode properties, see Unicode's UAX44 document:
200 my $unicode_reference_url = 'http://www.unicode.org/reports/tr44/';
202 # As stated earlier, this program will work on any release of Unicode so far.
203 # Most obvious problems in earlier data have NOT been corrected except when
204 # necessary to make Perl or this program work reasonably, and to keep out
205 # potential security issues. For example, no folding information was given in
206 # early releases, so this program substitutes lower case instead, just so that
207 # a regular expression with the /i option will do something that actually
208 # gives the right results in many cases. There are also a couple other
209 # corrections for version 1.1.5, commented at the point they are made. As an
210 # example of corrections that weren't made (but could be) is this statement
211 # from DerivedAge.txt: "The supplementary private use code points and the
212 # non-character code points were assigned in version 2.0, but not specifically
213 # listed in the UCD until versions 3.0 and 3.1 respectively." (To be precise
214 # it was 3.0.1 not 3.0.0) More information on Unicode version glitches is
215 # further down in these introductory comments.
217 # This program works on all non-provisional properties as of the current
218 # Unicode release, though the files for some are suppressed for various
219 # reasons. You can change which are output by changing lists in this program.
221 # The old version of mktables emphasized the term "Fuzzy" to mean Unicode's
222 # loose matchings rules (from Unicode TR18):
224 # The recommended names for UCD properties and property values are in
225 # PropertyAliases.txt [Prop] and PropertyValueAliases.txt
226 # [PropValue]. There are both abbreviated names and longer, more
227 # descriptive names. It is strongly recommended that both names be
228 # recognized, and that loose matching of property names be used,
229 # whereby the case distinctions, whitespace, hyphens, and underbar
232 # The program still allows Fuzzy to override its determination of if loose
233 # matching should be used, but it isn't currently used, as it is no longer
234 # needed; the calculations it makes are good enough.
236 # SUMMARY OF HOW IT WORKS:
240 # A list is constructed containing each input file that is to be processed
242 # Each file on the list is processed in a loop, using the associated handler
244 # The PropertyAliases.txt and PropValueAliases.txt files are processed
245 # first. These files name the properties and property values.
246 # Objects are created of all the property and property value names
247 # that the rest of the input should expect, including all synonyms.
248 # The other input files give mappings from properties to property
249 # values. That is, they list code points and say what the mapping
250 # is under the given property. Some files give the mappings for
251 # just one property; and some for many. This program goes through
252 # each file and populates the properties and their map tables from
253 # them. Some properties are listed in more than one file, and
254 # Unicode has set up a precedence as to which has priority if there
255 # is a conflict. Thus the order of processing matters, and this
256 # program handles the conflict possibility by processing the
257 # overriding input files last, so that if necessary they replace
259 # After this is all done, the program creates the property mappings not
260 # furnished by Unicode, but derivable from what it does give.
261 # The tables of code points that match each property value in each
262 # property that is accessible by regular expressions are created.
263 # The Perl-defined properties are created and populated. Many of these
264 # require data determined from the earlier steps
265 # Any Perl-defined synonyms are created, and name clashes between Perl
266 # and Unicode are reconciled and warned about.
267 # All the properties are written to files
268 # Any other files are written, and final warnings issued.
270 # For clarity, a number of operators have been overloaded to work on tables:
271 # ~ means invert (take all characters not in the set). The more
272 # conventional '!' is not used because of the possibility of confusing
273 # it with the actual boolean operation.
275 # - means subtraction
276 # & means intersection
277 # The precedence of these is the order listed. Parentheses should be
278 # copiously used. These are not a general scheme. The operations aren't
279 # defined for a number of things, deliberately, to avoid getting into trouble.
280 # Operations are done on references and affect the underlying structures, so
281 # that the copy constructors for them have been overloaded to not return a new
282 # clone, but the input object itself.
284 # The bool operator is deliberately not overloaded to avoid confusion with
285 # "should it mean if the object merely exists, or also is non-empty?".
287 # WHY CERTAIN DESIGN DECISIONS WERE MADE
289 # This program needs to be able to run under miniperl. Therefore, it uses a
290 # minimum of other modules, and hence implements some things itself that could
291 # be gotten from CPAN
293 # This program uses inputs published by the Unicode Consortium. These can
294 # change incompatibly between releases without the Perl maintainers realizing
295 # it. Therefore this program is now designed to try to flag these. It looks
296 # at the directories where the inputs are, and flags any unrecognized files.
297 # It keeps track of all the properties in the files it handles, and flags any
298 # that it doesn't know how to handle. It also flags any input lines that
299 # don't match the expected syntax, among other checks.
301 # It is also designed so if a new input file matches one of the known
302 # templates, one hopefully just needs to add it to a list to have it
305 # As mentioned earlier, some properties are given in more than one file. In
306 # particular, the files in the extracted directory are supposedly just
307 # reformattings of the others. But they contain information not easily
308 # derivable from the other files, including results for Unihan (which isn't
309 # usually available to this program) and for unassigned code points. They
310 # also have historically had errors or been incomplete. In an attempt to
311 # create the best possible data, this program thus processes them first to
312 # glean information missing from the other files; then processes those other
313 # files to override any errors in the extracted ones. Much of the design was
314 # driven by this need to store things and then possibly override them.
316 # It tries to keep fatal errors to a minimum, to generate something usable for
317 # testing purposes. It always looks for files that could be inputs, and will
318 # warn about any that it doesn't know how to handle (the -q option suppresses
321 # Why is there more than one type of range?
322 # This simplified things. There are some very specialized code points that
323 # have to be handled specially for output, such as Hangul syllable names.
324 # By creating a range type (done late in the development process), it
325 # allowed this to be stored with the range, and overridden by other input.
326 # Originally these were stored in another data structure, and it became a
327 # mess trying to decide if a second file that was for the same property was
328 # overriding the earlier one or not.
330 # Why are there two kinds of tables, match and map?
331 # (And there is a base class shared by the two as well.) As stated above,
332 # they actually are for different things. Development proceeded much more
333 # smoothly when I (khw) realized the distinction. Map tables are used to
334 # give the property value for every code point (actually every code point
335 # that doesn't map to a default value). Match tables are used for regular
336 # expression matches, and are essentially the inverse mapping. Separating
337 # the two allows more specialized methods, and error checks so that one
338 # can't just take the intersection of two map tables, for example, as that
341 # What about 'fate' and 'status'. The concept of a table's fate was created
342 # late when it became clear that something more was needed. The difference
343 # between this and 'status' is unclean, and could be improved if someone
344 # wanted to spend the effort.
348 # This program is written so it will run under miniperl. Occasionally changes
349 # will cause an error where the backtrace doesn't work well under miniperl.
350 # To diagnose the problem, you can instead run it under regular perl, if you
353 # There is a good trace facility. To enable it, first sub DEBUG must be set
354 # to return true. Then a line like
356 # local $to_trace = 1 if main::DEBUG;
358 # can be added to enable tracing in its lexical scope (plus dynamic) or until
359 # you insert another line:
361 # local $to_trace = 0 if main::DEBUG;
363 # To actually trace, use a line like "trace $a, @b, %c, ...;
365 # Some of the more complex subroutines already have trace statements in them.
366 # Permanent trace statements should be like:
368 # trace ... if main::DEBUG && $to_trace;
370 # main::stack_trace() will display what its name implies
372 # If there is just one or a few files that you're debugging, you can easily
373 # cause most everything else to be skipped. Change the line
375 # my $debug_skip = 0;
377 # to 1, and every file whose object is in @input_file_objects and doesn't have
378 # a, 'non_skip => 1,' in its constructor will be skipped. However, skipping
379 # Jamo.txt or UnicodeData.txt will likely cause fatal errors.
381 # To compare the output tables, it may be useful to specify the -annotate
382 # flag. (As of this writing, this can't be done on a clean workspace, due to
383 # requirements in Text::Tabs used in this option; so first run mktables
384 # without this option.) This option adds comment lines to each table, one for
385 # each non-algorithmically named character giving, currently its code point,
386 # name, and graphic representation if printable (and you have a font that
387 # knows about it). This makes it easier to see what the particular code
388 # points are in each output table. Non-named code points are annotated with a
389 # description of their status, and contiguous ones with the same description
390 # will be output as a range rather than individually. Algorithmically named
391 # characters are also output as ranges, except when there are just a few
396 # The program would break if Unicode were to change its names so that
397 # interior white space, underscores, or dashes differences were significant
398 # within property and property value names.
400 # It might be easier to use the xml versions of the UCD if this program ever
401 # would need heavy revision, and the ability to handle old versions was not
404 # There is the potential for name collisions, in that Perl has chosen names
405 # that Unicode could decide it also likes. There have been such collisions in
406 # the past, with mostly Perl deciding to adopt the Unicode definition of the
407 # name. However in the 5.2 Unicode beta testing, there were a number of such
408 # collisions, which were withdrawn before the final release, because of Perl's
409 # and other's protests. These all involved new properties which began with
410 # 'Is'. Based on the protests, Unicode is unlikely to try that again. Also,
411 # many of the Perl-defined synonyms, like Any, Word, etc, are listed in a
412 # Unicode document, so they are unlikely to be used by Unicode for another
413 # purpose. However, they might try something beginning with 'In', or use any
414 # of the other Perl-defined properties. This program will warn you of name
415 # collisions, and refuse to generate tables with them, but manual intervention
416 # will be required in this event. One scheme that could be implemented, if
417 # necessary, would be to have this program generate another file, or add a
418 # field to mktables.lst that gives the date of first definition of a property.
419 # Each new release of Unicode would use that file as a basis for the next
420 # iteration. And the Perl synonym addition code could sort based on the age
421 # of the property, so older properties get priority, and newer ones that clash
422 # would be refused; hence existing code would not be impacted, and some other
423 # synonym would have to be used for the new property. This is ugly, and
424 # manual intervention would certainly be easier to do in the short run; lets
425 # hope it never comes to this.
429 # This program can generate tables from the Unihan database. But that DB
430 # isn't normally available, so it is marked as optional. Prior to version
431 # 5.2, this database was in a single file, Unihan.txt. In 5.2 the database
432 # was split into 8 different files, all beginning with the letters 'Unihan'.
433 # If you plunk those files down into the directory mktables ($0) is in, this
434 # program will read them and automatically create tables for the properties
435 # from it that are listed in PropertyAliases.txt and PropValueAliases.txt,
436 # plus any you add to the @cjk_properties array and the @cjk_property_values
437 # array, being sure to add necessary '# @missings' lines to the latter. For
438 # Unicode versions earlier than 5.2, most of the Unihan properties are not
439 # listed at all in PropertyAliases nor PropValueAliases. This program assumes
440 # for these early releases that you want the properties that are specified in
443 # You may need to adjust the entries to suit your purposes. setup_unihan(),
444 # and filter_unihan_line() are the functions where this is done. This program
445 # already does some adjusting to make the lines look more like the rest of the
446 # Unicode DB; You can see what that is in filter_unihan_line()
448 # There is a bug in the 3.2 data file in which some values for the
449 # kPrimaryNumeric property have commas and an unexpected comment. A filter
450 # could be added to correct these; or for a particular installation, the
451 # Unihan.txt file could be edited to fix them.
453 # HOW TO ADD A FILE TO BE PROCESSED
455 # A new file from Unicode needs to have an object constructed for it in
456 # @input_file_objects, probably at the end or at the end of the extracted
457 # ones. The program should warn you if its name will clash with others on
458 # restrictive file systems, like DOS. If so, figure out a better name, and
459 # add lines to the README.perl file giving that. If the file is a character
460 # property, it should be in the format that Unicode has implicitly
461 # standardized for such files for the more recently introduced ones.
462 # If so, the Input_file constructor for @input_file_objects can just be the
463 # file name and release it first appeared in. If not, then it should be
464 # possible to construct an each_line_handler() to massage the line into the
467 # For non-character properties, more code will be needed. You can look at
468 # the existing entries for clues.
470 # UNICODE VERSIONS NOTES
472 # The Unicode UCD has had a number of errors in it over the versions. And
473 # these remain, by policy, in the standard for that version. Therefore it is
474 # risky to correct them, because code may be expecting the error. So this
475 # program doesn't generally make changes, unless the error breaks the Perl
476 # core. As an example, some versions of 2.1.x Jamo.txt have the wrong value
477 # for U+1105, which causes real problems for the algorithms for Jamo
478 # calculations, so it is changed here.
480 # But it isn't so clear cut as to what to do about concepts that are
481 # introduced in a later release; should they extend back to earlier releases
482 # where the concept just didn't exist? It was easier to do this than to not,
483 # so that's what was done. For example, the default value for code points not
484 # in the files for various properties was probably undefined until changed by
485 # some version. No_Block for blocks is such an example. This program will
486 # assign No_Block even in Unicode versions that didn't have it. This has the
487 # benefit that code being written doesn't have to special case earlier
488 # versions; and the detriment that it doesn't match the Standard precisely for
489 # the affected versions.
491 # Here are some observations about some of the issues in early versions:
493 # Prior to version 3.0, there were 3 character decompositions. These are not
494 # handled by Unicode::Normalize, nor will it compile when presented a version
495 # that has them. However, you can trivially get it to compile by simply
496 # ignoring those decompositions, by changing the croak to a carp. At the time
497 # of this writing, the line (in dist/Unicode-Normalize/Normalize.pm or
498 # dist/Unicode-Normalize/mkheader) reads
500 # croak("Weird Canonical Decomposition of U+$h");
502 # Simply comment it out. It will compile, but will not know about any three
503 # character decompositions.
505 # The number of code points in \p{alpha=True} halved in 2.1.9. It turns out
506 # that the reason is that the CJK block starting at 4E00 was removed from
507 # PropList, and was not put back in until 3.1.0. The Perl extension (the
508 # single property name \p{alpha}) has the correct values. But the compound
509 # form is simply not generated until 3.1, as it can be argued that prior to
510 # this release, this was not an official property. The comments for
511 # filter_old_style_proplist() give more details.
513 # Unicode introduced the synonym Space for White_Space in 4.1. Perl has
514 # always had a \p{Space}. In release 3.2 only, they are not synonymous. The
515 # reason is that 3.2 introduced U+205F=medium math space, which was not
516 # classed as white space, but Perl figured out that it should have been. 4.0
517 # reclassified it correctly.
519 # Another change between 3.2 and 4.0 is the CCC property value ATBL. In 3.2
520 # this was erroneously a synonym for 202 (it should be 200). In 4.0, ATB
521 # became 202, and ATBL was left with no code points, as all the ones that
522 # mapped to 202 stayed mapped to 202. Thus if your program used the numeric
523 # name for the class, it would not have been affected, but if it used the
524 # mnemonic, it would have been.
526 # \p{Script=Hrkt} (Katakana_Or_Hiragana) came in 4.0.1. Before that, code
527 # points which eventually came to have this script property value, instead
528 # mapped to "Unknown". But in the next release all these code points were
529 # moved to \p{sc=common} instead.
531 # The tests furnished by Unicode for testing WordBreak and SentenceBreak
532 # generate errors in 5.0 and earlier.
534 # The default for missing code points for BidiClass is complicated. Starting
535 # in 3.1.1, the derived file DBidiClass.txt handles this, but this program
536 # tries to do the best it can for earlier releases. It is done in
537 # process_PropertyAliases()
539 # In version 2.1.2, the entry in UnicodeData.txt:
540 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;;019F;
542 # 0275;LATIN SMALL LETTER BARRED O;Ll;0;L;;;;;N;;;019F;;019F
543 # Without this change, there are casing problems for this character.
545 # Search for $string_compare_versions to see how to compare changes to
546 # properties between Unicode versions
548 ##############################################################################
550 my $UNDEF = ':UNDEF:'; # String to print out for undefined values in tracing
552 my $MAX_LINE_WIDTH = 78;
554 # Debugging aid to skip most files so as to not be distracted by them when
555 # concentrating on the ones being debugged. Add
557 # to the constructor for those files you want processed when you set this.
558 # Files with a first version number of 0 are special: they are always
559 # processed regardless of the state of this flag. Generally, Jamo.txt and
560 # UnicodeData.txt must not be skipped if you want this program to not die
561 # before normal completion.
565 # Normally these are suppressed.
566 my $write_Unicode_deprecated_tables = 0;
568 # Set to 1 to enable tracing.
571 { # Closure for trace: debugging aid
572 my $print_caller = 1; # ? Include calling subroutine name
573 my $main_with_colon = 'main::';
574 my $main_colon_length = length($main_with_colon);
577 return unless $to_trace; # Do nothing if global flag not set
581 local $DB::trace = 0;
582 $DB::trace = 0; # Quiet 'used only once' message
586 # Loop looking up the stack to get the first non-trace caller
591 $line_number = $caller_line;
592 (my $pkg, my $file, $caller_line, my $caller) = caller $i++;
593 $caller = $main_with_colon unless defined $caller;
595 $caller_name = $caller;
598 $caller_name =~ s/.*:://;
599 if (substr($caller_name, 0, $main_colon_length)
602 $caller_name = substr($caller_name, $main_colon_length);
605 } until ($caller_name ne 'trace');
607 # If the stack was empty, we were called from the top level
608 $caller_name = 'main' if ($caller_name eq ""
609 || $caller_name eq 'trace');
612 #print STDERR __LINE__, ": ", join ", ", @input, "\n";
613 foreach my $string (@input) {
614 if (ref $string eq 'ARRAY' || ref $string eq 'HASH') {
615 $output .= simple_dumper($string);
618 $string = "$string" if ref $string;
619 $string = $UNDEF unless defined $string;
621 $string = '""' if $string eq "";
622 $output .= " " if $output ne ""
624 && substr($output, -1, 1) ne " "
625 && substr($string, 0, 1) ne " ";
630 print STDERR sprintf "%4d: ", $line_number if defined $line_number;
631 print STDERR "$caller_name: " if $print_caller;
632 print STDERR $output, "\n";
638 local $to_trace = 1 if main::DEBUG;
639 my $line = (caller(0))[2];
642 # Accumulate the stack trace
644 my ($pkg, $file, $caller_line, $caller) = caller $i++;
646 last unless defined $caller;
648 trace "called from $caller() at line $line";
649 $line = $caller_line;
653 # This is for a rarely used development feature that allows you to compare two
654 # versions of the Unicode standard without having to deal with changes caused
655 # by the code points introduced in the later version. You probably also want
656 # to use the -annotate option when using this. Run this program on a unicore
657 # containing the starting release you want to compare. Save that output
658 # structure. Then, switching to a unicore with the ending release, change the
659 # "" in the $string_compare_versions definition just below to a string
660 # containing a SINGLE dotted Unicode release number (e.g. "2.1") corresponding
661 # to the starting release. This program will then compile, but throw away all
662 # code points introduced after the starting release. Finally use a diff tool
663 # to compare the two directory structures. They include only the code points
664 # common to both releases, and you can see the changes caused just by the
665 # underlying release semantic changes. For versions earlier than 3.2, you
666 # must copy a version of DAge.txt into the directory.
667 my $string_compare_versions = DEBUG && "";
668 my $compare_versions = DEBUG
669 && $string_compare_versions
670 && pack "C*", split /\./, $string_compare_versions;
673 # Returns non-duplicated input values. From "Perl Best Practices:
674 # Encapsulated Cleverness". p. 455 in first edition.
677 # Arguably this breaks encapsulation, if the goal is to permit multiple
678 # distinct objects to stringify to the same value, and be interchangeable.
679 # However, for this program, no two objects stringify identically, and all
680 # lists passed to this function are either objects or strings. So this
681 # doesn't affect correctness, but it does give a couple of percent speedup.
683 return grep { ! $seen{$_}++ } @_;
686 $0 = File::Spec->canonpath($0);
688 my $make_test_script = 0; # ? Should we output a test script
689 my $make_norm_test_script = 0; # ? Should we output a normalization test script
690 my $write_unchanged_files = 0; # ? Should we update the output files even if
691 # we don't think they have changed
692 my $use_directory = ""; # ? Should we chdir somewhere.
693 my $pod_directory; # input directory to store the pod file.
694 my $pod_file = 'perluniprops';
695 my $t_path; # Path to the .t test file
696 my $file_list = 'mktables.lst'; # File to store input and output file names.
697 # This is used to speed up the build, by not
698 # executing the main body of the program if
699 # nothing on the list has changed since the
701 my $make_list = 1; # ? Should we write $file_list. Set to always
702 # make a list so that when the release manager
703 # is preparing a release, they won't have to do
705 my $glob_list = 0; # ? Should we try to include unknown .txt files
707 my $output_range_counts = $debugging_build; # ? Should we include the number
708 # of code points in ranges in
710 my $annotate = 0; # ? Should character names be in the output
712 # Verbosity levels; 0 is quiet
713 my $NORMAL_VERBOSITY = 1;
717 my $verbosity = $NORMAL_VERBOSITY;
719 # Stored in mktables.lst so that if this program is called with different
720 # options, will regenerate even if the files otherwise look like they're
722 my $command_line_arguments = join " ", @ARGV;
726 my $arg = shift @ARGV;
728 $verbosity = $VERBOSE;
730 elsif ($arg eq '-p') {
731 $verbosity = $PROGRESS;
732 $| = 1; # Flush buffers as we go.
734 elsif ($arg eq '-q') {
737 elsif ($arg eq '-w') {
738 # update the files even if they haven't changed
739 $write_unchanged_files = 1;
741 elsif ($arg eq '-check') {
742 my $this = shift @ARGV;
743 my $ok = shift @ARGV;
745 print "Skipping as check params are not the same.\n";
749 elsif ($arg eq '-P' && defined ($pod_directory = shift)) {
750 -d $pod_directory or croak "Directory '$pod_directory' doesn't exist";
752 elsif ($arg eq '-maketest' || ($arg eq '-T' && defined ($t_path = shift)))
754 $make_test_script = 1;
756 elsif ($arg eq '-makenormtest')
758 $make_norm_test_script = 1;
760 elsif ($arg eq '-makelist') {
763 elsif ($arg eq '-C' && defined ($use_directory = shift)) {
764 -d $use_directory or croak "Unknown directory '$use_directory'";
766 elsif ($arg eq '-L') {
768 # Existence not tested until have chdir'd
771 elsif ($arg eq '-globlist') {
774 elsif ($arg eq '-c') {
775 $output_range_counts = ! $output_range_counts
777 elsif ($arg eq '-annotate') {
779 $debugging_build = 1;
780 $output_range_counts = 1;
784 $with_c .= 'out' if $output_range_counts; # Complements the state
786 usage: $0 [-c|-p|-q|-v|-w] [-C dir] [-L filelist] [ -P pod_dir ]
787 [ -T test_file_path ] [-globlist] [-makelist] [-maketest]
789 -c : Output comments $with_c number of code points in ranges
790 -q : Quiet Mode: Only output serious warnings.
791 -p : Set verbosity level to normal plus show progress.
792 -v : Set Verbosity level high: Show progress and non-serious
794 -w : Write files regardless
795 -C dir : Change to this directory before proceeding. All relative paths
796 except those specified by the -P and -T options will be done
797 with respect to this directory.
798 -P dir : Output $pod_file file to directory 'dir'.
799 -T path : Create a test script as 'path'; overrides -maketest
800 -L filelist : Use alternate 'filelist' instead of standard one
801 -globlist : Take as input all non-Test *.txt files in current and sub
803 -maketest : Make test script 'TestProp.pl' in current (or -C directory),
805 -makelist : Rewrite the file list $file_list based on current setup
806 -annotate : Output an annotation for each character in the table files;
807 useful for debugging mktables, looking at diffs; but is slow
809 -check A B : Executes $0 only if A and B are the same
814 # Stores the most-recently changed file. If none have changed, can skip the
816 my $most_recent = (stat $0)[9]; # Do this before the chdir!
818 # Change directories now, because need to read 'version' early.
819 if ($use_directory) {
820 if ($pod_directory && ! File::Spec->file_name_is_absolute($pod_directory)) {
821 $pod_directory = File::Spec->rel2abs($pod_directory);
823 if ($t_path && ! File::Spec->file_name_is_absolute($t_path)) {
824 $t_path = File::Spec->rel2abs($t_path);
826 chdir $use_directory or croak "Failed to chdir to '$use_directory':$!";
827 if ($pod_directory && File::Spec->file_name_is_absolute($pod_directory)) {
828 $pod_directory = File::Spec->abs2rel($pod_directory);
830 if ($t_path && File::Spec->file_name_is_absolute($t_path)) {
831 $t_path = File::Spec->abs2rel($t_path);
835 # Get Unicode version into regular and v-string. This is done now because
836 # various tables below get populated based on it. These tables are populated
837 # here to be near the top of the file, and so easily seeable by those needing
839 open my $VERSION, "<", "version"
840 or croak "$0: can't open required file 'version': $!\n";
841 my $string_version = <$VERSION>;
843 chomp $string_version;
844 my $v_version = pack "C*", split /\./, $string_version; # v string
846 my $unicode_version = ($compare_versions)
847 ? ( "$string_compare_versions (using "
848 . "$string_version rules)")
851 # The following are the complete names of properties with property values that
852 # are known to not match any code points in some versions of Unicode, but that
853 # may change in the future so they should be matchable, hence an empty file is
854 # generated for them.
855 my @tables_that_may_be_empty;
856 push @tables_that_may_be_empty, 'Joining_Type=Left_Joining'
857 if $v_version lt v6.3.0;
858 push @tables_that_may_be_empty, 'Script=Common' if $v_version le v4.0.1;
859 push @tables_that_may_be_empty, 'Title' if $v_version lt v2.0.0;
860 push @tables_that_may_be_empty, 'Script=Katakana_Or_Hiragana'
861 if $v_version ge v4.1.0;
862 push @tables_that_may_be_empty, 'Script_Extensions=Katakana_Or_Hiragana'
863 if $v_version ge v6.0.0;
864 push @tables_that_may_be_empty, 'Grapheme_Cluster_Break=Prepend'
865 if $v_version ge v6.1.0;
866 push @tables_that_may_be_empty, 'Canonical_Combining_Class=CCC133'
867 if $v_version ge v6.2.0;
869 # The lists below are hashes, so the key is the item in the list, and the
870 # value is the reason why it is in the list. This makes generation of
871 # documentation easier.
873 my %why_suppressed; # No file generated for these.
875 # Files aren't generated for empty extraneous properties. This is arguable.
876 # Extraneous properties generally come about because a property is no longer
877 # used in a newer version of Unicode. If we generated a file without code
878 # points, programs that used to work on that property will still execute
879 # without errors. It just won't ever match (or will always match, with \P{}).
880 # This means that the logic is now likely wrong. I (khw) think its better to
881 # find this out by getting an error message. Just move them to the table
882 # above to change this behavior
883 my %why_suppress_if_empty_warn_if_not = (
885 # It is the only property that has ever officially been removed from the
886 # Standard. The database never contained any code points for it.
887 'Special_Case_Condition' => 'Obsolete',
889 # Apparently never official, but there were code points in some versions of
890 # old-style PropList.txt
891 'Non_Break' => 'Obsolete',
894 # These would normally go in the warn table just above, but they were changed
895 # a long time before this program was written, so warnings about them are
897 if ($v_version gt v3.2.0) {
898 push @tables_that_may_be_empty,
899 'Canonical_Combining_Class=Attached_Below_Left'
903 if ($v_version ge v11.0.0) {
904 push @tables_that_may_be_empty, qw(
905 Grapheme_Cluster_Break=E_Base
906 Grapheme_Cluster_Break=E_Base_GAZ
907 Grapheme_Cluster_Break=E_Modifier
908 Grapheme_Cluster_Break=Glue_After_Zwj
910 Word_Break=E_Base_GAZ
911 Word_Break=E_Modifier
912 Word_Break=Glue_After_Zwj);
915 # Enum values for to_output_map() method in the Map_Table package. (0 is don't
917 my $EXTERNAL_MAP = 1;
918 my $INTERNAL_MAP = 2;
919 my $OUTPUT_ADJUSTED = 3;
921 # To override computed values for writing the map tables for these properties.
922 # The default for enum map tables is to write them out, so that the Unicode
923 # .txt files can be removed, but all the data to compute any property value
924 # for any code point is available in a more compact form.
925 my %global_to_output_map = (
926 # Needed by UCD.pm, but don't want to publicize that it exists, so won't
927 # get stuck supporting it if things change. Since it is a STRING
928 # property, it normally would be listed in the pod, but INTERNAL_MAP
930 Unicode_1_Name => $INTERNAL_MAP,
932 Present_In => 0, # Suppress, as easily computed from Age
933 Block => (NON_ASCII_PLATFORM) ? 1 : 0, # Suppress, as Blocks.txt is
934 # retained, but needed for
937 # Suppress, as mapping can be found instead from the
938 # Perl_Decomposition_Mapping file
939 Decomposition_Type => 0,
942 # There are several types of obsolete properties defined by Unicode. These
943 # must be hand-edited for every new Unicode release.
944 my %why_deprecated; # Generates a deprecated warning message if used.
945 my %why_stabilized; # Documentation only
946 my %why_obsolete; # Documentation only
949 my $simple = 'Perl uses the more complete version';
950 my $unihan = 'Unihan properties are by default not enabled in the Perl core.';
952 my $other_properties = 'other properties';
953 my $contributory = "Used by Unicode internally for generating $other_properties and not intended to be used stand-alone";
954 my $why_no_expand = "Deprecated by Unicode. These are characters that expand to more than one character in the specified normalization form, but whether they actually take up more bytes or not depends on the encoding being used. For example, a UTF-8 encoded character may expand to a different number of bytes than a UTF-32 encoded character.";
957 'Grapheme_Link' => 'Duplicates ccc=vr (Canonical_Combining_Class=Virama)',
958 'Jamo_Short_Name' => $contributory,
959 'Line_Break=Surrogate' => 'Surrogates should never appear in well-formed text, and therefore shouldn\'t be the basis for line breaking',
960 'Other_Alphabetic' => $contributory,
961 'Other_Default_Ignorable_Code_Point' => $contributory,
962 'Other_Grapheme_Extend' => $contributory,
963 'Other_ID_Continue' => $contributory,
964 'Other_ID_Start' => $contributory,
965 'Other_Lowercase' => $contributory,
966 'Other_Math' => $contributory,
967 'Other_Uppercase' => $contributory,
968 'Expands_On_NFC' => $why_no_expand,
969 'Expands_On_NFD' => $why_no_expand,
970 'Expands_On_NFKC' => $why_no_expand,
971 'Expands_On_NFKD' => $why_no_expand,
975 # There is a lib/unicore/Decomposition.pl (used by Normalize.pm) which
976 # contains the same information, but without the algorithmically
977 # determinable Hangul syllables'. This file is not published, so it's
978 # existence is not noted in the comment.
979 'Decomposition_Mapping' => 'Accessible via Unicode::Normalize or prop_invmap() or charprop() in Unicode::UCD::',
981 # Don't suppress ISO_Comment, as otherwise special handling is needed
982 # to differentiate between it and gc=c, which can be written as 'isc',
983 # which is the same characters as ISO_Comment's short name.
985 'Name' => "Accessible via \\N{...} or 'use charnames;' or charprop() or prop_invmap() in Unicode::UCD::",
987 'Simple_Case_Folding' => "$simple. Can access this through casefold(), charprop(), or prop_invmap() in Unicode::UCD",
988 'Simple_Lowercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
989 'Simple_Titlecase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
990 'Simple_Uppercase_Mapping' => "$simple. Can access this through charinfo(), charprop(), or prop_invmap() in Unicode::UCD",
992 FC_NFKC_Closure => 'Deprecated by Unicode, and supplanted in usage by NFKC_Casefold; otherwise not useful',
995 foreach my $property (
997 # The following are suppressed because they were made contributory
998 # or deprecated by Unicode before Perl ever thought about
1007 # The following are suppressed because they have been marked
1008 # as deprecated for a sufficient amount of time
1010 'Other_Default_Ignorable_Code_Point',
1011 'Other_Grapheme_Extend',
1012 'Other_ID_Continue',
1018 $why_suppressed{$property} = $why_deprecated{$property};
1021 # Customize the message for all the 'Other_' properties
1022 foreach my $property (keys %why_deprecated) {
1023 next if (my $main_property = $property) !~ s/^Other_//;
1024 $why_deprecated{$property} =~ s/$other_properties/the $main_property property (which should be used instead)/;
1028 if ($write_Unicode_deprecated_tables) {
1029 foreach my $property (keys %why_suppressed) {
1030 delete $why_suppressed{$property} if $property =~
1031 / ^ Other | Grapheme /x;
1035 if ($v_version ge 4.0.0) {
1036 $why_stabilized{'Hyphen'} = 'Use the Line_Break property instead; see www.unicode.org/reports/tr14';
1037 if ($v_version ge 6.0.0) {
1038 $why_deprecated{'Hyphen'} = 'Supplanted by Line_Break property values; see www.unicode.org/reports/tr14';
1041 if ($v_version ge 5.2.0 && $v_version lt 6.0.0) {
1042 $why_obsolete{'ISO_Comment'} = 'Code points for it have been removed';
1043 if ($v_version ge 6.0.0) {
1044 $why_deprecated{'ISO_Comment'} = 'No longer needed for Unicode\'s internal chart generation; otherwise not useful, and code points for it have been removed';
1048 # Probably obsolete forever
1049 if ($v_version ge v4.1.0) {
1050 $why_suppressed{'Script=Katakana_Or_Hiragana'} = 'Obsolete. All code points previously matched by this have been moved to "Script=Common".';
1052 if ($v_version ge v6.0.0) {
1053 $why_suppressed{'Script=Katakana_Or_Hiragana'} .= ' Consider instead using "Script_Extensions=Katakana" or "Script_Extensions=Hiragana" (or both)';
1054 $why_suppressed{'Script_Extensions=Katakana_Or_Hiragana'} = 'All code points that would be matched by this are matched by either "Script_Extensions=Katakana" or "Script_Extensions=Hiragana"';
1057 # This program can create files for enumerated-like properties, such as
1058 # 'Numeric_Type'. This file would be the same format as for a string
1059 # property, with a mapping from code point to its value, so you could look up,
1060 # for example, the script a code point is in. But no one so far wants this
1061 # mapping, or they have found another way to get it since this is a new
1062 # feature. So no file is generated except if it is in this list.
1063 my @output_mapped_properties = split "\n", <<END;
1066 # If you want more Unihan properties than the default, you need to add them to
1067 # these arrays. Depending on the property type, @missing lines might have to
1068 # be added to the second array. A sample entry would be (including the '#'):
1069 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
1070 my @cjk_properties = split "\n", <<'END';
1072 my @cjk_property_values = split "\n", <<'END';
1075 # The input files don't list every code point. Those not listed are to be
1076 # defaulted to some value. Below are hard-coded what those values are for
1077 # non-binary properties as of 5.1. Starting in 5.0, there are
1078 # machine-parsable comment lines in the files that give the defaults; so this
1079 # list shouldn't have to be extended. The claim is that all missing entries
1080 # for binary properties will default to 'N'. Unicode tried to change that in
1081 # 5.2, but the beta period produced enough protest that they backed off.
1083 # The defaults for the fields that appear in UnicodeData.txt in this hash must
1084 # be in the form that it expects. The others may be synonyms.
1085 my $CODE_POINT = '<code point>';
1086 my %default_mapping = (
1087 Age => "Unassigned",
1088 # Bidi_Class => Complicated; set in code
1089 Bidi_Mirroring_Glyph => "",
1090 Block => 'No_Block',
1091 Canonical_Combining_Class => 0,
1092 Case_Folding => $CODE_POINT,
1093 Decomposition_Mapping => $CODE_POINT,
1094 Decomposition_Type => 'None',
1095 East_Asian_Width => "Neutral",
1096 FC_NFKC_Closure => $CODE_POINT,
1097 General_Category => ($v_version le 6.3.0) ? 'Cn' : 'Unassigned',
1098 Grapheme_Cluster_Break => 'Other',
1099 Hangul_Syllable_Type => 'NA',
1101 Jamo_Short_Name => "",
1102 Joining_Group => "No_Joining_Group",
1103 # Joining_Type => Complicated; set in code
1104 kIICore => 'N', # Is converted to binary
1105 #Line_Break => Complicated; set in code
1106 Lowercase_Mapping => $CODE_POINT,
1113 Numeric_Type => 'None',
1114 Numeric_Value => 'NaN',
1115 Script => ($v_version le 4.1.0) ? 'Common' : 'Unknown',
1116 Sentence_Break => 'Other',
1117 Simple_Case_Folding => $CODE_POINT,
1118 Simple_Lowercase_Mapping => $CODE_POINT,
1119 Simple_Titlecase_Mapping => $CODE_POINT,
1120 Simple_Uppercase_Mapping => $CODE_POINT,
1121 Titlecase_Mapping => $CODE_POINT,
1122 Unicode_1_Name => "",
1123 Unicode_Radical_Stroke => "",
1124 Uppercase_Mapping => $CODE_POINT,
1125 Word_Break => 'Other',
1128 ### End of externally interesting definitions, except for @input_file_objects
1131 # !!!!!!! DO NOT EDIT THIS FILE !!!!!!!
1132 # This file is machine-generated by $0 from the Unicode
1133 # database, Version $unicode_version. Any changes made here will be lost!
1136 my $INTERNAL_ONLY_HEADER = <<"EOF";
1138 # !!!!!!! INTERNAL PERL USE ONLY !!!!!!!
1139 # This file is for internal use by core Perl only. The format and even the
1140 # name or existence of this file are subject to change without notice. Don't
1141 # use it directly. Use Unicode::UCD to access the Unicode character data
1145 my $DEVELOPMENT_ONLY=<<"EOF";
1146 # !!!!!!! DEVELOPMENT USE ONLY !!!!!!!
1147 # This file contains information artificially constrained to code points
1148 # present in Unicode release $string_compare_versions.
1149 # IT CANNOT BE RELIED ON. It is for use during development only and should
1150 # not be used for production.
1154 my $MAX_UNICODE_CODEPOINT_STRING = ($v_version ge v2.0.0)
1157 my $MAX_UNICODE_CODEPOINT = hex $MAX_UNICODE_CODEPOINT_STRING;
1158 my $MAX_UNICODE_CODEPOINTS = $MAX_UNICODE_CODEPOINT + 1;
1160 # We work with above-Unicode code points, up to IV_MAX, but we may want to use
1161 # sentinels above that number. Therefore for internal use, we use a much
1162 # smaller number, translating it to IV_MAX only for output. The exact number
1163 # is immaterial (all above-Unicode code points are treated exactly the same),
1164 # but the algorithm requires it to be at least
1165 # 2 * $MAX_UNICODE_CODEPOINTS + 1
1166 my $MAX_WORKING_CODEPOINTS= $MAX_UNICODE_CODEPOINT * 8;
1167 my $MAX_WORKING_CODEPOINT = $MAX_WORKING_CODEPOINTS - 1;
1168 my $MAX_WORKING_CODEPOINT_STRING = sprintf("%X", $MAX_WORKING_CODEPOINT);
1170 my $MAX_PLATFORM_CODEPOINT = ~0 >> 1;
1172 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
1173 # two must be 10; if there are 5, the first must not be a 0. Written this way
1174 # to decrease backtracking. The first regex allows the code point to be at
1175 # the end of a word, but to work properly, the word shouldn't end with a valid
1176 # hex character. The second one won't match a code point at the end of a
1177 # word, and doesn't have the run-on issue
1178 my $run_on_code_point_re =
1179 qr/ (?: 10[0-9A-F]{4} | [1-9A-F][0-9A-F]{4} | [0-9A-F]{4} ) \b/x;
1180 my $code_point_re = qr/\b$run_on_code_point_re/;
1182 # This matches the beginning of the line in the Unicode DB files that give the
1183 # defaults for code points not listed (i.e., missing) in the file. The code
1184 # depends on this ending with a semi-colon, so it can assume it is a valid
1185 # field when the line is split() by semi-colons
1186 my $missing_defaults_prefix = qr/ ^ \# \s+ \@missing: \s+
1193 # Property types. Unicode has more types, but these are sufficient for our
1195 my $UNKNOWN = -1; # initialized to illegal value
1196 my $NON_STRING = 1; # Either binary or enum
1198 my $FORCED_BINARY = 3; # Not a binary property, but, besides its normal
1199 # tables, additional true and false tables are
1200 # generated so that false is anything matching the
1201 # default value, and true is everything else.
1202 my $ENUM = 4; # Include catalog
1203 my $STRING = 5; # Anything else: string or misc
1205 # Some input files have lines that give default values for code points not
1206 # contained in the file. Sometimes these should be ignored.
1207 my $NO_DEFAULTS = 0; # Must evaluate to false
1208 my $NOT_IGNORED = 1;
1211 # Range types. Each range has a type. Most ranges are type 0, for normal,
1212 # and will appear in the main body of the tables in the output files, but
1213 # there are other types of ranges as well, listed below, that are specially
1214 # handled. There are pseudo-types as well that will never be stored as a
1215 # type, but will affect the calculation of the type.
1217 # 0 is for normal, non-specials
1218 my $MULTI_CP = 1; # Sequence of more than code point
1219 my $HANGUL_SYLLABLE = 2;
1220 my $CP_IN_NAME = 3; # The NAME contains the code point appended to it.
1221 my $NULL = 4; # The map is to the null string; utf8.c can't
1222 # handle these, nor is there an accepted syntax
1223 # for them in \p{} constructs
1224 my $COMPUTE_NO_MULTI_CP = 5; # Pseudo-type; means that ranges that would
1225 # otherwise be $MULTI_CP type are instead type 0
1227 # process_generic_property_file() can accept certain overrides in its input.
1228 # Each of these must begin AND end with $CMD_DELIM.
1229 my $CMD_DELIM = "\a";
1230 my $REPLACE_CMD = 'replace'; # Override the Replace
1231 my $MAP_TYPE_CMD = 'map_type'; # Override the Type
1236 # Values for the Replace argument to add_range.
1237 # $NO # Don't replace; add only the code points not
1239 my $IF_NOT_EQUIVALENT = 1; # Replace only under certain conditions; details in
1240 # the comments at the subroutine definition.
1241 my $UNCONDITIONALLY = 2; # Replace without conditions.
1242 my $MULTIPLE_BEFORE = 4; # Don't replace, but add a duplicate record if
1244 my $MULTIPLE_AFTER = 5; # Don't replace, but add a duplicate record if
1246 my $CROAK = 6; # Die with an error if is already there
1248 # Flags to give property statuses. The phrases are to remind maintainers that
1249 # if the flag is changed, the indefinite article referring to it in the
1250 # documentation may need to be as well.
1252 my $DEPRECATED = 'D';
1253 my $a_bold_deprecated = "a 'B<$DEPRECATED>'";
1254 my $A_bold_deprecated = "A 'B<$DEPRECATED>'";
1255 my $DISCOURAGED = 'X';
1256 my $a_bold_discouraged = "an 'B<$DISCOURAGED>'";
1257 my $A_bold_discouraged = "An 'B<$DISCOURAGED>'";
1259 my $a_bold_stricter = "a 'B<$STRICTER>'";
1260 my $A_bold_stricter = "A 'B<$STRICTER>'";
1261 my $STABILIZED = 'S';
1262 my $a_bold_stabilized = "an 'B<$STABILIZED>'";
1263 my $A_bold_stabilized = "An 'B<$STABILIZED>'";
1265 my $a_bold_obsolete = "an 'B<$OBSOLETE>'";
1266 my $A_bold_obsolete = "An 'B<$OBSOLETE>'";
1268 # Aliases can also have an extra status:
1269 my $INTERNAL_ALIAS = 'P';
1271 my %status_past_participles = (
1272 $DISCOURAGED => 'discouraged',
1273 $STABILIZED => 'stabilized',
1274 $OBSOLETE => 'obsolete',
1275 $DEPRECATED => 'deprecated',
1276 $INTERNAL_ALIAS => 'reserved for Perl core internal use only',
1279 # Table fates. These are somewhat ordered, so that fates < $MAP_PROXIED should be
1280 # externally documented.
1281 my $ORDINARY = 0; # The normal fate.
1282 my $MAP_PROXIED = 1; # The map table for the property isn't written out,
1283 # but there is a file written that can be used to
1284 # reconstruct this table
1285 my $INTERNAL_ONLY = 2; # The file for this table is written out, but it is
1286 # for Perl's internal use only
1287 my $SUPPRESSED = 3; # The file for this table is not written out, and as a
1288 # result, we don't bother to do many computations on
1290 my $PLACEHOLDER = 4; # Like $SUPPRESSED, but we go through all the
1291 # computations anyway, as the values are needed for
1292 # things to work. This happens when we have Perl
1293 # extensions that depend on Unicode tables that
1294 # wouldn't normally be in a given Unicode version.
1296 # The format of the values of the tables:
1297 my $EMPTY_FORMAT = "";
1298 my $BINARY_FORMAT = 'b';
1299 my $DECIMAL_FORMAT = 'd';
1300 my $FLOAT_FORMAT = 'f';
1301 my $INTEGER_FORMAT = 'i';
1302 my $HEX_FORMAT = 'x';
1303 my $RATIONAL_FORMAT = 'r';
1304 my $STRING_FORMAT = 's';
1305 my $ADJUST_FORMAT = 'a';
1306 my $HEX_ADJUST_FORMAT = 'ax';
1307 my $DECOMP_STRING_FORMAT = 'c';
1308 my $STRING_WHITE_SPACE_LIST = 'sw';
1310 my %map_table_formats = (
1311 $BINARY_FORMAT => 'binary',
1312 $DECIMAL_FORMAT => 'single decimal digit',
1313 $FLOAT_FORMAT => 'floating point number',
1314 $INTEGER_FORMAT => 'integer',
1315 $HEX_FORMAT => 'non-negative hex whole number; a code point',
1316 $RATIONAL_FORMAT => 'rational: an integer or a fraction',
1317 $STRING_FORMAT => 'string',
1318 $ADJUST_FORMAT => 'some entries need adjustment',
1319 $HEX_ADJUST_FORMAT => 'mapped value in hex; some entries need adjustment',
1320 $DECOMP_STRING_FORMAT => 'Perl\'s internal (Normalize.pm) decomposition mapping',
1321 $STRING_WHITE_SPACE_LIST => 'string, but some elements are interpreted as a list; white space occurs only as list item separators'
1324 # Unicode didn't put such derived files in a separate directory at first.
1325 my $EXTRACTED_DIR = (-d 'extracted') ? 'extracted' : "";
1326 my $EXTRACTED = ($EXTRACTED_DIR) ? "$EXTRACTED_DIR/" : "";
1327 my $AUXILIARY = 'auxiliary';
1328 my $EMOJI = 'emoji';
1330 # Hashes and arrays that will eventually go into UCD.pl for the use of UCD.pm
1331 my %loose_to_file_of; # loosely maps table names to their respective
1333 my %stricter_to_file_of; # same; but for stricter mapping.
1334 my %loose_property_to_file_of; # Maps a loose property name to its map file
1335 my %strict_property_to_file_of; # Same, but strict
1336 my @inline_definitions = "V0"; # Each element gives a definition of a unique
1337 # inversion list. When a definition is inlined,
1338 # its value in the hash it's in (one of the two
1339 # defined just above) will include an index into
1340 # this array. The 0th element is initialized to
1341 # the definition for a zero length inversion list
1342 my %file_to_swash_name; # Maps the file name to its corresponding key name
1343 # in the hash %Unicode::UCD::SwashInfo
1344 my %nv_floating_to_rational; # maps numeric values floating point numbers to
1345 # their rational equivalent
1346 my %loose_property_name_of; # Loosely maps (non_string) property names to
1348 my %strict_property_name_of; # Strictly maps (non_string) property names to
1350 my %string_property_loose_to_name; # Same, for string properties.
1351 my %loose_defaults; # keys are of form "prop=value", where 'prop' is
1352 # the property name in standard loose form, and
1353 # 'value' is the default value for that property,
1354 # also in standard loose form.
1355 my %loose_to_standard_value; # loosely maps table names to the canonical
1357 my %ambiguous_names; # keys are alias names (in standard form) that
1358 # have more than one possible meaning.
1359 my %combination_property; # keys are alias names (in standard form) that
1360 # have both a map table, and a binary one that
1361 # yields true for all non-null maps.
1362 my %prop_aliases; # Keys are standard property name; values are each
1364 my %prop_value_aliases; # Keys of top level are standard property name;
1365 # values are keys to another hash, Each one is
1366 # one of the property's values, in standard form.
1367 # The values are that prop-val's aliases.
1368 my %skipped_files; # List of files that we skip
1369 my %ucd_pod; # Holds entries that will go into the UCD section of the pod
1371 # Most properties are immune to caseless matching, otherwise you would get
1372 # nonsensical results, as properties are a function of a code point, not
1373 # everything that is caselessly equivalent to that code point. For example,
1374 # Changes_When_Case_Folded('s') should be false, whereas caselessly it would
1375 # be true because 's' and 'S' are equivalent caselessly. However,
1376 # traditionally, [:upper:] and [:lower:] are equivalent caselessly, so we
1377 # extend that concept to those very few properties that are like this. Each
1378 # such property will match the full range caselessly. They are hard-coded in
1379 # the program; it's not worth trying to make it general as it's extremely
1380 # unlikely that they will ever change.
1381 my %caseless_equivalent_to;
1383 # This is the range of characters that were in Release 1 of Unicode, and
1384 # removed in Release 2 (replaced with the current Hangul syllables starting at
1385 # U+AC00). The range was reused starting in Release 3 for other purposes.
1386 my $FIRST_REMOVED_HANGUL_SYLLABLE = 0x3400;
1387 my $FINAL_REMOVED_HANGUL_SYLLABLE = 0x4DFF;
1389 # These constants names and values were taken from the Unicode standard,
1390 # version 5.1, section 3.12. They are used in conjunction with Hangul
1391 # syllables. The '_string' versions are so generated tables can retain the
1392 # hex format, which is the more familiar value
1393 my $SBase_string = "0xAC00";
1394 my $SBase = CORE::hex $SBase_string;
1395 my $LBase_string = "0x1100";
1396 my $LBase = CORE::hex $LBase_string;
1397 my $VBase_string = "0x1161";
1398 my $VBase = CORE::hex $VBase_string;
1399 my $TBase_string = "0x11A7";
1400 my $TBase = CORE::hex $TBase_string;
1405 my $NCount = $VCount * $TCount;
1407 # For Hangul syllables; These store the numbers from Jamo.txt in conjunction
1408 # with the above published constants.
1410 my %Jamo_L; # Leading consonants
1411 my %Jamo_V; # Vowels
1412 my %Jamo_T; # Trailing consonants
1414 # For code points whose name contains its ordinal as a '-ABCD' suffix.
1415 # The key is the base name of the code point, and the value is an
1416 # array giving all the ranges that use this base name. Each range
1417 # is actually a hash giving the 'low' and 'high' values of it.
1418 my %names_ending_in_code_point;
1419 my %loose_names_ending_in_code_point; # Same as above, but has blanks, dashes
1420 # removed from the names
1421 # Inverse mapping. The list of ranges that have these kinds of
1422 # names. Each element contains the low, high, and base names in an
1424 my @code_points_ending_in_code_point;
1426 # To hold Unicode's normalization test suite
1427 my @normalization_tests;
1429 # Boolean: does this Unicode version have the hangul syllables, and are we
1430 # writing out a table for them?
1431 my $has_hangul_syllables = 0;
1433 # Does this Unicode version have code points whose names end in their
1434 # respective code points, and are we writing out a table for them? 0 for no;
1435 # otherwise points to first property that a table is needed for them, so that
1436 # if multiple tables are needed, we don't create duplicates
1437 my $needing_code_points_ending_in_code_point = 0;
1439 my @backslash_X_tests; # List of tests read in for testing \X
1440 my @LB_tests; # List of tests read in for testing \b{lb}
1441 my @SB_tests; # List of tests read in for testing \b{sb}
1442 my @WB_tests; # List of tests read in for testing \b{wb}
1443 my @unhandled_properties; # Will contain a list of properties found in
1444 # the input that we didn't process.
1445 my @match_properties; # Properties that have match tables, to be
1447 my @map_properties; # Properties that get map files written
1448 my @named_sequences; # NamedSequences.txt contents.
1449 my %potential_files; # Generated list of all .txt files in the directory
1450 # structure so we can warn if something is being
1452 my @missing_early_files; # Generated list of absent files that we need to
1453 # proceed in compiling this early Unicode version
1454 my @files_actually_output; # List of files we generated.
1455 my @more_Names; # Some code point names are compound; this is used
1456 # to store the extra components of them.
1457 my $E_FLOAT_PRECISION = 3; # The minimum number of digits after the decimal
1458 # point of a normalized floating point number
1459 # needed to match before we consider it equivalent
1460 # to a candidate rational
1462 # These store references to certain commonly used property objects
1471 my $Assigned; # All assigned characters in this Unicode release
1472 my $DI; # Default_Ignorable_Code_Point property
1473 my $NChar; # Noncharacter_Code_Point property
1475 my $scx; # Script_Extensions property
1476 my $idt; # Identifier_Type property
1478 # Are there conflicting names because of beginning with 'In_', or 'Is_'
1479 my $has_In_conflicts = 0;
1480 my $has_Is_conflicts = 0;
1482 sub internal_file_to_platform ($file=undef) {
1483 # Convert our file paths which have '/' separators to those of the
1486 return undef unless defined $file;
1488 return File::Spec->join(split '/', $file);
1491 sub file_exists ($file=undef) { # platform independent '-e'. This program internally
1492 # uses slash as a path separator.
1493 return 0 unless defined $file;
1494 return -e internal_file_to_platform($file);
1497 sub objaddr($addr) {
1498 # Returns the address of the blessed input object.
1499 # It doesn't check for blessedness because that would do a string eval
1500 # every call, and the program is structured so that this is never called
1501 # for a non-blessed object.
1503 return pack 'J', refaddr $addr;
1506 # These are used only if $annotate is true.
1507 # The entire range of Unicode characters is examined to populate these
1508 # after all the input has been processed. But most can be skipped, as they
1509 # have the same descriptive phrases, such as being unassigned
1510 my @viacode; # Contains the 1 million character names
1511 my @age; # And their ages ("" if none)
1512 my @printable; # boolean: And are those characters printable?
1513 my @annotate_char_type; # Contains a type of those characters, specifically
1514 # for the purposes of annotation.
1515 my $annotate_ranges; # A map of ranges of code points that have the same
1516 # name for the purposes of annotation. They map to the
1517 # upper edge of the range, so that the end point can
1518 # be immediately found. This is used to skip ahead to
1519 # the end of a range, and avoid processing each
1520 # individual code point in it.
1521 my $unassigned_sans_noncharacters; # A Range_List of the unassigned
1522 # characters, but excluding those which are
1523 # also noncharacter code points
1525 # The annotation types are an extension of the regular range types, though
1526 # some of the latter are folded into one. Make the new types negative to
1527 # avoid conflicting with the regular types
1528 my $SURROGATE_TYPE = -1;
1529 my $UNASSIGNED_TYPE = -2;
1530 my $PRIVATE_USE_TYPE = -3;
1531 my $NONCHARACTER_TYPE = -4;
1532 my $CONTROL_TYPE = -5;
1533 my $ABOVE_UNICODE_TYPE = -6;
1534 my $UNKNOWN_TYPE = -7; # Used only if there is a bug in this program
1536 sub populate_char_info ($i) {
1537 # Used only with the $annotate option. Populates the arrays with the
1538 # input code point's info that are needed for outputting more detailed
1539 # comments. If calling context wants a return, it is the end point of
1540 # any contiguous range of characters that share essentially the same info
1542 $viacode[$i] = $perl_charname->value_of($i) || "";
1543 $age[$i] = (defined $age)
1544 ? (($age->value_of($i) =~ / ^ \d+ \. \d+ $ /x)
1545 ? $age->value_of($i)
1549 # A character is generally printable if Unicode says it is,
1550 # but below we make sure that most Unicode general category 'C' types
1552 $printable[$i] = $print->contains($i);
1554 # But the characters in this range were removed in v2.0 and replaced by
1555 # different ones later. Modern fonts will be for the replacement
1556 # characters, so suppress printing them.
1557 if (($v_version lt v2.0
1558 || ($compare_versions && $compare_versions lt v2.0))
1559 && ( $i >= $FIRST_REMOVED_HANGUL_SYLLABLE
1560 && $i <= $FINAL_REMOVED_HANGUL_SYLLABLE))
1565 $annotate_char_type[$i] = $perl_charname->type_of($i) || 0;
1567 # Only these two regular types are treated specially for annotations
1569 $annotate_char_type[$i] = 0 if $annotate_char_type[$i] != $CP_IN_NAME
1570 && $annotate_char_type[$i] != $HANGUL_SYLLABLE;
1572 # Give a generic name to all code points that don't have a real name.
1573 # We output ranges, if applicable, for these. Also calculate the end
1574 # point of the range.
1576 if (! $viacode[$i]) {
1577 if ($i > $MAX_UNICODE_CODEPOINT) {
1578 $viacode[$i] = 'Above-Unicode';
1579 $annotate_char_type[$i] = $ABOVE_UNICODE_TYPE;
1581 $end = $MAX_WORKING_CODEPOINT;
1583 elsif ($gc-> table('Private_use')->contains($i)) {
1584 $viacode[$i] = 'Private Use';
1585 $annotate_char_type[$i] = $PRIVATE_USE_TYPE;
1587 $end = $gc->table('Private_Use')->containing_range($i)->end;
1589 elsif ($NChar->contains($i)) {
1590 $viacode[$i] = 'Noncharacter';
1591 $annotate_char_type[$i] = $NONCHARACTER_TYPE;
1593 $end = $NChar->containing_range($i)->end;
1595 elsif ($gc-> table('Control')->contains($i)) {
1596 my $name_ref = property_ref('Name_Alias');
1597 $name_ref = property_ref('Unicode_1_Name') if ! defined $name_ref;
1598 $viacode[$i] = (defined $name_ref)
1599 ? $name_ref->value_of($i)
1601 $annotate_char_type[$i] = $CONTROL_TYPE;
1604 elsif ($gc-> table('Unassigned')->contains($i)) {
1605 $annotate_char_type[$i] = $UNASSIGNED_TYPE;
1607 $viacode[$i] = 'Unassigned';
1609 if (defined $block) { # No blocks in earliest releases
1610 $viacode[$i] .= ', block=' . $block-> value_of($i);
1611 $end = $gc-> table('Unassigned')->containing_range($i)->end;
1613 # Because we name the unassigned by the blocks they are in, it
1614 # can't go past the end of that block, and it also can't go
1615 # past the unassigned range it is in. The special table makes
1616 # sure that the non-characters, which are unassigned, are
1618 $end = min($block->containing_range($i)->end,
1619 $unassigned_sans_noncharacters->
1620 containing_range($i)->end);
1624 while ($unassigned_sans_noncharacters->contains($end)) {
1630 elsif ($perl->table('_Perl_Surrogate')->contains($i)) {
1631 $viacode[$i] = 'Surrogate';
1632 $annotate_char_type[$i] = $SURROGATE_TYPE;
1634 $end = $gc->table('Surrogate')->containing_range($i)->end;
1637 Carp::my_carp_bug("Can't figure out how to annotate "
1638 . sprintf("U+%04X", $i)
1639 . ". Proceeding anyway.");
1640 $viacode[$i] = 'UNKNOWN';
1641 $annotate_char_type[$i] = $UNKNOWN_TYPE;
1646 # Here, has a name, but if it's one in which the code point number is
1647 # appended to the name, do that.
1648 elsif ($annotate_char_type[$i] == $CP_IN_NAME) {
1649 $viacode[$i] .= sprintf("-%04X", $i);
1651 my $limit = $perl_charname->containing_range($i)->end;
1653 # Do all these as groups of the same age, instead of individually,
1654 # because their names are so meaningless, and there are typically
1655 # large quantities of them.
1657 while ($end <= $limit && $age->value_of($end) == $age[$i]) {
1667 # And here, has a name, but if it's a hangul syllable one, replace it with
1668 # the correct name from the Unicode algorithm
1669 elsif ($annotate_char_type[$i] == $HANGUL_SYLLABLE) {
1671 my $SIndex = $i - $SBase;
1672 my $L = $LBase + $SIndex / $NCount;
1673 my $V = $VBase + ($SIndex % $NCount) / $TCount;
1674 my $T = $TBase + $SIndex % $TCount;
1675 $viacode[$i] = "HANGUL SYLLABLE $Jamo{$L}$Jamo{$V}";
1676 $viacode[$i] .= $Jamo{$T} if $T != $TBase;
1677 $end = $perl_charname->containing_range($i)->end;
1680 return if ! defined wantarray;
1681 return $i if ! defined $end; # If not a range, return the input
1683 # Save this whole range so can find the end point quickly
1684 $annotate_ranges->add_map($i, $end, $end);
1690 return $a >= $b ? $a : $b;
1694 return $a <= $b ? $a : $b;
1697 sub clarify_number ($number) {
1698 # This returns the input number with underscores inserted every 3 digits
1699 # in large (5 digits or more) numbers. Input must be entirely digits, not
1702 my $pos = length($number) - 3;
1703 return $number if $pos <= 1;
1705 substr($number, $pos, 0) = '_';
1711 sub clarify_code_point_count ($number) {
1712 # This is like clarify_number(), but the input is assumed to be a count of
1713 # code points, rather than a generic number.
1717 if ($number > $MAX_UNICODE_CODEPOINTS) {
1718 $number -= ($MAX_WORKING_CODEPOINTS - $MAX_UNICODE_CODEPOINTS);
1719 return "All above-Unicode code points" if $number == 0;
1720 $append = " + all above-Unicode code points";
1722 return clarify_number($number) . $append;
1727 # These routines give a uniform treatment of messages in this program. They
1728 # are placed in the Carp package to cause the stack trace to not include them,
1729 # although an alternative would be to use another package and set @CARP_NOT
1732 our $Verbose = 1 if main::DEBUG; # Useful info when debugging
1734 # This is a work-around suggested by Nicholas Clark to fix a problem with Carp
1735 # and overload trying to load Scalar:Util under miniperl. See
1736 # http://www.xray.mpe.mpg.de/mailing-lists/perl5-porters/2009-11/msg01057.html
1737 undef $overload::VERSION;
1739 sub my_carp($message="", $nofold=0) {
1742 $message = main::join_lines($message);
1743 $message =~ s/^$0: *//; # Remove initial program name
1744 $message =~ s/[.;,]+$//; # Remove certain ending punctuation
1745 $message = "\n$0: $message;";
1747 # Fold the message with program name, semi-colon end punctuation
1748 # (which looks good with the message that carp appends to it), and a
1749 # hanging indent for continuation lines.
1750 $message = main::simple_fold($message, "", 4) unless $nofold;
1751 $message =~ s/\n$//; # Remove the trailing nl so what carp
1752 # appends is to the same line
1755 return $message if defined wantarray; # If a caller just wants the msg
1761 sub my_carp_bug($message="") {
1762 # This is called when it is clear that the problem is caused by a bug in
1764 $message =~ s/^$0: *//;
1765 $message = my_carp("Bug in $0. Please report it by running perlbug or if that is unavailable, by sending email to perbug\@perl.org:\n$message");
1770 sub carp_too_few_args($args_ref, $count) {
1771 my_carp_bug("Need at least $count arguments to "
1773 . ". Instead got: '"
1774 . join ', ', @$args_ref
1775 . "'. No action taken.");
1779 sub carp_extra_args($args_ref) {
1780 unless (ref $args_ref) {
1781 my_carp_bug("Argument to 'carp_extra_args' ($args_ref) must be a ref. Not checking arguments.");
1784 my ($package, $file, $line) = caller;
1785 my $subroutine = (caller 1)[3];
1788 if (ref $args_ref eq 'HASH') {
1789 foreach my $key (keys %$args_ref) {
1790 $args_ref->{$key} = $UNDEF unless defined $args_ref->{$key};
1792 $list = join ', ', each %{$args_ref};
1794 elsif (ref $args_ref eq 'ARRAY') {
1795 foreach my $arg (@$args_ref) {
1796 $arg = $UNDEF unless defined $arg;
1798 $list = join ', ', @$args_ref;
1801 my_carp_bug("Can't cope with ref "
1803 . " . argument to 'carp_extra_args'. Not checking arguments.");
1807 my_carp_bug("Unrecognized parameters in options: '$list' to $subroutine. Skipped.");
1815 # This program uses the inside-out method for objects, as recommended in
1816 # "Perl Best Practices". (This is the best solution still, since this has
1817 # to run under miniperl.) This closure aids in generating those. There
1818 # are two routines. setup_package() is called once per package to set
1819 # things up, and then set_access() is called for each hash representing a
1820 # field in the object. These routines arrange for the object to be
1821 # properly destroyed when no longer used, and for standard accessor
1822 # functions to be generated. If you need more complex accessors, just
1823 # write your own and leave those accesses out of the call to set_access().
1824 # More details below.
1826 my %constructor_fields; # fields that are to be used in constructors; see
1829 # The values of this hash will be the package names as keys to other
1830 # hashes containing the name of each field in the package as keys, and
1831 # references to their respective hashes as values.
1835 # Sets up the package, creating standard DESTROY and dump methods
1836 # (unless already defined). The dump method is used in debugging by
1838 # The optional parameters are:
1839 # a) a reference to a hash, that gets populated by later
1840 # set_access() calls with one of the accesses being
1841 # 'constructor'. The caller can then refer to this, but it is
1842 # not otherwise used by these two routines.
1843 # b) a reference to a callback routine to call during destruction
1844 # of the object, before any fields are actually destroyed
1847 my $constructor_ref = delete $args{'Constructor_Fields'};
1848 my $destroy_callback = delete $args{'Destroy_Callback'};
1849 Carp::carp_extra_args(\@_) if main::DEBUG && %args;
1852 my $package = (caller)[0];
1854 $package_fields{$package} = \%fields;
1855 $constructor_fields{$package} = $constructor_ref;
1857 unless ($package->can('DESTROY')) {
1858 my $destroy_name = "${package}::DESTROY";
1861 # Use typeglob to give the anonymous subroutine the name we want
1862 *$destroy_name = sub ($self) {
1863 my $addr = pack 'J', refaddr $self;
1865 $self->$destroy_callback if $destroy_callback;
1866 foreach my $field (keys %{$package_fields{$package}}) {
1867 #print STDERR __LINE__, ": Destroying ", ref $self, " ", sprintf("%04X", $addr), ": ", $field, "\n";
1868 delete $package_fields{$package}{$field}{$addr};
1874 unless ($package->can('dump')) {
1875 my $dump_name = "${package}::dump";
1877 *$dump_name = sub ($self, @_args) {
1878 return dump_inside_out($self, $package_fields{$package}, @_args);
1884 sub set_access($name, $field, @accessors) {
1885 # Arrange for the input field to be garbage collected when no longer
1886 # needed. Also, creates standard accessor functions for the field
1887 # based on the optional parameters-- none if none of these parameters:
1888 # 'addable' creates an 'add_NAME()' accessor function.
1889 # 'readable' or 'readable_array' creates a 'NAME()' accessor
1891 # 'settable' creates a 'set_NAME()' accessor function.
1892 # 'constructor' doesn't create an accessor function, but adds the
1893 # field to the hash that was previously passed to
1895 # Any of the accesses can be abbreviated down, so that 'a', 'ad',
1896 # 'add' etc. all mean 'addable'.
1897 # The read accessor function will work on both array and scalar
1898 # values. If another accessor in the parameter list is 'a', the read
1899 # access assumes an array. You can also force it to be array access
1900 # by specifying 'readable_array' instead of 'readable'
1902 # A sort-of 'protected' access can be set-up by preceding the addable,
1903 # readable or settable with some initial portion of 'protected_' (but,
1904 # the underscore is required), like 'p_a', 'pro_set', etc. The
1905 # "protection" is only by convention. All that happens is that the
1906 # accessor functions' names begin with an underscore. So instead of
1907 # calling set_foo, the call is _set_foo. (Real protection could be
1908 # accomplished by having a new subroutine, end_package, called at the
1909 # end of each package, and then storing the __LINE__ ranges and
1910 # checking them on every accessor. But that is way overkill.)
1912 # We create anonymous subroutines as the accessors and then use
1913 # typeglobs to assign them to the proper package and name
1915 # $name Name of the field
1916 # $field Reference to the inside-out hash containing the
1919 my $package = (caller)[0];
1921 if (! exists $package_fields{$package}) {
1922 croak "$0: Must call 'setup_package' before 'set_access'";
1925 # Stash the field so DESTROY can get it.
1926 $package_fields{$package}{$name} = $field;
1928 # Remaining arguments are the accessors. For each...
1929 foreach my $access (@accessors) {
1930 my $access = lc $access;
1934 # Match the input as far as it goes.
1935 if ($access =~ /^(p[^_]*)_/) {
1937 if (substr('protected_', 0, length $protected)
1941 # Add 1 for the underscore not included in $protected
1942 $access = substr($access, length($protected) + 1);
1950 if (substr('addable', 0, length $access) eq $access) {
1951 my $subname = "${package}::${protected}add_$name";
1954 # add_ accessor. Don't add if already there, which we
1955 # determine using 'eq' for scalars and '==' otherwise.
1956 *$subname = sub ($self, $value) {
1958 my $addr = pack 'J', refaddr $self;
1960 return if grep { $value == $_ } @{$field->{$addr}};
1963 return if grep { $value eq $_ } @{$field->{$addr}};
1965 push @{$field->{$addr}}, $value;
1969 elsif (substr('constructor', 0, length $access) eq $access) {
1971 Carp::my_carp_bug("Can't set-up 'protected' constructors")
1974 $constructor_fields{$package}{$name} = $field;
1977 elsif (substr('readable_array', 0, length $access) eq $access) {
1979 # Here has read access. If one of the other parameters for
1980 # access is array, or this one specifies array (by being more
1981 # than just 'readable_'), then create a subroutine that
1982 # assumes the data is an array. Otherwise just a scalar
1983 my $subname = "${package}::${protected}$name";
1984 if (grep { /^a/i } (@accessors)
1985 or length($access) > length('readable_'))
1988 *$subname = sub ($_addr) {
1990 my $addr = pack 'J', refaddr $_addr;
1991 if (ref $field->{$addr} ne 'ARRAY') {
1992 my $type = ref $field->{$addr};
1993 $type = 'scalar' unless $type;
1994 Carp::my_carp_bug("Trying to read $name as an array when it is a $type. Big problems.");
1997 return scalar @{$field->{$addr}} unless wantarray;
1999 # Make a copy; had problems with caller modifying the
2000 # original otherwise
2001 my @return = @{$field->{$addr}};
2007 # Here not an array value, a simpler function.
2009 *$subname = sub ($addr) {
2011 return $field->{pack 'J', refaddr $addr};
2015 elsif (substr('settable', 0, length $access) eq $access) {
2016 my $subname = "${package}::${protected}set_$name";
2018 *$subname = sub ($self, $value) {
2020 # $self is $_[0]; $value is $_[1]
2021 $field->{pack 'J', refaddr $self} = $value;
2026 Carp::my_carp_bug("Unknown accessor type $access. No accessor set.");
2035 # All input files use this object, which stores various attributes about them,
2036 # and provides for convenient, uniform handling. The run method wraps the
2037 # processing. It handles all the bookkeeping of opening, reading, and closing
2038 # the file, returning only significant input lines.
2040 # Each object gets a handler which processes the body of the file, and is
2041 # called by run(). All character property files must use the generic,
2042 # default handler, which has code scrubbed to handle things you might not
2043 # expect, including automatic EBCDIC handling. For files that don't deal with
2044 # mapping code points to a property value, such as test files,
2045 # PropertyAliases, PropValueAliases, and named sequences, you can override the
2046 # handler to be a custom one. Such a handler should basically be a
2047 # while(next_line()) {...} loop.
2049 # You can also set up handlers to
2050 # 0) call during object construction time, after everything else is done
2051 # 1) call before the first line is read, for pre processing
2052 # 2) call to adjust each line of the input before the main handler gets
2053 # them. This can be automatically generated, if appropriately simple
2054 # enough, by specifying a Properties parameter in the constructor.
2055 # 3) call upon EOF before the main handler exits its loop
2056 # 4) call at the end, for post processing
2058 # $_ is used to store the input line, and is to be filtered by the
2059 # each_line_handler()s. So, if the format of the line is not in the desired
2060 # format for the main handler, these are used to do that adjusting. They can
2061 # be stacked (by enclosing them in an [ anonymous array ] in the constructor,
2062 # so the $_ output of one is used as the input to the next. The EOF handler
2063 # is also stackable, but none of the others are, but could easily be changed
2066 # Some properties are used by the Perl core but aren't defined until later
2067 # Unicode releases. The perl interpreter would have problems working when
2068 # compiled with an earlier Unicode version that doesn't have them, so we need
2069 # to define them somehow for those releases. The 'Early' constructor
2070 # parameter can be used to automatically handle this. It is essentially
2071 # ignored if the Unicode version being compiled has a data file for this
2072 # property. Either code to execute or a file to read can be specified.
2073 # Details are at the %early definition.
2075 # Most of the handlers can call insert_lines() or insert_adjusted_lines()
2076 # which insert the parameters as lines to be processed before the next input
2077 # file line is read. This allows the EOF handler(s) to flush buffers, for
2078 # example. The difference between the two routines is that the lines inserted
2079 # by insert_lines() are subjected to the each_line_handler()s. (So if you
2080 # called it from such a handler, you would get infinite recursion without some
2081 # mechanism to prevent that.) Lines inserted by insert_adjusted_lines() go
2082 # directly to the main handler without any adjustments. If the
2083 # post-processing handler calls any of these, there will be no effect. Some
2084 # error checking for these conditions could be added, but it hasn't been done.
2086 # carp_bad_line() should be called to warn of bad input lines, which clears $_
2087 # to prevent further processing of the line. This routine will output the
2088 # message as a warning once, and then keep a count of the lines that have the
2089 # same message, and output that count at the end of the file's processing.
2090 # This keeps the number of messages down to a manageable amount.
2092 # get_missings() should be called to retrieve any @missing input lines.
2093 # Messages will be raised if this isn't done if the options aren't to ignore
2096 sub trace { return main::trace(@_); }
2099 # Keep track of fields that are to be put into the constructor.
2100 my %constructor_fields;
2102 main::setup_package(Constructor_Fields => \%constructor_fields);
2104 my %file; # Input file name, required
2105 main::set_access('file', \%file, qw{ c r });
2107 my %first_released; # Unicode version file was first released in, required
2108 main::set_access('first_released', \%first_released, qw{ c r });
2110 my %handler; # Subroutine to process the input file, defaults to
2111 # 'process_generic_property_file'
2112 main::set_access('handler', \%handler, qw{ c });
2115 # name of property this file is for. defaults to none, meaning not
2116 # applicable, or is otherwise determinable, for example, from each line.
2117 main::set_access('property', \%property, qw{ c r });
2120 # This is either an unsigned number, or a list of property names. In the
2121 # former case, if it is non-zero, it means the file is optional, so if the
2122 # file is absent, no warning about that is output. In the latter case, it
2123 # is a list of properties that the file (exclusively) defines. If the
2124 # file is present, tables for those properties will be produced; if
2125 # absent, none will, even if they are listed elsewhere (namely
2126 # PropertyAliases.txt and PropValueAliases.txt) as being in this release,
2127 # and no warnings will be raised about them not being available. (And no
2128 # warning about the file itself will be raised.)
2129 main::set_access('optional', \%optional, qw{ c readable_array } );
2132 # This is used for debugging, to skip processing of all but a few input
2133 # files. Add 'non_skip => 1' to the constructor for those files you want
2134 # processed when you set the $debug_skip global.
2135 main::set_access('non_skip', \%non_skip, 'c');
2138 # This is used to skip processing of this input file (semi-) permanently.
2139 # The value should be the reason the file is being skipped. It is used
2140 # for files that we aren't planning to process anytime soon, but want to
2141 # allow to be in the directory and be checked for their names not
2142 # conflicting with any other files on a DOS 8.3 name filesystem, but to
2143 # not otherwise be processed, and to not raise a warning about not being
2144 # handled. In the constructor call, any value that evaluates to a numeric
2145 # 0 or undef means don't skip. Any other value is a string giving the
2146 # reason it is being skipped, and this will appear in generated pod.
2147 # However, an empty string reason will suppress the pod entry.
2148 # Internally, calls that evaluate to numeric 0 are changed into undef to
2149 # distinguish them from an empty string call.
2150 main::set_access('skip', \%skip, 'c', 'r');
2152 my %each_line_handler;
2153 # list of subroutines to look at and filter each non-comment line in the
2154 # file. defaults to none. The subroutines are called in order, each is
2155 # to adjust $_ for the next one, and the final one adjusts it for
2157 main::set_access('each_line_handler', \%each_line_handler, 'c');
2159 my %retain_trailing_comments;
2160 # This is used to not discard the comments that end data lines. This
2161 # would be used only for files with non-typical syntax, and most code here
2162 # assumes that comments have been stripped, so special handlers would have
2163 # to be written. It is assumed that the code will use these in
2164 # single-quoted contexts, and so any "'" marks in the comment will be
2165 # prefixed by a backslash.
2166 main::set_access('retain_trailing_comments', \%retain_trailing_comments, 'c');
2168 my %properties; # Optional ordered list of the properties that occur in each
2169 # meaningful line of the input file. If present, an appropriate
2170 # each_line_handler() is automatically generated and pushed onto the stack
2171 # of such handlers. This is useful when a file contains multiple
2172 # properties per line, but no other special considerations are necessary.
2173 # The special value "<ignored>" means to discard the corresponding input
2175 # Any @missing lines in the file should also match this syntax; no such
2176 # files exist as of 6.3. But if it happens in a future release, the code
2177 # could be expanded to properly parse them.
2178 main::set_access('properties', \%properties, qw{ c r });
2180 my %has_missings_defaults;
2181 # ? Are there lines in the file giving default values for code points
2182 # missing from it?. Defaults to NO_DEFAULTS. Otherwise NOT_IGNORED is
2183 # the norm, but IGNORED means it has such lines, but the handler doesn't
2184 # use them. Having these three states allows us to catch changes to the
2185 # UCD that this program should track. XXX This could be expanded to
2186 # specify the syntax for such lines, like %properties above.
2187 main::set_access('has_missings_defaults',
2188 \%has_missings_defaults, qw{ c r });
2190 my %construction_time_handler;
2191 # Subroutine to call at the end of the new method. If undef, no such
2192 # handler is called.
2193 main::set_access('construction_time_handler',
2194 \%construction_time_handler, qw{ c });
2197 # Subroutine to call before doing anything else in the file. If undef, no
2198 # such handler is called.
2199 main::set_access('pre_handler', \%pre_handler, qw{ c });
2202 # Subroutines to call upon getting an EOF on the input file, but before
2203 # that is returned to the main handler. This is to allow buffers to be
2204 # flushed. The handler is expected to call insert_lines() or
2205 # insert_adjusted() with the buffered material
2206 main::set_access('eof_handler', \%eof_handler, qw{ c });
2209 # Subroutine to call after all the lines of the file are read in and
2210 # processed. If undef, no such handler is called. Note that this cannot
2211 # add lines to be processed; instead use eof_handler
2212 main::set_access('post_handler', \%post_handler, qw{ c });
2214 my %progress_message;
2215 # Message to print to display progress in lieu of the standard one
2216 main::set_access('progress_message', \%progress_message, qw{ c });
2219 # cache open file handle, internal. Is undef if file hasn't been
2220 # processed at all, empty if has;
2221 main::set_access('handle', \%handle);
2224 # cache of lines added virtually to the file, internal
2225 main::set_access('added_lines', \%added_lines);
2228 # cache of lines added virtually to the file, internal
2229 main::set_access('remapped_lines', \%remapped_lines);
2232 # cache of errors found, internal
2233 main::set_access('errors', \%errors);
2236 # storage of '@missing' defaults lines
2237 main::set_access('missings', \%missings);
2240 # Used for properties that must be defined (for Perl's purposes) on
2241 # versions of Unicode earlier than Unicode itself defines them. The
2242 # parameter is an array (it would be better to be a hash, but not worth
2243 # bothering about due to its rare use).
2245 # The first element is either a code reference to call when in a release
2246 # earlier than the Unicode file is available in, or it is an alternate
2247 # file to use instead of the non-existent one. This file must have been
2248 # plunked down in the same directory as mktables. Should you be compiling
2249 # on a release that needs such a file, mktables will abort the
2250 # compilation, and tell you where to get the necessary file(s), and what
2251 # name(s) to use to store them as.
2252 # In the case of specifying an alternate file, the array must contain two
2255 # [1] is the name of the property that will be generated by this file.
2256 # The class automatically takes the input file and excludes any code
2257 # points in it that were not assigned in the Unicode version being
2258 # compiled. It then uses this result to define the property in the given
2259 # version. Since the property doesn't actually exist in the Unicode
2260 # version being compiled, this should be a name accessible only by core
2261 # perl. If it is the same name as the regular property, the constructor
2262 # will mark the output table as a $PLACEHOLDER so that it doesn't actually
2263 # get output, and so will be unusable by non-core code. Otherwise it gets
2264 # marked as $INTERNAL_ONLY.
2266 # [2] is a property value to assign (only when compiling Unicode 1.1.5) to
2267 # the Hangul syllables in that release (which were ripped out in version
2268 # 2) for the given property . (Hence it is ignored except when compiling
2269 # version 1. You only get one value that applies to all of them, which
2270 # may not be the actual reality, but probably nobody cares anyway for
2271 # these obsolete characters.)
2273 # [3] if present is the default value for the property to assign for code
2274 # points not given in the input. If not present, the default from the
2275 # normal property is used
2277 # [-1] If there is an extra final element that is the string 'ONLY_EARLY'.
2278 # it means to not add the name in [1] as an alias to the property name
2279 # used for these. Normally, when compiling Unicode versions that don't
2280 # invoke the early handling, the name is added as a synonym.
2282 # Not all files can be handled in the above way, and so the code ref
2283 # alternative is available. It can do whatever it needs to. The other
2284 # array elements are optional in this case, and the code is free to use or
2285 # ignore them if they are present.
2287 # Internally, the constructor unshifts a 0 or 1 onto this array to
2288 # indicate if an early alternative is actually being used or not. This
2289 # makes for easier testing later on.
2290 main::set_access('early', \%early, 'c');
2293 main::set_access('only_early', \%only_early, 'c');
2295 my %required_even_in_debug_skip;
2296 # debug_skip is used to speed up compilation during debugging by skipping
2297 # processing files that are not needed for the task at hand. However,
2298 # some files pretty much can never be skipped, and this is used to specify
2299 # that this is one of them. In order to skip this file, the call to the
2300 # constructor must be edited to comment out this parameter.
2301 main::set_access('required_even_in_debug_skip',
2302 \%required_even_in_debug_skip, 'c');
2305 # Some files get removed from the Unicode DB. This is a version object
2306 # giving the first release without this file.
2307 main::set_access('withdrawn', \%withdrawn, 'c');
2310 # Some files are not actually part of the Unicode Character Database.
2311 # These typically have a different way of indicating their version
2312 main::set_access('ucd', \%ucd, 'c');
2314 my %in_this_release;
2315 # Calculated value from %first_released and %withdrawn. Are we compiling
2316 # a Unicode release which includes this file?
2317 main::set_access('in_this_release', \%in_this_release);
2320 sub _next_line_with_remapped_range;
2325 my $self = bless \do{ my $anonymous_scalar }, $class;
2326 my $addr = pack 'J', refaddr $self;
2329 $handler{$addr} = \&main::process_generic_property_file;
2330 $retain_trailing_comments{$addr} = 0;
2331 $non_skip{$addr} = 0;
2332 $skip{$addr} = undef;
2333 $has_missings_defaults{$addr} = $NO_DEFAULTS;
2334 $handle{$addr} = undef;
2335 $added_lines{$addr} = [ ];
2336 $remapped_lines{$addr} = [ ];
2337 $each_line_handler{$addr} = [ ];
2338 $eof_handler{$addr} = [ ];
2339 $errors{$addr} = { };
2340 $missings{$addr} = [ ];
2341 $early{$addr} = [ ];
2342 $optional{$addr} = [ ];
2345 # Two positional parameters.
2346 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
2347 $file{$addr} = main::internal_file_to_platform(shift);
2348 $first_released{$addr} = shift;
2350 # The rest of the arguments are key => value pairs
2351 # %constructor_fields has been set up earlier to list all possible
2352 # ones. Either set or push, depending on how the default has been set
2355 foreach my $key (keys %args) {
2356 my $argument = $args{$key};
2358 # Note that the fields are the lower case of the constructor keys
2359 my $hash = $constructor_fields{lc $key};
2360 if (! defined $hash) {
2361 Carp::my_carp_bug("Unrecognized parameters '$key => $argument' to new() for $self. Skipped");
2364 if (ref $hash->{$addr} eq 'ARRAY') {
2365 if (ref $argument eq 'ARRAY') {
2366 foreach my $argument (@{$argument}) {
2367 next if ! defined $argument;
2368 push @{$hash->{$addr}}, $argument;
2372 push @{$hash->{$addr}}, $argument if defined $argument;
2376 $hash->{$addr} = $argument;
2381 $non_skip{$addr} = 1 if $required_even_in_debug_skip{$addr};
2383 # Convert 0 (meaning don't skip) to undef
2384 undef $skip{$addr} unless $skip{$addr};
2386 # Handle the case where this file is optional
2387 my $pod_message_for_non_existent_optional = "";
2388 if ($optional{$addr}->@*) {
2390 # First element is the pod message
2391 $pod_message_for_non_existent_optional
2392 = shift $optional{$addr}->@*;
2393 # Convert a 0 'Optional' argument to an empty list to make later
2394 # code more concise.
2395 if ( $optional{$addr}->@*
2396 && $optional{$addr}->@* == 1
2397 && $optional{$addr}[0] ne ""
2398 && $optional{$addr}[0] !~ /\D/
2399 && $optional{$addr}[0] == 0)
2401 $optional{$addr} = [ ];
2403 else { # But if the only element doesn't evaluate to 0, make sure
2404 # that this file is indeed considered optional below.
2405 unshift $optional{$addr}->@*, 1;
2410 my $function_instead_of_file = 0;
2412 if ($early{$addr}->@* && $early{$addr}[-1] eq 'ONLY_EARLY') {
2413 $only_early{$addr} = 1;
2414 pop $early{$addr}->@*;
2417 # If we are compiling a Unicode release earlier than the file became
2418 # available, the constructor may have supplied a substitute
2419 if ($first_released{$addr} gt $v_version && $early{$addr}->@*) {
2421 # Yes, we have a substitute, that we will use; mark it so
2422 unshift $early{$addr}->@*, 1;
2424 # See the definition of %early for what the array elements mean.
2425 # Note that we have just unshifted onto the array, so the numbers
2426 # below are +1 of those in the %early description.
2427 # If we have a property this defines, create a table and default
2428 # map for it now (at essentially compile time), so that it will be
2429 # available for the whole of run time. (We will want to add this
2430 # name as an alias when we are using the official property name;
2431 # but this must be deferred until run(), because at construction
2432 # time the official names have yet to be defined.)
2433 if ($early{$addr}[2]) {
2434 my $fate = ($property{$addr}
2435 && $property{$addr} eq $early{$addr}[2])
2438 my $prop_object = Property->new($early{$addr}[2],
2440 Perl_Extension => 1,
2443 # If not specified by the constructor, use the default mapping
2444 # for the regular property for this substitute one.
2445 if ($early{$addr}[4]) {
2446 $prop_object->set_default_map($early{$addr}[4]);
2448 elsif ( defined $property{$addr}
2449 && defined $default_mapping{$property{$addr}})
2452 ->set_default_map($default_mapping{$property{$addr}});
2456 if (ref $early{$addr}[1] eq 'CODE') {
2457 $function_instead_of_file = 1;
2459 # If the first element of the array is a code ref, the others
2461 $handler{$addr} = $early{$addr}[1];
2462 $property{$addr} = $early{$addr}[2]
2463 if defined $early{$addr}[2];
2464 $progress = "substitute $file{$addr}";
2468 else { # Specifying a substitute file
2470 if (! main::file_exists($early{$addr}[1])) {
2472 # If we don't see the substitute file, generate an error
2473 # message giving the needed things, and add it to the list
2474 # of such to output before actual processing happens
2475 # (hence the user finds out all of them in one run).
2476 # Instead of creating a general method for NameAliases,
2477 # hard-code it here, as there is unlikely to ever be a
2478 # second one which needs special handling.
2479 my $string_version = ($file{$addr} eq "NameAliases.txt")
2480 ? 'at least 6.1 (the later, the better)'
2481 : sprintf "%vd", $first_released{$addr};
2482 push @missing_early_files, <<END;
2483 '$file{$addr}' version $string_version should be copied to '$early{$addr}[1]'.
2488 $progress = $early{$addr}[1];
2489 $progress .= ", substituting for $file{$addr}" if $file{$addr};
2490 $file{$addr} = $early{$addr}[1];
2491 $property{$addr} = $early{$addr}[2];
2493 # Ignore code points not in the version being compiled
2494 push $each_line_handler{$addr}->@*, \&_exclude_unassigned;
2496 if ( $v_version lt v2.0 # Hanguls in this release ...
2497 && defined $early{$addr}[3]) # ... need special treatment
2499 push $eof_handler{$addr}->@*, \&_fixup_obsolete_hanguls;
2503 # And this substitute is valid for all releases.
2504 $first_released{$addr} = v0;
2506 else { # Normal behavior
2507 $progress = $file{$addr};
2508 unshift $early{$addr}->@*, 0; # No substitute
2511 my $file = $file{$addr};
2512 $progress_message{$addr} = "Processing $progress"
2513 unless $progress_message{$addr};
2515 # A file should be there if it is within the window of versions for
2516 # which Unicode supplies it
2517 if ($withdrawn{$addr} && $withdrawn{$addr} le $v_version) {
2518 $in_this_release{$addr} = 0;
2522 $in_this_release{$addr} = $first_released{$addr} le $v_version;
2524 # Check that the file for this object (possibly using a substitute
2525 # for early releases) exists or we have a function alternative
2526 if ( ! $function_instead_of_file
2527 && ! main::file_exists($file))
2529 # Here there is nothing available for this release. This is
2530 # fine if we aren't expecting anything in this release.
2531 if (! $in_this_release{$addr}) {
2532 $skip{$addr} = ""; # Don't remark since we expected
2533 # nothing and got nothing
2535 elsif ($optional{$addr}->@*) {
2537 # Here the file is optional in this release; Use the
2538 # passed in text to document this case in the pod.
2539 $skip{$addr} = $pod_message_for_non_existent_optional;
2541 elsif ( $in_this_release{$addr}
2542 && ! defined $skip{$addr}
2544 { # Doesn't exist but should.
2545 $skip{$addr} = "'$file' not found. Possibly Big problems";
2546 Carp::my_carp($skip{$addr});
2549 elsif ($debug_skip && ! defined $skip{$addr} && ! $non_skip{$addr})
2552 # The file exists; if not skipped for another reason, and we are
2553 # skipping most everything during debugging builds, use that as
2555 $skip{$addr} = '$debug_skip is on'
2561 && ! $required_even_in_debug_skip{$addr}
2564 print "Warning: " . __PACKAGE__ . " constructor for $file has useless 'non_skip' in it\n";
2567 # Here, we have figured out if we will be skipping this file or not.
2568 # If so, we add any single property it defines to any passed in
2569 # optional property list. These will be dealt with at run time.
2570 if (defined $skip{$addr}) {
2571 if ($property{$addr}) {
2572 push $optional{$addr}->@*, $property{$addr};
2574 } # Otherwise, are going to process the file.
2575 elsif ($property{$addr}) {
2577 # If the file has a property defined in the constructor for it, it
2578 # means that the property is not listed in the file's entries. So
2579 # add a handler (to the list of line handlers) to insert the
2580 # property name into the lines, to provide a uniform interface to
2581 # the final processing subroutine.
2582 push @{$each_line_handler{$addr}}, \&_insert_property_into_line;
2584 elsif ($properties{$addr}) {
2586 # Similarly, there may be more than one property represented on
2587 # each line, with no clue but the constructor input what those
2588 # might be. Add a handler for each line in the input so that it
2589 # creates a separate input line for each property in those input
2590 # lines, thus making them suitable to handle generically.
2592 push @{$each_line_handler{$addr}},
2595 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
2596 my @fields = split /\s*;\s*/, $_, -1;
2598 if (@fields - 1 > @{$properties{$addr}}) {
2599 $file->carp_bad_line('Extra fields');
2603 my $range = shift @fields; # 0th element is always the
2606 # The next fields in the input line correspond
2607 # respectively to the stored properties.
2608 for my $i (0 .. @{$properties{$addr}} - 1) {
2609 my $property_name = $properties{$addr}[$i];
2610 next if $property_name eq '<ignored>';
2611 $file->insert_adjusted_lines(
2612 "$range; $property_name; $fields[$i]");
2620 { # On non-ascii platforms, we use a special pre-handler
2623 *next_line = (main::NON_ASCII_PLATFORM)
2624 ? *_next_line_with_remapped_range
2628 &{$construction_time_handler{$addr}}($self)
2629 if $construction_time_handler{$addr};
2637 qw("") => "_operator_stringify",
2638 "." => \&main::_operator_dot,
2639 ".=" => \&main::_operator_dot_equal,
2642 sub _operator_stringify($self, $other="", $reversed=0) {
2643 return __PACKAGE__ . " object for " . $self->file;
2647 # Process the input object $self. This opens and closes the file and
2648 # calls all the handlers for it. Currently, this can only be called
2649 # once per file, as it destroy's the EOF handlers
2651 # flag to make sure extracted files are processed early
2652 state $seen_non_extracted = 0;
2654 my $addr = pack 'J', refaddr $self;
2656 my $file = $file{$addr};
2659 $handle{$addr} = 'pretend_is_open';
2662 if ($seen_non_extracted) {
2663 if ($file =~ /$EXTRACTED/i) # Some platforms may change the
2664 # case of the file's name
2666 Carp::my_carp_bug(main::join_lines(<<END
2667 $file should be processed just after the 'Prop...Alias' files, and before
2668 anything not in the $EXTRACTED_DIR directory. Proceeding, but the results may
2669 have subtle problems
2674 elsif ($EXTRACTED_DIR
2676 # We only do this check for generic property files
2677 && $handler{$addr} == \&main::process_generic_property_file
2679 && $file !~ /$EXTRACTED/i)
2681 # We don't set this (by the 'if' above) if we have no
2682 # extracted directory, so if running on an early version,
2683 # this test won't work. Not worth worrying about.
2684 $seen_non_extracted = 1;
2687 # Mark the file as having being processed, and warn if it
2688 # isn't a file we are expecting. As we process the files,
2689 # they are deleted from the hash, so any that remain at the
2690 # end of the program are files that we didn't process.
2691 my $fkey = File::Spec->rel2abs($file);
2692 my $exists = delete $potential_files{lc($fkey)};
2694 Carp::my_carp("Was not expecting '$file'.")
2695 if $exists && ! $in_this_release{$addr};
2697 # If there is special handling for compiling Unicode releases
2698 # earlier than the first one in which Unicode defines this
2700 if ($early{$addr}->@* > 1) {
2702 # Mark as processed any substitute file that would be used in
2704 $fkey = File::Spec->rel2abs($early{$addr}[1]);
2705 delete $potential_files{lc($fkey)};
2707 # As commented in the constructor code, when using the
2708 # official property, we still have to allow the publicly
2709 # inaccessible early name so that the core code which uses it
2710 # will work regardless.
2711 if ( ! $only_early{$addr}
2712 && ! $early{$addr}[0]
2713 && $early{$addr}->@* > 2)
2715 my $early_property_name = $early{$addr}[2];
2716 if ($property{$addr} ne $early_property_name) {
2717 main::property_ref($property{$addr})
2718 ->add_alias($early_property_name);
2723 # We may be skipping this file ...
2724 if (defined $skip{$addr}) {
2726 # If the file isn't supposed to be in this release, there is
2728 if ($in_this_release{$addr}) {
2730 # But otherwise, we may print a message
2732 print STDERR "Skipping input file '$file'",
2733 " because '$skip{$addr}'\n";
2736 # And add it to the list of skipped files, which is later
2737 # used to make the pod
2738 $skipped_files{$file} = $skip{$addr};
2740 # The 'optional' list contains properties that are also to
2741 # be skipped along with the file. (There may also be
2742 # digits which are just placeholders to make sure it isn't
2744 foreach my $property ($optional{$addr}->@*) {
2745 next unless $property =~ /\D/;
2746 my $prop_object = main::property_ref($property);
2747 next unless defined $prop_object;
2748 $prop_object->set_fate($SUPPRESSED, $skip{$addr});
2755 # Here, we are going to process the file. Open it, converting the
2756 # slashes used in this program into the proper form for the OS
2758 if (not open $file_handle, "<", $file) {
2759 Carp::my_carp("Can't open $file. Skipping: $!");
2762 $handle{$addr} = $file_handle; # Cache the open file handle
2764 # If possible, make sure that the file is the correct version.
2765 # (This data isn't available on early Unicode releases or in
2766 # UnicodeData.txt.) We don't do this check if we are using a
2767 # substitute file instead of the official one (though the code
2768 # could be extended to do so).
2769 if ($in_this_release{$addr}
2770 && ! $early{$addr}[0]
2771 && lc($file) ne 'unicodedata.txt')
2775 if ($file !~ /^Unihan/i) {
2777 # The non-Unihan files started getting version numbers in
2778 # 3.2, but some files in 4.0 are unchanged from 3.2, and
2779 # marked as 3.2. 4.0.1 is the first version where there
2780 # are no files marked as being from less than 4.0, though
2781 # some are marked as 4.0. In versions after that, the
2782 # numbers are correct.
2783 if ($v_version ge v4.0.1) {
2784 $_ = <$file_handle>; # The version number is in the
2785 # very first line if it is a
2786 # UCD file; otherwise, it
2788 goto valid_version if $_ =~ / - $string_version \. /x;
2793 # 4.0.1 had some valid files that weren't updated.
2795 if $v_version eq v4.0.1 && $_ =~ /4\.0\.0/;
2800 my $BOM = "\x{FEFF}";
2802 my $BOM_re = qr/ ^ (?:$BOM)? /x;
2807 # BOM; seems to be on many lines in some
2813 # Only look for the version if in the
2814 # first comment block.
2815 goto no_version unless $_ =~ /^#/;
2817 if ($_ =~ /Version:? (\S*)/) {
2820 if $this_version eq $string_version;
2822 if "$this_version.0"
2826 } while (<$file_handle>);
2832 elsif ($v_version ge v6.0.0) { # Unihan
2834 # Unihan files didn't get accurate version numbers until
2835 # 6.0. The version is somewhere in the first comment
2837 while (<$file_handle>) {
2838 goto no_version if $_ !~ /^#/;
2841 next if $_ !~ / version: /x;
2842 goto valid_version if $_ =~ /$string_version/;
2847 else { # Old Unihan; have to assume is valid
2852 die Carp::my_carp("File '$file' is version "
2853 . "'$this_version'. It should be "
2854 . "version $string_version");
2856 Carp::my_carp_bug("Could not find the expected "
2857 . "version info in file '$file'");
2862 print "$progress_message{$addr}\n" if $verbosity >= $PROGRESS;
2864 # Call any special handler for before the file.
2865 &{$pre_handler{$addr}}($self) if $pre_handler{$addr};
2867 # Then the main handler
2868 &{$handler{$addr}}($self);
2870 # Then any special post-file handler.
2871 &{$post_handler{$addr}}($self) if $post_handler{$addr};
2873 # If any errors have been accumulated, output the counts (as the first
2874 # error message in each class was output when it was encountered).
2875 if ($errors{$addr}) {
2878 foreach my $error (keys %{$errors{$addr}}) {
2879 $total += $errors{$addr}->{$error};
2880 delete $errors{$addr}->{$error};
2885 = "A total of $total lines had errors in $file. ";
2887 $message .= ($types == 1)
2888 ? '(Only the first one was displayed.)'
2889 : '(Only the first of each type was displayed.)';
2890 Carp::my_carp($message);
2894 if (@{$missings{$addr}}) {
2895 Carp::my_carp_bug("Handler for $file didn't look at all the \@missing lines. Generated tables likely are wrong");
2898 # If a real file handle, close it.
2899 close $handle{$addr} or Carp::my_carp("Can't close $file: $!") if
2901 $handle{$addr} = ""; # Uses empty to indicate that has already seen
2902 # the file, as opposed to undef
2906 sub _next_line($self) {
2907 # Sets $_ to be the next logical input line, if any. Returns non-zero
2908 # if such a line exists. 'logical' means that any lines that have
2909 # been added via insert_lines() will be returned in $_ before the file
2912 my $addr = pack 'J', refaddr $self;
2914 # Here the file is open (or if the handle is not a ref, is an open
2915 # 'virtual' file). Get the next line; any inserted lines get priority
2916 # over the file itself.
2920 while (1) { # Loop until find non-comment, non-empty line
2921 #local $to_trace = 1 if main::DEBUG;
2922 my $inserted_ref = shift @{$added_lines{$addr}};
2923 if (defined $inserted_ref) {
2924 ($adjusted, $_) = @{$inserted_ref};
2925 trace $adjusted, $_ if main::DEBUG && $to_trace;
2926 return 1 if $adjusted;
2929 last if ! ref $handle{$addr}; # Don't read unless is real file
2930 last if ! defined ($_ = readline $handle{$addr});
2933 trace $_ if main::DEBUG && $to_trace;
2935 # See if this line is the comment line that defines what property
2936 # value that code points that are not listed in the file should
2937 # have. The format or existence of these lines is not guaranteed
2938 # by Unicode since they are comments, but the documentation says
2939 # that this was added for machine-readability, so probably won't
2940 # change. This works starting in Unicode Version 5.0. They look
2943 # @missing: 0000..10FFFF; Not_Reordered
2944 # @missing: 0000..10FFFF; Decomposition_Mapping; <code point>
2945 # @missing: 0000..10FFFF; ; NaN
2947 # Save the line for a later get_missings() call.
2948 if (/$missing_defaults_prefix/) {
2949 if ($has_missings_defaults{$addr} == $NO_DEFAULTS) {
2950 $self->carp_bad_line("Unexpected \@missing line. Assuming no missing entries");
2952 elsif ($has_missings_defaults{$addr} == $NOT_IGNORED) {
2953 my $start = $1; # The pattern saves the beginning and
2954 my $end = $2; # end points of the range the default
2956 my @defaults = split /\s* ; \s*/x, $_;
2958 # The first field is the @missing, which ends in a
2959 # semi-colon, so can safely shift.
2962 # Some of these lines may have empty field placeholders
2963 # which get in the way. An example is:
2964 # @missing: 0000..10FFFF; ; NaN
2965 # Remove them. Process starting from the top so the
2966 # splice doesn't affect things still to be looked at.
2967 for (my $i = @defaults - 1; $i >= 0; $i--) {
2968 next if $defaults[$i] ne "";
2969 splice @defaults, $i, 1;
2972 # What's left should be just the property (maybe) and the
2973 # default. Having only one element means it doesn't have
2977 if (@defaults >= 1) {
2978 if (@defaults == 1) {
2979 $default = $defaults[0];
2982 $property = $defaults[0];
2983 $default = $defaults[1];
2989 || ($default =~ /^</
2990 && $default !~ /^<code *point>$/i
2991 && $default !~ /^<none>$/i
2992 && $default !~ /^<script>$/i))
2994 $self->carp_bad_line("Unrecognized \@missing line: $_. Assuming no missing entries");
2998 # If the property is missing from the line, it should
2999 # be the one for the whole file
3000 $property = $property{$addr} if ! defined $property;
3002 # Change <none> to the null string, which is what it
3003 # really means. If the default is the code point
3004 # itself, set it to <code point>, which is what
3005 # Unicode uses (but sometimes they've forgotten the
3007 if ($default =~ /^<none>$/i) {
3010 elsif ($default =~ /^<code *point>$/i) {
3011 $default = $CODE_POINT;
3013 elsif ($default =~ /^<script>$/i) {
3015 # Special case this one. Currently is from
3016 # ScriptExtensions.txt, and means for all unlisted
3017 # code points, use their Script property values.
3018 # For the code points not listed in that file, the
3019 # default value is 'Unknown'.
3020 $default = "Unknown";
3023 # Store them as a sub-hash as part of an array, with
3025 push @{$missings{$addr}}, { start => hex $start,
3027 default => $default,
3028 property => $property
3033 # There is nothing for the caller to process on this comment
3038 # Unless to keep, remove comments. If to keep, ignore
3039 # comment-only lines
3040 if ($retain_trailing_comments{$addr}) {
3041 next if / ^ \s* \# /x;
3043 # But escape any single quotes (done in both the comment and
3044 # non-comment portion; this could be a bug someday, but not
3052 # Remove trailing space, and skip this line if the result is empty
3056 # Call any handlers for this line, and skip further processing of
3057 # the line if the handler sets the line to null.
3058 foreach my $sub_ref (@{$each_line_handler{$addr}}) {
3063 # Here the line is ok. return success.
3065 } # End of looping through lines.
3067 # If there are EOF handlers, call each (only once) and if it generates
3068 # more lines to process go back in the loop to handle them.
3069 while ($eof_handler{$addr}->@*) {
3070 &{$eof_handler{$addr}[0]}($self);
3071 shift $eof_handler{$addr}->@*; # Currently only get one shot at it.
3072 goto LINE if $added_lines{$addr};
3075 # Return failure -- no more lines.
3080 sub _next_line_with_remapped_range($self) {
3081 # like _next_line(), but for use on non-ASCII platforms. It sets $_
3082 # to be the next logical input line, if any. Returns non-zero if such
3083 # a line exists. 'logical' means that any lines that have been added
3084 # via insert_lines() will be returned in $_ before the file is read
3087 # The difference from _next_line() is that this remaps the Unicode
3088 # code points in the input to those of the native platform. Each
3089 # input line contains a single code point, or a single contiguous
3090 # range of them This routine splits each range into its individual
3091 # code points and caches them. It returns the cached values,
3092 # translated into their native equivalents, one at a time, for each
3093 # call, before reading the next line. Since native values can only be
3094 # a single byte wide, no translation is needed for code points above
3095 # 0xFF, and ranges that are entirely above that number are not split.
3096 # If an input line contains the range 254-1000, it would be split into
3097 # three elements: 254, 255, and 256-1000. (The downstream table
3098 # insertion code will sort and coalesce the individual code points
3099 # into appropriate ranges.)
3101 my $addr = pack 'J', refaddr $self;
3105 # Look in cache before reading the next line. Return any cached
3107 my $inserted = shift @{$remapped_lines{$addr}};
3108 if (defined $inserted) {
3109 trace $inserted if main::DEBUG && $to_trace;
3110 $_ = $inserted =~ s/^ ( \d+ ) /sprintf("%04X", utf8::unicode_to_native($1))/xer;
3111 trace $_ if main::DEBUG && $to_trace;
3115 # Get the next line.
3116 return 0 unless _next_line($self);
3118 # If there is a special handler for it, return the line,
3119 # untranslated. This should happen only for files that are
3120 # special, not being code-point related, such as property names.
3121 return 1 if $handler{$addr}
3122 != \&main::process_generic_property_file;
3124 my ($range, $property_name, $map, @remainder)
3125 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3128 || ! defined $property_name
3129 || $range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3131 Carp::my_carp_bug("Unrecognized input line '$_'. Ignored");
3135 my $high = (defined $2) ? hex $2 : $low;
3137 # If the input maps the range to another code point, remap the
3138 # target if it is between 0 and 255.
3141 $map =~ s/\b 00 ( [0-9A-F]{2} ) \b/sprintf("%04X", utf8::unicode_to_native(hex $1))/gxe;
3142 $tail = "$property_name; $map";
3143 $_ = "$range; $tail";
3146 $tail = $property_name;
3149 # If entire range is above 255, just return it, unchanged (except
3150 # any mapped-to code point, already changed above)
3151 return 1 if $low > 255;
3153 # Cache an entry for every code point < 255. For those in the
3154 # range above 255, return a dummy entry for just that portion of
3155 # the range. Note that this will be out-of-order, but that is not
3157 foreach my $code_point ($low .. $high) {
3158 if ($code_point > 255) {
3159 $_ = sprintf "%04X..%04X; $tail", $code_point, $high;
3162 push @{$remapped_lines{$addr}}, "$code_point; $tail";
3164 } # End of looping through lines.
3169 # Not currently used, not fully tested.
3171 # # Non-destructive lookahead one non-adjusted, non-comment, non-blank
3172 # # record. Not callable from an each_line_handler(), nor does it call
3173 # # an each_line_handler() on the line.
3176 # my $addr = pack 'J', refaddr $self;
3178 # foreach my $inserted_ref (@{$added_lines{$addr}}) {
3179 # my ($adjusted, $line) = @{$inserted_ref};
3180 # next if $adjusted;
3182 # # Remove comments and trailing space, and return a non-empty
3185 # $line =~ s/\s+$//;
3186 # return $line if $line ne "";
3189 # return if ! ref $handle{$addr}; # Don't read unless is real file
3190 # while (1) { # Loop until find non-comment, non-empty line
3191 # local $to_trace = 1 if main::DEBUG;
3192 # trace $_ if main::DEBUG && $to_trace;
3193 # return if ! defined (my $line = readline $handle{$addr});
3195 # push @{$added_lines{$addr}}, [ 0, $line ];
3198 # $line =~ s/\s+$//;
3199 # return $line if $line ne "";
3206 sub insert_lines($self, @lines) {
3207 # Lines can be inserted so that it looks like they were in the input
3208 # file at the place it was when this routine is called. See also
3209 # insert_adjusted_lines(). Lines inserted via this routine go through
3210 # any each_line_handler()
3212 # Each inserted line is an array, with the first element being 0 to
3213 # indicate that this line hasn't been adjusted, and needs to be
3215 push @{$added_lines{pack 'J', refaddr $self}}, map { [ 0, $_ ] } @lines;
3219 sub insert_adjusted_lines($self, @lines) {
3220 # Lines can be inserted so that it looks like they were in the input
3221 # file at the place it was when this routine is called. See also
3222 # insert_lines(). Lines inserted via this routine are already fully
3223 # adjusted, ready to be processed; each_line_handler()s handlers will
3224 # not be called. This means this is not a completely general
3225 # facility, as only the last each_line_handler on the stack should
3226 # call this. It could be made more general, by passing to each of the
3227 # line_handlers their position on the stack, which they would pass on
3228 # to this routine, and that would replace the boolean first element in
3229 # the anonymous array pushed here, so that the next_line routine could
3230 # use that to call only those handlers whose index is after it on the
3231 # stack. But this is overkill for what is needed now.
3233 trace $self if main::DEBUG && $to_trace;
3235 # Each inserted line is an array, with the first element being 1 to
3236 # indicate that this line has been adjusted
3237 push @{$added_lines{pack 'J', refaddr $self}}, map { [ 1, $_ ] } @lines;
3241 sub get_missings($self) {
3242 # Returns the stored up @missings lines' values, and clears the list.
3243 # The values are in a hash, consisting of 'default' and 'property'.
3244 # However, since these lines can be stacked up, the return is an array
3245 # of all these hashes.
3247 my $addr = pack 'J', refaddr $self;
3249 # If not accepting a list return, just return the first one.
3250 return shift @{$missings{$addr}} unless wantarray;
3252 my @return = @{$missings{$addr}};
3253 undef @{$missings{$addr}};
3257 sub _exclude_unassigned($self) {
3259 # Takes the range in $_ and excludes code points that aren't assigned
3262 state $skip_inserted_count = 0;
3264 # Ignore recursive calls.
3265 if ($skip_inserted_count) {
3266 $skip_inserted_count--;
3270 # Find what code points are assigned in this release
3271 main::calculate_Assigned() if ! defined $Assigned;
3273 my ($range, @remainder)
3274 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
3276 # Examine the range.
3277 if ($range =~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
3280 my $high = (defined $2) ? hex $2 : $low;
3282 # Split the range into subranges of just those code points in it
3283 # that are assigned.
3284 my @ranges = (Range_List->new(Initialize
3285 => Range->new($low, $high)) & $Assigned)->ranges;
3287 # Do nothing if nothing in the original range is assigned in this
3288 # release; handle normally if everything is in this release.
3292 elsif (@ranges != 1) {
3294 # Here, some code points in the original range aren't in this
3295 # release; @ranges gives the ones that are. Create fake input
3296 # lines for each of the ranges, and set things up so that when
3297 # this routine is called on that fake input, it will do
3299 $skip_inserted_count = @ranges;
3300 my $remainder = join ";", @remainder;
3301 for my $range (@ranges) {
3302 $self->insert_lines(sprintf("%04X..%04X;%s",
3303 $range->start, $range->end, $remainder));
3305 $_ = ""; # The original range is now defunct.
3312 sub _fixup_obsolete_hanguls($self) {
3314 # This is called only when compiling Unicode version 1. All Unicode
3315 # data for subsequent releases assumes that the code points that were
3316 # Hangul syllables in this release only are something else, so if
3317 # using such data, we have to override it
3319 my $addr = pack 'J', refaddr $self;
3321 my $object = main::property_ref($property{$addr});
3322 $object->add_map($FIRST_REMOVED_HANGUL_SYLLABLE,
3323 $FINAL_REMOVED_HANGUL_SYLLABLE,
3324 $early{$addr}[3], # Passed-in value for these
3325 Replace => $UNCONDITIONALLY);
3328 sub _insert_property_into_line($self) {
3329 # Add a property field to $_, if this file requires it.
3331 my $property = $property{pack 'J', refaddr $self};
3332 $_ =~ s/(;|$)/; $property$1/;
3336 sub carp_bad_line($self, $message="") {
3337 # Output consistent error messages, using either a generic one, or the
3338 # one given by the optional parameter. To avoid gazillions of the
3339 # same message in case the syntax of a file is way off, this routine
3340 # only outputs the first instance of each message, incrementing a
3341 # count so the totals can be output at the end of the file.
3343 my $addr = pack 'J', refaddr $self;
3345 $message = 'Unexpected line' unless $message;
3347 # No trailing punctuation so as to fit with our addenda.
3348 $message =~ s/[.:;,]$//;
3350 # If haven't seen this exact message before, output it now. Otherwise
3351 # increment the count of how many times it has occurred
3352 unless ($errors{$addr}->{$message}) {
3353 Carp::my_carp("$message in '$_' in "
3355 . " at line $.. Skipping this line;");
3356 $errors{$addr}->{$message} = 1;
3359 $errors{$addr}->{$message}++;
3362 # Clear the line to prevent any further (meaningful) processing of it.
3369 package Multi_Default;
3371 sub trace { return main::trace(@_); }
3373 # Certain properties in early versions of Unicode had more than one possible
3374 # default for code points missing from the files. In these cases, one
3375 # default applies to everything left over after all the others are applied,
3376 # and for each of the others, there is a description of which class of code
3377 # points applies to it. This object helps implement this by storing the
3378 # defaults, and for all but that final default, an eval string that generates
3379 # the class that it applies to. That class must be a Range_List, or contains
3380 # a Range_List that the overloaded operators recognize as to be operated on.
3381 # A string is used because this is called early when we know symbolically what
3382 # needs to be done, but typically before any data is gathered. Thus the
3383 # evaluation gets delayed until we have at hand all the needed information.
3387 main::setup_package();
3390 # The defaults structure for the classes
3391 main::set_access('class_defaults', \%class_defaults, 'readable_array');
3394 # The default that applies to everything left over.
3395 main::set_access('other_default', \%other_default, 'r');
3400 # The constructor is called with default => eval pairs, terminated by
3401 # the left-over default. e.g.
3402 # Multi_Default->new(
3403 # 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C
3405 # 'R' => 'some other expression that evaluates to code points',
3410 # It is best to leave the final value be the one that matches the
3411 # above-Unicode code points.
3415 my $self = bless \do{my $anonymous_scalar}, $class;
3416 my $addr = pack 'J', refaddr $self;
3417 $iterator{$addr} = 0;
3419 return $self unless @_;
3422 $self->append_default(shift, shift);
3425 $self->set_final_default(shift);
3430 sub append_default($self, $new_default, $eval) {
3431 my $addr = pack 'J', refaddr $self;
3433 # Pushes a default setting to the current list
3434 push $class_defaults{$addr}->@*, [ $new_default, $eval ];
3437 sub set_final_default($self, $new_default) {
3438 my $addr = pack 'J', refaddr $self;
3439 $other_default{$addr} = $new_default;
3442 sub get_next_defaults($self) {
3443 # Iterates and returns the next class of defaults.
3445 my $addr = pack 'J', refaddr $self;
3446 if ($iterator{$addr}++ < $class_defaults{$addr}->@*) {
3447 return $class_defaults{$addr}->[$iterator{$addr}-1]->@*;
3450 $iterator{$addr} = 0;
3457 # An alias is one of the names that a table goes by. This class defines them
3458 # including some attributes. Everything is currently setup in the
3464 main::setup_package();
3467 main::set_access('name', \%name, 'r');
3470 # Should this name match loosely or not.
3471 main::set_access('loose_match', \%loose_match, 'r');
3473 my %make_re_pod_entry;
3474 # Some aliases should not get their own entries in the re section of the
3475 # pod, because they are covered by a wild-card, and some we want to
3476 # discourage use of. Binary
3477 main::set_access('make_re_pod_entry', \%make_re_pod_entry, 'r', 's');
3480 # Is this documented to be accessible via Unicode::UCD
3481 main::set_access('ucd', \%ucd, 'r', 's');
3484 # Aliases have a status, like deprecated, or even suppressed (which means
3485 # they don't appear in documentation). Enum
3486 main::set_access('status', \%status, 'r');
3489 # Similarly, some aliases should not be considered as usable ones for
3490 # external use, such as file names, or we don't want documentation to
3491 # recommend them. Boolean
3492 main::set_access('ok_as_filename', \%ok_as_filename, 'r');
3497 my $self = bless \do { my $anonymous_scalar }, $class;
3498 my $addr = pack 'J', refaddr $self;
3500 $name{$addr} = shift;
3501 $loose_match{$addr} = shift;
3502 $make_re_pod_entry{$addr} = shift;
3503 $ok_as_filename{$addr} = shift;
3504 $status{$addr} = shift;
3505 $ucd{$addr} = shift;
3507 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
3509 # Null names are never ok externally
3510 $ok_as_filename{$addr} = 0 if $name{$addr} eq "";
3518 # A range is the basic unit for storing code points, and is described in the
3519 # comments at the beginning of the program. Each range has a starting code
3520 # point; an ending code point (not less than the starting one); a value
3521 # that applies to every code point in between the two end-points, inclusive;
3522 # and an enum type that applies to the value. The type is for the user's
3523 # convenience, and has no meaning here, except that a non-zero type is
3524 # considered to not obey the normal Unicode rules for having standard forms.
3526 # The same structure is used for both map and match tables, even though in the
3527 # latter, the value (and hence type) is irrelevant and could be used as a
3528 # comment. In map tables, the value is what all the code points in the range
3529 # map to. Type 0 values have the standardized version of the value stored as
3530 # well, so as to not have to recalculate it a lot.
3532 sub trace { return main::trace(@_); }
3536 main::setup_package();
3539 main::set_access('start', \%start, 'r', 's');
3542 main::set_access('end', \%end, 'r', 's');
3545 main::set_access('value', \%value, 'r', 's');
3548 main::set_access('type', \%type, 'r');
3551 # The value in internal standard form. Defined only if the type is 0.
3552 main::set_access('standard_form', \%standard_form);
3554 # Note that if these fields change, the dump() method should as well
3556 sub new($class, $_addr, $_end, @_args) {
3557 my $self = bless \do { my $anonymous_scalar }, $class;
3558 my $addr = pack 'J', refaddr $self;
3560 $start{$addr} = $_addr;
3561 $end{$addr} = $_end;
3565 my $value = delete $args{'Value'}; # Can be 0
3566 $value = "" unless defined $value;
3567 $value{$addr} = $value;
3569 $type{$addr} = delete $args{'Type'} || 0;
3571 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3578 qw("") => "_operator_stringify",
3579 "." => \&main::_operator_dot,
3580 ".=" => \&main::_operator_dot_equal,
3583 sub _operator_stringify($self, $other="", $reversed=0) {
3584 my $addr = pack 'J', refaddr $self;
3586 # Output it like '0041..0065 (value)'
3587 my $return = sprintf("%04X", $start{$addr})
3589 . sprintf("%04X", $end{$addr});
3590 my $value = $value{$addr};
3591 my $type = $type{$addr};
3593 $return .= "$value";
3594 $return .= ", Type=$type" if $type != 0;
3600 sub standard_form($self) {
3601 # Calculate the standard form only if needed, and cache the result.
3602 # The standard form is the value itself if the type is special.
3603 # This represents a considerable CPU and memory saving - at the time
3604 # of writing there are 368676 non-special objects, but the standard
3605 # form is only requested for 22047 of them - ie about 6%.
3607 my $addr = pack 'J', refaddr $self;
3609 return $standard_form{$addr} if defined $standard_form{$addr};
3611 my $value = $value{$addr};
3612 return $value if $type{$addr};
3613 return $standard_form{$addr} = main::standardize($value);
3616 sub dump($self, $indent) {
3617 # Human, not machine readable. For machine readable, comment out this
3618 # entire routine and let the standard one take effect.
3619 my $addr = pack 'J', refaddr $self;
3621 my $return = $indent
3622 . sprintf("%04X", $start{$addr})
3624 . sprintf("%04X", $end{$addr})
3625 . " '$value{$addr}';";
3626 if (! defined $standard_form{$addr}) {
3627 $return .= "(type=$type{$addr})";
3629 elsif ($standard_form{$addr} ne $value{$addr}) {
3630 $return .= "(standard '$standard_form{$addr}')";
3636 package _Range_List_Base;
3638 # Base class for range lists. A range list is simply an ordered list of
3639 # ranges, so that the ranges with the lowest starting numbers are first in it.
3641 # When a new range is added that is adjacent to an existing range that has the
3642 # same value and type, it merges with it to form a larger range.
3644 # Ranges generally do not overlap, except that there can be multiple entries
3645 # of single code point ranges. This is because of NameAliases.txt.
3647 # In this program, there is a standard value such that if two different
3648 # values, have the same standard value, they are considered equivalent. This
3649 # value was chosen so that it gives correct results on Unicode data
3651 # There are a number of methods to manipulate range lists, and some operators
3652 # are overloaded to handle them.
3654 sub trace { return main::trace(@_); }
3660 # Max is initialized to a negative value that isn't adjacent to 0, for
3664 main::setup_package();
3667 # The list of ranges
3668 main::set_access('ranges', \%ranges, 'readable_array');
3671 # The highest code point in the list. This was originally a method, but
3672 # actual measurements said it was used a lot.
3673 main::set_access('max', \%max, 'r');
3675 my %each_range_iterator;
3676 # Iterator position for each_range()
3677 main::set_access('each_range_iterator', \%each_range_iterator);
3680 # Name of parent this is attached to, if any. Solely for better error
3682 main::set_access('owner_name_of', \%owner_name_of, 'p_r');
3684 my %_search_ranges_cache;
3685 # A cache of the previous result from _search_ranges(), for better
3687 main::set_access('_search_ranges_cache', \%_search_ranges_cache);
3693 # Optional initialization data for the range list. NOTE: For large
3694 # ranges, it is better to use Range object rather than
3696 # as it iterates through each one individually in the latter case.
3697 my $initialize = delete $args{'Initialize'};
3701 # Use _union() to initialize. _union() returns an object of this
3702 # class, which means that it will call this constructor recursively.
3703 # But it won't have this $initialize parameter so that it won't
3704 # infinitely loop on this.
3705 return _union($class, $initialize, %args) if defined $initialize;
3707 $self = bless \do { my $anonymous_scalar }, $class;
3708 my $addr = pack 'J', refaddr $self;
3710 # Optional parent object, only for debug info.
3711 $owner_name_of{$addr} = delete $args{'Owner'};
3712 $owner_name_of{$addr} = "" if ! defined $owner_name_of{$addr};
3714 # Stringify, in case it is an object.
3715 $owner_name_of{$addr} = "$owner_name_of{$addr}";
3717 # This is used only for error messages, and so a colon is added
3718 $owner_name_of{$addr} .= ": " if $owner_name_of{$addr} ne "";
3720 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
3722 $max{$addr} = $max_init;
3724 $_search_ranges_cache{$addr} = 0;
3725 $ranges{$addr} = [];
3732 qw("") => "_operator_stringify",
3733 "." => \&main::_operator_dot,
3734 ".=" => \&main::_operator_dot_equal,
3737 sub _operator_stringify($self, $other="", $reversed=0) {
3738 my $addr = pack 'J', refaddr $self;
3740 return "Range_List attached to '$owner_name_of{$addr}'"
3741 if $owner_name_of{$addr};
3742 return "anonymous Range_List " . \$self;
3746 # Returns the union of the input code points. It can be called as
3747 # either a constructor or a method. If called as a method, the result
3748 # will be a new() instance of the calling object, containing the union
3749 # of that object with the other parameter's code points; if called as
3750 # a constructor, the first parameter gives the class that the new object
3751 # should be, and the second parameter gives the code points to go into
3753 # In either case, there are two parameters looked at by this routine;
3754 # any additional parameters are passed to the new() constructor.
3756 # The code points can come in the form of some object that contains
3757 # ranges, and has a conventionally named method to access them; or
3758 # they can be an array of individual code points (as integers); or
3759 # just a single code point.
3761 # If they are ranges, this routine doesn't make any effort to preserve
3762 # the range values and types of one input over the other. Therefore
3763 # this base class should not allow _union to be called from other than
3764 # initialization code, so as to prevent two tables from being added
3765 # together where the range values matter. The general form of this
3766 # routine therefore belongs in a derived class, but it was moved here
3767 # to avoid duplication of code. The failure to overload this in this
3768 # class keeps it safe.
3770 # It does make the effort during initialization to accept tables with
3771 # multiple values for the same code point, and to preserve the order
3772 # of these. If there is only one input range or range set, it doesn't
3773 # sort (as it should already be sorted to the desired order), and will
3774 # accept multiple values per code point. Otherwise it will merge
3775 # multiple values into a single one.
3778 my @args; # Arguments to pass to the constructor
3782 # If a method call, will start the union with the object itself, and
3783 # the class of the new object will be the same as self.
3790 # Add the other required parameter.
3792 # Rest of parameters are passed on to the constructor
3794 # Accumulate all records from both lists.
3796 my $input_count = 0;
3797 for my $arg (@args) {
3798 #local $to_trace = 0 if main::DEBUG;
3799 trace "argument = $arg" if main::DEBUG && $to_trace;
3800 if (! defined $arg) {
3802 if (defined $self) {
3803 $message .= $owner_name_of{pack 'J', refaddr $self};
3805 Carp::my_carp_bug($message . "Undefined argument to _union. No union done.");
3809 $arg = [ $arg ] if ! ref $arg;
3810 my $type = ref $arg;
3811 if ($type eq 'ARRAY') {
3812 foreach my $element (@$arg) {
3813 push @records, Range->new($element, $element);
3817 elsif ($arg->isa('Range')) {
3818 push @records, $arg;
3821 elsif ($arg->can('ranges')) {
3822 push @records, $arg->ranges;
3827 if (defined $self) {
3828 $message .= $owner_name_of{pack 'J', refaddr $self};
3830 Carp::my_carp_bug($message . "Cannot take the union of a $type. No union done.");
3835 # Sort with the range containing the lowest ordinal first, but if
3836 # two ranges start at the same code point, sort with the bigger range
3837 # of the two first, because it takes fewer cycles.
3838 if ($input_count > 1) {
3839 @records = sort { ($a->start <=> $b->start)
3841 # if b is shorter than a, b->end will be
3842 # less than a->end, and we want to select
3843 # a, so want to return -1
3844 ($b->end <=> $a->end)
3848 my $new = $class->new(@_);
3850 # Fold in records so long as they add new information.
3851 for my $set (@records) {
3852 my $start = $set->start;
3853 my $end = $set->end;
3854 my $value = $set->value;
3855 my $type = $set->type;
3856 if ($start > $new->max) {
3857 $new->_add_delete('+', $start, $end, $value, Type => $type);
3859 elsif ($end > $new->max) {
3860 $new->_add_delete('+', $new->max +1, $end, $value,
3863 elsif ($input_count == 1) {
3864 # Here, overlaps existing range, but is from a single input,
3865 # so preserve the multiple values from that input.
3866 $new->_add_delete('+', $start, $end, $value, Type => $type,
3867 Replace => $MULTIPLE_AFTER);
3874 sub range_count($self) { # Return the number of ranges in the range list
3875 return scalar @{$ranges{pack 'J', refaddr $self}};
3879 # Returns the minimum code point currently in the range list, or if
3880 # the range list is empty, 2 beyond the max possible. This is a
3881 # method because used so rarely, that not worth saving between calls,
3882 # and having to worry about changing it as ranges are added and
3885 my $addr = pack 'J', refaddr $self;
3887 # If the range list is empty, return a large value that isn't adjacent
3888 # to any that could be in the range list, for simpler tests
3889 return $MAX_WORKING_CODEPOINT + 2 unless scalar @{$ranges{$addr}};
3890 return $ranges{$addr}->[0]->start;
3893 sub contains($self, $codepoint) {
3894 # Boolean: Is argument in the range list? If so returns $i such that:
3895 # range[$i]->end < $codepoint <= range[$i+1]->end
3896 # which is one beyond what you want; this is so that the 0th range
3897 # doesn't return false
3899 my $i = $self->_search_ranges($codepoint);
3900 return 0 unless defined $i;
3902 # The search returns $i, such that
3903 # range[$i-1]->end < $codepoint <= range[$i]->end
3904 # So is in the table if and only iff it is at least the start position
3906 return 0 if $ranges{pack 'J', refaddr $self}->[$i]->start > $codepoint;
3910 sub containing_range($self, $codepoint) {
3911 # Returns the range object that contains the code point, undef if none
3912 my $i = $self->contains($codepoint);
3915 # contains() returns 1 beyond where we should look
3916 return $ranges{pack 'J', refaddr $self}->[$i-1];
3919 sub value_of($self, $codepoint) {
3920 # Returns the value associated with the code point, undef if none
3921 my $range = $self->containing_range($codepoint);
3922 return unless defined $range;
3924 return $range->value;
3927 sub type_of($self, $codepoint) {
3928 # Returns the type of the range containing the code point, undef if
3929 # the code point is not in the table
3930 my $range = $self->containing_range($codepoint);
3931 return unless defined $range;
3933 return $range->type;
3936 sub _search_ranges($self, $code_point) {
3937 # Find the range in the list which contains a code point, or where it
3938 # should go if were to add it. That is, it returns $i, such that:
3939 # range[$i-1]->end < $codepoint <= range[$i]->end
3940 # Returns undef if no such $i is possible (e.g. at end of table), or
3941 # if there is an error.
3942 my $addr = pack 'J', refaddr $self;
3944 return if $code_point > $max{$addr};
3945 my $r = $ranges{$addr}; # The current list of ranges
3946 my $range_list_size = scalar @$r;
3949 use integer; # want integer division
3951 # Use the cached result as the starting guess for this one, because,
3952 # an experiment on 5.1 showed that 90% of the time the cache was the
3953 # same as the result on the next call (and 7% it was one less).
3954 $i = $_search_ranges_cache{$addr};
3955 $i = 0 if $i >= $range_list_size; # Reset if no longer valid (prob.
3956 # from an intervening deletion
3957 #local $to_trace = 1 if main::DEBUG;
3958 trace "previous \$i is still valid: $i" if main::DEBUG && $to_trace && $code_point <= $r->[$i]->end && ($i == 0 || $r->[$i-1]->end < $code_point);
3959 return $i if $code_point <= $r->[$i]->end
3960 && ($i == 0 || $r->[$i-1]->end < $code_point);
3962 # Here the cache doesn't yield the correct $i. Try adding 1.
3963 if ($i < $range_list_size - 1
3964 && $r->[$i]->end < $code_point &&
3965 $code_point <= $r->[$i+1]->end)
3968 trace "next \$i is correct: $i" if main::DEBUG && $to_trace;
3969 $_search_ranges_cache{$addr} = $i;
3973 # Here, adding 1 also didn't work. We do a binary search to
3974 # find the correct position, starting with current $i
3976 my $upper = $range_list_size - 1;
3978 trace "top of loop i=$i:", sprintf("%04X", $r->[$lower]->start), "[$lower] .. ", sprintf("%04X", $r->[$i]->start), "[$i] .. ", sprintf("%04X", $r->[$upper]->start), "[$upper]" if main::DEBUG && $to_trace;
3980 if ($code_point <= $r->[$i]->end) {
3982 # Here we have met the upper constraint. We can quit if we
3983 # also meet the lower one.
3984 last if $i == 0 || $r->[$i-1]->end < $code_point;
3986 $upper = $i; # Still too high.
3991 # Here, $r[$i]->end < $code_point, so look higher up.
3995 # Split search domain in half to try again.
3996 my $temp = ($upper + $lower) / 2;
3998 # No point in continuing unless $i changes for next time
4002 # We can't reach the highest element because of the averaging.
4003 # So if one below the upper edge, force it there and try one
4005 if ($i == $range_list_size - 2) {
4007 trace "Forcing to upper edge" if main::DEBUG && $to_trace;
4008 $i = $range_list_size - 1;
4010 # Change $lower as well so if fails next time through,
4011 # taking the average will yield the same $i, and we will
4012 # quit with the error message just below.
4016 Carp::my_carp_bug("$owner_name_of{$addr}Can't find where the range ought to go. No action taken.");
4020 } # End of while loop
4022 if (main::DEBUG && $to_trace) {
4023 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i;
4024 trace "i= [ $i ]", $r->[$i];
4025 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < $range_list_size - 1;
4028 # Here we have found the offset. Cache it as a starting point for the
4030 $_search_ranges_cache{$addr} = $i;
4035 # Add, replace or delete ranges to or from a list. The $type
4036 # parameter gives which:
4037 # '+' => insert or replace a range, returning a list of any changed
4039 # '-' => delete a range, returning a list of any deleted ranges.
4041 # The next three parameters give respectively the start, end, and
4042 # value associated with the range. 'value' should be null unless the
4045 # The range list is kept sorted so that the range with the lowest
4046 # starting position is first in the list, and generally, adjacent
4047 # ranges with the same values are merged into a single larger one (see
4048 # exceptions below).
4050 # There are more parameters; all are key => value pairs:
4051 # Type gives the type of the value. It is only valid for '+'.
4052 # All ranges have types; if this parameter is omitted, 0 is
4053 # assumed. Ranges with type 0 are assumed to obey the
4054 # Unicode rules for casing, etc; ranges with other types are
4055 # not. Otherwise, the type is arbitrary, for the caller's
4056 # convenience, and looked at only by this routine to keep
4057 # adjacent ranges of different types from being merged into
4058 # a single larger range, and when Replace =>
4059 # $IF_NOT_EQUIVALENT is specified (see just below).
4060 # Replace determines what to do if the range list already contains
4061 # ranges which coincide with all or portions of the input
4062 # range. It is only valid for '+':
4063 # => $NO means that the new value is not to replace
4064 # any existing ones, but any empty gaps of the
4065 # range list coinciding with the input range
4066 # will be filled in with the new value.
4067 # => $UNCONDITIONALLY means to replace the existing values with
4068 # this one unconditionally. However, if the
4069 # new and old values are identical, the
4070 # replacement is skipped to save cycles
4071 # => $IF_NOT_EQUIVALENT means to replace the existing values
4072 # (the default) with this one if they are not equivalent.
4073 # Ranges are equivalent if their types are the
4074 # same, and they are the same string; or if
4075 # both are type 0 ranges, if their Unicode
4076 # standard forms are identical. In this last
4077 # case, the routine chooses the more "modern"
4078 # one to use. This is because some of the
4079 # older files are formatted with values that
4080 # are, for example, ALL CAPs, whereas the
4081 # derived files have a more modern style,
4082 # which looks better. By looking for this
4083 # style when the pre-existing and replacement
4084 # standard forms are the same, we can move to
4086 # => $MULTIPLE_BEFORE means that if this range duplicates an
4087 # existing one, but has a different value,
4088 # don't replace the existing one, but insert
4089 # this one so that the same range can occur
4090 # multiple times. They are stored LIFO, so
4091 # that the final one inserted is the first one
4092 # returned in an ordered search of the table.
4093 # If this is an exact duplicate, including the
4094 # value, the original will be moved to be
4095 # first, before any other duplicate ranges
4096 # with different values.
4097 # => $MULTIPLE_AFTER is like $MULTIPLE_BEFORE, but is stored
4098 # FIFO, so that this one is inserted after all
4099 # others that currently exist. If this is an
4100 # exact duplicate, including value, of an
4101 # existing range, this one is discarded
4102 # (leaving the existing one in its original,
4103 # higher priority position
4104 # => $CROAK Die with an error if is already there
4105 # => anything else is the same as => $IF_NOT_EQUIVALENT
4107 # "same value" means identical for non-type-0 ranges, and it means
4108 # having the same standard forms for type-0 ranges.
4110 return Carp::carp_too_few_args(\@_, 5) if main::DEBUG && @_ < 5;
4113 my $operation = shift; # '+' for add/replace; '-' for delete;
4120 $value = "" if not defined $value; # warning: $value can be "0"
4122 my $replace = delete $args{'Replace'};
4123 $replace = $IF_NOT_EQUIVALENT unless defined $replace;
4125 my $type = delete $args{'Type'};
4126 $type = 0 unless defined $type;
4128 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
4130 my $addr = pack 'J', refaddr $self;
4132 if ($operation ne '+' && $operation ne '-') {
4133 Carp::my_carp_bug("$owner_name_of{$addr}First parameter to _add_delete must be '+' or '-'. No action taken.");
4136 unless (defined $start && defined $end) {
4137 Carp::my_carp_bug("$owner_name_of{$addr}Undefined start and/or end to _add_delete. No action taken.");
4140 unless ($end >= $start) {
4141 Carp::my_carp_bug("$owner_name_of{$addr}End of range (" . sprintf("%04X", $end) . ") must not be before start (" . sprintf("%04X", $start) . "). No action taken.");
4144 #local $to_trace = 1 if main::DEBUG;
4146 if ($operation eq '-') {
4147 if ($replace != $IF_NOT_EQUIVALENT) {
4148 Carp::my_carp_bug("$owner_name_of{$addr}Replace => \$IF_NOT_EQUIVALENT is required when deleting a range from a range list. Assuming Replace => \$IF_NOT_EQUIVALENT.");
4149 $replace = $IF_NOT_EQUIVALENT;
4152 Carp::my_carp_bug("$owner_name_of{$addr}Type => 0 is required when deleting a range from a range list. Assuming Type => 0.");
4156 Carp::my_carp_bug("$owner_name_of{$addr}Value => \"\" is required when deleting a range from a range list. Assuming Value => \"\".");
4161 my $r = $ranges{$addr}; # The current list of ranges
4162 my $range_list_size = scalar @$r; # And its size
4163 my $max = $max{$addr}; # The current high code point in
4164 # the list of ranges
4166 # Do a special case requiring fewer machine cycles when the new range
4167 # starts after the current highest point. The Unicode input data is
4168 # structured so this is common.
4169 if ($start > $max) {
4171 trace "$owner_name_of{$addr} $operation", sprintf("%04X..%04X (%s) type=%d; prev max=%04X", $start, $end, $value, $type, $max) if main::DEBUG && $to_trace;
4172 return if $operation eq '-'; # Deleting a non-existing range is a
4175 # If the new range doesn't logically extend the current final one
4176 # in the range list, create a new range at the end of the range
4177 # list. (max cleverly is initialized to a negative number not
4178 # adjacent to 0 if the range list is empty, so even adding a range
4179 # to an empty range list starting at 0 will have this 'if'
4181 if ($start > $max + 1 # non-adjacent means can't extend.
4182 || @{$r}[-1]->value ne $value # values differ, can't extend.
4183 || @{$r}[-1]->type != $type # types differ, can't extend.
4185 push @$r, Range->new($start, $end,
4191 # Here, the new range starts just after the current highest in
4192 # the range list, and they have the same type and value.
4193 # Extend the existing range to incorporate the new one.
4194 @{$r}[-1]->set_end($end);
4197 # This becomes the new maximum.
4202 #local $to_trace = 0 if main::DEBUG;
4204 trace "$owner_name_of{$addr} $operation", sprintf("%04X", $start) . '..' . sprintf("%04X", $end) . " ($value) replace=$replace" if main::DEBUG && $to_trace;
4206 # Here, the input range isn't after the whole rest of the range list.
4207 # Most likely 'splice' will be needed. The rest of the routine finds
4208 # the needed splice parameters, and if necessary, does the splice.
4209 # First, find the offset parameter needed by the splice function for
4210 # the input range. Note that the input range may span multiple
4211 # existing ones, but we'll worry about that later. For now, just find
4212 # the beginning. If the input range is to be inserted starting in a
4213 # position not currently in the range list, it must (obviously) come
4214 # just after the range below it, and just before the range above it.
4215 # Slightly less obviously, it will occupy the position currently
4216 # occupied by the range that is to come after it. More formally, we
4217 # are looking for the position, $i, in the array of ranges, such that:
4219 # r[$i-1]->start <= r[$i-1]->end < $start < r[$i]->start <= r[$i]->end
4221 # (The ordered relationships within existing ranges are also shown in
4222 # the equation above). However, if the start of the input range is
4223 # within an existing range, the splice offset should point to that
4224 # existing range's position in the list; that is $i satisfies a
4225 # somewhat different equation, namely:
4227 #r[$i-1]->start <= r[$i-1]->end < r[$i]->start <= $start <= r[$i]->end
4229 # More briefly, $start can come before or after r[$i]->start, and at
4230 # this point, we don't know which it will be. However, these
4231 # two equations share these constraints:
4233 # r[$i-1]->end < $start <= r[$i]->end
4235 # And that is good enough to find $i.
4237 my $i = $self->_search_ranges($start);
4239 Carp::my_carp_bug("Searching $self for range beginning with $start unexpectedly returned undefined. Operation '$operation' not performed");
4243 # The search function returns $i such that:
4245 # r[$i-1]->end < $start <= r[$i]->end
4247 # That means that $i points to the first range in the range list
4248 # that could possibly be affected by this operation. We still don't
4249 # know if the start of the input range is within r[$i], or if it
4250 # points to empty space between r[$i-1] and r[$i].
4251 trace "[$i] is the beginning splice point. Existing range there is ", $r->[$i] if main::DEBUG && $to_trace;
4253 # Special case the insertion of data that is not to replace any
4255 if ($replace == $NO) { # If $NO, has to be operation '+'
4256 #local $to_trace = 1 if main::DEBUG;
4257 trace "Doesn't replace" if main::DEBUG && $to_trace;
4259 # Here, the new range is to take effect only on those code points
4260 # that aren't already in an existing range. This can be done by
4261 # looking through the existing range list and finding the gaps in
4262 # the ranges that this new range affects, and then calling this
4263 # function recursively on each of those gaps, leaving untouched
4264 # anything already in the list. Gather up a list of the changed
4265 # gaps first so that changes to the internal state as new ranges
4266 # are added won't be a problem.
4269 # First, if the starting point of the input range is outside an
4270 # existing one, there is a gap from there to the beginning of the
4271 # existing range -- add a span to fill the part that this new
4273 if ($start < $r->[$i]->start) {
4274 push @gap_list, Range->new($start,
4276 $r->[$i]->start - 1),
4278 trace "gap before $r->[$i] [$i], will add", $gap_list[-1] if main::DEBUG && $to_trace;
4281 # Then look through the range list for other gaps until we reach
4282 # the highest range affected by the input one.
4284 for ($j = $i+1; $j < $range_list_size; $j++) {
4285 trace "j=[$j]", $r->[$j] if main::DEBUG && $to_trace;
4286 last if $end < $r->[$j]->start;
4288 # If there is a gap between when this range starts and the
4289 # previous one ends, add a span to fill it. Note that just
4290 # because there are two ranges doesn't mean there is a
4291 # non-zero gap between them. It could be that they have
4292 # different values or types
4293 if ($r->[$j-1]->end + 1 != $r->[$j]->start) {
4295 Range->new($r->[$j-1]->end + 1,
4296 $r->[$j]->start - 1,
4298 trace "gap between $r->[$j-1] and $r->[$j] [$j], will add: $gap_list[-1]" if main::DEBUG && $to_trace;
4302 # Here, we have either found an existing range in the range list,
4303 # beyond the area affected by the input one, or we fell off the
4304 # end of the loop because the input range affects the whole rest
4305 # of the range list. In either case, $j is 1 higher than the
4306 # highest affected range. If $j == $i, it means that there are no
4307 # affected ranges, that the entire insertion is in the gap between
4308 # r[$i-1], and r[$i], which we already have taken care of before
4310 # On the other hand, if there are affected ranges, it might be
4311 # that there is a gap that needs filling after the final such
4312 # range to the end of the input range
4313 if ($r->[$j-1]->end < $end) {
4314 push @gap_list, Range->new(main::max($start,
4315 $r->[$j-1]->end + 1),
4318 trace "gap after $r->[$j-1], will add $gap_list[-1]" if main::DEBUG && $to_trace;
4321 # Call recursively to fill in all the gaps.
4322 foreach my $gap (@gap_list) {
4323 $self->_add_delete($operation,
4333 # Here, we have taken care of the case where $replace is $NO.
4334 # Remember that here, r[$i-1]->end < $start <= r[$i]->end
4335 # If inserting a multiple record, this is where it goes, before the
4336 # first (if any) existing one if inserting LIFO. (If this is to go
4337 # afterwards, FIFO, we below move the pointer to there.) These imply
4338 # an insertion, and no change to any existing ranges. Note that $i
4339 # can be -1 if this new range doesn't actually duplicate any existing,
4340 # and comes at the beginning of the list.
4341 if ($replace == $MULTIPLE_BEFORE || $replace == $MULTIPLE_AFTER) {
4343 if ($start != $end) {
4344 Carp::my_carp_bug("$owner_name_of{$addr}Can't cope with adding a multiple record when the range ($start..$end) contains more than one code point. No action taken.");
4348 # If the new code point is within a current range ...
4349 if ($end >= $r->[$i]->start) {
4351 # Don't add an exact duplicate, as it isn't really a multiple
4352 my $existing_value = $r->[$i]->value;
4353 my $existing_type = $r->[$i]->type;
4354 return if $value eq $existing_value && $type eq $existing_type;
4356 # If the multiple value is part of an existing range, we want
4357 # to split up that range, so that only the single code point
4358 # is affected. To do this, we first call ourselves
4359 # recursively to delete that code point from the table, having
4360 # preserved its current data above. Then we call ourselves
4361 # recursively again to add the new multiple, which we know by
4362 # the test just above is different than the current code
4363 # point's value, so it will become a range containing a single
4364 # code point: just itself. Finally, we add back in the
4365 # pre-existing code point, which will again be a single code
4366 # point range. Because 'i' likely will have changed as a
4367 # result of these operations, we can't just continue on, but
4368 # do this operation recursively as well. If we are inserting
4369 # LIFO, the pre-existing code point needs to go after the new
4370 # one, so use MULTIPLE_AFTER; and vice versa.
4371 if ($r->[$i]->start != $r->[$i]->end) {
4372 $self->_add_delete('-', $start, $end, "");
4373 $self->_add_delete('+', $start, $end, $value, Type => $type);
4374 return $self->_add_delete('+',
4377 Type => $existing_type,
4378 Replace => ($replace == $MULTIPLE_BEFORE)
4380 : $MULTIPLE_BEFORE);
4384 # If to place this new record after, move to beyond all existing
4385 # ones; but don't add this one if identical to any of them, as it
4386 # isn't really a multiple. This leaves the original order, so
4387 # that the current request is ignored. The reasoning is that the
4388 # previous request that wanted this record to have high priority
4389 # should have precedence.
4390 if ($replace == $MULTIPLE_AFTER) {
4391 while ($i < @$r && $r->[$i]->start == $start) {
4392 return if $value eq $r->[$i]->value
4393 && $type eq $r->[$i]->type;
4398 # If instead we are to place this new record before any
4399 # existing ones, remove any identical ones that come after it.
4400 # This changes the existing order so that the new one is
4401 # first, as is being requested.
4402 for (my $j = $i + 1;
4403 $j < @$r && $r->[$j]->start == $start;
4406 if ($value eq $r->[$j]->value && $type eq $r->[$j]->type) {
4408 last; # There should only be one instance, so no
4409 # need to keep looking
4414 trace "Adding multiple record at $i with $start..$end, $value" if main::DEBUG && $to_trace;
4415 my @return = splice @$r,
4422 if (main::DEBUG && $to_trace) {
4423 trace "After splice:";
4424 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4425 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4426 trace "i =[", $i, "]", $r->[$i] if $i >= 0;
4427 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4428 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4429 trace 'i+3=[', $i+3, ']', $r->[$i+3] if $i < @$r - 3;
4434 # Here, we have taken care of $NO and $MULTIPLE_foo replaces. This
4435 # leaves delete, insert, and replace either unconditionally or if not
4436 # equivalent. $i still points to the first potential affected range.
4437 # Now find the highest range affected, which will determine the length
4438 # parameter to splice. (The input range can span multiple existing
4439 # ones.) If this isn't a deletion, while we are looking through the
4440 # range list, see also if this is a replacement rather than a clean
4441 # insertion; that is if it will change the values of at least one
4442 # existing range. Start off assuming it is an insert, until find it
4444 my $clean_insert = $operation eq '+';
4445 my $j; # This will point to the highest affected range
4447 # For non-zero types, the standard form is the value itself;
4448 my $standard_form = ($type) ? $value : main::standardize($value);
4450 for ($j = $i; $j < $range_list_size; $j++) {
4451 trace "Looking for highest affected range; the one at $j is ", $r->[$j] if main::DEBUG && $to_trace;
4453 # If find a range that it doesn't overlap into, we can stop
4455 last if $end < $r->[$j]->start;
4457 # Here, overlaps the range at $j. If the values don't match,
4458 # and so far we think this is a clean insertion, it becomes a
4459 # non-clean insertion, i.e., a 'change' or 'replace' instead.
4460 if ($clean_insert) {
4461 if ($r->[$j]->standard_form ne $standard_form) {
4463 if ($replace == $CROAK) {
4464 main::croak("The range to add "
4465 . sprintf("%04X", $start)
4467 . sprintf("%04X", $end)
4468 . " with value '$value' overlaps an existing range $r->[$j]");
4473 # Here, the two values are essentially the same. If the
4474 # two are actually identical, replacing wouldn't change
4475 # anything so skip it.
4476 my $pre_existing = $r->[$j]->value;
4477 if ($pre_existing ne $value) {
4479 # Here the new and old standardized values are the
4480 # same, but the non-standardized values aren't. If
4481 # replacing unconditionally, then replace
4482 if( $replace == $UNCONDITIONALLY) {
4487 # Here, are replacing conditionally. Decide to
4488 # replace or not based on which appears to look
4489 # the "nicest". If one is mixed case and the
4490 # other isn't, choose the mixed case one.
4491 my $new_mixed = $value =~ /[A-Z]/
4492 && $value =~ /[a-z]/;
4493 my $old_mixed = $pre_existing =~ /[A-Z]/
4494 && $pre_existing =~ /[a-z]/;
4496 if ($old_mixed != $new_mixed) {
4497 $clean_insert = 0 if $new_mixed;
4498 if (main::DEBUG && $to_trace) {
4499 if ($clean_insert) {
4500 trace "Retaining $pre_existing over $value";
4503 trace "Replacing $pre_existing with $value";
4509 # Here casing wasn't different between the two.
4510 # If one has hyphens or underscores and the
4511 # other doesn't, choose the one with the
4513 my $new_punct = $value =~ /[-_]/;
4514 my $old_punct = $pre_existing =~ /[-_]/;
4516 if ($old_punct != $new_punct) {
4517 $clean_insert = 0 if $new_punct;
4518 if (main::DEBUG && $to_trace) {
4519 if ($clean_insert) {
4520 trace "Retaining $pre_existing over $value";
4523 trace "Replacing $pre_existing with $value";
4526 } # else existing one is just as "good";
4527 # retain it to save cycles.
4533 } # End of loop looking for highest affected range.
4535 # Here, $j points to one beyond the highest range that this insertion
4536 # affects (hence to beyond the range list if that range is the final
4537 # one in the range list).
4539 # The splice length is all the affected ranges. Get it before
4540 # subtracting, for efficiency, so we don't have to later add 1.
4541 my $length = $j - $i;
4543 $j--; # $j now points to the highest affected range.
4544 trace "Final affected range is $j: $r->[$j]" if main::DEBUG && $to_trace;
4546 # Here, have taken care of $NO and $MULTIPLE_foo replaces.
4547 # $j points to the highest affected range. But it can be < $i or even
4548 # -1. These happen only if the insertion is entirely in the gap
4549 # between r[$i-1] and r[$i]. Here's why: j < i means that the j loop
4550 # above exited first time through with $end < $r->[$i]->start. (And
4551 # then we subtracted one from j) This implies also that $start <
4552 # $r->[$i]->start, but we know from above that $r->[$i-1]->end <
4553 # $start, so the entire input range is in the gap.
4556 # Here the entire input range is in the gap before $i.
4558 if (main::DEBUG && $to_trace) {
4560 trace "Entire range is between $r->[$i-1] and $r->[$i]";
4563 trace "Entire range is before $r->[$i]";
4566 return if $operation ne '+'; # Deletion of a non-existent range is
4571 # Here part of the input range is not in the gap before $i. Thus,
4572 # there is at least one affected one, and $j points to the highest
4575 # At this point, here is the situation:
4576 # This is not an insertion of a multiple, nor of tentative ($NO)
4578 # $i points to the first element in the current range list that
4579 # may be affected by this operation. In fact, we know
4580 # that the range at $i is affected because we are in
4581 # the else branch of this 'if'
4582 # $j points to the highest affected range.
4584 # r[$i-1]->end < $start <= r[$i]->end
4586 # r[$i-1]->end < $start <= $end < r[$j+1]->start
4589 # $clean_insert is a boolean which is set true if and only if
4590 # this is a "clean insertion", i.e., not a change nor a
4591 # deletion (multiple was handled above).
4593 # We now have enough information to decide if this call is a no-op
4594 # or not. It is a no-op if this is an insertion of already
4595 # existing data. To be so, it must be contained entirely in one
4598 if (main::DEBUG && $to_trace && $clean_insert
4599 && $start >= $r->[$i]->start
4600 && $end <= $r->[$i]->end)
4604 return if $clean_insert
4605 && $start >= $r->[$i]->start
4606 && $end <= $r->[$i]->end;
4609 # Here, we know that some action will have to be taken. We have
4610 # calculated the offset and length (though adjustments may be needed)
4611 # for the splice. Now start constructing the replacement list.
4613 my $splice_start = $i;
4618 # See if should extend any adjacent ranges.
4619 if ($operation eq '-') { # Don't extend deletions
4620 $extends_below = $extends_above = 0;
4622 else { # Here, should extend any adjacent ranges. See if there are
4624 $extends_below = ($i > 0
4625 # can't extend unless adjacent
4626 && $r->[$i-1]->end == $start -1
4627 # can't extend unless are same standard value
4628 && $r->[$i-1]->standard_form eq $standard_form
4629 # can't extend unless share type
4630 && $r->[$i-1]->type == $type);
4631 $extends_above = ($j+1 < $range_list_size
4632 && $r->[$j+1]->start == $end +1
4633 && $r->[$j+1]->standard_form eq $standard_form
4634 && $r->[$j+1]->type == $type);
4636 if ($extends_below && $extends_above) { # Adds to both
4637 $splice_start--; # start replace at element below
4638 $length += 2; # will replace on both sides
4639 trace "Extends both below and above ranges" if main::DEBUG && $to_trace;
4641 # The result will fill in any gap, replacing both sides, and
4642 # create one large range.
4643 @replacement = Range->new($r->[$i-1]->start,
4650 # Here we know that the result won't just be the conglomeration of
4651 # a new range with both its adjacent neighbors. But it could
4652 # extend one of them.
4654 if ($extends_below) {
4656 # Here the new element adds to the one below, but not to the
4657 # one above. If inserting, and only to that one range, can
4658 # just change its ending to include the new one.
4659 if ($length == 0 && $clean_insert) {
4660 $r->[$i-1]->set_end($end);
4661 trace "inserted range extends range to below so it is now $r->[$i-1]" if main::DEBUG && $to_trace;
4665 trace "Changing inserted range to start at ", sprintf("%04X", $r->[$i-1]->start), " instead of ", sprintf("%04X", $start) if main::DEBUG && $to_trace;
4666 $splice_start--; # start replace at element below
4667 $length++; # will replace the element below
4668 $start = $r->[$i-1]->start;
4671 elsif ($extends_above) {
4673 # Here the new element adds to the one above, but not below.
4674 # Mirror the code above
4675 if ($length == 0 && $clean_insert) {
4676 $r->[$j+1]->set_start($start);
4677 trace "inserted range extends range to above so it is now $r->[$j+1]" if main::DEBUG && $to_trace;
4681 trace "Changing inserted range to end at ", sprintf("%04X", $r->[$j+1]->end), " instead of ", sprintf("%04X", $end) if main::DEBUG && $to_trace;
4682 $length++; # will replace the element above
4683 $end = $r->[$j+1]->end;
4687 trace "Range at $i is $r->[$i]" if main::DEBUG && $to_trace;
4689 # Finally, here we know there will have to be a splice.
4690 # If the change or delete affects only the highest portion of the
4691 # first affected range, the range will have to be split. The
4692 # splice will remove the whole range, but will replace it by a new
4693 # range containing just the unaffected part. So, in this case,
4694 # add to the replacement list just this unaffected portion.
4695 if (! $extends_below
4696 && $start > $r->[$i]->start && $start <= $r->[$i]->end)
4699 Range->new($r->[$i]->start,
4701 Value => $r->[$i]->value,
4702 Type => $r->[$i]->type);
4705 # In the case of an insert or change, but not a delete, we have to
4706 # put in the new stuff; this comes next.
4707 if ($operation eq '+') {
4708 push @replacement, Range->new($start,
4714 trace "Range at $j is $r->[$j]" if main::DEBUG && $to_trace && $j != $i;
4715 #trace "$end >=", $r->[$j]->start, " && $end <", $r->[$j]->end if main::DEBUG && $to_trace;
4717 # And finally, if we're changing or deleting only a portion of the
4718 # highest affected range, it must be split, as the lowest one was.
4719 if (! $extends_above
4720 && $j >= 0 # Remember that j can be -1 if before first
4722 && $end >= $r->[$j]->start
4723 && $end < $r->[$j]->end)
4726 Range->new($end + 1,
4728 Value => $r->[$j]->value,
4729 Type => $r->[$j]->type);
4733 # And do the splice, as calculated above
4734 if (main::DEBUG && $to_trace) {
4735 trace "replacing $length element(s) at $i with ";
4736 foreach my $replacement (@replacement) {
4737 trace " $replacement";
4739 trace "Before splice:";
4740 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4741 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4742 trace "i =[", $i, "]", $r->[$i];
4743 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4744 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4747 my @return = splice @$r, $splice_start, $length, @replacement;
4749 if (main::DEBUG && $to_trace) {
4750 trace "After splice:";
4751 trace 'i-2=[', $i-2, ']', $r->[$i-2] if $i >= 2;
4752 trace 'i-1=[', $i-1, ']', $r->[$i-1] if $i >= 1;
4753 trace "i =[", $i, "]", $r->[$i];
4754 trace 'i+1=[', $i+1, ']', $r->[$i+1] if $i < @$r - 1;
4755 trace 'i+2=[', $i+2, ']', $r->[$i+2] if $i < @$r - 2;
4756 trace "removed ", @return if @return;
4759 # An actual deletion could have changed the maximum in the list.
4760 # There was no deletion if the splice didn't return something, but
4761 # otherwise recalculate it. This is done too rarely to worry about
4763 if ($operation eq '-' && @return) {
4765 $max{$addr} = $r->[-1]->end;
4768 $max{$addr} = $max_init;
4774 sub reset_each_range($self) { # reset the iterator for each_range();
4775 undef $each_range_iterator{pack 'J', refaddr $self};
4779 sub each_range($self) {
4780 # Iterate over each range in a range list. Results are undefined if
4781 # the range list is changed during the iteration.
4782 my $addr = pack 'J', refaddr $self;
4784 return if $self->is_empty;
4786 $each_range_iterator{$addr} = -1
4787 if ! defined $each_range_iterator{$addr};
4788 $each_range_iterator{$addr}++;
4789 return $ranges{$addr}->[$each_range_iterator{$addr}]
4790 if $each_range_iterator{$addr} < @{$ranges{$addr}};
4791 undef $each_range_iterator{$addr};
4795 sub count($self) { # Returns count of code points in range list
4796 my $addr = pack 'J', refaddr $self;
4799 foreach my $range (@{$ranges{$addr}}) {
4800 $count += $range->end - $range->start + 1;
4805 sub delete_range($self, $start, $end) { # Delete a range
4806 return $self->_add_delete('-', $start, $end, "");
4809 sub is_empty($self) { # Returns boolean as to if a range list is empty
4810 return scalar @{$ranges{pack 'J', refaddr $self}} == 0;
4814 # Quickly returns a scalar suitable for separating tables into
4815 # buckets, i.e. it is a hash function of the contents of a table, so
4816 # there are relatively few conflicts.
4817 my $addr = pack 'J', refaddr $self;
4819 # These are quickly computable. Return looks like 'min..max;count'
4820 return $self->min . "..$max{$addr};" . scalar @{$ranges{$addr}};
4822 } # End closure for _Range_List_Base
4825 use parent '-norequire', '_Range_List_Base';
4827 # A Range_List is a range list for match tables; i.e. the range values are
4828 # not significant. Thus a number of operations can be safely added to it,
4829 # such as inversion, intersection. Note that union is also an unsafe
4830 # operation when range values are cared about, and that method is in the base
4831 # class, not here. But things are set up so that that method is callable only
4832 # during initialization. Only in this derived class, is there an operation
4833 # that combines two tables. A Range_Map can thus be used to initialize a
4834 # Range_List, and its mappings will be in the list, but are not significant to
4837 sub trace { return main::trace(@_); }
4843 '+' => sub { my $self = shift;
4846 return $self->_union($other)
4848 '+=' => sub { my $self = shift;
4850 my $reversed = shift;
4853 Carp::my_carp_bug("Bad news. Can't cope with '"
4857 . "'. undef returned.");
4861 return $self->_union($other)
4863 '&' => sub { my $self = shift;
4866 return $self->_intersect($other, 0);
4868 '&=' => sub { my $self = shift;
4870 my $reversed = shift;
4873 Carp::my_carp_bug("Bad news. Can't cope with '"
4877 . "'. undef returned.");
4881 return $self->_intersect($other, 0);
4887 sub _invert($self, @) {
4888 # Returns a new Range_List that gives all code points not in $self.
4889 my $new = Range_List->new;
4891 # Go through each range in the table, finding the gaps between them
4892 my $max = -1; # Set so no gap before range beginning at 0
4893 for my $range ($self->ranges) {
4894 my $start = $range->start;
4895 my $end = $range->end;
4897 # If there is a gap before this range, the inverse will contain
4899 if ($start > $max + 1) {
4900 $new->add_range($max + 1, $start - 1);
4905 # And finally, add the gap from the end of the table to the max
4906 # possible code point
4907 if ($max < $MAX_WORKING_CODEPOINT) {
4908 $new->add_range($max + 1, $MAX_WORKING_CODEPOINT);
4913 sub _subtract($self, $other, $reversed=0) {
4914 # Returns a new Range_List with the argument deleted from it. The
4915 # argument can be a single code point, a range, or something that has
4916 # a range, with the _range_list() method on it returning them
4919 Carp::my_carp_bug("Bad news. Can't cope with '"
4923 . "'. undef returned.");
4927 my $new = Range_List->new(Initialize => $self);
4929 if (! ref $other) { # Single code point
4930 $new->delete_range($other, $other);
4932 elsif ($other->isa('Range')) {
4933 $new->delete_range($other->start, $other->end);
4935 elsif ($other->can('_range_list')) {
4936 foreach my $range ($other->_range_list->ranges) {
4937 $new->delete_range($range->start, $range->end);
4941 Carp::my_carp_bug("Can't cope with a "
4943 . " argument to '-'. Subtraction ignored."
4951 sub _intersect($a_object, $b_object, $check_if_overlapping=0) {
4952 # Returns either a boolean giving whether the two inputs' range lists
4953 # intersect (overlap), or a new Range_List containing the intersection
4954 # of the two lists. The optional final parameter being true indicates
4955 # to do the check instead of the intersection.
4957 if (! defined $b_object) {
4959 $message .= $a_object->_owner_name_of if defined $a_object;
4960 Carp::my_carp_bug($message .= "Called with undefined value. Intersection not done.");
4964 # a & b = !(!a | !b), or in our terminology = ~ ( ~a + -b )
4965 # Thus the intersection could be much more simply be written:
4966 # return ~(~$a_object + ~$b_object);
4967 # But, this is slower, and when taking the inverse of a large
4968 # range_size_1 table, back when such tables were always stored that
4969 # way, it became prohibitively slow, hence the code was changed to the
4972 if ($b_object->isa('Range')) {
4973 $b_object = Range_List->new(Initialize => $b_object,
4974 Owner => $a_object->_owner_name_of);
4976 $b_object = $b_object->_range_list if $b_object->can('_range_list');
4978 my @a_ranges = $a_object->ranges;
4979 my @b_ranges = $b_object->ranges;
4981 #local $to_trace = 1 if main::DEBUG;
4982 trace "intersecting $a_object with ", scalar @a_ranges, "ranges and $b_object with", scalar @b_ranges, " ranges" if main::DEBUG && $to_trace;
4984 # Start with the first range in each list
4986 my $range_a = $a_ranges[$a_i];
4988 my $range_b = $b_ranges[$b_i];
4990 my $new = __PACKAGE__->new(Owner => $a_object->_owner_name_of)
4991 if ! $check_if_overlapping;
4993 # If either list is empty, there is no intersection and no overlap
4994 if (! defined $range_a || ! defined $range_b) {
4995 return $check_if_overlapping ? 0 : $new;
4997 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
4999 # Otherwise, must calculate the intersection/overlap. Start with the
5000 # very first code point in each list
5001 my $a = $range_a->start;
5002 my $b = $range_b->start;
5004 # Loop through all the ranges of each list; in each iteration, $a and
5005 # $b are the current code points in their respective lists
5008 # If $a and $b are the same code point, ...
5011 # it means the lists overlap. If just checking for overlap
5012 # know the answer now,
5013 return 1 if $check_if_overlapping;
5015 # The intersection includes this code point plus anything else
5016 # common to both current ranges.
5018 my $end = main::min($range_a->end, $range_b->end);
5019 if (! $check_if_overlapping) {
5020 trace "adding intersection range ", sprintf("%04X", $start) . ".." . sprintf("%04X", $end) if main::DEBUG && $to_trace;
5021 $new->add_range($start, $end);
5024 # Skip ahead to the end of the current intersect
5027 # If the current intersect ends at the end of either range (as
5028 # it must for at least one of them), the next possible one
5029 # will be the beginning code point in it's list's next range.
5030 if ($a == $range_a->end) {
5031 $range_a = $a_ranges[++$a_i];
5032 last unless defined $range_a;
5033 $a = $range_a->start;
5035 if ($b == $range_b->end) {
5036 $range_b = $b_ranges[++$b_i];
5037 last unless defined $range_b;
5038 $b = $range_b->start;
5041 trace "range_a[$a_i]=$range_a; range_b[$b_i]=$range_b" if main::DEBUG && $to_trace;
5045 # Not equal, but if the range containing $a encompasses $b,
5046 # change $a to be the middle of the range where it does equal
5047 # $b, so the next iteration will get the intersection
5048 if ($range_a->end >= $b) {
5053 # Here, the current range containing $a is entirely below
5054 # $b. Go try to find a range that could contain $b.
5055 $a_i = $a_object->_search_ranges($b);
5057 # If no range found, quit.
5058 last unless defined $a_i;
5060 # The search returns $a_i, such that
5061 # range_a[$a_i-1]->end < $b <= range_a[$a_i]->end
5062 # Set $a to the beginning of this new range, and repeat.
5063 $range_a = $a_ranges[$a_i];
5064 $a = $range_a->start;
5067 else { # Here, $b < $a.
5069 # Mirror image code to the leg just above
5070 if ($range_b->end >= $a) {
5074 $b_i = $b_object->_search_ranges($a);
5075 last unless defined $b_i;
5076 $range_b = $b_ranges[$b_i];
5077 $b = $range_b->start;
5080 } # End of looping through ranges.
5082 # Intersection fully computed, or now know that there is no overlap
5083 return $check_if_overlapping ? 0 : $new;
5086 sub overlaps($self, $other) {
5087 # Returns boolean giving whether the two arguments overlap somewhere
5088 return $self->_intersect($other, 1);
5091 sub add_range($self, $start, $end) {
5092 # Add a range to the list.
5093 return $self->_add_delete('+', $start, $end, "");
5096 sub matches_identically_to($self, $other) {
5097 # Return a boolean as to whether or not two Range_Lists match identical
5098 # sets of code points.
5099 # These are ordered in increasing real time to figure out (at least
5100 # until a patch changes that and doesn't change this)
5101 return 0 if $self->max != $other->max;
5102 return 0 if $self->min != $other->min;
5103 return 0 if $self->range_count != $other->range_count;
5104 return 0 if $self->count != $other->count;
5106 # Here they could be identical because all the tests above passed.
5107 # The loop below is somewhat simpler since we know they have the same
5108 # number of elements. Compare range by range, until reach the end or
5109 # find something that differs.
5110 my @a_ranges = $self->ranges;
5111 my @b_ranges = $other->ranges;
5112 for my $i (0 .. @a_ranges - 1) {
5113 my $a = $a_ranges[$i];
5114 my $b = $b_ranges[$i];
5115 trace "self $a; other $b" if main::DEBUG && $to_trace;
5116 return 0 if ! defined $b
5117 || $a->start != $b->start
5118 || $a->end != $b->end;
5123 sub is_code_point_usable($code, $try_hard) {
5124 # This used only for making the test script. See if the input
5125 # proposed trial code point is one that Perl will handle. If second
5126 # parameter is 0, it won't select some code points for various
5127 # reasons, noted below.
5128 return 0 if $code < 0; # Never use a negative
5130 # shun null. I'm (khw) not sure why this was done, but NULL would be
5131 # the character very frequently used.
5132 return $try_hard if $code == 0x0000;
5134 # shun non-character code points.
5135 return $try_hard if $code >= 0xFDD0 && $code <= 0xFDEF;
5136 return $try_hard if ($code & 0xFFFE) == 0xFFFE; # includes FFFF
5138 return $try_hard if $code > $MAX_UNICODE_CODEPOINT; # keep in range
5139 return $try_hard if $code >= 0xD800 && $code <= 0xDFFF; # no surrogate
5144 sub get_valid_code_point($self) {
5145 # Return a code point that's part of the range list. Returns nothing
5146 # if the table is empty or we can't find a suitable code point. This
5147 # used only for making the test script.
5149 # On first pass, don't choose less desirable code points; if no good
5150 # one is found, repeat, allowing a less desirable one to be selected.
5151 for my $try_hard (0, 1) {
5153 # Look through all the ranges for a usable code point.
5154 for my $set (reverse $self->ranges) {
5156 # Try the edge cases first, starting with the end point of the
5158 my $end = $set->end;
5159 return $end if is_code_point_usable($end, $try_hard);
5160 $end = $MAX_UNICODE_CODEPOINT + 1 if $end > $MAX_UNICODE_CODEPOINT;
5162 # End point didn't, work. Start at the beginning and try
5163 # every one until find one that does work.
5164 for my $trial ($set->start .. $end - 1) {
5165 return $trial if is_code_point_usable($trial, $try_hard);
5169 return (); # If none found, give up.
5172 sub get_invalid_code_point($self) {
5173 # Return a code point that's not part of the table. Returns nothing
5174 # if the table covers all code points or a suitable code point can't
5175 # be found. This used only for making the test script.
5177 # Just find a valid code point of the inverse, if any.
5178 return Range_List->new(Initialize => ~ $self)->get_valid_code_point;
5180 } # end closure for Range_List
5183 use parent '-norequire', '_Range_List_Base';
5185 # A Range_Map is a range list in which the range values (called maps) are
5186 # significant, and hence shouldn't be manipulated by our other code, which
5187 # could be ambiguous or lose things. For example, in taking the union of two
5188 # lists, which share code points, but which have differing values, which one
5189 # has precedence in the union?
5190 # It turns out that these operations aren't really necessary for map tables,
5191 # and so this class was created to make sure they aren't accidentally
5196 sub add_map($self, @add) {
5197 # Add a range containing a mapping value to the list
5198 return $self->_add_delete('+', @add);
5201 sub replace_map($self, @list) {
5203 return $self->_add_delete('+', @list, Replace => $UNCONDITIONALLY);
5207 # Adds entry to a range list which can duplicate an existing entry
5210 my $code_point = shift;
5213 my $replace = delete $args{'Replace'} // $MULTIPLE_BEFORE;
5214 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5216 return $self->add_map($code_point, $code_point,
5217 $value, Replace => $replace);
5219 } # End of closure for package Range_Map
5221 package _Base_Table;
5223 # A table is the basic data structure that gets written out into a file for
5224 # use by the Perl core. This is the abstract base class implementing the
5225 # common elements from the derived ones. A list of the methods to be
5226 # furnished by an implementing class is just after the constructor.
5228 sub standardize { return main::standardize($_[0]); }
5229 sub trace { return main::trace(@_); }
5233 main::setup_package();
5236 # Object containing the ranges of the table.
5237 main::set_access('range_list', \%range_list, 'p_r', 'p_s');
5240 # The full table name.
5241 main::set_access('full_name', \%full_name, 'r');
5244 # The table name, almost always shorter
5245 main::set_access('name', \%name, 'r');
5248 # The shortest of all the aliases for this table, with underscores removed
5249 main::set_access('short_name', \%short_name);
5251 my %nominal_short_name_length;
5252 # The length of short_name before removing underscores
5253 main::set_access('nominal_short_name_length',
5254 \%nominal_short_name_length);
5257 # The complete name, including property.
5258 main::set_access('complete_name', \%complete_name, 'r');
5261 # Parent property this table is attached to.
5262 main::set_access('property', \%property, 'r');
5265 # Ordered list of alias objects of the table's name. The first ones in
5266 # the list are output first in comments
5267 main::set_access('aliases', \%aliases, 'readable_array');
5270 # A comment associated with the table for human readers of the files
5271 main::set_access('comment', \%comment, 's');
5274 # A comment giving a short description of the table's meaning for human
5275 # readers of the files.
5276 main::set_access('description', \%description, 'readable_array');
5279 # A comment giving a short note about the table for human readers of the
5281 main::set_access('note', \%note, 'readable_array');
5284 # Enum; there are a number of possibilities for what happens to this
5285 # table: it could be normal, or suppressed, or not for external use. See
5286 # values at definition for $SUPPRESSED.
5287 main::set_access('fate', \%fate, 'r');
5289 my %find_table_from_alias;
5290 # The parent property passes this pointer to a hash which this class adds
5291 # all its aliases to, so that the parent can quickly take an alias and
5293 main::set_access('find_table_from_alias', \%find_table_from_alias, 'p_r');
5296 # After this table is made equivalent to another one; we shouldn't go
5297 # changing the contents because that could mean it's no longer equivalent
5298 main::set_access('locked', \%locked, 'r');
5301 # This gives the final path to the file containing the table. Each
5302 # directory in the path is an element in the array
5303 main::set_access('file_path', \%file_path, 'readable_array');
5306 # What is the table's status, normal, $OBSOLETE, etc. Enum
5307 main::set_access('status', \%status, 'r');
5310 # A comment about its being obsolete, or whatever non normal status it has
5311 main::set_access('status_info', \%status_info, 'r');
5313 my %caseless_equivalent;
5314 # The table this is equivalent to under /i matching, if any.
5315 main::set_access('caseless_equivalent', \%caseless_equivalent, 'r', 's');
5318 # Is the table to be output with each range only a single code point?
5319 # This is done to avoid breaking existing code that may have come to rely
5320 # on this behavior in previous versions of this program.)
5321 main::set_access('range_size_1', \%range_size_1, 'r', 's');
5324 # A boolean set iff this table is a Perl extension to the Unicode
5326 main::set_access('perl_extension', \%perl_extension, 'r');
5328 my %output_range_counts;
5329 # A boolean set iff this table is to have comments written in the
5330 # output file that contain the number of code points in the range.
5331 # The constructor can override the global flag of the same name.
5332 main::set_access('output_range_counts', \%output_range_counts, 'r');
5334 my %write_as_invlist;
5335 # A boolean set iff the output file for this table is to be in the form of
5336 # an inversion list/map.
5337 main::set_access('write_as_invlist', \%write_as_invlist, 'r');
5340 # The format of the entries of the table. This is calculated from the
5341 # data in the table (or passed in the constructor). This is an enum e.g.,
5342 # $STRING_FORMAT. It is marked protected as it should not be generally
5343 # used to override calculations.
5344 main::set_access('format', \%format, 'r', 'p_s');
5347 # A boolean that gives whether some other table in this property is
5348 # defined as the complement of this table. This is a crude, but currently
5349 # sufficient, mechanism to make this table not get destroyed before what
5350 # is dependent on it is. Other dependencies could be added, so the name
5351 # was chosen to reflect a more general situation than actually is
5352 # currently the case.
5353 main::set_access('has_dependency', \%has_dependency, 'r', 's');
5356 # All arguments are key => value pairs, which you can see below, most
5357 # of which match fields documented above. Otherwise: Re_Pod_Entry,
5358 # OK_as_Filename, and Fuzzy apply to the names of the table, and are
5359 # documented in the Alias package
5361 return Carp::carp_too_few_args(\@_, 2) if main::DEBUG && @_ < 2;
5365 my $self = bless \do { my $anonymous_scalar }, $class;
5366 my $addr = pack 'J', refaddr $self;
5370 $name{$addr} = delete $args{'Name'};
5371 $find_table_from_alias{$addr} = delete $args{'_Alias_Hash'};
5372 $full_name{$addr} = delete $args{'Full_Name'};
5373 my $complete_name = $complete_name{$addr}
5374 = delete $args{'Complete_Name'};
5375 $format{$addr} = delete $args{'Format'};
5376 $output_range_counts{$addr} = delete $args{'Output_Range_Counts'};
5377 $property{$addr} = delete $args{'_Property'};
5378 $range_list{$addr} = delete $args{'_Range_List'};
5379 $status{$addr} = delete $args{'Status'} || $NORMAL;
5380 $status_info{$addr} = delete $args{'_Status_Info'} || "";
5381 $range_size_1{$addr} = delete $args{'Range_Size_1'} || 0;
5382 $caseless_equivalent{$addr} = delete $args{'Caseless_Equivalent'} || 0;
5383 $fate{$addr} = delete $args{'Fate'} || $ORDINARY;
5384 $write_as_invlist{$addr} = delete $args{'Write_As_Invlist'};# No default
5385 my $ucd = delete $args{'UCD'};
5387 my $description = delete $args{'Description'};
5388 my $ok_as_filename = delete $args{'OK_as_Filename'};
5389 my $loose_match = delete $args{'Fuzzy'};
5390 my $note = delete $args{'Note'};
5391 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'};
5392 my $perl_extension = delete $args{'Perl_Extension'};
5393 my $suppression_reason = delete $args{'Suppression_Reason'};
5395 # Shouldn't have any left over
5396 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5398 # Can't use || above because conceivably the name could be 0, and
5399 # can't use // operator in case this program gets used in Perl 5.8
5400 $full_name{$addr} = $name{$addr} if ! defined $full_name{$addr};
5401 $output_range_counts{$addr} = $output_range_counts if
5402 ! defined $output_range_counts{$addr};
5404 $aliases{$addr} = [ ];
5405 $comment{$addr} = [ ];
5406 $description{$addr} = [ ];
5408 $file_path{$addr} = [ ];
5409 $locked{$addr} = "";
5410 $has_dependency{$addr} = 0;
5412 push @{$description{$addr}}, $description if $description;
5413 push @{$note{$addr}}, $note if $note;
5415 if ($fate{$addr} == $PLACEHOLDER) {
5417 # A placeholder table doesn't get documented, is a perl extension,
5418 # and quite likely will be empty
5419 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5420 $perl_extension = 1 if ! defined $perl_extension;
5421 $ucd = 0 if ! defined $ucd;
5422 push @tables_that_may_be_empty, $complete_name{$addr};
5423 $self->add_comment(<<END);
5424 This is a placeholder because it is not in Version $string_version of Unicode,
5425 but is needed by the Perl core to work gracefully. Because it is not in this
5426 version of Unicode, it will not be listed in $pod_file.pod
5429 elsif (exists $why_suppressed{$complete_name}
5430 # Don't suppress if overridden
5431 && ! grep { $_ eq $complete_name{$addr} }
5432 @output_mapped_properties)
5434 $fate{$addr} = $SUPPRESSED;
5436 elsif ($fate{$addr} == $SUPPRESSED) {
5437 Carp::my_carp_bug("Need reason for suppressing") unless $suppression_reason;
5438 # Though currently unused
5440 elsif ($suppression_reason) {
5441 Carp::my_carp_bug("A reason was given for suppressing, but not suppressed");
5444 # If hasn't set its status already, see if it is on one of the
5445 # lists of properties or tables that have particular statuses; if
5446 # not, is normal. The lists are prioritized so the most serious
5447 # ones are checked first
5448 if (! $status{$addr}) {
5449 if (exists $why_deprecated{$complete_name}) {
5450 $status{$addr} = $DEPRECATED;
5452 elsif (exists $why_stabilized{$complete_name}) {
5453 $status{$addr} = $STABILIZED;
5455 elsif (exists $why_obsolete{$complete_name}) {
5456 $status{$addr} = $OBSOLETE;
5459 # Existence above doesn't necessarily mean there is a message
5460 # associated with it. Use the most serious message.
5461 if ($status{$addr}) {
5462 if ($why_deprecated{$complete_name}) {
5464 = $why_deprecated{$complete_name};
5466 elsif ($why_stabilized{$complete_name}) {
5468 = $why_stabilized{$complete_name};
5470 elsif ($why_obsolete{$complete_name}) {
5472 = $why_obsolete{$complete_name};
5477 $perl_extension{$addr} = $perl_extension || 0;
5479 # Don't list a property by default that is internal only
5480 if ($fate{$addr} > $MAP_PROXIED) {
5481 $make_re_pod_entry = 0 if ! defined $make_re_pod_entry;
5482 $ucd = 0 if ! defined $ucd;
5485 $ucd = 1 if ! defined $ucd;
5488 # By convention what typically gets printed only or first is what's
5489 # first in the list, so put the full name there for good output
5490 # clarity. Other routines rely on the full name being first on the
5492 $self->add_alias($full_name{$addr},
5493 OK_as_Filename => $ok_as_filename,
5494 Fuzzy => $loose_match,
5495 Re_Pod_Entry => $make_re_pod_entry,
5496 Status => $status{$addr},
5500 # Then comes the other name, if meaningfully different.
5501 if (standardize($full_name{$addr}) ne standardize($name{$addr})) {
5502 $self->add_alias($name{$addr},
5503 OK_as_Filename => $ok_as_filename,
5504 Fuzzy => $loose_match,
5505 Re_Pod_Entry => $make_re_pod_entry,
5506 Status => $status{$addr},
5514 # Here are the methods that are required to be defined by any derived
5517 handle_special_range
5521 # write() knows how to write out normal ranges, but it calls
5522 # handle_special_range() when it encounters a non-normal one.
5523 # append_to_body() is called by it after it has handled all
5524 # ranges to add anything after the main portion of the table.
5525 # And finally, pre_body() is called after all this to build up
5526 # anything that should appear before the main portion of the
5527 # table. Doing it this way allows things in the middle to
5528 # affect what should appear before the main portion of the
5533 Carp::my_carp_bug( __LINE__
5534 . ": Must create method '$sub()' for "
5542 "." => \&main::_operator_dot,
5543 ".=" => \&main::_operator_dot_equal,
5544 '!=' => \&main::_operator_not_equal,
5545 '==' => \&main::_operator_equal,
5549 # Returns the array of ranges associated with this table.
5551 return $range_list{pack 'J', refaddr shift}->ranges;
5555 # Add a synonym for this table.
5557 return Carp::carp_too_few_args(\@_, 3) if main::DEBUG && @_ < 3;
5560 my $name = shift; # The name to add.
5561 my $pointer = shift; # What the alias hash should point to. For
5562 # map tables, this is the parent property;
5563 # for match tables, it is the table itself.
5566 my $loose_match = delete $args{'Fuzzy'};
5568 my $ok_as_filename = delete $args{'OK_as_Filename'};
5569 $ok_as_filename = 1 unless defined $ok_as_filename;
5571 # An internal name does not get documented, unless overridden by the
5572 # input; same for making tests for it.
5573 my $status = delete $args{'Status'} || (($name =~ /^_/)
5576 my $make_re_pod_entry = delete $args{'Re_Pod_Entry'}
5577 // (($status ne $INTERNAL_ALIAS)
5578 ? (($name =~ /^_/) ? $NO : $YES)
5580 my $ucd = delete $args{'UCD'} // (($name =~ /^_/) ? 0 : 1);
5582 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
5584 # Capitalize the first letter of the alias unless it is one of the CJK
5585 # ones which specifically begins with a lower 'k'. Do this because
5586 # Unicode has varied whether they capitalize first letters or not, and
5587 # have later changed their minds and capitalized them, but not the
5588 # other way around. So do it always and avoid changes from release to
5590 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
5592 my $addr = pack 'J', refaddr $self;
5594 # Figure out if should be loosely matched if not already specified.
5595 if (! defined $loose_match) {
5597 # Is a loose_match if isn't null, and doesn't begin with an
5598 # underscore and isn't just a number
5600 && substr($name, 0, 1) ne '_'
5601 && $name !~ qr{^[0-9_.+-/]+$})
5610 # If this alias has already been defined, do nothing.
5611 return if defined $find_table_from_alias{$addr}->{$name};
5613 # That includes if it is standardly equivalent to an existing alias,
5614 # in which case, add this name to the list, so won't have to search
5616 my $standard_name = main::standardize($name);
5617 if (defined $find_table_from_alias{$addr}->{$standard_name}) {
5618 $find_table_from_alias{$addr}->{$name}
5619 = $find_table_from_alias{$addr}->{$standard_name};
5623 # Set the index hash for this alias for future quick reference.
5624 $find_table_from_alias{$addr}->{$name} = $pointer;
5625 $find_table_from_alias{$addr}->{$standard_name} = $pointer;
5626 local $to_trace = 0 if main::DEBUG;
5627 trace "adding alias $name to $pointer" if main::DEBUG && $to_trace;
5628 trace "adding alias $standard_name to $pointer" if main::DEBUG && $to_trace;
5631 # Put the new alias at the end of the list of aliases unless the final
5632 # element begins with an underscore (meaning it is for internal perl
5633 # use) or is all numeric, in which case, put the new one before that
5634 # one. This floats any all-numeric or underscore-beginning aliases to
5635 # the end. This is done so that they are listed last in output lists,
5636 # to encourage the user to use a better name (either more descriptive
5637 # or not an internal-only one) instead. This ordering is relied on
5638 # implicitly elsewhere in this program, like in short_name()
5639 my $list = $aliases{$addr};
5640 my $insert_position = (@$list == 0
5641 || (substr($list->[-1]->name, 0, 1) ne '_'
5642 && $list->[-1]->name =~ /\D/))
5648 Alias->new($name, $loose_match, $make_re_pod_entry,
5649 $ok_as_filename, $status, $ucd);
5651 # This name may be shorter than any existing ones, so clear the cache
5652 # of the shortest, so will have to be recalculated.
5653 undef $short_name{pack 'J', refaddr $self};
5657 sub short_name($self, $nominal_length_ptr=undef) {
5658 # Returns a name suitable for use as the base part of a file name.
5659 # That is, shorter wins. It can return undef if there is no suitable
5660 # name. The name has all non-essential underscores removed.
5662 # The optional second parameter is a reference to a scalar in which
5663 # this routine will store the length the returned name had before the
5664 # underscores were removed, or undef if the return is undef.
5666 # The shortest name can change if new aliases are added. So using
5667 # this should be deferred until after all these are added. The code
5668 # that does that should clear this one's cache.
5669 # Any name with alphabetics is preferred over an all numeric one, even
5672 my $addr = pack 'J', refaddr $self;
5674 # For efficiency, don't recalculate, but this means that adding new
5675 # aliases could change what the shortest is, so the code that does
5676 # that needs to undef this.
5677 if (defined $short_name{$addr}) {
5678 if ($nominal_length_ptr) {
5679 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5681 return $short_name{$addr};
5684 # Look at each alias
5685 my $is_last_resort = 0;
5686 my $deprecated_or_discouraged
5687 = qr/ ^ (?: $DEPRECATED | $DISCOURAGED ) $/x;
5688 foreach my $alias ($self->aliases()) {
5690 # Don't use an alias that isn't ok to use for an external name.
5691 next if ! $alias->ok_as_filename;
5693 my $name = main::Standardize($alias->name);
5694 trace $self, $name if main::DEBUG && $to_trace;
5696 # Take the first one, or any non-deprecated non-discouraged one
5697 # over one that is, or a shorter one that isn't numeric. This
5698 # relies on numeric aliases always being last in the array
5699 # returned by aliases(). Any alpha one will have precedence.
5700 if ( ! defined $short_name{$addr}
5701 || ( $is_last_resort
5702 && $alias->status !~ $deprecated_or_discouraged)
5704 && length($name) < length($short_name{$addr})))
5706 # Remove interior underscores.
5707 ($short_name{$addr} = $name) =~ s/ (?<= . ) _ (?= . ) //xg;
5709 $nominal_short_name_length{$addr} = length $name;
5710 $is_last_resort = $alias->status =~ $deprecated_or_discouraged;
5714 # If the short name isn't a nice one, perhaps an equivalent table has
5716 if ( $self->can('children')
5717 && ( ! defined $short_name{$addr}
5718 || $short_name{$addr} eq ""
5719 || $short_name{$addr} eq "_"))
5722 foreach my $follower ($self->children) { # All equivalents
5723 my $follower_name = $follower->short_name;
5724 next unless defined $follower_name;
5726 # Anything (except undefined) is better than underscore or
5728 if (! defined $return || $return eq "_") {
5729 $return = $follower_name;
5733 # If the new follower name isn't "_" and is shorter than the
5734 # current best one, prefer the new one.
5735 next if $follower_name eq "_";
5736 next if length $follower_name > length $return;
5737 $return = $follower_name;
5739 $short_name{$addr} = $return if defined $return;
5742 # If no suitable external name return undef
5743 if (! defined $short_name{$addr}) {
5744 $$nominal_length_ptr = undef if $nominal_length_ptr;
5748 # Don't allow a null short name.
5749 if ($short_name{$addr} eq "") {
5750 $short_name{$addr} = '_';
5751 $nominal_short_name_length{$addr} = 1;
5754 trace $self, $short_name{$addr} if main::DEBUG && $to_trace;
5756 if ($nominal_length_ptr) {
5757 $$nominal_length_ptr = $nominal_short_name_length{$addr};
5759 return $short_name{$addr};
5762 sub external_name($self) {
5763 # Returns the external name that this table should be known by. This
5764 # is usually the short_name, but not if the short_name is undefined,
5765 # in which case the external_name is arbitrarily set to the
5768 my $short = $self->short_name;
5769 return $short if defined $short;
5774 sub add_description($self, $description) { # Adds the parameter as a short description.
5775 push @{$description{pack 'J', refaddr $self}}, $description;
5780 sub add_note($self, $note) { # Adds the parameter as a short note.
5781 push @{$note{pack 'J', refaddr $self}}, $note;
5786 sub add_comment($self, $comment) { # Adds the parameter as a comment.
5788 return unless $debugging_build;
5792 push @{$comment{pack 'J', refaddr $self}}, $comment;
5797 sub comment($self) {
5798 # Return the current comment for this table. If called in list
5799 # context, returns the array of comments. In scalar, returns a string
5800 # of each element joined together with a period ending each.
5802 my $addr = pack 'J', refaddr $self;
5803 my @list = @{$comment{$addr}};
5804 return @list if wantarray;
5806 foreach my $sentence (@list) {
5807 $return .= '. ' if $return;
5808 $return .= $sentence;
5811 $return .= '.' if $return;
5815 sub initialize($self, $initialization) {
5816 # Initialize the table with the argument which is any valid
5817 # initialization for range lists.
5819 my $addr = pack 'J', refaddr $self;
5821 # Replace the current range list with a new one of the same exact
5823 my $class = ref $range_list{$addr};
5824 $range_list{$addr} = $class->new(Owner => $self,
5825 Initialize => $initialization);
5831 # The header that is output for the table in the file it is written
5834 $return .= $DEVELOPMENT_ONLY if $compare_versions;
5839 sub merge_single_annotation_line ($output, $annotation, $annotation_column) {
5841 # This appends an annotation comment, $annotation, to $output,
5842 # starting in or after column $annotation_column, removing any
5843 # pre-existing comment from $output.
5845 $annotation =~ s/^ \s* \# \ //x;
5846 $output =~ s/ \s* ( \# \N* )? \n //x;
5847 $output = Text::Tabs::expand($output);
5849 my $spaces = $annotation_column - length $output;
5850 $spaces = 2 if $spaces < 0; # Have 2 blanks before the comment
5852 $output = sprintf "%s%*s# %s",
5857 return Text::Tabs::unexpand $output;
5860 sub write($self, $use_adjustments=0, $suppress_value=0) {
5861 # Write a representation of the table to its file. It calls several
5862 # functions furnished by sub-classes of this abstract base class to
5863 # handle non-normal ranges, to add stuff before the table, and at its
5864 # end. If the table is to be written so that adjustments are
5865 # required, this does that conversion.
5868 # $use_adjustments ? output in adjusted format or not
5869 # $suppress_value Optional, if the value associated with
5870 # a range equals this one, don't write
5873 my $addr = pack 'J', refaddr $self;
5874 my $write_as_invlist = $write_as_invlist{$addr};
5876 # Start with the header
5877 my @HEADER = $self->header;
5880 push @HEADER, "\n", main::simple_fold($comment{$addr}, '# '), "\n"
5883 # Things discovered processing the main body of the document may
5884 # affect what gets output before it, therefore pre_body() isn't called
5885 # until after all other processing of the table is done.
5887 # The main body looks like a 'here' document. If there are comments,
5888 # get rid of them when processing it.
5890 if ($annotate || $output_range_counts) {
5891 # Use the line below in Perls that don't have /r
5892 #push @OUT, 'return join "\n", map { s/\s*#.*//mg; $_ } split "\n", <<\'END\';' . "\n";
5893 push @OUT, "return <<'END' =~ s/\\s*#.*//mgr;\n";
5895 push @OUT, "return <<'END';\n";
5898 if ($range_list{$addr}->is_empty) {
5900 # This is a kludge for empty tables to silence a warning in
5901 # utf8.c, which can't really deal with empty tables, but it can
5902 # deal with a table that matches nothing, as the inverse of 'All'
5904 push @OUT, "!Unicode::UCD::All\n";
5906 elsif ($self->name eq 'N'
5908 # To save disk space and table cache space, avoid putting out
5909 # binary N tables, but instead create a file which just inverts
5910 # the Y table. Since the file will still exist and occupy a
5911 # certain number of blocks, might as well output the whole
5912 # thing if it all will fit in one block. The number of
5913 # ranges below is an approximate number for that.
5914 && ($self->property->type == $BINARY
5915 || $self->property->type == $FORCED_BINARY)
5916 # && $self->property->tables == 2 Can't do this because the
5917 # non-binary properties, like NFDQC aren't specifiable
5919 && $range_list{$addr}->ranges > 15
5920 && ! $annotate) # Under --annotate, want to see everything
5922 push @OUT, "!Unicode::UCD::" . $self->property->name . "\n";
5925 my $range_size_1 = $range_size_1{$addr};
5927 # To make it more readable, use a minimum indentation
5930 # These are used only in $annotate option
5931 my $format; # e.g. $HEX_ADJUST_FORMAT
5932 my $include_name; # ? Include the character's name in the
5934 my $include_cp; # ? Include its code point
5937 $comment_indent = ($self->isa('Map_Table'))
5939 : ($write_as_invlist)
5944 $format = $self->format;
5946 # The name of the character is output only for tables that
5947 # don't already include the name in the output.
5948 my $property = $self->property;
5950 ! ($property == $perl_charname
5951 || $property == main::property_ref('Unicode_1_Name')
5952 || $property == main::property_ref('Name')
5953 || $property == main::property_ref('Name_Alias')
5956 # Don't include the code point in the annotation where all
5957 # lines are a single code point, so it can be easily found in
5959 $include_cp = ! $range_size_1;
5961 if (! $self->isa('Map_Table')) {
5962 $comment_indent = ($write_as_invlist) ? 8 : 16;
5965 $comment_indent = 16;
5967 # There are just a few short ranges in this table, so no
5968 # need to include the code point in the annotation.
5969 $include_cp = 0 if $format eq $DECOMP_STRING_FORMAT;
5971 # We're trying to get this to look good, as the whole
5972 # point is to make human-readable tables. It is easier to
5973 # read if almost all the annotation comments begin in the
5974 # same column. Map tables have varying width maps, so can
5975 # create a jagged comment appearance. This code does a
5976 # preliminary pass through these tables looking for the
5977 # maximum width map in each, and causing the comments to
5978 # begin just to the right of that. However, if the
5979 # comments begin too far to the right of most lines, it's
5980 # hard to line them up horizontally with their real data.
5981 # Therefore we ignore the longest outliers
5982 my $ignore_longest_X_percent = 2; # Discard longest X%
5984 # Each key in this hash is a width of at least one of the
5985 # maps in the table. Its value is how many lines have
5989 # We won't space things further left than one tab stop
5990 # after the rest of the line; initializing it to that
5991 # number saves some work.
5992 my $max_map_width = 8;
5994 # Fill in the %widths hash
5996 for my $set ($range_list{$addr}->ranges) {
5997 my $value = $set->value;
5999 # These range types don't appear in the main table
6000 next if $set->type == 0
6001 && defined $suppress_value
6002 && $value eq $suppress_value;
6003 next if $set->type == $MULTI_CP
6004 || $set->type == $NULL;
6006 # Include 2 spaces before the beginning of the
6008 my $this_width = length($value) + 2;
6010 # Ranges of the remaining non-zero types usually
6011 # occupy just one line (maybe occasionally two, but
6012 # this doesn't have to be dead accurate). This is
6013 # because these ranges are like "unassigned code
6015 my $count = ($set->type != 0)
6017 : $set->end - $set->start + 1;
6018 $widths{$this_width} += $count;
6020 $max_map_width = $this_width
6021 if $max_map_width < $this_width;
6024 # If the widest map gives us less than two tab stops
6025 # worth, just take it as-is.
6026 if ($max_map_width > 16) {
6028 # Otherwise go through %widths until we have included
6029 # the desired percentage of lines in the whole table.
6030 my $running_total = 0;
6031 foreach my $width (sort { $a <=> $b } keys %widths)
6033 $running_total += $widths{$width};
6035 if ($running_total * 100 / $total
6036 >= 100 - $ignore_longest_X_percent)
6038 $max_map_width = $width;
6043 $comment_indent += $max_map_width;
6047 # Values for previous time through the loop. Initialize to
6048 # something that won't be adjacent to the first iteration;
6049 # only $previous_end matters for that.
6051 my $previous_end = -2;
6054 # Values for next time through the portion of the loop that splits
6055 # the range. 0 in $next_start means there is no remaining portion
6061 my $invlist_count = 0;
6063 my $output_value_in_hex = $self->isa('Map_Table')
6064 && ($self->format eq $HEX_ADJUST_FORMAT
6065 || $self->to_output_map == $EXTERNAL_MAP);
6066 # Use leading zeroes just for files whose format should not be
6067 # changed from what it has been. Otherwise, they just take up
6068 # space and time to process.
6069 my $hex_format = ($self->isa('Map_Table')
6070 && $self->to_output_map == $EXTERNAL_MAP)
6074 # The values for some of these tables are stored in mktables as
6075 # hex strings. Normally, these are just output as strings without
6076 # change, but when we are doing adjustments, we have to operate on
6077 # these numerically, so we convert those to decimal to do that,
6078 # and back to hex for output
6079 my $convert_map_to_from_hex = 0;
6080 my $output_map_in_hex = 0;
6081 if ($self->isa('Map_Table')) {
6082 $convert_map_to_from_hex
6083 = ($use_adjustments && $self->format eq $HEX_ADJUST_FORMAT)
6084 || ($annotate && $self->format eq $HEX_FORMAT);
6085 $output_map_in_hex = $convert_map_to_from_hex
6086 || $self->format eq $HEX_FORMAT;
6089 # To store any annotations about the characters.
6092 # Output each range as part of the here document.
6094 for my $set ($range_list{$addr}->ranges) {
6095 if ($set->type != 0) {
6096 $self->handle_special_range($set);
6099 my $start = $set->start;
6100 my $end = $set->end;
6101 my $value = $set->value;
6103 # Don't output ranges whose value is the one to suppress
6104 next RANGE if defined $suppress_value
6105 && $value eq $suppress_value;
6107 $value = CORE::hex $value if $convert_map_to_from_hex;
6110 { # This bare block encloses the scope where we may need to
6111 # 'redo' to. Consider a table that is to be written out
6112 # using single item ranges. This is given in the
6113 # $range_size_1 boolean. To accomplish this, we split the
6114 # range each time through the loop into two portions, the
6115 # first item, and the rest. We handle that first item
6116 # this time in the loop, and 'redo' to repeat the process
6117 # for the rest of the range.
6119 # We may also have to do it, with other special handling,
6120 # if the table has adjustments. Consider the table that
6121 # contains the lowercasing maps. mktables stores the
6122 # ASCII range ones as 26 ranges:
6123 # ord('A') => ord('a'), .. ord('Z') => ord('z')
6124 # For compactness, the table that gets written has this as
6126 # ( ord('A') .. ord('Z') ) => ord('a')
6127 # and the software that reads the tables is smart enough
6128 # to "connect the dots". This change is accomplished in
6129 # this loop by looking to see if the current iteration
6130 # fits the paradigm of the previous iteration, and if so,
6131 # we merge them by replacing the final output item with
6132 # the merged data. Repeated 25 times, this gets A-Z. But
6133 # we also have to make sure we don't screw up cases where
6134 # we have internally stored
6135 # ( 0x1C4 .. 0x1C6 ) => 0x1C5
6136 # This single internal range has to be output as 3 ranges,
6137 # which is done by splitting, like we do for $range_size_1
6138 # tables. (There are very few of such ranges that need to
6139 # be split, so the gain of doing the combining of other
6140 # ranges far outweighs the splitting of these.) The
6141 # values to use for the redo at the end of this block are
6142 # set up just below in the scalars whose names begin with
6145 if (($use_adjustments || $range_size_1) && $end != $start)
6147 $next_start = $start + 1;
6149 $next_value = $value;
6153 if ($use_adjustments && ! $range_size_1) {
6155 # If this range is adjacent to the previous one, and
6156 # the values in each are integers that are also
6157 # adjacent (differ by 1), then this range really
6158 # extends the previous one that is already in element
6159 # $OUT[-1]. So we pop that element, and pretend that
6160 # the range starts with whatever it started with.
6161 # $offset is incremented by 1 each time so that it
6162 # gives the current offset from the first element in
6163 # the accumulating range, and we keep in $value the
6164 # value of that first element.
6165 if ($start == $previous_end + 1
6166 && $value =~ /^ -? \d+ $/xa
6167 && $previous_value =~ /^ -? \d+ $/xa
6168 && ($value == ($previous_value + ++$offset)))
6171 $start = $previous_start;
6172 $value = $previous_value;
6176 if (@annotation == 1) {
6177 $OUT[-1] = merge_single_annotation_line(
6178 $OUT[-1], $annotation[0], $comment_indent);
6181 push @OUT, @annotation;
6186 # Save the current values for the next time through
6188 $previous_start = $start;
6189 $previous_end = $end;
6190 $previous_value = $value;
6193 if ($write_as_invlist) {
6194 if ( $previous_end > 0
6195 && $output_range_counts{$addr})
6197 my $complement_count = $start - $previous_end - 1;
6198 if ($complement_count > 1) {
6199 $OUT[-1] = merge_single_annotation_line(
6204 . main::clarify_code_point_count(
6206 . "] in complement\n",
6211 # Inversion list format has a single number per line,
6212 # the starting code point of a range that matches the
6214 push @OUT, $start, "\n";
6217 # Add a comment with the size of the range, if
6219 if ($output_range_counts{$addr}) {
6220 $OUT[-1] = merge_single_annotation_line(
6223 . main::clarify_code_point_count($end - $start + 1)
6228 elsif ($start != $end) { # If there is a range
6229 if ($end == $MAX_WORKING_CODEPOINT) {
6230 push @OUT, sprintf "$hex_format\t$hex_format",
6232 $MAX_PLATFORM_CODEPOINT;
6235 push @OUT, sprintf "$hex_format\t$hex_format",
6238 if (length $value) {
6239 if ($convert_map_to_from_hex) {
6240 $OUT[-1] .= sprintf "\t$hex_format\n", $value;
6243 $OUT[-1] .= "\t$value\n";
6247 # Add a comment with the size of the range, if
6249 if ($output_range_counts{$addr}) {
6250 $OUT[-1] = merge_single_annotation_line(
6253 . main::clarify_code_point_count($end - $start + 1)
6258 else { # Here to output a single code point per line.
6260 # Use any passed in subroutine to output.
6261 if (ref $range_size_1 eq 'CODE') {
6262 for my $i ($start .. $end) {
6263 push @OUT, &{$range_size_1}($i, $value);
6268 # Here, caller is ok with default output.
6269 for (my $i = $start; $i <= $end; $i++) {
6270 if ($convert_map_to_from_hex) {
6272 sprintf "$hex_format\t\t$hex_format\n",
6276 push @OUT, sprintf $hex_format, $i;
6277 $OUT[-1] .= "\t\t$value" if $value ne "";
6285 for (my $i = $start; $i <= $end; $i++) {
6286 my $annotation = "";
6288 # Get character information if don't have it already
6289 main::populate_char_info($i)
6290 if ! defined $viacode[$i];
6291 my $type = $annotate_char_type[$i];
6293 # Figure out if should output the next code points
6294 # as part of a range or not. If this is not in an
6295 # annotation range, then won't output as a range,
6296 # so returns $i. Otherwise use the end of the
6297 # annotation range, but no further than the
6298 # maximum possible end point of the loop.
6303 $annotate_ranges->value_of($i) || $i,
6306 # Use a range if it is a range, and either is one
6307 # of the special annotation ranges, or the range
6308 # is at most 3 long. This last case causes the
6309 # algorithmically named code points to be output
6310 # individually in spans of at most 3, as they are
6311 # the ones whose $type is > 0.
6312 if ($range_end != $i
6313 && ( $type < 0 || $range_end - $i > 2))
6315 # Here is to output a range. We don't allow a
6316 # caller-specified output format--just use the
6318 my $range_name = $viacode[$i];
6320 # For the code points which end in their hex
6321 # value, we eliminate that from the output
6322 # annotation, and capitalize only the first
6323 # letter of each word.
6324 if ($type == $CP_IN_NAME) {
6325 my $hex = sprintf $hex_format, $i;
6326 $range_name =~ s/-$hex$//;
6327 my @words = split " ", $range_name;
6328 for my $word (@words) {
6330 ucfirst(lc($word)) if $word ne 'CJK';
6332 $range_name = join " ", @words;
6334 elsif ($type == $HANGUL_SYLLABLE) {
6335 $range_name = "Hangul Syllable";
6338 # If the annotation would just repeat what's
6339 # already being output as the range, skip it.
6340 # (When an inversion list is being written, it
6341 # isn't a repeat, as that always is in
6343 if ( $write_as_invlist
6345 || $range_end < $end)
6347 if ($range_end < $MAX_WORKING_CODEPOINT)
6349 $annotation = sprintf "%04X..%04X",
6353 $annotation = sprintf "%04X..INFINITY",
6357 else { # Indent if not displaying code points
6358 $annotation = " " x 4;
6362 $annotation .= " $age[$i]" if $age[$i];
6363 $annotation .= " $range_name";
6366 # Include the number of code points in the
6369 main::clarify_code_point_count($range_end - $i + 1);
6370 $annotation .= " [$count]\n";
6372 # Skip to the end of the range
6375 else { # Not in a range.
6378 # When outputting the names of each character,
6379 # use the character itself if printable
6380 $comment .= "'" . main::display_chr($i) . "' "
6383 my $output_value = $value;
6385 # Determine the annotation
6386 if ($format eq $DECOMP_STRING_FORMAT) {
6388 # This is very specialized, with the type
6389 # of decomposition beginning the line
6390 # enclosed in <...>, and the code points
6391 # that the code point decomposes to
6392 # separated by blanks. Create two
6393 # strings, one of the printable
6394 # characters, and one of their official
6396 (my $map = $output_value)
6397 =~ s/ \ * < .*? > \ +//x;
6401 foreach my $to (split " ", $map) {
6402 $to = CORE::hex $to;
6403 $to_name .= " + " if $to_name;
6404 $to_chr .= main::display_chr($to);
6405 main::populate_char_info($to)
6406 if ! defined $viacode[$to];
6407 $to_name .= $viacode[$to];
6411 "=> '$to_chr'; $viacode[$i] => $to_name";
6414 $output_value += $i - $start
6416 # Don't try to adjust a
6418 && $output_value !~ /[-\D]/;
6420 if ($output_map_in_hex) {
6421 main::populate_char_info($output_value)
6422 if ! defined $viacode[$output_value];
6424 . main::display_chr($output_value)
6425 . "'; " if $printable[$output_value];
6427 if ($include_name && $viacode[$i]) {
6428 $comment .= " " if $comment;
6429 $comment .= $viacode[$i];
6431 if ($output_map_in_hex) {
6433 " => $viacode[$output_value]"
6434 if $viacode[$output_value];
6435 $output_value = sprintf($hex_format,
6441 $annotation = sprintf "%04X %s", $i, $age[$i];
6442 if ($use_adjustments) {
6443 $annotation .= " => $output_value";
6447 if ($comment ne "") {
6448 $annotation .= " " if $annotation ne "";
6449 $annotation .= $comment;
6451 $annotation .= "\n" if $annotation ne "";
6454 if ($annotation ne "") {
6455 push @annotation, (" " x $comment_indent)
6460 # If not adjusting, we don't have to go through the
6461 # loop again to know that the annotation comes next
6463 if (! $use_adjustments) {
6464 if (@annotation == 1) {
6465 $OUT[-1] = merge_single_annotation_line(
6466 $OUT[-1], $annotation[0], $comment_indent);
6469 push @OUT, map { Text::Tabs::unexpand $_ }
6476 # Add the beginning of the range that doesn't match the
6477 # property, except if the just added match range extends
6478 # to infinity. We do this after any annotations for the
6480 if ($write_as_invlist && $end < $MAX_WORKING_CODEPOINT) {
6481 push @OUT, $end + 1, "\n";
6485 # If we split the range, set up so the next time through
6486 # we get the remainder, and redo.
6488 $start = $next_start;
6490 $value = $next_value;
6494 } # End of redo block
6495 } # End of loop through all the table's ranges
6497 push @OUT, @annotation; # Add orphaned annotation, if any
6499 splice @OUT, 1, 0, "V$invlist_count\n" if $invlist_count;
6502 # Add anything that goes after the main body, but within the here
6504 my $append_to_body = $self->append_to_body;
6505 push @OUT, $append_to_body if $append_to_body;
6507 # And finish the here document.
6510 # Done with the main portion of the body. Can now figure out what
6511 # should appear before it in the file.
6512 my $pre_body = $self->pre_body;
6513 push @HEADER, $pre_body, "\n" if $pre_body;
6515 # All these files should have a .pl suffix added to them.
6516 my @file_with_pl = @{$file_path{$addr}};
6517 $file_with_pl[-1] .= '.pl';
6519 main::write(\@file_with_pl,
6520 $annotate, # utf8 iff annotating
6526 sub set_status($self, $status, $info) { # Set the table's status
6527 # status The status enum value
6528 # info Any message associated with it.
6529 my $addr = pack 'J', refaddr $self;
6531 $status{$addr} = $status;
6532 $status_info{$addr} = $info;
6536 sub set_fate($self, $fate, $reason=undef) { # Set the fate of a table
6537 my $addr = pack 'J', refaddr $self;
6539 return if $fate{$addr} == $fate; # If no-op
6541 # Can only change the ordinary fate, except if going to $MAP_PROXIED
6542 return if $fate{$addr} != $ORDINARY && $fate != $MAP_PROXIED;
6544 $fate{$addr} = $fate;
6546 # Don't document anything to do with a non-normal fated table
6547 if ($fate != $ORDINARY) {
6548 my $put_in_pod = ($fate == $MAP_PROXIED) ? 1 : 0;
6549 foreach my $alias ($self->aliases) {
6550 $alias->set_ucd($put_in_pod);
6552 # MAP_PROXIED doesn't affect the match tables
6553 next if $fate == $MAP_PROXIED;
6554 $alias->set_make_re_pod_entry($put_in_pod);
6558 # Save the reason for suppression for output
6559 if ($fate >= $SUPPRESSED) {
6560 $reason = "" unless defined $reason;
6561 $why_suppressed{$complete_name{$addr}} = $reason;
6568 # Don't allow changes to the table from now on. This stores a stack
6569 # trace of where it was called, so that later attempts to modify it
6570 # can immediately show where it got locked.
6571 my $addr = pack 'J', refaddr $self;
6573 $locked{$addr} = "";
6575 my $line = (caller(0))[2];
6578 # Accumulate the stack trace
6580 my ($pkg, $file, $caller_line, $caller) = caller $i++;
6582 last unless defined $caller;
6584 $locked{$addr} .= " called from $caller() at line $line\n";
6585 $line = $caller_line;
6587 $locked{$addr} .= " called from main at line $line\n";
6592 sub carp_if_locked($self) {
6593 # Return whether a table is locked or not, and, by the way, complain
6595 my $addr = pack 'J', refaddr $self;
6597 return 0 if ! $locked{$addr};
6598 Carp::my_carp_bug("Can't modify a locked table. Stack trace of locking:\n$locked{$addr}\n\n");
6602 sub set_file_path($self, @path) { # Set the final directory path for this table
6603 @{$file_path{pack 'J', refaddr $self}} = @path;
6607 # Accessors for the range list stored in this table. First for
6616 matches_identically_to
6629 return $self->_range_list->$sub(@_);
6633 # Then for ones that should fail if locked
6643 return if $self->carp_if_locked;
6645 return $self->_range_list->$sub(@_);
6652 use parent '-norequire', '_Base_Table';
6654 # A Map Table is a table that contains the mappings from code points to
6655 # values. There are two weird cases:
6656 # 1) Anomalous entries are ones that aren't maps of ranges of code points, but
6657 # are written in the table's file at the end of the table nonetheless. It
6658 # requires specially constructed code to handle these; utf8.c can not read
6659 # these in, so they should not go in $map_directory. As of this writing,
6660 # the only case that these happen is for named sequences used in
6661 # charnames.pm. But this code doesn't enforce any syntax on these, so
6662 # something else could come along that uses it.
6663 # 2) Specials are anything that doesn't fit syntactically into the body of the
6664 # table. The ranges for these have a map type of non-zero. The code below
6665 # knows about and handles each possible type. In most cases, these are
6666 # written as part of the header.
6668 # A map table deliberately can't be manipulated at will unlike match tables.
6669 # This is because of the ambiguities having to do with what to do with
6670 # overlapping code points. And there just isn't a need for those things;
6671 # what one wants to do is just query, add, replace, or delete mappings, plus
6672 # write the final result.
6673 # However, there is a method to get the list of possible ranges that aren't in
6674 # this table to use for defaulting missing code point mappings. And,
6675 # map_add_or_replace_non_nulls() does allow one to add another table to this
6676 # one, but it is clearly very specialized, and defined that the other's
6677 # non-null values replace this one's if there is any overlap.
6679 sub trace { return main::trace(@_); }
6683 main::setup_package();
6686 # Many input files omit some entries; this gives what the mapping for the
6687 # missing entries should be
6688 main::set_access('default_map', \%default_map, 'r');
6690 my %anomalous_entries;
6691 # Things that go in the body of the table which don't fit the normal
6692 # scheme of things, like having a range. Not much can be done with these
6693 # once there except to output them. This was created to handle named
6695 main::set_access('anomalous_entry', \%anomalous_entries, 'a');
6696 main::set_access('anomalous_entries', # Append singular, read plural
6697 \%anomalous_entries,
6700 # Enum as to whether or not to write out this map table, and how:
6702 # $EXTERNAL_MAP means its existence is noted in the documentation, and
6703 # it should not be removed nor its format changed. This
6704 # is done for those files that have traditionally been
6706 # $INTERNAL_MAP means Perl reserves the right to do anything it wants
6708 # $OUTPUT_ADJUSTED means that it is an $INTERNAL_MAP, and instead of
6709 # outputting the actual mappings as-is, we adjust things
6710 # to create a much more compact table. Only those few
6711 # tables where the mapping is convertible at least to an
6712 # integer and compacting makes a big difference should
6713 # have this. Hence, the default is to not do this
6714 # unless the table's default mapping is to $CODE_POINT,
6715 # and the range size is not 1.
6716 main::set_access('to_output_map', \%to_output_map, 's');
6724 # Optional initialization data for the table.
6725 my $initialize = delete $args{'Initialize'};
6727 my $default_map = delete $args{'Default_Map'};
6728 my $property = delete $args{'_Property'};
6729 my $full_name = delete $args{'Full_Name'};
6730 my $to_output_map = delete $args{'To_Output_Map'};
6732 # Rest of parameters passed on
6734 my $range_list = Range_Map->new(Owner => $property);
6736 my $self = $class->SUPER::new(
6738 Complete_Name => $full_name,
6739 Full_Name => $full_name,
6740 _Property => $property,
6741 _Range_List => $range_list,
6742 Write_As_Invlist => 0,
6745 my $addr = pack 'J', refaddr $self;
6747 $anomalous_entries{$addr} = [];
6748 $default_map{$addr} = $default_map;
6749 $to_output_map{$addr} = $to_output_map;
6751 $self->initialize($initialize) if defined $initialize;
6758 qw("") => "_operator_stringify",
6761 sub _operator_stringify($self, $other="", $reversed=0) {
6763 my $name = $self->property->full_name;
6764 $name = '""' if $name eq "";
6765 return "Map table for Property '$name'";
6769 # Add a synonym for this table (which means the property itself)
6772 # Rest of parameters passed on.
6774 $self->SUPER::add_alias($name, $self->property, @_);
6779 # Add a range of code points to the list of specially-handled code
6780 # points. 0 is assumed if the type of special is not passed
6789 my $type = delete $args{'Type'} || 0;
6790 # Rest of parameters passed on
6792 # Can't change the table if locked.
6793 return if $self->carp_if_locked;
6795 $self->_range_list->add_map($lower, $upper,
6802 sub append_to_body($self) {
6803 # Adds to the written HERE document of the table's body any anomalous
6804 # entries in the table..
6805 my $addr = pack 'J', refaddr $self;
6807 return "" unless @{$anomalous_entries{$addr}};
6808 return join("\n", @{$anomalous_entries{$addr}}) . "\n";
6811 sub map_add_or_replace_non_nulls($self, $other) {
6812 # This adds the mappings in the table $other to $self. Non-null
6813 # mappings from $other override those in $self. It essentially merges
6814 # the two tables, with the second having priority except for null
6816 return if $self->carp_if_locked;
6818 if (! $other->isa(__PACKAGE__)) {
6819 Carp::my_carp_bug("$other should be a "
6827 local $to_trace = 0 if main::DEBUG;
6829 my $self_range_list = $self->_range_list;
6830 my $other_range_list = $other->_range_list;
6831 foreach my $range ($other_range_list->ranges) {
6832 my $value = $range->value;
6833 next if $value eq "";
6834 $self_range_list->_add_delete('+',
6838 Type => $range->type,
6839 Replace => $UNCONDITIONALLY);
6845 sub set_default_map($self, $map, $use_full_name=0) {
6846 # Define what code points that are missing from the input files should
6847 # map to. The optional second parameter 'full_name' indicates to
6848 # force using the full name of the map instead of its standard name.
6849 if ($use_full_name && $use_full_name ne 'full_name') {
6850 Carp::my_carp_bug("Second parameter to set_default_map() if"
6851 . " present, must be 'full_name'");
6854 my $addr = pack 'J', refaddr $self;
6856 # Convert the input to the standard equivalent, if any (won't have any
6857 # for $STRING properties)
6858 my $standard = $self->property->table($map);
6859 if (defined $standard) {
6860 $map = ($use_full_name)
6861 ? $standard->full_name
6865 # Warn if there already is a non-equivalent default map for this
6866 # property. Note that a default map can be a ref, which means that
6867 # what it actually means is delayed until later in the program, and it
6868 # IS permissible to override it here without a message.
6869 my $default_map = $default_map{$addr};
6870 if (defined $default_map
6871 && ! ref($default_map)
6872 && $default_map ne $map
6873 && main::Standardize($map) ne $default_map)
6875 my $property = $self->property;
6876 my $map_table = $property->table($map);
6877 my $default_table = $property->table($default_map);
6878 if (defined $map_table
6879 && defined $default_table
6880 && $map_table != $default_table)
6882 Carp::my_carp("Changing the default mapping for "
6884 . " from $default_map to $map'");
6888 $default_map{$addr} = $map;
6890 # Don't also create any missing table for this map at this point,
6891 # because if we did, it could get done before the main table add is
6892 # done for PropValueAliases.txt; instead the caller will have to make
6893 # sure it exists, if desired.
6897 sub to_output_map($self) {
6898 # Returns boolean: should we write this map table?
6899 my $addr = pack 'J', refaddr $self;
6901 # If overridden, use that
6902 return $to_output_map{$addr} if defined $to_output_map{$addr};
6904 my $full_name = $self->full_name;
6905 return $global_to_output_map{$full_name}
6906 if defined $global_to_output_map{$full_name};
6908 # If table says to output, do so; if says to suppress it, do so.
6909 my $fate = $self->fate;
6910 return $INTERNAL_MAP if $fate == $INTERNAL_ONLY;
6911 return $EXTERNAL_MAP if grep { $_ eq $full_name } @output_mapped_properties;
6912 return 0 if $fate == $SUPPRESSED || $fate == $MAP_PROXIED;
6914 my $type = $self->property->type;
6916 # Don't want to output binary map tables even for debugging.
6917 return 0 if $type == $BINARY;
6919 # But do want to output string ones. All the ones that remain to
6920 # be dealt with (i.e. which haven't explicitly been set to external)
6921 # are for internal Perl use only. The default for those that map to
6922 # $CODE_POINT and haven't been restricted to a single element range
6923 # is to use the adjusted form.
6924 if ($type == $STRING) {
6925 return $INTERNAL_MAP if $self->range_size_1
6926 || $default_map{$addr} ne $CODE_POINT;
6927 return $OUTPUT_ADJUSTED;
6930 # Otherwise is an $ENUM, do output it, for Perl's purposes
6931 return $INTERNAL_MAP;
6934 sub inverse_list($self) {
6935 # Returns a Range_List that is gaps of the current table. That is,
6937 my $current = Range_List->new(Initialize => $self->_range_list,
6938 Owner => $self->property);
6943 my $return = $self->SUPER::header();
6945 if ($self->to_output_map >= $INTERNAL_MAP) {
6946 $return .= $INTERNAL_ONLY_HEADER;
6949 # Other properties have fixed formats.
6950 my $property_name = $self->property->full_name;
6954 # !!!!!!! IT IS DEPRECATED TO USE THIS FILE !!!!!!!
6956 # This file is for internal use by core Perl only. It is retained for
6957 # backwards compatibility with applications that may have come to rely on it,
6958 # but its format and even its name or existence are subject to change without
6959 # notice in a future Perl version. Don't use it directly. Instead, its
6960 # contents are now retrievable through a stable API in the Unicode::UCD
6961 # module: Unicode::UCD::prop_invmap('$property_name') (Values for individual
6962 # code points can be retrieved via Unicode::UCD::charprop());
6968 sub set_final_comment($self) {
6969 # Just before output, create the comment that heads the file
6970 # containing this table.
6972 return unless $debugging_build;
6974 # No sense generating a comment if aren't going to write it out.
6975 return if ! $self->to_output_map;
6977 my $addr = pack 'J', refaddr $self;
6979 my $property = $self->property;
6981 # Get all the possible names for this property. Don't use any that
6982 # aren't ok for use in a file name, etc. This is perhaps causing that
6983 # flag to do double duty, and may have to be changed in the future to
6984 # have our own flag for just this purpose; but it works now to exclude
6985 # Perl generated synonyms from the lists for properties, where the
6986 # name is always the proper Unicode one.
6987 my @property_aliases = grep { $_->ok_as_filename } $self->aliases;
6989 my $count = $self->count;
6990 my $default_map = $default_map{$addr};
6992 # The ranges that map to the default aren't output, so subtract that
6993 # to get those actually output. A property with matching tables
6994 # already has the information calculated.
6995 if ($property->type != $STRING && $property->type != $FORCED_BINARY) {
6996 $count -= $property->table($default_map)->count;
6998 elsif (defined $default_map) {
7000 # But for $STRING properties, must calculate now. Subtract the
7001 # count from each range that maps to the default.
7002 foreach my $range ($self->_range_list->ranges) {
7003 if ($range->value eq $default_map) {
7004 $count -= $range->end +1 - $range->start;
7010 # Get a string version of $count with underscores in large numbers,
7012 my $string_count = main::clarify_code_point_count($count);
7014 my $code_points = ($count == 1)
7015 ? 'single code point'
7016 : "$string_count code points";
7021 if (@property_aliases <= 1) {
7022 $mapping = 'mapping';
7023 $these_mappings = 'this mapping';
7027 $mapping = 'synonymous mappings';
7028 $these_mappings = 'these mappings';
7032 if ($count >= $MAX_UNICODE_CODEPOINTS) {
7033 $cp = "any code point in Unicode Version $string_version";
7037 if ($default_map eq "") {
7038 $map_to = 'the empty string';
7040 elsif ($default_map eq $CODE_POINT) {
7044 $map_to = "'$default_map'";
7047 $cp = "the single code point";
7050 $cp = "one of the $code_points";
7052 $cp .= " in Unicode Version $unicode_version for which the mapping is not to $map_to";
7057 my $status = $self->status;
7058 if ($status ne $NORMAL) {
7059 my $warn = uc $status_past_participles{$status};
7062 !!!!!!! $warn !!!!!!!!!!!!!!!!!!!
7063 All property or property=value combinations contained in this file are $warn.
7064 See $unicode_reference_url for what this means.
7068 $comment .= "This file returns the $mapping:\n";
7070 my $ucd_accessible_name = "";
7071 my $has_underscore_name = 0;
7072 my $full_name = $self->property->full_name;
7073 for my $i (0 .. @property_aliases - 1) {
7074 my $name = $property_aliases[$i]->name;
7075 $has_underscore_name = 1 if $name =~ /^_/;
7076 $comment .= sprintf("%-8s%s\n", " ", $name . '(cp)');
7077 if ($property_aliases[$i]->ucd) {
7078 if ($name eq $full_name) {
7079 $ucd_accessible_name = $full_name;
7081 elsif (! $ucd_accessible_name) {
7082 $ucd_accessible_name = $name;
7086 $comment .= "\nwhere 'cp' is $cp.";
7087 if ($ucd_accessible_name) {
7088 $comment .= " Note that $these_mappings";
7089 if ($has_underscore_name) {
7090 $comment .= " (except for the one(s) that begin with an underscore)";
7092 $comment .= " $are accessible via the functions prop_invmap('$full_name') or charprop() in Unicode::UCD";
7096 # And append any commentary already set from the actual property.
7097 $comment .= "\n\n" . $self->comment if $self->comment;
7098 if ($self->description) {
7099 $comment .= "\n\n" . join " ", $self->description;
7102 $comment .= "\n\n" . join " ", $self->note;
7106 if (! $self->perl_extension) {
7109 For information about what this property really means, see:
7110 $unicode_reference_url
7114 if ($count) { # Format differs for empty table
7115 $comment.= "\nThe format of the ";
7116 if ($self->range_size_1) {
7118 main body of lines of this file is: CODE_POINT\\t\\tMAPPING where CODE_POINT
7119 is in hex; MAPPING is what CODE_POINT maps to.
7124 # There are tables which end up only having one element per
7125 # range, but it is not worth keeping track of for making just
7126 # this comment a little better.
7128 non-comment portions of the main body of lines of this file is:
7129 START\\tSTOP\\tMAPPING where START is the starting code point of the
7130 range, in hex; STOP is the ending point, or if omitted, the range has just one
7131 code point; MAPPING is what each code point between START and STOP maps to.
7133 if ($self->output_range_counts) {
7135 Numbers in comments in [brackets] indicate how many code points are in the
7136 range (omitted when the range is a single code point or if the mapping is to
7142 $self->set_comment(main::join_lines($comment));
7146 my %swash_keys; # Makes sure don't duplicate swash names.
7148 # The remaining variables are temporaries used while writing each table,
7149 # to output special ranges.
7150 my @multi_code_point_maps; # Map is to more than one code point.
7152 sub handle_special_range($self, $range) {
7153 # Called in the middle of write when it finds a range it doesn't know
7156 my $addr = pack 'J', refaddr $self;
7158 my $type = $range->type;
7160 my $low = $range->start;
7161 my $high = $range->end;
7162 my $map = $range->value;
7164 # No need to output the range if it maps to the default.
7165 return if $map eq $default_map{$addr};
7167 my $property = $self->property;
7169 # Switch based on the map type...
7170 if ($type == $HANGUL_SYLLABLE) {
7172 # These are entirely algorithmically determinable based on
7173 # some constants furnished by Unicode; for now, just set a
7174 # flag to indicate that have them. After everything is figured
7175 # out, we will output the code that does the algorithm. (Don't
7176 # output them if not needed because we are suppressing this
7178 $has_hangul_syllables = 1 if $property->to_output_map;
7180 elsif ($type == $CP_IN_NAME) {
7182 # Code points whose name ends in their code point are also
7183 # algorithmically determinable, but need information about the map
7184 # to do so. Both the map and its inverse are stored in data
7185 # structures output in the file. They are stored in the mean time
7186 # in global lists The lists will be written out later into Name.pm,
7187 # which is created only if needed. In order to prevent duplicates
7188 # in the list, only add to them for one property, should multiple
7190 if ($needing_code_points_ending_in_code_point == 0) {
7191 $needing_code_points_ending_in_code_point = $property;
7193 if ($property == $needing_code_points_ending_in_code_point) {
7194 push @{$names_ending_in_code_point{$map}->{'low'}}, $low;
7195 push @{$names_ending_in_code_point{$map}->{'high'}}, $high;
7197 my $squeezed = $map =~ s/[-\s]+//gr;
7198 push @{$loose_names_ending_in_code_point{$squeezed}->{'low'}},
7200 push @{$loose_names_ending_in_code_point{$squeezed}->{'high'}},
7203 # Calculate the set of legal characters in names of this
7204 # series. It includes every character in the name prefix.
7206 $legal{$_} = 1 for split //, $map;
7208 # Plus the hex code point chars, blank, and minus. Also \n
7209 # can show up as being required due to anchoring
7210 for my $i ('0' .. '9', 'A' .. 'F', '-', ' ', "\n") {
7213 my $legal = join "", sort { $a cmp $b } keys %legal;
7215 # The legal chars can be used in match optimizations
7216 push @code_points_ending_in_code_point, { low => $low,
7223 elsif ($range->type == $MULTI_CP || $range->type == $NULL) {
7225 # Multi-code point maps and null string maps have an entry
7226 # for each code point in the range. They use the same
7228 for my $code_point ($low .. $high) {
7230 # The pack() below can't cope with surrogates. XXX This may
7232 if ($code_point >= 0xD800 && $code_point <= 0xDFFF) {
7233 Carp::my_carp("Surrogate code point '$code_point' in mapping to '$map' in $self. No map created");
7237 # Generate the hash entries for these in the form that
7238 # utf8.c understands.
7242 foreach my $to (split " ", $map) {
7243 if ($to !~ /^$code_point_re$/) {
7244 Carp::my_carp("Illegal code point '$to' in mapping '$map' from $code_point in $self. No map created");
7247 $tostr .= sprintf "\\x{%s}", $to;
7248 $to = CORE::hex $to;
7250 $to_name .= " + " if $to_name;
7251 $to_chr .= main::display_chr($to);
7252 main::populate_char_info($to)
7253 if ! defined $viacode[$to];
7254 $to_name .= $viacode[$to];
7258 # The unpack yields a list of the bytes that comprise the
7259 # UTF-8 of $code_point, which are each placed in \xZZ format
7260 # and output in the %s to map to $tostr, so the result looks
7262 # "\xC4\xB0" => "\x{0069}\x{0307}",
7263 my $utf8 = sprintf(qq["%s" => "$tostr",],
7264 join("", map { sprintf "\\x%02X", $_ }
7265 unpack("U0C*", chr $code_point)));
7267 # Add a comment so that a human reader can more easily
7268 # see what's going on.
7269 push @multi_code_point_maps,
7270 sprintf("%-45s # U+%04X", $utf8, $code_point);
7272 $multi_code_point_maps[-1] .= " => $map";
7275 main::populate_char_info($code_point)
7276 if ! defined $viacode[$code_point];
7277 $multi_code_point_maps[-1] .= " '"
7278 . main::display_chr($code_point)
7279 . "' => '$to_chr'; $viacode[$code_point] => $to_name";
7284 Carp::my_carp("Unrecognized map type '$range->type' in '$range' in $self. Not written");
7290 sub pre_body($self) {
7291 # Returns the string that should be output in the file before the main
7292 # body of this table. It isn't called until the main body is
7293 # calculated, saving a pass. The string includes some hash entries
7294 # identifying the format of the body, and what the single value should
7295 # be for all ranges missing from it. It also includes any code points
7296 # which have map_types that don't go in the main table.
7298 my $addr = pack 'J', refaddr $self;
7300 my $name = $self->property->swash_name;
7302 # Currently there is nothing in the pre_body unless a swash is being
7304 return unless defined $name;
7306 if (defined $swash_keys{$name}) {
7307 Carp::my_carp(main::join_lines(<<END
7308 Already created a swash name '$name' for $swash_keys{$name}. This means that
7309 the same name desired for $self shouldn't be used. Bad News. This must be
7310 fixed before production use, but proceeding anyway
7314 $swash_keys{$name} = "$self";
7318 # Here we assume we were called after have gone through the whole
7319 # file. If we actually generated anything for each map type, add its
7320 # respective header and trailer
7321 my $specials_name = "";
7322 if (@multi_code_point_maps) {
7323 $specials_name = "Unicode::UCD::ToSpec$name";
7326 # Some code points require special handling because their mappings are each to
7327 # multiple code points. These do not appear in the main body, but are defined
7328 # in the hash below.
7330 # Each key is the string of N bytes that together make up the UTF-8 encoding
7331 # for the code point. (i.e. the same as looking at the code point's UTF-8
7332 # under "use bytes"). Each value is the UTF-8 of the translation, for speed.
7333 \%$specials_name = (
7335 $pre_body .= join("\n", @multi_code_point_maps) . "\n);\n";
7338 my $format = $self->format;
7342 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7343 if ($output_adjusted) {
7344 if ($specials_name) {
7346 # The mappings in the non-hash portion of this file must be modified to get the
7347 # correct values by adding the code point ordinal number to each one that is
7353 # The mappings must be modified to get the correct values by adding the code
7354 # point ordinal number to each one that is numeric.
7361 # The name this table is to be known by, with the format of the mappings in
7362 # the main body of the table, and what all code points missing from this file
7364 \$Unicode::UCD::SwashInfo{'To$name'}{'format'} = '$format'; # $map_table_formats{$format}
7366 if ($specials_name) {
7368 \$Unicode::UCD::SwashInfo{'To$name'}{'specials_name'} = '$specials_name'; # Name of hash of special mappings
7371 my $default_map = $default_map{$addr};
7373 # For $CODE_POINT default maps and using adjustments, instead the default
7375 $return .= "\$Unicode::UCD::SwashInfo{'To$name'}{'missing'} = '"
7376 . (($output_adjusted && $default_map eq $CODE_POINT)
7381 if ($default_map eq $CODE_POINT) {
7382 $return .= ' # code point maps to itself';
7384 elsif ($default_map eq "") {
7385 $return .= ' # code point maps to the empty string';
7389 $return .= $pre_body;
7395 # Write the table to the file.
7397 my $addr = pack 'J', refaddr $self;
7399 # Clear the temporaries
7400 undef @multi_code_point_maps;
7402 # Calculate the format of the table if not already done.
7403 my $format = $self->format;
7404 my $type = $self->property->type;
7405 my $default_map = $self->default_map;
7406 if (! defined $format) {
7407 if ($type == $BINARY) {
7409 # Don't bother checking the values, because we elsewhere
7410 # verify that a binary table has only 2 values.
7411 $format = $BINARY_FORMAT;
7414 my @ranges = $self->_range_list->ranges;
7416 # default an empty table based on its type and default map
7419 # But it turns out that the only one we can say is a
7420 # non-string (besides binary, handled above) is when the
7421 # table is a string and the default map is to a code point
7422 if ($type == $STRING && $default_map eq $CODE_POINT) {
7423 $format = $HEX_FORMAT;
7426 $format = $STRING_FORMAT;
7431 # Start with the most restrictive format, and as we find
7432 # something that doesn't fit with that, change to the next
7433 # most restrictive, and so on.
7434 $format = $DECIMAL_FORMAT;
7435 foreach my $range (@ranges) {
7436 next if $range->type != 0; # Non-normal ranges don't
7437 # affect the main body
7438 my $map = $range->value;
7439 if ($map ne $default_map) {
7440 last if $format eq $STRING_FORMAT; # already at
7443 $format = $INTEGER_FORMAT
7444 if $format eq $DECIMAL_FORMAT
7445 && $map !~ / ^ [0-9] $ /x;
7446 $format = $FLOAT_FORMAT
7447 if $format eq $INTEGER_FORMAT
7448 && $map !~ / ^ -? [0-9]+ $ /x;
7449 $format = $RATIONAL_FORMAT
7450 if $format eq $FLOAT_FORMAT
7451 && $map !~ / ^ -? [0-9]+ \. [0-9]* $ /x;
7452 $format = $HEX_FORMAT
7453 if ($format eq $RATIONAL_FORMAT
7455 m/ ^ -? [0-9]+ ( \/ [0-9]+ )? $ /x)
7456 # Assume a leading zero means hex,
7457 # even if all digits are 0-9
7458 || ($format eq $INTEGER_FORMAT
7459 && $map =~ /^0[0-9A-F]/);
7460 $format = $STRING_FORMAT if $format eq $HEX_FORMAT
7461 && $map =~ /[^0-9A-F]/;
7466 } # end of calculating format
7468 if ($default_map eq $CODE_POINT
7469 && $format ne $HEX_FORMAT
7470 && ! defined $self->format) # manual settings are always
7473 Carp::my_carp_bug("Expecting hex format for mapping table for $self, instead got '$format'")
7476 # If the output is to be adjusted, the format of the table that gets
7477 # output is actually 'a' or 'ax' instead of whatever it is stored
7479 my $output_adjusted = ($self->to_output_map == $OUTPUT_ADJUSTED);
7480 if ($output_adjusted) {
7481 if ($default_map eq $CODE_POINT) {
7482 $format = $HEX_ADJUST_FORMAT;
7485 $format = $ADJUST_FORMAT;
7489 $self->_set_format($format);
7491 return $self->SUPER::write(
7493 $default_map); # don't write defaulteds
7496 # Accessors for the underlying list that should fail if locked.
7507 return if $self->carp_if_locked;
7508 return $self->_range_list->$sub(@_);
7511 } # End closure for Map_Table
7513 package Match_Table;
7514 use parent '-norequire', '_Base_Table';
7516 # A Match table is one which is a list of all the code points that have
7517 # the same property and property value, for use in \p{property=value}
7518 # constructs in regular expressions. It adds very little data to the base
7519 # structure, but many methods, as these lists can be combined in many ways to
7521 # There are only a few concepts added:
7522 # 1) Equivalents and Relatedness.
7523 # Two tables can match the identical code points, but have different names.
7524 # This always happens when there is a perl single form extension
7525 # \p{IsProperty} for the Unicode compound form \P{Property=True}. The two
7526 # tables are set to be related, with the Perl extension being a child, and
7527 # the Unicode property being the parent.
7529 # It may be that two tables match the identical code points and we don't
7530 # know if they are related or not. This happens most frequently when the
7531 # Block and Script properties have the exact range. But note that a
7532 # revision to Unicode could add new code points to the script, which would
7533 # now have to be in a different block (as the block was filled, or there
7534 # would have been 'Unknown' script code points in it and they wouldn't have
7535 # been identical). So we can't rely on any two properties from Unicode
7536 # always matching the same code points from release to release, and thus
7537 # these tables are considered coincidentally equivalent--not related. When
7538 # two tables are unrelated but equivalent, one is arbitrarily chosen as the
7539 # 'leader', and the others are 'equivalents'. This concept is useful
7540 # to minimize the number of tables written out. Only one file is used for
7541 # any identical set of code points, with entries in UCD.pl mapping all
7542 # the involved tables to it.
7544 # Related tables will always be identical; we set them up to be so. Thus
7545 # if the Unicode one is deprecated, the Perl one will be too. Not so for
7546 # unrelated tables. Relatedness makes generating the documentation easier.
7549 # Like equivalents, two tables may be the inverses of each other, the
7550 # intersection between them is null, and the union is every Unicode code
7551 # point. The two tables that occupy a binary property are necessarily like
7552 # this. By specifying one table as the complement of another, we can avoid
7553 # storing it on disk (using the other table and performing a fast
7554 # transform), and some memory and calculations.
7556 # 3) Conflicting. It may be that there will eventually be name clashes, with
7557 # the same name meaning different things. For a while, there actually were
7558 # conflicts, but they have so far been resolved by changing Perl's or
7559 # Unicode's definitions to match the other, but when this code was written,
7560 # it wasn't clear that that was what was going to happen. (Unicode changed
7561 # because of protests during their beta period.) Name clashes are warned
7562 # about during compilation, and the documentation. The generated tables
7563 # are sane, free of name clashes, because the code suppresses the Perl
7564 # version. But manual intervention to decide what the actual behavior
7565 # should be may be required should this happen. The introductory comments
7566 # have more to say about this.
7568 # 4) Definition. This is a string for human consumption that specifies the
7569 # code points that this table matches. This is used only for the generated
7570 # pod file. It may be specified explicitly, or automatically computed.
7571 # Only the first portion of complicated definitions is computed and
7574 sub standardize { return main::standardize($_[0]); }
7575 sub trace { return main::trace(@_); }
7580 main::setup_package();
7583 # The leader table of this one; initially $self.
7584 main::set_access('leader', \%leader, 'r');
7587 # An array of any tables that have this one as their leader
7588 main::set_access('equivalents', \%equivalents, 'readable_array');
7591 # The parent table to this one, initially $self. This allows us to
7592 # distinguish between equivalent tables that are related (for which this
7593 # is set to), and those which may not be, but share the same output file
7594 # because they match the exact same set of code points in the current
7596 main::set_access('parent', \%parent, 'r');
7599 # An array of any tables that have this one as their parent
7600 main::set_access('children', \%children, 'readable_array');
7603 # Array of any tables that would have the same name as this one with
7604 # a different meaning. This is used for the generated documentation.
7605 main::set_access('conflicting', \%conflicting, 'readable_array');
7608 # Set in the constructor for tables that are expected to match all code
7610 main::set_access('matches_all', \%matches_all, 'r');
7613 # Points to the complement that this table is expressed in terms of; 0 if
7615 main::set_access('complement', \%complement, 'r');
7618 # Human readable string of the first few ranges of code points matched by
7620 main::set_access('definition', \%definition, 'r', 's');
7627 # The property for which this table is a listing of property values.
7628 my $property = delete $args{'_Property'};
7630 my $name = delete $args{'Name'};
7631 my $full_name = delete $args{'Full_Name'};
7632 $full_name = $name if ! defined $full_name;
7635 my $initialize = delete $args{'Initialize'};
7636 my $matches_all = delete $args{'Matches_All'} || 0;
7637 my $format = delete $args{'Format'};
7638 my $definition = delete $args{'Definition'} // "";
7639 # Rest of parameters passed on.
7641 my $range_list = Range_List->new(Initialize => $initialize,
7642 Owner => $property);
7644 my $complete = $full_name;
7645 $complete = '""' if $complete eq ""; # A null name shouldn't happen,
7646 # but this helps debug if it
7648 # The complete name for a match table includes it's property in a
7649 # compound form 'property=table', except if the property is the
7650 # pseudo-property, perl, in which case it is just the single form,
7651 # 'table' (If you change the '=' must also change the ':' in lots of
7652 # places in this program that assume an equal sign)
7653 $complete = $property->full_name . "=$complete" if $property != $perl;
7655 my $self = $class->SUPER::new(%args,
7657 Complete_Name => $complete,
7658 Full_Name => $full_name,
7659 _Property => $property,
7660 _Range_List => $range_list,
7661 Format => $EMPTY_FORMAT,
7662 Write_As_Invlist => 1,
7664 my $addr = pack 'J', refaddr $self;
7666 $conflicting{$addr} = [ ];
7667 $equivalents{$addr} = [ ];
7668 $children{$addr} = [ ];
7669 $matches_all{$addr} = $matches_all;
7670 $leader{$addr} = $self;
7671 $parent{$addr} = $self;
7672 $complement{$addr} = 0;
7673 $definition{$addr} = $definition;
7675 if (defined $format && $format ne $EMPTY_FORMAT) {
7676 Carp::my_carp_bug("'Format' must be '$EMPTY_FORMAT' in a match table instead of '$format'. Using '$EMPTY_FORMAT'");
7682 # See this program's beginning comment block about overloading these.
7685 qw("") => "_operator_stringify",
7689 return if $self->carp_if_locked;
7697 return $self->_range_list + $other;
7703 return $self->_range_list & $other;
7708 my $reversed = shift;
7711 Carp::my_carp_bug("Bad news. Can't cope with '"
7715 . "'. undef returned.");
7719 return if $self->carp_if_locked;
7723 # Change the range list of this table to be the
7725 $self->_set_range_list($self->_range_list
7728 else { # $other is just a simple value
7729 $self->add_range($other, $other);
7736 my $reversed = shift;
7739 Carp::my_carp_bug("Bad news. Can't cope with '"
7743 . "'. undef returned.");
7747 return if $self->carp_if_locked;
7748 $self->_set_range_list($self->_range_list & $other);
7751 '-' => sub { my $self = shift;
7753 my $reversed = shift;
7755 Carp::my_carp_bug("Bad news. Can't cope with '"
7759 . "'. undef returned.");
7763 return $self->_range_list - $other;
7765 '~' => sub { my $self = shift;
7766 return ~ $self->_range_list;
7770 sub _operator_stringify($self, $other="", $reversed=0) {
7772 my $name = $self->complete_name;
7773 return "Table '$name'";
7777 # Returns the range list associated with this table, which will be the
7778 # complement's if it has one.
7781 my $complement = $self->complement;
7783 # In order to avoid re-complementing on each access, only do the
7784 # complement the first time, and store the result in this table's
7785 # range list to use henceforth. However, this wouldn't work if the
7786 # controlling (complement) table changed after we do this, so lock it.
7787 # Currently, the value of the complement isn't needed until after it
7788 # is fully constructed, so this works. If this were to change, the
7789 # each_range iteration functionality would no longer work on this
7791 if ($complement != 0 && $self->SUPER::_range_list->count == 0) {
7792 $self->_set_range_list($self->SUPER::_range_list
7793 + ~ $complement->_range_list);
7797 return $self->SUPER::_range_list;
7801 # Add a synonym for this table. See the comments in the base class
7805 # Rest of parameters passed on.
7807 $self->SUPER::add_alias($name, $self, @_);
7811 sub add_conflicting {
7812 # Add the name of some other object to the list of ones that name
7813 # clash with this match table.
7816 my $conflicting_name = shift; # The name of the conflicting object
7817 my $p = shift || 'p'; # Optional, is this a \p{} or \P{} ?
7818 my $conflicting_object = shift; # Optional, the conflicting object
7819 # itself. This is used to
7820 # disambiguate the text if the input
7821 # name is identical to any of the
7822 # aliases $self is known by.
7823 # Sometimes the conflicting object is
7824 # merely hypothetical, so this has to
7825 # be an optional parameter.
7826 Carp::carp_extra_args(\@_) if main::DEBUG && @_;
7828 my $addr = pack 'J', refaddr $self;
7830 # Check if the conflicting name is exactly the same as any existing
7831 # alias in this table (as long as there is a real object there to
7832 # disambiguate with).
7833 if (defined $conflicting_object) {
7834 foreach my $alias ($self->aliases) {
7835 if (standardize($alias->name) eq standardize($conflicting_name)) {
7837 # Here, there is an exact match. This results in
7838 # ambiguous comments, so disambiguate by changing the
7839 # conflicting name to its object's complete equivalent.
7840 $conflicting_name = $conflicting_object->complete_name;
7846 # Convert to the \p{...} final name
7847 $conflicting_name = "\\$p" . "{$conflicting_name}";
7850 return if grep { $conflicting_name eq $_ } @{$conflicting{$addr}};
7852 push @{$conflicting{$addr}}, $conflicting_name;
7857 sub is_set_equivalent_to($self, $other=undef) {
7858 # Return boolean of whether or not the other object is a table of this
7859 # type and has been marked equivalent to this one.
7861 return 0 if ! defined $other; # Can happen for incomplete early
7863 unless ($other->isa(__PACKAGE__)) {
7864 my $ref_other = ref $other;
7865 my $ref_self = ref $self;
7866 Carp::my_carp_bug("Argument to 'is_set_equivalent_to' must be another $ref_self, not a '$ref_other'. $other not set equivalent to $self.");
7870 # Two tables are equivalent if they have the same leader.
7871 return $leader{pack 'J', refaddr $self} == $leader{pack 'J', refaddr $other};
7875 sub set_equivalent_to {
7876 # Set $self equivalent to the parameter table.
7877 # The required Related => 'x' parameter is a boolean indicating
7878 # whether these tables are related or not. If related, $other becomes
7879 # the 'parent' of $self; if unrelated it becomes the 'leader'
7881 # Related tables share all characteristics except names; equivalents
7882 # not quite so many.
7883 # If they are related, one must be a perl extension. This is because
7884 # we can't guarantee that Unicode won't change one or the other in a
7885 # later release even if they are identical now.
7891 my $related = delete $args{'Related'};
7893 Carp::carp_extra_args(\%args) if main::DEBUG && %args;
7895 return if ! defined $other; # Keep on going; happens in some early
7898 if (! defined $related) {
7899 Carp::my_carp_bug("set_equivalent_to must have 'Related => [01] parameter. Assuming $self is not related to $other");
7903 # If already are equivalent, no need to re-do it; if subroutine
7904 # returns null, it found an error, also do nothing
7905 my $are_equivalent = $self->is_set_equivalent_to($other);
7906 return if ! defined $are_equivalent || $are_equivalent;
7908 my $addr = pack 'J', refaddr $self;
7909 my $current_leader = ($related) ? $parent{$addr} : $leader{$addr};
7912 if ($current_leader->perl_extension) {
7913 if ($other->perl_extension) {
7914 Carp::my_carp_bug("Use add_alias() to set two Perl tables '$self' and '$other', equivalent.");
7917 } elsif ($self->property != $other->property # Depending on
7923 && ! $other->perl_extension
7925 # We allow the sc and scx properties to be marked as
7926 # related. They are in fact related, and this allows
7927 # the pod to show that better. This test isn't valid
7928 # if this is an early Unicode release without the scx
7929 # property (having that also implies the sc property
7930 # exists, so don't have to test for no 'sc')
7932 && ! ( ( $self->property == $script
7933 || $self->property == $scx)
7934 && ( $self->property == $script
7935 || $self->property == $scx))))
7937 Carp::my_carp_bug("set_equivalent_to should have 'Related => 0 for equivalencing two Unicode properties. Assuming $self is not related to $other");
7942 if (! $self->is_empty && ! $self->matches_identically_to($other)) {
7943 Carp::my_carp_bug("$self should be empty or match identically to $other. Not setting equivalent");
7947 my $leader = pack 'J', refaddr $current_leader;
7948 my $other_addr = pack 'J', refaddr $other;
7950 # Any tables that are equivalent to or children of this table must now
7951 # instead be equivalent to or (children) to the new leader (parent),
7952 # still equivalent. The equivalency includes their matches_all info,
7953 # and for related tables, their fate and status.
7954 # All related tables are of necessity equivalent, but the converse
7955 # isn't necessarily true
7956 my $status = $other->status;
7957 my $status_info = $other->status_info;
7958 my $fate = $other->fate;
7959 my $matches_all = $matches_all{other_addr};
7960 my $caseless_equivalent = $other->caseless_equivalent;
7961 foreach my $table ($current_leader, @{$equivalents{$leader}}) {
7962 next if $table == $other;
7963 trace "setting $other to be the leader of $table, status=$status" if main::DEBUG && $to_trace;
7965 my $table_addr = pack 'J', refaddr $table;
7966 $leader{$table_addr} = $other;
7967 $matches_all{$table_addr} = $matches_all;
7968 $self->_set_range_list($other->_range_list);
7969 push @{$equivalents{$other_addr}}, $table;
7971 $parent{$table_addr} = $other;
7972 push @{$children{$other_addr}}, $table;
7973 $table->set_status($status, $status_info);
7975 # This reason currently doesn't get exposed outside; otherwise
7976 # would have to look up the parent's reason and use it instead.
7977 $table->set_fate($fate, "Parent's fate");
7979 $self->set_caseless_equivalent($caseless_equivalent);
7983 # Now that we've declared these to be equivalent, any changes to one
7984 # of the tables would invalidate that equivalency.
7990 sub set_complement($self, $other) {
7991 # Set $self to be the complement of the parameter table. $self is
7992 # locked, as what it contains should all come from the other table.
7994 if ($other->complement != 0) {
7995 Carp::my_carp_bug("Can't set $self to be the complement of $other, which itself is the complement of " . $other->complement);
7998 $complement{pack 'J', refaddr $self} = $other;
8000 # Be sure the other property knows we are depending on them; or the
8001 # other table if it is one in the current property.
8002 if ($self->property != $other->property) {
8003 $other->property->set_has_dependency(1);
8006 $other->set_has_dependency(1);
8012 sub add_range($self, @range) { # Add a range to the list for this table.
8013 # Rest of parameters passed on
8015 return if $self->carp_if_locked;
8016 return $self->_range_list->add_range(@range);
8020 # All match tables are to be used only by the Perl core.
8021 return $self->SUPER::header() . $INTERNAL_ONLY_HEADER;
8024 sub pre_body { # Does nothing for match tables.
8028 sub append_to_body { # Does nothing for match tables.
8032 sub set_fate($self, $fate, $reason=undef) {
8033 $self->SUPER::set_fate($fate, $reason);
8035 # All children share this fate
8036 foreach my $child ($self->children) {
8037 $child->set_fate($fate, $reason);
8042 sub calculate_table_definition
8044 # Returns a human-readable string showing some or all of the code
8045 # points matched by this table. The string will include a
8046 # bracketed-character class for all characters matched in the 00-FF
8047 # range, and the first few ranges matched beyond that.
8051 my $definition = $self->definition || "";
8053 # Skip this if already have a definition.
8054 return $definition if $definition;
8056 my $lows_string = ""; # The string representation of the 0-FF
8058 my $string_range = ""; # The string rep. of the above FF ranges
8059 my $range_count = 0; # How many ranges in $string_rage
8061 my @lows_invlist; # The inversion list of the 0-FF code points
8062 my $first_non_control = ord(" "); # Everything below this is a
8063 # control, on ASCII or EBCDIC
8064 my $max_table_code_point = $self->max;
8066 # On ASCII platforms, the range 80-FF contains no printables.
8067 my $highest_printable = ((main::NON_ASCII_PLATFORM) ? 255 : 126);
8070 # Look through the first few ranges matched by this table.
8071 $self->reset_each_range; # Defensive programming
8072 while (defined (my $range = $self->each_range())) {
8073 my $start = $range->start;
8074 my $end = $range->end;
8076 # Accumulate an inversion list of the 00-FF code points
8077 if ($start < 256 && ($start > 0 || $end < 256)) {
8078 push @lows_invlist, $start;
8079 push @lows_invlist, 1 + (($end < 256) ? $end : 255);
8081 # Get next range if there are more ranges below 256
8082 next if $end < 256 && $end < $max_table_code_point;
8084 # If the range straddles the 255/256 boundary, we split it
8085 # there. We already added above the low portion to the
8087 $start = 256 if $end > 256;
8090 # Here, @lows_invlist contains the code points below 256, and
8091 # there is no other range, or the current one starts at or above
8092 # 256. Generate the [char class] for the 0-255 ones.
8093 while (@lows_invlist) {
8095 # If this range (necessarily the first one, by the way) starts
8097 if ($lows_invlist[0] == 0) {
8099 # If it ends within the block of controls, that means that
8100 # some controls are in it and some aren't. Since Unicode
8101 # properties pretty much only know about a few of the
8102 # controls, like \n, \t, this means that its one of them
8103 # that isn't in the range. Complement the inversion list
8104 # which will likely cause these to be output using their
8105 # mnemonics, hence being clearer.
8106 if ($lows_invlist[1] < $first_non_control) {
8107 $lows_string .= '^';
8108 shift @lows_invlist;
8109 push @lows_invlist, 256;
8111 elsif ($lows_invlist[1] <= $highest_printable) {
8113 # Here, it extends into the printables block. Split
8114 # into two ranges so that the controls are separate.
8115 $lows_string .= sprintf "\\x00-\\x%02x",
8116 $first_non_control - 1;
8117 $lows_invlist[0] = $first_non_control;
8121 # If the range completely contains the printables, don't
8122 # individually spell out the printables.
8123 if ( $lows_invlist[0] <= $first_non_control
8124 && $lows_invlist[1] > $highest_printable)
8126 $lows_string .= sprintf "\\x%02x-\\x%02x",
8127 $lows_invlist[0], $lows_invlist[1] - 1;
8128 shift @lows_invlist;
8129 shift @lows_invlist;
8133 # Here, the range may include some but not all printables.
8134 # Look at each one individually
8135 foreach my $ord (shift @lows_invlist .. shift(@lows_invlist) - 1) {
8136 my $char = chr $ord;
8138 # If there is already something in the list, an
8139 # alphanumeric char could be the next in sequence. If so,
8140 # we start or extend a range. That is, we could have so
8141 # far something like 'a-c', and the next char is a 'd', so
8142 # we change it to 'a-d'. We use native_to_unicode()
8143 # because a-z on EBCDIC means 26 chars, and excludes the
8145 if ($lows_string ne "" && $char =~ /[[:alnum:]]/) {
8146 my $prev = substr($lows_string, -1);
8147 if ( $prev !~ /[[:alnum:]]/
8148 || utf8::native_to_unicode(ord $prev) + 1
8149 != utf8::native_to_unicode(ord $char))
8151 # Not extending the range
8152 $lows_string .= $char;
8154 elsif ( length $lows_string > 1
8155 && substr($lows_string, -2, 1) eq '-')
8157 # We had a sequence like '-c' and the current
8158 # character is 'd'. Extend the range.
8159 substr($lows_string, -1, 1) = $char;
8162 # We had something like 'd' and this is 'e'.
8164 $lows_string .= "-$char";
8167 elsif ($char =~ /[[:graph:]]/) {
8169 # We output a graphic char as-is, preceded by a
8170 # backslash if it is a metacharacter
8171 $lows_string .= '\\'
8172 if $char =~ /[\\\^\$\@\%\|()\[\]\{\}\-\/"']/;
8173 $lows_string .= $char;
8174 } # Otherwise use mnemonic for any that have them
8175 elsif ($char =~ /[\a]/) {
8176 $lows_string .= '\a';
8178 elsif ($char =~ /[\b]/) {
8179 $lows_string .= '\b';
8181 elsif ($char eq "\e") {
8182 $lows_string .= '\e';
8184 elsif ($char eq "\f") {
8185 $lows_string .= '\f';
8187 elsif ($char eq "\cK") {
8188 $lows_string .= '\cK';
8190 elsif ($char eq "\n") {
8191 $lows_string .= '\n';
8193 elsif ($char eq "\r") {
8194 $lows_string .= '\r';
8196 elsif ($char eq "\t") {
8197 $lows_string .= '\t';
8201 # Here is a non-graphic without a mnemonic. We use \x
8202 # notation. But if the ordinal of this is one above
8203 # the previous, create or extend the range
8204 my $hex_representation = sprintf("%02x", ord $char);
8205 if ( length $lows_string >= 4
8206 && substr($lows_string, -4, 2) eq '\\x'
8207 && hex(substr($lows_string, -2)) + 1 == ord $char)
8209 if ( length $lows_string >= 5
8210 && substr($lows_string, -5, 1) eq '-'
8211 && ( length $lows_string == 5
8212 || substr($lows_string, -6, 1) ne '\\'))
8214 substr($lows_string, -2) = $hex_representation;
8217 $lows_string .= '-\\x' . $hex_representation;
8221 $lows_string .= '\\x' . $hex_representation;
8227 # Done with assembling the string of all lows. If there are only
8228 # lows in the property, are completely done.
8229 if ($max_table_code_point < 256) {
8230 $self->reset_each_range;
8234 # Otherwise, quit if reached max number of non-lows ranges. If
8235 # there are lows, count them as one unit towards the maximum.
8237 if ($range_count > (($lows_string eq "") ? $max_ranges : $max_ranges - 1)) {
8238 $string_range .= " ...";
8239 $self->reset_each_range;
8243 # Otherwise add this range.
8244 $string_range .= ", " if $string_range ne "";
8245 if ($start == $end) {
8246 $string_range .= sprintf("U+%04X", $start);
8248 elsif ($end >= $MAX_WORKING_CODEPOINT) {
8249 $string_range .= sprintf("U+%04X..infinity", $start);
8252 $string_range .= sprintf("U+%04X..%04X",
8257 # Done with all the ranges we're going to look at. Assemble the
8258 # definition from the lows + non-lows.
8260 if ($lows_string ne "" || $string_range ne "") {
8261 if ($lows_string ne "") {
8262 $definition .= "[$lows_string]";
8263 $definition .= ", " if $string_range;
8265 $definition .= $string_range;
8272 return $self->SUPER::write(0); # No adjustments
8275 # $leader - Should only be called on the leader table of an equivalent group
8276 sub set_final_comment($leader) {
8277 # This creates a comment for the file that is to hold the match table
8278 # $self. It is somewhat convoluted to make the English read nicely,
8279 # but, heh, it's just a comment.
8280 # This should be called only with the leader match table of all the
8281 # ones that share the same file. It lists all such tables, ordered so
8282 # that related ones are together.
8284 return unless $debugging_build;
8286 my $addr = pack 'J', refaddr $leader;
8288 if ($leader{$addr} != $leader) {
8289 Carp::my_carp_bug(<<END
8290 set_final_comment() must be called on a leader table, which $leader is not.
8291 It is equivalent to $leader{$addr}. No comment created
8297 # Get the number of code points matched by each of the tables in this
8298 # file, and add underscores for clarity.
8299 my $count = $leader->count;
8301 my $non_unicode_string;
8302 if ($count > $MAX_UNICODE_CODEPOINTS) {
8303 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
8304 - $MAX_UNICODE_CODEPOINT);
8305 $non_unicode_string = "All above-Unicode code points match as well, and are also returned";
8308 $unicode_count = $count;
8309 $non_unicode_string = "";
8311 my $string_count = main::clarify_code_point_count($unicode_count);
8313 my $loose_count = 0; # how many aliases loosely matched
8314 my $compound_name = ""; # ? Are any names compound?, and if so, an
8316 my $properties_with_compound_names = 0; # count of these
8319 my %flags; # The status flags used in the file
8320 my $total_entries = 0; # number of entries written in the comment
8321 my $matches_comment = ""; # The portion of the comment about the
8323 my @global_comments; # List of all the tables' comments that are
8324 # there before this routine was called.
8325 my $has_ucd_alias = 0; # If there is an alias that is accessible via
8326 # Unicode::UCD. If not, then don't say it is
8329 # Get list of all the parent tables that are equivalent to this one
8330 # (including itself).
8331 my @parents = grep { $parent{main::objaddr $_} == $_ }
8332 main::uniques($leader, @{$equivalents{$addr}});
8333 my $has_unrelated = (@parents >= 2); # boolean, ? are there unrelated
8335 for my $parent (@parents) {
8337 my $property = $parent->property;
8339 # Special case 'N' tables in properties with two match tables when
8340 # the other is a 'Y' one. These are likely to be binary tables,
8341 # but not necessarily. In either case, \P{} will match the
8342 # complement of \p{}, and so if something is a synonym of \p, the
8343 # complement of that something will be the synonym of \P. This
8344 # would be true of any property with just two match tables, not
8345 # just those whose values are Y and N; but that would require a
8346 # little extra work, and there are none such so far in Unicode.
8347 my $perl_p = 'p'; # which is it? \p{} or \P{}
8348 my @yes_perl_synonyms; # list of any synonyms for the 'Y' table
8350 if (scalar $property->tables == 2
8351 && $parent == $property->table('N')
8352 && defined (my $yes = $property->table('Y')))
8354 my $yes_addr = pack 'J', refaddr $yes;
8356 = grep { $_->property == $perl }
8359 $parent{$yes_addr}->children);
8361 # But these synonyms are \P{} ,not \p{}
8365 my @description; # Will hold the table description
8366 my @note; # Will hold the table notes.
8367 my @conflicting; # Will hold the table conflicts.
8369 # Look at the parent, any yes synonyms, and all the children
8370 my $parent_addr = pack 'J', refaddr $parent;
8371 for my $table ($parent,
8373 @{$children{$parent_addr}})
8375 my $table_addr = pack 'J', refaddr $table;
8376 my $table_property = $table->property;
8378 # Tables are separated by a blank line to create a grouping.
8379 $matches_comment .= "\n" if $matches_comment;
8381 # The table is named based on the property and value
8382 # combination it is for, like script=greek. But there may be
8383 # a number of synonyms for each side, like 'sc' for 'script',
8384 # and 'grek' for 'greek'. Any combination of these is a valid
8385 # name for this table. In this case, there are three more,
8386 # 'sc=grek', 'sc=greek', and 'script='grek'. Rather than
8387 # listing all possible combinations in the comment, we make
8388 # sure that each synonym occurs at least once, and add
8389 # commentary that the other combinations are possible.
8390 # Because regular expressions don't recognize things like
8391 # \p{jsn=}, only look at non-null right-hand-sides
8392 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table_property->aliases;
8393 my @table_aliases = grep { $_->name ne "" } $table->aliases;
8395 # The alias lists above are already ordered in the order we
8396 # want to output them. To ensure that each synonym is listed,
8397 # we must use the max of the two numbers. But if there are no
8398 # legal synonyms (nothing in @table_aliases), then we don't
8400 my $listed_combos = (@table_aliases)
8401 ? main::max(scalar @table_aliases,
8402 scalar @property_aliases)
8404 trace "$listed_combos, tables=", scalar @table_aliases, "; property names=", scalar @property_aliases if main::DEBUG;
8406 my $property_had_compound_name = 0;
8408 for my $i (0 .. $listed_combos - 1) {
8411 # The current alias for the property is the next one on
8412 # the list, or if beyond the end, start over. Similarly
8413 # for the table (\p{prop=table})
8414 my $property_alias = $property_aliases
8415 [$i % @property_aliases]->name;
8416 my $table_alias_object = $table_aliases
8417 [$i % @table_aliases];
8418 my $table_alias = $table_alias_object->name;
8419 my $loose_match = $table_alias_object->loose_match;
8420 $has_ucd_alias |= $table_alias_object->ucd;
8422 if ($table_alias !~ /\D/) { # Clarify large numbers.
8423 $table_alias = main::clarify_number($table_alias)
8426 # Add a comment for this alias combination
8427 my $current_match_comment;
8428 if ($table_property == $perl) {
8429 $current_match_comment = "\\$perl_p"
8433 $current_match_comment
8434 = "\\p{$property_alias=$table_alias}";
8435 $property_had_compound_name = 1;
8438 # Flag any abnormal status for this table.
8439 my $flag = $property->status
8441 || $table_alias_object->status;
8442 if ($flag && $flag ne $PLACEHOLDER) {
8443 $flags{$flag} = $status_past_participles{$flag};
8448 # Pretty up the comment. Note the \b; it says don't make
8449 # this line a continuation.
8450 $matches_comment .= sprintf("\b%-1s%-s%s\n",
8453 $current_match_comment);
8454 } # End of generating the entries for this table.
8456 # Save these for output after this group of related tables.
8457 push @description, $table->description;
8458 push @note, $table->note;
8459 push @conflicting, $table->conflicting;
8461 # And this for output after all the tables.
8462 push @global_comments, $table->comment;
8464 # Compute an alternate compound name using the final property
8465 # synonym and the first table synonym with a colon instead of
8466 # the equal sign used elsewhere.
8467 if ($property_had_compound_name) {
8468 $properties_with_compound_names ++;
8469 if (! $compound_name || @property_aliases > 1) {
8470 $compound_name = $property_aliases[-1]->name
8472 . $table_aliases[0]->name;
8475 } # End of looping through all children of this table
8477 # Here have assembled in $matches_comment all the related tables
8478 # to the current parent (preceded by the same info for all the
8479 # previous parents). Put out information that applies to all of
8480 # the current family.
8483 # But output the conflicting information now, as it applies to
8485 my $conflicting = join ", ", @conflicting;
8487 $matches_comment .= <<END;
8489 Note that contrary to what you might expect, the above is NOT the same as
8491 $matches_comment .= "any of: " if @conflicting > 1;
8492 $matches_comment .= "$conflicting\n";
8496 $matches_comment .= "\n Meaning: "
8497 . join('; ', @description)
8501 $matches_comment .= "\n Note: "
8502 . join("\n ", @note)
8505 } # End of looping through all tables
8507 $matches_comment .= "\n$non_unicode_string\n" if $non_unicode_string;
8513 if ($unicode_count == 1) {
8515 $code_points = 'single code point';
8519 $code_points = "$string_count code points";
8524 if ($total_entries == 1) {
8527 $any_of_these = 'this'
8530 $synonyms = " any of the following regular expression constructs";
8531 $entries = 'entries';
8532 $any_of_these = 'any of these'
8536 if ($has_ucd_alias) {
8537 $comment .= "Use Unicode::UCD::prop_invlist() to access the contents of this file.\n\n";
8539 if ($has_unrelated) {
8541 This file is for tables that are not necessarily related: To conserve
8542 resources, every table that matches the identical set of code points in this
8543 version of Unicode uses this file. Each one is listed in a separate group
8544 below. It could be that the tables will match the same set of code points in
8545 other Unicode releases, or it could be purely coincidence that they happen to
8546 be the same in Unicode $unicode_version, and hence may not in other versions.
8552 foreach my $flag (sort keys %flags) {
8554 '$flag' below means that this form is $flags{$flag}.
8556 if ($flag eq $INTERNAL_ALIAS) {
8557 $comment .= "DO NOT USE!!!";
8560 $comment .= "Consult $pod_file.pod";
8567 if ($total_entries == 0) {
8568 Carp::my_carp("No regular expression construct can match $leader, as all names for it are the null string. Creating file anyway.");
8570 This file returns the $code_points in Unicode Version
8571 $unicode_version for
8572 $leader, but it is inaccessible through Perl regular expressions, as
8573 "\\p{prop=}" is not recognized.
8578 This file returns the $code_points in Unicode Version
8579 $unicode_version that
8583 $pod_file.pod should be consulted for the syntax rules for $any_of_these,
8584 including if adding or subtracting white space, underscore, and hyphen
8585 characters matters or doesn't matter, and other permissible syntactic
8586 variants. Upper/lower case distinctions never matter.
8590 if ($compound_name) {
8593 A colon can be substituted for the equals sign, and
8595 if ($properties_with_compound_names > 1) {
8597 within each group above,
8600 $compound_name = sprintf("%-8s\\p{%s}", " ", $compound_name);
8602 # Note the \b below, it says don't make that line a continuation.
8604 anything to the left of the equals (or colon) can be combined with anything to
8605 the right. Thus, for example,
8611 # And append any comment(s) from the actual tables. They are all
8612 # gathered here, so may not read all that well.
8613 if (@global_comments) {
8614 $comment .= "\n" . join("\n\n", @global_comments) . "\n";
8617 if ($count) { # The format differs if no code points, and needs no
8618 # explanation in that case
8619 if ($leader->write_as_invlist) {
8622 The first data line of this file begins with the letter V to indicate it is in
8623 inversion list format. The number following the V gives the number of lines
8624 remaining. Each of those remaining lines is a single number representing the
8625 starting code point of a range which goes up to but not including the number
8626 on the next line; The 0th, 2nd, 4th... ranges are for code points that match
8627 the property; the 1st, 3rd, 5th... are ranges of code points that don't match
8628 the property. The final line's range extends to the platform's infinity.
8633 The format of the lines of this file is:
8634 START\\tSTOP\\twhere START is the starting code point of the range, in hex;
8635 STOP is the ending point, or if omitted, the range has just one code point.
8638 if ($leader->output_range_counts) {
8640 Numbers in comments in [brackets] indicate how many code points are in the
8646 $leader->set_comment(main::join_lines($comment));
8650 # Accessors for the underlying list
8652 get_valid_code_point
8653 get_invalid_code_point
8661 return $self->_range_list->$sub(@_);
8664 } # End closure for Match_Table
8668 # The Property class represents a Unicode property, or the $perl
8669 # pseudo-property. It contains a map table initialized empty at construction
8670 # time, and for properties accessible through regular expressions, various
8671 # match tables, created through the add_match_table() method, and referenced
8672 # by the table('NAME') or tables() methods, the latter returning a list of all
8673 # of the match tables. Otherwise table operations implicitly are for the map
8676 # Most of the data in the property is actually about its map table, so it
8677 # mostly just uses that table's accessors for most methods. The two could
8678 # have been combined into one object, but for clarity because of their
8679 # differing semantics, they have been kept separate. It could be argued that
8680 # the 'file' and 'directory' fields should be kept with the map table.
8682 # Each property has a type. This can be set in the constructor, or in the
8683 # set_type accessor, but mostly it is figured out by the data. Every property
8684 # starts with unknown type, overridden by a parameter to the constructor, or
8685 # as match tables are added, or ranges added to the map table, the data is
8686 # inspected, and the type changed. After the table is mostly or entirely
8687 # filled, compute_type() should be called to finalize the analysis.
8689 # There are very few operations defined. One can safely remove a range from
8690 # the map table, and property_add_or_replace_non_nulls() adds the maps from another
8691 # table to this one, replacing any in the intersection of the two.
8693 sub standardize { return main::standardize($_[0]); }
8694 sub trace { return main::trace(@_) if main::DEBUG && $to_trace }
8698 # This hash will contain as keys, all the aliases of all properties, and
8699 # as values, pointers to their respective property objects. This allows
8700 # quick look-up of a property from any of its names.
8701 my %alias_to_property_of;
8703 sub dump_alias_to_property_of {
8706 print "\n", main::simple_dumper (\%alias_to_property_of), "\n";
8710 sub property_ref($name) {
8711 # This is a package subroutine, not called as a method.
8712 # If the single parameter is a literal '*' it returns a list of all
8713 # defined properties.
8714 # Otherwise, the single parameter is a name, and it returns a pointer
8715 # to the corresponding property object, or undef if none.
8717 # Properties can have several different names. The 'standard' form of
8718 # each of them is stored in %alias_to_property_of as they are defined.
8719 # But it's possible that this subroutine will be called with some
8720 # variant, so if the initial lookup fails, it is repeated with the
8721 # standardized form of the input name. If found, besides returning the
8722 # result, the input name is added to the list so future calls won't
8723 # have to do the conversion again.
8725 if (! defined $name) {
8726 Carp::my_carp_bug("Undefined input property. No action taken.");
8730 return main::uniques(values %alias_to_property_of) if $name eq '*';
8732 # Return cached result if have it.
8733 my $result = $alias_to_property_of{$name};
8734 return $result if defined $result;
8736 # Convert the input to standard form.
8737 my $standard_name = standardize($name);
8739 $result = $alias_to_property_of{$standard_name};
8740 return unless defined $result; # Don't cache undefs
8742 # Cache the result before returning it.
8743 $alias_to_property_of{$name} = $result;
8748 main::setup_package();
8751 # A pointer to the map table object for this property
8752 main::set_access('map', \%map);
8755 # The property's full name. This is a duplicate of the copy kept in the
8756 # map table, but is needed because stringify needs it during
8757 # construction of the map table, and then would have a chicken before egg
8759 main::set_access('full_name', \%full_name, 'r');
8762 # This hash will contain as keys, all the aliases of any match tables
8763 # attached to this property, and as values, the pointers to their
8764 # respective tables. This allows quick look-up of a table from any of its
8766 main::set_access('table_ref', \%table_ref);
8769 # The type of the property, $ENUM, $BINARY, etc
8770 main::set_access('type', \%type, 'r');
8773 # The filename where the map table will go (if actually written).
8774 # Normally defaulted, but can be overridden.
8775 main::set_access('file', \%file, 'r', 's');
8778 # The directory where the map table will go (if actually written).
8779 # Normally defaulted, but can be overridden.
8780 main::set_access('directory', \%directory, 's');
8782 my %pseudo_map_type;
8783 # This is used to affect the calculation of the map types for all the
8784 # ranges in the table. It should be set to one of the values that signify
8785 # to alter the calculation.
8786 main::set_access('pseudo_map_type', \%pseudo_map_type, 'r');
8788 my %has_only_code_point_maps;
8789 # A boolean used to help in computing the type of data in the map table.
8790 main::set_access('has_only_code_point_maps', \%has_only_code_point_maps);
8793 # A list of the first few distinct mappings this property has. This is
8794 # used to disambiguate between binary and enum property types, so don't
8795 # have to keep more than three.
8796 main::set_access('unique_maps', \%unique_maps);
8798 my %pre_declared_maps;
8799 # A boolean that gives whether the input data should declare all the
8800 # tables used, or not. If the former, unknown ones raise a warning.
8801 main::set_access('pre_declared_maps',
8802 \%pre_declared_maps, 'r', 's');
8805 # For properties whose shortest names are too long for a DOS 8.3
8806 # filesystem to distinguish between, this is used to manually give short
8807 # names for the directory name immediately under $match_tables that the
8808 # match tables for this property should be placed in.
8809 main::set_access('match_subdir', \%match_subdir, 'r');
8812 # A boolean that gives whether some table somewhere is defined as the
8813 # complement of a table in this property. This is a crude, but currently
8814 # sufficient, mechanism to make this property not get destroyed before
8815 # what is dependent on it is. Other dependencies could be added, so the
8816 # name was chosen to reflect a more general situation than actually is
8817 # currently the case.
8818 main::set_access('has_dependency', \%has_dependency, 'r', 's');
8821 # The only required parameter is the positionally first, name. All
8822 # other parameters are key => value pairs. See the documentation just
8823 # above for the meanings of the ones not passed directly on to the map
8824 # table constructor.
8827 my $name = shift || "";
8829 my $self = property_ref($name);
8830 if (defined $self) {
8831 my $options_string = join ", ", @_;
8832 $options_string = ". Ignoring options $options_string" if $options_string;
8833 Carp::my_carp("$self is already in use. Using existing one$options_string;");
8839 $self = bless \do { my $anonymous_scalar }, $class;
8840 my $addr = pack 'J', refaddr $self;
8842 $directory{$addr} = delete $args{'Directory'};
8843 $file{$addr} = delete $args{'File'};
8844 $full_name{$addr} = delete $args{'Full_Name'} || $name;
8845 $type{$addr} = delete $args{'Type'} || $UNKNOWN;
8846 $pseudo_map_type{$addr} = delete $args{'Map_Type'};
8847 $pre_declared_maps{$addr} = delete $args{'Pre_Declared_Maps'}
8848 # Starting in this release, property
8849 # values should be defined for all
8850 # properties, except those overriding this
8851 // $v_version ge v5.1.0;
8852 $match_subdir{$addr} = delete $args{'Match_SubDir'};
8854 # Rest of parameters passed on.
8856 $has_only_code_point_maps{$addr} = 1;
8857 $table_ref{$addr} = { };
8858 $unique_maps{$addr} = { };
8859 $has_dependency{$addr} = 0;
8861 $map{$addr} = Map_Table->new($name,
8862 Full_Name => $full_name{$addr},
8863 _Alias_Hash => \%alias_to_property_of,
8869 # See this program's beginning comment block about overloading the copy
8870 # constructor. Few operations are defined on properties, but a couple are
8871 # useful. It is safe to take the inverse of a property, and to remove a
8872 # single code point from it.
8875 qw("") => "_operator_stringify",
8876 "." => \&main::_operator_dot,
8877 ".=" => \&main::_operator_dot_equal,
8878 '==' => \&main::_operator_equal,
8879 '!=' => \&main::_operator_not_equal,
8880 '=' => sub { return shift },
8881 '-=' => "_minus_and_equal",
8884 sub _operator_stringify($self, $other="", $reversed=0) {
8885 return "Property '" . $self->full_name . "'";
8888 sub _minus_and_equal($self, $other, $reversed=0) {
8889 # Remove a single code point from the map table of a property.
8891 Carp::my_carp_bug("Bad news. Can't cope with a "
8893 . " argument to '-='. Subtraction ignored.");
8896 elsif ($reversed) { # Shouldn't happen in a -=, but just in case
8897 Carp::my_carp_bug("Bad news. Can't cope with subtracting a "
8899 . " from a non-object. undef returned.");
8903 $map{pack 'J', refaddr $self}->delete_range($other, $other);
8908 sub add_match_table {
8909 # Add a new match table for this property, with name given by the
8910 # parameter. It returns a pointer to the table.
8916 my $addr = pack 'J', refaddr $self;
8918 my $table = $table_ref{$addr}{$name};
8919 my $standard_name = main::standardize($name);
8921 || (defined ($table = $table_ref{$addr}{$standard_name})))
8923 Carp::my_carp("Table '$name' in $self is already in use. Using existing one");
8924 $table_ref{$addr}{$name} = $table;
8929 # See if this is a perl extension, if not passed in.
8930 my $perl_extension = delete $args{'Perl_Extension'};
8932 = $self->perl_extension if ! defined $perl_extension;
8935 my $suppression_reason = "";
8936 if ($self->name =~ /^_/) {
8937 $fate = $SUPPRESSED;
8938 $suppression_reason = "Parent property is internal only";
8940 elsif ($self->fate >= $SUPPRESSED) {
8941 $fate = $self->fate;
8942 $suppression_reason = $why_suppressed{$self->complete_name};
8945 elsif ($name =~ /^_/) {
8946 $fate = $INTERNAL_ONLY;
8948 $table = Match_Table->new(
8950 Perl_Extension => $perl_extension,
8951 _Alias_Hash => $table_ref{$addr},
8954 Suppression_Reason => $suppression_reason,
8955 Status => $self->status,
8956 _Status_Info => $self->status_info,
8958 return unless defined $table;
8961 # Save the names for quick look up
8962 $table_ref{$addr}{$standard_name} = $table;
8963 $table_ref{$addr}{$name} = $table;
8965 # Perhaps we can figure out the type of this property based on the
8966 # fact of adding this match table. First, string properties don't
8967 # have match tables; second, a binary property can't have 3 match
8969 if ($type{$addr} == $UNKNOWN) {
8970 $type{$addr} = $NON_STRING;
8972 elsif ($type{$addr} == $STRING) {
8973 Carp::my_carp("$self Added a match table '$name' to a string property '$self'. Changed it to a non-string property. Bad News.");
8974 $type{$addr} = $NON_STRING;
8976 elsif ($type{$addr} != $ENUM && $type{$addr} != $FORCED_BINARY) {
8977 if (scalar main::uniques(values %{$table_ref{$addr}}) > 2) {
8978 if ($type{$addr} == $BINARY) {
8979 Carp::my_carp("$self now has more than 2 tables (with the addition of '$name'), and so is no longer binary. Changing its type to 'enum'. Bad News.");
8981 $type{$addr} = $ENUM;
8988 sub delete_match_table($self, $table_to_remove) {
8989 # Delete the table referred to by $2 from the property $1.
8990 my $addr = pack 'J', refaddr $self;
8992 # Remove all names that refer to it.
8993 foreach my $key (keys %{$table_ref{$addr}}) {
8994 delete $table_ref{$addr}{$key}
8995 if $table_ref{$addr}{$key} == $table_to_remove;
8998 $table_to_remove->DESTROY;
9002 sub table($self, $name) {
9003 # Return a pointer to the match table (with name given by the
9004 # parameter) associated with this property; undef if none.
9005 my $addr = pack 'J', refaddr $self;
9007 return $table_ref{$addr}{$name} if defined $table_ref{$addr}{$name};
9009 # If quick look-up failed, try again using the standard form of the
9010 # input name. If that succeeds, cache the result before returning so
9011 # won't have to standardize this input name again.
9012 my $standard_name = main::standardize($name);
9013 return unless defined $table_ref{$addr}{$standard_name};
9015 $table_ref{$addr}{$name} = $table_ref{$addr}{$standard_name};
9016 return $table_ref{$addr}{$name};
9020 # Return a list of pointers to all the match tables attached to this
9023 return main::uniques(values %{$table_ref{pack 'J', refaddr shift}});
9027 # Returns the directory the map table for this property should be
9028 # output in. If a specific directory has been specified, that has
9029 # priority; 'undef' is returned if the type isn't defined;
9030 # or $map_directory for everything else.
9032 my $addr = pack 'J', refaddr shift;
9034 return $directory{$addr} if defined $directory{$addr};
9035 return undef if $type{$addr} == $UNKNOWN;
9036 return $map_directory;
9039 sub swash_name($self) {
9040 # Return the name that is used to both:
9041 # 1) Name the file that the map table is written to.
9042 # 2) The name of swash related stuff inside that file.
9043 # The reason for this is that the Perl core historically has used
9044 # certain names that aren't the same as the Unicode property names.
9045 # To continue using these, $file is hard-coded in this file for those,
9046 # but otherwise the standard name is used. This is different from the
9047 # external_name, so that the rest of the files, like in lib can use
9048 # the standard name always, without regard to historical precedent.
9049 my $addr = pack 'J', refaddr $self;
9051 # Swash names are used only on either
9052 # 1) regular or internal-only map tables
9053 # 2) otherwise there should be no access to the
9054 # property map table from other parts of Perl.
9055 return if $map{$addr}->fate != $ORDINARY
9056 && ! ($map{$addr}->name =~ /^_/
9057 && $map{$addr}->fate == $INTERNAL_ONLY);
9059 return $file{$addr} if defined $file{$addr};
9060 return $map{$addr}->external_name;
9063 sub to_create_match_tables($self) {
9064 # Returns a boolean as to whether or not match tables should be
9065 # created for this property.
9067 # The whole point of this pseudo property is match tables.
9068 return 1 if $self == $perl;
9070 my $addr = pack 'J', refaddr $self;
9072 # Don't generate tables of code points that match the property values
9073 # of a string property. Such a list would most likely have many
9074 # property values, each with just one or very few code points mapping
9076 return 0 if $type{$addr} == $STRING;
9082 sub property_add_or_replace_non_nulls($self, $other) {
9083 # This adds the mappings in the property $other to $self. Non-null
9084 # mappings from $other override those in $self. It essentially merges
9085 # the two properties, with the second having priority except for null
9088 if (! $other->isa(__PACKAGE__)) {
9089 Carp::my_carp_bug("$other should be a "
9097 return $map{pack 'J', refaddr $self}->map_add_or_replace_non_nulls($map{pack 'J', refaddr $other});
9101 # Certain tables are not generally written out to files, but
9102 # Unicode::UCD has the intelligence to know that the file for $self
9103 # can be used to reconstruct those tables. This routine just changes
9104 # things so that UCD pod entries for those suppressed tables are
9105 # generated, so the fact that a proxy is used is invisible to the
9110 foreach my $property_name (@_) {
9111 my $ref = property_ref($property_name);
9112 next if $ref->to_output_map;
9113 $ref->set_fate($MAP_PROXIED);
9117 sub set_type($self, $type) {
9118 # Set the type of the property. Mostly this is figured out by the
9119 # data in the table. But this is used to set it explicitly. The
9120 # reason it is not a standard accessor is that when setting a binary
9121 # property, we need to make sure that all the true/false aliases are
9122 # present, as they were omitted in early Unicode releases.
9126 && $type != $FORCED_BINARY
9127 && $type != $STRING)
9129 Carp::my_carp("Unrecognized type '$type'. Type not set");
9133 $type{pack 'J', refaddr $self} = $type;
9134 return if $type != $BINARY && $type != $FORCED_BINARY;
9136 my $yes = $self->table('Y');
9137 $yes = $self->table('Yes') if ! defined $yes;
9138 $yes = $self->add_match_table('Y', Full_Name => 'Yes')
9141 # Add aliases in order wanted, duplicates will be ignored. We use a
9142 # binary property present in all releases for its ordered lists of
9143 # true/false aliases. Note, that could run into problems in
9144 # outputting things in that we don't distinguish between the name and
9145 # full name of these. Hopefully, if the table was already created
9146 # before this code is executed, it was done with these set properly.
9147 my $bm = property_ref("Bidi_Mirrored");
9148 foreach my $alias ($bm->table("Y")->aliases) {
9149 $yes->add_alias($alias->name);
9151 my $no = $self->table('N');
9152 $no = $self->table('No') if ! defined $no;
9153 $no = $self->add_match_table('N', Full_Name => 'No') if ! defined $no;
9154 foreach my $alias ($bm->table("N")->aliases) {
9155 $no->add_alias($alias->name);
9162 # Add a map to the property's map table. This also keeps
9163 # track of the maps so that the property type can be determined from
9167 my $start = shift; # First code point in range
9168 my $end = shift; # Final code point in range
9169 my $map = shift; # What the range maps to.
9170 # Rest of parameters passed on.
9172 my $addr = pack 'J', refaddr $self;
9174 # If haven't the type of the property, gather information to figure it
9176 if ($type{$addr} == $UNKNOWN) {
9178 # If the map contains an interior blank or dash, or most other
9179 # nonword characters, it will be a string property. This
9180 # heuristic may actually miss some string properties. If so, they
9181 # may need to have explicit set_types called for them. This
9182 # happens in the Unihan properties.
9183 if ($map =~ / (?<= . ) [ -] (?= . ) /x
9184 || $map =~ / [^\w.\/\ -] /x)
9186 $self->set_type($STRING);
9188 # $unique_maps is used for disambiguating between ENUM and
9189 # BINARY later; since we know the property is not going to be
9190 # one of those, no point in keeping the data around
9191 undef $unique_maps{$addr};
9195 # Not necessarily a string. The final decision has to be
9196 # deferred until all the data are in. We keep track of if all
9197 # the values are code points for that eventual decision.
9198 $has_only_code_point_maps{$addr} &=
9199 $map =~ / ^ $code_point_re $/x;
9201 # For the purposes of disambiguating between binary and other
9202 # enumerations at the end, we keep track of the first three
9203 # distinct property values. Once we get to three, we know
9204 # it's not going to be binary, so no need to track more.
9205 if (scalar keys %{$unique_maps{$addr}} < 3) {
9206 $unique_maps{$addr}{main::standardize($map)} = 1;
9211 # Add the mapping by calling our map table's method
9212 return $map{$addr}->add_map($start, $end, $map, @_);
9215 sub compute_type($self) {
9216 # Compute the type of the property: $ENUM, $STRING, or $BINARY. This
9217 # should be called after the property is mostly filled with its maps.
9218 # We have been keeping track of what the property values have been,
9219 # and now have the necessary information to figure out the type.
9221 my $addr = pack 'J', refaddr $self;
9223 my $type = $type{$addr};
9225 # If already have figured these out, no need to do so again, but we do
9226 # a double check on ENUMS to make sure that a string property hasn't
9227 # improperly been classified as an ENUM, so continue on with those.
9228 return if $type == $STRING
9230 || $type == $FORCED_BINARY;
9232 # If every map is to a code point, is a string property.
9233 if ($type == $UNKNOWN
9234 && ($has_only_code_point_maps{$addr}
9235 || (defined $map{$addr}->default_map
9236 && $map{$addr}->default_map eq "")))
9238 $self->set_type($STRING);
9242 # Otherwise, it is to some sort of enumeration. (The case where
9243 # it is a Unicode miscellaneous property, and treated like a
9244 # string in this program is handled in add_map()). Distinguish
9245 # between binary and some other enumeration type. Of course, if
9246 # there are more than two values, it's not binary. But more
9247 # subtle is the test that the default mapping is defined means it
9248 # isn't binary. This in fact may change in the future if Unicode
9249 # changes the way its data is structured. But so far, no binary
9250 # properties ever have @missing lines for them, so the default map
9251 # isn't defined for them. The few properties that are two-valued
9252 # and aren't considered binary have the default map defined
9253 # starting in Unicode 5.0, when the @missing lines appeared; and
9254 # this program has special code to put in a default map for them
9255 # for earlier than 5.0 releases.
9257 || scalar keys %{$unique_maps{$addr}} > 2
9258 || defined $self->default_map)
9260 my $tables = $self->tables;
9261 my $count = $self->count;
9262 if ($verbosity && $tables > 500 && $tables/$count > .1) {
9263 Carp::my_carp_bug("It appears that $self should be a \$STRING property, not an \$ENUM because it has too many match tables: $tables\n");
9265 $self->set_type($ENUM);
9268 $self->set_type($BINARY);
9271 undef $unique_maps{$addr}; # Garbage collect
9275 # $reaons - Ignored unless suppressing
9276 sub set_fate($self, $fate, $reason=undef) {
9277 my $addr = pack 'J', refaddr $self;
9278 if ($fate >= $SUPPRESSED) {
9279 $why_suppressed{$self->complete_name} = $reason;
9282 # Each table shares the property's fate, except that MAP_PROXIED
9283 # doesn't affect match tables
9284 $map{$addr}->set_fate($fate, $reason);
9285 if ($fate != $MAP_PROXIED) {
9286 foreach my $table ($map{$addr}, $self->tables) {
9287 $table->set_fate($fate, $reason);
9294 # Most of the accessors for a property actually apply to its map table.
9295 # Setup up accessor functions for those, referring to %map
9346 # 'property' above is for symmetry, so that one can take
9347 # the property of a property and get itself, and so don't
9348 # have to distinguish between properties and tables in
9355 return $map{pack 'J', refaddr $self}->$sub(@_);
9365 # Converts an ordinal printable character value to a displayable string,
9366 # using a dotted circle to hold combining characters.
9370 return $chr if $ccc->table(0)->contains($ord);
9371 return "\x{25CC}$chr";
9374 sub join_lines($input) {
9375 # Returns lines of the input joined together, so that they can be folded
9377 # This causes continuation lines to be joined together into one long line
9378 # for folding. A continuation line is any line that doesn't begin with a
9379 # space or "\b" (the latter is stripped from the output). This is so
9380 # lines can be in a HERE document so as to fit nicely in the terminal
9381 # width, but be joined together in one long line, and then folded with
9382 # indents, '#' prefixes, etc, properly handled.
9383 # A blank separates the joined lines except if there is a break; an extra
9384 # blank is inserted after a period ending a line.
9386 # Initialize the return with the first line.
9387 my ($return, @lines) = split "\n", $input;
9389 # If the first line is null, it was an empty line, add the \n back in
9390 $return = "\n" if $return eq "";
9392 # Now join the remainder of the physical lines.
9393 for my $line (@lines) {
9395 # An empty line means wanted a blank line, so add two \n's to get that
9396 # effect, and go to the next line.
9397 if (length $line == 0) {
9402 # Look at the last character of what we have so far.
9403 my $previous_char = substr($return, -1, 1);
9405 # And at the next char to be output.
9406 my $next_char = substr($line, 0, 1);
9408 if ($previous_char ne "\n") {
9410 # Here didn't end wth a nl. If the next char a blank or \b, it
9411 # means that here there is a break anyway. So add a nl to the
9413 if ($next_char eq " " || $next_char eq "\b") {
9414 $previous_char = "\n";
9415 $return .= $previous_char;
9418 # Add an extra space after periods.
9419 $return .= " " if $previous_char eq '.';
9422 # Here $previous_char is still the latest character to be output. If
9423 # it isn't a nl, it means that the next line is to be a continuation
9424 # line, with a blank inserted between them.
9425 $return .= " " if $previous_char ne "\n";
9428 substr($line, 0, 1) = "" if $next_char eq "\b";
9430 # And append this next line.
9437 sub simple_fold( $line, $prefix="", $hanging_indent=0, $right_margin=0) {
9438 # Returns a string of the input (string or an array of strings) folded
9439 # into multiple-lines each of no more than $MAX_LINE_WIDTH characters plus
9441 # This is tailored for the kind of text written by this program,
9442 # especially the pod file, which can have very long names with
9443 # underscores in the middle, or words like AbcDefgHij.... We allow
9444 # breaking in the middle of such constructs if the line won't fit
9445 # otherwise. The break in such cases will come either just after an
9446 # underscore, or just before one of the Capital letters.
9448 local $to_trace = 0 if main::DEBUG;
9450 # $prefix Optional string to prepend to each output line
9451 # $hanging_indent Optional number of spaces to indent
9452 # continuation lines
9453 # $right_margin Optional number of spaces to narrow the
9456 # The space available doesn't include what's automatically prepended
9457 # to each line, or what's reserved on the right.
9458 my $max = $MAX_LINE_WIDTH - length($prefix) - $right_margin;
9459 # XXX Instead of using the 'nofold' perhaps better to look up the stack
9461 if (DEBUG && $hanging_indent >= $max) {
9462 Carp::my_carp("Too large a hanging indent ($hanging_indent); must be < $max. Using 0", 'nofold');
9463 $hanging_indent = 0;
9466 # First, split into the current physical lines.
9468 if (ref $line) { # Better be an array, because not bothering to
9470 foreach my $line (@{$line}) {
9471 push @line, split /\n/, $line;
9475 @line = split /\n/, $line;
9478 #local $to_trace = 1 if main::DEBUG;
9479 trace "", join(" ", @line), "\n" if main::DEBUG && $to_trace;
9481 # Look at each current physical line.
9482 for (my $i = 0; $i < @line; $i++) {
9483 Carp::my_carp("Tabs don't work well.", 'nofold') if $line[$i] =~ /\t/;
9484 #local $to_trace = 1 if main::DEBUG;
9485 trace "i=$i: $line[$i]\n" if main::DEBUG && $to_trace;
9487 # Remove prefix, because will be added back anyway, don't want
9489 $line[$i] =~ s/^$prefix//;
9491 # Remove trailing space
9492 $line[$i] =~ s/\s+\Z//;
9494 # If the line is too long, fold it.
9495 if (length $line[$i] > $max) {
9498 # Here needs to fold. Save the leading space in the line for
9500 $line[$i] =~ /^ ( \s* )/x;
9501 my $leading_space = $1;
9502 trace "line length", length $line[$i], "; lead length", length($leading_space) if main::DEBUG && $to_trace;
9504 # If character at final permissible position is white space,
9505 # fold there, which will delete that white space
9506 if (substr($line[$i], $max - 1, 1) =~ /\s/) {
9507 $remainder = substr($line[$i], $max);
9508 $line[$i] = substr($line[$i], 0, $max - 1);
9512 # Otherwise fold at an acceptable break char closest to
9513 # the max length. Look at just the maximal initial
9514 # segment of the line
9515 my $segment = substr($line[$i], 0, $max - 1);
9517 /^ ( .{$hanging_indent} # Don't look before the
9519 \ * # Don't look in leading
9520 # blanks past the indent
9521 [^ ] .* # Find the right-most
9522 (?: # acceptable break:
9523 [ \s = ] # space or equal
9524 | - (?! [.0-9] ) # or non-unary minus.
9525 | [^\\[(] (?= \\ )# break before single backslash
9526 # not immediately after opening
9528 ) # $1 includes the character
9531 # Split into the initial part that fits, and remaining
9533 $remainder = substr($line[$i], length $1);
9535 trace $line[$i] if DEBUG && $to_trace;
9536 trace $remainder if DEBUG && $to_trace;
9539 # If didn't find a good breaking spot, see if there is a
9540 # not-so-good breaking spot. These are just after
9541 # underscores or where the case changes from lower to
9542 # upper. Use \a as a soft hyphen, but give up
9543 # and don't break the line if there is actually a \a
9544 # already in the input. We use an ascii character for the
9545 # soft-hyphen to avoid any attempt by miniperl to try to
9546 # access the files that this program is creating.
9547 elsif ($segment !~ /\a/
9548 && ($segment =~ s/_/_\a/g
9549 || $segment =~ s/ ( (?!\\) [a-z] ) (?= [A-Z] )/$1\a/xg))
9551 # Here were able to find at least one place to insert
9552 # our substitute soft hyphen. Find the right-most one
9553 # and replace it by a real hyphen.
9554 trace $segment if DEBUG && $to_trace;
9556 rindex($segment, "\a"),
9559 # Then remove the soft hyphen substitutes.
9560 $segment =~ s/\a//g;
9561 trace $segment if DEBUG && $to_trace;
9563 # And split into the initial part that fits, and
9564 # remainder of the line
9565 my $pos = rindex($segment, '-');
9566 $remainder = substr($line[$i], $pos);
9567 trace $remainder if DEBUG && $to_trace;
9568 $line[$i] = substr($segment, 0, $pos + 1);
9572 # Here we know if we can fold or not. If we can, $remainder
9573 # is what remains to be processed in the next iteration.
9574 if (defined $remainder) {
9575 trace "folded='$line[$i]'" if main::DEBUG && $to_trace;
9577 # Insert the folded remainder of the line as a new element
9578 # of the array. (It may still be too long, but we will
9579 # deal with that next time through the loop.) Omit any
9580 # leading space in the remainder.
9581 $remainder =~ s/^\s+//;
9582 trace "remainder='$remainder'" if main::DEBUG && $to_trace;
9584 # But then indent by whichever is larger of:
9585 # 1) the leading space on the input line;
9586 # 2) the hanging indent.
9587 # This preserves indentation in the original line.
9588 my $lead = ($leading_space)
9589 ? length $leading_space
9591 $lead = max($lead, $hanging_indent);
9592 splice @line, $i+1, 0, (" " x $lead) . $remainder;
9596 # Ready to output the line. Get rid of any trailing space
9597 # And prefix by the required $prefix passed in.
9598 $line[$i] =~ s/\s+$//;
9599 $line[$i] = "$prefix$line[$i]\n";
9600 } # End of looping through all the lines.
9602 return join "", @line;
9605 sub property_ref { # Returns a reference to a property object.
9606 return Property::property_ref(@_);
9609 sub force_unlink ($filename) {
9610 return unless file_exists($filename);
9611 return if CORE::unlink($filename);
9613 # We might need write permission
9614 chmod 0777, $filename;
9615 CORE::unlink($filename) or Carp::my_carp("Couldn't unlink $filename. Proceeding anyway: $!");
9619 sub write ($file, $use_utf8, @lines) {
9620 # Given a filename and references to arrays of lines, write the lines of
9621 # each array to the file
9622 # Filename can be given as an arrayref of directory names
9624 # Get into a single string if an array, and get rid of, in Unix terms, any
9626 $file= File::Spec->join(@$file) if ref $file eq 'ARRAY';
9627 $file = File::Spec->canonpath($file);
9629 # If has directories, make sure that they all exist
9630 (undef, my $directories, undef) = File::Spec->splitpath($file);
9631 File::Path::mkpath($directories) if $directories && ! -d $directories;
9633 push @files_actually_output, $file;
9635 force_unlink ($file);
9638 if (not open $OUT, ">", $file) {
9639 Carp::my_carp("can't open $file for output. Skipping this file: $!");
9643 binmode $OUT, ":utf8" if $use_utf8;
9645 foreach my $lines_ref (@lines) {
9646 unless (@$lines_ref) {
9647 Carp::my_carp("An array of lines for writing to file '$file' is empty; writing it anyway;");
9650 print $OUT @$lines_ref or die Carp::my_carp("write to '$file' failed: $!");
9652 close $OUT or die Carp::my_carp("close '$file' failed: $!");
9654 print "$file written.\n" if $verbosity >= $VERBOSE;
9660 sub Standardize($name=undef) {
9661 # This converts the input name string into a standardized equivalent to
9664 unless (defined $name) {
9665 Carp::my_carp_bug("Standardize() called with undef. Returning undef.");
9669 # Remove any leading or trailing white space
9673 # Convert interior white space and hyphens into underscores.
9674 $name =~ s/ (?<= .) [ -]+ (.) /_$1/xg;
9676 # Capitalize the letter following an underscore, and convert a sequence of
9677 # multiple underscores to a single one
9678 $name =~ s/ (?<= .) _+ (.) /_\u$1/xg;
9680 # And capitalize the first letter, but not for the special cjk ones.
9681 $name = ucfirst($name) unless $name =~ /^k[A-Z]/;
9685 sub standardize ($str=undef) {
9686 # Returns a lower-cased standardized name, without underscores. This form
9687 # is chosen so that it can distinguish between any real versus superficial
9688 # Unicode name differences. It relies on the fact that Unicode doesn't
9689 # have interior underscores, white space, nor dashes in any
9690 # stricter-matched name. It should not be used on Unicode code point
9691 # names (the Name property), as they mostly, but not always follow these
9694 my $name = Standardize($str);
9695 return if !defined $name;
9697 $name =~ s/ (?<= .) _ (?= . ) //xg;
9701 sub UCD_name ($table, $alias) {
9702 # Returns the name that Unicode::UCD will use to find a table. XXX
9703 # perhaps this function should be placed somewhere, like UCD.pm so that
9704 # Unicode::UCD can use it directly without duplicating code that can get
9707 my $property = $table->property;
9708 $property = ($property == $perl)
9709 ? "" # 'perl' is never explicitly stated
9710 : standardize($property->name) . '=';
9711 if ($alias->loose_match) {
9712 return $property . standardize($alias->name);
9715 return lc ($property . $alias->name);
9723 my $indent_increment = " " x ( $debugging_build ? 2 : 0);
9724 %main::already_output = ();
9726 $main::simple_dumper_nesting = 0;
9728 sub simple_dumper( $item, $indent = "" ) {
9729 # Like Simple Data::Dumper. Good enough for our needs. We can't use
9730 # the real thing as we have to run under miniperl.
9732 # It is designed so that on input it is at the beginning of a line,
9733 # and the final thing output in any call is a trailing ",\n".
9735 $indent = "" if ! $debugging_build;
9737 # nesting level is localized, so that as the call stack pops, it goes
9738 # back to the prior value.
9739 local $main::simple_dumper_nesting = $main::simple_dumper_nesting;
9740 local %main::already_output = %main::already_output;
9741 $main::simple_dumper_nesting++;
9742 #print STDERR __LINE__, ": $main::simple_dumper_nesting: $indent$item\n";
9744 # Determine the indent for recursive calls.
9745 my $next_indent = $indent . $indent_increment;
9750 # Dump of scalar: just output it in quotes if not a number. To do
9751 # so we must escape certain characters, and therefore need to
9752 # operate on a copy to avoid changing the original
9754 $copy = $UNDEF unless defined $copy;
9756 # Quote non-integers (integers also have optional leading '-')
9757 if ($copy eq "" || $copy !~ /^ -? \d+ $/x) {
9759 # Escape apostrophe and backslash
9760 $copy =~ s/ ( ['\\] ) /\\$1/xg;
9763 $output = "$indent$copy,\n";
9767 # Keep track of cycles in the input, and refuse to infinitely loop
9768 my $addr = pack 'J', refaddr $item;
9769 if (defined $main::already_output{$addr}) {
9770 return "${indent}ALREADY OUTPUT: $item\n";
9772 $main::already_output{$addr} = $item;
9774 if (ref $item eq 'ARRAY') {
9777 if ($main::simple_dumper_nesting > 1) {
9779 $using_brackets = 1;
9782 $using_brackets = 0;
9785 # If the array is empty, put the closing bracket on the same
9786 # line. Otherwise, recursively add each array element
9792 for (my $i = 0; $i < @$item; $i++) {
9794 # Indent array elements one level
9795 $output .= &simple_dumper($item->[$i], $next_indent);
9796 next if ! $debugging_build;
9797 $output =~ s/\n$//; # Remove any trailing nl so
9798 $output .= " # [$i]\n"; # as to add a comment giving
9801 $output .= $indent; # Indent closing ']' to orig level
9803 $output .= ']' if $using_brackets;
9806 elsif (ref $item eq 'HASH') {
9811 # No surrounding braces at top level
9813 if ($main::simple_dumper_nesting > 1) {
9816 $body_indent = $next_indent;
9817 $next_indent .= $indent_increment;
9822 $body_indent = $indent;
9826 # Output hashes sorted alphabetically instead of apparently
9827 # random. Use caseless alphabetic sort
9828 foreach my $key (sort { lc $a cmp lc $b } keys %$item)
9830 if ($is_first_line) {
9834 $output .= "$body_indent";
9837 # The key must be a scalar, but this recursive call quotes
9839 $output .= &simple_dumper($key);
9841 # And change the trailing comma and nl to the hash fat
9842 # comma for clarity, and so the value can be on the same
9844 $output =~ s/,\n$/ => /;
9846 # Recursively call to get the value's dump.
9847 my $next = &simple_dumper($item->{$key}, $next_indent);
9849 # If the value is all on one line, remove its indent, so
9850 # will follow the => immediately. If it takes more than
9851 # one line, start it on a new line.
9852 if ($next !~ /\n.*\n/) {
9861 $output .= "$indent},\n" if $using_braces;
9863 elsif (ref $item eq 'CODE' || ref $item eq 'GLOB') {
9864 $output = $indent . ref($item) . "\n";
9865 # XXX see if blessed
9867 elsif ($item->can('dump')) {
9869 # By convention in this program, objects furnish a 'dump'
9870 # method. Since not doing any output at this level, just pass
9871 # on the input indent
9872 $output = $item->dump($indent);
9875 Carp::my_carp("Can't cope with dumping a " . ref($item) . ". Skipping.");
9882 sub dump_inside_out( $object, $fields_ref, @args ) {
9883 # Dump inside-out hashes in an object's state by converting them to a
9884 # regular hash and then calling simple_dumper on that.
9886 my $addr = pack 'J', refaddr $object;
9889 foreach my $key (keys %$fields_ref) {
9890 $hash{$key} = $fields_ref->{$key}{$addr};
9893 return simple_dumper(\%hash, @args);
9896 sub _operator_dot($self, $other="", $reversed=0) {
9897 # Overloaded '.' method that is common to all packages. It uses the
9898 # package's stringify method.
9900 foreach my $which (\$self, \$other) {
9901 next unless ref $$which;
9902 if ($$which->can('_operator_stringify')) {
9903 $$which = $$which->_operator_stringify;
9906 my $ref = ref $$which;
9907 my $addr = pack 'J', refaddr $$which;
9908 $$which = "$ref ($addr)";
9916 sub _operator_dot_equal($self, $other="", $reversed=0) {
9917 # Overloaded '.=' method that is common to all packages.
9920 return $other .= "$self";
9923 return "$self" . "$other";
9927 sub _operator_equal($self, $other, @) {
9928 # Generic overloaded '==' routine. To be equal, they must be the exact
9931 return 0 unless defined $other;
9932 return 0 unless ref $other;
9934 return $self == $other;
9937 sub _operator_not_equal($self, $other, @) {
9938 return ! _operator_equal($self, $other);
9941 sub substitute_PropertyAliases($file_object) {
9942 # Deal with early releases that don't have the crucial PropertyAliases.txt
9945 $file_object->insert_lines(get_old_property_aliases());
9947 process_PropertyAliases($file_object);
9951 sub process_PropertyAliases($file) {
9952 # This reads in the PropertyAliases.txt file, which contains almost all
9953 # the character properties in Unicode and their equivalent aliases:
9954 # scf ; Simple_Case_Folding ; sfc
9956 # Field 0 is the preferred short name for the property.
9957 # Field 1 is the full name.
9958 # Any succeeding ones are other accepted names.
9960 # Add any cjk properties that may have been defined.
9961 $file->insert_lines(@cjk_properties);
9963 while ($file->next_line) {
9965 my @data = split /\s*;\s*/;
9967 my $full = $data[1];
9969 # This line is defective in early Perls. The property in Unihan.txt
9971 if ($full eq 'Unicode_Radical_Stroke' && @data < 3) {
9972 push @data, qw(cjkRSUnicode kRSUnicode);
9975 my $this = Property->new($data[0], Full_Name => $full);
9977 $this->set_fate($SUPPRESSED, $why_suppressed{$full})
9978 if $why_suppressed{$full};
9980 # Start looking for more aliases after these two.
9981 for my $i (2 .. @data - 1) {
9982 $this->add_alias($data[$i]);
9987 my $scf = property_ref("Simple_Case_Folding");
9988 $scf->add_alias("scf");
9989 $scf->add_alias("sfc");
9994 sub finish_property_setup($file) {
9995 # Finishes setting up after PropertyAliases.
9997 # This entry was missing from this file in earlier Unicode versions
9998 if (-e 'Jamo.txt' && ! defined property_ref('JSN')) {
9999 Property->new('JSN', Full_Name => 'Jamo_Short_Name');
10002 # These are used so much, that we set globals for them.
10003 $gc = property_ref('General_Category');
10004 $block = property_ref('Block');
10005 $script = property_ref('Script');
10006 $age = property_ref('Age');
10008 # Perl adds this alias.
10009 $gc->add_alias('Category');
10011 # Unicode::Normalize expects this file with this name and directory.
10012 $ccc = property_ref('Canonical_Combining_Class');
10013 if (defined $ccc) {
10014 $ccc->set_file('CombiningClass');
10015 $ccc->set_directory(File::Spec->curdir());
10018 # These two properties aren't actually used in the core, but unfortunately
10019 # the names just above that are in the core interfere with these, so
10020 # choose different names. These aren't a problem unless the map tables
10021 # for these files get written out.
10022 my $lowercase = property_ref('Lowercase');
10023 $lowercase->set_file('IsLower') if defined $lowercase;
10024 my $uppercase = property_ref('Uppercase');
10025 $uppercase->set_file('IsUpper') if defined $uppercase;
10027 # Set up the hard-coded default mappings, but only on properties defined
10029 foreach my $property (keys %default_mapping) {
10030 my $property_object = property_ref($property);
10031 next if ! defined $property_object;
10032 my $default_map = $default_mapping{$property};
10033 $property_object->set_default_map($default_map);
10035 # A map of <code point> implies the property is string.
10036 if ($property_object->type == $UNKNOWN
10037 && $default_map eq $CODE_POINT)
10039 $property_object->set_type($STRING);
10043 # For backwards compatibility with applications that may read the mapping
10044 # file directly (it was documented in 5.12 and 5.14 as being thusly
10045 # usable), keep it from being adjusted. (range_size_1 is
10046 # used to force the traditional format.)
10047 if (defined (my $nfkc_cf = property_ref('NFKC_Casefold'))) {
10048 $nfkc_cf->set_to_output_map($EXTERNAL_MAP);
10049 $nfkc_cf->set_range_size_1(1);
10051 if (defined (my $bmg = property_ref('Bidi_Mirroring_Glyph'))) {
10052 $bmg->set_to_output_map($EXTERNAL_MAP);
10053 $bmg->set_range_size_1(1);
10056 property_ref('Numeric_Value')->set_to_output_map($OUTPUT_ADJUSTED);
10058 # The rest of this sub is for properties that need the Multi_Default class
10059 # to create objects for defaults. As of v15.0, this is no longer needed.
10061 return if $v_version ge v15.0.0;
10063 # Bidi class has a complicated default, but the derived file takes care of
10064 # the complications, leaving just 'L'.
10065 if (file_exists("${EXTRACTED}DBidiClass.txt")) {
10066 property_ref('Bidi_Class')->set_default_map('L');
10071 # The derived file was introduced in 3.1.1. The values below are
10072 # taken from table 3-8, TUS 3.0
10074 'my $default = Range_List->new;
10075 $default->add_range(0x0590, 0x05FF);
10076 $default->add_range(0xFB1D, 0xFB4F);'
10079 # The defaults apply only to unassigned characters
10080 $default_R .= '$gc->table("Unassigned") & $default;';
10082 if ($v_version lt v3.0.0) {
10083 $default = Multi_Default->new(R => $default_R, 'L');
10087 # AL apparently not introduced until 3.0: TUS 2.x references are
10088 # not on-line to check it out
10090 'my $default = Range_List->new;
10091 $default->add_range(0x0600, 0x07BF);
10092 $default->add_range(0xFB50, 0xFDFF);
10093 $default->add_range(0xFE70, 0xFEFF);'
10096 # Non-character code points introduced in this release; aren't AL
10097 if ($v_version ge 3.1.0) {
10098 $default_AL .= '$default->delete_range(0xFDD0, 0xFDEF);';
10100 $default_AL .= '$gc->table("Unassigned") & $default';
10101 $default = Multi_Default->new(AL => $default_AL,
10105 property_ref('Bidi_Class')->set_default_map($default);
10108 # Joining type has a complicated default, but the derived file takes care
10109 # of the complications, leaving just 'U' (or Non_Joining), except the file
10111 if (file_exists("${EXTRACTED}DJoinType.txt") || -e 'ArabicShaping.txt') {
10112 if (file_exists("${EXTRACTED}DJoinType.txt") && $v_version ne 3.1.0) {
10113 property_ref('Joining_Type')->set_default_map('Non_Joining');
10117 # Otherwise, there are not one, but two possibilities for the
10118 # missing defaults: T and U.
10119 # The missing defaults that evaluate to T are given by:
10120 # T = Mn + Cf - ZWNJ - ZWJ
10121 # where Mn and Cf are the general category values. In other words,
10122 # any non-spacing mark or any format control character, except
10123 # U+200C ZERO WIDTH NON-JOINER (joining type U) and U+200D ZERO
10124 # WIDTH JOINER (joining type C).
10125 my $default = Multi_Default->new(
10126 'T' => '$gc->table("Mn") + $gc->table("Cf") - 0x200C - 0x200D',
10128 property_ref('Joining_Type')->set_default_map($default);
10132 # Line break has a complicated default in early releases. It is 'Unknown'
10133 # for non-assigned code points; 'AL' for assigned.
10134 if (file_exists("${EXTRACTED}DLineBreak.txt") || -e 'LineBreak.txt') {
10135 my $lb = property_ref('Line_Break');
10136 if (file_exists("${EXTRACTED}DLineBreak.txt")) {
10137 $lb->set_default_map('Unknown');
10140 my $default = Multi_Default->new('AL' => '~ $gc->table("Cn")',
10143 $lb->set_default_map($default);
10150 sub get_old_property_aliases() {
10151 # Returns what would be in PropertyAliases.txt if it existed in very old
10152 # versions of Unicode. It was derived from the one in 3.2, and pared
10153 # down based on the data that was actually in the older releases.
10154 # An attempt was made to use the existence of files to mean inclusion or
10155 # not of various aliases, but if this was not sufficient, using version
10156 # numbers was resorted to.
10160 # These are to be used in all versions (though some are constructed by
10161 # this program if missing)
10162 push @return, split /\n/, <<'END';
10164 Bidi_M ; Bidi_Mirrored
10166 ccc ; Canonical_Combining_Class
10167 dm ; Decomposition_Mapping
10168 dt ; Decomposition_Type
10169 gc ; General_Category
10171 lc ; Lowercase_Mapping
10173 na1 ; Unicode_1_Name
10176 scf ; Simple_Case_Folding
10177 slc ; Simple_Lowercase_Mapping
10178 stc ; Simple_Titlecase_Mapping
10179 suc ; Simple_Uppercase_Mapping
10180 tc ; Titlecase_Mapping
10181 uc ; Uppercase_Mapping
10184 if (-e 'Blocks.txt') {
10185 push @return, "blk ; Block\n";
10187 if (-e 'ArabicShaping.txt') {
10188 push @return, split /\n/, <<'END';
10193 if (-e 'PropList.txt') {
10195 # This first set is in the original old-style proplist.
10196 push @return, split /\n/, <<'END';
10197 Bidi_C ; Bidi_Control
10205 Join_C ; Join_Control
10207 QMark ; Quotation_Mark
10208 Term ; Terminal_Punctuation
10209 WSpace ; White_Space
10211 # The next sets were added later
10212 if ($v_version ge v3.0.0) {
10213 push @return, split /\n/, <<'END';
10218 if ($v_version ge v3.0.1) {
10219 push @return, split /\n/, <<'END';
10220 NChar ; Noncharacter_Code_Point
10223 # The next sets were added in the new-style
10224 if ($v_version ge v3.1.0) {
10225 push @return, split /\n/, <<'END';
10226 OAlpha ; Other_Alphabetic
10227 OLower ; Other_Lowercase
10229 OUpper ; Other_Uppercase
10232 if ($v_version ge v3.1.1) {
10233 push @return, "AHex ; ASCII_Hex_Digit\n";
10236 if (-e 'EastAsianWidth.txt') {
10237 push @return, "ea ; East_Asian_Width\n";
10239 if (-e 'CompositionExclusions.txt') {
10240 push @return, "CE ; Composition_Exclusion\n";
10242 if (-e 'LineBreak.txt') {
10243 push @return, "lb ; Line_Break\n";
10245 if (-e 'BidiMirroring.txt') {
10246 push @return, "bmg ; Bidi_Mirroring_Glyph\n";
10248 if (-e 'Scripts.txt') {
10249 push @return, "sc ; Script\n";
10251 if (-e 'DNormalizationProps.txt') {
10252 push @return, split /\n/, <<'END';
10253 Comp_Ex ; Full_Composition_Exclusion
10254 FC_NFKC ; FC_NFKC_Closure
10255 NFC_QC ; NFC_Quick_Check
10256 NFD_QC ; NFD_Quick_Check
10257 NFKC_QC ; NFKC_Quick_Check
10258 NFKD_QC ; NFKD_Quick_Check
10259 XO_NFC ; Expands_On_NFC
10260 XO_NFD ; Expands_On_NFD
10261 XO_NFKC ; Expands_On_NFKC
10262 XO_NFKD ; Expands_On_NFKD
10265 if (-e 'DCoreProperties.txt') {
10266 push @return, split /\n/, <<'END';
10269 XIDC ; XID_Continue
10272 # These can also appear in some versions of PropList.txt
10273 push @return, "Lower ; Lowercase\n"
10274 unless grep { $_ =~ /^Lower\b/} @return;
10275 push @return, "Upper ; Uppercase\n"
10276 unless grep { $_ =~ /^Upper\b/} @return;
10279 # This flag requires the DAge.txt file to be copied into the directory.
10280 if (DEBUG && $compare_versions) {
10281 push @return, 'age ; Age';
10287 sub substitute_PropValueAliases($file_object) {
10288 # Deal with early releases that don't have the crucial
10289 # PropValueAliases.txt file.
10291 $file_object->insert_lines(get_old_property_value_aliases());
10293 process_PropValueAliases($file_object);
10296 sub process_PropValueAliases($file) {
10297 # This file contains values that properties look like:
10298 # bc ; AL ; Arabic_Letter
10299 # blk; n/a ; Greek_And_Coptic ; Greek
10301 # Field 0 is the property.
10302 # Field 1 is the short name of a property value or 'n/a' if no
10303 # short name exists;
10304 # Field 2 is the full property value name;
10305 # Any other fields are more synonyms for the property value.
10306 # Purely numeric property values are omitted from the file; as are some
10307 # others, fewer and fewer in later releases
10309 # Entries for the ccc property have an extra field before the
10311 # ccc; 0; NR ; Not_Reordered
10312 # It is the numeric value that the names are synonyms for.
10314 # There are comment entries for values missing from this file:
10315 # # @missing: 0000..10FFFF; ISO_Comment; <none>
10316 # # @missing: 0000..10FFFF; Lowercase_Mapping; <code point>
10318 if ($v_version lt 4.0.0) {
10319 $file->insert_lines(split /\n/, <<'END'
10320 Hangul_Syllable_Type; L ; Leading_Jamo
10321 Hangul_Syllable_Type; LV ; LV_Syllable
10322 Hangul_Syllable_Type; LVT ; LVT_Syllable
10323 Hangul_Syllable_Type; NA ; Not_Applicable
10324 Hangul_Syllable_Type; T ; Trailing_Jamo
10325 Hangul_Syllable_Type; V ; Vowel_Jamo
10329 if ($v_version lt 4.1.0) {
10330 $file->insert_lines(split /\n/, <<'END'
10331 _Perl_GCB; CN ; Control
10333 _Perl_GCB; EX ; Extend
10337 _Perl_GCB; LVT ; LVT
10340 _Perl_GCB; XX ; Other
10345 # Add any explicit cjk values
10346 $file->insert_lines(@cjk_property_values);
10348 # This line is used only for testing the code that checks for name
10349 # conflicts. There is a script Inherited, and when this line is executed
10350 # it causes there to be a name conflict with the 'Inherited' that this
10351 # program generates for this block property value
10352 #$file->insert_lines('blk; n/a; Herited');
10354 # Process each line of the file ...
10355 while ($file->next_line) {
10357 # Fix typo in input file
10358 s/CCC133/CCC132/g if $v_version eq v6.1.0;
10360 my ($property, @data) = split /\s*;\s*/;
10362 # The ccc property has an extra field at the beginning, which is the
10363 # numeric value. Move it to be after the other two, mnemonic, fields,
10364 # so that those will be used as the property value's names, and the
10365 # number will be an extra alias. (Rightmost splice removes field 1-2,
10366 # returning them in a slice; left splice inserts that before anything,
10367 # thus shifting the former field 0 to after them.)
10368 splice (@data, 0, 0, splice(@data, 1, 2)) if $property eq 'ccc';
10370 if ($v_version le v5.0.0 && $property eq 'blk' && $data[1] =~ /-/) {
10371 my $new_style = $data[1] =~ s/-/_/gr;
10372 splice @data, 1, 0, $new_style;
10375 # Field 0 is a short name unless "n/a"; field 1 is the full name. If
10376 # there is no short name, use the full one in element 1
10377 if ($data[0] eq "n/a") {
10378 $data[0] = $data[1];
10380 elsif ($data[0] ne $data[1]
10381 && standardize($data[0]) eq standardize($data[1])
10382 && $data[1] !~ /[[:upper:]]/)
10384 # Also, there is a bug in the file in which "n/a" is omitted, and
10385 # the two fields are identical except for case, and the full name
10386 # is all lower case. Copy the "short" name unto the full one to
10387 # give it some upper case.
10389 $data[1] = $data[0];
10392 # Earlier releases had the pseudo property 'qc' that should expand to
10393 # the ones that replace it below.
10394 if ($property eq 'qc') {
10395 if (lc $data[0] eq 'y') {
10396 $file->insert_lines('NFC_QC; Y ; Yes',
10398 'NFKC_QC; Y ; Yes',
10399 'NFKD_QC; Y ; Yes',
10402 elsif (lc $data[0] eq 'n') {
10403 $file->insert_lines('NFC_QC; N ; No',
10409 elsif (lc $data[0] eq 'm') {
10410 $file->insert_lines('NFC_QC; M ; Maybe',
10411 'NFKC_QC; M ; Maybe',
10415 $file->carp_bad_line("qc followed by unexpected '$data[0]");
10420 # The first field is the short name, 2nd is the full one.
10421 my $property_object = property_ref($property);
10422 my $table = $property_object->add_match_table($data[0],
10423 Full_Name => $data[1]);
10425 # Start looking for more aliases after these two.
10426 for my $i (2 .. @data - 1) {
10427 $table->add_alias($data[$i]);
10429 } # End of looping through the file
10431 # As noted in the comments early in the program, it generates tables for
10432 # the default values for all releases, even those for which the concept
10433 # didn't exist at the time. Here we add those if missing.
10434 if (defined $age && ! defined $age->table('Unassigned')) {
10435 $age->add_match_table('Unassigned');
10437 $block->add_match_table('No_Block') if -e 'Blocks.txt'
10438 && ! defined $block->table('No_Block');
10441 # Now set the default mappings of the properties from the file. This is
10442 # done after the loop because a number of properties have only @missings
10443 # entries in the file, and may not show up until the end.
10444 my @defaults = $file->get_missings;
10445 foreach my $default_ref (@defaults) {
10446 my $default = $default_ref->{default};
10447 my $property = property_ref($default_ref->{property});
10448 $property->set_default_map($default);
10454 sub get_old_property_value_aliases () {
10455 # Returns what would be in PropValueAliases.txt if it existed in very old
10456 # versions of Unicode. It was derived from the one in 3.2, and pared
10457 # down. An attempt was made to use the existence of files to mean
10458 # inclusion or not of various aliases, but if this was not sufficient,
10459 # using version numbers was resorted to.
10461 my @return = split /\n/, <<'END';
10462 bc ; AN ; Arabic_Number
10463 bc ; B ; Paragraph_Separator
10464 bc ; CS ; Common_Separator
10465 bc ; EN ; European_Number
10466 bc ; ES ; European_Separator
10467 bc ; ET ; European_Terminator
10468 bc ; L ; Left_To_Right
10469 bc ; ON ; Other_Neutral
10470 bc ; R ; Right_To_Left
10471 bc ; WS ; White_Space
10473 Bidi_M; N; No; F; False
10474 Bidi_M; Y; Yes; T; True
10476 # The standard combining classes are very much different in v1, so only use
10477 # ones that look right (not checked thoroughly)
10478 ccc; 0; NR ; Not_Reordered
10479 ccc; 1; OV ; Overlay
10481 ccc; 8; KV ; Kana_Voicing
10482 ccc; 9; VR ; Virama
10483 ccc; 202; ATBL ; Attached_Below_Left
10484 ccc; 216; ATAR ; Attached_Above_Right
10485 ccc; 218; BL ; Below_Left
10486 ccc; 220; B ; Below
10487 ccc; 222; BR ; Below_Right
10489 ccc; 228; AL ; Above_Left
10490 ccc; 230; A ; Above
10491 ccc; 232; AR ; Above_Right
10492 ccc; 234; DA ; Double_Above
10494 dt ; can ; canonical
10498 dt ; fra ; fraction
10499 dt ; init ; initial
10500 dt ; iso ; isolated
10508 gc ; C ; Other # Cc | Cf | Cn | Co | Cs
10510 gc ; Cn ; Unassigned
10511 gc ; Co ; Private_Use
10512 gc ; L ; Letter # Ll | Lm | Lo | Lt | Lu
10513 gc ; LC ; Cased_Letter # Ll | Lt | Lu
10514 gc ; Ll ; Lowercase_Letter
10515 gc ; Lm ; Modifier_Letter
10516 gc ; Lo ; Other_Letter
10517 gc ; Lu ; Uppercase_Letter
10518 gc ; M ; Mark # Mc | Me | Mn
10519 gc ; Mc ; Spacing_Mark
10520 gc ; Mn ; Nonspacing_Mark
10521 gc ; N ; Number # Nd | Nl | No
10522 gc ; Nd ; Decimal_Number
10523 gc ; No ; Other_Number
10524 gc ; P ; Punctuation # Pc | Pd | Pe | Pf | Pi | Po | Ps
10525 gc ; Pd ; Dash_Punctuation
10526 gc ; Pe ; Close_Punctuation
10527 gc ; Po ; Other_Punctuation
10528 gc ; Ps ; Open_Punctuation
10529 gc ; S ; Symbol # Sc | Sk | Sm | So
10530 gc ; Sc ; Currency_Symbol
10531 gc ; Sm ; Math_Symbol
10532 gc ; So ; Other_Symbol
10533 gc ; Z ; Separator # Zl | Zp | Zs
10534 gc ; Zl ; Line_Separator
10535 gc ; Zp ; Paragraph_Separator
10536 gc ; Zs ; Space_Separator
10544 if (-e 'ArabicShaping.txt') {
10545 push @return, split /\n/, <<'END';
10552 jg ; n/a ; NO_JOINING_GROUP
10560 jt ; C ; Join_Causing
10561 jt ; D ; Dual_Joining
10562 jt ; L ; Left_Joining
10563 jt ; R ; Right_Joining
10564 jt ; U ; Non_Joining
10565 jt ; T ; Transparent
10567 if ($v_version ge v3.0.0) {
10568 push @return, split /\n/, <<'END';
10572 jg ; n/a ; DALATH_RISH
10575 jg ; n/a ; FINAL_SEMKATH
10578 jg ; n/a ; HAMZA_ON_HEH_GOAL
10581 jg ; n/a ; HEH_GOAL
10585 jg ; n/a ; KNOTTED_HEH
10592 jg ; n/a ; REVERSED_PE
10596 jg ; n/a ; SWASH_KAF
10598 jg ; n/a ; TEH_MARBUTA
10601 jg ; n/a ; YEH_BARREE
10602 jg ; n/a ; YEH_WITH_TAIL
10611 if (-e 'EastAsianWidth.txt') {
10612 push @return, split /\n/, <<'END';
10622 if (-e 'LineBreak.txt' || -e 'LBsubst.txt') {
10623 my @lb = split /\n/, <<'END';
10624 lb ; AI ; Ambiguous
10625 lb ; AL ; Alphabetic
10626 lb ; B2 ; Break_Both
10627 lb ; BA ; Break_After
10628 lb ; BB ; Break_Before
10629 lb ; BK ; Mandatory_Break
10630 lb ; CB ; Contingent_Break
10631 lb ; CL ; Close_Punctuation
10632 lb ; CM ; Combining_Mark
10633 lb ; CR ; Carriage_Return
10634 lb ; EX ; Exclamation
10637 lb ; ID ; Ideographic
10638 lb ; IN ; Inseperable
10639 lb ; IS ; Infix_Numeric
10640 lb ; LF ; Line_Feed
10641 lb ; NS ; Nonstarter
10643 lb ; OP ; Open_Punctuation
10644 lb ; PO ; Postfix_Numeric
10645 lb ; PR ; Prefix_Numeric
10646 lb ; QU ; Quotation
10647 lb ; SA ; Complex_Context
10648 lb ; SG ; Surrogate
10650 lb ; SY ; Break_Symbols
10654 # If this Unicode version predates the lb property, we use our
10656 if (-e 'LBsubst.txt') {
10657 $_ = s/^lb/_Perl_LB/r for @lb;
10662 if (-e 'DNormalizationProps.txt') {
10663 push @return, split /\n/, <<'END';
10670 if (-e 'Scripts.txt') {
10671 push @return, split /\n/, <<'END';
10673 sc ; Armn ; Armenian
10674 sc ; Beng ; Bengali
10675 sc ; Bopo ; Bopomofo
10676 sc ; Cans ; Canadian_Aboriginal
10677 sc ; Cher ; Cherokee
10678 sc ; Cyrl ; Cyrillic
10679 sc ; Deva ; Devanagari
10680 sc ; Dsrt ; Deseret
10681 sc ; Ethi ; Ethiopic
10682 sc ; Geor ; Georgian
10685 sc ; Gujr ; Gujarati
10686 sc ; Guru ; Gurmukhi
10690 sc ; Hira ; Hiragana
10691 sc ; Ital ; Old_Italic
10692 sc ; Kana ; Katakana
10694 sc ; Knda ; Kannada
10697 sc ; Mlym ; Malayalam
10698 sc ; Mong ; Mongolian
10699 sc ; Mymr ; Myanmar
10702 sc ; Qaai ; Inherited
10704 sc ; Sinh ; Sinhala
10710 sc ; Tibt ; Tibetan
10716 if ($v_version ge v2.0.0) {
10717 push @return, split /\n/, <<'END';
10721 dt ; vert ; vertical
10725 gc ; Cs ; Surrogate
10726 gc ; Lt ; Titlecase_Letter
10727 gc ; Me ; Enclosing_Mark
10728 gc ; Nl ; Letter_Number
10729 gc ; Pc ; Connector_Punctuation
10730 gc ; Sk ; Modifier_Symbol
10733 if ($v_version ge v2.1.2) {
10734 push @return, "bc ; S ; Segment_Separator\n";
10736 if ($v_version ge v2.1.5) {
10737 push @return, split /\n/, <<'END';
10738 gc ; Pf ; Final_Punctuation
10739 gc ; Pi ; Initial_Punctuation
10742 if ($v_version ge v2.1.8) {
10743 push @return, "ccc; 240; IS ; Iota_Subscript\n";
10746 if ($v_version ge v3.0.0) {
10747 push @return, split /\n/, <<'END';
10748 bc ; AL ; Arabic_Letter
10749 bc ; BN ; Boundary_Neutral
10750 bc ; LRE ; Left_To_Right_Embedding
10751 bc ; LRO ; Left_To_Right_Override
10752 bc ; NSM ; Nonspacing_Mark
10753 bc ; PDF ; Pop_Directional_Format
10754 bc ; RLE ; Right_To_Left_Embedding
10755 bc ; RLO ; Right_To_Left_Override
10757 ccc; 233; DB ; Double_Below
10761 if ($v_version ge v3.1.0) {
10762 push @return, "ccc; 226; R ; Right\n";
10768 sub process_NormalizationsTest($file) {
10770 # Each line looks like:
10771 # source code point; NFC; NFD; NFKC; NFKD
10773 # 1E0A;1E0A;0044 0307;1E0A;0044 0307;
10775 # Process each line of the file ...
10776 while ($file->next_line) {
10780 my ($c1, $c2, $c3, $c4, $c5) = split /\s*;\s*/;
10782 foreach my $var (\$c1, \$c2, \$c3, \$c4, \$c5) {
10783 $$var = pack "U0U*", map { hex } split " ", $$var;
10784 $$var =~ s/(\\)/$1$1/g;
10787 push @normalization_tests,
10788 "Test_N(q
\a$c1
\a, q
\a$c2
\a, q
\a$c3
\a, q
\a$c4
\a, q
\a$c5
\a);\n";
10789 } # End of looping through the file
10792 sub output_perl_charnames_line ($code_point, $name) {
10794 # Output the entries in Perl_charnames specially, using 5 digits instead
10795 # of four. This makes the entries a constant length, and simplifies
10796 # charnames.pm which this table is for. Unicode can have 6 digit
10797 # ordinals, but they are all private use or noncharacters which do not
10798 # have names, so won't be in this table.
10800 return sprintf "%05X\n%s\n\n", $code_point, $name;
10805 # These are constants to the $property_info hash in this subroutine, to
10806 # avoid using a quoted-string which might have a typo.
10808 my $DEFAULT_MAP = 'default_map';
10809 my $DEFAULT_TABLE = 'default_table';
10810 my $PSEUDO_MAP_TYPE = 'pseudo_map_type';
10811 my $MISSINGS = 'missings';
10813 sub process_generic_property_file($file) {
10814 # This processes a file containing property mappings and puts them
10815 # into internal map tables. It should be used to handle any property
10816 # files that have mappings from a code point or range thereof to
10817 # something else. This means almost all the UCD .txt files.
10818 # each_line_handlers() should be set to adjust the lines of these
10819 # files, if necessary, to what this routine understands:
10822 # 003C..003E ; Math
10824 # the fields are: "codepoint-range ; property; map"
10826 # meaning the codepoints in the range all have the value 'map' under
10828 # Beginning and trailing white space in each field are not significant.
10829 # Note there is not a trailing semi-colon in the above. A trailing
10830 # semi-colon means the map is a null-string. An omitted map, as
10831 # opposed to a null-string, is assumed to be 'Y', based on Unicode
10832 # table syntax. (This could have been hidden from this routine by
10833 # doing it in the $file object, but that would require parsing of the
10834 # line there, so would have to parse it twice, or change the interface
10835 # to pass this an array. So not done.)
10837 # The map field may begin with a sequence of commands that apply to
10838 # this range. Each such command begins and ends with $CMD_DELIM.
10839 # These are used to indicate, for example, that the mapping for a
10840 # range has a non-default type.
10842 # This loops through the file, calling its next_line() method, and
10843 # then taking the map and adding it to the property's table.
10844 # Complications arise because any number of properties can be in the
10845 # file, in any order, interspersed in any way. The first time a
10846 # property is seen, it gets information about that property and
10847 # caches it for quick retrieval later. It also normalizes the maps
10848 # so that only one of many synonyms is stored. The Unicode input
10849 # files do use some multiple synonyms.
10851 my %property_info; # To keep track of what properties
10852 # have already had entries in the
10853 # current file, and info about each,
10854 # so don't have to recompute.
10855 my $property_name; # property currently being worked on
10856 my $property_type; # and its type
10857 my $previous_property_name = ""; # name from last time through loop
10858 my $property_object; # pointer to the current property's
10860 my $property_addr; # the address of that object
10861 my $default_map; # the string that code points missing
10862 # from the file map to
10863 my $default_table; # For non-string properties, a
10864 # reference to the match table that
10865 # will contain the list of code
10866 # points that map to $default_map.
10868 # Get the next real non-comment line
10870 while ($file->next_line) {
10872 # Default replacement type; means that if parts of the range have
10873 # already been stored in our tables, the new map overrides them if
10874 # they differ more than cosmetically
10875 my $replace = $IF_NOT_EQUIVALENT;
10876 my $map_type; # Default type for the map of this range
10878 #local $to_trace = 1 if main::DEBUG;
10879 trace $_ if main::DEBUG && $to_trace;
10881 # Split the line into components
10882 my ($range, $property_name, $map, @remainder)
10883 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
10885 # If more or less on the line than we are expecting, warn and skip
10888 $file->carp_bad_line('Extra fields');
10891 elsif ( ! defined $property_name) {
10892 $file->carp_bad_line('Missing property');
10896 # Examine the range.
10897 if ($range !~ /^ ($code_point_re) (?:\.\. ($code_point_re) )? $/x)
10899 $file->carp_bad_line("Range '$range' not of the form 'CP1' or 'CP1..CP2' (where CP1,2 are code points in hex)");
10903 my $high = (defined $2) ? hex $2 : $low;
10905 # If changing to a new property, get the things constant per
10907 if ($previous_property_name ne $property_name) {
10909 $property_object = property_ref($property_name);
10910 if (! defined $property_object) {
10911 $file->carp_bad_line("Unexpected property '$property_name'. Skipped");
10914 $property_addr = pack 'J', refaddr $property_object;
10916 # Defer changing names until have a line that is acceptable
10917 # (the 'next' statement above means is unacceptable)
10918 $previous_property_name = $property_name;
10920 # If not the first time for this property, retrieve info about
10921 # it from the cache
10922 my $this_property_info = $property_info{$property_addr};
10923 if (defined ($this_property_info->{$TYPE})) {
10924 $property_type = $this_property_info->{$TYPE};
10925 $default_map = $this_property_info->{$DEFAULT_MAP};
10926 $map_type = $this_property_info->{$PSEUDO_MAP_TYPE};
10927 $default_table = $this_property_info->{$DEFAULT_TABLE};
10931 # Here, is the first time for this property. Set up the
10933 $property_type = $this_property_info->{$TYPE}
10934 = $property_object->type;
10936 = $this_property_info->{$PSEUDO_MAP_TYPE}
10937 = $property_object->pseudo_map_type;
10939 # The Unicode files are set up so that if the map is not
10940 # defined, it is a binary property
10941 if (! defined $map && $property_type != $BINARY) {
10942 if ($property_type != $UNKNOWN
10943 && $property_type != $NON_STRING)
10945 $file->carp_bad_line("No mapping defined on a non-binary property. Using 'Y' for the map");
10948 $property_object->set_type($BINARY);
10949 $property_type = $this_property_info->{$TYPE}
10954 # Get any @missings default for this property. This
10955 # should precede the first entry for the property in the
10956 # input file, and is located in a comment that has been
10957 # stored by the Input_file class until we access it here.
10958 # It's possible that there is more than one such line
10959 # waiting for us; collect them all, and parse
10961 @missings_list = $file->get_missings
10962 if $file->has_missings_defaults;
10964 foreach my $default_ref (@missings_list) {
10966 # For now, we are only interested in the fallback
10967 # default for the entire property. i.e., an @missing
10968 # line that is for the whole Unicode range.
10969 next if $default_ref->{start} != 0
10970 || $default_ref->{end} != $MAX_UNICODE_CODEPOINT;
10972 $default_map = $default_ref->{default};
10974 # For string properties, the default is just what the
10975 # file says, but non-string properties should already
10976 # have set up a table for the default property value;
10977 # use the table for these, so can resolve synonyms
10978 # later to a single standard one.
10979 if ($property_type == $STRING
10980 || $property_type == $UNKNOWN)
10982 $this_property_info->{$MISSINGS} = $default_map;
10986 $property_object->table($default_map)->full_name;
10987 $this_property_info->{$MISSINGS} = $default_map;
10988 $this_property_info->{$DEFAULT_MAP} = $default_map;
10989 if (! defined $property_object->default_map) {
10990 $property_object->set_default_map($default_map);
10995 # For later Unicode versions, multiple @missing lines for
10996 # a single property can appear in the files. The first
10997 # always applies to the entire Unicode range, and was
10998 # handled above. The subsequent ones are for smaller
10999 # ranges, and can be read as "But for this range, the
11000 # default is ...". So each overrides all the preceding
11001 # ones for the range it applies to. Typically they apply
11002 # to disjoint ranges, but don't have to. What we do is to
11003 # set them up to work in reverse order, so that after the
11004 # rest of the table is filled, the highest priority
11005 # default range fills in any code points that haven't been
11006 # specified; then the next highest priority one is
11007 # applied, and so forth.
11008 if (@missings_list > 1 && $v_version ge v15.0.0) {
11009 if ($property_type != $ENUM) {
11010 Carp::my_carp_bug("Multiple \@missings lines only"
11011 . " make sense for ENUM-type"
11012 . " properties. Changing type to"
11014 $property_type = $this_property_info->{$TYPE}
11016 $property_object->set_type($ENUM);
11019 my $multi = Multi_Default->new();
11021 # The overall default should be first on this list,
11022 # and is handled differently than the rest.
11023 $default_map = shift @missings_list;
11024 Carp::my_carp_bug("\@missings needs to be entire range")
11025 if $default_map->{start} != 0
11026 || $default_map->{end} != $MAX_UNICODE_CODEPOINT;
11028 # We already have looked at this line above. Use that
11030 $multi->set_final_default($this_property_info->
11033 # Now get the individual range elements, and add them
11034 # to Multi_Default object
11035 while (@missings_list) {
11036 my $this_entry = pop @missings_list;
11037 my $subrange_default = $this_entry->{default};
11039 # Use the short name as a standard
11040 $subrange_default = $property_object->
11041 table($subrange_default)->short_name;
11042 $multi->append_default($subrange_default,
11043 "Range_List->new(Initialize => Range->new("
11044 . "$this_entry->{start}, $this_entry->{end}))");
11047 # Override the property's simple default with this.
11048 $property_object->set_default_map($multi);
11051 if (! $default_map || $property_type != $ENUM) {
11053 # Finished storing all the @missings defaults in the
11054 # input file so far. Get the one for the current
11056 my $missings = $this_property_info->{$MISSINGS};
11058 # But we likely have separately stored what the
11059 # default should be. (This is to accommodate versions
11060 # of the standard where the @missings lines are absent
11061 # or incomplete.) Hopefully the two will match. But
11063 $default_map = $property_object->default_map;
11065 # If the map is a ref, it means that the default won't
11066 # be processed until later, so undef it, so next few
11067 # lines will redefine it to something that nothing
11069 undef $default_map if ref $default_map;
11071 # Create a $default_map if don't have one; maybe a
11072 # dummy that won't match anything.
11073 if (! defined $default_map) {
11075 # Use any @missings line in the file.
11076 if (defined $missings) {
11077 if (ref $missings) {
11078 $default_map = $missings->full_name;
11079 $default_table = $missings;
11082 $default_map = $missings;
11085 # And store it with the property for outside
11087 $property_object->set_default_map($default_map);
11091 # Neither an @missings nor a default map.
11092 # Create a dummy one, so won't have to test
11093 # definedness in the main loop.
11094 $default_map = '_Perl This will never be in a'
11095 . ' file from Unicode';
11099 # Here, we have $default_map defined, possibly in
11100 # terms of $missings, but maybe not, and possibly is a
11102 if (defined $missings) {
11104 # Make sure there is no conflict between the two.
11105 # $missings has priority.
11106 if (ref $missings) {
11108 = $property_object->table($default_map);
11109 if ( ! defined $default_table
11110 || $default_table != $missings)
11112 if (! defined $default_table) {
11113 $default_table = $UNDEF;
11115 $file->carp_bad_line(<<END
11116 The \@missings line for $property_name in $file says that missings default to
11117 $missings, but we expect it to be $default_table. $missings used.
11120 $default_table = $missings;
11121 $default_map = $missings->full_name;
11123 $this_property_info->{$DEFAULT_TABLE}
11126 elsif ($default_map ne $missings) {
11127 $file->carp_bad_line(<<END
11128 The \@missings line for $property_name in $file says that missings default to
11129 $missings, but we expect it to be $default_map. $missings used.
11132 $default_map = $missings;
11136 $this_property_info->{$DEFAULT_MAP} = $default_map;
11138 # If haven't done so already, find the table
11139 # corresponding to this map for non-string properties.
11140 if (! defined $default_table
11141 && $property_type != $STRING
11142 && $property_type != $UNKNOWN)
11145 = $this_property_info->{$DEFAULT_TABLE}
11146 = $property_object->table($default_map);
11149 } # End of is first time for this property
11150 } # End of switching properties.
11152 # Ready to process the line.
11153 # The Unicode files are set up so that if the map is not defined,
11154 # it is a binary property with value 'Y'
11155 if (! defined $map) {
11160 # If the map begins with a special command to us (enclosed in
11161 # delimiters), extract the command(s).
11162 while ($map =~ s/ ^ $CMD_DELIM (.*?) $CMD_DELIM //x) {
11164 if ($command =~ / ^ $REPLACE_CMD= (.*) /x) {
11167 elsif ($command =~ / ^ $MAP_TYPE_CMD= (.*) /x) {
11171 $file->carp_bad_line("Unknown command line: '$1'");
11177 if ( $default_map eq $CODE_POINT
11178 && $map =~ / ^ $code_point_re $/x)
11181 # Here, we have a map to a particular code point, and the
11182 # default map is to a code point itself. If the range
11183 # includes the particular code point, change that portion of
11184 # the range to the default. This makes sure that in the final
11185 # table only the non-defaults are listed.
11186 my $decimal_map = hex $map;
11187 if ($low <= $decimal_map && $decimal_map <= $high) {
11189 # If the range includes stuff before or after the map
11190 # we're changing, split it and process the split-off parts
11192 if ($low < $decimal_map) {
11193 $file->insert_adjusted_lines(
11194 sprintf("%04X..%04X; %s; %s",
11200 if ($high > $decimal_map) {
11201 $file->insert_adjusted_lines(
11202 sprintf("%04X..%04X; %s; %s",
11208 $low = $high = $decimal_map;
11209 $map = $CODE_POINT;
11213 if ($property_type != $STRING && $property_type != $UNKNOWN) {
11214 my $table = $property_object->table($map);
11215 if (defined $table) {
11217 # Unicode isn't very consistent about which synonym they
11218 # use in their .txt files, even within the same file, or
11219 # two files that are for the same property. For enum
11220 # properties, we know already what all the synonyms are
11221 # (because we processed PropValueAliases already).
11222 # Therefore we can take the input and map it to a uniform
11223 # value now, saving us trouble later.
11225 # Only if the map is well-behaved do we try this:
11226 # non-empty, all non-blank.
11227 if ($property_type == $ENUM && $map =~ / ^ \S+ $ /x) {
11229 # Use existing practice as much as easily practicable,
11230 # so that code that has assumptions about spelling
11231 # doesn't have to change
11232 my $short_name = $property_object->short_name;
11233 if ($short_name =~ / ^ (BC | EA | GC |HST | JT |
11234 Lb | BT | BPT | NFCQC |
11237 $map = $table->short_name;
11239 elsif ($short_name !~ / ^ ( Ccc | Age | InSC | JG |
11242 $map = $table->full_name;
11245 elsif ($table == $default_table) {
11247 # When it isn't an ENUM, we we can still tell if
11248 # this is a synonym for the default map. If so, use
11249 # the default one instead.
11250 $map = $default_map;
11255 # And figure out the map type if not known.
11256 if (! defined $map_type || $map_type == $COMPUTE_NO_MULTI_CP) {
11257 if ($map eq "") { # Nulls are always $NULL map type
11259 } # Otherwise, non-strings, and those that don't allow
11260 # $MULTI_CP, and those that aren't multiple code points are
11263 (($property_type != $STRING && $property_type != $UNKNOWN)
11264 || (defined $map_type && $map_type == $COMPUTE_NO_MULTI_CP)
11265 || $map !~ /^ $code_point_re ( \ $code_point_re )+ $ /x)
11270 $map_type = $MULTI_CP;
11274 $property_object->add_map($low, $high,
11277 Replace => $replace);
11278 } # End of loop through file's lines
11284 { # Closure for UnicodeData.txt handling
11286 # This file was the first one in the UCD; its design leads to some
11287 # awkwardness in processing. Here is a sample line:
11288 # 0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
11289 # The fields in order are:
11290 my $i = 0; # The code point is in field 0, and is shifted off.
11291 my $CHARNAME = $i++; # character name (e.g. "LATIN CAPITAL LETTER A")
11292 my $CATEGORY = $i++; # category (e.g. "Lu")
11293 my $CCC = $i++; # Canonical combining class (e.g. "230")
11294 my $BIDI = $i++; # directional class (e.g. "L")
11295 my $PERL_DECOMPOSITION = $i++; # decomposition mapping
11296 my $PERL_DECIMAL_DIGIT = $i++; # decimal digit value
11297 my $NUMERIC_TYPE_OTHER_DIGIT = $i++; # digit value, like a superscript
11298 # Dual-use in this program; see below
11299 my $NUMERIC = $i++; # numeric value
11300 my $MIRRORED = $i++; # ? mirrored
11301 my $UNICODE_1_NAME = $i++; # name in Unicode 1.0
11302 my $COMMENT = $i++; # iso comment
11303 my $UPPER = $i++; # simple uppercase mapping
11304 my $LOWER = $i++; # simple lowercase mapping
11305 my $TITLE = $i++; # simple titlecase mapping
11306 my $input_field_count = $i;
11308 # This routine in addition outputs these extra fields:
11310 my $DECOMP_TYPE = $i++; # Decomposition type
11312 # These fields are modifications of ones above, and are usually
11313 # suppressed; they must come last, as for speed, the loop upper bound is
11314 # normally set to ignore them
11315 my $NAME = $i++; # This is the strict name field, not the one that
11317 my $DECOMP_MAP = $i++; # Strict decomposition mapping; not the one used
11318 # by Unicode::Normalize
11319 my $last_field = $i - 1;
11321 # All these are read into an array for each line, with the indices defined
11322 # above. The empty fields in the example line above indicate that the
11323 # value is defaulted. The handler called for each line of the input
11324 # changes these to their defaults.
11326 # Here are the official names of the properties, in a parallel array:
11328 $field_names[$BIDI] = 'Bidi_Class';
11329 $field_names[$CATEGORY] = 'General_Category';
11330 $field_names[$CCC] = 'Canonical_Combining_Class';
11331 $field_names[$CHARNAME] = 'Perl_Charnames';
11332 $field_names[$COMMENT] = 'ISO_Comment';
11333 $field_names[$DECOMP_MAP] = 'Decomposition_Mapping';
11334 $field_names[$DECOMP_TYPE] = 'Decomposition_Type';
11335 $field_names[$LOWER] = 'Lowercase_Mapping';
11336 $field_names[$MIRRORED] = 'Bidi_Mirrored';
11337 $field_names[$NAME] = 'Name';
11338 $field_names[$NUMERIC] = 'Numeric_Value';
11339 $field_names[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric_Type';
11340 $field_names[$PERL_DECIMAL_DIGIT] = 'Perl_Decimal_Digit';
11341 $field_names[$PERL_DECOMPOSITION] = 'Perl_Decomposition_Mapping';
11342 $field_names[$TITLE] = 'Titlecase_Mapping';
11343 $field_names[$UNICODE_1_NAME] = 'Unicode_1_Name';
11344 $field_names[$UPPER] = 'Uppercase_Mapping';
11346 # Some of these need a little more explanation:
11347 # The $PERL_DECIMAL_DIGIT field does not lead to an official Unicode
11348 # property, but is used in calculating the Numeric_Type. Perl however,
11349 # creates a file from this field, so a Perl property is created from it.
11350 # Similarly, the Other_Digit field is used only for calculating the
11351 # Numeric_Type, and so it can be safely re-used as the place to store
11352 # the value for Numeric_Type; hence it is referred to as
11353 # $NUMERIC_TYPE_OTHER_DIGIT.
11354 # The input field named $PERL_DECOMPOSITION is a combination of both the
11355 # decomposition mapping and its type. Perl creates a file containing
11356 # exactly this field, so it is used for that. The two properties are
11357 # separated into two extra output fields, $DECOMP_MAP and $DECOMP_TYPE.
11358 # $DECOMP_MAP is usually suppressed (unless the lists are changed to
11359 # output it), as Perl doesn't use it directly.
11360 # The input field named here $CHARNAME is used to construct the
11361 # Perl_Charnames property, which is a combination of the Name property
11362 # (which the input field contains), and the Unicode_1_Name property, and
11363 # others from other files. Since, the strict Name property is not used
11364 # by Perl, this field is used for the table that Perl does use. The
11365 # strict Name property table is usually suppressed (unless the lists are
11366 # changed to output it), so it is accumulated in a separate field,
11367 # $NAME, which to save time is discarded unless the table is actually to
11370 # This file is processed like most in this program. Control is passed to
11371 # process_generic_property_file() which calls filter_UnicodeData_line()
11372 # for each input line. This filter converts the input into line(s) that
11373 # process_generic_property_file() understands. There is also a setup
11374 # routine called before any of the file is processed, and a handler for
11375 # EOF processing, all in this closure.
11377 # A huge speed-up occurred at the cost of some added complexity when these
11378 # routines were altered to buffer the outputs into ranges. Almost all the
11379 # lines of the input file apply to just one code point, and for most
11380 # properties, the map for the next code point up is the same as the
11381 # current one. So instead of creating a line for each property for each
11382 # input line, filter_UnicodeData_line() remembers what the previous map
11383 # of a property was, and doesn't generate a line to pass on until it has
11384 # to, as when the map changes; and that passed-on line encompasses the
11385 # whole contiguous range of code points that have the same map for that
11386 # property. This means a slight amount of extra setup, and having to
11387 # flush these buffers on EOF, testing if the maps have changed, plus
11388 # remembering state information in the closure. But it means a lot less
11389 # real time in not having to change the data base for each property on
11392 # Another complication is that there are already a few ranges designated
11393 # in the input. There are two lines for each, with the same maps except
11394 # the code point and name on each line. This was actually the hardest
11395 # thing to design around. The code points in those ranges may actually
11396 # have real maps not given by these two lines. These maps will either
11397 # be algorithmically determinable, or be in the extracted files furnished
11398 # with the UCD. In the event of conflicts between these extracted files,
11399 # and this one, Unicode says that this one prevails. But it shouldn't
11400 # prevail for conflicts that occur in these ranges. The data from the
11401 # extracted files prevails in those cases. So, this program is structured
11402 # so that those files are processed first, storing maps. Then the other
11403 # files are processed, generally overwriting what the extracted files
11404 # stored. But just the range lines in this input file are processed
11405 # without overwriting. This is accomplished by adding a special string to
11406 # the lines output to tell process_generic_property_file() to turn off the
11407 # overwriting for just this one line.
11408 # A similar mechanism is used to tell it that the map is of a non-default
11411 sub setup_UnicodeData($file) { # Called before any lines of the input are read
11413 # Create a new property specially located that is a combination of
11414 # various Name properties: Name, Unicode_1_Name, Named Sequences, and
11415 # _Perl_Name_Alias properties. (The final one duplicates elements of the
11416 # first, and starting in v6.1, is the same as the 'Name_Alias
11417 # property.) A comment for the new property will later be constructed
11418 # based on the actual properties present and used
11419 $perl_charname = Property->new('Perl_Charnames',
11421 Directory => File::Spec->curdir(),
11423 Fate => $INTERNAL_ONLY,
11424 Perl_Extension => 1,
11425 Range_Size_1 => \&output_perl_charnames_line,
11428 $perl_charname->set_proxy_for('Name');
11430 my $Perl_decomp = Property->new('Perl_Decomposition_Mapping',
11431 Directory => File::Spec->curdir(),
11432 File => 'Decomposition',
11433 Format => $DECOMP_STRING_FORMAT,
11434 Fate => $INTERNAL_ONLY,
11435 Perl_Extension => 1,
11436 Default_Map => $CODE_POINT,
11438 # normalize.pm can't cope with these
11439 Output_Range_Counts => 0,
11441 # This is a specially formatted table
11442 # explicitly for normalize.pm, which
11443 # is expecting a particular format,
11444 # which means that mappings containing
11445 # multiple code points are in the main
11446 # body of the table
11447 Map_Type => $COMPUTE_NO_MULTI_CP,
11449 To_Output_Map => $INTERNAL_MAP,
11451 $Perl_decomp->set_proxy_for('Decomposition_Mapping', 'Decomposition_Type');
11452 $Perl_decomp->add_comment(join_lines(<<END
11453 This mapping is a combination of the Unicode 'Decomposition_Type' and
11454 'Decomposition_Mapping' properties, formatted for use by normalize.pm. It is
11455 identical to the official Unicode 'Decomposition_Mapping' property except for
11457 1) It omits the algorithmically determinable Hangul syllable decompositions,
11458 which normalize.pm handles algorithmically.
11459 2) It contains the decomposition type as well. Non-canonical decompositions
11460 begin with a word in angle brackets, like <super>, which denotes the
11461 compatible decomposition type. If the map does not begin with the <angle
11462 brackets>, the decomposition is canonical.
11466 my $Decimal_Digit = Property->new("Perl_Decimal_Digit",
11468 Perl_Extension => 1,
11469 Directory => $map_directory,
11471 To_Output_Map => $OUTPUT_ADJUSTED,
11473 $Decimal_Digit->add_comment(join_lines(<<END
11474 This file gives the mapping of all code points which represent a single
11475 decimal digit [0-9] to their respective digits, but it has ranges of 10 code
11476 points, and the mapping of each non-initial element of each range is actually
11477 not to "0", but to the offset that element has from its corresponding DIGIT 0.
11478 These code points are those that have Numeric_Type=Decimal; not special
11479 things, like subscripts nor Roman numerals.
11483 # These properties are not used for generating anything else, and are
11484 # usually not output. By making them last in the list, we can just
11485 # change the high end of the loop downwards to avoid the work of
11486 # generating a table(s) that is/are just going to get thrown away.
11487 if (! property_ref('Decomposition_Mapping')->to_output_map
11488 && ! property_ref('Name')->to_output_map)
11490 $last_field = min($NAME, $DECOMP_MAP) - 1;
11491 } elsif (property_ref('Decomposition_Mapping')->to_output_map) {
11492 $last_field = $DECOMP_MAP;
11493 } elsif (property_ref('Name')->to_output_map) {
11494 $last_field = $NAME;
11499 my $first_time = 1; # ? Is this the first line of the file
11500 my $in_range = 0; # ? Are we in one of the file's ranges
11501 my $previous_cp; # hex code point of previous line
11502 my $decimal_previous_cp = -1; # And its decimal equivalent
11503 my @start; # For each field, the current starting
11504 # code point in hex for the range
11505 # being accumulated.
11506 my @fields; # The input fields;
11507 my @previous_fields; # And those from the previous call
11509 sub filter_UnicodeData_line($file) {
11510 # Handle a single input line from UnicodeData.txt; see comments above
11511 # Conceptually this takes a single line from the file containing N
11512 # properties, and converts it into N lines with one property per line,
11513 # which is what the final handler expects. But there are
11514 # complications due to the quirkiness of the input file, and to save
11515 # time, it accumulates ranges where the property values don't change
11516 # and only emits lines when necessary. This is about an order of
11517 # magnitude fewer lines emitted.
11519 # $_ contains the input line.
11520 # -1 in split means retain trailing null fields
11521 (my $cp, @fields) = split /\s*;\s*/, $_, -1;
11523 #local $to_trace = 1 if main::DEBUG;
11524 trace $cp, @fields , $input_field_count if main::DEBUG && $to_trace;
11525 if (@fields > $input_field_count) {
11526 $file->carp_bad_line('Extra fields');
11531 my $decimal_cp = hex $cp;
11533 # We have to output all the buffered ranges when the next code point
11534 # is not exactly one after the previous one, which means there is a
11535 # gap in the ranges.
11536 my $force_output = ($decimal_cp != $decimal_previous_cp + 1);
11538 # The decomposition mapping field requires special handling. It looks
11541 # <compat> 0032 0020
11544 # The decomposition type is enclosed in <brackets>; if missing, it
11545 # means the type is canonical. There are two decomposition mapping
11546 # tables: the one for use by Perl's normalize.pm has a special format
11547 # which is this field intact; the other, for general use is of
11548 # standard format. In either case we have to find the decomposition
11549 # type. Empty fields have None as their type, and map to the code
11551 if ($fields[$PERL_DECOMPOSITION] eq "") {
11552 $fields[$DECOMP_TYPE] = 'None';
11553 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION] = $CODE_POINT;
11556 ($fields[$DECOMP_TYPE], my $map) = $fields[$PERL_DECOMPOSITION]
11557 =~ / < ( .+? ) > \s* ( .+ ) /x;
11558 if (! defined $fields[$DECOMP_TYPE]) {
11559 $fields[$DECOMP_TYPE] = 'Canonical';
11560 $fields[$DECOMP_MAP] = $fields[$PERL_DECOMPOSITION];
11563 $fields[$DECOMP_MAP] = $map;
11567 # The 3 numeric fields also require special handling. The 2 digit
11568 # fields must be either empty or match the number field. This means
11569 # that if it is empty, they must be as well, and the numeric type is
11570 # None, and the numeric value is 'Nan'.
11571 # The decimal digit field must be empty or match the other digit
11572 # field. If the decimal digit field is non-empty, the code point is
11573 # a decimal digit, and the other two fields will have the same value.
11574 # If it is empty, but the other digit field is non-empty, the code
11575 # point is an 'other digit', and the number field will have the same
11576 # value as the other digit field. If the other digit field is empty,
11577 # but the number field is non-empty, the code point is a generic
11579 if ($fields[$NUMERIC] eq "") {
11580 if ($fields[$PERL_DECIMAL_DIGIT] ne ""
11581 || $fields[$NUMERIC_TYPE_OTHER_DIGIT] ne ""
11583 $file->carp_bad_line("Numeric values inconsistent. Trying to process anyway");
11585 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'None';
11586 $fields[$NUMERIC] = 'NaN';
11589 $file->carp_bad_line("'$fields[$NUMERIC]' should be a whole or rational number. Processing as if it were") if $fields[$NUMERIC] !~ qr{ ^ -? \d+ ( / \d+ )? $ }x;
11590 if ($fields[$PERL_DECIMAL_DIGIT] ne "") {
11591 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$PERL_DECIMAL_DIGIT] != $fields[$NUMERIC];
11592 $file->carp_bad_line("$fields[$PERL_DECIMAL_DIGIT] should be empty since the general category ($fields[$CATEGORY]) isn't 'Nd'. Processing as Decimal") if $fields[$CATEGORY] ne "Nd";
11593 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Decimal';
11595 elsif ($fields[$NUMERIC_TYPE_OTHER_DIGIT] ne "") {
11596 $file->carp_bad_line("$fields[$NUMERIC_TYPE_OTHER_DIGIT] should equal $fields[$NUMERIC]. Processing anyway") if $fields[$NUMERIC_TYPE_OTHER_DIGIT] != $fields[$NUMERIC];
11597 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Digit';
11600 $fields[$NUMERIC_TYPE_OTHER_DIGIT] = 'Numeric';
11602 # Rationals require extra effort.
11603 if ($fields[$NUMERIC] =~ qr{/}) {
11604 reduce_fraction(\$fields[$NUMERIC]);
11605 register_fraction($fields[$NUMERIC])
11610 # For the properties that have empty fields in the file, and which
11611 # mean something different from empty, change them to that default.
11612 # Certain fields just haven't been empty so far in any Unicode
11613 # version, so don't look at those, namely $MIRRORED, $BIDI, $CCC,
11614 # $CATEGORY. This leaves just the two fields, and so we hard-code in
11615 # the defaults; which are very unlikely to ever change.
11616 $fields[$UPPER] = $CODE_POINT if $fields[$UPPER] eq "";
11617 $fields[$LOWER] = $CODE_POINT if $fields[$LOWER] eq "";
11619 # UAX44 says that if title is empty, it is the same as whatever upper
11621 $fields[$TITLE] = $fields[$UPPER] if $fields[$TITLE] eq "";
11623 # There are a few pairs of lines like:
11624 # AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
11625 # D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
11626 # that define ranges. These should be processed after the fields are
11627 # adjusted above, as they may override some of them; but mostly what
11628 # is left is to possibly adjust the $CHARNAME field. The names of all the
11629 # paired lines start with a '<', but this is also true of '<control>,
11630 # which isn't one of these special ones.
11631 if ($fields[$CHARNAME] eq '<control>') {
11633 # Some code points in this file have the pseudo-name
11634 # '<control>', but the official name for such ones is the null
11636 $fields[$NAME] = $fields[$CHARNAME] = "";
11638 # We had better not be in between range lines.
11640 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11644 elsif (substr($fields[$CHARNAME], 0, 1) ne '<') {
11646 # Here is a non-range line. We had better not be in between range
11649 $file->carp_bad_line("Expecting a closing range line, not a $fields[$CHARNAME]'. Trying anyway");
11652 if ($fields[$CHARNAME] =~ s/- $cp $//x) {
11654 # These are code points whose names end in their code points,
11655 # which means the names are algorithmically derivable from the
11656 # code points. To shorten the output Name file, the algorithm
11657 # for deriving these is placed in the file instead of each
11658 # code point, so they have map type $CP_IN_NAME
11659 $fields[$CHARNAME] = $CMD_DELIM
11664 . $fields[$CHARNAME];
11666 $fields[$NAME] = $fields[$CHARNAME];
11668 elsif ($fields[$CHARNAME] =~ /^<(.+), First>$/) {
11669 $fields[$CHARNAME] = $fields[$NAME] = $1;
11671 # Here we are at the beginning of a range pair.
11673 $file->carp_bad_line("Expecting a closing range line, not a beginning one, $fields[$CHARNAME]'. Trying anyway");
11677 # Because the properties in the range do not overwrite any already
11678 # in the db, we must flush the buffers of what's already there, so
11679 # they get handled in the normal scheme.
11683 elsif ($fields[$CHARNAME] !~ s/^<(.+), Last>$/$1/) {
11684 $file->carp_bad_line("Unexpected name starting with '<' $fields[$CHARNAME]. Ignoring this line.");
11688 else { # Here, we are at the last line of a range pair.
11691 $file->carp_bad_line("Unexpected end of range $fields[$CHARNAME] when not in one. Ignoring this line.");
11697 $fields[$NAME] = $fields[$CHARNAME];
11699 # Check that the input is valid: that the closing of the range is
11700 # the same as the beginning.
11701 foreach my $i (0 .. $last_field) {
11702 next if $fields[$i] eq $previous_fields[$i];
11703 $file->carp_bad_line("Expecting '$fields[$i]' to be the same as '$previous_fields[$i]'. Bad News. Trying anyway");
11706 # The processing differs depending on the type of range,
11707 # determined by its $CHARNAME
11708 if ($fields[$CHARNAME] =~ /^Hangul Syllable/) {
11710 # Check that the data looks right.
11711 if ($decimal_previous_cp != $SBase) {
11712 $file->carp_bad_line("Unexpected Hangul syllable start = $previous_cp. Bad News. Results will be wrong");
11714 if ($decimal_cp != $SBase + $SCount - 1) {
11715 $file->carp_bad_line("Unexpected Hangul syllable end = $cp. Bad News. Results will be wrong");
11718 # The Hangul syllable range has a somewhat complicated name
11719 # generation algorithm. Each code point in it has a canonical
11720 # decomposition also computable by an algorithm. The
11721 # perl decomposition map table built from these is used only
11722 # by normalize.pm, which has the algorithm built in it, so the
11723 # decomposition maps are not needed, and are large, so are
11724 # omitted from it. If the full decomposition map table is to
11725 # be output, the decompositions are generated for it, in the
11726 # EOF handling code for this input file.
11728 $previous_fields[$DECOMP_TYPE] = 'Canonical';
11730 # This range is stored in our internal structure with its
11731 # own map type, different from all others.
11732 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11738 . $fields[$CHARNAME];
11740 elsif ($fields[$CATEGORY] eq 'Lo') { # Is a letter
11742 # All the CJK ranges like this have the name given as a
11743 # special case in the next code line. And for the others, we
11744 # hope that Unicode continues to use the correct name in
11745 # future releases, so we don't have to make further special
11747 my $name = ($fields[$CHARNAME] =~ /^CJK/)
11748 ? 'CJK UNIFIED IDEOGRAPH'
11749 : uc $fields[$CHARNAME];
11751 # The name for these contains the code point itself, and all
11752 # are defined to have the same base name, regardless of what
11753 # is in the file. They are stored in our internal structure
11754 # with a map type of $CP_IN_NAME
11755 $previous_fields[$CHARNAME] = $previous_fields[$NAME]
11764 elsif ($fields[$CATEGORY] eq 'Co'
11765 || $fields[$CATEGORY] eq 'Cs')
11767 # The names of all the code points in these ranges are set to
11768 # null, as there are no names for the private use and
11769 # surrogate code points.
11771 $previous_fields[$CHARNAME] = $previous_fields[$NAME] = "";
11774 $file->carp_bad_line("Unexpected code point range $fields[$CHARNAME] because category is $fields[$CATEGORY]. Attempting to process it.");
11777 # The first line of the range caused everything else to be output,
11778 # and then its values were stored as the beginning values for the
11779 # next set of ranges, which this one ends. Now, for each value,
11780 # add a command to tell the handler that these values should not
11781 # replace any existing ones in our database.
11782 foreach my $i (0 .. $last_field) {
11783 $previous_fields[$i] = $CMD_DELIM
11788 . $previous_fields[$i];
11791 # And change things so it looks like the entire range has been
11792 # gone through with this being the final part of it. Adding the
11793 # command above to each field will cause this range to be flushed
11794 # during the next iteration, as it guaranteed that the stored
11795 # field won't match whatever value the next one has.
11796 $previous_cp = $cp;
11797 $decimal_previous_cp = $decimal_cp;
11799 # We are now set up for the next iteration; so skip the remaining
11800 # code in this subroutine that does the same thing, but doesn't
11801 # know about these ranges.
11807 # On the very first line, we fake it so the code below thinks there is
11808 # nothing to output, and initialize so that when it does get output it
11809 # uses the first line's values for the lowest part of the range.
11810 # (One could avoid this by using peek(), but then one would need to
11811 # know the adjustments done above and do the same ones in the setup
11812 # routine; not worth it)
11815 @previous_fields = @fields;
11816 @start = ($cp) x scalar @fields;
11817 $decimal_previous_cp = $decimal_cp - 1;
11820 # For each field, output the stored up ranges that this code point
11821 # doesn't fit in. Earlier we figured out if all ranges should be
11822 # terminated because of changing the replace or map type styles, or if
11823 # there is a gap between this new code point and the previous one, and
11824 # that is stored in $force_output. But even if those aren't true, we
11825 # need to output the range if this new code point's value for the
11826 # given property doesn't match the stored range's.
11827 #local $to_trace = 1 if main::DEBUG;
11828 foreach my $i (0 .. $last_field) {
11829 my $field = $fields[$i];
11830 if ($force_output || $field ne $previous_fields[$i]) {
11832 # Flush the buffer of stored values.
11833 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11835 # Start a new range with this code point and its value
11837 $previous_fields[$i] = $field;
11841 # Set the values for the next time.
11842 $previous_cp = $cp;
11843 $decimal_previous_cp = $decimal_cp;
11845 # The input line has generated whatever adjusted lines are needed, and
11846 # should not be looked at further.
11851 sub EOF_UnicodeData($file) {
11852 # Called upon EOF to flush the buffers, and create the Hangul
11853 # decomposition mappings if needed.
11855 # Flush the buffers.
11856 foreach my $i (0 .. $last_field) {
11857 $file->insert_adjusted_lines("$start[$i]..$previous_cp; $field_names[$i]; $previous_fields[$i]");
11860 if (-e 'Jamo.txt') {
11862 # The algorithm is published by Unicode, based on values in
11863 # Jamo.txt, (which should have been processed before this
11864 # subroutine), and the results left in %Jamo
11866 Carp::my_carp_bug("Jamo.txt should be processed before Unicode.txt. Hangul syllables not generated.");
11870 # If the full decomposition map table is being output, insert
11871 # into it the Hangul syllable mappings. This is to avoid having
11872 # to publish a subroutine in it to compute them. (which would
11873 # essentially be this code.) This uses the algorithm published by
11874 # Unicode. (No hangul syllables in version 1)
11875 if ($v_version ge v2.0.0
11876 && property_ref('Decomposition_Mapping')->to_output_map) {
11877 for (my $S = $SBase; $S < $SBase + $SCount; $S++) {
11879 my $SIndex = $S - $SBase;
11880 my $L = $LBase + $SIndex / $NCount;
11881 my $V = $VBase + ($SIndex % $NCount) / $TCount;
11882 my $T = $TBase + $SIndex % $TCount;
11884 trace "L=$L, V=$V, T=$T" if main::DEBUG && $to_trace;
11885 my $decomposition = sprintf("%04X %04X", $L, $V);
11886 $decomposition .= sprintf(" %04X", $T) if $T != $TBase;
11887 $file->insert_adjusted_lines(
11888 sprintf("%04X; Decomposition_Mapping; %s",
11898 sub filter_v1_ucd($file) {
11899 # Fix UCD lines in version 1. This is probably overkill, but this
11900 # fixes some glaring errors in Version 1 UnicodeData.txt. That file:
11901 # 1) had many Hangul (U+3400 - U+4DFF) code points that were later
11902 # removed. This program retains them
11903 # 2) didn't include ranges, which it should have, and which are now
11904 # added in @corrected_lines below. It was hand populated by
11905 # taking the data from Version 2, verified by analyzing
11907 # 3) There is a syntax error in the entry for U+09F8 which could
11908 # cause problems for Unicode::UCD, and so is changed. It's
11909 # numeric value was simply a minus sign, without any number.
11910 # (Eventually Unicode changed the code point to non-numeric.)
11911 # 4) The decomposition types often don't match later versions
11912 # exactly, and the whole syntax of that field is different; so
11913 # the syntax is changed as well as the types to their later
11914 # terminology. Otherwise normalize.pm would be very unhappy
11915 # 5) Many ccc classes are different. These are left intact.
11916 # 6) U+FF10..U+FF19 are missing their numeric values in all three
11917 # fields. These are unchanged because it doesn't really cause
11918 # problems for Perl.
11919 # 7) A number of code points, such as controls, don't have their
11920 # Unicode Version 1 Names in this file. These are added.
11921 # 8) A number of Symbols were marked as Lm. This changes those in
11922 # the Latin1 range, so that regexes work.
11923 # 9) The odd characters U+03DB .. U+03E1 weren't encoded but are
11924 # referred to by their lc equivalents. Not fixed.
11926 my @corrected_lines = split /\n/, <<'END';
11927 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
11928 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11929 E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
11930 F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
11931 F900;<CJK Compatibility Ideograph, First>;Lo;0;L;;;;;N;;;;;
11932 FA2D;<CJK Compatibility Ideograph, Last>;Lo;0;L;;;;;N;;;;;
11935 #local $to_trace = 1 if main::DEBUG;
11936 trace $_ if main::DEBUG && $to_trace;
11938 # -1 => retain trailing null fields
11939 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11941 # At the first place that is wrong in the input, insert all the
11942 # corrections, replacing the wrong line.
11943 if ($code_point eq '4E00') {
11944 my @copy = @corrected_lines;
11946 ($code_point, @fields) = split /\s*;\s*/, $_, -1;
11948 $file->insert_lines(@copy);
11950 elsif ($code_point =~ /^00/ && $fields[$CATEGORY] eq 'Lm') {
11952 # There are no Lm characters in Latin1; these should be 'Sk', but
11953 # there isn't that in V1.
11954 $fields[$CATEGORY] = 'So';
11957 if ($fields[$NUMERIC] eq '-') {
11958 $fields[$NUMERIC] = '-1'; # This is what 2.0 made it.
11961 if ($fields[$PERL_DECOMPOSITION] ne "") {
11963 # Several entries have this change to superscript 2 or 3 in the
11964 # middle. Convert these to the modern version, which is to use
11965 # the actual U+00B2 and U+00B3 (the superscript forms) instead.
11966 # So 'HHHH HHHH <+sup> 0033 <-sup> HHHH' becomes
11967 # 'HHHH HHHH 00B3 HHHH'.
11968 # It turns out that all of these that don't have another
11969 # decomposition defined at the beginning of the line have the
11970 # <square> decomposition in later releases.
11971 if ($code_point ne '00B2' && $code_point ne '00B3') {
11972 if ($fields[$PERL_DECOMPOSITION]
11973 =~ s/<\+sup> 003([23]) <-sup>/00B$1/)
11975 if (substr($fields[$PERL_DECOMPOSITION], 0, 1) ne '<') {
11976 $fields[$PERL_DECOMPOSITION] = '<square> '
11977 . $fields[$PERL_DECOMPOSITION];
11982 # If is like '<+circled> 0052 <-circled>', convert to
11984 $fields[$PERL_DECOMPOSITION] =~
11985 s/ < \+ ( .*? ) > \s* (.*?) \s* <-\1> /<$1> $2/xg;
11987 # Convert '<join> HHHH HHHH <join>' to '<medial> HHHH HHHH', etc.
11988 $fields[$PERL_DECOMPOSITION] =~
11989 s/ <join> \s* (.*?) \s* <no-join> /<final> $1/x
11990 or $fields[$PERL_DECOMPOSITION] =~
11991 s/ <join> \s* (.*?) \s* <join> /<medial> $1/x
11992 or $fields[$PERL_DECOMPOSITION] =~
11993 s/ <no-join> \s* (.*?) \s* <join> /<initial> $1/x
11994 or $fields[$PERL_DECOMPOSITION] =~
11995 s/ <no-join> \s* (.*?) \s* <no-join> /<isolated> $1/x;
11997 # Convert '<break> HHHH HHHH <break>' to '<break> HHHH', etc.
11998 $fields[$PERL_DECOMPOSITION] =~
11999 s/ <(break|no-break)> \s* (.*?) \s* <\1> /<$1> $2/x;
12001 # Change names to modern form.
12002 $fields[$PERL_DECOMPOSITION] =~ s/<font variant>/<font>/g;
12003 $fields[$PERL_DECOMPOSITION] =~ s/<no-break>/<noBreak>/g;
12004 $fields[$PERL_DECOMPOSITION] =~ s/<circled>/<circle>/g;
12005 $fields[$PERL_DECOMPOSITION] =~ s/<break>/<fraction>/g;
12007 # One entry has weird braces
12008 $fields[$PERL_DECOMPOSITION] =~ s/[{}]//g;
12010 # One entry at U+2116 has an extra <sup>
12011 $fields[$PERL_DECOMPOSITION] =~ s/( < .*? > .* ) < .*? > \ * /$1/x;
12014 $_ = join ';', $code_point, @fields;
12015 trace $_ if main::DEBUG && $to_trace;
12019 sub filter_bad_Nd_ucd {
12020 # Early versions specified a value in the decimal digit field even
12021 # though the code point wasn't a decimal digit. Clear the field in
12022 # that situation, so that the main code doesn't think it is a decimal
12025 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12026 if ($fields[$PERL_DECIMAL_DIGIT] ne "" && $fields[$CATEGORY] ne 'Nd') {
12027 $fields[$PERL_DECIMAL_DIGIT] = "";
12028 $_ = join ';', $code_point, @fields;
12033 my @U1_control_names = split /\n/, <<'END';
12038 END OF TRANSMISSION
12043 HORIZONTAL TABULATION
12045 VERTICAL TABULATION
12053 DEVICE CONTROL THREE
12054 DEVICE CONTROL FOUR
12055 NEGATIVE ACKNOWLEDGE
12057 END OF TRANSMISSION BLOCK
12067 BREAK PERMITTED HERE
12071 START OF SELECTED AREA
12072 END OF SELECTED AREA
12073 CHARACTER TABULATION SET
12074 CHARACTER TABULATION WITH JUSTIFICATION
12075 LINE TABULATION SET
12081 DEVICE CONTROL STRING
12087 START OF GUARDED AREA
12088 END OF GUARDED AREA
12090 SINGLE CHARACTER INTRODUCER
12091 CONTROL SEQUENCE INTRODUCER
12093 OPERATING SYSTEM COMMAND
12095 APPLICATION PROGRAM COMMAND
12098 sub filter_early_U1_names {
12099 # Very early versions did not have the Unicode_1_name field specified.
12100 # They differed in which ones were present; make sure a U1 name
12101 # exists, so that Unicode::UCD::charinfo will work
12103 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12106 # @U1_control names above are entirely positional, so we pull them out
12107 # in the exact order required, with gaps for the ones that don't have
12109 if ($code_point =~ /^00[01]/
12110 || $code_point eq '007F'
12111 || $code_point =~ /^008[2-9A-F]/
12112 || $code_point =~ /^009[0-8A-F]/)
12114 my $u1_name = shift @U1_control_names;
12115 $fields[$UNICODE_1_NAME] = $u1_name unless $fields[$UNICODE_1_NAME];
12116 $_ = join ';', $code_point, @fields;
12121 sub filter_v2_1_5_ucd {
12122 # A dozen entries in this 2.1.5 file had the mirrored and numeric
12123 # columns swapped; These all had mirrored be 'N'. So if the numeric
12124 # column appears to be N, swap it back.
12126 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12127 if ($fields[$NUMERIC] eq 'N') {
12128 $fields[$NUMERIC] = $fields[$MIRRORED];
12129 $fields[$MIRRORED] = 'N';
12130 $_ = join ';', $code_point, @fields;
12135 sub filter_v6_ucd {
12137 # Unicode 6.0 co-opted the name BELL for U+1F514, but until 5.17,
12138 # it wasn't accepted, to allow for some deprecation cycles. This
12139 # function is not called after 5.16
12141 return if $_ !~ /^(?:0007|1F514|070F);/;
12143 my ($code_point, @fields) = split /\s*;\s*/, $_, -1;
12144 if ($code_point eq '0007') {
12145 $fields[$CHARNAME] = "";
12147 elsif ($code_point eq '070F') { # Unicode Corrigendum #8; see
12148 # http://www.unicode.org/versions/corrigendum8.html
12149 $fields[$BIDI] = "AL";
12151 elsif ($^V lt v5.18.0) { # For 5.18 will convert to use Unicode's name
12152 $fields[$CHARNAME] = "";
12155 $_ = join ';', $code_point, @fields;
12159 } # End closure for UnicodeData
12161 sub process_GCB_test($file) {
12163 while ($file->next_line) {
12164 push @backslash_X_tests, $_;
12170 sub process_LB_test($file) {
12172 while ($file->next_line) {
12173 push @LB_tests, $_;
12179 sub process_SB_test($file) {
12181 while ($file->next_line) {
12182 push @SB_tests, $_;
12188 sub process_WB_test($file) {
12190 while ($file->next_line) {
12191 push @WB_tests, $_;
12197 sub process_NamedSequences($file) {
12198 # NamedSequences.txt entries are just added to an array. Because these
12199 # don't look like the other tables, they have their own handler.
12201 # LATIN CAPITAL LETTER A WITH MACRON AND GRAVE;0100 0300
12203 # This just adds the sequence to an array for later handling
12205 while ($file->next_line) {
12206 my ($name, $sequence, @remainder) = split /\s*;\s*/, $_, -1;
12208 $file->carp_bad_line(
12209 "Doesn't look like 'KHMER VOWEL SIGN OM;17BB 17C6'");
12213 # Code points need to be 5 digits long like the other entries in
12214 # Name.pl, for regcomp.c parsing; and the ones below 0x0100 need to be
12215 # converted to native
12216 $sequence = join " ", map { sprintf("%05X",
12217 utf8::unicode_to_native(hex $_))
12218 } split / /, $sequence;
12219 push @named_sequences, "$sequence\n$name\n";
12228 sub filter_early_ea_lb {
12229 # Fixes early EastAsianWidth.txt and LineBreak.txt files. These had a
12230 # third field be the name of the code point, which can be ignored in
12231 # most cases. But it can be meaningful if it marks a range:
12232 # 33FE;W;IDEOGRAPHIC TELEGRAPH SYMBOL FOR DAY THIRTY-ONE
12233 # 3400;W;<CJK Ideograph Extension A, First>
12235 # We need to see the First in the example above to know it's a range.
12236 # They did not use the later range syntaxes. This routine changes it
12237 # to use the modern syntax.
12238 # $1 is the Input_file object.
12240 my @fields = split /\s*;\s*/;
12241 if ($fields[2] =~ /^<.*, First>/) {
12242 $first_range = $fields[0];
12245 elsif ($fields[2] =~ /^<.*, Last>/) {
12246 $_ = $_ = "$first_range..$fields[0]; $fields[1]";
12249 undef $first_range;
12250 $_ = "$fields[0]; $fields[1]";
12257 sub filter_substitute_lb {
12258 # Used on Unicodes that predate the LB property, where there is a
12259 # substitute file. This just does the regular ea_lb handling for such
12260 # files, and then substitutes the long property value name for the short
12261 # one that comes with the file. (The other break files have the long
12262 # names in them, so this is the odd one out.) The reason for doing this
12263 # kludge is that regen/mk_invlists.pl is expecting the long name. This
12264 # also fixes the typo 'Inseperable' that leads to problems.
12266 filter_early_ea_lb;
12269 my @fields = split /\s*;\s*/;
12270 $fields[1] = property_ref('_Perl_LB')->table($fields[1])->full_name;
12271 $fields[1] = 'Inseparable' if lc $fields[1] eq 'inseperable';
12272 $_ = join '; ', @fields;
12275 sub filter_old_style_arabic_shaping {
12276 # Early versions used a different term for the later one.
12278 my @fields = split /\s*;\s*/;
12279 $fields[3] =~ s/<no shaping>/No_Joining_Group/;
12280 $fields[3] =~ s/\s+/_/g; # Change spaces to underscores
12281 $_ = join ';', @fields;
12286 my $lc; # Table for lowercase mapping
12289 my %special_casing_code_points;
12291 sub setup_special_casing($file) {
12292 # SpecialCasing.txt contains the non-simple case change mappings. The
12293 # simple ones are in UnicodeData.txt, which should already have been
12294 # read in to the full property data structures, so as to initialize
12295 # these with the simple ones. Then the SpecialCasing.txt entries
12296 # add or overwrite the ones which have different full mappings.
12298 # This routine sees if the simple mappings are to be output, and if
12299 # so, copies what has already been put into the full mapping tables,
12300 # while they still contain only the simple mappings.
12302 # The reason it is done this way is that the simple mappings are
12303 # probably not going to be output, so it saves work to initialize the
12304 # full tables with the simple mappings, and then overwrite those
12305 # relatively few entries in them that have different full mappings,
12306 # and thus skip the simple mapping tables altogether.
12308 $lc = property_ref('lc');
12309 $tc = property_ref('tc');
12310 $uc = property_ref('uc');
12312 # For each of the case change mappings...
12313 foreach my $full_casing_table ($lc, $tc, $uc) {
12314 my $full_casing_name = $full_casing_table->name;
12315 my $full_casing_full_name = $full_casing_table->full_name;
12316 unless (defined $full_casing_table
12317 && ! $full_casing_table->is_empty)
12319 Carp::my_carp_bug("Need to process UnicodeData before SpecialCasing. Only special casing will be generated.");
12322 $full_casing_table->add_comment(join_lines( <<END
12323 This file includes both the simple and full case changing maps. The simple
12324 ones are in the main body of the table below, and the full ones adding to or
12325 overriding them are in the hash.
12329 # The simple version's name in each mapping merely has an 's' in
12330 # front of the full one's
12331 my $simple_name = 's' . $full_casing_name;
12332 my $simple = property_ref($simple_name);
12333 $simple->initialize($full_casing_table) if $simple->to_output_map();
12339 sub filter_2_1_8_special_casing_line {
12341 # This version had duplicate entries in this file. Delete all but the
12343 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12345 if (exists $special_casing_code_points{$fields[0]}) {
12350 $special_casing_code_points{$fields[0]} = 1;
12351 filter_special_casing_line(@_);
12354 sub filter_special_casing_line($file) {
12355 # Change the format of $_ from SpecialCasing.txt into something that
12356 # the generic handler understands. Each input line contains three
12357 # case mappings. This will generate three lines to pass to the
12358 # generic handler for each of those.
12360 # The input syntax (after stripping comments and trailing white space
12361 # is like one of the following (with the final two being entries that
12363 # 00DF; 00DF; 0053 0073; 0053 0053; # LATIN SMALL LETTER SHARP S
12364 # 03A3; 03C2; 03A3; 03A3; Final_Sigma;
12365 # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE
12366 # Note the trailing semi-colon, unlike many of the input files. That
12367 # means that there will be an extra null field generated by the split
12369 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null
12372 # field #4 is when this mapping is conditional. If any of these get
12373 # implemented, it would be by hard-coding in the casing functions in
12374 # the Perl core, not through tables. But if there is a new condition
12375 # we don't know about, output a warning. We know about all the
12376 # conditions through 6.0
12377 if ($fields[4] ne "") {
12378 my @conditions = split ' ', $fields[4];
12379 if ($conditions[0] ne 'tr' # We know that these languages have
12380 # conditions, and some are multiple
12381 && $conditions[0] ne 'az'
12382 && $conditions[0] ne 'lt'
12384 # And, we know about a single condition Final_Sigma, but
12386 && ($v_version gt v5.2.0
12387 && (@conditions > 1 || $conditions[0] ne 'Final_Sigma')))
12389 $file->carp_bad_line("Unknown condition '$fields[4]'. You should inspect it and either add code to handle it, or add to list of those that are to ignore");
12391 elsif ($conditions[0] ne 'Final_Sigma') {
12393 # Don't print out a message for Final_Sigma, because we
12394 # have hard-coded handling for it. (But the standard
12395 # could change what the rule should be, but it wouldn't
12396 # show up here anyway.
12398 print "# SKIPPING Special Casing: $_\n"
12399 if $verbosity >= $VERBOSE;
12404 elsif (@fields > 6 || (@fields == 6 && $fields[5] ne "" )) {
12405 $file->carp_bad_line('Extra fields');
12410 my $decimal_code_point = hex $fields[0];
12412 # Loop to handle each of the three mappings in the input line, in
12413 # order, with $i indicating the current field number.
12415 for my $object ($lc, $tc, $uc) {
12416 $i++; # First time through, $i = 0 ... 3rd time = 3
12418 my $value = $object->value_of($decimal_code_point);
12419 $value = ($value eq $CODE_POINT)
12420 ? $decimal_code_point
12423 # If this isn't a multi-character mapping, it should already have
12425 if ($fields[$i] !~ / /) {
12426 if ($value != hex $fields[$i]) {
12427 Carp::my_carp("Bad news. UnicodeData.txt thinks "
12429 . "(0x$fields[0]) is $value"
12430 . " and SpecialCasing.txt thinks it is "
12432 . ". Good luck. Retaining UnicodeData value, and proceeding anyway.");
12437 # The mapping is additional, beyond the simple mapping.
12438 $file->insert_adjusted_lines("$fields[0]; "
12442 . "$REPLACE_CMD=$MULTIPLE_BEFORE"
12448 # Everything has been handled by the insert_adjusted_lines()
12455 sub filter_old_style_case_folding($file) {
12456 # This transforms $_ containing the case folding style of 3.0.1, to 3.1
12457 # and later style. Different letters were used in the earlier.
12459 my @fields = split /\s*;\s*/;
12461 if ($fields[1] eq 'L') {
12462 $fields[1] = 'C'; # L => C always
12464 elsif ($fields[1] eq 'E') {
12465 if ($fields[2] =~ / /) { # E => C if one code point; F otherwise
12473 $file->carp_bad_line("Expecting L or E in second field");
12477 $_ = join("; ", @fields) . ';';
12481 { # Closure for case folding
12483 # Create the map for simple only if are going to output it, for otherwise
12484 # it takes no part in anything we do.
12485 my $to_output_simple;
12487 sub setup_case_folding {
12488 # Read in the case foldings in CaseFolding.txt. This handles both
12489 # simple and full case folding.
12492 = property_ref('Simple_Case_Folding')->to_output_map;
12494 if (! $to_output_simple) {
12495 property_ref('Case_Folding')->set_proxy_for('Simple_Case_Folding');
12498 # If we ever wanted to show that these tables were combined, a new
12499 # property method could be created, like set_combined_props()
12500 property_ref('Case_Folding')->add_comment(join_lines( <<END
12501 This file includes both the simple and full case folding maps. The simple
12502 ones are in the main body of the table below, and the full ones adding to or
12503 overriding them are in the hash.
12509 sub filter_case_folding_line($file) {
12510 # Called for each line in CaseFolding.txt
12511 # Input lines look like:
12512 # 0041; C; 0061; # LATIN CAPITAL LETTER A
12513 # 00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S
12514 # 1E9E; S; 00DF; # LATIN CAPITAL LETTER SHARP S
12516 # 'C' means that folding is the same for both simple and full
12517 # 'F' that it is only for full folding
12518 # 'S' that it is only for simple folding
12519 # 'T' is locale-dependent, and ignored
12520 # 'I' is a type of 'F' used in some early releases.
12521 # Note the trailing semi-colon, unlike many of the input files. That
12522 # means that there will be an extra null field generated by the split
12523 # below, which we ignore and hence is not an error.
12525 my ($range, $type, $map, @remainder) = split /\s*;\s*/, $_, -1;
12526 if (@remainder > 1 || (@remainder == 1 && $remainder[0] ne "" )) {
12527 $file->carp_bad_line('Extra fields');
12532 if ($type =~ / ^ [IT] $/x) { # Skip Turkic case folding, is locale dependent
12537 # C: complete, F: full, or I: dotted uppercase I -> dotless lowercase
12538 # I are all full foldings; S is single-char. For S, there is always
12539 # an F entry, so we must allow multiple values for the same code
12540 # point. Fortunately this table doesn't need further manipulation
12541 # which would preclude using multiple-values. The S is now included
12542 # so that _swash_inversion_hash() is able to construct closures
12543 # without having to worry about F mappings.
12544 if ($type eq 'C' || $type eq 'F' || $type eq 'I' || $type eq 'S') {
12545 $_ = "$range; Case_Folding; "
12546 . "$CMD_DELIM$REPLACE_CMD=$MULTIPLE_BEFORE$CMD_DELIM$map";
12550 $file->carp_bad_line('Expecting C F I S or T in second field');
12553 # C and S are simple foldings, but simple case folding is not needed
12554 # unless we explicitly want its map table output.
12555 if ($to_output_simple && $type eq 'C' || $type eq 'S') {
12556 $file->insert_adjusted_lines("$range; Simple_Case_Folding; $map");
12562 } # End case fold closure
12564 sub filter_jamo_line {
12565 # Filter Jamo.txt lines. This routine mainly is used to populate hashes
12566 # from this file that is used in generating the Name property for Jamo
12567 # code points. But, it also is used to convert early versions' syntax
12568 # into the modern form. Here are two examples:
12569 # 1100; G # HANGUL CHOSEONG KIYEOK # Modern syntax
12570 # U+1100; G; HANGUL CHOSEONG KIYEOK # 2.0 syntax
12572 # The input is $_, the output is $_ filtered.
12574 my @fields = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
12576 # Let the caller handle unexpected input. In earlier versions, there was
12577 # a third field which is supposed to be a comment, but did not have a '#'
12579 return if @fields > (($v_version gt v3.0.0) ? 2 : 3);
12581 $fields[0] =~ s/^U\+//; # Also, early versions had this extraneous
12584 # Some 2.1 versions had this wrong. Causes havoc with the algorithm.
12585 $fields[1] = 'R' if $fields[0] eq '1105';
12587 # Add to structure so can generate Names from it.
12588 my $cp = hex $fields[0];
12589 my $short_name = $fields[1];
12590 $Jamo{$cp} = $short_name;
12591 if ($cp <= $LBase + $LCount) {
12592 $Jamo_L{$short_name} = $cp - $LBase;
12594 elsif ($cp <= $VBase + $VCount) {
12595 $Jamo_V{$short_name} = $cp - $VBase;
12597 elsif ($cp <= $TBase + $TCount) {
12598 $Jamo_T{$short_name} = $cp - $TBase;
12601 Carp::my_carp_bug("Unexpected Jamo code point in $_");
12605 # Reassemble using just the first two fields to look like a typical
12606 # property file line
12607 $_ = "$fields[0]; $fields[1]";
12612 sub register_fraction($rational) {
12613 # This registers the input rational number so that it can be passed on to
12614 # Unicode::UCD, both in rational and floating forms.
12616 my $floating = eval $rational;
12618 my @floats = sprintf "%.*e", $E_FLOAT_PRECISION, $floating;
12620 # See if the denominator is a power of 2.
12621 $rational =~ m!.*/(.*)!;
12622 my $denominator = $1;
12623 if (defined $denominator && (($denominator & ($denominator - 1)) == 0)) {
12625 # Here the denominator is a power of 2. This means it has an exact
12626 # representation in binary, so rounding could go either way. It turns
12627 # out that Windows doesn't necessarily round towards even, so output
12628 # an extra entry. This happens when the final digit we output is even
12629 # and the next digits would be 50* to the precision of the machine.
12630 my $extra_digit_float = sprintf "%e", $floating;
12631 my $q = $E_FLOAT_PRECISION - 1;
12632 if ($extra_digit_float =~ / ( .* \. \d{$q} )
12633 ( [02468] ) 5 0* ( e .*)
12636 push @floats, $1 . ($2 + 1) . $3;
12640 foreach my $float (@floats) {
12641 # Strip off any leading zeros beyond 2 digits to make it C99
12642 # compliant. (Windows has 3 digit exponents, contrary to C99)
12643 $float =~ s/ ( .* e [-+] ) 0* ( \d{2,}? ) /$1$2/x;
12645 if ( defined $nv_floating_to_rational{$float}
12646 && $nv_floating_to_rational{$float} ne $rational)
12648 die Carp::my_carp_bug("Both '$rational' and"
12649 . " '$nv_floating_to_rational{$float}' evaluate to"
12650 . " the same floating point number."
12651 . " \$E_FLOAT_PRECISION must be increased");
12653 $nv_floating_to_rational{$float} = $rational;
12658 sub gcd($a, $b) { # Greatest-common-divisor; from
12659 # http://en.wikipedia.org/wiki/Euclidean_algorithm
12670 sub reduce_fraction($fraction_ref) {
12671 # Reduce a fraction to lowest terms. The Unicode data may be reducible,
12672 # hence this is needed. The argument is a reference to the
12673 # string denoting the fraction, which must be of the form:
12674 if ($$fraction_ref !~ / ^ (-?) (\d+) \/ (\d+) $ /ax) {
12675 Carp::my_carp_bug("Non-fraction input '$$fraction_ref'. Unchanged");
12680 my $numerator = $2;
12681 my $denominator = $3;
12685 # Find greatest common divisor
12686 my $gcd = gcd($numerator, $denominator);
12688 # And reduce using the gcd.
12690 $numerator /= $gcd;
12691 $denominator /= $gcd;
12692 $$fraction_ref = "$sign$numerator/$denominator";
12698 sub filter_numeric_value_line($file) {
12699 # DNumValues contains lines of a different syntax than the typical
12701 # 0F33 ; -0.5 ; ; -1/2 # No TIBETAN DIGIT HALF ZERO
12703 # This routine transforms $_ containing the anomalous syntax to the
12704 # typical, by filtering out the extra columns, and convert early version
12705 # decimal numbers to strings that look like rational numbers.
12707 # Starting in 5.1, there is a rational field. Just use that, omitting the
12708 # extra columns. Otherwise convert the decimal number in the second field
12709 # to a rational, and omit extraneous columns.
12710 my @fields = split /\s*;\s*/, $_, -1;
12713 if ($v_version ge v5.1.0) {
12714 if (@fields != 4) {
12715 $file->carp_bad_line('Not 4 semi-colon separated fields');
12719 reduce_fraction(\$fields[3]) if $fields[3] =~ qr{/};
12720 $rational = $fields[3];
12722 $_ = join '; ', @fields[ 0, 3 ];
12726 # Here, is an older Unicode file, which has decimal numbers instead of
12727 # rationals in it. Use the fraction to calculate the denominator and
12728 # convert to rational.
12730 if (@fields != 2 && @fields != 3) {
12731 $file->carp_bad_line('Not 2 or 3 semi-colon separated fields');
12736 my $codepoints = $fields[0];
12737 my $decimal = $fields[1];
12738 if ($decimal =~ s/\.0+$//) {
12740 # Anything ending with a decimal followed by nothing but 0's is an
12742 $_ = "$codepoints; $decimal";
12743 $rational = $decimal;
12748 if ($decimal =~ /\.50*$/) {
12752 # Here have the hardcoded repeating decimals in the fraction, and
12753 # the denominator they imply. There were only a few denominators
12754 # in the older Unicode versions of this file which this code
12755 # handles, so it is easy to convert them.
12757 # The 4 is because of a round-off error in the Unicode 3.2 files
12758 elsif ($decimal =~ /\.33*[34]$/ || $decimal =~ /\.6+7$/) {
12761 elsif ($decimal =~ /\.[27]50*$/) {
12764 elsif ($decimal =~ /\.[2468]0*$/) {
12767 elsif ($decimal =~ /\.16+7$/ || $decimal =~ /\.83+$/) {
12770 elsif ($decimal =~ /\.(12|37|62|87)50*$/) {
12773 if ($denominator) {
12774 my $sign = ($decimal < 0) ? "-" : "";
12775 my $numerator = int((abs($decimal) * $denominator) + .5);
12776 $rational = "$sign$numerator/$denominator";
12777 $_ = "$codepoints; $rational";
12780 $file->carp_bad_line("Can't cope with number '$decimal'.");
12787 register_fraction($rational) if $rational =~ qr{/};
12792 my %unihan_properties;
12794 sub construct_unihan($file_object) {
12796 return unless file_exists($file_object->file);
12798 if ($v_version lt v4.0.0) {
12799 push @cjk_properties, 'URS ; Unicode_Radical_Stroke';
12800 push @cjk_property_values, split "\n", <<'END';
12801 # @missing: 0000..10FFFF; Unicode_Radical_Stroke; <none>
12805 if ($v_version ge v3.0.0) {
12806 push @cjk_properties, split "\n", <<'END';
12807 cjkIRG_GSource; kIRG_GSource
12808 cjkIRG_JSource; kIRG_JSource
12809 cjkIRG_KSource; kIRG_KSource
12810 cjkIRG_TSource; kIRG_TSource
12811 cjkIRG_VSource; kIRG_VSource
12813 push @cjk_property_values, split "\n", <<'END';
12814 # @missing: 0000..10FFFF; cjkIRG_GSource; <none>
12815 # @missing: 0000..10FFFF; cjkIRG_JSource; <none>
12816 # @missing: 0000..10FFFF; cjkIRG_KSource; <none>
12817 # @missing: 0000..10FFFF; cjkIRG_TSource; <none>
12818 # @missing: 0000..10FFFF; cjkIRG_VSource; <none>
12821 if ($v_version ge v3.1.0) {
12822 push @cjk_properties, 'cjkIRG_HSource; kIRG_HSource';
12823 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_HSource; <none>';
12825 if ($v_version ge v3.1.1) {
12826 push @cjk_properties, 'cjkIRG_KPSource; kIRG_KPSource';
12827 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_KPSource; <none>';
12829 if ($v_version ge v3.2.0) {
12830 push @cjk_properties, split "\n", <<'END';
12831 cjkAccountingNumeric; kAccountingNumeric
12832 cjkCompatibilityVariant; kCompatibilityVariant
12833 cjkOtherNumeric; kOtherNumeric
12834 cjkPrimaryNumeric; kPrimaryNumeric
12836 push @cjk_property_values, split "\n", <<'END';
12837 # @missing: 0000..10FFFF; cjkAccountingNumeric; NaN
12838 # @missing: 0000..10FFFF; cjkCompatibilityVariant; <code point>
12839 # @missing: 0000..10FFFF; cjkOtherNumeric; NaN
12840 # @missing: 0000..10FFFF; cjkPrimaryNumeric; NaN
12843 if ($v_version gt v4.0.0) {
12844 push @cjk_properties, 'cjkIRG_USource; kIRG_USource';
12845 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIRG_USource; <none>';
12848 if ($v_version ge v4.1.0) {
12849 push @cjk_properties, 'cjkIICore ; kIICore';
12850 push @cjk_property_values, '# @missing: 0000..10FFFF; cjkIICore; <none>';
12855 # Do any special setup for Unihan properties.
12857 # This property gives the wrong computed type, so override.
12858 my $usource = property_ref('kIRG_USource');
12859 $usource->set_type($STRING) if defined $usource;
12861 # This property is to be considered binary (it says so in
12862 # http://www.unicode.org/reports/tr38/)
12863 my $iicore = property_ref('kIICore');
12864 if (defined $iicore) {
12865 $iicore->set_type($FORCED_BINARY);
12866 $iicore->table("Y")->add_note("Matches any code point which has a non-null value for this property; see unicode.org UAX #38.");
12868 # Unicode doesn't include the maps for this property, so don't
12869 # warn that they are missing.
12870 $iicore->set_pre_declared_maps(0);
12871 $iicore->add_comment(join_lines( <<END
12872 This property contains string values, but any non-empty ones are considered to
12873 be 'core', so Perl creates tables for both: 1) its string values, plus 2)
12874 tables so that \\p{kIICore} matches any code point which has a non-empty
12875 value for this property.
12883 sub filter_unihan_line {
12884 # Change unihan db lines to look like the others in the db. Here is
12886 # U+341C kCangjie IEKN
12888 # Tabs are used instead of semi-colons to separate fields; therefore
12889 # they may have semi-colons embedded in them. Change these to periods
12890 # so won't screw up the rest of the code.
12893 # Remove lines that don't look like ones we accept.
12894 if ($_ !~ /^ [^\t]* \t ( [^\t]* ) /x) {
12899 # Extract the property, and save a reference to its object.
12901 if (! exists $unihan_properties{$property}) {
12902 $unihan_properties{$property} = property_ref($property);
12905 # Don't do anything unless the property is one we're handling, which
12906 # we determine by seeing if there is an object defined for it or not
12907 if (! defined $unihan_properties{$property}) {
12912 # Convert the tab separators to our standard semi-colons, and convert
12913 # the U+HHHH notation to the rest of the standard's HHHH
12915 s/\b U \+ (?= $code_point_re )//xg;
12917 #local $to_trace = 1 if main::DEBUG;
12918 trace $_ if main::DEBUG && $to_trace;
12924 sub filter_blocks_lines($file) {
12925 # In the Blocks.txt file, the names of the blocks don't quite match the
12926 # names given in PropertyValueAliases.txt, so this changes them so they
12927 # do match: Blanks and hyphens are changed into underscores. Also makes
12928 # early release versions look like later ones
12930 # $_ is transformed to the correct value.
12932 if ($v_version lt v3.2.0) {
12933 if (/FEFF.*Specials/) { # Bug in old versions: line wrongly inserted
12938 # Old versions used a different syntax to mark the range.
12939 $_ =~ s/;\s+/../ if $v_version lt v3.1.0;
12942 my @fields = split /\s*;\s*/, $_, -1;
12943 if (@fields != 2) {
12944 $file->carp_bad_line("Expecting exactly two fields");
12949 # Change hyphens and blanks in the block name field only
12950 $fields[1] =~ s/[ -]/_/g;
12951 $fields[1] =~ s/_ ( [a-z] ) /_\u$1/xg; # Capitalize first letter of word
12953 $_ = join("; ", @fields);
12958 my $current_property;
12960 sub filter_old_style_proplist {
12961 # PropList.txt has been in Unicode since version 2.0. Until 3.1, it
12962 # was in a completely different syntax. Ken Whistler of Unicode says
12963 # that it was something he used as an aid for his own purposes, but
12964 # was never an official part of the standard. Many of the properties
12965 # in it were incorporated into the later PropList.txt, but some were
12966 # not. This program uses this early file to generate property tables
12967 # that are otherwise not accessible in the early UCD's. It does this
12968 # for the ones that eventually became official, and don't appear to be
12969 # too different in their contents from the later official version, and
12970 # throws away the rest. It could be argued that the ones it generates
12971 # were probably not really official at that time, so should be
12972 # ignored. You can easily modify things to skip all of them by
12973 # changing this function to just set $_ to "", and return; and to skip
12974 # certain of them by simply removing their declarations from
12975 # get_old_property_aliases().
12977 # Here is a list of all the ones that are thrown away:
12978 # Alphabetic The definitions for this are very
12979 # defective, so better to not mislead
12980 # people into thinking it works.
12981 # Instead the Perl extension of the
12982 # same name is constructed from first
12984 # Bidi=* duplicates UnicodeData.txt
12985 # Combining never made into official property;
12987 # Composite never made into official property.
12988 # Currency Symbol duplicates UnicodeData.txt: gc=sc
12989 # Decimal Digit duplicates UnicodeData.txt: gc=nd
12990 # Delimiter never made into official property;
12992 # Format Control never made into official property;
12994 # High Surrogate duplicates Blocks.txt
12995 # Ignorable Control never made into official property;
12997 # ISO Control duplicates UnicodeData.txt: gc=cc
12998 # Left of Pair never made into official property;
12999 # Line Separator duplicates UnicodeData.txt: gc=zl
13000 # Low Surrogate duplicates Blocks.txt
13001 # Non-break was actually listed as a property
13002 # in 3.2, but without any code
13003 # points. Unicode denies that this
13004 # was ever an official property
13005 # Non-spacing duplicate UnicodeData.txt: gc=mn
13006 # Numeric duplicates UnicodeData.txt: gc=cc
13007 # Paired Punctuation never made into official property;
13008 # appears to be gc=ps + gc=pe
13009 # Paragraph Separator duplicates UnicodeData.txt: gc=cc
13010 # Private Use duplicates UnicodeData.txt: gc=co
13011 # Private Use High Surrogate duplicates Blocks.txt
13012 # Punctuation duplicates UnicodeData.txt: gc=p
13013 # Space different definition than eventual
13015 # Titlecase duplicates UnicodeData.txt: gc=lt
13016 # Unassigned Code Value duplicates UnicodeData.txt: gc=cn
13017 # Zero-width never made into official property;
13019 # Most of the properties have the same names in this file as in later
13020 # versions, but a couple do not.
13022 # This subroutine filters $_, converting it from the old style into
13023 # the new style. Here's a sample of the old-style
13025 # *******************************************
13027 # Property dump for: 0x100000A0 (Join Control)
13029 # 200C..200D (2 chars)
13031 # In the example, the property is "Join Control". It is kept in this
13032 # closure between calls to the subroutine. The numbers beginning with
13033 # 0x were internal to Ken's program that generated this file.
13035 # If this line contains the property name, extract it.
13036 if (/^Property dump for: [^(]*\((.*)\)/) {
13039 # Convert white space to underscores.
13042 # Convert the few properties that don't have the same name as
13043 # their modern counterparts
13044 s/Identifier_Part/ID_Continue/
13045 or s/Not_a_Character/NChar/;
13047 # If the name matches an existing property, use it.
13048 if (defined property_ref($_)) {
13049 trace "new property=", $_ if main::DEBUG && $to_trace;
13050 $current_property = $_;
13052 else { # Otherwise discard it
13053 trace "rejected property=", $_ if main::DEBUG && $to_trace;
13054 undef $current_property;
13056 $_ = ""; # The property is saved for the next lines of the
13057 # file, but this defining line is of no further use,
13058 # so clear it so that the caller won't process it
13061 elsif (! defined $current_property || $_ !~ /^$code_point_re/) {
13063 # Here, the input line isn't a header defining a property for the
13064 # following section, and either we aren't in such a section, or
13065 # the line doesn't look like one that defines the code points in
13066 # such a section. Ignore this line.
13071 # Here, we have a line defining the code points for the current
13072 # stashed property. Anything starting with the first blank is
13073 # extraneous. Otherwise, it should look like a normal range to
13074 # the caller. Append the property name so that it looks just like
13075 # a modern PropList entry.
13078 $_ .= "; $current_property";
13080 trace $_ if main::DEBUG && $to_trace;
13083 } # End closure for old style proplist
13085 sub filter_old_style_normalization_lines {
13086 # For early releases of Unicode, the lines were like:
13087 # 74..2A76 ; NFKD_NO
13088 # For later releases this became:
13089 # 74..2A76 ; NFKD_QC; N
13090 # Filter $_ to look like those in later releases.
13091 # Similarly for MAYBEs
13093 s/ _NO \b /_QC; N/x || s/ _MAYBE \b /_QC; M/x;
13095 # Also, the property FC_NFKC was abbreviated to FNC
13100 sub setup_script_extensions {
13101 # The Script_Extensions property starts out with a clone of the Script
13104 $scx = property_ref("Script_Extensions");
13105 return unless defined $scx;
13107 $scx->_set_format($STRING_WHITE_SPACE_LIST);
13108 $scx->initialize($script);
13109 $scx->set_default_map($script->default_map);
13110 $scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
13111 $scx->add_comment(join_lines( <<END
13112 The values for code points that appear in one script are just the same as for
13113 the 'Script' property. Likewise the values for those that appear in many
13114 scripts are either 'Common' or 'Inherited', same as with 'Script'. But the
13115 values of code points that appear in a few scripts are a space separated list
13120 # Initialize scx's tables and the aliases for them to be the same as sc's
13121 foreach my $table ($script->tables) {
13122 my $scx_table = $scx->add_match_table($table->name,
13123 Full_Name => $table->full_name);
13124 foreach my $alias ($table->aliases) {
13125 $scx_table->add_alias($alias->name);
13130 sub filter_script_extensions_line {
13131 # The Scripts file comes with the full name for the scripts; the
13132 # ScriptExtensions, with the short name. The final mapping file is a
13133 # combination of these, and without adjustment, would have inconsistent
13134 # entries. This filters the latter file to convert to full names.
13135 # Entries look like this:
13136 # 064B..0655 ; Arab Syrc # Mn [11] ARABIC FATHATAN..ARABIC HAMZA BELOW
13138 my @fields = split /\s*;\s*/;
13140 # This script was erroneously omitted in this Unicode version.
13141 $fields[1] .= ' Takr' if $v_version eq v6.1.0 && $fields[0] =~ /^0964/;
13144 foreach my $short_name (split " ", $fields[1]) {
13145 push @full_names, $script->table($short_name)->full_name;
13147 $fields[1] = join " ", @full_names;
13148 $_ = join "; ", @fields;
13153 sub setup_emojidata {
13154 my $prop_ref = Property->new('ExtPict',
13155 Full_Name => 'Extended_Pictographic',
13157 $prop_ref->set_fate($PLACEHOLDER,
13158 "Not part of the Unicode Character Database");
13161 sub filter_emojidata_line {
13162 # We only are interested in this single property from this non-UCD data
13163 # file, and we turn it into a Perl property, so that it isn't accessible
13166 $_ = "" unless /\bExtended_Pictographic\b/;
13171 sub setup_IdStatus {
13172 my $ids = Property->new('Identifier_Status',
13173 Match_SubDir => 'IdStatus',
13174 Default_Map => 'Restricted',
13176 $ids->add_match_table('Allowed');
13180 $idt = Property->new('Identifier_Type',
13181 Match_SubDir => 'IdType',
13182 Default_Map => 'Not_Character',
13183 Format => $STRING_WHITE_SPACE_LIST,
13187 sub filter_IdType_line {
13189 # Some code points have more than one type, separated by spaces on the
13190 # input. For now, we just add everything as a property value. Later when
13191 # we look for properties with format $STRING_WHITE_SPACE_LIST, we resolve
13194 my @fields = split /\s*;\s*/;
13195 my $types = $fields[1];
13196 $idt->add_match_table($types) unless defined $idt->table($types);
13201 sub generate_hst($file) {
13203 # Populates the Hangul Syllable Type property from first principles
13205 # These few ranges are hard-coded in.
13206 $file->insert_lines(split /\n/, <<'END'
13214 # The Hangul syllables in version 1 are at different code points than
13215 # those that came along starting in version 2, and have different names;
13216 # they comprise about 60% of the code points of the later version.
13217 # From my (khw) research on them (see <558493EB.4000807@att.net>), the
13218 # initial set is a subset of the later version, with different English
13219 # transliterations. I did not see an easy mapping between them. The
13220 # later set includes essentially all possibilities, even ones that aren't
13221 # in modern use (if they ever were), and over 96% of the new ones are type
13222 # LVT. Mathematically, the early set must also contain a preponderance of
13223 # LVT values. In lieu of doing nothing, we just set them all to LVT, and
13224 # expect that this will be right most of the time, which is better than
13225 # not being right at all.
13226 if ($v_version lt v2.0.0) {
13227 my $property = property_ref($file->property);
13228 $file->insert_lines(sprintf("%04X..%04X; LVT\n",
13229 $FIRST_REMOVED_HANGUL_SYLLABLE,
13230 $FINAL_REMOVED_HANGUL_SYLLABLE));
13231 push @tables_that_may_be_empty, $property->table('LV')->complete_name;
13235 # The algorithmically derived syllables are almost all LVT ones, so
13236 # initialize the whole range with that.
13237 $file->insert_lines(sprintf "%04X..%04X; LVT\n",
13238 $SBase, $SBase + $SCount -1);
13240 # Those ones that aren't LVT are LV, and they occur at intervals of
13241 # $TCount code points, starting with the first code point, at $SBase.
13242 for (my $i = $SBase; $i < $SBase + $SCount; $i += $TCount) {
13243 $file->insert_lines(sprintf "%04X..%04X; LV\n", $i, $i);
13249 sub generate_GCB($file) {
13251 # Populates the Grapheme Cluster Break property from first principles
13253 # All these definitions are from
13254 # http://www.unicode.org/reports/tr29/tr29-3.html with confirmation
13255 # from http://www.unicode.org/reports/tr29/tr29-4.html
13257 foreach my $range ($gc->ranges) {
13259 # Extend includes gc=Me and gc=Mn, while Control includes gc=Cc
13261 if ($range->value =~ / ^ M [en] $ /x) {
13262 $file->insert_lines(sprintf "%04X..%04X; Extend",
13263 $range->start, $range->end);
13265 elsif ($range->value =~ / ^ C [cf] $ /x) {
13266 $file->insert_lines(sprintf "%04X..%04X; Control",
13267 $range->start, $range->end);
13270 $file->insert_lines("2028; Control"); # Line Separator
13271 $file->insert_lines("2029; Control"); # Paragraph Separator
13273 $file->insert_lines("000D; CR");
13274 $file->insert_lines("000A; LF");
13276 # Also from http://www.unicode.org/reports/tr29/tr29-3.html.
13277 foreach my $code_point ( qw{
13278 09BE 09D7 0B3E 0B57 0BBE 0BD7 0CC2 0CD5 0CD6
13279 0D3E 0D57 0DCF 0DDF FF9E FF9F 1D165 1D16E 1D16F
13282 my $category = $gc->value_of(hex $code_point);
13283 next if ! defined $category || $category eq 'Cn'; # But not if
13284 # unassigned in this
13286 $file->insert_lines("$code_point; Extend");
13289 my $hst = property_ref('Hangul_Syllable_Type');
13290 if ($hst->count > 0) {
13291 foreach my $range ($hst->ranges) {
13292 $file->insert_lines(sprintf "%04X..%04X; %s",
13293 $range->start, $range->end, $range->value);
13297 generate_hst($file);
13300 main::process_generic_property_file($file);
13304 sub fixup_early_perl_name_alias($file) {
13306 # Different versions of Unicode have varying support for the name synonyms
13307 # below. Just include everything. As of 6.1, all these are correct in
13308 # the Unicode-supplied file.
13310 # ALERT did not come along until 6.0, at which point it became preferred
13311 # over BELL. By inserting it last in early releases, BELL is preferred
13312 # over it; and vice-vers in 6.0
13313 my $type_for_bell = ($v_version lt v6.0.0)
13316 $file->insert_lines(split /\n/, <<END
13317 0007;BELL; $type_for_bell
13318 000A;LINE FEED (LF);alternate
13319 000C;FORM FEED (FF);alternate
13320 000D;CARRIAGE RETURN (CR);alternate
13321 0085;NEXT LINE (NEL);alternate
13326 # One might think that the 'Unicode_1_Name' field, could work for most
13327 # of the above names, but sadly that field varies depending on the
13328 # release. Version 1.1.5 had no names for any of the controls; Version
13329 # 2.0 introduced names for the C0 controls, and 3.0 introduced C1 names.
13330 # 3.0.1 removed the name INDEX; and 3.2 changed some names:
13331 # changed to parenthesized versions like "NEXT LINE" to
13332 # "NEXT LINE (NEL)";
13333 # changed PARTIAL LINE DOWN to PARTIAL LINE FORWARD
13334 # changed PARTIAL LINE UP to PARTIAL LINE BACKWARD;;
13335 # changed e.g. FILE SEPARATOR to INFORMATION SEPARATOR FOUR
13337 # All these are present in the 6.1 NameAliases.txt
13342 sub filter_later_version_name_alias_line {
13344 # This file has an extra entry per line for the alias type. This is
13345 # handled by creating a compound entry: "$alias: $type"; First, split
13346 # the line into components.
13347 my ($range, $alias, $type, @remainder)
13348 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13350 # This file contains multiple entries for some components, so tell the
13351 # downstream code to allow this in our internal tables; the
13352 # $MULTIPLE_AFTER preserves the input ordering.
13353 $_ = join ";", $range, $CMD_DELIM
13363 sub filter_early_version_name_alias_line {
13365 # Early versions did not have the trailing alias type field; implicitly it
13366 # was 'correction'.
13367 $_ .= "; correction";
13369 filter_later_version_name_alias_line;
13373 sub filter_all_caps_script_names {
13375 # Some early Unicode releases had the script names in all CAPS. This
13376 # converts them to just the first letter of each word being capital.
13378 my ($range, $script, @remainder)
13379 = split /\s*;\s*/, $_, -1; # -1 => retain trailing null fields
13380 my @words = split /[_-]/, $script;
13381 for my $word (@words) {
13383 ucfirst(lc($word)) if $word ne 'CJK';
13385 $script = join "_", @words;
13386 $_ = join ";", $range, $script, @remainder;
13389 sub finish_Unicode() {
13390 # This routine should be called after all the Unicode files have been read
13392 # 1) Creates properties that are missing from the version of Unicode being
13393 # compiled, and which, for whatever reason, are needed for the Perl
13394 # core to function properly. These are minimally populated as
13396 # 2) Adds the mappings for code points missing from the files which have
13397 # defaults specified for them.
13398 # 3) At this point all mappings are known, so it computes the type of
13399 # each property whose type hasn't been determined yet.
13400 # 4) Calculates all the regular expression match tables based on the
13402 # 5) Calculates and adds the tables which are defined by Unicode, but
13403 # which aren't derived by them, and certain derived tables that Perl
13406 # Folding information was introduced later into Unicode data. To get
13407 # Perl's case ignore (/i) to work at all in releases that don't have
13408 # folding, use the best available alternative, which is lower casing.
13409 my $fold = property_ref('Case_Folding');
13410 if ($fold->is_empty) {
13411 $fold->initialize(property_ref('Lowercase_Mapping'));
13412 $fold->add_note(join_lines(<<END
13413 WARNING: This table uses lower case as a substitute for missing fold
13419 # Multiple-character mapping was introduced later into Unicode data, so it
13420 # is by default the simple version. If to output the simple versions and
13421 # not present, just use the regular (which in these Unicode versions is
13422 # the simple as well).
13423 foreach my $map (qw { Uppercase_Mapping
13429 my $comment = <<END;
13431 Note that although the Perl core uses this file, it has the standard values
13432 for code points from U+0000 to U+00FF compiled in, so changing this table will
13433 not change the core's behavior with respect to these code points. Use
13434 Unicode::Casing to override this table.
13436 if ($map eq 'Case_Folding') {
13438 (/i regex matching is not overridable except by using a custom regex engine)
13441 property_ref($map)->add_comment(join_lines($comment));
13442 my $simple = property_ref("Simple_$map");
13443 next if ! $simple->is_empty;
13444 if ($simple->to_output_map) {
13445 $simple->initialize(property_ref($map));
13448 property_ref($map)->set_proxy_for($simple->name);
13452 # For each property, fill in any missing mappings, and calculate the re
13453 # match tables. If a property has more than one missing mapping, the
13454 # default is a reference to a data structure, and may require data from
13455 # other properties to resolve. The sort is used to cause these to be
13456 # processed last, after all the other properties have been calculated.
13457 # (Fortunately, the missing properties so far don't depend on each other.)
13458 foreach my $property
13459 (sort { (defined $a->default_map && ref $a->default_map) ? 1 : -1 }
13462 # $perl has been defined, but isn't one of the Unicode properties that
13463 # need to be finished up.
13464 next if $property == $perl;
13466 # Nor do we need to do anything with properties that aren't going to
13468 next if $property->fate == $SUPPRESSED;
13470 # Handle the properties that have more than one possible default
13471 if (ref $property->default_map) {
13472 my $default_map = $property->default_map;
13474 # These properties have stored in the default_map:
13476 # 1) A default map which applies to all code points in a
13478 # 2) an expression which will evaluate to the list of code
13479 # points in that class
13481 # 3) the default map which applies to every other missing code
13484 # Go through each list.
13485 while (my ($default, $eval) = $default_map->get_next_defaults) {
13486 last unless defined $eval;
13488 # Get the class list, and intersect it with all the so-far
13489 # unspecified code points yielding all the code points
13490 # in the class that haven't been specified.
13491 my $list = eval $eval;
13493 Carp::my_carp("Can't set some defaults for missing code points for $property because eval '$eval' failed with '$@'");
13497 # Narrow down the list to just those code points we don't have
13499 $list = $list & $property->inverse_list;
13501 # Add mappings to the property for each code point in the list
13502 foreach my $range ($list->ranges) {
13503 $property->add_map($range->start, $range->end, $default,
13508 # All remaining code points have the other mapping. Set that up
13509 # so the normal single-default mapping code will work on them
13510 $property->set_default_map($default_map->other_default);
13512 # And fall through to do that
13515 # We should have enough data now to compute the type of the property.
13516 my $property_name = $property->name;
13517 $property->compute_type;
13518 my $property_type = $property->type;
13520 next if ! $property->to_create_match_tables;
13522 # Here want to create match tables for this property
13524 # The Unicode db always (so far, and they claim into the future) have
13525 # the default for missing entries in binary properties be 'N' (unless
13526 # there is a '@missing' line that specifies otherwise)
13527 if (! defined $property->default_map) {
13528 if ($property_type == $BINARY) {
13529 $property->set_default_map('N');
13531 elsif ($property_type == $ENUM) {
13532 Carp::my_carp("Property '$property_name doesn't have a default mapping. Using a fake one");
13533 $property->set_default_map('XXX This makes sure there is a default map');
13537 # Add any remaining code points to the mapping, using the default for
13538 # missing code points.
13540 my $default_map = $property->default_map;
13541 if ($property_type == $FORCED_BINARY) {
13543 # A forced binary property creates a 'Y' table that matches all
13544 # non-default values. The actual string values are also written out
13545 # as a map table. (The default value will almost certainly be the
13546 # empty string, so the pod glosses over the distinction, and just
13547 # talks about empty vs non-empty.)
13548 my $yes = $property->table("Y");
13549 foreach my $range ($property->ranges) {
13550 next if $range->value eq $default_map;
13551 $yes->add_range($range->start, $range->end);
13553 $property->table("N")->set_complement($yes);
13556 if (defined $default_map) {
13558 # Make sure there is a match table for the default
13559 if (! defined ($default_table = $property->table($default_map)))
13561 $default_table = $property->add_match_table($default_map);
13564 # And, if the property is binary, the default table will just
13565 # be the complement of the other table.
13566 if ($property_type == $BINARY) {
13567 my $non_default_table;
13569 # Find the non-default table.
13570 for my $table ($property->tables) {
13571 if ($table == $default_table) {
13572 if ($v_version le v5.0.0) {
13573 $table->add_alias($_) for qw(N No F False);
13576 } elsif ($v_version le v5.0.0) {
13577 $table->add_alias($_) for qw(Y Yes T True);
13579 $non_default_table = $table;
13581 $default_table->set_complement($non_default_table);
13585 # This fills in any missing values with the default. It's
13586 # not necessary to do this with binary properties, as the
13587 # default is defined completely in terms of the Y table.
13588 $property->add_map(0, $MAX_WORKING_CODEPOINT,
13589 $default_map, Replace => $NO);
13593 # Have all we need to populate the match tables.
13594 my $maps_should_be_defined = $property->pre_declared_maps;
13595 foreach my $range ($property->ranges) {
13596 my $map = $range->value;
13597 my $table = $property->table($map);
13598 if (! defined $table) {
13600 # Integral and rational property values are not
13601 # necessarily defined in PropValueAliases, but whether all
13602 # the other ones should be depends on the property.
13603 if ($maps_should_be_defined
13604 && $map !~ /^ -? \d+ ( \/ \d+ )? $/x)
13606 Carp::my_carp("Table '$property_name=$map' should "
13607 . "have been defined. Defining it now.")
13609 $table = $property->add_match_table($map);
13612 next if $table->complement != 0; # Don't need to populate these
13613 $table->add_range($range->start, $range->end);
13617 # For Perl 5.6 compatibility, all properties matchable in regexes can
13618 # have an optional 'Is_' prefix. This is now done in Unicode::UCD.
13619 # But warn if this creates a conflict with a (new) Unicode property
13620 # name, although it appears that Unicode has made a decision never to
13621 # begin a property name with 'Is_', so this shouldn't happen.
13622 foreach my $alias ($property->aliases) {
13623 my $Is_name = 'Is_' . $alias->name;
13624 if (defined (my $pre_existing = property_ref($Is_name))) {
13625 Carp::my_carp(<<END
13626 There is already an alias named $Is_name (from " . $pre_existing . "), so
13627 creating one for $property won't work. This is bad news. If it is not too
13628 late, get Unicode to back off. Otherwise go back to the old scheme (findable
13629 from the git blame log for this area of the code that suppressed individual
13630 aliases that conflict with the new Unicode names. Proceeding anyway.
13634 } # End of loop through aliases for this property
13637 # Properties that have sets of values for some characters are now
13638 # converted. For example, the Script_Extensions property started out
13639 # as a clone of the Script property. But processing its data file
13640 # caused some elements to be replaced with different data. (These
13641 # elements were for the Common and Inherited properties.) This data
13642 # is a qw() list of all the scripts that the code points in the given
13643 # range are in. An example line is:
13645 # 060C ; Arab Syrc Thaa # Po ARABIC COMMA
13647 # Code executed earlier has created a new match table named "Arab Syrc
13648 # Thaa" which contains 060C. (The cloned table started out with this
13649 # code point mapping to "Common".) Now we add 060C to each of the
13650 # Arab, Syrc, and Thaa match tables. Then we delete the now spurious
13651 # "Arab Syrc Thaa" match table. This is repeated for all these tables
13652 # and ranges. The map data is retained in the map table for
13653 # reference, but the spurious match tables are deleted.
13654 my $format = $property->format;
13655 if (defined $format && $format eq $STRING_WHITE_SPACE_LIST) {
13656 foreach my $table ($property->tables) {
13658 # Space separates the entries which should go in multiple
13660 next unless $table->name =~ /\s/;
13662 # The list of the entries, hence the names of the tables that
13663 # everything in this combo table should be added to.
13664 my @list = split /\s+/, $table->name;
13666 # Add the entries from the combo table to each individual
13668 foreach my $individual (@list) {
13669 my $existing_table = $property->table($individual);
13671 # This should only be necessary if this particular entry
13672 # occurs only in combo with others.
13673 $existing_table = $property->add_match_table($individual)
13674 unless defined $existing_table;
13675 $existing_table += $table;
13677 $property->delete_match_table($table);
13680 } # End of loop through all Unicode properties.
13682 # Fill in the mappings that Unicode doesn't completely furnish. First the
13683 # single letter major general categories. If Unicode were to start
13684 # delivering the values, this would be redundant, but better that than to
13685 # try to figure out if should skip and not get it right. Ths could happen
13686 # if a new major category were to be introduced, and the hard-coded test
13687 # wouldn't know about it.
13688 # This routine depends on the standard names for the general categories
13689 # being what it thinks they are, like 'Cn'. The major categories are the
13690 # union of all the general category tables which have the same first
13691 # letters. eg. L = Lu + Lt + Ll + Lo + Lm
13692 foreach my $minor_table ($gc->tables) {
13693 my $minor_name = $minor_table->name;
13694 next if length $minor_name == 1;
13695 if (length $minor_name != 2) {
13696 Carp::my_carp_bug("Unexpected general category '$minor_name'. Skipped.");
13700 my $major_name = uc(substr($minor_name, 0, 1));
13701 my $major_table = $gc->table($major_name);
13702 $major_table += $minor_table;
13705 # LC is Ll, Lu, and Lt. (used to be L& or L_, but PropValueAliases.txt
13706 # defines it as LC)
13707 my $LC = $gc->table('LC');
13708 $LC->add_alias('L_', Status => $DISCOURAGED); # For backwards...
13709 $LC->add_alias('L&', Status => $DISCOURAGED); # compatibility.
13712 if ($LC->is_empty) { # Assume if not empty that Unicode has started to
13713 # deliver the correct values in it
13714 $LC->initialize($gc->table('Ll') + $gc->table('Lu'));
13716 # Lt not in release 1.
13717 if (defined $gc->table('Lt')) {
13718 $LC += $gc->table('Lt');
13719 $gc->table('Lt')->set_caseless_equivalent($LC);
13722 $LC->add_description('[\p{Ll}\p{Lu}\p{Lt}]');
13724 $gc->table('Ll')->set_caseless_equivalent($LC);
13725 $gc->table('Lu')->set_caseless_equivalent($LC);
13727 # Make sure this assumption in perl core code is valid in this Unicode
13728 # release, with known exceptions
13729 foreach my $range (property_ref('Numeric-Type')->table('Decimal')->ranges) {
13730 next if $range->end - $range->start == 9;
13731 next if $range->start == 0x1D7CE; # This whole range was added in 3.1
13732 next if $range->end == 0x19DA && $v_version eq v5.2.0;
13733 next if $range->end - $range->start < 9 && $v_version le 4.0.0;
13734 Carp::my_carp("Range $range unexpectedly doesn't contain 10"
13735 . " decimal digits. Code in regcomp.c assumes it does,"
13736 . " and will have to be fixed. Proceeding anyway.");
13739 # Mark the scx table as the parent of the corresponding sc table for those
13740 # which are identical. This causes the pod for the script table to refer
13741 # to the corresponding scx one. This is done after everything, so as to
13742 # wait until the tables are stabilized before checking for equivalency.
13743 if (defined $scx) {
13744 if (defined $pod_directory) {
13745 foreach my $table ($scx->tables) {
13746 my $plain_sc_equiv = $script->table($table->name);
13747 if ($table->matches_identically_to($plain_sc_equiv)) {
13748 $plain_sc_equiv->set_equivalent_to($table, Related => 1);
13757 sub pre_3_dot_1_Nl () {
13759 # Return a range list for gc=nl for Unicode versions prior to 3.1, which
13760 # is when Unicode's became fully usable. These code points were
13761 # determined by inspection and experimentation. gc=nl is important for
13762 # certain Perl-extension properties that should be available in all
13765 my $Nl = Range_List->new();
13766 if (defined (my $official = $gc->table('Nl'))) {
13770 $Nl->add_range(0x2160, 0x2182);
13771 $Nl->add_range(0x3007, 0x3007);
13772 $Nl->add_range(0x3021, 0x3029);
13774 $Nl->add_range(0xFE20, 0xFE23);
13775 $Nl->add_range(0x16EE, 0x16F0) if $v_version ge v3.0.0; # 3.0 was when
13780 sub calculate_Assigned() { # Set $Assigned to the gc != Cn code points; may be
13781 # called before the Cn's are completely filled.
13782 # Works on Unicodes earlier than ones that
13783 # explicitly specify Cn.
13784 return if defined $Assigned;
13786 if (! defined $gc || $gc->is_empty()) {
13787 Carp::my_carp_bug("calculate_Assigned() called before $gc is populated");
13790 $Assigned = $perl->add_match_table('Assigned',
13791 Description => "All assigned code points",
13793 while (defined (my $range = $gc->each_range())) {
13794 my $standard_value = standardize($range->value);
13795 next if $standard_value eq 'cn' || $standard_value eq 'unassigned';
13796 $Assigned->add_range($range->start, $range->end);
13800 sub calculate_DI() { # Set $DI to a Range_List equivalent to the
13801 # Default_Ignorable_Code_Point property. Works on
13802 # Unicodes earlier than ones that explicitly specify
13804 return if defined $DI;
13806 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
13807 $DI = $di->table('Y');
13810 $DI = Range_List->new(Initialize => [ 0x180B .. 0x180D,
13815 if ($v_version ge v2.0) {
13816 $DI += $gc->table('Cf')
13817 + $gc->table('Cs');
13819 # These are above the Unicode version 1 max
13820 $DI->add_range(0xE0000, 0xE0FFF);
13822 $DI += $gc->table('Cc')
13824 - utf8::unicode_to_native(0x0A) # LINE FEED
13825 - utf8::unicode_to_native(0x0B) # VERTICAL TAB
13827 - utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
13828 - utf8::unicode_to_native(0x85); # NEL
13832 sub calculate_NChar() { # Create a Perl extension match table which is the
13833 # same as the Noncharacter_Code_Point property, and
13834 # set $NChar to point to it. Works on Unicodes
13835 # earlier than ones that explicitly specify NChar
13836 return if defined $NChar;
13838 $NChar = $perl->add_match_table('_Perl_Nchar',
13839 Perl_Extension => 1,
13840 Fate => $INTERNAL_ONLY);
13841 if (defined (my $off_nchar = property_ref('NChar'))) {
13842 $NChar->initialize($off_nchar->table('Y'));
13845 $NChar->initialize([ 0xFFFE .. 0xFFFF ]);
13846 if ($v_version ge v2.0) { # First release with these nchars
13847 for (my $i = 0x1FFFE; $i <= 0x10FFFE; $i += 0x10000) {
13848 $NChar += [ $i .. $i+1 ];
13854 sub handle_compare_versions () {
13855 # This fixes things up for the $compare_versions capability, where we
13856 # compare Unicode version X with version Y (with Y > X), and we are
13857 # running it on the Unicode Data for version Y.
13859 # It works by calculating the code points whose meaning has been specified
13860 # after release X, by using the Age property. The complement of this set
13861 # is the set of code points whose meaning is unchanged between the
13862 # releases. This is the set the program restricts itself to. It includes
13863 # everything whose meaning has been specified by the time version X came
13864 # along, plus those still unassigned by the time of version Y. (We will
13865 # continue to use the word 'assigned' to mean 'meaning has been
13866 # specified', as it's shorter and is accurate in all cases except the
13867 # Noncharacter code points.)
13869 # This function is run after all the properties specified by Unicode have
13870 # been calculated for release Y. This makes sure we get all the nuances
13871 # of Y's rules. (It is done before the Perl extensions are calculated, as
13872 # those are based entirely on the Unicode ones.) But doing it after the
13873 # Unicode table calculations means we have to fix up the Unicode tables.
13874 # We do this by subtracting the code points that have been assigned since
13875 # X (which is actually done by ANDing each table of assigned code points
13876 # with the set of unchanged code points). Most Unicode properties are of
13877 # the form such that all unassigned code points have a default, grab-bag,
13878 # property value which is changed when the code point gets assigned. For
13879 # these, we just remove the changed code points from the table for the
13880 # latter property value, and add them back in to the grab-bag one. A few
13881 # other properties are not entirely of this form and have values for some
13882 # or all unassigned code points that are not the grab-bag one. These have
13883 # to be handled specially, and are hard-coded in to this routine based on
13884 # manual inspection of the Unicode character database. A list of the
13885 # outlier code points is made for each of these properties, and those
13886 # outliers are excluded from adding and removing from tables.
13888 # Note that there are glitches when comparing against Unicode 1.1, as some
13889 # Hangul syllables in it were later ripped out and eventually replaced
13890 # with other things.
13892 print "Fixing up for version comparison\n" if $verbosity >= $PROGRESS;
13894 my $after_first_version = "All matching code points were added after "
13895 . "Unicode $string_compare_versions";
13897 # Calculate the delta as those code points that have been newly assigned
13898 # since the first compare version.
13899 my $delta = Range_List->new();
13900 foreach my $table ($age->tables) {
13902 next if $table == $age->table('Unassigned');
13903 next if version->parse($table->name)
13904 le version->parse($string_compare_versions);
13907 if ($delta->is_empty) {
13908 die ("No changes; perhaps you need a 'DAge.txt' file?");
13911 my $unchanged = ~ $delta;
13913 calculate_Assigned() if ! defined $Assigned;
13914 $Assigned &= $unchanged;
13916 # $Assigned now contains the code points that were assigned as of Unicode
13919 # A block is all or nothing. If nothing is assigned in it, it all goes
13920 # back to the No_Block pool; but if even one code point is assigned, the
13921 # block is retained.
13922 my $no_block = $block->table('No_Block');
13923 foreach my $this_block ($block->tables) {
13924 next if $this_block == $no_block
13925 || ! ($this_block & $Assigned)->is_empty;
13926 $this_block->set_fate($SUPPRESSED, $after_first_version);
13927 foreach my $range ($this_block->ranges) {
13928 $block->replace_map($range->start, $range->end, 'No_Block')
13930 $no_block += $this_block;
13933 my @special_delta_properties; # List of properties that have to be
13934 # handled specially.
13935 my %restricted_delta; # Keys are the entries in
13936 # @special_delta_properties; values
13937 # are the range list of the code points
13938 # that behave normally when they get
13941 # In the next three properties, the Default Ignorable code points are
13946 push @special_delta_properties, property_ref('_Perl_GCB');
13947 $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13949 if (defined (my $cwnfkcc = property_ref('Changes_When_NFKC_Casefolded')))
13951 push @special_delta_properties, $cwnfkcc;
13952 $restricted_delta{$special_delta_properties[-1]} = ~ $DI;
13955 calculate_NChar(); # Non-character code points
13956 $NChar &= $unchanged;
13958 # This may have to be updated from time-to-time to get the most accurate
13960 my $default_BC_non_LtoR = Range_List->new(Initialize =>
13961 # These came from the comments in v8.0 DBidiClass.txt
13968 0x1EE00 .. 0x1EEFF,
13973 0x10800 .. 0x10FFF,
13974 0x1E800 .. 0x1EDFF,
13975 0x1EF00 .. 0x1EFFF,
13980 $default_BC_non_LtoR += $DI + $NChar;
13981 push @special_delta_properties, property_ref('BidiClass');
13982 $restricted_delta{$special_delta_properties[-1]} = ~ $default_BC_non_LtoR;
13984 if (defined (my $eaw = property_ref('East_Asian_Width'))) {
13986 my $default_EA_width_W = Range_List->new(Initialize =>
13987 # From comments in v8.0 EastAsianWidth.txt
13992 0x20000 .. 0x2A6DF,
13993 0x2A700 .. 0x2B73F,
13994 0x2B740 .. 0x2B81F,
13995 0x2B820 .. 0x2CEAF,
13996 0x2F800 .. 0x2FA1F,
13997 0x20000 .. 0x2FFFD,
13998 0x30000 .. 0x3FFFD,
14001 push @special_delta_properties, $eaw;
14002 $restricted_delta{$special_delta_properties[-1]}
14003 = ~ $default_EA_width_W;
14005 # Line break came along in the same release as East_Asian_Width, and
14006 # the non-grab-bag default set is a superset of the EAW one.
14007 if (defined (my $lb = property_ref('Line_Break'))) {
14008 my $default_LB_non_XX = Range_List->new(Initialize =>
14009 # From comments in v8.0 LineBreak.txt
14010 [ 0x20A0 .. 0x20CF ]);
14011 $default_LB_non_XX += $default_EA_width_W;
14012 push @special_delta_properties, $lb;
14013 $restricted_delta{$special_delta_properties[-1]}
14014 = ~ $default_LB_non_XX;
14018 # Go through every property, skipping those we've already worked on, those
14019 # that are immutable, and the perl ones that will be calculated after this
14020 # routine has done its fixup.
14021 foreach my $property (property_ref('*')) {
14022 next if $property == $perl # Done later in the program
14023 || $property == $block # Done just above
14024 || $property == $DI # Done just above
14025 || $property == $NChar # Done just above
14027 # The next two are invariant across Unicode versions
14028 || $property == property_ref('Pattern_Syntax')
14029 || $property == property_ref('Pattern_White_Space');
14031 # Find the grab-bag value.
14032 my $default_map = $property->default_map;
14034 if (! $property->to_create_match_tables) {
14036 # Here there aren't any match tables. So far, all such properties
14037 # have a default map, and don't require special handling. Just
14038 # change each newly assigned code point back to the default map,
14039 # as if they were unassigned.
14040 foreach my $range ($delta->ranges) {
14041 $property->add_map($range->start,
14044 Replace => $UNCONDITIONALLY);
14047 else { # Here there are match tables. Find the one (if any) for the
14048 # grab-bag value that unassigned code points go to.
14050 if (defined $default_map) {
14051 $default_table = $property->table($default_map);
14054 # If some code points don't go back to the grab-bag when they
14055 # are considered unassigned, exclude them from the list that does
14057 my $this_delta = $delta;
14058 my $this_unchanged = $unchanged;
14059 if (grep { $_ == $property } @special_delta_properties) {
14060 $this_delta = $delta & $restricted_delta{$property};
14061 $this_unchanged = ~ $this_delta;
14064 # Fix up each match table for this property.
14065 foreach my $table ($property->tables) {
14066 if (defined $default_table && $table == $default_table) {
14068 # The code points assigned after release X (the ones we
14069 # are excluding in this routine) go back on to the default
14070 # (grab-bag) table. However, some of these tables don't
14071 # actually exist, but are specified solely by the other
14072 # tables. (In a binary property, we don't need to
14073 # actually have an 'N' table, as it's just the complement
14074 # of the 'Y' table.) Such tables will be locked, so just
14076 $table += $this_delta unless $table->locked;
14080 # Here the table is not for the default value. We need to
14081 # subtract the code points we are ignoring for this
14082 # comparison (the deltas) from it. But if the table
14083 # started out with nothing, no need to exclude anything,
14084 # and want to skip it here anyway, so it gets listed
14085 # properly in the pod.
14086 next if $table->is_empty;
14088 # Save the deltas for later, before we do the subtraction
14089 my $deltas = $table & $this_delta;
14091 $table &= $this_unchanged;
14093 # Suppress the table if the subtraction left it with
14095 if ($table->is_empty) {
14096 if ($property->type == $BINARY) {
14097 push @tables_that_may_be_empty, $table->complete_name;
14100 $table->set_fate($SUPPRESSED, $after_first_version);
14104 # Now we add the removed code points to the property's
14105 # map, as they should now map to the grab-bag default
14106 # property (which they did in the first comparison
14107 # version). But we don't have to do this if the map is
14108 # only for internal use.
14109 if (defined $default_map && $property->to_output_map) {
14111 # The gc property has pseudo property values whose names
14112 # have length 1. These are the union of all the
14113 # property values whose name is longer than 1 and
14114 # whose first letter is all the same. The replacement
14115 # is done once for the longer-named tables.
14116 next if $property == $gc && length $table->name == 1;
14118 foreach my $range ($deltas->ranges) {
14119 $property->add_map($range->start,
14122 Replace => $UNCONDITIONALLY);
14130 # The above code doesn't work on 'gc=C', as it is a superset of the default
14131 # ('Cn') table. It's easiest to just special case it here.
14132 my $C = $gc->table('C');
14133 $C += $gc->table('Cn');
14138 sub compile_perl() {
14139 # Create perl-defined tables. Almost all are part of the pseudo-property
14140 # named 'perl' internally to this program. Many of these are recommended
14141 # in UTS#18 "Unicode Regular Expressions", and their derivations are based
14142 # on those found there.
14143 # Almost all of these are equivalent to some Unicode property.
14144 # A number of these properties have equivalents restricted to the ASCII
14145 # range, with their names prefaced by 'Posix', to signify that these match
14146 # what the Posix standard says they should match. A couple are
14147 # effectively this, but the name doesn't have 'Posix' in it because there
14148 # just isn't any Posix equivalent. 'XPosix' are the Posix tables extended
14149 # to the full Unicode range, by our guesses as to what is appropriate.
14151 # 'All' is all code points. As an error check, instead of just setting it
14152 # to be that, construct it to be the union of all the major categories
14153 $All = $perl->add_match_table('All',
14155 => "All code points, including those above Unicode. Same as qr/./s",
14158 foreach my $major_table ($gc->tables) {
14160 # Major categories are the ones with single letter names.
14161 next if length($major_table->name) != 1;
14163 $All += $major_table;
14166 if ($All->max != $MAX_WORKING_CODEPOINT) {
14167 Carp::my_carp_bug("Generated highest code point ("
14168 . sprintf("%X", $All->max)
14169 . ") doesn't match expected value $MAX_WORKING_CODEPOINT_STRING.")
14171 if ($All->range_count != 1 || $All->min != 0) {
14172 Carp::my_carp_bug("Generated table 'All' doesn't match all code points.")
14175 my $Any = $perl->add_match_table('Any',
14176 Description => "All Unicode code points");
14177 $Any->add_range(0, $MAX_UNICODE_CODEPOINT);
14178 $Any->add_alias('Unicode');
14180 calculate_Assigned();
14182 my $ASCII = $perl->add_match_table('ASCII');
14183 if (defined $block) { # This is equivalent to the block if have it.
14184 my $Unicode_ASCII = $block->table('Basic_Latin');
14185 if (defined $Unicode_ASCII && ! $Unicode_ASCII->is_empty) {
14186 $ASCII->set_equivalent_to($Unicode_ASCII, Related => 1);
14190 # Very early releases didn't have blocks, so initialize ASCII ourselves if
14192 if ($ASCII->is_empty) {
14193 if (! NON_ASCII_PLATFORM) {
14194 $ASCII->add_range(0, 127);
14197 for my $i (0 .. 127) {
14198 $ASCII->add_range(utf8::unicode_to_native($i),
14199 utf8::unicode_to_native($i));
14204 # Get the best available case definitions. Early Unicode versions didn't
14205 # have Uppercase and Lowercase defined, so use the general category
14206 # instead for them, modified by hard-coding in the code points each is
14208 my $Lower = $perl->add_match_table('XPosixLower');
14209 my $Unicode_Lower = property_ref('Lowercase');
14210 if (defined $Unicode_Lower && ! $Unicode_Lower->is_empty) {
14211 $Lower->set_equivalent_to($Unicode_Lower->table('Y'), Related => 1);
14215 $Lower += $gc->table('Lowercase_Letter');
14217 # There are quite a few code points in Lower, that aren't in gc=lc,
14218 # and not all are in all releases.
14219 my $temp = Range_List->new(Initialize => [
14220 utf8::unicode_to_native(0xAA),
14221 utf8::unicode_to_native(0xBA),
14239 $Lower += $temp & $Assigned;
14241 my $Posix_Lower = $perl->add_match_table("PosixLower",
14242 Initialize => $Lower & $ASCII,
14245 my $Upper = $perl->add_match_table("XPosixUpper");
14246 my $Unicode_Upper = property_ref('Uppercase');
14247 if (defined $Unicode_Upper && ! $Unicode_Upper->is_empty) {
14248 $Upper->set_equivalent_to($Unicode_Upper->table('Y'), Related => 1);
14252 # Unlike Lower, there are only two ranges in Upper that aren't in
14253 # gc=Lu, and all code points were assigned in all releases.
14254 $Upper += $gc->table('Uppercase_Letter');
14255 $Upper->add_range(0x2160, 0x216F); # Uppercase Roman numerals
14256 $Upper->add_range(0x24B6, 0x24CF); # Circled Latin upper case letters
14258 my $Posix_Upper = $perl->add_match_table("PosixUpper",
14259 Initialize => $Upper & $ASCII,
14262 # Earliest releases didn't have title case. Initialize it to empty if not
14263 # otherwise present
14264 my $Title = $perl->add_match_table('Title', Full_Name => 'Titlecase',
14265 Description => '(= \p{Gc=Lt})');
14266 my $lt = $gc->table('Lt');
14268 # Earlier versions of mktables had this related to $lt since they have
14269 # identical code points, but their caseless equivalents are not the same,
14270 # one being 'Cased' and the other being 'LC', and so now must be kept as
14271 # separate entities.
14276 push @tables_that_may_be_empty, $Title->complete_name;
14279 my $Unicode_Cased = property_ref('Cased');
14280 if (defined $Unicode_Cased) {
14281 my $yes = $Unicode_Cased->table('Y');
14282 my $no = $Unicode_Cased->table('N');
14283 $Title->set_caseless_equivalent($yes);
14284 if (defined $Unicode_Upper) {
14285 $Unicode_Upper->table('Y')->set_caseless_equivalent($yes);
14286 $Unicode_Upper->table('N')->set_caseless_equivalent($no);
14288 $Upper->set_caseless_equivalent($yes);
14289 if (defined $Unicode_Lower) {
14290 $Unicode_Lower->table('Y')->set_caseless_equivalent($yes);
14291 $Unicode_Lower->table('N')->set_caseless_equivalent($no);
14293 $Lower->set_caseless_equivalent($yes);
14296 # If this Unicode version doesn't have Cased, set up the Perl
14297 # extension from first principles. From Unicode 5.1: Definition D120:
14298 # A character C is defined to be cased if and only if C has the
14299 # Lowercase or Uppercase property or has a General_Category value of
14300 # Titlecase_Letter.
14301 my $cased = $perl->add_match_table('Cased',
14302 Initialize => $Lower + $Upper + $Title,
14303 Description => 'Uppercase or Lowercase or Titlecase',
14305 # $notcased is purely for the caseless equivalents below
14306 my $notcased = $perl->add_match_table('_Not_Cased',
14307 Initialize => ~ $cased,
14308 Fate => $INTERNAL_ONLY,
14309 Description => 'All not-cased code points');
14310 $Title->set_caseless_equivalent($cased);
14311 if (defined $Unicode_Upper) {
14312 $Unicode_Upper->table('Y')->set_caseless_equivalent($cased);
14313 $Unicode_Upper->table('N')->set_caseless_equivalent($notcased);
14315 $Upper->set_caseless_equivalent($cased);
14316 if (defined $Unicode_Lower) {
14317 $Unicode_Lower->table('Y')->set_caseless_equivalent($cased);
14318 $Unicode_Lower->table('N')->set_caseless_equivalent($notcased);
14320 $Lower->set_caseless_equivalent($cased);
14323 # The remaining perl defined tables are mostly based on Unicode TR 18,
14324 # "Annex C: Compatibility Properties". All of these have two versions,
14325 # one whose name generally begins with Posix that is posix-compliant, and
14326 # one that matches Unicode characters beyond the Posix, ASCII range
14328 my $Alpha = $perl->add_match_table('XPosixAlpha');
14330 # Alphabetic was not present in early releases
14331 my $Alphabetic = property_ref('Alphabetic');
14332 if (defined $Alphabetic && ! $Alphabetic->is_empty) {
14333 $Alpha->set_equivalent_to($Alphabetic->table('Y'), Related => 1);
14337 # The Alphabetic property doesn't exist for early releases, so
14338 # generate it. The actual definition, in 5.2 terms is:
14340 # gc=L + gc=Nl + Other_Alphabetic
14342 # Other_Alphabetic is also not defined in these early releases, but it
14343 # contains one gc=So range plus most of gc=Mn and gc=Mc, so we add
14344 # those last two as well, then subtract the relatively few of them that
14345 # shouldn't have been added. (The gc=So range is the circled capital
14346 # Latin characters. Early releases mistakenly didn't also include the
14347 # lower-case versions of these characters, and so we don't either, to
14348 # maintain consistency with those releases that first had this
14350 $Alpha->initialize($gc->table('Letter')
14355 $Alpha->add_range(0x24D0, 0x24E9); # gc=So
14356 foreach my $range ( [ 0x0300, 0x0344 ],
14357 [ 0x0346, 0x034E ],
14358 [ 0x0360, 0x0362 ],
14359 [ 0x0483, 0x0486 ],
14360 [ 0x0591, 0x05AF ],
14361 [ 0x06DF, 0x06E0 ],
14362 [ 0x06EA, 0x06EC ],
14363 [ 0x0740, 0x074A ],
14366 [ 0x0951, 0x0954 ],
14380 [ 0x0E47, 0x0E4C ],
14382 [ 0x0EC8, 0x0ECC ],
14383 [ 0x0F18, 0x0F19 ],
14387 [ 0x0F3E, 0x0F3F ],
14388 [ 0x0F82, 0x0F84 ],
14389 [ 0x0F86, 0x0F87 ],
14393 [ 0x17C9, 0x17D3 ],
14394 [ 0x20D0, 0x20DC ],
14396 [ 0x302A, 0x302F ],
14397 [ 0x3099, 0x309A ],
14398 [ 0xFE20, 0xFE23 ],
14399 [ 0x1D165, 0x1D169 ],
14400 [ 0x1D16D, 0x1D172 ],
14401 [ 0x1D17B, 0x1D182 ],
14402 [ 0x1D185, 0x1D18B ],
14403 [ 0x1D1AA, 0x1D1AD ],
14406 $Alpha->delete_range($range->[0], $range->[1]);
14409 $Alpha->delete_range($range, $range);
14412 $Alpha->add_description('Alphabetic');
14413 $Alpha->add_alias('Alphabetic');
14415 my $Posix_Alpha = $perl->add_match_table("PosixAlpha",
14416 Initialize => $Alpha & $ASCII,
14418 $Posix_Upper->set_caseless_equivalent($Posix_Alpha);
14419 $Posix_Lower->set_caseless_equivalent($Posix_Alpha);
14421 my $Alnum = $perl->add_match_table('Alnum', Full_Name => 'XPosixAlnum',
14422 Description => 'Alphabetic and (decimal) Numeric',
14423 Initialize => $Alpha + $gc->table('Decimal_Number'),
14425 $perl->add_match_table("PosixAlnum",
14426 Initialize => $Alnum & $ASCII,
14429 my $Word = $perl->add_match_table('Word', Full_Name => 'XPosixWord',
14430 Description => '\w, including beyond ASCII;'
14431 . ' = \p{Alnum} + \pM + \p{Pc}'
14432 . ' + \p{Join_Control}',
14433 Initialize => $Alnum + $gc->table('Mark'),
14435 my $Pc = $gc->table('Connector_Punctuation'); # 'Pc' Not in release 1
14440 $Word += ord('_'); # Make sure this is a $Word
14442 my $JC = property_ref('Join_Control'); # Wasn't in release 1
14444 $Word += $JC->table('Y');
14447 $Word += 0x200C + 0x200D;
14450 # This is a Perl extension, so the name doesn't begin with Posix.
14451 my $PerlWord = $perl->add_match_table('PosixWord',
14452 Description => '\w, restricted to ASCII',
14453 Initialize => $Word & $ASCII,
14455 $PerlWord->add_alias('PerlWord');
14457 my $Blank = $perl->add_match_table('Blank', Full_Name => 'XPosixBlank',
14458 Description => '\h, Horizontal white space',
14460 # 200B is Zero Width Space which is for line
14461 # break control, and was listed as
14462 # Space_Separator in early releases
14463 Initialize => $gc->table('Space_Separator')
14467 $Blank->add_alias('HorizSpace'); # Another name for it.
14468 $perl->add_match_table("PosixBlank",
14469 Initialize => $Blank & $ASCII,
14472 my $VertSpace = $perl->add_match_table('VertSpace',
14473 Description => '\v',
14475 $gc->table('Line_Separator')
14476 + $gc->table('Paragraph_Separator')
14477 + utf8::unicode_to_native(0x0A) # LINE FEED
14478 + utf8::unicode_to_native(0x0B) # VERTICAL TAB
14480 + utf8::unicode_to_native(0x0D) # CARRIAGE RETURN
14481 + utf8::unicode_to_native(0x85) # NEL
14483 # No Posix equivalent for vertical space
14485 my $Space = $perl->add_match_table('XPosixSpace',
14486 Description => '\s including beyond ASCII and vertical tab',
14487 Initialize => $Blank + $VertSpace,
14489 $Space->add_alias('XPerlSpace'); # Pre-existing synonyms
14490 $Space->add_alias('SpacePerl');
14491 $Space->add_alias('Space') if $v_version lt v4.1.0;
14493 my $Posix_space = $perl->add_match_table("PosixSpace",
14494 Initialize => $Space & $ASCII,
14496 $Posix_space->add_alias('PerlSpace'); # A pre-existing synonym
14498 my $Cntrl = $perl->add_match_table('Cntrl', Full_Name => 'XPosixCntrl',
14499 Description => 'Control characters');
14500 $Cntrl->set_equivalent_to($gc->table('Cc'), Related => 1);
14501 $perl->add_match_table("PosixCntrl",
14502 Description => "ASCII control characters",
14503 Definition => "ACK, BEL, BS, CAN, CR, DC1, DC2,"
14504 . " DC3, DC4, DEL, DLE, ENQ, EOM,"
14505 . " EOT, ESC, ETB, ETX, FF, FS, GS,"
14506 . " HT, LF, NAK, NUL, RS, SI, SO,"
14507 . " SOH, STX, SUB, SYN, US, VT",
14508 Initialize => $Cntrl & $ASCII,
14511 my $perl_surrogate = $perl->add_match_table('_Perl_Surrogate');
14512 my $Cs = $gc->table('Cs');
14513 if (defined $Cs && ! $Cs->is_empty) {
14514 $perl_surrogate += $Cs;
14517 push @tables_that_may_be_empty, '_Perl_Surrogate';
14520 # $controls is a temporary used to construct Graph.
14521 my $controls = Range_List->new(Initialize => $gc->table('Unassigned')
14522 + $gc->table('Control')
14523 + $perl_surrogate);
14525 # Graph is ~space & ~(Cc|Cs|Cn) = ~(space + $controls)
14526 my $Graph = $perl->add_match_table('Graph', Full_Name => 'XPosixGraph',
14527 Description => 'Characters that are graphical',
14528 Initialize => ~ ($Space + $controls),
14530 $perl->add_match_table("PosixGraph",
14531 Initialize => $Graph & $ASCII,
14534 $print = $perl->add_match_table('Print', Full_Name => 'XPosixPrint',
14535 Description => 'Characters that are graphical plus space characters (but no controls)',
14536 Initialize => $Blank + $Graph - $gc->table('Control'),
14538 $perl->add_match_table("PosixPrint",
14539 Initialize => $print & $ASCII,
14542 my $Punct = $perl->add_match_table('Punct');
14543 $Punct->set_equivalent_to($gc->table('Punctuation'), Related => 1);
14545 # \p{punct} doesn't include the symbols, which posix does
14546 my $XPosixPunct = $perl->add_match_table('XPosixPunct',
14547 Description => '\p{Punct} + ASCII-range \p{Symbol}',
14548 Initialize => $gc->table('Punctuation')
14549 + ($ASCII & $gc->table('Symbol')),
14550 Perl_Extension => 1
14552 $perl->add_match_table('PosixPunct', Perl_Extension => 1,
14553 Initialize => $ASCII & $XPosixPunct,
14556 my $Digit = $perl->add_match_table('Digit', Full_Name => 'XPosixDigit',
14557 Description => '[0-9] + all other decimal digits');
14558 $Digit->set_equivalent_to($gc->table('Decimal_Number'), Related => 1);
14559 my $PosixDigit = $perl->add_match_table("PosixDigit",
14560 Initialize => $Digit & $ASCII,
14563 # Hex_Digit was not present in first release
14564 my $Xdigit = $perl->add_match_table('XDigit', Full_Name => 'XPosixXDigit');
14565 my $Hex = property_ref('Hex_Digit');
14566 if (defined $Hex && ! $Hex->is_empty) {
14567 $Xdigit->set_equivalent_to($Hex->table('Y'), Related => 1);
14570 $Xdigit->initialize([ ord('0') .. ord('9'),
14571 ord('A') .. ord('F'),
14572 ord('a') .. ord('f'),
14573 0xFF10..0xFF19, 0xFF21..0xFF26, 0xFF41..0xFF46]);
14576 # AHex was not present in early releases
14577 my $PosixXDigit = $perl->add_match_table('PosixXDigit');
14578 my $AHex = property_ref('ASCII_Hex_Digit');
14579 if (defined $AHex && ! $AHex->is_empty) {
14580 $PosixXDigit->set_equivalent_to($AHex->table('Y'), Related => 1);
14583 $PosixXDigit->initialize($Xdigit & $ASCII);
14584 $PosixXDigit->add_alias('AHex');
14585 $PosixXDigit->add_alias('Ascii_Hex_Digit');
14588 my $any_folds = $perl->add_match_table("_Perl_Any_Folds",
14589 Description => "Code points that particpate in some fold",
14591 my $loc_problem_folds = $perl->add_match_table(
14592 "_Perl_Problematic_Locale_Folds",
14594 "Code points that are in some way problematic under locale",
14597 # This allows regexec.c to skip some work when appropriate. Some of the
14598 # entries in _Perl_Problematic_Locale_Folds are multi-character folds,
14599 my $loc_problem_folds_start = $perl->add_match_table(
14600 "_Perl_Problematic_Locale_Foldeds_Start",
14602 "The first character of every sequence in _Perl_Problematic_Locale_Folds",
14605 my $cf = property_ref('Case_Folding');
14607 # Every character 0-255 is problematic because what each folds to depends
14608 # on the current locale
14609 $loc_problem_folds->add_range(0, 255);
14610 $loc_problem_folds->add_range(0x130, 0x131); # These are problematic in
14612 $loc_problem_folds_start += $loc_problem_folds;
14614 # Also problematic are anything these fold to outside the range. Likely
14615 # forever the only thing folded to by these outside the 0-255 range is the
14616 # GREEK SMALL MU (from the MICRO SIGN), but it's easy to make the code
14617 # completely general, which should catch any unexpected changes or errors.
14618 # We look at each code point 0-255, and add its fold (including each part
14619 # of a multi-char fold) to the list. See commit message
14620 # 31f05a37c4e9c37a7263491f2fc0237d836e1a80 for a more complete description
14622 foreach my $range ($loc_problem_folds->ranges) {
14623 foreach my $code_point ($range->start .. $range->end) {
14624 my $fold_range = $cf->containing_range($code_point);
14625 next unless defined $fold_range;
14627 # Skip if folds to itself
14628 next if $fold_range->value eq $CODE_POINT;
14630 my @hex_folds = split " ", $fold_range->value;
14631 my $start_cp = $hex_folds[0];
14632 next if $start_cp eq $CODE_POINT;
14633 $start_cp = hex $start_cp;
14634 foreach my $i (0 .. @hex_folds - 1) {
14635 my $cp = $hex_folds[$i];
14636 next if $cp eq $CODE_POINT;
14638 next unless $cp > 255; # Already have the < 256 ones
14640 $loc_problem_folds->add_range($cp, $cp);
14641 $loc_problem_folds_start->add_range($start_cp, $start_cp);
14646 my $folds_to_multi_char = $perl->add_match_table(
14647 "_Perl_Folds_To_Multi_Char",
14649 "Code points whose fold is a string of more than one character",
14651 my $in_multi_fold = $perl->add_match_table(
14652 "_Perl_Is_In_Multi_Char_Fold",
14654 "Code points that are in some multiple character fold",
14656 if ($v_version lt v3.0.1) {
14657 push @tables_that_may_be_empty, '_Perl_Folds_To_Multi_Char',
14658 '_Perl_Is_In_Multi_Char_Fold',
14659 '_Perl_Non_Final_Folds';
14662 # Look through all the known folds to populate these tables.
14663 foreach my $range ($cf->ranges) {
14664 next if $range->value eq $CODE_POINT;
14665 my $start = $range->start;
14666 my $end = $range->end;
14667 $any_folds->add_range($start, $end);
14669 my @hex_folds = split " ", $range->value;
14670 if (@hex_folds > 1) { # Is multi-char fold
14671 $folds_to_multi_char->add_range($start, $end);
14674 my $found_locale_problematic = 0;
14676 my $folded_count = @hex_folds;
14677 if ($folded_count > 3) {
14678 die Carp::my_carp("Maximum number of characters in a fold should be 3: Instead, it's $folded_count for U+" . sprintf "%04X", $range->start);
14681 # Look at each of the folded-to characters...
14682 foreach my $i (1 .. $folded_count) {
14683 my $cp = hex $hex_folds[$i-1];
14684 $any_folds->add_range($cp, $cp);
14686 # The fold is problematic if any of the folded-to characters is
14687 # already considered problematic.
14688 if ($loc_problem_folds->contains($cp)) {
14689 $loc_problem_folds->add_range($start, $end);
14690 $found_locale_problematic = 1;
14693 if ($folded_count > 1) {
14694 $in_multi_fold->add_range($cp, $cp);
14698 # If this is a problematic fold, add to the start chars the
14699 # folding-from characters and first folded-to character.
14700 if ($found_locale_problematic) {
14701 $loc_problem_folds_start->add_range($start, $end);
14702 my $cp = hex $hex_folds[0];
14703 $loc_problem_folds_start->add_range($cp, $cp);
14707 my $dt = property_ref('Decomposition_Type');
14708 $dt->add_match_table('Non_Canon', Full_Name => 'Non_Canonical',
14709 Initialize => ~ ($dt->table('None') + $dt->table('Canonical')),
14710 Perl_Extension => 1,
14711 Note => 'Union of all non-canonical decompositions',
14714 # For backward compatibility, Perl has its own definition for IDStart.
14715 # It is regular XID_Start plus the underscore, but all characters must be
14716 # Word characters as well
14717 my $XID_Start = property_ref('XID_Start');
14718 my $perl_xids = $perl->add_match_table('_Perl_IDStart',
14719 Perl_Extension => 1,
14720 Fate => $INTERNAL_ONLY,
14721 Initialize => ord('_')
14723 if (defined $XID_Start
14724 || defined ($XID_Start = property_ref('ID_Start')))
14726 $perl_xids += $XID_Start->table('Y');
14729 # For Unicode versions that don't have the property, construct our own
14730 # from first principles. The actual definition is:
14732 # + letter numbers (Nl)
14734 # - Pattern_White_Space
14735 # + stability extensions
14736 # - NKFC modifications
14738 # What we do in the code below is to include the identical code points
14739 # that are in the first release that had Unicode's version of this
14740 # property, essentially extrapolating backwards. There were no
14741 # stability extensions until v4.1, so none are included; likewise in
14742 # no Unicode version so far do subtracting PatSyn and PatWS make any
14743 # difference, so those also are ignored.
14744 $perl_xids += $gc->table('Letter') + pre_3_dot_1_Nl();
14746 # We do subtract the NFKC modifications that are in the first version
14747 # that had this property. We don't bother to test if they are in the
14748 # version in question, because if they aren't, the operation is a
14749 # no-op. The NKFC modifications are discussed in
14750 # http://www.unicode.org/reports/tr31/#NFKC_Modifications
14751 foreach my $range ( 0x037A,
14754 [ 0xFC5E, 0xFC63 ],
14755 [ 0xFDFA, 0xFE70 ],
14756 [ 0xFE72, 0xFE76 ],
14761 [ 0xFF9E, 0xFF9F ],
14764 $perl_xids->delete_range($range->[0], $range->[1]);
14767 $perl_xids->delete_range($range, $range);
14772 $perl_xids &= $Word;
14774 my $perl_xidc = $perl->add_match_table('_Perl_IDCont',
14775 Perl_Extension => 1,
14776 Fate => $INTERNAL_ONLY);
14777 my $XIDC = property_ref('XID_Continue');
14779 || defined ($XIDC = property_ref('ID_Continue')))
14781 $perl_xidc += $XIDC->table('Y');
14784 # Similarly, we construct our own XIDC if necessary for early Unicode
14785 # versions. The definition is:
14786 # everything in XIDS
14792 # - Pattern_White_Space
14793 # + stability extensions
14794 # - NFKC modifications
14796 # The same thing applies to this as with XIDS for the PatSyn, PatWS,
14797 # and stability extensions. There is a somewhat different set of NFKC
14798 # mods to remove (and add in this case). The ones below make this
14799 # have identical code points as in the first release that defined it.
14800 $perl_xidc += $perl_xids
14805 + utf8::unicode_to_native(0xB7)
14807 if (defined (my $pc = $gc->table('Pc'))) {
14810 else { # 1.1.5 didn't have Pc, but these should have been in it
14811 $perl_xidc += 0xFF3F;
14812 $perl_xidc->add_range(0x203F, 0x2040);
14813 $perl_xidc->add_range(0xFE33, 0xFE34);
14814 $perl_xidc->add_range(0xFE4D, 0xFE4F);
14817 # Subtract the NFKC mods
14818 foreach my $range ( 0x037A,
14819 [ 0xFC5E, 0xFC63 ],
14820 [ 0xFDFA, 0xFE1F ],
14822 [ 0xFE72, 0xFE76 ],
14829 $perl_xidc->delete_range($range->[0], $range->[1]);
14832 $perl_xidc->delete_range($range, $range);
14837 $perl_xidc &= $Word;
14839 my $charname_begin = $perl->add_match_table('_Perl_Charname_Begin',
14840 Perl_Extension => 1,
14841 Fate => $INTERNAL_ONLY,
14842 Initialize => $gc->table('Letter') & $Alpha & $perl_xids,
14845 my $charname_continue = $perl->add_match_table('_Perl_Charname_Continue',
14846 Perl_Extension => 1,
14847 Fate => $INTERNAL_ONLY,
14848 Initialize => $perl_xidc
14855 my @composition = ('Name', 'Unicode_1_Name', '_Perl_Name_Alias');
14857 if (@named_sequences) {
14858 push @composition, 'Named_Sequence';
14859 foreach my $sequence (@named_sequences) {
14860 $perl_charname->add_anomalous_entry($sequence);
14864 my $alias_sentence = "";
14866 my $alias = property_ref('_Perl_Name_Alias');
14867 $perl_charname->set_proxy_for('_Perl_Name_Alias');
14869 # Add each entry in _Perl_Name_Alias to Perl_Charnames. Where these go
14870 # with respect to any existing entry depends on the entry type.
14871 # Corrections go before said entry, as they should be returned in
14872 # preference over the existing entry. (A correction to a correction
14873 # should be later in the _Perl_Name_Alias table, so it will correctly
14874 # precede the erroneous correction in Perl_Charnames.)
14876 # Abbreviations go after everything else, so they are saved temporarily in
14877 # a hash for later.
14879 # Everything else is added afterwards, which preserves the input
14882 foreach my $range ($alias->ranges) {
14883 next if $range->value eq "";
14884 my $code_point = $range->start;
14885 if ($code_point != $range->end) {
14886 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
14888 my ($value, $type) = split ': ', $range->value;
14890 if ($type eq 'correction') {
14891 $replace_type = $MULTIPLE_BEFORE;
14893 elsif ($type eq 'abbreviation') {
14896 $abbreviations{$value} = $code_point;
14900 $replace_type = $MULTIPLE_AFTER;
14903 # Actually add; before or after current entry(ies) as determined
14906 $perl_charname->add_duplicate($code_point, $value, Replace => $replace_type);
14908 $alias_sentence = <<END;
14909 The _Perl_Name_Alias property adds duplicate code point entries that are
14910 alternatives to the original name. If an addition is a corrected
14911 name, it will be physically first in the table. The original (less correct,
14912 but still valid) name will be next; then any alternatives, in no particular
14913 order; and finally any abbreviations, again in no particular order.
14916 # Now add the Unicode_1 names for the controls. The Unicode_1 names had
14917 # precedence before 6.1, including the awful ones like "LINE FEED (LF)",
14918 # so should be first in the file; the other names have precedence starting
14920 my $before_or_after = ($v_version lt v6.1.0)
14924 foreach my $range (property_ref('Unicode_1_Name')->ranges) {
14925 my $code_point = $range->start;
14926 my $unicode_1_value = $range->value;
14927 next if $unicode_1_value eq ""; # Skip if name doesn't exist.
14929 if ($code_point != $range->end) {
14930 Carp::my_carp_bug("Bad News. Expecting only one code point in the range $range. Just to keep going, using only the first code point;");
14933 # To handle EBCDIC, we don't hard code in the code points of the
14934 # controls; instead realizing that all of them are below 256.
14935 last if $code_point > 255;
14937 # We only add in the controls.
14938 next if $gc->value_of($code_point) ne 'Cc';
14940 # We reject this Unicode1 name for later Perls, as it is used for
14941 # another code point
14942 next if $unicode_1_value eq 'BELL' && $^V ge v5.17.0;
14944 # This won't add an exact duplicate.
14945 $perl_charname->add_duplicate($code_point, $unicode_1_value,
14946 Replace => $before_or_after);
14949 # Now that have everything added, add in abbreviations after
14950 # everything else. Sort so results don't change between runs of this
14952 foreach my $value (sort keys %abbreviations) {
14953 $perl_charname->add_duplicate($abbreviations{$value}, $value,
14954 Replace => $MULTIPLE_AFTER);
14958 if (@composition <= 2) { # Always at least 2
14959 $comment = join " and ", @composition;
14962 $comment = join ", ", @composition[0 .. scalar @composition - 2];
14963 $comment .= ", and $composition[-1]";
14966 $perl_charname->add_comment(join_lines( <<END
14967 This file is for charnames.pm. It is the union of the $comment properties.
14968 Unicode_1_Name entries are used only for nameless code points in the Name
14971 This file doesn't include the algorithmically determinable names. For those,
14972 use 'unicore/Name.pm'
14975 property_ref('Name')->add_comment(join_lines( <<END
14976 This file doesn't include the algorithmically determinable names. For those,
14977 use 'unicore/Name.pm'
14981 # Construct the Present_In property from the Age property.
14982 if (-e 'DAge.txt' && defined $age) {
14983 my $default_map = $age->default_map;
14984 my $in = Property->new('In',
14985 Default_Map => $default_map,
14986 Full_Name => "Present_In",
14987 Perl_Extension => 1,
14989 Initialize => $age,
14991 $in->add_comment(join_lines(<<END
14992 THIS FILE SHOULD NOT BE USED FOR ANY PURPOSE. The values in this file are the
14993 same as for $age, and not for what $in really means. This is because anything
14994 defined in a given release should have multiple values: that release and all
14995 higher ones. But only one value per code point can be represented in a table
15000 # The Age tables are named like 1.5, 2.0, 2.1, .... Sort so that the
15001 # lowest numbered (earliest) come first, with the non-numeric one
15003 my ($first_age, @rest_ages) = sort { ($a->name !~ /^[\d.]*$/)
15005 : ($b->name !~ /^[\d.]*$/)
15007 : $a->name <=> $b->name
15010 # The Present_In property is the cumulative age properties. The first
15011 # one hence is identical to the first age one.
15012 my $first_in = $in->add_match_table($first_age->name);
15013 $first_in->set_equivalent_to($first_age, Related => 1);
15015 my $description_start = "Code point's usage introduced in version ";
15016 $first_age->add_description($description_start . $first_age->name);
15017 foreach my $alias ($first_age->aliases) { # Include its aliases
15018 $first_in->add_alias($alias->name);
15021 # To construct the accumulated values, for each of the age tables
15022 # starting with the 2nd earliest, merge the earliest with it, to get
15023 # all those code points existing in the 2nd earliest. Repeat merging
15024 # the new 2nd earliest with the 3rd earliest to get all those existing
15025 # in the 3rd earliest, and so on.
15026 my $previous_in = $first_in;
15027 foreach my $current_age (@rest_ages) {
15028 next if $current_age->name !~ /^[\d.]*$/; # Skip the non-numeric
15030 my $current_in = $in->add_match_table(
15031 $current_age->name,
15032 Initialize => $current_age + $previous_in,
15033 Description => $description_start
15034 . $current_age->name
15037 foreach my $alias ($current_age->aliases) {
15038 $current_in->add_alias($alias->name);
15040 $previous_in = $current_in;
15042 # Add clarifying material for the corresponding age file. This is
15043 # in part because of the confusing and contradictory information
15044 # given in the Standard's documentation itself, as of 5.2.
15045 $current_age->add_description(
15046 "Code point's usage was introduced in version "
15047 . $current_age->name);
15048 $current_age->add_note("See also $in");
15052 # And finally the code points whose usages have yet to be decided are
15053 # the same in both properties. Note that permanently unassigned code
15054 # points actually have their usage assigned (as being permanently
15055 # unassigned), so that these tables are not the same as gc=cn.
15056 my $unassigned = $in->add_match_table($default_map);
15057 my $age_default = $age->table($default_map);
15058 $age_default->add_description(<<END
15059 Code point's usage has not been assigned in any Unicode release thus far.
15062 $unassigned->set_equivalent_to($age_default, Related => 1);
15063 foreach my $alias ($age_default->aliases) {
15064 $unassigned->add_alias($alias->name);
15068 my $patws = $perl->add_match_table('_Perl_PatWS',
15069 Perl_Extension => 1,
15070 Fate => $INTERNAL_ONLY);
15071 if (defined (my $off_patws = property_ref('Pattern_White_Space'))) {
15072 $patws->initialize($off_patws->table('Y'));
15075 $patws->initialize([ ord("\t"),
15077 utf8::unicode_to_native(0x0B), # VT
15081 utf8::unicode_to_native(0x85), # NEL
15082 0x200E..0x200F, # Left, Right marks
15083 0x2028..0x2029 # Line, Paragraph seps
15087 # See L<perlfunc/quotemeta>
15088 my $quotemeta = $perl->add_match_table('_Perl_Quotemeta',
15089 Perl_Extension => 1,
15090 Fate => $INTERNAL_ONLY,
15092 # Initialize to what's common in
15093 # all Unicode releases.
15095 $gc->table('Control')
15098 + ((~ $Word) & $ASCII)
15101 if (defined (my $patsyn = property_ref('Pattern_Syntax'))) {
15102 $quotemeta += $patsyn->table('Y');
15105 $quotemeta += ((~ $Word) & Range->new(0, 255))
15106 - utf8::unicode_to_native(0xA8)
15107 - utf8::unicode_to_native(0xAF)
15108 - utf8::unicode_to_native(0xB2)
15109 - utf8::unicode_to_native(0xB3)
15110 - utf8::unicode_to_native(0xB4)
15111 - utf8::unicode_to_native(0xB7)
15112 - utf8::unicode_to_native(0xB8)
15113 - utf8::unicode_to_native(0xB9)
15114 - utf8::unicode_to_native(0xBC)
15115 - utf8::unicode_to_native(0xBD)
15116 - utf8::unicode_to_native(0xBE);
15117 $quotemeta += [ # These are above-Latin1 patsyn; hence should be the
15118 # same in all releases
15135 if (defined (my $di = property_ref('Default_Ignorable_Code_Point'))) {
15136 $quotemeta += $di->table('Y')
15139 if ($v_version ge v2.0) {
15140 $quotemeta += $gc->table('Cf')
15141 + $gc->table('Cs');
15143 # These are above the Unicode version 1 max
15144 $quotemeta->add_range(0xE0000, 0xE0FFF);
15146 $quotemeta += $gc->table('Cc')
15148 my $temp = Range_List->new(Initialize => [ 0x180B .. 0x180D,
15153 $temp->add_range(0xE0000, 0xE0FFF) if $v_version ge v2.0;
15154 $quotemeta += $temp;
15161 # Finished creating all the perl properties. All non-internal non-string
15162 # ones have a synonym of 'Is_' prefixed. (Internal properties begin with
15163 # an underscore.) These do not get a separate entry in the pod file
15164 foreach my $table ($perl->tables) {
15165 foreach my $alias ($table->aliases) {
15166 next if $alias->name =~ /^_/;
15167 $table->add_alias('Is_' . $alias->name,
15170 Status => $alias->status,
15171 OK_as_Filename => 0);
15175 # Perl tailors the WordBreak property so that \b{wb} doesn't split
15176 # adjacent spaces into separate words. Unicode 11.0 moved in that
15177 # direction, but left TAB, FIGURE SPACE (U+2007), and (ironically) NO
15178 # BREAK SPACE as breaking, so we retained the original Perl customization.
15179 # To do this, in the Perl copy of WB, simply replace the mappings of
15180 # horizontal space characters that otherwise would map to the default or
15181 # the 11.0 'WSegSpace' to instead map to our tailoring.
15182 my $perl_wb = property_ref('_Perl_WB');
15183 my $default = $perl_wb->default_map;
15184 for my $range ($Blank->ranges) {
15185 for my $i ($range->start .. $range->end) {
15186 my $value = $perl_wb->value_of($i);
15188 next unless $value eq $default || $value eq 'WSegSpace';
15189 $perl_wb->add_map($i, $i, 'Perl_Tailored_HSpace',
15190 Replace => $UNCONDITIONALLY);
15194 # Also starting in Unicode 11.0, rules for some of the boundary types are
15195 # based on a non-UCD property (which we have read in if it exists).
15196 # Recall that these boundary properties partition the code points into
15197 # equivalence classes (represented as enums).
15199 # The loop below goes through each code point that matches the non-UCD
15200 # property, and for each current equivalence class containing such a code
15201 # point, splits it so that those that are in both are now in a newly
15202 # created equivalence class whose name is a combination of the property
15203 # and the old class name, leaving unchanged everything that doesn't match
15204 # the non-UCD property.
15205 my $ep = property_ref('ExtPict');
15206 $ep = $ep->table('Y') if defined $ep;
15208 foreach my $base_property (property_ref('GCB'),
15209 property_ref('WB'))
15211 my $property = property_ref('_Perl_' . $base_property->name);
15212 foreach my $range ($ep->ranges) {
15213 foreach my $i ($range->start .. $range->end) {
15214 my $current = $property->value_of($i);
15215 $current = $property->table($current)->short_name;
15216 $property->add_map($i, $i, 'ExtPict_' . $current,
15217 Replace => $UNCONDITIONALLY);
15223 # Create a version of the LineBreak property with the mappings that are
15224 # omitted in the default algorithm remapped to what
15225 # http://www.unicode.org/reports/tr14 says they should be.
15227 # First, create a plain copy, but with all property values written out in
15228 # their long form, as regen/mk_invlist.pl expects that, and also fix
15229 # occurrences of the typo in early Unicode versions: 'inseperable'.
15230 my $perl_lb = property_ref('_Perl_LB');
15231 if (! defined $perl_lb) {
15232 $perl_lb = Property->new('_Perl_LB',
15233 Fate => $INTERNAL_ONLY,
15234 Perl_Extension => 1,
15235 Directory => $map_directory,
15237 my $lb = property_ref('Line_Break');
15239 # Populate from $lb, but use full name and fix typo.
15240 foreach my $range ($lb->ranges) {
15241 my $full_name = $lb->table($range->value)->full_name;
15242 $full_name = 'Inseparable'
15243 if standardize($full_name) eq 'inseperable';
15244 $perl_lb->add_map($range->start, $range->end, $full_name);
15248 # What tr14 says is this:
15250 # Original Resolved General_Category
15251 # AI, SG, XX AL Any
15252 # SA CM Only Mn or Mc
15253 # SA AL Any except Mn and Mc
15256 $perl_lb->set_default_map('Alphabetic', 'full_name'); # XX -> AL
15258 my $ea = property_ref('East_Asian_Width');
15260 $Cn_EP = $ep & $gc->table('Unassigned') if defined $ep;
15262 for my $range ($perl_lb->ranges) {
15263 my $value = standardize($range->value);
15264 if ( $value eq standardize('Unknown')
15265 || $value eq standardize('Ambiguous')
15266 || $value eq standardize('Surrogate'))
15268 $perl_lb->add_map($range->start, $range->end, 'Alphabetic',
15269 Replace => $UNCONDITIONALLY);
15271 elsif ($value eq standardize('Conditional_Japanese_Starter')) {
15272 $perl_lb->add_map($range->start, $range->end, 'Nonstarter',
15273 Replace => $UNCONDITIONALLY);
15275 elsif ($value eq standardize('Complex_Context')) {
15276 for my $i ($range->start .. $range->end) {
15277 my $gc_val = $gc->value_of($i);
15278 if ($gc_val eq 'Mn' || $gc_val eq 'Mc') {
15279 $perl_lb->add_map($i, $i, 'Combining_Mark',
15280 Replace => $UNCONDITIONALLY);
15283 $perl_lb->add_map($i, $i, 'Alphabetic',
15284 Replace => $UNCONDITIONALLY);
15288 elsif (defined $ep && $value eq standardize('Ideographic')) {
15290 # Unicode 14 adds a rule to not break lines before any potential
15291 # EBase, They say that any unassigned code point that is ExtPict,
15292 # is potentially an EBase. In 14.0, all such ones are in the
15293 # ExtPict=ID category. We must split that category for the
15294 # pairwise rule table to work.
15295 for my $i ($range->start .. $range->end) {
15296 if ($Cn_EP->contains($i)) {
15297 $perl_lb->add_map($i, $i,
15298 'Unassigned_Extended_Pictographic_Ideographic',
15299 Replace => $UNCONDITIONALLY);
15303 elsif ( defined $ea
15304 && ( $value eq standardize('Close_Parenthesis')
15305 || $value eq standardize('Open_Punctuation')))
15307 # Unicode 13 splits the OP and CP properties each into East Asian,
15308 # and non-. We retain the (now somewhat misleading) names OP and
15309 # CP for the non-East Asian variety, as there are very few East
15311 my $replace = ($value eq standardize('Open_Punctuation'))
15314 for my $i ($range->start .. $range->end) {
15315 my $ea_val = $ea->value_of($i);
15316 if ($ea_val eq 'F' || $ea_val eq 'W' || $ea_val eq 'H') {
15317 $perl_lb->add_map($i, $i, $replace,
15318 Replace => $UNCONDITIONALLY);
15324 # This property is a modification of the scx property
15325 my $perl_scx = Property->new('_Perl_SCX',
15326 Fate => $INTERNAL_ONLY,
15327 Perl_Extension => 1,
15328 Directory => $map_directory,
15332 # Use scx if available; otherwise sc; if neither is there (a very old
15333 # Unicode version, just say that everything is 'Common'
15334 if (defined $scx) {
15336 $perl_scx->set_default_map('Unknown');
15338 elsif (defined $script) {
15341 # Early versions of 'sc', had everything be 'Common'
15342 if (defined $script->table('Unknown')) {
15343 $perl_scx->set_default_map('Unknown');
15346 $perl_scx->set_default_map('Common');
15349 $perl_scx->add_match_table('Common');
15350 $perl_scx->add_map(0, $MAX_UNICODE_CODEPOINT, 'Common');
15352 $perl_scx->add_match_table('Unknown');
15353 $perl_scx->set_default_map('Unknown');
15356 $perl_scx->_set_format($STRING_WHITE_SPACE_LIST);
15357 $perl_scx->set_pre_declared_maps(0); # PropValueAliases doesn't list these
15359 if (defined $source) {
15360 $perl_scx->initialize($source);
15362 # UTS 39 says that the scx property should be modified for these
15363 # countries where certain mixed scripts are commonly used.
15364 for my $range ($perl_scx->ranges) {
15365 my $value = $range->value;
15366 my $changed = $value =~ s/ ( \b Han i? \b ) /$1 Hanb Jpan Kore/xi;
15367 $changed |= $value =~ s/ ( \b Hira (gana)? \b ) /$1 Jpan/xi;
15368 $changed |= $value =~ s/ ( \b Kata (kana)? \b ) /$1 Jpan/xi;
15369 $changed |= $value =~ s{ ( \b Katakana_or_Hiragana \b ) }
15370 {$1 Katakana Hiragana Jpan}xi;
15371 $changed |= $value =~ s/ ( \b Hang (ul)? \b ) /$1 Kore/xi;
15372 $changed |= $value =~ s/ ( \b Bopo (mofo)? \b ) /$1 Hanb/xi;
15375 $value = join " ", uniques split " ", $value;
15376 $range->set_value($value)
15380 foreach my $table ($source->tables) {
15381 my $scx_table = $perl_scx->add_match_table($table->name,
15382 Full_Name => $table->full_name);
15383 foreach my $alias ($table->aliases) {
15384 $scx_table->add_alias($alias->name);
15389 # Here done with all the basic stuff. Ready to populate the information
15390 # about each character if annotating them.
15393 # See comments at its declaration
15394 $annotate_ranges = Range_Map->new;
15396 # This separates out the non-characters from the other unassigneds, so
15397 # can give different annotations for each.
15398 $unassigned_sans_noncharacters = Range_List->new(
15399 Initialize => $gc->table('Unassigned'));
15400 $unassigned_sans_noncharacters &= (~ $NChar);
15402 for (my $i = 0; $i <= $MAX_UNICODE_CODEPOINT + 1; $i++ ) {
15403 $i = populate_char_info($i); # Note sets $i so may cause skips
15411 sub add_perl_synonyms() {
15412 # A number of Unicode tables have Perl synonyms that are expressed in
15413 # the single-form, \p{name}. These are:
15414 # All the binary property Y tables, so that \p{Name=Y} gets \p{Name} and
15415 # \p{Is_Name} as synonyms
15416 # \p{Script_Extensions=Value} gets \p{Value}, \p{Is_Value} as synonyms
15417 # \p{General_Category=Value} gets \p{Value}, \p{Is_Value} as synonyms
15418 # \p{Block=Value} gets \p{In_Value} as a synonym, and, if there is no
15419 # conflict, \p{Value} and \p{Is_Value} as well
15421 # This routine generates these synonyms, warning of any unexpected
15424 # Construct the list of tables to get synonyms for. Start with all the
15425 # binary and the General_Category ones.
15426 my @tables = grep { $_->type == $BINARY || $_->type == $FORCED_BINARY }
15428 push @tables, $gc->tables;
15430 # If the version of Unicode includes the Script Extensions (preferably),
15431 # or Script property, add its tables
15432 if (defined $scx) {
15433 push @tables, $scx->tables;
15436 push @tables, $script->tables if defined $script;
15439 # The Block tables are kept separate because they are treated differently.
15440 # And the earliest versions of Unicode didn't include them, so add only if
15443 push @blocks, $block->tables if defined $block;
15445 # Here, have the lists of tables constructed. Process blocks last so that
15446 # if there are name collisions with them, blocks have lowest priority.
15447 # Should there ever be other collisions, manual intervention would be
15448 # required. See the comments at the beginning of the program for a
15449 # possible way to handle those semi-automatically.
15450 foreach my $table (@tables, @blocks) {
15452 # For non-binary properties, the synonym is just the name of the
15453 # table, like Greek, but for binary properties the synonym is the name
15454 # of the property, and means the code points in its 'Y' table.
15455 my $nominal = $table;
15456 my $nominal_property = $nominal->property;
15458 if (! $nominal->isa('Property')) {
15463 # Here is a binary property. Use the 'Y' table. Verify that is
15465 my $yes = $nominal->table('Y');
15466 unless (defined $yes) { # Must be defined, but is permissible to
15468 Carp::my_carp_bug("Undefined $nominal, 'Y'. Skipping.");
15474 foreach my $alias ($nominal->aliases) {
15476 # Attempt to create a table in the perl directory for the
15477 # candidate table, using whatever aliases in it that don't
15478 # conflict. Also add non-conflicting aliases for all these
15479 # prefixed by 'Is_' (and/or 'In_' for Block property tables)
15481 foreach my $prefix ("", 'Is_', 'In_') {
15483 # Only Block properties can have added 'In_' aliases.
15484 next if $prefix eq 'In_' and $nominal_property != $block;
15486 my $proposed_name = $prefix . $alias->name;
15488 # No Is_Is, In_In, nor combinations thereof
15489 trace "$proposed_name is a no-no" if main::DEBUG && $to_trace && $proposed_name =~ /^ I [ns] _I [ns] _/x;
15490 next if $proposed_name =~ /^ I [ns] _I [ns] _/x;
15492 trace "Seeing if can add alias or table: 'perl=$proposed_name' based on $nominal" if main::DEBUG && $to_trace;
15494 # Get a reference to any existing table in the perl
15495 # directory with the desired name.
15496 my $pre_existing = $perl->table($proposed_name);
15498 if (! defined $pre_existing) {
15500 # No name collision, so OK to add the perl synonym.
15502 my $make_re_pod_entry;
15503 my $ok_as_filename;
15504 my $status = $alias->status;
15505 if ($nominal_property == $block) {
15507 # For block properties, only the compound form is
15508 # preferred for external use; the others are
15509 # discouraged. The pod file contains wild cards for
15510 # the 'In' and 'Is' forms so no entries for those; and
15511 # we don't want people using the name without any
15512 # prefix, so discourage that.
15513 if ($prefix eq "") {
15514 $make_re_pod_entry = 1;
15515 $status = $status || $DISCOURAGED;
15516 $ok_as_filename = 0;
15518 elsif ($prefix eq 'In_') {
15519 $make_re_pod_entry = 0;
15520 $status = $status || $DISCOURAGED;
15521 $ok_as_filename = 1;
15524 $make_re_pod_entry = 0;
15525 $status = $status || $DISCOURAGED;
15526 $ok_as_filename = 0;
15529 elsif ($prefix ne "") {
15531 # The 'Is' prefix is handled in the pod by a wild
15532 # card, and we won't use it for an external name
15533 $make_re_pod_entry = 0;
15534 $status = $status || $NORMAL;
15535 $ok_as_filename = 0;
15539 # Here, is an empty prefix, non block. This gets its
15540 # own pod entry and can be used for an external name.
15541 $make_re_pod_entry = 1;
15542 $status = $status || $NORMAL;
15543 $ok_as_filename = 1;
15546 # Here, there isn't a perl pre-existing table with the
15547 # name. Look through the list of equivalents of this
15548 # table to see if one is a perl table.
15549 foreach my $equivalent ($actual->leader->equivalents) {
15550 next if $equivalent->property != $perl;
15552 # Here, have found a table for $perl. Add this alias
15553 # to it, and are done with this prefix.
15554 $equivalent->add_alias($proposed_name,
15555 Re_Pod_Entry => $make_re_pod_entry,
15557 # Currently don't output these in the
15558 # ucd pod, as are strongly discouraged
15563 OK_as_Filename => $ok_as_filename);
15564 trace "adding alias perl=$proposed_name to $equivalent" if main::DEBUG && $to_trace;
15568 # Here, $perl doesn't already have a table that is a
15569 # synonym for this property, add one.
15570 my $added_table = $perl->add_match_table($proposed_name,
15571 Re_Pod_Entry => $make_re_pod_entry,
15573 # See UCD comment just above
15577 OK_as_Filename => $ok_as_filename);
15578 # And it will be related to the actual table, since it is
15580 $added_table->set_equivalent_to($actual, Related => 1);
15581 trace "added ", $perl->table($proposed_name) if main::DEBUG && $to_trace;
15583 } # End of no pre-existing.
15585 # Here, there is a pre-existing table that has the proposed
15586 # name. We could be in trouble, but not if this is just a
15587 # synonym for another table that we have already made a child
15588 # of the pre-existing one.
15589 if ($pre_existing->is_set_equivalent_to($actual)) {
15590 trace "$pre_existing is already equivalent to $actual; adding alias perl=$proposed_name to it" if main::DEBUG && $to_trace;
15591 $pre_existing->add_alias($proposed_name);
15595 # Here, there is a name collision, but it still could be OK if
15596 # the tables match the identical set of code points, in which
15597 # case, we can combine the names. Compare each table's code
15598 # point list to see if they are identical.
15599 trace "Potential name conflict with $pre_existing having ", $pre_existing->count, " code points" if main::DEBUG && $to_trace;
15600 if ($pre_existing->matches_identically_to($actual)) {
15602 # Here, they do match identically. Not a real conflict.
15603 # Make the perl version a child of the Unicode one, except
15604 # in the non-obvious case of where the perl name is
15605 # already a synonym of another Unicode property. (This is
15606 # excluded by the test for it being its own parent.) The
15607 # reason for this exclusion is that then the two Unicode
15608 # properties become related; and we don't really know if
15609 # they are or not. We generate documentation based on
15610 # relatedness, and this would be misleading. Code
15611 # later executed in the process will cause the tables to
15612 # be represented by a single file anyway, without making
15613 # it look in the pod like they are necessarily related.
15614 if ($pre_existing->parent == $pre_existing
15615 && ($pre_existing->property == $perl
15616 || $actual->property == $perl))
15618 trace "Setting $pre_existing equivalent to $actual since one is \$perl, and match identical sets" if main::DEBUG && $to_trace;
15619 $pre_existing->set_equivalent_to($actual, Related => 1);
15621 elsif (main::DEBUG && $to_trace) {
15622 trace "$pre_existing is equivalent to $actual since match identical sets, but not setting them equivalent, to preserve the separateness of the perl aliases";
15623 trace $pre_existing->parent;
15628 # Here they didn't match identically, there is a real conflict
15629 # between our new name and a pre-existing property.
15630 $actual->add_conflicting($proposed_name, 'p', $pre_existing);
15631 $pre_existing->add_conflicting($nominal->full_name,
15635 # Don't output a warning for aliases for the block
15636 # properties (unless they start with 'In_') as it is
15637 # expected that there will be conflicts and the block
15639 if ($verbosity >= $NORMAL_VERBOSITY
15640 && ($actual->property != $block || $prefix eq 'In_'))
15642 print simple_fold(join_lines(<<END
15643 There is already an alias named $proposed_name (from $pre_existing),
15644 so not creating this alias for $actual
15649 # Keep track for documentation purposes.
15650 $has_In_conflicts++ if $prefix eq 'In_';
15651 $has_Is_conflicts++ if $prefix eq 'Is_';
15656 # There are some properties which have No and Yes (and N and Y) as
15657 # property values, but aren't binary, and could possibly be confused with
15658 # binary ones. So create caveats for them. There are tables that are
15659 # named 'No', and tables that are named 'N', but confusion is not likely
15660 # unless they are the same table. For example, N meaning Number or
15661 # Neutral is not likely to cause confusion, so don't add caveats to things
15663 foreach my $property (grep { $_->type != $BINARY
15664 && $_->type != $FORCED_BINARY }
15667 my $yes = $property->table('Yes');
15668 if (defined $yes) {
15669 my $y = $property->table('Y');
15670 if (defined $y && $yes == $y) {
15671 foreach my $alias ($property->aliases) {
15672 $yes->add_conflicting($alias->name);
15676 my $no = $property->table('No');
15678 my $n = $property->table('N');
15679 if (defined $n && $no == $n) {
15680 foreach my $alias ($property->aliases) {
15681 $no->add_conflicting($alias->name, 'P');
15690 sub register_file_for_name($table, $directory_ref, $file) {
15691 # Given info about a table and a datafile that it should be associated
15692 # with, register that association
15694 # $directory_ref # Array of the directory path for the file
15695 # $file # The file name in the final directory.
15697 trace "table=$table, file=$file, directory=@$directory_ref, fate=", $table->fate if main::DEBUG && $to_trace;
15699 if ($table->isa('Property')) {
15700 $table->set_file_path(@$directory_ref, $file);
15701 push @map_properties, $table;
15703 # No swash means don't do the rest of this.
15704 return if $table->fate != $ORDINARY
15705 && ! ($table->name =~ /^_/ && $table->fate == $INTERNAL_ONLY);
15707 # Get the path to the file
15708 my @path = $table->file_path;
15710 # Use just the file name if no subdirectory.
15711 shift @path if $path[0] eq File::Spec->curdir();
15713 my $file = join '/', @path;
15715 # Create a hash entry for Unicode::UCD to get the file that stores this
15716 # property's map table
15717 foreach my $alias ($table->aliases) {
15718 my $name = $alias->name;
15719 if ($name =~ /^_/) {
15720 $strict_property_to_file_of{lc $name} = $file;
15723 $loose_property_to_file_of{standardize($name)} = $file;
15727 # And a way for Unicode::UCD to find the proper key in the SwashInfo
15728 # hash for this property.
15729 $file_to_swash_name{$file} = "To" . $table->swash_name;
15733 # Do all of the work for all equivalent tables when called with the leader
15734 # table, so skip if isn't the leader.
15735 return if $table->leader != $table;
15737 # If this is a complement of another file, use that other file instead,
15738 # with a ! prepended to it.
15740 if (($complement = $table->complement) != 0) {
15741 my @directories = $complement->file_path;
15743 # This assumes that the 0th element is something like 'lib',
15744 # the 1th element the property name (in its own directory), like
15745 # 'AHex', and the 2th element the file like 'Y' which will have a .pl
15746 # appended to it later.
15747 $directories[1] =~ s/^/!/;
15748 $file = pop @directories;
15749 $directory_ref =\@directories;
15752 # Join all the file path components together, using slashes.
15753 my $full_filename = join('/', @$directory_ref, $file);
15755 # All go in the same subdirectory of unicore, or the special
15756 # pseudo-directory '#'
15757 if ($directory_ref->[0] !~ / ^ $matches_directory | \# $ /x) {
15758 Carp::my_carp("Unexpected directory in "
15759 . join('/', @{$directory_ref}, $file));
15762 # For this table and all its equivalents ...
15763 foreach my $table ($table, $table->equivalents) {
15765 # Associate it with its file internally. Don't include the
15766 # $matches_directory first component
15767 $table->set_file_path(@$directory_ref, $file);
15769 # No swash means don't do the rest of this.
15770 next if $table->isa('Map_Table') && $table->fate != $ORDINARY;
15772 my $sub_filename = join('/', $directory_ref->[1, -1], $file);
15774 my $property = $table->property;
15775 my $property_name = ($property == $perl)
15776 ? "" # 'perl' is never explicitly stated
15777 : standardize($property->name) . '=';
15779 my $is_default = 0; # Is this table the default one for the property?
15781 # To calculate $is_default, we find if this table is the same as the
15782 # default one for the property. But this is complicated by the
15783 # possibility that there is a master table for this one, and the
15784 # information is stored there instead of here.
15785 my $parent = $table->parent;
15786 my $leader_prop = $parent->property;
15787 my $default_map = $leader_prop->default_map;
15788 if (defined $default_map) {
15789 my $default_table = $leader_prop->table($default_map);
15790 $is_default = 1 if defined $default_table && $parent == $default_table;
15793 # Calculate the loose name for this table. Mostly it's just its name,
15794 # standardized. But in the case of Perl tables that are single-form
15795 # equivalents to Unicode properties, it is the latter's name.
15796 my $loose_table_name =
15797 ($property != $perl || $leader_prop == $perl)
15798 ? standardize($table->name)
15799 : standardize($parent->name);
15801 my $deprecated = ($table->status eq $DEPRECATED)
15802 ? $table->status_info
15804 my $caseless_equivalent = $table->caseless_equivalent;
15806 # And for each of the table's aliases... This inner loop eventually
15807 # goes through all aliases in the UCD that we generate regex match
15809 foreach my $alias ($table->aliases) {
15810 my $standard = UCD_name($table, $alias);
15812 # Generate an entry in either the loose or strict hashes, which
15813 # will translate the property and alias names combination into the
15814 # file where the table for them is stored.
15815 if ($alias->loose_match) {
15816 if (exists $loose_to_file_of{$standard}) {
15817 Carp::my_carp("Can't change file registered to $loose_to_file_of{$standard} to '$sub_filename'.");
15820 $loose_to_file_of{$standard} = $sub_filename;
15824 if (exists $stricter_to_file_of{$standard}) {
15825 Carp::my_carp("Can't change file registered to $stricter_to_file_of{$standard} to '$sub_filename'.");
15828 $stricter_to_file_of{$standard} = $sub_filename;
15830 # Tightly coupled with how Unicode::UCD works, for a
15831 # floating point number that is a whole number, get rid of
15832 # the trailing decimal point and 0's, so that Unicode::UCD
15833 # will work. Also note that this assumes that such a
15834 # number is matched strictly; so if that were to change,
15835 # this would be wrong.
15836 if ((my $integer_name = $alias->name)
15837 =~ s/^ ( -? \d+ ) \.0+ $ /$1/x)
15839 $stricter_to_file_of{$property_name . $integer_name}
15845 # For Unicode::UCD, create a mapping of the prop=value to the
15846 # canonical =value for that property.
15847 if ($standard =~ /=/) {
15849 # This could happen if a strict name mapped into an existing
15850 # loose name. In that event, the strict names would have to
15851 # be moved to a new hash.
15852 if (exists($loose_to_standard_value{$standard})) {
15853 Carp::my_carp_bug("'$standard' conflicts with a pre-existing use. Bad News. Continuing anyway");
15855 $loose_to_standard_value{$standard} = $loose_table_name;
15858 # Keep a list of the deprecated properties and their filenames
15859 if ($deprecated && $complement == 0) {
15860 $Unicode::UCD::why_deprecated{$sub_filename} = $deprecated;
15863 # And a substitute table, if any, for case-insensitive matching
15864 if ($caseless_equivalent != 0) {
15865 $caseless_equivalent_to{$standard} = $caseless_equivalent;
15868 # Add to defaults list if the table this alias belongs to is the
15870 $loose_defaults{$standard} = 1 if $is_default;
15878 my %base_names; # Names already used for avoiding DOS 8.3 filesystem
15880 my %full_dir_name_of; # Full length names of directories used.
15882 sub construct_filename($name, $mutable, $directories_ref) {
15883 # Return a file name for a table, based on the table name, but perhaps
15884 # changed to get rid of non-portable characters in it, and to make
15885 # sure that it is unique on a file system that allows the names before
15886 # any period to be at most 8 characters (DOS). While we're at it
15887 # check and complain if there are any directory conflicts.
15889 # $name # The name to start with
15890 # $mutable # Boolean: can it be changed? If no, but
15891 # yet it must be to work properly, a warning
15893 # $directories_ref # A reference to an array containing the
15894 # path to the file, with each element one path
15895 # component. This is used because the same
15896 # name can be used in different directories.
15898 my $warn = ! defined wantarray; # If true, then if the name is
15899 # changed, a warning is issued as well.
15901 if (! defined $name) {
15902 Carp::my_carp("Undefined name in directory "
15903 . File::Spec->join(@$directories_ref)
15908 # Make sure that no directory names conflict with each other. Look at
15909 # each directory in the input file's path. If it is already in use,
15910 # assume it is correct, and is merely being re-used, but if we
15911 # truncate it to 8 characters, and find that there are two directories
15912 # that are the same for the first 8 characters, but differ after that,
15913 # then that is a problem.
15914 foreach my $directory (@$directories_ref) {
15915 my $short_dir = substr($directory, 0, 8);
15916 if (defined $full_dir_name_of{$short_dir}) {
15917 next if $full_dir_name_of{$short_dir} eq $directory;
15918 Carp::my_carp("Directory $directory conflicts with directory $full_dir_name_of{$short_dir}. Bad News. Continuing anyway");
15921 $full_dir_name_of{$short_dir} = $directory;
15925 my $path = join '/', @$directories_ref;
15926 $path .= '/' if $path;
15928 # Remove interior underscores.
15929 (my $filename = $name) =~ s/ (?<=.) _ (?=.) //xg;
15931 # Convert the dot in floating point numbers to an underscore
15932 $filename =~ s/\./_/ if $filename =~ / ^ \d+ \. \d+ $ /x;
15936 # Extract any suffix, delete any non-word character, and truncate to 3
15938 if ($filename =~ m/ ( .*? ) ( \. .* ) /x) {
15941 $suffix =~ s/\W+//g;
15942 substr($suffix, 4) = "" if length($suffix) > 4;
15945 # Change any non-word character outside the suffix into an underscore,
15946 # and truncate to 8.
15947 $filename =~ s/\W+/_/g; # eg., "L&" -> "L_"
15948 substr($filename, 8) = "" if length($filename) > 8;
15950 # Make sure the basename doesn't conflict with something we
15951 # might have already written. If we have, say,
15958 while (my $num = $base_names{$path}{lc "$filename$suffix"}++) {
15959 $num++; # so basenames with numbers start with '2', which
15960 # just looks more natural.
15962 # Want to append $num, but if it'll make the basename longer
15963 # than 8 characters, pre-truncate $filename so that the result
15965 my $delta = length($filename) + length($num) - 8;
15967 substr($filename, -$delta) = $num;
15972 if ($warn && ! $warned) {
15974 Carp::my_carp("'$path$name' conflicts with another name on a filesystem with 8 significant characters (like DOS). Proceeding anyway.");
15978 return $filename if $mutable;
15980 # If not changeable, must return the input name, but warn if needed to
15981 # change it beyond shortening it.
15982 if ($name ne $filename
15983 && substr($name, 0, length($filename)) ne $filename) {
15984 Carp::my_carp("'$path$name' had to be changed into '$filename'. Bad News. Proceeding anyway.");
15990 # The pod file contains a very large table. Many of the lines in that table
15991 # would exceed a typical output window's size, and so need to be wrapped with
15992 # a hanging indent to make them look good. The pod language is really
15993 # insufficient here. There is no general construct to do that in pod, so it
15994 # is done here by beginning each such line with a space to cause the result to
15995 # be output without formatting, and doing all the formatting here. This leads
15996 # to the result that if the eventual display window is too narrow it won't
15997 # look good, and if the window is too wide, no advantage is taken of that
15998 # extra width. A further complication is that the output may be indented by
15999 # the formatter so that there is less space than expected. What I (khw) have
16000 # done is to assume that that indent is a particular number of spaces based on
16001 # what it is in my Linux system; people can always resize their windows if
16002 # necessary, but this is obviously less than desirable, but the best that can
16004 my $automatic_pod_indent = 8;
16006 # Try to format so that uses fewest lines, but few long left column entries
16007 # slide into the right column. An experiment on 5.1 data yielded the
16008 # following percentages that didn't cut into the other side along with the
16009 # associated first-column widths
16011 # 80% not too bad except for a few blocks
16012 # 90% = 33; # , cuts 353/3053 lines from 37 = 12%
16014 my $indent_info_column = 27; # 75% of lines didn't have overlap
16016 my $FILLER = 3; # Length of initial boiler-plate columns in a pod line
16017 # The 3 is because of:
16018 # 1 for the leading space to tell the pod formatter to
16021 # 1 for the space between the flag and the main data
16023 sub format_pod_line($first_column_width, $entry, $info, $status = "", $loose_match = 1 ) {
16024 # Take a pod line and return it, formatted properly
16026 # $entry Contents of left column
16027 # $info Contents of right column
16030 $flags .= $STRICTER if ! $loose_match;
16032 $flags .= $status if $status;
16034 # There is a blank in the left column to cause the pod formatter to
16035 # output the line as-is.
16036 return sprintf " %-*s%-*s %s\n",
16037 # The first * in the format is replaced by this, the -1 is
16038 # to account for the leading blank. There isn't a
16039 # hard-coded blank after this to separate the flags from
16040 # the rest of the line, so that in the unlikely event that
16041 # multiple flags are shown on the same line, they both
16042 # will get displayed at the expense of that separation,
16043 # but since they are left justified, a blank will be
16044 # inserted in the normal case.
16048 # The other * in the format is replaced by this number to
16049 # cause the first main column to right fill with blanks.
16050 # The -1 is for the guaranteed blank following it.
16051 $first_column_width - $FILLER - 1,
16056 my @zero_match_tables; # List of tables that have no matches in this release
16058 sub make_re_pod_entries($input_table) {
16059 # This generates the entries for the pod file for a given table.
16060 # Also done at this time are any children tables. The output looks like:
16061 # \p{Common} \p{Script=Common} (Short: \p{Zyyy}) (5178)
16063 # Generate parent and all its children at the same time.
16064 return if $input_table->parent != $input_table;
16066 my $property = $input_table->property;
16067 my $type = $property->type;
16068 my $full_name = $property->full_name;
16070 my $count = $input_table->count;
16072 my $non_unicode_string;
16073 if ($count > $MAX_UNICODE_CODEPOINTS) {
16074 $unicode_count = $count - ($MAX_WORKING_CODEPOINT
16075 - $MAX_UNICODE_CODEPOINT);
16076 $non_unicode_string = " plus all above-Unicode code points";
16079 $unicode_count = $count;
16080 $non_unicode_string = "";
16083 my $string_count = clarify_number($unicode_count) . $non_unicode_string;
16085 my $definition = $input_table->calculate_table_definition;
16088 # Save the definition for later use.
16089 $input_table->set_definition($definition);
16091 $definition = ": $definition";
16094 my $status = $input_table->status;
16095 my $status_info = $input_table->status_info;
16096 my $caseless_equivalent = $input_table->caseless_equivalent;
16098 # Don't mention a placeholder equivalent as it isn't to be listed in the
16100 $caseless_equivalent = 0 if $caseless_equivalent != 0
16101 && $caseless_equivalent->fate > $ORDINARY;
16103 my $entry_for_first_table; # The entry for the first table output.
16104 # Almost certainly, it is the parent.
16106 # For each related table (including itself), we will generate a pod entry
16107 # for each name each table goes by
16108 foreach my $table ($input_table, $input_table->children) {
16110 # Unicode::UCD cannot deal with null string property values, so skip
16111 # any tables that have no non-null names.
16112 next if ! grep { $_->name ne "" } $table->aliases;
16114 # First, gather all the info that applies to this table as a whole.
16116 push @zero_match_tables, $table if $count == 0
16117 # Don't mention special tables
16118 # as being zero length
16119 && $table->fate == $ORDINARY;
16121 my $table_property = $table->property;
16123 # The short name has all the underscores removed, while the full name
16124 # retains them. Later, we decide whether to output a short synonym
16125 # for the full one, we need to compare apples to apples, so we use the
16126 # short name's length including underscores.
16127 my $table_property_short_name_length;
16128 my $table_property_short_name
16129 = $table_property->short_name(\$table_property_short_name_length);
16130 my $table_property_full_name = $table_property->full_name;
16132 # Get how much savings there is in the short name over the full one
16133 # (delta will always be <= 0)
16134 my $table_property_short_delta = $table_property_short_name_length
16135 - length($table_property_full_name);
16136 my @table_description = $table->description;
16137 my @table_note = $table->note;
16139 # Generate an entry for each alias in this table.
16140 my $entry_for_first_alias; # saves the first one encountered.
16141 foreach my $alias ($table->aliases) {
16143 # Skip if not to go in pod.
16144 next unless $alias->make_re_pod_entry;
16146 # Start gathering all the components for the entry
16147 my $name = $alias->name;
16149 # Skip if name is empty, as can't be accessed by regexes.
16150 next if $name eq "";
16152 my $entry; # Holds the left column, may include extras
16153 my $entry_ref; # To refer to the left column's contents from
16154 # another entry; has no extras
16156 # First the left column of the pod entry. Tables for the $perl
16157 # property always use the single form.
16158 if ($table_property == $perl) {
16159 $entry = "\\p{$name}";
16160 $entry .= " \\p$name" if length $name == 1; # Show non-braced
16162 $entry_ref = "\\p{$name}";
16164 else { # Compound form.
16166 # Only generate one entry for all the aliases that mean true
16167 # or false in binary properties. Append a '*' to indicate
16168 # some are missing. (The heading comment notes this.)
16170 if ($type == $BINARY) {
16171 next if $name ne 'N' && $name ne 'Y';
16174 elsif ($type != $FORCED_BINARY) {
16179 # Forced binary properties require special handling. It
16180 # has two sets of tables, one set is true/false; and the
16181 # other set is everything else. Entries are generated for
16182 # each set. Use the Bidi_Mirrored property (which appears
16183 # in all Unicode versions) to get a list of the aliases
16184 # for the true/false tables. Of these, only output the N
16185 # and Y ones, the same as, a regular binary property. And
16186 # output all the rest, same as a non-binary property.
16187 my $bm = property_ref("Bidi_Mirrored");
16188 if ($name eq 'N' || $name eq 'Y') {
16190 } elsif (grep { $name eq $_->name } $bm->table("Y")->aliases,
16191 $bm->table("N")->aliases)
16200 # Colon-space is used to give a little more space to be easier
16203 . $table_property_full_name
16206 # But for the reference to this entry, which will go in the
16207 # right column, where space is at a premium, use equals
16209 $entry_ref = "\\p{" . $table_property_full_name . "=$name}";
16212 # Then the right (info) column. This is stored as components of
16213 # an array for the moment, then joined into a string later. For
16214 # non-internal only properties, begin the info with the entry for
16215 # the first table we encountered (if any), as things are ordered
16216 # so that that one is the most descriptive. This leads to the
16217 # info column of an entry being a more descriptive version of the
16220 if ($name =~ /^_/) {
16222 '(For internal use by Perl, not necessarily stable)';
16224 elsif ($entry_for_first_alias) {
16225 push @info, $entry_for_first_alias;
16228 # If this entry is equivalent to another, add that to the info,
16229 # using the first such table we encountered
16230 if ($entry_for_first_table) {
16232 push @info, "(= $entry_for_first_table)";
16235 push @info, $entry_for_first_table;
16239 # If the name is a large integer, add an equivalent with an
16240 # exponent for better readability
16241 if ($name =~ /^[+-]?[\d]+$/ && $name >= 10_000) {
16242 push @info, sprintf "(= %.1e)", $name
16245 my $parenthesized = "";
16246 if (! $entry_for_first_alias) {
16248 # This is the first alias for the current table. The alias
16249 # array is ordered so that this is the fullest, most
16250 # descriptive alias, so it gets the fullest info. The other
16251 # aliases are mostly merely pointers to this one, using the
16252 # information already added above.
16254 # Display any status message, but only on the parent table
16255 if ($status && ! $entry_for_first_table) {
16256 push @info, $status_info;
16259 # Put out any descriptive info
16260 if (@table_description || @table_note) {
16261 push @info, join "; ", @table_description, @table_note;
16264 # Look to see if there is a shorter name we can point people
16266 my $standard_name = standardize($name);
16268 my $proposed_short = $table->short_name;
16269 if (defined $proposed_short) {
16270 my $standard_short = standardize($proposed_short);
16272 # If the short name is shorter than the standard one, or
16273 # even if it's not, but the combination of it and its
16274 # short property name (as in \p{prop=short} ($perl doesn't
16275 # have this form)) saves at least two characters, then,
16276 # cause it to be listed as a shorter synonym.
16277 if (length $standard_short < length $standard_name
16278 || ($table_property != $perl
16279 && (length($standard_short)
16280 - length($standard_name)
16281 + $table_property_short_delta) # (<= 0)
16284 $short_name = $proposed_short;
16285 if ($table_property != $perl) {
16286 $short_name = $table_property_short_name
16289 $short_name = "\\p{$short_name}";
16293 # And if this is a compound form name, see if there is a
16294 # single form equivalent
16296 if ($table_property != $perl && $table_property != $block) {
16298 # Special case the binary N tables, so that will print
16299 # \P{single}, but use the Y table values to populate
16300 # 'single', as we haven't likewise populated the N table.
16301 # For forced binary tables, we can't just look at the N
16302 # table, but must see if this table is equivalent to the N
16303 # one, as there are two equivalent beasts in these
16307 if ( ($type == $BINARY
16308 && $input_table == $property->table('No'))
16309 || ($type == $FORCED_BINARY
16310 && $property->table('No')->
16311 is_set_equivalent_to($input_table)))
16313 $test_table = $property->table('Yes');
16317 $test_table = $input_table;
16321 # Look for a single form amongst all the children.
16322 foreach my $table ($test_table->children) {
16323 next if $table->property != $perl;
16324 my $proposed_name = $table->short_name;
16325 next if ! defined $proposed_name;
16327 # Don't mention internal-only properties as a possible
16328 # single form synonym
16329 next if substr($proposed_name, 0, 1) eq '_';
16331 $proposed_name = "\\$p\{$proposed_name}";
16332 if (! defined $single_form
16333 || length($proposed_name) < length $single_form)
16335 $single_form = $proposed_name;
16337 # The goal here is to find a single form; not the
16338 # shortest possible one. We've already found a
16339 # short name. So, stop at the first single form
16340 # found, which is likely to be closer to the
16347 # Output both short and single in the same parenthesized
16348 # expression, but with only one of 'Single', 'Short' if there
16350 if ($short_name || $single_form || $table->conflicting) {
16351 $parenthesized .= "Short: $short_name" if $short_name;
16352 if ($short_name && $single_form) {
16353 $parenthesized .= ', ';
16355 elsif ($single_form) {
16356 $parenthesized .= 'Single: ';
16358 $parenthesized .= $single_form if $single_form;
16362 if ($caseless_equivalent != 0) {
16363 $parenthesized .= '; ' if $parenthesized ne "";
16364 $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
16368 # Warn if this property isn't the same as one that a
16369 # semi-casual user might expect. The other components of this
16370 # parenthesized structure are calculated only for the first entry
16371 # for this table, but the conflicting is deemed important enough
16372 # to go on every entry.
16373 my $conflicting = join " NOR ", $table->conflicting;
16374 if ($conflicting) {
16375 $parenthesized .= '; ' if $parenthesized ne "";
16376 $parenthesized .= "NOT $conflicting";
16379 push @info, "($parenthesized)" if $parenthesized;
16381 if ($name =~ /_$/ && $alias->loose_match) {
16382 push @info, "Note the trailing '_' matters in spite of loose matching rules.";
16385 if ($table_property != $perl && $table->perl_extension) {
16386 push @info, '(Perl extension)';
16388 my $definition = $table->definition // "";
16389 $definition = "" if $entry_for_first_alias;
16390 $definition = ": $definition" if $definition;
16391 push @info, "($string_count$definition)";
16393 # Now, we have both the entry and info so add them to the
16394 # list of all the properties.
16395 push @match_properties,
16396 format_pod_line($indent_info_column,
16400 $alias->loose_match);
16402 $entry_for_first_alias = $entry_ref unless $entry_for_first_alias;
16403 } # End of looping through the aliases for this table.
16405 if (! $entry_for_first_table) {
16406 $entry_for_first_table = $entry_for_first_alias;
16408 } # End of looping through all the related tables
16412 sub make_ucd_table_pod_entries($table) {
16413 # Generate the entries for the UCD section of the pod for $table. This
16414 # also calculates if names are ambiguous, so has to be called even if the
16415 # pod is not being output
16417 my $short_name = $table->name;
16418 my $standard_short_name = standardize($short_name);
16419 my $full_name = $table->full_name;
16420 my $standard_full_name = standardize($full_name);
16422 my $full_info = ""; # Text of info column for full-name entries
16423 my $other_info = ""; # Text of info column for short-name entries
16424 my $short_info = ""; # Text of info column for other entries
16425 my $meaning = ""; # Synonym of this table
16427 my $property = ($table->isa('Property'))
16429 : $table->parent->property;
16431 my $perl_extension = $table->perl_extension;
16432 my $is_perl_extension_match_table_but_not_dollar_perl
16433 = $property != $perl
16435 && $property != $table;
16437 # Get the more official name for perl extensions that aren't
16438 # stand-alone properties
16439 if ($is_perl_extension_match_table_but_not_dollar_perl) {
16440 if ($property->type == $BINARY) {
16441 $meaning = $property->full_name;
16444 $meaning = $table->parent->complete_name;
16448 # There are three types of info column. One for the short name, one for
16449 # the full name, and one for everything else. They mostly are the same,
16450 # so initialize in the same loop.
16452 foreach my $info_ref (\$full_info, \$short_info, \$other_info) {
16453 if ($info_ref != \$full_info) {
16455 # The non-full name columns include the full name
16456 $$info_ref .= $full_name;
16460 if ($is_perl_extension_match_table_but_not_dollar_perl) {
16462 # Add the synonymous name for the non-full name entries; and to
16463 # the full-name entry if it adds extra information
16464 if ( standardize($meaning) ne $standard_full_name
16465 || $info_ref == \$other_info
16466 || $info_ref == \$short_info)
16468 my $parenthesized = $info_ref != \$full_info;
16469 $$info_ref .= " " if $$info_ref && $parenthesized;
16470 $$info_ref .= "(=" if $parenthesized;
16471 $$info_ref .= "$meaning";
16472 $$info_ref .= ")" if $parenthesized;
16477 # And the full-name entry includes the short name, if shorter
16478 if ($info_ref == \$full_info
16479 && length $standard_short_name < length $standard_full_name)
16481 $full_info =~ s/\.\Z//;
16482 $full_info .= " " if $full_info;
16483 $full_info .= "(Short: $short_name)";
16486 if ($table->perl_extension) {
16487 $$info_ref =~ s/\.\Z//;
16488 $$info_ref .= ". " if $$info_ref;
16489 $$info_ref .= "(Perl extension)";
16494 my $definition_table;
16495 my $type = $table->property->type;
16496 if ($type == $BINARY || $type == $FORCED_BINARY) {
16497 $definition_table = $table->property->table('Y');
16499 elsif ($table->isa('Match_Table')) {
16500 $definition_table = $table;
16503 $definition = $definition_table->calculate_table_definition
16504 if defined $definition_table
16505 && $definition_table != 0;
16507 # Add any extra annotations to the full name entry
16508 foreach my $more_info ($table->description,
16511 $table->status_info)
16513 next unless $more_info;
16514 $full_info =~ s/\.\Z//;
16515 $full_info .= ". " if $full_info;
16516 $full_info .= $more_info;
16518 if ($table->property->type == $FORCED_BINARY) {
16520 $full_info =~ s/\.\Z//;
16521 $full_info .= ". ";
16523 $full_info .= "This is a combination property which has both:"
16524 . " 1) a map to various string values; and"
16525 . " 2) a map to boolean Y/N, where 'Y' means the"
16526 . " string value is non-empty. Add the prefix 'is'"
16527 . " to the prop_invmap() call to get the latter";
16530 # These keep track if have created full and short name pod entries for the
16533 my $done_short = 0;
16535 # Every possible name is kept track of, even those that aren't going to be
16536 # output. This way we can be sure to find the ambiguities.
16537 foreach my $alias ($table->aliases) {
16538 my $name = $alias->name;
16539 my $standard = standardize($name);
16541 my $output_this = $alias->ucd;
16543 # If the full and short names are the same, we want to output the full
16544 # one's entry, so it has priority.
16545 if ($standard eq $standard_full_name) {
16546 next if $done_full;
16548 $info = $full_info;
16550 elsif ($standard eq $standard_short_name) {
16551 next if $done_short;
16553 next if $standard_short_name eq $standard_full_name;
16554 $info = $short_info;
16557 $info = $other_info;
16560 $combination_property{$standard} = 1
16561 if $table->property->type == $FORCED_BINARY;
16563 # Here, we have set up the two columns for this entry. But if an
16564 # entry already exists for this name, we have to decide which one
16565 # we're going to later output.
16566 if (exists $ucd_pod{$standard}) {
16568 # If the two entries refer to the same property, it's not going to
16569 # be ambiguous. (Likely it's because the names when standardized
16570 # are the same.) But that means if they are different properties,
16571 # there is ambiguity.
16572 if ($ucd_pod{$standard}->{'property'} != $property) {
16574 # Here, we have an ambiguity. This code assumes that one is
16575 # scheduled to be output and one not and that one is a perl
16576 # extension (which is not to be output) and the other isn't.
16577 # If those assumptions are wrong, things have to be rethought.
16578 if ($ucd_pod{$standard}{'output_this'} == $output_this
16579 || $ucd_pod{$standard}{'perl_extension'} == $perl_extension
16580 || $output_this == $perl_extension)
16582 Carp::my_carp("Bad news. $property and $ucd_pod{$standard}->{'property'} have unexpected output status and perl-extension combinations. Proceeding anyway.");
16585 # We modify the info column of the one being output to
16586 # indicate the ambiguity. Set $which to point to that one's
16589 if ($ucd_pod{$standard}{'output_this'}) {
16590 $which = \$ucd_pod{$standard}->{'info'};
16594 $meaning = $ucd_pod{$standard}{'meaning'};
16598 $$which =~ s/\.\Z//;
16599 $$which .= "; NOT '$standard' meaning '$meaning'";
16601 $ambiguous_names{$standard} = 1;
16604 # Use the non-perl-extension variant
16605 next unless $ucd_pod{$standard}{'perl_extension'};
16608 # Store enough information about this entry that we can later look for
16609 # ambiguities, and output it properly.
16610 $ucd_pod{$standard} = { 'name' => $name,
16612 'meaning' => $meaning,
16613 'output_this' => $output_this,
16614 'perl_extension' => $perl_extension,
16615 'property' => $property,
16616 'status' => $alias->status,
16618 } # End of looping through all this table's aliases
16623 sub pod_alphanumeric_sort {
16624 # Sort pod entries alphanumerically.
16626 # The first few character columns are filler, plus the '\p{'; and get rid
16627 # of all the trailing stuff, starting with the trailing '}', so as to sort
16628 # on just 'Name=Value'
16629 (my $a = lc $a) =~ s/^ .*? \{ //x;
16631 (my $b = lc $b) =~ s/^ .*? \{ //x;
16634 # Determine if the two operands are both internal only or both not.
16635 # Character 0 should be a '\'; 1 should be a p; 2 should be '{', so 3
16636 # should be the underscore that begins internal only
16637 my $a_is_internal = (substr($a, 0, 1) eq '_');
16638 my $b_is_internal = (substr($b, 0, 1) eq '_');
16640 # Sort so the internals come last in the table instead of first (which the
16641 # leading underscore would otherwise indicate).
16642 if ($a_is_internal != $b_is_internal) {
16643 return 1 if $a_is_internal;
16647 # Determine if the two operands are compound or not, and if so if are
16648 # "numeric" property values or not, like \p{Age: 3.0}. But there are also
16649 # things like \p{Canonical_Combining_Class: CCC133} and \p{Age: V10_0},
16650 # all of which this considers numeric, and for sorting, looks just at the
16651 # numeric parts. It can also be a rational like \p{Numeric Value=-1/2}.
16653 ^ ( [^:=]+ ) # $1 is undef if not a compound form, otherwise is the
16655 [:=] \s* # The syntax for the compound form
16656 (?: # followed by ...
16657 ( # $2 gets defined if what follows is a "numeric"
16658 # expression, which is ...
16659 ( -? \d+ (?: [.\/] \d+)? # An integer, float, or rational
16660 # number, optionally signed
16661 | [[:alpha:]]{2,} \d+ $ ) # or something like CCC131. Either
16662 # of these go into $3
16663 | ( V \d+ _ \d+ ) # or a Unicode's Age property version
16666 | .* $ # If not "numeric", accept anything so that $1 gets
16667 # defined if it is any compound form
16669 my ($a_initial, $a_numeric, $a_number, $a_version) = ($a =~ $split_re);
16670 my ($b_initial, $b_numeric, $b_number, $b_version) = ($b =~ $split_re);
16672 # Sort alphabeticlly on the whole property name if either operand isn't
16673 # compound, or they differ.
16674 return $a cmp $b if ! defined $a_initial
16675 || ! defined $b_initial
16676 || $a_initial ne $b_initial;
16678 if (! defined $a_numeric) {
16680 # If neither is numeric, use alpha sort
16681 return $a cmp $b if ! defined $b_numeric;
16682 return 1; # Sort numeric ahead of alpha
16685 # Here $a is numeric
16686 return -1 if ! defined $b_numeric; # Numeric sorts before alpha
16688 # Here they are both numeric in the same property.
16689 # Convert version numbers into regular numbers
16690 if (defined $a_version) {
16691 ($a_number = $a_version) =~ s/^V//i;
16692 $a_number =~ s/_/./;
16694 else { # Otherwise get rid of the, e.g., CCC in CCC9 */
16695 $a_number =~ s/ ^ [[:alpha:]]+ //x;
16697 if (defined $b_version) {
16698 ($b_number = $b_version) =~ s/^V//i;
16699 $b_number =~ s/_/./;
16702 $b_number =~ s/ ^ [[:alpha:]]+ //x;
16705 # Convert rationals to floating for the comparison.
16706 $a_number = eval $a_number if $a_number =~ qr{/};
16707 $b_number = eval $b_number if $b_number =~ qr{/};
16709 return $a_number <=> $b_number || $a cmp $b;
16713 # Create the .pod file. This generates the various subsections and then
16714 # combines them in one big HERE document.
16716 my $Is_flags_text = "If an entry has flag(s) at its beginning, like \"$DEPRECATED\", the \"Is_\" form has the same flag(s)";
16718 return unless defined $pod_directory;
16719 print "Making pod file\n" if $verbosity >= $PROGRESS;
16721 my $exception_message =
16722 '(Any exceptions are individually noted beginning with the word NOT.)';
16724 if (-e 'Blocks.txt') {
16726 # Add the line: '\p{In_*} \p{Block: *}', with the warning message
16727 # if the global $has_In_conflicts indicates we have them.
16728 push @match_properties, format_pod_line($indent_info_column,
16731 . (($has_In_conflicts)
16732 ? " $exception_message"
16735 @block_warning = << "END";
16737 In particular, matches in the Block property have single forms
16738 defined by Perl that begin with C<"In_">, C<"Is_>, or even with no prefix at
16739 all, Like all B<DISCOURAGED> forms, these are not stable. For example,
16740 C<\\p{Block=Deseret}> can currently be written as C<\\p{In_Deseret}>,
16741 C<\\p{Is_Deseret}>, or C<\\p{Deseret}>. But, a new Unicode version may
16742 come along that would force Perl to change the meaning of one or more of
16743 these, and your program would no longer be correct. Currently there are no
16744 such conflicts with the form that begins C<"In_">, but there are many with the
16745 other two shortcuts, and Unicode continues to define new properties that begin
16746 with C<"In">, so it's quite possible that a conflict will occur in the future.
16747 The compound form is guaranteed to not become obsolete, and its meaning is
16748 clearer anyway. See L<perlunicode/"Blocks"> for more information about this.
16750 User-defined properties must begin with "In" or "Is". These override any
16751 Unicode property of the same name.
16754 my $text = $Is_flags_text;
16755 $text = "$exception_message $text" if $has_Is_conflicts;
16757 # And the 'Is_ line';
16758 push @match_properties, format_pod_line($indent_info_column,
16761 push @match_properties, format_pod_line($indent_info_column,
16763 "Combination of Name and Name_Alias properties; has special"
16764 . " loose matching rules, for which see Unicode UAX #44");
16765 push @match_properties, format_pod_line($indent_info_column,
16769 # Sort the properties array for output. It is sorted alphabetically
16770 # except numerically for numeric properties, and only output unique lines.
16771 @match_properties = sort pod_alphanumeric_sort uniques @match_properties;
16773 my $formatted_properties = simple_fold(\@match_properties,
16775 # indent succeeding lines by two extra
16776 # which looks better
16777 $indent_info_column + 2,
16779 # shorten the line length by how much
16780 # the formatter indents, so the folded
16781 # line will fit in the space
16782 # presumably available
16783 $automatic_pod_indent);
16784 # Add column headings, indented to be a little more centered, but not
16786 $formatted_properties = format_pod_line($indent_info_column,
16790 . $formatted_properties;
16792 # Generate pod documentation lines for the tables that match nothing
16793 my $zero_matches = "";
16794 if (@zero_match_tables) {
16795 @zero_match_tables = uniques(@zero_match_tables);
16796 $zero_matches = join "\n\n",
16797 map { $_ = '=item \p{' . $_->complete_name . "}" }
16798 sort { $a->complete_name cmp $b->complete_name }
16799 @zero_match_tables;
16801 $zero_matches = <<END;
16803 =head2 Legal C<\\p{}> and C<\\P{}> constructs that match no characters
16805 Unicode has some property-value pairs that currently don't match anything.
16806 This happens generally either because they are obsolete, or they exist for
16807 symmetry with other forms, but no language has yet been encoded that uses
16808 them. In this version of Unicode, the following match zero code points:
16819 # Generate list of properties that we don't accept, grouped by the reasons
16820 # why. This is so only put out the 'why' once, and then list all the
16821 # properties that have that reason under it.
16823 my %why_list; # The keys are the reasons; the values are lists of
16824 # properties that have the key as their reason
16826 # For each property, add it to the list that are suppressed for its reason
16827 # The sort will cause the alphabetically first properties to be added to
16828 # each list first, so each list will be sorted.
16829 foreach my $property (sort keys %why_suppressed) {
16830 next unless $why_suppressed{$property};
16831 push @{$why_list{$why_suppressed{$property}}}, $property;
16834 # For each reason (sorted by the first property that has that reason)...
16835 my @bad_re_properties;
16836 foreach my $why (sort { $why_list{$a}->[0] cmp $why_list{$b}->[0] }
16839 # Add to the output, all the properties that have that reason.
16840 my $has_item = 0; # Flag if actually output anything.
16841 foreach my $name (@{$why_list{$why}}) {
16843 # Split compound names into $property and $table components
16844 my $property = $name;
16846 if ($property =~ / (.*) = (.*) /x) {
16851 # This release of Unicode may not have a property that is
16852 # suppressed, so don't reference a non-existent one.
16853 $property = property_ref($property);
16854 next if ! defined $property;
16856 # And since this list is only for match tables, don't list the
16857 # ones that don't have match tables.
16858 next if ! $property->to_create_match_tables;
16860 # Find any abbreviation, and turn it into a compound name if this
16861 # is a property=value pair.
16862 my $short_name = $property->name;
16863 $short_name .= '=' . $property->table($table)->name if $table;
16865 # Start with an empty line.
16866 push @bad_re_properties, "\n\n" unless $has_item;
16868 # And add the property as an item for the reason.
16869 push @bad_re_properties, "\n=item I<$name> ($short_name)\n";
16873 # And add the reason under the list of properties, if such a list
16874 # actually got generated. Note that the header got added
16875 # unconditionally before. But pod ignores extra blank lines, so no
16877 push @bad_re_properties, "\n$why\n" if $has_item;
16879 } # End of looping through each reason.
16881 if (! @bad_re_properties) {
16882 push @bad_re_properties,
16883 "*** This installation accepts ALL non-Unihan properties ***";
16886 # Add =over only if non-empty to avoid an empty =over/=back section,
16887 # which is considered bad form.
16888 unshift @bad_re_properties, "\n=over 4\n";
16889 push @bad_re_properties, "\n=back\n";
16892 # Similarly, generate a list of files that we don't use, grouped by the
16893 # reasons why (Don't output if the reason is empty). First, create a hash
16894 # whose keys are the reasons, and whose values are anonymous arrays of all
16895 # the files that share that reason.
16896 my %grouped_by_reason;
16897 foreach my $file (keys %skipped_files) {
16898 next unless $skipped_files{$file};
16899 push @{$grouped_by_reason{$skipped_files{$file}}}, $file;
16902 # Then, sort each group.
16903 foreach my $group (keys %grouped_by_reason) {
16904 @{$grouped_by_reason{$group}} = sort { lc $a cmp lc $b }
16905 @{$grouped_by_reason{$group}} ;
16908 # Finally, create the output text. For each reason (sorted by the
16909 # alphabetically first file that has that reason)...
16911 foreach my $reason (sort { lc $grouped_by_reason{$a}->[0]
16912 cmp lc $grouped_by_reason{$b}->[0]
16914 keys %grouped_by_reason)
16916 # Add all the files that have that reason to the output. Start
16917 # with an empty line.
16918 push @unused_files, "\n\n";
16919 push @unused_files, map { "\n=item F<$_> \n" }
16920 @{$grouped_by_reason{$reason}};
16921 # And add the reason under the list of files
16922 push @unused_files, "\n$reason\n";
16925 # Similarly, create the output text for the UCD section of the pod
16927 foreach my $key (keys %ucd_pod) {
16928 next unless $ucd_pod{$key}->{'output_this'};
16929 push @ucd_pod, format_pod_line($indent_info_column,
16930 $ucd_pod{$key}->{'name'},
16931 $ucd_pod{$key}->{'info'},
16932 $ucd_pod{$key}->{'status'},
16936 # Sort alphabetically, and fold for output
16937 @ucd_pod = sort { lc substr($a, 2) cmp lc substr($b, 2) } @ucd_pod;
16938 my $ucd_pod = simple_fold(\@ucd_pod,
16940 $indent_info_column,
16941 $automatic_pod_indent);
16942 $ucd_pod = format_pod_line($indent_info_column, 'NAME', ' INFO')
16945 my $space_hex = sprintf("%02x", ord " ");
16948 # Everything is ready to assemble.
16949 my @OUT = << "END";
16954 To change this file, edit $0 instead.
16960 $pod_file - Index of Unicode Version $unicode_version character properties in Perl
16964 This document provides information about the portion of the Unicode database
16965 that deals with character properties, that is the portion that is defined on
16966 single code points. (L</Other information in the Unicode data base>
16967 below briefly mentions other data that Unicode provides.)
16969 Perl can provide access to all non-provisional Unicode character properties,
16970 though not all are enabled by default. The omitted ones are the Unihan
16971 properties and certain
16972 deprecated or Unicode-internal properties. (An installation may choose to
16973 recompile Perl's tables to change this. See L</Unicode character
16974 properties that are NOT accepted by Perl>.)
16976 For most purposes, access to Unicode properties from the Perl core is through
16977 regular expression matches, as described in the next section.
16978 For some special purposes, and to access the properties that are not suitable
16979 for regular expression matching, all the Unicode character properties that
16980 Perl handles are accessible via the standard L<Unicode::UCD> module, as
16981 described in the section L</Properties accessible through Unicode::UCD>.
16983 Perl also provides some additional extensions and short-cut synonyms
16984 for Unicode properties.
16986 This document merely lists all available properties and does not attempt to
16987 explain what each property really means. There is a brief description of each
16988 Perl extension; see L<perlunicode/Other Properties> for more information on
16989 these. There is some detail about Blocks, Scripts, General_Category,
16990 and Bidi_Class in L<perlunicode>, but to find out about the intricacies of the
16991 official Unicode properties, refer to the Unicode standard. A good starting
16992 place is L<$unicode_reference_url>.
16994 Note that you can define your own properties; see
16995 L<perlunicode/"User-Defined Character Properties">.
16997 =head1 Properties accessible through C<\\p{}> and C<\\P{}>
16999 The Perl regular expression C<\\p{}> and C<\\P{}> constructs give access to
17000 most of the Unicode character properties. The table below shows all these
17001 constructs, both single and compound forms.
17003 B<Compound forms> consist of two components, separated by an equals sign or a
17004 colon. The first component is the property name, and the second component is
17005 the particular value of the property to match against, for example,
17006 C<\\p{Script_Extensions: Greek}> and C<\\p{Script_Extensions=Greek}> both mean
17007 to match characters whose Script_Extensions property value is Greek.
17008 (C<Script_Extensions> is an improved version of the C<Script> property.)
17010 B<Single forms>, like C<\\p{Greek}>, are mostly Perl-defined shortcuts for
17011 their equivalent compound forms. The table shows these equivalences. (In our
17012 example, C<\\p{Greek}> is a just a shortcut for
17013 C<\\p{Script_Extensions=Greek}>). There are also a few Perl-defined single
17014 forms that are not shortcuts for a compound form. One such is C<\\p{Word}>.
17015 These are also listed in the table.
17017 In parsing these constructs, Perl always ignores Upper/lower case differences
17018 everywhere within the {braces}. Thus C<\\p{Greek}> means the same thing as
17019 C<\\p{greek}>. But note that changing the case of the C<"p"> or C<"P"> before
17020 the left brace completely changes the meaning of the construct, from "match"
17021 (for C<\\p{}>) to "doesn't match" (for C<\\P{}>). Casing in this document is
17022 for improved legibility.
17024 Also, white space, hyphens, and underscores are normally ignored
17025 everywhere between the {braces}, and hence can be freely added or removed
17026 even if the C</x> modifier hasn't been specified on the regular expression.
17027 But in the table below $a_bold_stricter at the beginning of an entry
17028 means that tighter (stricter) rules are used for that entry:
17034 =item Single form (C<\\p{name}>) tighter rules:
17036 White space, hyphens, and underscores ARE significant
17041 =item * white space adjacent to a non-word character
17043 =item * underscores separating digits in numbers
17047 That means, for example, that you can freely add or remove white space
17048 adjacent to (but within) the braces without affecting the meaning.
17050 =item Compound form (C<\\p{name=value}> or C<\\p{name:value}>) tighter rules:
17052 The tighter rules given above for the single form apply to everything to the
17053 right of the colon or equals; the looser rules still apply to everything to
17056 That means, for example, that you can freely add or remove white space
17057 adjacent to (but within) the braces and the colon or equal sign.
17063 Some properties are considered obsolete by Unicode, but still available.
17064 There are several varieties of obsolescence:
17072 A property may be stabilized. Such a determination does not indicate
17073 that the property should or should not be used; instead it is a declaration
17074 that the property will not be maintained nor extended for newly encoded
17075 characters. Such properties are marked with $a_bold_stabilized in the
17080 A property may be deprecated, perhaps because its original intent
17081 has been replaced by another property, or because its specification was
17082 somehow defective. This means that its use is strongly
17083 discouraged, so much so that a warning will be issued if used, unless the
17084 regular expression is in the scope of a C<S<no warnings 'deprecated'>>
17085 statement. $A_bold_deprecated flags each such entry in the table, and
17086 the entry there for the longest, most descriptive version of the property will
17087 give the reason it is deprecated, and perhaps advice. Perl may issue such a
17088 warning, even for properties that aren't officially deprecated by Unicode,
17089 when there used to be characters or code points that were matched by them, but
17090 no longer. This is to warn you that your program may not work like it did on
17091 earlier Unicode releases.
17093 A deprecated property may be made unavailable in a future Perl version, so it
17094 is best to move away from them.
17096 A deprecated property may also be stabilized, but this fact is not shown.
17100 Properties marked with $a_bold_obsolete in the table are considered (plain)
17101 obsolete. Generally this designation is given to properties that Unicode once
17102 used for internal purposes (but not any longer).
17106 This is not actually a Unicode-specified obsolescence, but applies to certain
17107 Perl extensions that are present for backwards compatibility, but are
17108 discouraged from being used. These are not obsolete, but their meanings are
17109 not stable. Future Unicode versions could force any of these extensions to be
17110 removed without warning, replaced by another property with the same name that
17111 means something different. $A_bold_discouraged flags each such entry in the
17112 table. Use the equivalent shown instead.
17120 The table below has two columns. The left column contains the C<\\p{}>
17121 constructs to look up, possibly preceded by the flags mentioned above; and
17122 the right column contains information about them, like a description, or
17123 synonyms. The table shows both the single and compound forms for each
17124 property that has them. If the left column is a short name for a property,
17125 the right column will give its longer, more descriptive name; and if the left
17126 column is the longest name, the right column will show any equivalent shortest
17127 name, in both single and compound forms if applicable.
17129 If braces are not needed to specify a property (e.g., C<\\pL>), the left
17130 column contains both forms, with and without braces.
17132 The right column will also caution you if a property means something different
17133 than what might normally be expected.
17135 All single forms are Perl extensions; a few compound forms are as well, and
17138 Numbers in (parentheses) indicate the total number of Unicode code points
17139 matched by the property. For the entries that give the longest, most
17140 descriptive version of the property, the count is followed by a list of some
17141 of the code points matched by it. The list includes all the matched
17142 characters in the 0-255 range, enclosed in the familiar [brackets] the same as
17143 a regular expression bracketed character class. Following that, the next few
17144 higher matching ranges are also given. To avoid visual ambiguity, the SPACE
17145 character is represented as C<\\x$space_hex>.
17147 For emphasis, those properties that match no code points at all are listed as
17148 well in a separate section following the table.
17150 Most properties match the same code points regardless of whether C<"/i">
17151 case-insensitive matching is specified or not. But a few properties are
17152 affected. These are shown with the notation S<C<(/i= I<other_property>)>>
17153 in the second column. Under case-insensitive matching they match the
17154 same code pode points as the property I<other_property>.
17156 There is no description given for most non-Perl defined properties (See
17157 L<$unicode_reference_url> for that).
17159 For compactness, 'B<*>' is used as a wildcard instead of showing all possible
17160 combinations. For example, entries like:
17162 \\p{Gc: *} \\p{General_Category: *}
17164 mean that 'Gc' is a synonym for 'General_Category', and anything that is valid
17165 for the latter is also valid for the former. Similarly,
17169 means that if and only if, for example, C<\\p{Foo}> exists, then
17170 C<\\p{Is_Foo}> and C<\\p{IsFoo}> are also valid and all mean the same thing.
17171 And similarly, C<\\p{Foo=Bar}> means the same as C<\\p{Is_Foo=Bar}> and
17172 C<\\p{IsFoo=Bar}>. "*" here is restricted to something not beginning with an
17175 Also, in binary properties, 'Yes', 'T', and 'True' are all synonyms for 'Y'.
17176 And 'No', 'F', and 'False' are all synonyms for 'N'. The table shows 'Y*' and
17177 'N*' to indicate this, and doesn't have separate entries for the other
17178 possibilities. Note that not all properties which have values 'Yes' and 'No'
17179 are binary, and they have all their values spelled out without using this wild
17180 card, and a C<NOT> clause in their description that highlights their not being
17181 binary. These also require the compound form to match them, whereas true
17182 binary properties have both single and compound forms available.
17184 Note that all non-essential underscores are removed in the display of the
17191 =item Z<>B<*> is a wild-card
17193 =item B<(\\d+)> in the info column gives the number of Unicode code points matched
17196 =item B<$DEPRECATED> means this is deprecated.
17198 =item B<$OBSOLETE> means this is obsolete.
17200 =item B<$STABILIZED> means this is stabilized.
17202 =item B<$STRICTER> means tighter (stricter) name matching applies.
17204 =item B<$DISCOURAGED> means use of this form is discouraged, and may not be
17209 $formatted_properties
17213 =head1 Properties accessible through Unicode::UCD
17215 The value of any Unicode (not including Perl extensions) character
17216 property mentioned above for any single code point is available through
17217 L<Unicode::UCD/charprop()>. L<Unicode::UCD/charprops_all()> returns the
17218 values of all the Unicode properties for a given code point.
17220 Besides these, all the Unicode character properties mentioned above
17221 (except for those marked as for internal use by Perl) are also
17222 accessible by L<Unicode::UCD/prop_invlist()>.
17224 Due to their nature, not all Unicode character properties are suitable for
17225 regular expression matches, nor C<prop_invlist()>. The remaining
17226 non-provisional, non-internal ones are accessible via
17227 L<Unicode::UCD/prop_invmap()> (except for those that this Perl installation
17228 hasn't included; see L<below for which those are|/Unicode character properties
17229 that are NOT accepted by Perl>).
17231 For compatibility with other parts of Perl, all the single forms given in the
17232 table in the L<section above|/Properties accessible through \\p{} and \\P{}>
17233 are recognized. BUT, there are some ambiguities between some Perl extensions
17234 and the Unicode properties, all of which are silently resolved in favor of the
17235 official Unicode property. To avoid surprises, you should only use
17236 C<prop_invmap()> for forms listed in the table below, which omits the
17237 non-recommended ones. The affected forms are the Perl single form equivalents
17238 of Unicode properties, such as C<\\p{sc}> being a single-form equivalent of
17239 C<\\p{gc=sc}>, which is treated by C<prop_invmap()> as the C<Script> property,
17240 whose short name is C<sc>. The table indicates the current ambiguities in the
17241 INFO column, beginning with the word C<"NOT">.
17243 The standard Unicode properties listed below are documented in
17244 L<$unicode_reference_url>; Perl_Decimal_Digit is documented in
17245 L<Unicode::UCD/prop_invmap()>. The other Perl extensions are in
17246 L<perlunicode/Other Properties>;
17248 The first column in the table is a name for the property; the second column is
17249 an alternative name, if any, plus possibly some annotations. The alternative
17250 name is the property's full name, unless that would simply repeat the first
17251 column, in which case the second column indicates the property's short name
17252 (if different). The annotations are given only in the entry for the full
17253 name. The annotations for binary properties include a list of the first few
17254 ranges that the property matches. To avoid any ambiguity, the SPACE character
17255 is represented as C<\\x$space_hex>.
17257 If a property is obsolete, etc, the entry will be flagged with the same
17258 characters used in the table in the L<section above|/Properties accessible
17259 through \\p{} and \\P{}>, like B<$DEPRECATED> or B<$STABILIZED>.
17263 =head1 Properties accessible through other means
17265 Certain properties are accessible also via core function calls. These are:
17267 Lowercase_Mapping lc() and lcfirst()
17268 Titlecase_Mapping ucfirst()
17269 Uppercase_Mapping uc()
17271 Also, Case_Folding is accessible through the C</i> modifier in regular
17272 expressions, the C<\\F> transliteration escape, and the C<L<fc|perlfunc/fc>>
17275 Besides being able to say C<\\p{Name=...}>, the Name and Name_Aliases
17276 properties are accessible through the C<\\N{}> interpolation in double-quoted
17277 strings and regular expressions; and functions C<charnames::viacode()>,
17278 C<charnames::vianame()>, and C<charnames::string_vianame()> (which require a
17279 C<use charnames ();> to be specified.
17281 Finally, most properties related to decomposition are accessible via
17282 L<Unicode::Normalize>.
17284 =head1 Unicode character properties that are NOT accepted by Perl
17286 Perl will generate an error for a few character properties in Unicode when
17287 used in a regular expression. The non-Unihan ones are listed below, with the
17288 reasons they are not accepted, perhaps with work-arounds. The short names for
17289 the properties are listed enclosed in (parentheses).
17290 As described after the list, an installation can change the defaults and choose
17291 to accept any of these. The list is machine generated based on the
17292 choices made for the installation that generated this document.
17296 An installation can choose to allow any of these to be matched by downloading
17297 the Unicode database from L<http://www.unicode.org/Public/> to
17298 C<\$Config{privlib}>/F<unicore/> in the Perl source tree, changing the
17299 controlling lists contained in the program
17300 C<\$Config{privlib}>/F<unicore/mktables> and then re-compiling and installing.
17301 (C<\%Config> is available from the Config module).
17303 Also, perl can be recompiled to operate on an earlier version of the Unicode
17304 standard. Further information is at
17305 C<\$Config{privlib}>/F<unicore/README.perl>.
17307 =head1 Other information in the Unicode data base
17309 The Unicode data base is delivered in two different formats. The XML version
17310 is valid for more modern Unicode releases. The other version is a collection
17311 of files. The two are intended to give equivalent information. Perl uses the
17312 older form; this allows you to recompile Perl to use early Unicode releases.
17314 The only non-character property that Perl currently supports is Named
17315 Sequences, in which a sequence of code points
17316 is given a name and generally treated as a single entity. (Perl supports
17317 these via the C<\\N{...}> double-quotish construct,
17318 L<charnames/charnames::string_vianame(name)>, and L<Unicode::UCD/namedseq()>.
17320 Below is a list of the files in the Unicode data base that Perl doesn't
17321 currently use, along with very brief descriptions of their purposes.
17322 Some of the names of the files have been shortened from those that Unicode
17323 uses, in order to allow them to be distinguishable from similarly named files
17324 on file systems for which only the first 8 characters of a name are
17335 L<$unicode_reference_url>
17343 # And write it. The 0 means no utf8.
17344 main::write([ $pod_directory, "$pod_file.pod" ], 0, \@OUT);
17348 sub make_Name_pm () {
17349 # Create and write Name.pm, which contains subroutines and data to use in
17350 # conjunction with Name.pl
17352 # Maybe there's nothing to do.
17353 return unless $has_hangul_syllables || @code_points_ending_in_code_point;
17357 $INTERNAL_ONLY_HEADER
17361 # Convert these structures to output format.
17362 my $code_points_ending_in_code_point =
17363 main::simple_dumper(\@code_points_ending_in_code_point,
17365 my $names = main::simple_dumper(\%names_ending_in_code_point,
17367 my $loose_names = main::simple_dumper(\%loose_names_ending_in_code_point,
17370 # Do the same with the Hangul names,
17376 if ($has_hangul_syllables) {
17378 # Construct a regular expression of all the possible
17379 # combinations of the Hangul syllables.
17380 my @L_re; # Leading consonants
17381 for my $i ($LBase .. $LBase + $LCount - 1) {
17382 push @L_re, $Jamo{$i}
17384 my @V_re; # Middle vowels
17385 for my $i ($VBase .. $VBase + $VCount - 1) {
17386 push @V_re, $Jamo{$i}
17388 my @T_re; # Trailing consonants
17389 for my $i ($TBase + 1 .. $TBase + $TCount - 1) {
17390 push @T_re, $Jamo{$i}
17393 # The whole re is made up of the L V T combination.
17395 . join ('|', sort @L_re)
17397 . join ('|', sort @V_re)
17399 . join ('|', sort @T_re)
17402 # These hashes needed by the algorithm were generated
17403 # during reading of the Jamo.txt file
17404 $jamo = main::simple_dumper(\%Jamo, ' ' x 8);
17405 $jamo_l = main::simple_dumper(\%Jamo_L, ' ' x 8);
17406 $jamo_v = main::simple_dumper(\%Jamo_V, ' ' x 8);
17407 $jamo_t = main::simple_dumper(\%Jamo_T, ' ' x 8);
17414 # This module contains machine-generated tables and code for the
17415 # algorithmically-determinable Unicode character names. The following
17416 # routines can be used to translate between name and code point and vice versa
17420 # Matches legal code point. 4-6 hex numbers, If there are 6, the first
17421 # two must be 10; if there are 5, the first must not be a 0. Written this
17422 # way to decrease backtracking. The first regex allows the code point to
17423 # be at the end of a word, but to work properly, the word shouldn't end
17424 # with a valid hex character. The second one won't match a code point at
17425 # the end of a word, and doesn't have the run-on issue
17426 my \$run_on_code_point_re = qr/$run_on_code_point_re/;
17427 my \$code_point_re = qr/$code_point_re/;
17429 # In the following hash, the keys are the bases of names which include
17430 # the code point in the name, like CJK UNIFIED IDEOGRAPH-4E01. The value
17431 # of each key is another hash which is used to get the low and high ends
17432 # for each range of code points that apply to the name.
17433 my %names_ending_in_code_point = (
17437 # The following hash is a copy of the previous one, except is for loose
17438 # matching, so each name has blanks and dashes squeezed out
17439 my %loose_names_ending_in_code_point = (
17443 # And the following array gives the inverse mapping from code points to
17444 # names. Lowest code points are first
17445 \@code_points_ending_in_code_point = (
17446 $code_points_ending_in_code_point
17449 # Is exportable, make read-only
17450 Internals::SvREADONLY(\@code_points_ending_in_code_point, 1);
17452 # Earlier releases didn't have Jamos. No sense outputting
17453 # them unless will be used.
17454 if ($has_hangul_syllables) {
17457 # Convert from code point to Jamo short name for use in composing Hangul
17463 # Leading consonant (can be null)
17473 # Optional trailing consonant
17478 # Computed re that splits up a Hangul name into LVT or LV syllables
17479 my \$syllable_re = qr/$jamo_re/;
17481 my \$HANGUL_SYLLABLE = "HANGUL SYLLABLE ";
17482 my \$loose_HANGUL_SYLLABLE = "HANGULSYLLABLE";
17484 # These constants names and values were taken from the Unicode standard,
17485 # version 5.1, section 3.12. They are used in conjunction with Hangul
17487 my \$SBase = $SBase_string;
17488 my \$LBase = $LBase_string;
17489 my \$VBase = $VBase_string;
17490 my \$TBase = $TBase_string;
17491 my \$SCount = $SCount;
17492 my \$LCount = $LCount;
17493 my \$VCount = $VCount;
17494 my \$TCount = $TCount;
17495 my \$NCount = \$VCount * \$TCount;
17497 } # End of has Jamos
17499 push @name, << 'END';
17501 sub name_to_code_point_special {
17502 my ($name, $loose) = @_;
17504 # Returns undef if not one of the specially handled names; otherwise
17505 # returns the code point equivalent to the input name
17506 # $loose is non-zero if to use loose matching, 'name' in that case
17507 # must be input as upper case with all blanks and dashes squeezed out.
17509 if ($has_hangul_syllables) {
17510 push @name, << 'END';
17512 if ((! $loose && $name =~ s/$HANGUL_SYLLABLE//)
17513 || ($loose && $name =~ s/$loose_HANGUL_SYLLABLE//))
17515 return if $name !~ qr/^$syllable_re$/;
17516 my $L = $Jamo_L{$1};
17517 my $V = $Jamo_V{$2};
17518 my $T = (defined $3) ? $Jamo_T{$3} : 0;
17519 return ($L * $VCount + $V) * $TCount + $T + $SBase;
17523 push @name, << 'END';
17525 # Name must end in 'code_point' for this to handle.
17526 return if (($loose && $name !~ /^ (.*?) ($run_on_code_point_re) $/x)
17527 || (! $loose && $name !~ /^ (.*) ($code_point_re) $/x));
17530 my $code_point = CORE::hex $2;
17534 $names_ref = \%loose_names_ending_in_code_point;
17537 return if $base !~ s/-$//;
17538 $names_ref = \%names_ending_in_code_point;
17541 # Name must be one of the ones which has the code point in it.
17542 return if ! $names_ref->{$base};
17544 # Look through the list of ranges that apply to this name to see if
17545 # the code point is in one of them.
17546 for (my $i = 0; $i < scalar @{$names_ref->{$base}{'low'}}; $i++) {
17547 return if $names_ref->{$base}{'low'}->[$i] > $code_point;
17548 next if $names_ref->{$base}{'high'}->[$i] < $code_point;
17550 # Here, the code point is in the range.
17551 return $code_point;
17554 # Here, looked like the name had a code point number in it, but
17555 # did not match one of the valid ones.
17559 sub code_point_to_name_special {
17560 my $code_point = shift;
17562 # Returns the name of a code point if algorithmically determinable;
17565 if ($has_hangul_syllables) {
17566 push @name, << 'END';
17568 # If in the Hangul range, calculate the name based on Unicode's
17570 if ($code_point >= $SBase && $code_point <= $SBase + $SCount -1) {
17572 my $SIndex = $code_point - $SBase;
17573 my $L = $LBase + $SIndex / $NCount;
17574 my $V = $VBase + ($SIndex % $NCount) / $TCount;
17575 my $T = $TBase + $SIndex % $TCount;
17576 $name = "$HANGUL_SYLLABLE$Jamo{$L}$Jamo{$V}";
17577 $name .= $Jamo{$T} if $T != $TBase;
17582 push @name, << 'END';
17584 # Look through list of these code points for one in range.
17585 foreach my $hash (@code_points_ending_in_code_point) {
17586 return if $code_point < $hash->{'low'};
17587 if ($code_point <= $hash->{'high'}) {
17588 return sprintf("%s-%04X", $hash->{'name'}, $code_point);
17591 return; # None found
17598 main::write("Name.pm", 0, \@name); # The 0 means no utf8.
17603 # Create and write UCD.pl, which passes info about the tables to
17606 # Stringify structures for output
17607 my $loose_property_name_of
17608 = simple_dumper(\%loose_property_name_of, ' ' x 4);
17609 chomp $loose_property_name_of;
17611 my $strict_property_name_of
17612 = simple_dumper(\%strict_property_name_of, ' ' x 4);
17613 chomp $strict_property_name_of;
17615 my $stricter_to_file_of = simple_dumper(\%stricter_to_file_of, ' ' x 4);
17616 chomp $stricter_to_file_of;
17618 my $inline_definitions = simple_dumper(\@inline_definitions, " " x 4);
17619 chomp $inline_definitions;
17621 my $loose_to_file_of = simple_dumper(\%loose_to_file_of, ' ' x 4);
17622 chomp $loose_to_file_of;
17624 my $nv_floating_to_rational
17625 = simple_dumper(\%nv_floating_to_rational, ' ' x 4);
17626 chomp $nv_floating_to_rational;
17628 my $why_deprecated = simple_dumper(\%Unicode::UCD::why_deprecated, ' ' x 4);
17629 chomp $why_deprecated;
17631 # We set the key to the file when we associated files with tables, but we
17632 # couldn't do the same for the value then, as we might not have the file
17633 # for the alternate table figured out at that time.
17634 foreach my $cased (keys %caseless_equivalent_to) {
17635 my @path = $caseless_equivalent_to{$cased}->file_path;
17637 if ($path[0] eq "#") { # Pseudo-directory '#'
17638 $path = join '/', @path;
17640 else { # Gets rid of lib/
17641 $path = join '/', @path[1, -1];
17643 $caseless_equivalent_to{$cased} = $path;
17645 my $caseless_equivalent_to
17646 = simple_dumper(\%caseless_equivalent_to, ' ' x 4);
17647 chomp $caseless_equivalent_to;
17649 my $loose_property_to_file_of
17650 = simple_dumper(\%loose_property_to_file_of, ' ' x 4);
17651 chomp $loose_property_to_file_of;
17653 my $strict_property_to_file_of
17654 = simple_dumper(\%strict_property_to_file_of, ' ' x 4);
17655 chomp $strict_property_to_file_of;
17657 my $file_to_swash_name = simple_dumper(\%file_to_swash_name, ' ' x 4);
17658 chomp $file_to_swash_name;
17660 # Create a mapping from each alias of Perl single-form extensions to all
17661 # its equivalent aliases, for quick look-up.
17662 my %perlprop_to_aliases;
17663 foreach my $table ($perl->tables) {
17665 # First create the list of the aliases of each extension
17666 my @aliases_list; # List of legal aliases for this extension
17668 my $table_name = $table->name;
17669 my $standard_table_name = standardize($table_name);
17670 my $table_full_name = $table->full_name;
17671 my $standard_table_full_name = standardize($table_full_name);
17673 # Make sure that the list has both the short and full names
17674 push @aliases_list, $table_name, $table_full_name;
17676 my $found_ucd = 0; # ? Did we actually get an alias that should be
17677 # output for this table
17679 # Go through all the aliases (including the two just added), and add
17680 # any new unique ones to the list
17681 foreach my $alias ($table->aliases) {
17683 # Skip non-legal names
17684 next unless $alias->ok_as_filename;
17685 next unless $alias->ucd;
17687 $found_ucd = 1; # have at least one legal name
17689 my $name = $alias->name;
17690 my $standard = standardize($name);
17692 # Don't repeat a name that is equivalent to one already on the
17694 next if $standard eq $standard_table_name;
17695 next if $standard eq $standard_table_full_name;
17697 push @aliases_list, $name;
17700 # If there were no legal names, don't output anything.
17701 next unless $found_ucd;
17703 # To conserve memory in the program reading these in, omit full names
17704 # that are identical to the short name, when those are the only two
17705 # aliases for the property.
17706 if (@aliases_list == 2 && $aliases_list[0] eq $aliases_list[1]) {
17710 # Here, @aliases_list is the list of all the aliases that this
17711 # extension legally has. Now can create a map to it from each legal
17712 # standardized alias
17713 foreach my $alias ($table->aliases) {
17714 next unless $alias->ucd;
17715 next unless $alias->ok_as_filename;
17716 push @{$perlprop_to_aliases{standardize($alias->name)}},
17717 uniques @aliases_list;
17721 # Make a list of all combinations of properties/values that are suppressed.
17723 if (! $debug_skip) { # This tends to fail in this debug mode
17724 foreach my $property_name (keys %why_suppressed) {
17727 my $value_name = $1 if $property_name =~ s/ = ( .* ) //x;
17729 # The hash may contain properties not in this release of Unicode
17730 next unless defined (my $property = property_ref($property_name));
17732 # Find all combinations
17733 foreach my $prop_alias ($property->aliases) {
17734 my $prop_alias_name = standardize($prop_alias->name);
17736 # If no =value, there's just one combination possible for this
17737 if (! $value_name) {
17739 # The property may be suppressed, but there may be a proxy
17740 # for it, so it shouldn't be listed as suppressed
17741 next if $prop_alias->ucd;
17742 push @suppressed, $prop_alias_name;
17745 foreach my $value_alias
17746 ($property->table($value_name)->aliases)
17748 next if $value_alias->ucd;
17750 push @suppressed, "$prop_alias_name="
17751 . standardize($value_alias->name);
17757 @suppressed = sort @suppressed; # So doesn't change between runs of this
17760 # Convert the structure below (designed for Name.pm) to a form that UCD
17761 # wants, so it doesn't have to modify it at all; i.e. so that it includes
17762 # an element for the Hangul syllables in the appropriate place, and
17763 # otherwise changes the name to include the "-<code point>" suffix.
17764 my @algorithm_names;
17765 my $done_hangul = $v_version lt v2.0.0; # Hanguls as we know them came
17766 # along in this version
17767 # Copy it linearly.
17768 for my $i (0 .. @code_points_ending_in_code_point - 1) {
17770 # Insert the hanguls in the correct place.
17772 && $code_points_ending_in_code_point[$i]->{'low'} > $SBase)
17775 push @algorithm_names, { low => $SBase,
17776 high => $SBase + $SCount - 1,
17777 name => '<hangul syllable>',
17781 # Copy the current entry, modified.
17782 push @algorithm_names, {
17783 low => $code_points_ending_in_code_point[$i]->{'low'},
17784 high => $code_points_ending_in_code_point[$i]->{'high'},
17786 "$code_points_ending_in_code_point[$i]->{'name'}-<code point>",
17790 # Serialize these structures for output.
17791 my $loose_to_standard_value
17792 = simple_dumper(\%loose_to_standard_value, ' ' x 4);
17793 chomp $loose_to_standard_value;
17795 my $string_property_loose_to_name
17796 = simple_dumper(\%string_property_loose_to_name, ' ' x 4);
17797 chomp $string_property_loose_to_name;
17799 my $perlprop_to_aliases = simple_dumper(\%perlprop_to_aliases, ' ' x 4);
17800 chomp $perlprop_to_aliases;
17802 my $prop_aliases = simple_dumper(\%prop_aliases, ' ' x 4);
17803 chomp $prop_aliases;
17805 my $prop_value_aliases = simple_dumper(\%prop_value_aliases, ' ' x 4);
17806 chomp $prop_value_aliases;
17808 my $suppressed = (@suppressed) ? simple_dumper(\@suppressed, ' ' x 4) : "";
17811 my $algorithm_names = simple_dumper(\@algorithm_names, ' ' x 4);
17812 chomp $algorithm_names;
17814 my $ambiguous_names = simple_dumper(\%ambiguous_names, ' ' x 4);
17815 chomp $ambiguous_names;
17817 my $combination_property = simple_dumper(\%combination_property, ' ' x 4);
17818 chomp $combination_property;
17820 my $loose_defaults = simple_dumper(\%loose_defaults, ' ' x 4);
17821 chomp $loose_defaults;
17825 $INTERNAL_ONLY_HEADER
17827 # This file is for the use of Unicode::UCD
17829 # Highest legal Unicode code point
17830 \$Unicode::UCD::MAX_UNICODE_CODEPOINT = 0x$MAX_UNICODE_CODEPOINT_STRING;
17833 \$Unicode::UCD::HANGUL_BEGIN = $SBase_string;
17834 \$Unicode::UCD::HANGUL_COUNT = $SCount;
17836 # Maps Unicode (not Perl single-form extensions) property names in loose
17837 # standard form to their corresponding standard names
17838 \%Unicode::UCD::loose_property_name_of = (
17839 $loose_property_name_of
17842 # Same, but strict names
17843 \%Unicode::UCD::strict_property_name_of = (
17844 $strict_property_name_of
17847 # Gives the definitions (in the form of inversion lists) for those properties
17848 # whose definitions aren't kept in files
17849 \@Unicode::UCD::inline_definitions = (
17850 $inline_definitions
17853 # Maps property, table to file for those using stricter matching. For paths
17854 # whose directory is '#', the file is in the form of a numeric index into
17855 # \@inline_definitions
17856 \%Unicode::UCD::stricter_to_file_of = (
17857 $stricter_to_file_of
17860 # Maps property, table to file for those using loose matching. For paths
17861 # whose directory is '#', the file is in the form of a numeric index into
17862 # \@inline_definitions
17863 \%Unicode::UCD::loose_to_file_of = (
17867 # Maps floating point to fractional form
17868 \%Unicode::UCD::nv_floating_to_rational = (
17869 $nv_floating_to_rational
17872 # If a %e floating point number doesn't have this number of digits in it after
17873 # the decimal point to get this close to a fraction, it isn't considered to be
17874 # that fraction even if all the digits it does have match.
17875 \$Unicode::UCD::e_precision = $E_FLOAT_PRECISION;
17877 # Deprecated tables to generate a warning for. The key is the file containing
17878 # the table, so as to avoid duplication, as many property names can map to the
17879 # file, but we only need one entry for all of them.
17880 \%Unicode::UCD::why_deprecated = (
17884 # A few properties have different behavior under /i matching. This maps
17885 # those to substitute files to use under /i.
17886 \%Unicode::UCD::caseless_equivalent = (
17887 $caseless_equivalent_to
17890 # Property names to mapping files
17891 \%Unicode::UCD::loose_property_to_file_of = (
17892 $loose_property_to_file_of
17895 # Property names to mapping files
17896 \%Unicode::UCD::strict_property_to_file_of = (
17897 $strict_property_to_file_of
17900 # Files to the swash names within them.
17901 \%Unicode::UCD::file_to_swash_name = (
17902 $file_to_swash_name
17905 # Keys are all the possible "prop=value" combinations, in loose form; values
17906 # are the standard loose name for the 'value' part of the key
17907 \%Unicode::UCD::loose_to_standard_value = (
17908 $loose_to_standard_value
17911 # String property loose names to standard loose name
17912 \%Unicode::UCD::string_property_loose_to_name = (
17913 $string_property_loose_to_name
17916 # Keys are Perl extensions in loose form; values are each one's list of
17918 \%Unicode::UCD::loose_perlprop_to_name = (
17919 $perlprop_to_aliases
17922 # Keys are standard property name; values are each one's aliases
17923 \%Unicode::UCD::prop_aliases = (
17927 # Keys of top level are standard property name; values are keys to another
17928 # hash, Each one is one of the property's values, in standard form. The
17929 # values are that prop-val's aliases. If only one specified, the short and
17930 # long alias are identical.
17931 \%Unicode::UCD::prop_value_aliases = (
17932 $prop_value_aliases
17935 # Ordered (by code point ordinal) list of the ranges of code points whose
17936 # names are algorithmically determined. Each range entry is an anonymous hash
17937 # of the start and end points and a template for the names within it.
17938 \@Unicode::UCD::algorithmic_named_code_points = (
17942 # The properties that as-is have two meanings, and which must be disambiguated
17943 \%Unicode::UCD::ambiguous_names = (
17947 # Keys are the prop-val combinations which are the default values for the
17948 # given property, expressed in standard loose form
17949 \%Unicode::UCD::loose_defaults = (
17953 # The properties that are combinations, in that they have both a map table and
17954 # a match table. This is actually for UCD.t, so it knows how to test for
17956 \%Unicode::UCD::combination_property = (
17957 $combination_property
17960 # All combinations of names that are suppressed.
17961 # This is actually for UCD.t, so it knows which properties shouldn't have
17962 # entries. If it got any bigger, would probably want to put it in its own
17963 # file to use memory only when it was needed, in testing.
17964 \@Unicode::UCD::suppressed_properties = (
17971 main::write("UCD.pl", 0, \@ucd); # The 0 means no utf8.
17975 sub write_all_tables() {
17976 # Write out all the tables generated by this program to files, as well as
17977 # the supporting data structures, pod file, and .t file.
17979 my @writables; # List of tables that actually get written
17980 my %match_tables_to_write; # Used to collapse identical match tables
17981 # into one file. Each key is a hash function
17982 # result to partition tables into buckets.
17983 # Each value is an array of the tables that
17984 # fit in the bucket.
17986 # For each property ...
17987 # (sort so that if there is an immutable file name, it has precedence, so
17988 # some other property can't come in and take over its file name. (We
17989 # don't care if both defined, as they had better be different anyway.)
17990 # The property named 'Perl' needs to be first (it doesn't have any
17991 # immutable file name) because empty properties are defined in terms of
17992 # its table named 'All' under the -annotate option.) We also sort by
17993 # the property's name. This is just for repeatability of the outputs
17994 # between runs of this program, but does not affect correctness.
17996 foreach my $property ($perl,
17997 sort { return -1 if defined $a->file;
17998 return 1 if defined $b->file;
17999 return $a->name cmp $b->name;
18000 } grep { $_ != $perl } property_ref('*'))
18002 my $type = $property->type;
18004 # And for each table for that property, starting with the mapping
18007 foreach my $table($property,
18009 # and all the match tables for it (if any), sorted so
18010 # the ones with the shortest associated file name come
18011 # first. The length sorting prevents problems of a
18012 # longer file taking a name that might have to be used
18013 # by a shorter one. The alphabetic sorting prevents
18014 # differences between releases
18015 sort { my $ext_a = $a->external_name;
18016 return 1 if ! defined $ext_a;
18017 my $ext_b = $b->external_name;
18018 return -1 if ! defined $ext_b;
18020 # But return the non-complement table before
18021 # the complement one, as the latter is defined
18022 # in terms of the former, and needs to have
18023 # the information for the former available.
18024 return 1 if $a->complement != 0;
18025 return -1 if $b->complement != 0;
18027 # Similarly, return a subservient table after
18029 return 1 if $a->leader != $a;
18030 return -1 if $b->leader != $b;
18032 my $cmp = length $ext_a <=> length $ext_b;
18034 # Return result if lengths not equal
18035 return $cmp if $cmp;
18037 # Alphabetic if lengths equal
18038 return $ext_a cmp $ext_b
18039 } $property->tables
18043 # Here we have a table associated with a property. It could be
18044 # the map table (done first for each property), or one of the
18045 # other tables. Determine which type.
18046 my $is_property = $table->isa('Property');
18048 my $name = $table->name;
18049 my $complete_name = $table->complete_name;
18051 # See if should suppress the table if is empty, but warn if it
18052 # contains something.
18053 my $suppress_if_empty_warn_if_not
18054 = $why_suppress_if_empty_warn_if_not{$complete_name} || 0;
18056 # Calculate if this table should have any code points associated
18058 my $expected_empty =
18060 # $perl should be empty
18061 ($is_property && ($table == $perl))
18063 # Match tables in properties we skipped populating should be
18065 || (! $is_property && ! $property->to_create_match_tables)
18067 # Tables and properties that are expected to have no code
18068 # points should be empty
18069 || $suppress_if_empty_warn_if_not
18072 # Set a boolean if this table is the complement of an empty binary
18074 my $is_complement_of_empty_binary =
18075 $type == $BINARY &&
18076 (($table == $property->table('Y')
18077 && $property->table('N')->is_empty)
18078 || ($table == $property->table('N')
18079 && $property->table('Y')->is_empty));
18081 if ($table->is_empty) {
18083 if ($suppress_if_empty_warn_if_not) {
18084 $table->set_fate($SUPPRESSED,
18085 $suppress_if_empty_warn_if_not);
18088 # Suppress (by skipping them) expected empty tables.
18089 next TABLE if $expected_empty;
18091 # And setup to later output a warning for those that aren't
18092 # known to be allowed to be empty. Don't do the warning if
18093 # this table is a child of another one to avoid duplicating
18094 # the warning that should come from the parent one.
18095 if (($table == $property || $table->parent == $table)
18096 && $table->fate != $SUPPRESSED
18097 && $table->fate != $MAP_PROXIED
18098 && ! grep { $complete_name =~ /^$_$/ }
18099 @tables_that_may_be_empty)
18101 push @unhandled_properties, "$table";
18104 # The old way of expressing an empty match list was to
18105 # complement the list that matches everything. The new way is
18106 # to create an empty inversion list, but this doesn't work for
18107 # annotating, so use the old way then.
18108 $table->set_complement($All) if $annotate
18109 && $table != $property;
18111 elsif ($expected_empty) {
18113 if ($suppress_if_empty_warn_if_not) {
18114 $because = " because $suppress_if_empty_warn_if_not";
18117 Carp::my_carp("Not expecting property $table$because. Generating file for it anyway.");
18120 # Some tables should match everything
18121 my $expected_full =
18122 ($table->fate == $SUPPRESSED)
18125 ? # All these types of map tables will be full because
18126 # they will have been populated with defaults
18129 : # A match table should match everything if its method
18131 ($table->matches_all
18133 # The complement of an empty binary table will match
18135 || $is_complement_of_empty_binary
18139 my $count = $table->count;
18140 if ($expected_full) {
18141 if ($count != $MAX_WORKING_CODEPOINTS) {
18142 Carp::my_carp("$table matches only "
18143 . clarify_number($count)
18144 . " Unicode code points but should match "
18145 . clarify_number($MAX_WORKING_CODEPOINTS)
18147 . clarify_number(abs($MAX_WORKING_CODEPOINTS - $count))
18148 . "). Proceeding anyway.");
18151 # Here is expected to be full. If it is because it is the
18152 # complement of an (empty) binary table that is to be
18153 # suppressed, then suppress this one as well.
18154 if ($is_complement_of_empty_binary) {
18155 my $opposing_name = ($name eq 'Y') ? 'N' : 'Y';
18156 my $opposing = $property->table($opposing_name);
18157 my $opposing_status = $opposing->status;
18158 if ($opposing_status) {
18159 $table->set_status($opposing_status,
18160 $opposing->status_info);
18164 elsif ($count == $MAX_UNICODE_CODEPOINTS
18166 && ($table == $property || $table->leader == $table)
18167 && $table->property->status ne $NORMAL)
18169 Carp::my_carp("$table unexpectedly matches all Unicode code points. Proceeding anyway.");
18172 if ($table->fate >= $SUPPRESSED) {
18173 if (! $is_property) {
18174 my @children = $table->children;
18175 foreach my $child (@children) {
18176 if ($child->fate < $SUPPRESSED) {
18177 Carp::my_carp_bug("'$table' is suppressed and has a child '$child' which isn't");
18185 if (! $is_property) {
18187 make_ucd_table_pod_entries($table) if $table->property == $perl;
18189 # Several things need to be done just once for each related
18190 # group of match tables. Do them on the parent.
18191 if ($table->parent == $table) {
18193 # Add an entry in the pod file for the table; it also does
18195 make_re_pod_entries($table) if defined $pod_directory;
18197 # See if the table matches identical code points with
18198 # something that has already been processed and is ready
18199 # for output. In that case, no need to have two files
18200 # with the same code points in them. We use the table's
18201 # hash() method to store these in buckets, so that it is
18202 # quite likely that if two tables are in the same bucket
18203 # they will be identical, so don't have to compare tables
18204 # frequently. The tables have to have the same status to
18205 # share a file, so add this to the bucket hash. (The
18206 # reason for this latter is that UCD.pm associates a
18207 # status with a file.) We don't check tables that are
18208 # inverses of others, as it would lead to some coding
18209 # complications, and checking all the regular ones should
18211 if ($table->complement == 0) {
18212 my $hash = $table->hash . ';' . $table->status;
18214 # Look at each table that is in the same bucket as
18215 # this one would be.
18216 foreach my $comparison
18217 (@{$match_tables_to_write{$hash}})
18219 # If the table doesn't point back to this one, we
18220 # see if it matches identically
18221 if ( $comparison->leader != $table
18222 && $table->matches_identically_to($comparison))
18224 $table->set_equivalent_to($comparison,
18230 # Here, not equivalent, add this table to the bucket.
18231 push @{$match_tables_to_write{$hash}}, $table;
18237 # Here is the property itself.
18238 # Don't write out or make references to the $perl property
18239 next if $table == $perl;
18241 make_ucd_table_pod_entries($table);
18243 # There is a mapping stored of the various synonyms to the
18244 # standardized name of the property for Unicode::UCD.
18245 # Also, the pod file contains entries of the form:
18246 # \p{alias: *} \p{full: *}
18247 # rather than show every possible combination of things.
18249 my @property_aliases = $property->aliases;
18251 my $full_property_name = $property->full_name;
18252 my $property_name = $property->name;
18253 my $standard_property_name = standardize($property_name);
18254 my $standard_property_full_name
18255 = standardize($full_property_name);
18257 # We also create for Unicode::UCD a list of aliases for
18258 # the property. The list starts with the property name;
18259 # then its full name.
18262 if ( $property->fate <= $MAP_PROXIED) {
18263 @property_list = ($property_name, $full_property_name);
18264 @standard_list = ($standard_property_name,
18265 $standard_property_full_name);
18268 # For each synonym ...
18269 for my $i (0 .. @property_aliases - 1) {
18270 my $alias = $property_aliases[$i];
18271 my $alias_name = $alias->name;
18272 my $alias_standard = standardize($alias_name);
18275 # Add other aliases to the list of property aliases
18276 if ($property->fate <= $MAP_PROXIED
18277 && ! grep { $alias_standard eq $_ } @standard_list)
18279 push @property_list, $alias_name;
18280 push @standard_list, $alias_standard;
18283 # For Unicode::UCD, set the mapping of the alias to the
18285 if ($type == $STRING) {
18286 if ($property->fate <= $MAP_PROXIED) {
18287 $string_property_loose_to_name{$alias_standard}
18288 = $standard_property_name;
18292 my $hash_ref = ($alias_standard =~ /^_/)
18293 ? \%strict_property_name_of
18294 : \%loose_property_name_of;
18295 if (exists $hash_ref->{$alias_standard}) {
18296 Carp::my_carp("There already is a property with the same standard name as $alias_name: $hash_ref->{$alias_standard}. Old name is retained");
18299 $hash_ref->{$alias_standard}
18300 = $standard_property_name;
18303 # Now for the re pod entry for this alias. Skip if not
18304 # outputting a pod; skip the first one, which is the
18305 # full name so won't have an entry like: '\p{full: *}
18306 # \p{full: *}', and skip if don't want an entry for
18309 || ! defined $pod_directory
18310 || ! $alias->make_re_pod_entry;
18312 my $rhs = "\\p{$full_property_name: *}";
18313 if ($property != $perl && $table->perl_extension) {
18314 $rhs .= ' (Perl extension)';
18316 push @match_properties,
18317 format_pod_line($indent_info_column,
18318 '\p{' . $alias->name . ': *}',
18324 # The list of all possible names is attached to each alias, so
18326 if (@property_list) {
18327 push @{$prop_aliases{$standard_list[0]}}, @property_list;
18330 if ($property->fate <= $MAP_PROXIED) {
18332 # Similarly, we create for Unicode::UCD a list of
18333 # property-value aliases.
18335 # Look at each table in the property...
18336 foreach my $table ($property->tables) {
18338 my $table_full_name = $table->full_name;
18339 my $standard_table_full_name
18340 = standardize($table_full_name);
18341 my $table_name = $table->name;
18342 my $standard_table_name = standardize($table_name);
18344 # The list starts with the table name and its full
18346 push @values_list, $table_name, $table_full_name;
18348 # We add to the table each unique alias that isn't
18349 # discouraged from use.
18350 foreach my $alias ($table->aliases) {
18351 next if $alias->status
18352 && $alias->status eq $DISCOURAGED;
18353 my $name = $alias->name;
18354 my $standard = standardize($name);
18355 next if $standard eq $standard_table_name;
18356 next if $standard eq $standard_table_full_name;
18357 push @values_list, $name;
18360 # Here @values_list is a list of all the aliases for
18361 # the table. That is, all the property-values given
18362 # by this table. By agreement with Unicode::UCD,
18363 # if the name and full name are identical, and there
18364 # are no other names, drop the duplicate entry to save
18366 if (@values_list == 2
18367 && $values_list[0] eq $values_list[1])
18372 # To save memory, unlike the similar list for property
18373 # aliases above, only the standard forms have the list.
18374 # This forces an extra step of converting from input
18375 # name to standard name, but the savings are
18376 # considerable. (There is only marginal savings if we
18377 # did this with the property aliases.)
18378 push @{$prop_value_aliases{$standard_property_name}{$standard_table_name}}, @values_list;
18382 # Don't write out a mapping file if not desired.
18383 next if ! $property->to_output_map;
18386 # Here, we know we want to write out the table, but don't do it
18387 # yet because there may be other tables that come along and will
18388 # want to share the file, and the file's comments will change to
18389 # mention them. So save for later.
18390 push @writables, $table;
18392 } # End of looping through the property and all its tables.
18393 } # End of looping through all properties.
18395 # Now have all the tables that will have files written for them. Do it.
18396 foreach my $table (@writables) {
18399 my $property = $table->property;
18400 my $is_property = ($table == $property);
18402 # For very short tables, instead of writing them out to actual files,
18403 # we in-line their inversion list definitions into UCD.pm. The
18404 # definition replaces the file name, and the special pseudo-directory
18405 # '#' is used to signal this. This significantly cuts down the number
18406 # of files written at little extra cost to the hashes in UCD.pm.
18407 # And it means, no run-time files to read to get the definitions.
18409 && ! $annotate # For annotation, we want to explicitly show
18410 # everything, so keep in files
18411 && $table->ranges <= 3)
18413 my @ranges = $table->ranges;
18414 my $count = @ranges;
18415 if ($count == 0) { # 0th index reserved for 0-length lists
18418 elsif ($table->leader != $table) {
18420 # Here, is a table that is equivalent to another; code
18421 # in register_file_for_name() causes its leader's definition
18426 else { # No equivalent table so far.
18428 # Build up its definition range-by-range.
18429 my $definition = "";
18430 while (defined (my $range = shift @ranges)) {
18431 my $end = $range->end;
18432 if ($end < $MAX_WORKING_CODEPOINT) {
18434 $end = "\n" . ($end + 1);
18436 else { # Extends to infinity, hence no 'end'
18439 $definition .= "\n" . $range->start . $end;
18441 $definition = "V$count" . $definition;
18442 $filename = @inline_definitions;
18443 push @inline_definitions, $definition;
18446 register_file_for_name($table, \@directory, $filename);
18450 if (! $is_property) {
18451 # Match tables for the property go in lib/$subdirectory, which is
18452 # the property's name. Don't use the standard file name for this,
18453 # as may get an unfamiliar alias
18454 @directory = ($matches_directory, ($property->match_subdir)
18455 ? $property->match_subdir
18456 : $property->external_name);
18460 @directory = $table->directory;
18461 $filename = $table->file;
18464 # Use specified filename if available, or default to property's
18465 # shortest name. We need an 8.3 safe filename (which means "an 8
18466 # safe" filename, since after the dot is only 'pl', which is < 3)
18467 # The 2nd parameter is if the filename shouldn't be changed, and
18468 # it shouldn't iff there is a hard-coded name for this table.
18469 $filename = construct_filename(
18470 $filename || $table->external_name,
18471 ! $filename, # mutable if no filename
18474 register_file_for_name($table, \@directory, $filename);
18476 # Only need to write one file when shared by more than one
18478 next if ! $is_property
18479 && ($table->leader != $table || $table->complement != 0);
18481 # Construct a nice comment to add to the file
18482 $table->set_final_comment;
18488 # Write out the pod file
18491 # And Name.pm, UCD.pl
18495 make_property_test_script() if $make_test_script;
18496 make_normalization_test_script() if $make_norm_test_script;
18500 my @white_space_separators = ( # This used only for making the test script.
18507 sub generate_separator($lhs) {
18508 # This used only for making the test script. It generates the colon or
18509 # equal separator between the property and property value, with random
18510 # white space surrounding the separator
18512 return "" if $lhs eq ""; # No separator if there's only one (the r) side
18514 # Choose space before and after randomly
18515 my $spaces_before =$white_space_separators[rand(@white_space_separators)];
18516 my $spaces_after = $white_space_separators[rand(@white_space_separators)];
18518 # And return the whole complex, half the time using a colon, half the
18520 return $spaces_before
18521 . (rand() < 0.5) ? '=' : ':'
18525 sub generate_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18526 # This used only for making the test script. It generates test cases that
18527 # are expected to compile successfully in perl. Note that the LHS and
18528 # RHS are assumed to already be as randomized as the caller wants.
18530 # $lhs # The property: what's to the left of the colon
18531 # or equals separator
18532 # $rhs # The property value; what's to the right
18533 # $valid_code # A code point that's known to be in the
18534 # table given by LHS=RHS; undef if table is
18536 # $invalid_code # A code point known to not be in the table;
18537 # undef if the table is all code points
18540 # Get the colon or equal
18541 my $separator = generate_separator($lhs);
18543 # The whole 'property=value'
18544 my $name = "$lhs$separator$rhs";
18547 # Create a complete set of tests, with complements.
18548 if (defined $valid_code) {
18549 push @output, <<"EOC"
18550 Expect(1, $valid_code, '\\p{$name}', $warning);
18551 Expect(0, $valid_code, '\\p{^$name}', $warning);
18552 Expect(0, $valid_code, '\\P{$name}', $warning);
18553 Expect(1, $valid_code, '\\P{^$name}', $warning);
18556 if (defined $invalid_code) {
18557 push @output, <<"EOC"
18558 Expect(0, $invalid_code, '\\p{$name}', $warning);
18559 Expect(1, $invalid_code, '\\p{^$name}', $warning);
18560 Expect(1, $invalid_code, '\\P{$name}', $warning);
18561 Expect(0, $invalid_code, '\\P{^$name}', $warning);
18567 sub generate_wildcard_tests($lhs, $rhs, $valid_code, $invalid_code, $warning) {
18568 # This used only for making the test script. It generates wildcardl
18569 # matching test cases that are expected to compile successfully in perl.
18571 # $lhs # The property: what's to the left of the
18572 # or equals separator
18573 # $rhs # The property value; what's to the right
18574 # $valid_code # A code point that's known to be in the
18575 # table given by LHS=RHS; undef if table is
18577 # $invalid_code # A code point known to not be in the table;
18578 # undef if the table is all code points
18581 return if $lhs eq "";
18582 return if $lhs =~ / ^ Is_ /x; # These are not currently supported
18584 # Generate a standardized pattern, with colon being the delimitter
18585 my $wildcard = "$lhs=:\\A$rhs\\z:";
18588 push @output, "Expect(1, $valid_code, '\\p{$wildcard}', $warning);"
18589 if defined $valid_code;
18590 push @output, "Expect(0, $invalid_code, '\\p{$wildcard}', $warning);"
18591 if defined $invalid_code;
18595 sub generate_error($lhs, $rhs, $already_in_error=0) {
18596 # This used only for making the test script. It generates test cases that
18597 # are expected to not only not match, but to be syntax or similar errors
18599 # $lhs # The property: what's to the left of the
18600 # colon or equals separator
18601 # $rhs # The property value; what's to the right
18602 # $already_in_error # Boolean; if true it's known that the
18603 # unmodified LHS and RHS will cause an error.
18604 # This routine should not force another one
18605 # Get the colon or equal
18606 my $separator = generate_separator($lhs);
18608 # Since this is an error only, don't bother to randomly decide whether to
18609 # put the error on the left or right side; and assume that the RHS is
18610 # loosely matched, again for convenience rather than rigor.
18611 $rhs = randomize_loose_name($rhs, 'ERROR') unless $already_in_error;
18613 my $property = $lhs . $separator . $rhs;
18616 Error('\\p{$property}');
18617 Error('\\P{$property}');
18621 # These are used only for making the test script
18622 # XXX Maybe should also have a bad strict seps, which includes underscore.
18624 my @good_loose_seps = (
18631 my @bad_loose_seps = (
18636 sub randomize_stricter_name($name) {
18637 # This used only for making the test script. Take the input name and
18638 # return a randomized, but valid version of it under the stricter matching
18641 # If the name looks like a number (integer, floating, or rational), do
18643 if ($name =~ qr{ ^ ( -? ) (\d+ ( ( [./] ) \d+ )? ) $ }x) {
18646 my $separator = $3;
18648 # If there isn't a sign, part of the time add a plus
18649 # Note: Not testing having any denominator having a minus sign
18651 $sign = '+' if rand() <= .3;
18654 # And add 0 or more leading zeros.
18655 $name = $sign . ('0' x int rand(10)) . $number;
18657 if (defined $separator) {
18658 my $extra_zeros = '0' x int rand(10);
18660 if ($separator eq '.') {
18662 # Similarly, add 0 or more trailing zeros after a decimal
18664 $name .= $extra_zeros;
18668 # Or, leading zeros before the denominator
18669 $name =~ s,/,/$extra_zeros,;
18674 # For legibility of the test, only change the case of whole sections at a
18675 # time. To do this, first split into sections. The split returns the
18678 for my $section (split / ( [ - + \s _ . ]+ ) /x, $name) {
18679 trace $section if main::DEBUG && $to_trace;
18681 if (length $section > 1 && $section !~ /\D/) {
18683 # If the section is a sequence of digits, about half the time
18684 # randomly add underscores between some of them.
18687 # Figure out how many underscores to add. max is 1 less than
18688 # the number of digits. (But add 1 at the end to make sure
18689 # result isn't 0, and compensate earlier by subtracting 2
18691 my $num_underscores = int rand(length($section) - 2) + 1;
18693 # And add them evenly throughout, for convenience, not rigor
18695 my $spacing = (length($section) - 1)/ $num_underscores;
18696 my $temp = $section;
18698 for my $i (1 .. $num_underscores) {
18699 $section .= substr($temp, 0, $spacing, "") . '_';
18703 push @sections, $section;
18707 # Here not a sequence of digits. Change the case of the section
18709 my $switch = int rand(4);
18710 if ($switch == 0) {
18711 push @sections, uc $section;
18713 elsif ($switch == 1) {
18714 push @sections, lc $section;
18716 elsif ($switch == 2) {
18717 push @sections, ucfirst $section;
18720 push @sections, $section;
18724 trace "returning", join "", @sections if main::DEBUG && $to_trace;
18725 return join "", @sections;
18728 sub randomize_loose_name($name, $want_error=0) {
18729 # This used only for making the test script
18731 $name = randomize_stricter_name($name);
18734 push @parts, $good_loose_seps[rand(@good_loose_seps)];
18736 # Preserve trailing ones for the sake of not stripping the underscore from
18738 for my $part (split /[-\s_]+ (?= . )/, $name) {
18740 if ($want_error and rand() < 0.3) {
18741 push @parts, $bad_loose_seps[rand(@bad_loose_seps)];
18745 push @parts, $good_loose_seps[rand(@good_loose_seps)];
18748 push @parts, $part;
18750 my $new = join("", @parts);
18751 trace "$name => $new" if main::DEBUG && $to_trace;
18754 if (rand() >= 0.5) {
18755 $new .= $bad_loose_seps[rand(@bad_loose_seps)];
18758 $new = $bad_loose_seps[rand(@bad_loose_seps)] . $new;
18764 # Used to make sure don't generate duplicate test cases.
18765 my %test_generated;
18767 sub make_property_test_script() {
18768 # This used only for making the test script
18769 # this written directly -- it's huge.
18771 print "Making test script\n" if $verbosity >= $PROGRESS;
18773 # This uses randomness to test different possibilities without testing all
18774 # possibilities. To ensure repeatability, set the seed to 0. But if
18775 # tests are added, it will perturb all later ones in the .t file
18778 $t_path = 'TestProp.pl' unless defined $t_path; # the traditional name
18780 # Create a list of what the %f representation is for each rational number.
18781 # This will be used below.
18782 my @valid_base_floats = '0.0';
18783 foreach my $e_representation (keys %nv_floating_to_rational) {
18784 push @valid_base_floats,
18785 eval $nv_floating_to_rational{$e_representation};
18788 # It doesn't matter whether the elements of this array contain single lines
18789 # or multiple lines. main::write doesn't count the lines.
18792 push @output, <<'EOF_CODE';
18793 Error('\p{Script=InGreek}'); # Bug #69018
18794 Test_GCB("1100 $nobreak 1161"); # Bug #70940
18795 Expect(0, 0x2028, '\p{Print}', ""); # Bug # 71722
18796 Expect(0, 0x2029, '\p{Print}', ""); # Bug # 71722
18797 Expect(1, 0xFF10, '\p{XDigit}', ""); # Bug # 71726
18798 Error('\p{InKana}'); # 'Kana' is not a block so InKana shouldn't compile
18799 Expect(1, 0xB6, '\p{In=V1_1}', ""); # Didn't use to work
18800 Expect(1, 0x3A2,'\p{In=NA}', ""); # Didn't use to work
18802 # Make sure this gets tested; it was not part of the official test suite at
18803 # the time this was added. Note that this is as it would appear in the
18804 # official suite, and gets modified to check for the perl tailoring by
18806 Test_WB("$breakable 0020 $breakable 0020 $breakable 0308 $breakable");
18807 Test_LB("$nobreak 200B $nobreak 0020 $nobreak 0020 $breakable 2060 $breakable");
18808 Expect(1, ord(" "), '\p{gc=:(?aa)s:}', ""); # /aa is valid
18809 Expect(1, ord(" "), '\p{gc=:(?-s)s:}', ""); # /-s is valid
18812 # Sort these so get results in same order on different runs of this
18814 foreach my $property (sort { $a->has_dependency <=> $b->has_dependency
18816 lc $a->name cmp lc $b->name
18817 } property_ref('*'))
18819 # Non-binary properties should not match \p{}; Test all for that.
18820 if ($property->type != $BINARY && $property->type != $FORCED_BINARY) {
18821 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS }
18822 $property->aliases;
18823 foreach my $property_alias ($property->aliases) {
18824 my $name = standardize($property_alias->name);
18826 # But some names are ambiguous, meaning a binary property with
18827 # the same name when used in \p{}, and a different
18828 # (non-binary) property in other contexts.
18829 next if grep { $name eq $_ } keys %ambiguous_names;
18831 push @output, <<"EOF_CODE";
18832 Error('\\p{$name}');
18833 Error('\\P{$name}');
18837 foreach my $table (sort { $a->has_dependency <=> $b->has_dependency
18839 lc $a->name cmp lc $b->name
18840 } $property->tables)
18843 # Find code points that match, and don't match this table.
18844 my $valid = $table->get_valid_code_point;
18845 my $invalid = $table->get_invalid_code_point;
18846 my $warning = ($table->status eq $DEPRECATED)
18850 # Test each possible combination of the property's aliases with
18851 # the table's. If this gets to be too many, could do what is done
18852 # in the set_final_comment() for Tables
18853 my @table_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->aliases;
18854 next unless @table_aliases;
18855 my @property_aliases = grep { $_->status ne $INTERNAL_ALIAS } $table->property->aliases;
18856 next unless @property_aliases;
18858 # Every property can be optionally be prefixed by 'Is_', so test
18859 # that those work, by creating such a new alias for each
18860 # pre-existing one.
18861 push @property_aliases, map { Alias->new("Is_" . $_->name,
18863 $_->make_re_pod_entry,
18864 $_->ok_as_filename,
18868 } @property_aliases;
18869 my $max = max(scalar @table_aliases, scalar @property_aliases);
18870 for my $j (0 .. $max - 1) {
18872 # The current alias for property is the next one on the list,
18873 # or if beyond the end, start over. Similarly for table
18875 = $property_aliases[$j % @property_aliases]->name;
18877 $property_name = "" if $table->property == $perl;
18878 my $table_alias = $table_aliases[$j % @table_aliases];
18879 my $table_name = $table_alias->name;
18880 my $loose_match = $table_alias->loose_match;
18882 # If the table doesn't have a file, any test for it is
18883 # already guaranteed to be in error
18884 my $already_error = ! $table->file_path;
18886 # A table that begins with these could actually be a
18887 # user-defined property, so won't be compile time errors, as
18888 # the definitions of those can be deferred until runtime
18889 next if $already_error && $table_name =~ / ^ I[ns] /x;
18891 # Generate error cases for this alias.
18892 push @output, generate_error($property_name,
18896 # If the table is guaranteed to always generate an error,
18897 # quit now without generating success cases.
18898 next if $already_error;
18900 # Now for the success cases. First, wildcard matching, as it
18901 # shouldn't have any randomization.
18902 if ($table_alias->status eq $NORMAL) {
18903 push @output, generate_wildcard_tests($property_name,
18911 if ($loose_match) {
18913 # For loose matching, create an extra test case for the
18915 my $standard = standardize($table_name);
18917 # $test_name should be a unique combination for each test
18918 # case; used just to avoid duplicate tests
18919 my $test_name = "$property_name=$standard";
18921 # Don't output duplicate test cases.
18922 if (! exists $test_generated{$test_name}) {
18923 $test_generated{$test_name} = 1;
18924 push @output, generate_tests($property_name,
18930 if ($table_alias->status eq $NORMAL) {
18931 push @output, generate_wildcard_tests(
18940 $random = randomize_loose_name($table_name)
18942 else { # Stricter match
18943 $random = randomize_stricter_name($table_name);
18946 # Now for the main test case for this alias.
18947 my $test_name = "$property_name=$random";
18948 if (! exists $test_generated{$test_name}) {
18949 $test_generated{$test_name} = 1;
18950 push @output, generate_tests($property_name,
18957 if ($property->name eq 'nv') {
18958 if ($table_name !~ qr{/}) {
18959 push @output, generate_tests($property_name,
18960 sprintf("%.15e", $table_name),
18967 # If the name is a rational number, add tests for a
18968 # non-reduced form, and for a floating point equivalent.
18970 # 60 is a number divisible by a bunch of things
18971 my ($numerator, $denominator) = $table_name
18972 =~ m! (.+) / (.+) !x;
18974 $denominator *= 60;
18975 push @output, generate_tests($property_name,
18976 "$numerator/$denominator",
18982 # Calculate the float, and the %e representation
18983 my $float = eval $table_name;
18984 my $e_representation = sprintf("%.*e",
18985 $E_FLOAT_PRECISION, $float);
18987 my ($non_zeros, $zeros, $exponent_sign, $exponent)
18988 = $e_representation
18989 =~ / -? [1-9] \. (\d*?) (0*) e ([+-]) (\d+) /x;
18990 my $min_e_precision;
18991 my $min_f_precision;
18993 if ($exponent_sign eq '+' && $exponent != 0) {
18994 Carp::my_carp_bug("Not yet equipped to handle"
18995 . " positive exponents");
18999 # We're trying to find the minimum precision that
19000 # is needed to indicate this particular rational
19001 # for the given $E_FLOAT_PRECISION. For %e, any
19002 # trailing zeros, like 1.500e-02 aren't needed, so
19003 # the correct value is how many non-trailing zeros
19004 # there are after the decimal point.
19005 $min_e_precision = length $non_zeros;
19007 # For %f, like .01500, we want at least
19008 # $E_FLOAT_PRECISION digits, but any trailing
19009 # zeros aren't needed, so we can subtract the
19010 # length of those. But we also need to include
19011 # the zeros after the decimal point, but before
19012 # the first significant digit.
19013 $min_f_precision = $E_FLOAT_PRECISION
19018 # Make tests for each possible precision from 1 to
19019 # just past the worst case.
19020 my $upper_limit = ($min_e_precision > $min_f_precision)
19022 : $min_f_precision;
19024 for my $i (1 .. $upper_limit + 1) {
19025 for my $format ("e", "f") {
19027 = sprintf("%.*$format", $i, $float);
19029 # If we don't have enough precision digits,
19030 # make a fail test; otherwise a pass test.
19031 my $pass = ($format eq "e")
19032 ? $i >= $min_e_precision
19033 : $i >= $min_f_precision;
19035 push @output, generate_tests($property_name,
19042 elsif ( $format eq "e"
19044 # Here we would fail, but in the %f
19045 # case, the representation at this
19046 # precision could actually be a
19047 # valid one for some other rational
19048 || ! grep { $this_table
19049 =~ / ^ $_ 0* $ /x }
19050 @valid_base_floats)
19053 generate_error($property_name,
19055 1 # 1 => already an
19067 $property->DESTROY();
19070 # Make any test of the boundary (break) properties TODO if the code
19071 # doesn't match the version being compiled
19072 my $TODO_FAILING_BREAKS = ($version_of_mk_invlist_bounds ne $v_version)
19073 ? "\nsub TODO_FAILING_BREAKS { 1 }\n"
19074 : "\nsub TODO_FAILING_BREAKS { 0 }\n";
19082 # Cause there to be 'if' statements to only execute a portion of this
19083 # long-running test each time, so that we can have a bunch of .t's running
19085 my $chunks = 10 # Number of test files
19088 - 4; # LB split into this many files
19089 my @output_chunked;
19091 my $chunk_size= int(@output / $chunks) + 1;
19094 my @chunk= splice @output, 0, $chunk_size;
19095 push @output_chunked,
19096 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19102 push @output_chunked,
19103 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19104 (map {" Test_GCB('$_');\n"} @backslash_X_tests),
19105 (map {" Test_SB('$_');\n"} @SB_tests),
19109 $chunk_size= int(@LB_tests / 4) + 1;
19110 @LB_tests = map {" Test_LB('$_');\n"} @LB_tests;
19111 while (@LB_tests) {
19113 my @chunk= splice @LB_tests, 0, $chunk_size;
19114 push @output_chunked,
19115 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19121 push @output_chunked,
19122 "if (!\$::TESTCHUNK or \$::TESTCHUNK == $chunk_count) {\n",
19123 (map {" Test_WB('$_');\n"} @WB_tests),
19129 $TODO_FAILING_BREAKS,
19138 sub make_normalization_test_script() {
19139 print "Making normalization test script\n" if $verbosity >= $PROGRESS;
19141 my $n_path = 'TestNorm.pl';
19143 unshift @normalization_tests, <<'END';
19147 sub ord_string { # Convert packed ords to printable string
19149 return "'" . join("", map { '\N{' . charnames::viacode($_) . '}' }
19150 unpack "U*", shift) . "'";
19151 #return "'" . join(" ", map { sprintf "%04X", $_ } unpack "U*", shift) . "'";
19155 my ($source, $nfc, $nfd, $nfkc, $nfkd) = @_;
19156 my $display_source = ord_string($source);
19157 my $display_nfc = ord_string($nfc);
19158 my $display_nfd = ord_string($nfd);
19159 my $display_nfkc = ord_string($nfkc);
19160 my $display_nfkd = ord_string($nfkd);
19162 use Unicode::Normalize;
19164 # nfc == toNFC(source) == toNFC(nfc) == toNFC(nfd)
19165 # nfkc == toNFC(nfkc) == toNFC(nfkd)
19168 # nfd == toNFD(source) == toNFD(nfc) == toNFD(nfd)
19169 # nfkd == toNFD(nfkc) == toNFD(nfkd)
19172 # nfkc == toNFKC(source) == toNFKC(nfc) == toNFKC(nfd) ==
19173 # toNFKC(nfkc) == toNFKC(nfkd)
19176 # nfkd == toNFKD(source) == toNFKD(nfc) == toNFKD(nfd) ==
19177 # toNFKD(nfkc) == toNFKD(nfkd)
19179 is(NFC($source), $nfc, "NFC($display_source) eq $display_nfc");
19180 is(NFC($nfc), $nfc, "NFC($display_nfc) eq $display_nfc");
19181 is(NFC($nfd), $nfc, "NFC($display_nfd) eq $display_nfc");
19182 is(NFC($nfkc), $nfkc, "NFC($display_nfkc) eq $display_nfkc");
19183 is(NFC($nfkd), $nfkc, "NFC($display_nfkd) eq $display_nfkc");
19185 is(NFD($source), $nfd, "NFD($display_source) eq $display_nfd");
19186 is(NFD($nfc), $nfd, "NFD($display_nfc) eq $display_nfd");
19187 is(NFD($nfd), $nfd, "NFD($display_nfd) eq $display_nfd");
19188 is(NFD($nfkc), $nfkd, "NFD($display_nfkc) eq $display_nfkd");
19189 is(NFD($nfkd), $nfkd, "NFD($display_nfkd) eq $display_nfkd");
19191 is(NFKC($source), $nfkc, "NFKC($display_source) eq $display_nfkc");
19192 is(NFKC($nfc), $nfkc, "NFKC($display_nfc) eq $display_nfkc");
19193 is(NFKC($nfd), $nfkc, "NFKC($display_nfd) eq $display_nfkc");
19194 is(NFKC($nfkc), $nfkc, "NFKC($display_nfkc) eq $display_nfkc");
19195 is(NFKC($nfkd), $nfkc, "NFKC($display_nfkd) eq $display_nfkc");
19197 is(NFKD($source), $nfkd, "NFKD($display_source) eq $display_nfkd");
19198 is(NFKD($nfc), $nfkd, "NFKD($display_nfc) eq $display_nfkd");
19199 is(NFKD($nfd), $nfkd, "NFKD($display_nfd) eq $display_nfkd");
19200 is(NFKD($nfkc), $nfkd, "NFKD($display_nfkc) eq $display_nfkd");
19201 is(NFKD($nfkd), $nfkd, "NFKD($display_nfkd) eq $display_nfkd");
19208 @normalization_tests,
19214 # Skip reasons, so will be exact same text and hence the files with each
19215 # reason will get grouped together in perluniprops.
19216 my $Documentation = "Documentation";
19218 = "Provisional; for the analysis and processing of Indic scripts";
19219 my $Validation = "Validation Tests";
19220 my $Validation_Documentation = "Documentation of validation Tests";
19221 my $Unused_Skip = "Currently unused by Perl";
19223 # This is a list of the input files and how to handle them. The files are
19224 # processed in their order in this list. Some reordering is possible if
19225 # desired, but the PropertyAliases and PropValueAliases files should be first,
19226 # and the extracted before the others (as data in an extracted file can be
19227 # over-ridden by the non-extracted. Some other files depend on data derived
19228 # from an earlier file, like UnicodeData requires data from Jamo, and the case
19229 # changing and folding requires data from Unicode. Mostly, it is safest to
19230 # order by first version releases in (except the Jamo).
19232 # The version strings allow the program to know whether to expect a file or
19233 # not, but if a file exists in the directory, it will be processed, even if it
19234 # is in a version earlier than expected, so you can copy files from a later
19235 # release into an earlier release's directory.
19236 my @input_file_objects = (
19237 Input_file->new('PropertyAliases.txt', v3.2,
19238 Handler => \&process_PropertyAliases,
19239 Early => [ \&substitute_PropertyAliases ],
19240 Required_Even_in_Debug_Skip => 1,
19242 Input_file->new(undef, v0, # No file associated with this
19243 Progress_Message => 'Finishing property setup',
19244 Handler => \&finish_property_setup,
19246 Input_file->new('PropValueAliases.txt', v3.2,
19247 Handler => \&process_PropValueAliases,
19248 Early => [ \&substitute_PropValueAliases ],
19249 Has_Missings_Defaults => $NOT_IGNORED,
19250 Required_Even_in_Debug_Skip => 1,
19252 Input_file->new("${EXTRACTED}DGeneralCategory.txt", v3.1.0,
19253 Property => 'General_Category',
19255 Input_file->new("${EXTRACTED}DCombiningClass.txt", v3.1.0,
19256 Property => 'Canonical_Combining_Class',
19257 Has_Missings_Defaults => $NOT_IGNORED,
19259 Input_file->new("${EXTRACTED}DNumType.txt", v3.1.0,
19260 Property => 'Numeric_Type',
19261 Has_Missings_Defaults => $NOT_IGNORED,
19263 Input_file->new("${EXTRACTED}DEastAsianWidth.txt", v3.1.0,
19264 Property => 'East_Asian_Width',
19265 Has_Missings_Defaults => $NOT_IGNORED,
19267 Input_file->new("${EXTRACTED}DLineBreak.txt", v3.1.0,
19268 Property => 'Line_Break',
19269 Has_Missings_Defaults => $NOT_IGNORED,
19271 Input_file->new("${EXTRACTED}DBidiClass.txt", v3.1.1,
19272 Property => 'Bidi_Class',
19273 Has_Missings_Defaults => $NOT_IGNORED,
19275 Input_file->new("${EXTRACTED}DDecompositionType.txt", v3.1.0,
19276 Property => 'Decomposition_Type',
19277 Has_Missings_Defaults => $NOT_IGNORED,
19279 Input_file->new("${EXTRACTED}DBinaryProperties.txt", v3.1.0),
19280 Input_file->new("${EXTRACTED}DNumValues.txt", v3.1.0,
19281 Property => 'Numeric_Value',
19282 Each_Line_Handler => \&filter_numeric_value_line,
19283 Has_Missings_Defaults => $NOT_IGNORED,
19285 Input_file->new("${EXTRACTED}DJoinGroup.txt", v3.1.0,
19286 Property => 'Joining_Group',
19287 Has_Missings_Defaults => $NOT_IGNORED,
19290 Input_file->new("${EXTRACTED}DJoinType.txt", v3.1.0,
19291 Property => 'Joining_Type',
19292 Has_Missings_Defaults => $NOT_IGNORED,
19294 Input_file->new("${EXTRACTED}DName.txt", v10.0.0,
19295 Skip => 'This file adds no new information not already'
19296 . ' present in other files',
19297 # And it's unnecessary programmer work to handle this new
19298 # format. Previous Derived files actually had bug fixes
19299 # in them that were useful, but that should not be the
19302 Input_file->new('Jamo.txt', v2.0.0,
19303 Property => 'Jamo_Short_Name',
19304 Each_Line_Handler => \&filter_jamo_line,
19306 Input_file->new('UnicodeData.txt', v1.1.5,
19307 Pre_Handler => \&setup_UnicodeData,
19309 # We clean up this file for some early versions.
19310 Each_Line_Handler => [ (($v_version lt v2.0.0 )
19312 : ($v_version eq v2.1.5)
19313 ? \&filter_v2_1_5_ucd
19315 # And for 5.14 Perls with 6.0,
19316 # have to also make changes
19317 : ($v_version ge v6.0.0
19322 # Early versions did not have the
19323 # proper Unicode_1 names for the
19325 (($v_version lt v3.0.0)
19326 ? \&filter_early_U1_names
19329 # Early versions did not correctly
19330 # use the later method for giving
19331 # decimal digit values
19332 (($v_version le v3.2.0)
19333 ? \&filter_bad_Nd_ucd
19336 # And the main filter
19337 \&filter_UnicodeData_line,
19339 EOF_Handler => \&EOF_UnicodeData,
19341 Input_file->new('CJKXREF.TXT', v1.1.5,
19342 Withdrawn => v2.0.0,
19343 Skip => 'Gives the mapping of CJK code points '
19344 . 'between Unicode and various other standards',
19346 Input_file->new('ArabicShaping.txt', v2.0.0,
19347 Each_Line_Handler =>
19348 ($v_version lt 4.1.0)
19349 ? \&filter_old_style_arabic_shaping
19351 # The first field after the range is a "schematic name"
19353 Properties => [ '<ignored>', 'Joining_Type', 'Joining_Group' ],
19354 Has_Missings_Defaults => $NOT_IGNORED,
19356 Input_file->new('Blocks.txt', v2.0.0,
19357 Property => 'Block',
19358 Has_Missings_Defaults => $NOT_IGNORED,
19359 Each_Line_Handler => \&filter_blocks_lines
19361 Input_file->new('Index.txt', v2.0.0,
19362 Skip => 'Alphabetical index of Unicode characters',
19364 Input_file->new('NamesList.txt', v2.0.0,
19365 Skip => 'Annotated list of characters',
19367 Input_file->new('PropList.txt', v2.0.0,
19368 Each_Line_Handler => (($v_version lt v3.1.0)
19369 ? \&filter_old_style_proplist
19372 Input_file->new('Props.txt', v2.0.0,
19373 Withdrawn => v3.0.0,
19374 Skip => 'A subset of F<PropList.txt> (which is used instead)',
19376 Input_file->new('ReadMe.txt', v2.0.0,
19377 Skip => $Documentation,
19379 Input_file->new('Unihan.txt', v2.0.0,
19380 Withdrawn => v5.2.0,
19381 Construction_Time_Handler => \&construct_unihan,
19382 Pre_Handler => \&setup_unihan,
19384 'Unicode_Radical_Stroke'
19386 Each_Line_Handler => \&filter_unihan_line,
19388 Input_file->new('SpecialCasing.txt', v2.1.8,
19389 Each_Line_Handler => ($v_version eq 2.1.8)
19390 ? \&filter_2_1_8_special_casing_line
19391 : \&filter_special_casing_line,
19392 Pre_Handler => \&setup_special_casing,
19393 Has_Missings_Defaults => $IGNORED,
19396 'LineBreak.txt', v3.0.0,
19397 Has_Missings_Defaults => $NOT_IGNORED,
19398 Property => 'Line_Break',
19399 # Early versions had problematic syntax
19400 Each_Line_Handler => ($v_version ge v3.1.0)
19402 : ($v_version lt v3.0.0)
19403 ? \&filter_substitute_lb
19404 : \&filter_early_ea_lb,
19405 # Must use long names for property values see comments at
19406 # sub filter_substitute_lb
19407 Early => [ "LBsubst.txt", '_Perl_LB', 'Alphabetic',
19408 'Alphabetic', # default to this because XX ->
19411 # Don't use _Perl_LB as a synonym for
19412 # Line_Break in later perls, as it is tailored
19413 # and isn't the same as Line_Break
19416 Input_file->new('EastAsianWidth.txt', v3.0.0,
19417 Property => 'East_Asian_Width',
19418 Has_Missings_Defaults => $NOT_IGNORED,
19419 # Early versions had problematic syntax
19420 Each_Line_Handler => (($v_version lt v3.1.0)
19421 ? \&filter_early_ea_lb
19424 Input_file->new('CompositionExclusions.txt', v3.0.0,
19425 Property => 'Composition_Exclusion',
19427 Input_file->new('UnicodeData.html', v3.0.0,
19428 Withdrawn => v4.0.1,
19429 Skip => $Documentation,
19431 Input_file->new('BidiMirroring.txt', v3.0.1,
19432 Property => 'Bidi_Mirroring_Glyph',
19433 Has_Missings_Defaults => ($v_version lt v6.2.0)
19435 # Is <none> which doesn't mean
19436 # anything to us, we will use the
19440 Input_file->new('NamesList.html', v3.0.0,
19441 Skip => 'Describes the format and contents of '
19442 . 'F<NamesList.txt>',
19444 Input_file->new('UnicodeCharacterDatabase.html', v3.0.0,
19446 Skip => $Documentation,
19448 Input_file->new('CaseFolding.txt', v3.0.1,
19449 Pre_Handler => \&setup_case_folding,
19450 Each_Line_Handler =>
19451 [ ($v_version lt v3.1.0)
19452 ? \&filter_old_style_case_folding
19454 \&filter_case_folding_line
19456 Has_Missings_Defaults => $IGNORED,
19458 Input_file->new("NormTest.txt", v3.0.1,
19459 Handler => \&process_NormalizationsTest,
19460 Skip => ($make_norm_test_script) ? 0 : $Validation,
19462 Input_file->new('DCoreProperties.txt', v3.1.0,
19463 # 5.2 changed this file
19464 Has_Missings_Defaults => (($v_version ge v5.2.0)
19468 Input_file->new('DProperties.html', v3.1.0,
19469 Withdrawn => v3.2.0,
19470 Skip => $Documentation,
19472 Input_file->new('PropList.html', v3.1.0,
19474 Skip => $Documentation,
19476 Input_file->new('Scripts.txt', v3.1.0,
19477 Property => 'Script',
19478 Each_Line_Handler => (($v_version le v4.0.0)
19479 ? \&filter_all_caps_script_names
19481 Has_Missings_Defaults => $NOT_IGNORED,
19483 Input_file->new('DNormalizationProps.txt', v3.1.0,
19484 Has_Missings_Defaults => $NOT_IGNORED,
19485 Each_Line_Handler => (($v_version lt v4.0.1)
19486 ? \&filter_old_style_normalization_lines
19489 Input_file->new('DerivedProperties.html', v3.1.1,
19491 Skip => $Documentation,
19493 Input_file->new('DAge.txt', v3.2.0,
19494 Has_Missings_Defaults => $NOT_IGNORED,
19497 Input_file->new('HangulSyllableType.txt', v4.0,
19498 Has_Missings_Defaults => $NOT_IGNORED,
19499 Early => [ \&generate_hst, 'Hangul_Syllable_Type' ],
19500 Property => 'Hangul_Syllable_Type'
19502 Input_file->new('NormalizationCorrections.txt', v3.2.0,
19503 # This documents the cumulative fixes to erroneous
19504 # normalizations in earlier Unicode versions. Its main
19505 # purpose is so that someone running on an earlier
19506 # version can use this file to override what got
19507 # published in that earlier release. It would be easy
19508 # for mktables to handle this file. But all the
19509 # corrections in it should already be in the other files
19510 # for the release it is. To get it to actually mean
19511 # something useful, someone would have to be using an
19512 # earlier Unicode release, and copy it into the directory
19513 # for that release and recompile. So far there has been
19514 # no demand to do that, so this hasn't been implemented.
19515 Skip => 'Documentation of corrections already '
19516 . 'incorporated into the Unicode data base',
19518 Input_file->new('StandardizedVariants.html', v3.2.0,
19519 Skip => 'Obsoleted as of Unicode 9.0, but previously '
19520 . 'provided a visual display of the standard '
19521 . 'variant sequences derived from '
19522 . 'F<StandardizedVariants.txt>.',
19523 # I don't know why the html came earlier than the
19524 # .txt, but both are skipped anyway, so it doesn't
19527 Input_file->new('StandardizedVariants.txt', v4.0.0,
19528 Skip => 'Certain glyph variations for character display '
19529 . 'are standardized. This lists the non-Unihan '
19530 . 'ones; the Unihan ones are also not used by '
19531 . 'Perl, and are in a separate Unicode data base '
19532 . 'L<http://www.unicode.org/ivd>',
19534 Input_file->new('UCD.html', v4.0.0,
19536 Skip => $Documentation,
19538 Input_file->new("$AUXILIARY/WordBreakProperty.txt", v4.1.0,
19539 Early => [ "WBsubst.txt", '_Perl_WB', 'ALetter' ],
19540 Property => 'Word_Break',
19541 Has_Missings_Defaults => $NOT_IGNORED,
19543 Input_file->new("$AUXILIARY/GraphemeBreakProperty.txt", v4.1,
19544 Early => [ \&generate_GCB, '_Perl_GCB' ],
19545 Property => 'Grapheme_Cluster_Break',
19546 Has_Missings_Defaults => $NOT_IGNORED,
19548 Input_file->new("$AUXILIARY/GCBTest.txt", v4.1.0,
19549 Handler => \&process_GCB_test,
19550 retain_trailing_comments => 1,
19552 Input_file->new("$AUXILIARY/GraphemeBreakTest.html", v4.1.0,
19553 Skip => $Validation_Documentation,
19555 Input_file->new("$AUXILIARY/SBTest.txt", v4.1.0,
19556 Handler => \&process_SB_test,
19557 retain_trailing_comments => 1,
19559 Input_file->new("$AUXILIARY/SentenceBreakTest.html", v4.1.0,
19560 Skip => $Validation_Documentation,
19562 Input_file->new("$AUXILIARY/WBTest.txt", v4.1.0,
19563 Handler => \&process_WB_test,
19564 retain_trailing_comments => 1,
19566 Input_file->new("$AUXILIARY/WordBreakTest.html", v4.1.0,
19567 Skip => $Validation_Documentation,
19569 Input_file->new("$AUXILIARY/SentenceBreakProperty.txt", v4.1.0,
19570 Property => 'Sentence_Break',
19571 Early => [ "SBsubst.txt", '_Perl_SB', 'OLetter' ],
19572 Has_Missings_Defaults => $NOT_IGNORED,
19574 Input_file->new('NamedSequences.txt', v4.1.0,
19575 Handler => \&process_NamedSequences
19577 Input_file->new('Unihan.html', v4.1.0,
19579 Skip => $Documentation,
19581 Input_file->new('NameAliases.txt', v5.0,
19582 Property => 'Name_Alias',
19583 Each_Line_Handler => ($v_version le v6.0.0)
19584 ? \&filter_early_version_name_alias_line
19585 : \&filter_later_version_name_alias_line,
19587 # NameAliases.txt came along in v5.0. The above constructor handles
19588 # this. But until 6.1, it was lacking some information needed by core
19589 # perl. The constructor below handles that. It is either a kludge or
19590 # clever, depending on your point of view. The 'Withdrawn' parameter
19591 # indicates not to use it at all starting in 6.1 (so the above
19592 # constructor applies), and the 'v6.1' parameter indicates to use the
19593 # Early parameter before 6.1. Therefore 'Early" is always used,
19594 # yielding the internal-only property '_Perl_Name_Alias', which it
19595 # gets from a NameAliases.txt from 6.1 or later stored in
19596 # N_Asubst.txt. In combination with the above constructor,
19597 # 'Name_Alias' is publicly accessible starting with v5.0, and the
19598 # better 6.1 version is accessible to perl core in all releases.
19599 Input_file->new("NameAliases.txt", v6.1,
19601 Early => [ "N_Asubst.txt", '_Perl_Name_Alias', "" ],
19602 Property => 'Name_Alias',
19603 EOF_Handler => \&fixup_early_perl_name_alias,
19604 Each_Line_Handler =>
19605 \&filter_later_version_name_alias_line,
19607 Input_file->new('NamedSqProv.txt', v5.0.0,
19608 Skip => 'Named sequences proposed for inclusion in a '
19609 . 'later version of the Unicode Standard; if you '
19610 . 'need them now, you can append this file to '
19611 . 'F<NamedSequences.txt> and recompile perl',
19613 Input_file->new("$AUXILIARY/LBTest.txt", v5.1.0,
19614 Handler => \&process_LB_test,
19615 retain_trailing_comments => 1,
19617 Input_file->new("$AUXILIARY/LineBreakTest.html", v5.1.0,
19618 Skip => $Validation_Documentation,
19620 Input_file->new("BidiTest.txt", v5.2.0,
19621 Skip => $Validation,
19623 Input_file->new('UnihanIndicesDictionary.txt', v5.2.0,
19625 Each_Line_Handler => \&filter_unihan_line,
19627 Input_file->new('UnihanDataDictionaryLike.txt', v5.2.0,
19629 Each_Line_Handler => \&filter_unihan_line,
19631 Input_file->new('UnihanIRGSources.txt', v5.2.0,
19633 'kCompatibilityVariant',
19647 Pre_Handler => \&setup_unihan,
19648 Each_Line_Handler => \&filter_unihan_line,
19650 Input_file->new('UnihanNumericValues.txt', v5.2.0,
19652 'kAccountingNumeric',
19656 Each_Line_Handler => \&filter_unihan_line,
19658 Input_file->new('UnihanOtherMappings.txt', v5.2.0,
19660 Each_Line_Handler => \&filter_unihan_line,
19662 Input_file->new('UnihanRadicalStrokeCounts.txt', v5.2.0,
19664 'Unicode_Radical_Stroke'
19666 Each_Line_Handler => \&filter_unihan_line,
19668 Input_file->new('UnihanReadings.txt', v5.2.0,
19670 Each_Line_Handler => \&filter_unihan_line,
19672 Input_file->new('UnihanVariants.txt', v5.2.0,
19674 Each_Line_Handler => \&filter_unihan_line,
19676 Input_file->new('CJKRadicals.txt', v5.2.0,
19677 Skip => 'Maps the kRSUnicode property values to '
19678 . 'corresponding code points',
19680 Input_file->new('EmojiSources.txt', v6.0.0,
19681 Skip => 'Maps certain Unicode code points to their '
19682 . 'legacy Japanese cell-phone values',
19684 # This file is actually not usable as-is until 6.1.0, because the property
19685 # is provisional, so its name is missing from PropertyAliases.txt until
19686 # that release, so that further work would have to be done to get it to
19688 Input_file->new('ScriptExtensions.txt', v6.0.0,
19689 Property => 'Script_Extensions',
19690 Early => [ sub {} ], # Doesn't do anything but ensures
19691 # that this isn't skipped for early
19693 Pre_Handler => \&setup_script_extensions,
19694 Each_Line_Handler => \&filter_script_extensions_line,
19695 Has_Missings_Defaults => (($v_version le v6.0.0)
19699 # These two Indic files are actually not usable as-is until 6.1.0,
19700 # because they are provisional, so their property values are missing from
19701 # PropValueAliases.txt until that release, so that further work would have
19702 # to be done to get them to work properly.
19703 Input_file->new('IndicMatraCategory.txt', v6.0.0,
19704 Withdrawn => v8.0.0,
19705 Property => 'Indic_Matra_Category',
19706 Has_Missings_Defaults => $NOT_IGNORED,
19707 Skip => $Indic_Skip,
19709 Input_file->new('IndicSyllabicCategory.txt', v6.0.0,
19710 Property => 'Indic_Syllabic_Category',
19711 Has_Missings_Defaults => $NOT_IGNORED,
19712 Skip => (($v_version lt v8.0.0)
19716 Input_file->new('USourceData.txt', v6.2.0,
19717 Skip => 'Documentation of status and cross reference of '
19718 . 'proposals for encoding by Unicode of Unihan '
19721 Input_file->new('USourceGlyphs.pdf', v6.2.0,
19722 Skip => 'Pictures of the characters in F<USourceData.txt>',
19724 Input_file->new('BidiBrackets.txt', v6.3.0,
19725 Properties => [ 'Bidi_Paired_Bracket',
19726 'Bidi_Paired_Bracket_Type'
19728 Has_Missings_Defaults => $NO_DEFAULTS,
19730 Input_file->new("BidiCharacterTest.txt", v6.3.0,
19731 Skip => $Validation,
19733 Input_file->new('IndicPositionalCategory.txt', v8.0.0,
19734 Property => 'Indic_Positional_Category',
19735 Has_Missings_Defaults => $NOT_IGNORED,
19737 Input_file->new('TangutSources.txt', v9.0.0,
19738 Skip => 'Specifies source mappings for Tangut ideographs'
19739 . ' and components. This data file also includes'
19740 . ' informative radical-stroke values that are used'
19741 . ' internally by Unicode',
19743 Input_file->new('VerticalOrientation.txt', v10.0.0,
19744 Property => 'Vertical_Orientation',
19745 Has_Missings_Defaults => $NOT_IGNORED,
19747 Input_file->new('NushuSources.txt', v10.0.0,
19748 Skip => 'Specifies source material for Nushu characters',
19750 Input_file->new('EquivalentUnifiedIdeograph.txt', v11.0.0,
19751 Property => 'Equivalent_Unified_Ideograph',
19752 Has_Missings_Defaults => $NOT_IGNORED,
19754 Input_file->new('EmojiData.txt', v11.0.0,
19755 # Is in UAX #51 and not the UCD, so must be updated
19756 # separately, and the first line edited to indicate the
19757 # UCD release we're pretending it to be in. The UTC says
19758 # this is a transitional state, and in fact was moved to
19760 Withdrawn => v13.0.0,
19761 Pre_Handler => \&setup_emojidata,
19762 Has_Missings_Defaults => $NOT_IGNORED,
19763 Each_Line_Handler => \&filter_emojidata_line,
19766 Input_file->new("$EMOJI/emoji.txt", v13.0.0,
19767 Has_Missings_Defaults => $NOT_IGNORED,
19770 Input_file->new("$EMOJI/ReadMe.txt", v13.0.0,
19771 Skip => $Documentation,
19774 Input_file->new('IdStatus.txt', v13.0.0,
19775 Pre_Handler => \&setup_IdStatus,
19776 Property => 'Identifier_Status',
19779 Input_file->new('IdType.txt', v13.0.0,
19780 Pre_Handler => \&setup_IdType,
19781 Each_Line_Handler => \&filter_IdType_line,
19782 Property => 'Identifier_Type',
19785 Input_file->new('confusables.txt', v15.0.0,
19786 Skip => $Unused_Skip,
19789 Input_file->new('confusablesSummary.txt', v15.0.0,
19790 Skip => $Unused_Skip,
19793 Input_file->new('intentional.txt', v15.0.0,
19794 Skip => $Unused_Skip,
19799 # End of all the preliminaries.
19802 if (@missing_early_files) {
19803 print simple_fold(join_lines(<<END
19805 The compilation cannot be completed because one or more required input files,
19806 listed below, are missing. This is because you are compiling Unicode version
19807 $unicode_version, which predates the existence of these file(s). To fully
19808 function, perl needs the data that these files would have contained if they
19809 had been in this release. To work around this, create copies of later
19810 versions of the missing files in the directory containing '$0'. (Perl will
19811 make the necessary adjustments to the data to compensate for it not being the
19812 same version as is being compiled.) The files are available from unicode.org,
19813 via either ftp or http. If using http, they will be under
19814 www.unicode.org/versions/. Below are listed the source file name of each
19815 missing file, the Unicode version to copy it from, and the name to store it
19816 as. (Note that the listed source file name may not be exactly the one that
19817 Unicode calls it. If you don't find it, you can look it up in 'README.perl'
19818 to get the correct name.)
19821 print simple_fold(join_lines("\n$_")) for @missing_early_files;
19825 if ($compare_versions) {
19826 Carp::my_carp(<<END
19827 Warning. \$compare_versions is set. Output is not suitable for production
19832 # Put into %potential_files a list of all the files in the directory structure
19833 # that could be inputs to this program
19836 return unless / \. ( txt | htm l? ) $ /xi; # Some platforms change the
19838 my $full = lc(File::Spec->rel2abs($_));
19839 $potential_files{$full} = 1;
19842 }, File::Spec->curdir());
19844 my @mktables_list_output_files;
19845 my $old_start_time = 0;
19846 my $old_options = "";
19848 if (! -e $file_list) {
19849 print "'$file_list' doesn't exist, so forcing rebuild.\n" if $verbosity >= $VERBOSE;
19850 $write_unchanged_files = 1;
19851 } elsif ($write_unchanged_files) {
19852 print "Not checking file list '$file_list'.\n" if $verbosity >= $VERBOSE;
19855 print "Reading file list '$file_list'\n" if $verbosity >= $VERBOSE;
19857 if (! open $file_handle, "<", $file_list) {
19858 Carp::my_carp("Failed to open '$file_list'; turning on -globlist option instead: $!");
19864 # Read and parse mktables.lst, placing the results from the first part
19865 # into @input, and the second part into @mktables_list_output_files
19866 for my $list ( \@input, \@mktables_list_output_files ) {
19867 while (<$file_handle>) {
19868 s/^ \s+ | \s+ $//xg;
19869 if (/^ \s* \# \s* Autogenerated\ starting\ on\ (\d+)/x) {
19870 $old_start_time = $1;
19873 if (/^ \s* \# \s* From\ options\ (.+) /x) {
19877 next if /^ \s* (?: \# .* )? $/x;
19879 my ( $file ) = split /\t/;
19880 push @$list, $file;
19882 @$list = uniques(@$list);
19886 # Look through all the input files
19887 foreach my $input (@input) {
19888 next if $input eq 'version'; # Already have checked this.
19890 # Ignore if doesn't exist. The checking about whether we care or
19891 # not is done via the Input_file object.
19892 next if ! file_exists($input);
19894 # The paths are stored with relative names, and with '/' as the
19895 # delimiter; convert to absolute on this machine
19896 my $full = lc(File::Spec->rel2abs(internal_file_to_platform($input)));
19897 $potential_files{lc $full} = 1;
19901 close $file_handle;
19906 # Here wants to process all .txt files in the directory structure.
19907 # Convert them to full path names. They are stored in the platform's
19910 foreach my $object (@input_file_objects) {
19911 my $file = $object->file;
19912 next unless defined $file;
19913 push @known_files, File::Spec->rel2abs($file);
19916 my @unknown_input_files;
19917 foreach my $file (keys %potential_files) { # The keys are stored in lc
19918 next if grep { $file eq lc($_) } @known_files;
19920 # Here, the file is unknown to us. Get relative path name
19921 $file = File::Spec->abs2rel($file);
19922 push @unknown_input_files, $file;
19924 # What will happen is we create a data structure for it, and add it to
19925 # the list of input files to process. First get the subdirectories
19927 my (undef, $directories, undef) = File::Spec->splitpath($file);
19928 $directories =~ s;/$;;; # Can have extraneous trailing '/'
19929 my @directories = File::Spec->splitdir($directories);
19931 # If the file isn't extracted (meaning none of the directories is the
19932 # extracted one), just add it to the end of the list of inputs.
19933 if (! grep { $EXTRACTED_DIR eq $_ } @directories) {
19934 push @input_file_objects, Input_file->new($file, v0);
19938 # Here, the file is extracted. It needs to go ahead of most other
19939 # processing. Search for the first input file that isn't a
19940 # special required property (that is, find one whose first_release
19941 # is non-0), and isn't extracted. Also, the Age property file is
19942 # processed before the extracted ones, just in case
19943 # $compare_versions is set.
19944 for (my $i = 0; $i < @input_file_objects; $i++) {
19945 if ($input_file_objects[$i]->first_released ne v0
19946 && lc($input_file_objects[$i]->file) ne 'dage.txt'
19947 && $input_file_objects[$i]->file !~ /$EXTRACTED_DIR/i)
19949 splice @input_file_objects, $i, 0,
19950 Input_file->new($file, v0);
19957 if (@unknown_input_files) {
19958 print STDERR simple_fold(join_lines(<<END
19960 The following files are unknown as to how to handle. Assuming they are
19961 typical property files. You'll know by later error messages if it worked or
19964 ) . " " . join(", ", @unknown_input_files) . "\n\n");
19966 } # End of looking through directory structure for more .txt files.
19968 # Create the list of input files from the objects we have defined, plus
19970 my @input_files = qw(version Makefile);
19971 foreach my $object (@input_file_objects) {
19972 my $file = $object->file;
19973 next if ! defined $file; # Not all objects have files
19974 next if defined $object->skip;;
19975 push @input_files, $file;
19978 if ( $verbosity >= $VERBOSE ) {
19979 print "Expecting ".scalar( @input_files )." input files. ",
19980 "Checking ".scalar( @mktables_list_output_files )." output files.\n";
19983 # We set $most_recent to be the most recently changed input file, including
19984 # this program itself (done much earlier in this file)
19985 foreach my $in (@input_files) {
19986 next unless -e $in; # Keep going even if missing a file
19987 my $mod_time = (stat $in)[9];
19988 $most_recent = $mod_time if $mod_time > $most_recent;
19990 # See that the input files have distinct names, to warn someone if they
19991 # are adding a new one
19993 my ($volume, $directories, $file ) = File::Spec->splitpath($in);
19994 $directories =~ s;/$;;; # Can have extraneous trailing '/'
19995 my @directories = File::Spec->splitdir($directories);
19996 construct_filename($file, 'mutable', \@directories);
20000 # We use 'Makefile' just to see if it has changed since the last time we
20001 # rebuilt. Now discard it.
20002 @input_files = grep { $_ ne 'Makefile' } @input_files;
20004 my $rebuild = $write_unchanged_files # Rebuild: if unconditional rebuild
20005 || ! scalar @mktables_list_output_files # or if no outputs known
20006 || $old_start_time < $most_recent # or out-of-date
20007 || $old_options ne $command_line_arguments; # or with different
20010 # Now we check to see if any output files are older than youngest, if
20011 # they are, we need to continue on, otherwise we can presumably bail.
20013 foreach my $out (@mktables_list_output_files) {
20014 if ( ! file_exists($out)) {
20015 print "'$out' is missing.\n" if $verbosity >= $VERBOSE;
20019 #local $to_trace = 1 if main::DEBUG;
20020 trace $most_recent, (stat $out)[9] if main::DEBUG && $to_trace;
20021 if ( (stat $out)[9] <= $most_recent ) {
20022 #trace "$out: most recent mod time: ", (stat $out)[9], ", youngest: $most_recent\n" if main::DEBUG && $to_trace;
20023 print "'$out' is too old.\n" if $verbosity >= $VERBOSE;
20030 print "$0: Files seem to be ok, not bothering to rebuild. Add '-w' option to force build\n";
20033 print "$0: Must rebuild tables.\n" if $verbosity >= $VERBOSE;
20035 # Ready to do the major processing. First create the perl pseudo-property.
20036 $perl = Property->new('perl', Type => $NON_STRING, Perl_Extension => 1);
20038 # Process each input file
20039 foreach my $file (@input_file_objects) {
20043 # Finish the table generation.
20045 print "Finishing processing Unicode properties\n" if $verbosity >= $PROGRESS;
20048 # For the very specialized case of comparing two Unicode versions...
20049 if (DEBUG && $compare_versions) {
20050 handle_compare_versions();
20053 print "Compiling Perl properties\n" if $verbosity >= $PROGRESS;
20056 print "Creating Perl synonyms\n" if $verbosity >= $PROGRESS;
20057 add_perl_synonyms();
20059 print "Writing tables\n" if $verbosity >= $PROGRESS;
20060 write_all_tables();
20062 # Write mktables.lst
20063 if ( $file_list and $make_list ) {
20065 print "Updating '$file_list'\n" if $verbosity >= $PROGRESS;
20066 foreach my $file (@input_files, @files_actually_output) {
20067 my (undef, $directories, $basefile) = File::Spec->splitpath($file);
20068 my @directories = grep length, File::Spec->splitdir($directories);
20069 $file = join '/', @directories, $basefile;
20073 if (! open $ofh,">",$file_list) {
20074 Carp::my_carp("Can't write to '$file_list'. Skipping: $!");
20078 my $localtime = localtime $start_time;
20079 print $ofh <<"END";
20081 # $file_list -- File list for $0.
20083 # Autogenerated starting on $start_time ($localtime)
20084 # From options $command_line_arguments
20086 # - First section is input files
20087 # ($0 itself is not listed but is automatically considered an input)
20088 # - Section separator is /^=+\$/
20089 # - Second section is a list of output files.
20090 # - Lines matching /^\\s*#/ are treated as comments
20091 # which along with blank lines are ignored.
20097 print $ofh "$_\n" for sort(@input_files);
20098 print $ofh "\n=================================\n# Output files:\n\n";
20099 print $ofh "$_\n" for sort @files_actually_output;
20100 print $ofh "\n# ",scalar(@input_files)," input files\n",
20101 "# ",scalar(@files_actually_output)+1," output files\n\n",
20104 or Carp::my_carp("Failed to close $ofh: $!");
20106 print "Filelist has ",scalar(@input_files)," input files and ",
20107 scalar(@files_actually_output)+1," output files\n"
20108 if $verbosity >= $VERBOSE;
20112 # Output these warnings unless -q explicitly specified.
20113 if ($verbosity >= $NORMAL_VERBOSITY && ! $debug_skip) {
20114 if (@unhandled_properties) {
20115 print "\nProperties and tables that unexpectedly have no code points\n";
20116 foreach my $property (sort @unhandled_properties) {
20117 print $property, "\n";
20121 if (%potential_files) {
20122 print "\nInput files that are not considered:\n";
20123 foreach my $file (sort keys %potential_files) {
20124 print File::Spec->abs2rel($file), "\n";
20127 print "\nAll done\n" if $verbosity >= $VERBOSE;
20130 if ($version_of_mk_invlist_bounds lt $v_version) {
20131 Carp::my_carp("WARNING: \\b{} algorithms (regen/mk_invlist.pl) need"
20132 . " to be checked and possibly updated to Unicode"
20133 . " $string_version. Failing tests will be marked TODO");
20138 # TRAILING CODE IS USED BY make_property_test_script()
20144 use feature 'signatures';
20146 no warnings 'experimental::uniprop_wildcards';
20148 # Test qr/\X/ and the \p{} regular expression constructs. This file is
20149 # constructed by mktables from the tables it generates, so if mktables is
20150 # buggy, this won't necessarily catch those bugs. Tests are generated for all
20151 # feasible properties; a few aren't currently feasible; see
20152 # is_code_point_usable() in mktables for details.
20154 # Standard test packages are not used because this manipulates SIG_WARN. It
20155 # exits 0 if every non-skipped test succeeded; -1 if any failed.
20160 # loc_tools.pl requires this function to be defined
20161 sub ok($pass, @msg) {
20162 print "not " unless $pass;
20165 print " - ", join "", @msg if @msg;
20169 sub Expect($expected, $ord, $regex, $warning_type='') {
20170 my $line = (caller)[2];
20172 # Convert the code point to hex form
20173 my $string = sprintf "\"\\x{%04X}\"", $ord;
20177 # The first time through, use all warnings. If the input should generate
20178 # a warning, add another time through with them turned off
20179 push @tests, "no warnings '$warning_type';" if $warning_type;
20181 foreach my $no_warnings (@tests) {
20183 # Store any warning messages instead of outputting them
20184 local $SIG{__WARN__} = $SIG{__WARN__};
20185 my $warning_message;
20186 $SIG{__WARN__} = sub { $warning_message = $_[0] };
20190 # A string eval is needed because of the 'no warnings'.
20191 # Assumes no parentheses in the regular expression
20192 my $result = eval "$no_warnings
20193 my \$RegObj = qr($regex);
20194 $string =~ \$RegObj ? 1 : 0";
20195 if (not defined $result) {
20196 print "not ok $Tests - couldn't compile /$regex/; line $line: $@\n";
20199 elsif ($result ^ $expected) {
20200 print "not ok $Tests - expected $expected but got $result for $string =~ qr/$regex/; line $line\n";
20203 elsif ($warning_message) {
20204 if (! $warning_type || ($warning_type && $no_warnings)) {
20205 print "not ok $Tests - for qr/$regex/ did not expect warning message '$warning_message'; line $line\n";
20209 print "ok $Tests - expected and got a warning message for qr/$regex/; line $line\n";
20212 elsif ($warning_type && ! $no_warnings) {
20213 print "not ok $Tests - for qr/$regex/ expected a $warning_type warning message, but got none; line $line\n";
20217 print "ok $Tests - got $result for $string =~ qr/$regex/; line $line\n";
20223 sub Error($regex) {
20225 if (eval { 'x' =~ qr/$regex/; 1 }) {
20227 my $line = (caller)[2];
20228 print "not ok $Tests - re compiled ok, but expected error for qr/$regex/; line $line: $@\n";
20231 my $line = (caller)[2];
20232 print "ok $Tests - got and expected error for qr/$regex/; line $line\n";
20237 # Break test files (e.g. GCBTest.txt) character that break allowed here
20238 my $breakable_utf8 = my $breakable = chr(utf8::unicode_to_native(0xF7));
20239 utf8::upgrade($breakable_utf8);
20241 # Break test files (e.g. GCBTest.txt) character that indicates can't break
20243 my $nobreak_utf8 = my $nobreak = chr(utf8::unicode_to_native(0xD7));
20244 utf8::upgrade($nobreak_utf8);
20246 my $are_ctype_locales_available;
20248 chdir 't' if -d 't';
20249 eval { require "./loc_tools.pl" };
20250 if (defined &locales_enabled) {
20251 $are_ctype_locales_available = locales_enabled('LC_CTYPE');
20252 if ($are_ctype_locales_available) {
20253 $utf8_locale = &find_utf8_ctype_locale;
20257 # Eval'd so can run on versions earlier than the property is available in
20258 my $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}\p{WB=ZWJ}]/';
20259 if (! defined $WB_Extend_or_Format_re) {
20260 $WB_Extend_or_Format_re = eval 'qr/[\p{WB=Extend}\p{WB=Format}]/';
20263 sub _test_break($template, $break_type) {
20264 # Test various break property matches. The 2nd parameter gives the
20265 # property name. The input is a line from auxiliary/*Test.txt for the
20266 # given property. Each such line is a sequence of Unicode (not native)
20267 # code points given by their hex numbers, separated by the two characters
20268 # defined just before this subroutine that indicate that either there can
20269 # or cannot be a break between the adjacent code points. All these are
20272 # For the gcb property extra tests are made. if there isn't a break, that
20273 # means the sequence forms an extended grapheme cluster, which means that
20274 # \X should match the whole thing. If there is a break, \X should stop
20275 # there. This is all converted by this routine into a match: $string =~
20276 # /(\X)/, Each \X should match the next cluster; and that is what is
20279 my $line = (caller 1)[2]; # Line number
20282 if ($template =~ / ( .*? ) \s* \# (.*) /x) {
20286 # Replace leading spaces with a single one.
20287 $comment =~ s/ ^ \s* / # /x;
20290 # The line contains characters above the ASCII range, but in Latin1. It
20291 # may or may not be in utf8, and if it is, it may or may not know it. So,
20292 # convert these characters to 8 bits. If knows is in utf8, simply
20294 if (utf8::is_utf8($template)) {
20295 utf8::downgrade($template);
20298 # Otherwise, if it is in utf8, but doesn't know it, the next lines
20299 # convert the two problematic characters to their 8-bit equivalents.
20300 # If it isn't in utf8, they don't harm anything.
20302 $template =~ s/$nobreak_utf8/$nobreak/g;
20303 $template =~ s/$breakable_utf8/$breakable/g;
20306 # Perl customizes wb. So change the official tests accordingly
20307 if ($break_type eq 'wb' && $WB_Extend_or_Format_re) {
20309 # Split into elements that alternate between code point and
20311 my @line = split / +/, $template;
20313 # Look at each code point and its following one
20314 for (my $i = 1; $i < @line - 1 - 1; $i+=2) {
20316 # The customization only involves changing some breaks to
20318 next if $line[$i+1] =~ /$nobreak/;
20320 my $lhs = chr utf8::unicode_to_native(hex $line[$i]);
20321 my $rhs = chr utf8::unicode_to_native(hex $line[$i+2]);
20323 # And it only affects adjacent space characters.
20324 next if $lhs !~ /\s/u;
20326 # But, we want to make sure to test spaces followed by a Extend
20328 next if $rhs !~ /\s|$WB_Extend_or_Format_re/;
20330 # To test the customization, add some white-space before this to
20331 # create a span. The $lhs white space may or may not be bound to
20332 # that span, and also with the $rhs. If the $rhs is a binding
20333 # character, the $lhs is bound to it and not to the span, unless
20334 # $lhs is vertical space. In all other cases, the $lhs is bound
20335 # to the span. If the $rhs is white space, it is bound to the
20339 if ($rhs =~ /$WB_Extend_or_Format_re/) {
20340 if ($lhs =~ /\v/) {
20341 $bound = $breakable;
20346 $span = $breakable;
20354 splice @line, $i, 0, ( '0020', $nobreak, '0020', $span);
20356 $line[$i+1] = $bound;
20358 $template = join " ", @line;
20361 # The input is just the break/no-break symbols and sequences of Unicode
20362 # code points as hex digits separated by spaces for legibility. e.g.:
20363 # ÷ 0020 × 0308 ÷ 0020 ÷
20364 # Convert to native \x format
20365 $template =~ s/ \s* ( [[:xdigit:]]+ ) \s* /sprintf("\\x{%02X}", utf8::unicode_to_native(hex $1))/gex;
20366 $template =~ s/ \s* //gx; # Probably the line above removed all spaces;
20369 # Make a copy of the input with the symbols replaced by \b{} and \B{} as
20371 my $break_pattern = $template =~ s/ $breakable /\\b{$break_type}/grx;
20372 $break_pattern =~ s/ $nobreak /\\B{$break_type}/gx;
20374 my $display_string = $template =~ s/[$breakable$nobreak]//gr;
20375 my $string = eval "\"$display_string\"";
20377 # The remaining massaging of the input is for the \X tests. Get rid of
20378 # the leading and trailing breakables
20379 $template =~ s/^ \s* $breakable \s* //x;
20380 $template =~ s/ \s* $breakable \s* $ //x;
20383 $template =~ s/ \s* $nobreak \s* //xg;
20385 # Split the input into segments that are breakable between them.
20386 my @should_display = split /\s*$breakable\s*/, $template;
20387 my @should_match = map { eval "\"$_\"" } @should_display;
20389 # If a string can be represented in both non-ut8 and utf8, test both cases
20390 my $display_upgrade = "";
20392 for my $to_upgrade (0 .. 1) {
20396 # If already in utf8, would just be a repeat
20397 next UPGRADE if utf8::is_utf8($string);
20399 utf8::upgrade($string);
20400 $display_upgrade = " (utf8-upgraded)";
20403 my @modifiers = qw(a aa d u i);
20404 if ($are_ctype_locales_available) {
20405 push @modifiers, "l$utf8_locale" if defined $utf8_locale;
20407 # The /l modifier has C after it to indicate the locale to try
20408 push @modifiers, "lC";
20411 # Test for each of the regex modifiers.
20412 for my $modifier (@modifiers) {
20413 my $display_locale = "";
20415 # For /l, set the locale to what it says to.
20416 if ($modifier =~ / ^ l (.*) /x) {
20418 $display_locale = "(locale = $locale)";
20419 POSIX::setlocale(POSIX::LC_CTYPE(), $locale);
20423 no warnings qw(locale regexp surrogate);
20424 my $pattern = "(?$modifier:$break_pattern)";
20426 # Actually do the test
20428 my $matched = $string =~ qr/$pattern/;
20430 $matched_text = "matched";
20433 $matched_text = "failed to match";
20436 if (TODO_FAILING_BREAKS) {
20437 $comment = " # $comment" unless $comment =~ / ^ \s* \# /x;
20438 $comment =~ s/#/# TODO/;
20441 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$pattern/$display_upgrade; line $line $display_locale$comment\n";
20443 # Only print the comment on the first use of this line
20446 # Repeat with the first \B{} in the pattern. This makes sure the
20447 # code in regexec.c:find_byclass() for \B gets executed
20448 if ($pattern =~ / ( .*? : ) .* ( \\B\{ .* ) /x) {
20449 my $B_pattern = "$1$2";
20450 $matched = $string =~ qr/$B_pattern/;
20451 print "not " unless $matched;
20452 $matched_text = ($matched) ? "matched" : "failed to match";
20453 print "ok ", ++$Tests, " - \"$display_string\" $matched_text /$B_pattern/$display_upgrade; line $line $display_locale";
20454 print " # TODO" if TODO_FAILING_BREAKS && ! $matched;
20459 next if $break_type ne 'gcb';
20461 # Finally, do the \X match.
20462 my @matches = $string =~ /(\X)/g;
20464 # Look through each matched cluster to verify that it matches what we
20466 my $min = (@matches < @should_match) ? @matches : @should_match;
20467 for my $i (0 .. $min - 1) {
20469 if ($matches[$i] eq $should_match[$i]) {
20470 print "ok $Tests - ";
20472 print "In \"$display_string\" =~ /(\\X)/g, \\X #1";
20474 print "And \\X #", $i + 1,
20476 print " correctly matched $should_display[$i]; line $line\n";
20478 $matches[$i] = join("", map { sprintf "\\x{%04X}", ord $_ }
20479 split "", $matches[$i]);
20480 print "not ok $Tests -";
20481 print " # TODO" if TODO_FAILING_BREAKS;
20482 print " In \"$display_string\" =~ /(\\X)/g, \\X #",
20484 " should have matched $should_display[$i]",
20485 " but instead matched $matches[$i]",
20486 ". Abandoning rest of line $line\n";
20491 # And the number of matches should equal the number of expected matches.
20493 if (@matches == @should_match) {
20494 print "ok $Tests - Nothing was left over; line $line\n";
20496 print "not ok $Tests - There were ", scalar @should_match, " \\X matches expected, but got ", scalar @matches, " instead; line $line";
20497 print " # TODO" if TODO_FAILING_BREAKS;
20506 _test_break($t, 'gcb');
20510 _test_break($t, 'lb');
20514 _test_break($t, 'sb');
20518 _test_break($t, 'wb');
20522 print "1..$Tests\n";
20523 exit($Fails ? -1 : 0);